"""Emergency escalation service for automation blockers.

Keeps Redis dedup, Telegram fanout, and operation-log writes out of API
routers while giving auto-repair / drift paths a single emergency channel.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.core.config import settings
|
||
from src.core.redis_client import get_redis
|
||
|
||
# Module-level structlog logger used by the escalation helpers below.
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
def _drift_emergency_fingerprint(report: Any) -> str:
    """Compute the dedup fingerprint used for one drift escalation window.

    The fingerprint is delegated to ``build_drift_fingerprint`` (imported
    lazily), fed with the report's namespace and items and with values
    excluded. Missing or falsy ``namespace`` / ``items`` attributes are
    replaced by ``""`` and ``[]`` respectively.

    Args:
        report: Drift report object; read via ``getattr`` only.

    Returns:
        The fingerprint string, or — when the import or the fingerprint
        computation raises — the report id as a string (``"unknown"`` if the
        report has no usable id). The failure is logged as a warning.
    """
    try:
        # Imported inside the function, as in the rest of this module.
        from src.services.drift_repeat_state import build_drift_fingerprint

        namespace = str(getattr(report, "namespace", "") or "")
        items = list(getattr(report, "items", []) or [])
        return build_drift_fingerprint(namespace, items, include_values=False)
    except Exception as exc:
        report_id = getattr(report, "report_id", None)
        logger.warning(
            "drift_emergency_fingerprint_failed",
            report_id=report_id,
            error=str(exc),
        )
        # Fall back to a per-report key so dedup still has something to use.
        return str(report_id or "unknown")
|
||
|
||
|
||
async def escalate_auto_repair_unavailable(
    *,
    incident_id: str,
    approval_id: str | None,
    alert_type: str,
    target_resource: str,
    namespace: str,
    failure_reason: str,
    attempted_actions: str,
) -> None:
    """Open an emergency channel when auto repair cannot safely continue.

    Steps, in order:

    1. Claim a 24h dedup key derived from ``alert_type`` + ``target_resource``;
       if the key is already held the call is logged and skipped.
    2. Send an escalation card through the Telegram gateway.
    3. Append an ``APPROVAL_ESCALATED`` entry to the alert operation log.
    4. Add a timeline event (best effort — its failure is only logged).

    Any exception raised in steps 2-4 is caught and logged; it is not
    propagated to the caller. If the failure happened *before* the card was
    sent, the dedup key is released so that a later attempt for the same
    alert/target is not suppressed for 24 hours without anyone having been
    notified.

    Args:
        incident_id: Incident identifier shown on the card and in the logs.
        approval_id: Related approval identifier, if any.
        alert_type: Alert name; falls back to ``"AutoRepairBlocked"`` when empty.
        target_resource: Affected resource; ``"unknown"`` when empty.
        namespace: Namespace of the resource; ``"unknown"`` on the card when empty.
        failure_reason: Why auto repair could not continue.
        attempted_actions: Description of what was already tried.
    """
    # 2026-05-02 (Claude Opus 4.7 + ogt): dedup key changed from incident_id to
    # a fingerprint of alertname + target.
    # Evidence: 4 ESCALATION cards fired back-to-back at 17:35-17:36
    # (HostOutOfDiskSpace + 3x HostDiskUsageHigh, all target=node-exporter-110).
    # incident_id used to be a random uuid4 and the TTL of 900s was too short,
    # so the same symptom under a new INC id was never deduplicated.
    # Now: alertname+target fingerprint + TTL 86400s, aligned with
    # decision_manager.py:218.
    _alertname_fp = (alert_type or "AutoRepairBlocked").strip().lower().replace(" ", "_")[:60]
    _target_fp = (target_resource or "unknown").lower()[:40]
    dedup_key = f"auto_repair:emergency_escalated:fp:{_alertname_fp}:{_target_fp}"
    if not await _dedup_first_send(dedup_key, ttl=86400, event="auto_repair"):
        logger.info(
            "auto_repair_escalation_dedup_skipped",
            incident_id=incident_id,
            approval_id=approval_id,
            fingerprint=f"{_alertname_fp}:{_target_fp}",
        )
        return

    # Tracks whether the notification actually went out, so that a failure
    # after delivery does not release the dedup key (which would cause a
    # duplicate card on the next attempt).
    card_sent = False
    try:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )
        from src.services.approval_db import get_timeline_service
        from src.services.telegram_gateway import get_telegram_gateway

        await get_telegram_gateway().send_escalation_card(
            incident_id=incident_id,
            original_alertname=alert_type or "AutoRepairBlocked",
            duration_min=0,
            priority=0,
            attempted_actions=attempted_actions,
            failure_reason=failure_reason,
            current_impact=f"target={target_resource or 'unknown'} namespace={namespace or 'unknown'}",
            group_chat_id=settings.SRE_GROUP_CHAT_ID or None,
        )
        card_sent = True

        await get_alert_operation_log_repository().append(
            "APPROVAL_ESCALATED",
            incident_id=incident_id,
            approval_id=approval_id,
            actor="auto_repair",
            action_detail="auto_repair_unavailable_emergency_channel",
            success=True,
            context={
                "alert_type": alert_type,
                "target_resource": target_resource,
                "namespace": namespace,
                "failure_reason": failure_reason,
                "attempted_actions": attempted_actions,
            },
        )
        try:
            await get_timeline_service().add_event(
                event_type="agent",
                status="warning",
                title="AI emergency intervention requested",
                description=(
                    f"{failure_reason} | attempted={attempted_actions}"
                )[:500],
                actor="auto_repair",
                actor_role="emergency_intervention",
                approval_id=approval_id,
                incident_id=incident_id,
            )
        except Exception as timeline_exc:
            # The timeline entry is informational; never fail the escalation on it.
            logger.warning(
                "auto_repair_emergency_timeline_failed",
                incident_id=incident_id,
                approval_id=approval_id,
                error=str(timeline_exc),
            )
        logger.warning(
            "auto_repair_emergency_escalated",
            incident_id=incident_id,
            approval_id=approval_id,
            reason=failure_reason,
        )
    except Exception as exc:
        logger.warning(
            "auto_repair_emergency_escalation_failed",
            incident_id=incident_id,
            approval_id=approval_id,
            error=str(exc),
        )
        if not card_sent:
            # Nobody was notified: free the dedup slot so the next occurrence
            # of this alert/target can retry instead of being silenced for 24h.
            try:
                await get_redis().delete(dedup_key)
            except Exception as release_exc:
                logger.warning(
                    "auto_repair_emergency_dedup_release_failed",
                    incident_id=incident_id,
                    key=dedup_key,
                    error=str(release_exc),
                )
|
||
|
||
|
||
async def escalate_drift_auto_adopt_blocked(
    *,
    report: Any,
    reason: str,
    interpretation: Any,
) -> None:
    """Notify the emergency channel when drift cannot be auto-adopted safely.

    Steps, in order:

    1. Claim a 24h dedup key derived from the drift fingerprint; if the key
       is already held the call is logged and skipped.
    2. Send an escalation card through the Telegram gateway (priority 0 when
       the report has high-severity items, otherwise 1).
    3. Append an ``APPROVAL_ESCALATED`` entry to the alert operation log.
    4. Add a timeline event (best effort — its failure is only logged).

    Any exception raised in steps 2-4 is caught and logged; it is not
    propagated to the caller. If the failure happened *before* the card was
    sent, the dedup key is released so the same drift is not silenced for 24
    hours without anyone having been notified.

    Args:
        report: Drift report; ``report_id``, ``namespace``, ``high_count``,
            ``medium_count`` and ``items`` are read from it.
        reason: Why auto-adopt was blocked.
        interpretation: Optional interpreter result; ``intent.value``,
            ``confidence`` and ``risk`` are read when present.
    """
    fingerprint = _drift_emergency_fingerprint(report)
    dedup_key = f"drift:auto_adopt_emergency:fp:{fingerprint}"
    if not await _dedup_first_send(dedup_key, ttl=86400, event="drift"):
        logger.info(
            "drift_emergency_escalation_dedup_skipped",
            report_id=report.report_id,
            fingerprint=fingerprint,
        )
        return

    # Tracks whether the notification actually went out, so that a failure
    # after delivery does not release the dedup key (which would cause a
    # duplicate card on the next attempt).
    card_sent = False
    try:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )
        from src.services.approval_db import get_timeline_service
        from src.services.telegram_gateway import get_telegram_gateway

        # Tolerate a missing / None ``items`` the same way the fingerprint
        # helper does, instead of aborting the escalation with a TypeError.
        items = getattr(report, "items", None) or []
        actionable_count = sum(
            1 for item in items
            if not getattr(item, "is_allowlisted", False)
        )
        intent = getattr(getattr(interpretation, "intent", None), "value", "unknown")
        raw_confidence = getattr(interpretation, "confidence", 0.0) if interpretation else 0.0
        try:
            # ``:.0%`` below requires a number; None or junk must not block the alert.
            confidence = float(raw_confidence)
        except (TypeError, ValueError):
            confidence = 0.0
        risk = getattr(interpretation, "risk", "unknown") if interpretation else "unknown"

        await get_telegram_gateway().send_escalation_card(
            incident_id=report.report_id,
            original_alertname="ConfigDriftAutoAdoptBlocked",
            duration_min=0,
            priority=0 if report.high_count > 0 else 1,
            attempted_actions="drift_interpreter -> auto_adopt_if_safe -> emergency_escalation",
            failure_reason=reason,
            current_impact=(
                f"namespace={report.namespace} high={report.high_count} "
                f"medium={report.medium_count} actionable={actionable_count} "
                f"intent={intent} confidence={confidence:.0%} risk={risk} "
                f"fingerprint={fingerprint}"
            ),
            group_chat_id=settings.SRE_GROUP_CHAT_ID or None,
        )
        card_sent = True

        await get_alert_operation_log_repository().append(
            "APPROVAL_ESCALATED",
            incident_id=report.report_id,
            actor="drift_auto_adopt",
            action_detail="drift_auto_adopt_blocked_emergency_channel",
            success=True,
            context={
                "namespace": report.namespace,
                "reason": reason,
                "high_count": report.high_count,
                "medium_count": report.medium_count,
                "actionable_count": actionable_count,
                "intent": intent,
                "confidence": confidence,
                "risk": risk,
                "fingerprint": fingerprint,
            },
        )
        try:
            await get_timeline_service().add_event(
                event_type="agent",
                status="warning",
                title="Drift emergency intervention requested",
                description=(
                    f"{reason} | namespace={report.namespace} "
                    f"high={report.high_count} medium={report.medium_count} "
                    f"intent={intent} confidence={confidence:.0%}"
                )[:500],
                actor="drift_auto_adopt",
                actor_role="emergency_intervention",
                incident_id=report.report_id,
            )
        except Exception as timeline_exc:
            # The timeline entry is informational; never fail the escalation on it.
            logger.warning(
                "drift_emergency_timeline_failed",
                report_id=report.report_id,
                error=str(timeline_exc),
            )
        logger.warning(
            "drift_auto_adopt_emergency_escalated",
            report_id=report.report_id,
            reason=reason,
            high=report.high_count,
            medium=report.medium_count,
            actionable=actionable_count,
            fingerprint=fingerprint,
        )
    except Exception as exc:
        logger.warning(
            "drift_emergency_escalation_failed",
            report_id=report.report_id,
            error=str(exc),
        )
        if not card_sent:
            # Nobody was notified: free the dedup slot so the next detection
            # of this drift can retry instead of being silenced for 24h.
            try:
                await get_redis().delete(dedup_key)
            except Exception as release_exc:
                logger.warning(
                    "drift_emergency_dedup_release_failed",
                    report_id=report.report_id,
                    key=dedup_key,
                    error=str(release_exc),
                )
|
||
|
||
|
||
async def _dedup_first_send(key: str, *, ttl: int, event: str) -> bool:
    """Try to claim the dedup slot ``key`` and report whether to send.

    Performs a Redis ``SET key "1"`` with ``nx=True`` and an expiry of
    ``ttl`` seconds.

    Args:
        key: Redis key identifying the escalation being deduplicated.
        ttl: Expiry passed to Redis as ``ex``.
        event: Label included in the warning log when Redis fails.

    Returns:
        ``True`` when the reply from ``SET`` is truthy (first escalation in
        the window) and ``False`` when it is falsy. If obtaining the client
        or the ``SET`` call raises, the error is logged and ``True`` is
        returned — the dedup fails open so the escalation is still sent.
    """
    try:
        client = get_redis()
        claimed = await client.set(key, "1", nx=True, ex=ttl)
        return bool(claimed)
    except Exception as exc:
        # Fail open: a broken dedup store must never silence an emergency.
        logger.warning(
            "emergency_escalation_dedup_failed_open",
            key=key,
            event=event,
            error=str(exc),
        )
        return True
|