Files
awoooi/apps/api/src/services/emergency_escalation_service.py
Your Name 9843c59450
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m15s
CD Pipeline / build-and-deploy (push) Successful in 3m26s
CD Pipeline / post-deploy-checks (push) Successful in 1m34s
fix(drift): dedupe semantic fingerprint repeats
2026-05-19 01:12:55 +08:00

251 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Emergency escalation service for automation blockers.
Keeps Redis dedup, Telegram fanout, and operation-log writes out of API
routers while giving auto-repair / drift paths a single emergency channel.
"""
from __future__ import annotations
from typing import Any
import structlog
from src.core.config import settings
from src.core.redis_client import get_redis
logger = structlog.get_logger(__name__)
def _drift_emergency_fingerprint(report: Any) -> str:
    """Return the fingerprint used to deduplicate drift escalations.

    Delegates to ``build_drift_fingerprint`` with the report's namespace and
    items, passing ``include_values=False``.  If the import or the call fails
    for any reason, a warning is logged and the report's ``report_id`` is
    returned instead, or ``"unknown"`` when that is missing or empty.
    """
    try:
        # Imported lazily and inside the try, so an import failure also takes
        # the fallback path below instead of propagating.
        from src.services.drift_repeat_state import build_drift_fingerprint

        namespace = str(getattr(report, "namespace", "") or "")
        drift_items = list(getattr(report, "items", []) or [])
        return build_drift_fingerprint(
            namespace,
            drift_items,
            include_values=False,
        )
    except Exception as exc:
        fallback_id = getattr(report, "report_id", None)
        logger.warning(
            "drift_emergency_fingerprint_failed",
            report_id=fallback_id,
            error=str(exc),
        )
        return str(fallback_id or "unknown")
async def escalate_auto_repair_unavailable(
    *,
    incident_id: str,
    approval_id: str | None,
    alert_type: str,
    target_resource: str,
    namespace: str,
    failure_reason: str,
    attempted_actions: str,
) -> None:
    """Open an emergency channel when auto repair cannot safely continue.

    Sends one escalation card through the Telegram gateway, appends an
    ``APPROVAL_ESCALATED`` row to the alert operation log, and adds a
    timeline event.  Repeats for the same alert-type/target fingerprint are
    suppressed for 24 hours through ``_dedup_first_send``.

    Delivery problems are logged as warnings rather than raised.  If the
    failure happens before the escalation card has been sent, the dedup key
    is released so the next occurrence can retry instead of being silenced
    for the rest of the window.

    Args:
        incident_id: Incident the escalation belongs to.
        approval_id: Related approval, if any.
        alert_type: Alert name; falls back to ``"AutoRepairBlocked"`` when
            empty.
        target_resource: Resource the repair targeted; ``"unknown"`` when
            empty.
        namespace: Namespace shown in the card's impact line.
        failure_reason: Why auto repair could not continue.
        attempted_actions: Description of what was already tried.
    """
    # Change note (2026-05-02): the dedup key moved from incident_id to a
    # fingerprint of alertname + target.
    # Evidence: 4 ESCALATION cards fired back-to-back at 17:35-17:36
    # (HostOutOfDiskSpace + 3x HostDiskUsageHigh), all target=node-exporter-110.
    # incident_id was a random uuid4 and the 900s TTL was too short, so the
    # same symptom arriving under a new INC ID was never deduplicated.
    # Now: alertname+target fingerprint with TTL 86400s, aligned with
    # decision_manager.py:218.
    _alertname_fp = (alert_type or "AutoRepairBlocked").strip().lower().replace(" ", "_")[:60]
    _target_fp = (target_resource or "unknown").lower()[:40]
    dedup_key = f"auto_repair:emergency_escalated:fp:{_alertname_fp}:{_target_fp}"
    if not await _dedup_first_send(dedup_key, ttl=86400, event="auto_repair"):
        logger.info(
            "auto_repair_escalation_dedup_skipped",
            incident_id=incident_id,
            approval_id=approval_id,
            fingerprint=f"{_alertname_fp}:{_target_fp}",
        )
        return

    # Tracks whether the human-facing card actually went out, so the failure
    # handler can tell "never delivered" apart from "delivered, bookkeeping
    # failed afterwards".
    card_sent = False
    try:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )
        from src.services.approval_db import get_timeline_service
        from src.services.telegram_gateway import get_telegram_gateway

        await get_telegram_gateway().send_escalation_card(
            incident_id=incident_id,
            original_alertname=alert_type or "AutoRepairBlocked",
            duration_min=0,
            priority=0,
            attempted_actions=attempted_actions,
            failure_reason=failure_reason,
            current_impact=f"target={target_resource or 'unknown'} namespace={namespace or 'unknown'}",
            group_chat_id=settings.SRE_GROUP_CHAT_ID or None,
        )
        card_sent = True

        await get_alert_operation_log_repository().append(
            "APPROVAL_ESCALATED",
            incident_id=incident_id,
            approval_id=approval_id,
            actor="auto_repair",
            action_detail="auto_repair_unavailable_emergency_channel",
            success=True,
            context={
                "alert_type": alert_type,
                "target_resource": target_resource,
                "namespace": namespace,
                "failure_reason": failure_reason,
                "attempted_actions": attempted_actions,
            },
        )

        # The timeline entry is best-effort: a failure here is logged but does
        # not mark the escalation itself as failed.
        try:
            await get_timeline_service().add_event(
                event_type="agent",
                status="warning",
                title="AI emergency intervention requested",
                description=(
                    f"{failure_reason} | attempted={attempted_actions}"
                )[:500],
                actor="auto_repair",
                actor_role="emergency_intervention",
                approval_id=approval_id,
                incident_id=incident_id,
            )
        except Exception as timeline_exc:
            logger.warning(
                "auto_repair_emergency_timeline_failed",
                incident_id=incident_id,
                approval_id=approval_id,
                error=str(timeline_exc),
            )

        logger.warning(
            "auto_repair_emergency_escalated",
            incident_id=incident_id,
            approval_id=approval_id,
            reason=failure_reason,
        )
    except Exception as exc:
        logger.warning(
            "auto_repair_emergency_escalation_failed",
            incident_id=incident_id,
            approval_id=approval_id,
            error=str(exc),
        )
        if not card_sent:
            # The dedup key was claimed before delivery.  Nothing reached the
            # emergency channel, so give the claim back; otherwise this
            # fingerprint would stay muted for 24h with no card ever sent.
            try:
                await get_redis().delete(dedup_key)
            except Exception as release_exc:
                logger.warning(
                    "auto_repair_emergency_dedup_release_failed",
                    incident_id=incident_id,
                    approval_id=approval_id,
                    key=dedup_key,
                    error=str(release_exc),
                )
async def escalate_drift_auto_adopt_blocked(
    *,
    report: Any,
    reason: str,
    interpretation: Any,
) -> None:
    """Notify the emergency channel when drift cannot be auto-adopted safely.

    Sends one ``ConfigDriftAutoAdoptBlocked`` escalation card through the
    Telegram gateway, appends an ``APPROVAL_ESCALATED`` row to the alert
    operation log, and adds a timeline event.  Repeats with the same drift
    fingerprint are suppressed for 24 hours through ``_dedup_first_send``.

    Delivery problems are logged as warnings rather than raised.  If the
    failure happens before the escalation card has been sent, the dedup key
    is released so the next drift scan can retry instead of being silenced
    for the rest of the window.

    Args:
        report: Drift report; this function reads ``report_id``,
            ``namespace``, ``items``, ``high_count`` and ``medium_count``.
        reason: Why auto-adopt was blocked.
        interpretation: Optional interpreter result; ``intent.value``,
            ``confidence`` and ``risk`` are read when present, otherwise
            ``"unknown"`` / ``0.0`` / ``"unknown"`` are used.
    """
    fingerprint = _drift_emergency_fingerprint(report)
    # Read once with a default so the log calls (including the ones in the
    # failure handler) can never raise AttributeError themselves.
    report_id = getattr(report, "report_id", None)
    dedup_key = f"drift:auto_adopt_emergency:fp:{fingerprint}"
    if not await _dedup_first_send(dedup_key, ttl=86400, event="drift"):
        logger.info(
            "drift_emergency_escalation_dedup_skipped",
            report_id=report_id,
            fingerprint=fingerprint,
        )
        return

    # Tracks whether the human-facing card actually went out, so the failure
    # handler can tell "never delivered" apart from "delivered, bookkeeping
    # failed afterwards".
    card_sent = False
    try:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )
        from src.services.approval_db import get_timeline_service
        from src.services.telegram_gateway import get_telegram_gateway

        # Items flagged as allowlisted are not counted as actionable.
        actionable_count = sum(
            1 for item in report.items
            if not getattr(item, "is_allowlisted", False)
        )
        intent = getattr(getattr(interpretation, "intent", None), "value", "unknown")
        # ``or 0.0`` guards against a confidence of None, which would make
        # the ``:.0%`` formatting below raise and abort the escalation.
        confidence = (
            getattr(interpretation, "confidence", 0.0) if interpretation else 0.0
        ) or 0.0
        risk = getattr(interpretation, "risk", "unknown") if interpretation else "unknown"

        await get_telegram_gateway().send_escalation_card(
            incident_id=report_id,
            original_alertname="ConfigDriftAutoAdoptBlocked",
            duration_min=0,
            # Any high-severity drift item raises the card to priority 0.
            priority=0 if report.high_count > 0 else 1,
            attempted_actions="drift_interpreter -> auto_adopt_if_safe -> emergency_escalation",
            failure_reason=reason,
            current_impact=(
                f"namespace={report.namespace} high={report.high_count} "
                f"medium={report.medium_count} actionable={actionable_count} "
                f"intent={intent} confidence={confidence:.0%} risk={risk} "
                f"fingerprint={fingerprint}"
            ),
            group_chat_id=settings.SRE_GROUP_CHAT_ID or None,
        )
        card_sent = True

        await get_alert_operation_log_repository().append(
            "APPROVAL_ESCALATED",
            incident_id=report_id,
            actor="drift_auto_adopt",
            action_detail="drift_auto_adopt_blocked_emergency_channel",
            success=True,
            context={
                "namespace": report.namespace,
                "reason": reason,
                "high_count": report.high_count,
                "medium_count": report.medium_count,
                "actionable_count": actionable_count,
                "intent": intent,
                "confidence": confidence,
                "risk": risk,
                "fingerprint": fingerprint,
            },
        )

        # The timeline entry is best-effort: a failure here is logged but does
        # not mark the escalation itself as failed.
        try:
            await get_timeline_service().add_event(
                event_type="agent",
                status="warning",
                title="Drift emergency intervention requested",
                description=(
                    f"{reason} | namespace={report.namespace} "
                    f"high={report.high_count} medium={report.medium_count} "
                    f"intent={intent} confidence={confidence:.0%}"
                )[:500],
                actor="drift_auto_adopt",
                actor_role="emergency_intervention",
                incident_id=report_id,
            )
        except Exception as timeline_exc:
            logger.warning(
                "drift_emergency_timeline_failed",
                report_id=report_id,
                error=str(timeline_exc),
            )

        logger.warning(
            "drift_auto_adopt_emergency_escalated",
            report_id=report_id,
            reason=reason,
            high=report.high_count,
            medium=report.medium_count,
            actionable=actionable_count,
            fingerprint=fingerprint,
        )
    except Exception as exc:
        logger.warning(
            "drift_emergency_escalation_failed",
            report_id=report_id,
            error=str(exc),
        )
        if not card_sent:
            # The dedup key was claimed before delivery.  Nothing reached the
            # emergency channel, so give the claim back; otherwise this
            # fingerprint would stay muted for 24h with no card ever sent.
            try:
                await get_redis().delete(dedup_key)
            except Exception as release_exc:
                logger.warning(
                    "drift_emergency_dedup_release_failed",
                    report_id=report_id,
                    key=dedup_key,
                    error=str(release_exc),
                )
async def _dedup_first_send(key: str, *, ttl: int, event: str) -> bool:
    """Tell the caller whether it holds the first escalation in the window.

    Issues a Redis ``SET key "1"`` with ``nx=True`` and ``ex=ttl`` and
    returns the truthiness of the reply.  Fails open: if obtaining the client
    or the command raises, a warning is logged and ``True`` is returned so
    the caller still sends its escalation.

    Args:
        key: Redis key identifying the dedup window.
        ttl: Value passed as the key's ``ex`` expiry.
        event: Label included in the warning log when Redis fails.
    """
    try:
        client = get_redis()
        claimed = await client.set(key, "1", ex=ttl, nx=True)
        return bool(claimed)
    except Exception as exc:
        logger.warning(
            "emergency_escalation_dedup_failed_open",
            key=key,
            event=event,
            error=str(exc),
        )
        return True