From d0c24275d651e7269f319c13d92db27d2e3e96ae Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 27 Apr 2026 19:41:03 +0800 Subject: [PATCH] =?UTF-8?q?fix(incident):=20Alertmanager=20=E5=91=8A?= =?UTF-8?q?=E8=AD=A6=E8=A3=9C=E5=AF=AB=20frequency=5Fstats=20=E2=86=92=20?= =?UTF-8?q?=E6=AD=B7=E5=8F=B2=E7=B5=B1=E8=A8=88=E4=B8=8D=E5=86=8D=E7=A9=BA?= =?UTF-8?q?=E7=99=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因:create_incident_for_approval 建立 Incident 時從未查詢 AnomalyCounter → frequency_snapshot 永遠 null → 歷史按鈕顯示「無建立時快照」 signoz/sentry webhook 有寫,Alertmanager 路徑漏掉 修復:建立前 record_anomaly → 頻率快照存入 frequency_stats → PG 持久化 失敗無害(try/except,不阻斷主流程) Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/incident_service.py | 38 +++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index fd4e0b6c..b32cbb9c 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -32,6 +32,7 @@ from src.db.base import get_db_context from src.db.models import IncidentRecord from src.models.incident import ( Incident, + IncidentFrequencyStats, IncidentStatus, Severity, Signal, @@ -341,6 +342,40 @@ async def create_incident_for_approval( _affected_services = extract_affected_services(_labels, target_resource) + # 2026-04-27 ogt + Claude Sonnet 4.6: 補 frequency_stats 寫入 + # 根因:Alertmanager 告警建立 Incident 時從未查詢 AnomalyCounter + # → frequency_snapshot 永遠 null → 歷史統計顯示「無建立時快照」 + # 修復:建立前先 record_anomaly,將頻率快照存入 frequency_stats + _freq_stats = None + try: + from src.services.anomaly_counter import get_anomaly_counter + _anomaly_sig = { + "alert_name": alertname or alert_type, + "service": (alert_labels or {}).get("service", target_resource), + "namespace": namespace, + "error_type": (alert_labels or {}).get("error_type", alert_type), + } + _freq = await get_anomaly_counter().record_anomaly(_anomaly_sig) + if _freq: + _freq_stats = _freq.to_dict() + except Exception as _freq_err: + logger.warning("incident_frequency_stats_failed", error=str(_freq_err)) + + _freq_model = None + if _freq_stats: + try: + _freq_model = IncidentFrequencyStats( + anomaly_key=_freq_stats.get("anomaly_key", "unknown"), + count_1h=_freq_stats.get("count_1h", 0), + count_24h=_freq_stats.get("count_24h", 0), + count_7d=_freq_stats.get("count_7d", 0), + count_30d=_freq_stats.get("count_30d", 0), + escalation_level=_freq_stats.get("escalation_level"), + auto_repair_count=_freq_stats.get("auto_repair_count", 0), + ) + except Exception: + pass + incident = Incident( status=IncidentStatus.INVESTIGATING, severity=severity, @@ -349,6 +384,7 @@ async def create_incident_for_approval( proposal_ids=[UUID(approval_id)], notification_type=notification_type, # ADR-073 Phase 2-2 alert_category=alert_category, # ADR-073 Phase 2-2 + frequency_stats=_freq_model, ) await incident_service.save_to_working_memory(incident) @@ -813,8 +849,6 @@ class IncidentService: # 2. 建立 Incident (含頻率統計) # ADR-037: 統帥指示「重啟只是治標,太常發生的異常必須徹底解決」 - from src.models.incident import IncidentFrequencyStats - freq_stats = None if frequency_stats: freq_stats = IncidentFrequencyStats(