fix(incident): Alertmanager 告警補寫 frequency_stats → 歷史統計不再空白
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

根因:create_incident_for_approval 建立 Incident 時從未查詢 AnomalyCounter
     → frequency_snapshot 永遠 null → 歷史按鈕顯示「無建立時快照」
     signoz/sentry webhook 有寫,Alertmanager 路徑漏掉

修復:建立前 record_anomaly → 頻率快照存入 frequency_stats → PG 持久化
     失敗無害(try/except,不阻斷主流程)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-04-27 19:41:03 +08:00
parent 0a22f49932
commit d0c24275d6

View File

@@ -32,6 +32,7 @@ from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import (
Incident,
IncidentFrequencyStats,
IncidentStatus,
Severity,
Signal,
@@ -341,6 +342,40 @@ async def create_incident_for_approval(
_affected_services = extract_affected_services(_labels, target_resource)
# 2026-04-27 ogt + Claude Sonnet 4.6: 補 frequency_stats 寫入
# 根因:Alertmanager 告警建立 Incident 時從未查詢 AnomalyCounter
# → frequency_snapshot 永遠 null → 歷史統計顯示「無建立時快照」
# 修復:建立前先 record_anomaly,將頻率快照存入 frequency_stats
_freq_stats = None
try:
from src.services.anomaly_counter import get_anomaly_counter
_anomaly_sig = {
"alert_name": alertname or alert_type,
"service": (alert_labels or {}).get("service", target_resource),
"namespace": namespace,
"error_type": (alert_labels or {}).get("error_type", alert_type),
}
_freq = await get_anomaly_counter().record_anomaly(_anomaly_sig)
if _freq:
_freq_stats = _freq.to_dict()
except Exception as _freq_err:
logger.warning("incident_frequency_stats_failed", error=str(_freq_err))
_freq_model = None
if _freq_stats:
try:
_freq_model = IncidentFrequencyStats(
anomaly_key=_freq_stats.get("anomaly_key", "unknown"),
count_1h=_freq_stats.get("count_1h", 0),
count_24h=_freq_stats.get("count_24h", 0),
count_7d=_freq_stats.get("count_7d", 0),
count_30d=_freq_stats.get("count_30d", 0),
escalation_level=_freq_stats.get("escalation_level"),
auto_repair_count=_freq_stats.get("auto_repair_count", 0),
)
except Exception:
pass
incident = Incident(
status=IncidentStatus.INVESTIGATING,
severity=severity,
@@ -349,6 +384,7 @@ async def create_incident_for_approval(
proposal_ids=[UUID(approval_id)],
notification_type=notification_type, # ADR-073 Phase 2-2
alert_category=alert_category, # ADR-073 Phase 2-2
frequency_stats=_freq_model,
)
await incident_service.save_to_working_memory(incident)
@@ -813,8 +849,6 @@ class IncidentService:
# 2. 建立 Incident (含頻率統計)
# ADR-037: 統帥指示「重啟只是治標,太常發生的異常必須徹底解決」
from src.models.incident import IncidentFrequencyStats
freq_stats = None
if frequency_stats:
freq_stats = IncidentFrequencyStats(