fix(incident): Alertmanager 告警補寫 frequency_stats → 歷史統計不再空白
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
根因:create_incident_for_approval 建立 Incident 時從未查詢 AnomalyCounter
→ frequency_snapshot 永遠 null → 歷史按鈕顯示「無建立時快照」
signoz/sentry webhook 路徑已有寫入 frequency_stats,唯獨 Alertmanager 路徑漏掉
修復:建立前 record_anomaly → 頻率快照存入 frequency_stats → PG 持久化
失敗無害(try/except,不阻斷主流程)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,7 @@ from src.db.base import get_db_context
|
||||
from src.db.models import IncidentRecord
|
||||
from src.models.incident import (
|
||||
Incident,
|
||||
IncidentFrequencyStats,
|
||||
IncidentStatus,
|
||||
Severity,
|
||||
Signal,
|
||||
@@ -341,6 +342,40 @@ async def create_incident_for_approval(
|
||||
|
||||
_affected_services = extract_affected_services(_labels, target_resource)
|
||||
|
||||
# 2026-04-27 ogt + Claude Sonnet 4.6: 補 frequency_stats 寫入
|
||||
# 根因:Alertmanager 告警建立 Incident 時從未查詢 AnomalyCounter
|
||||
# → frequency_snapshot 永遠 null → 歷史統計顯示「無建立時快照」
|
||||
# 修復:建立前先 record_anomaly,將頻率快照存入 frequency_stats
|
||||
_freq_stats = None
|
||||
try:
|
||||
from src.services.anomaly_counter import get_anomaly_counter
|
||||
_anomaly_sig = {
|
||||
"alert_name": alertname or alert_type,
|
||||
"service": (alert_labels or {}).get("service", target_resource),
|
||||
"namespace": namespace,
|
||||
"error_type": (alert_labels or {}).get("error_type", alert_type),
|
||||
}
|
||||
_freq = await get_anomaly_counter().record_anomaly(_anomaly_sig)
|
||||
if _freq:
|
||||
_freq_stats = _freq.to_dict()
|
||||
except Exception as _freq_err:
|
||||
logger.warning("incident_frequency_stats_failed", error=str(_freq_err))
|
||||
|
||||
_freq_model = None
|
||||
if _freq_stats:
|
||||
try:
|
||||
_freq_model = IncidentFrequencyStats(
|
||||
anomaly_key=_freq_stats.get("anomaly_key", "unknown"),
|
||||
count_1h=_freq_stats.get("count_1h", 0),
|
||||
count_24h=_freq_stats.get("count_24h", 0),
|
||||
count_7d=_freq_stats.get("count_7d", 0),
|
||||
count_30d=_freq_stats.get("count_30d", 0),
|
||||
escalation_level=_freq_stats.get("escalation_level"),
|
||||
auto_repair_count=_freq_stats.get("auto_repair_count", 0),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
incident = Incident(
|
||||
status=IncidentStatus.INVESTIGATING,
|
||||
severity=severity,
|
||||
@@ -349,6 +384,7 @@ async def create_incident_for_approval(
|
||||
proposal_ids=[UUID(approval_id)],
|
||||
notification_type=notification_type, # ADR-073 Phase 2-2
|
||||
alert_category=alert_category, # ADR-073 Phase 2-2
|
||||
frequency_stats=_freq_model,
|
||||
)
|
||||
|
||||
await incident_service.save_to_working_memory(incident)
|
||||
@@ -813,8 +849,6 @@ class IncidentService:
|
||||
|
||||
# 2. 建立 Incident (含頻率統計)
|
||||
# ADR-037: 統帥指示「重啟只是治標,太常發生的異常必須徹底解決」
|
||||
from src.models.incident import IncidentFrequencyStats
|
||||
|
||||
freq_stats = None
|
||||
if frequency_stats:
|
||||
freq_stats = IncidentFrequencyStats(
|
||||
|
||||
Reference in New Issue
Block a user