fix(alert): fingerprint 加 alertname 防跨告警指紋衝突 + 補入缺漏心跳分類
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

問題根因:
1. generate_fingerprint 用 alert_type(大量 alertname 落入 "custom")
   → 不同告警名稱同目標共用指紋 → 30 分鐘 debounce 互相擋截
2. classify_alert_early 漏掉 DeadMansSwitch / NoAlertsReceived /
   PrometheusNotConnectedToAlertmanager → 落入 TYPE-3 一般告警

修復:
- alert_analyzer_service.py: 指紋改為 namespace:deployment:alertname:target_resource
  alertname 取自 labels(Alertmanager),fallback 到 alert_type(其他來源)
- incident_service.py: DeadMansSwitch → backup/TYPE-1;
  NoAlertsReceived + PrometheusNotConnectedToAlertmanager → alertchain_health/TYPE-8M
- 補 2 個測試,全套 627 passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 22:50:20 +08:00
parent b17a677b97
commit 1a4b52ed28
3 changed files with 26 additions and 4 deletions

View File

@@ -182,10 +182,14 @@ class AlertAnalyzer:
"""
生成告警唯一指紋 (SHA256 Hash)
指紋組成: namespace:deployment:alert_type:target_resource
指紋組成: namespace:deployment:alertname:target_resource
同一個告警模式(相同位置、相同類型)會產生相同指紋
用於識別重複告警並進行聚合。
使用 alertname(而非 alert_type)確保不同告警名稱不共用指紋。
原本用 alert_type 導致許多不同 alertname 都落入 "custom",
造成同目標上的不同告警互相擋截(ADR-073 修復 2026-04-12 ogt)。
同一個告警名稱 + 同一目標 → 相同指紋 → 觸發 debounce 去重。
不同告警名稱(即使同目標)→ 不同指紋 → 各自建立 Incident。
"""
# 從 labels 取得 deployment,如果沒有則用 target_resource
deployment = ""
@@ -194,8 +198,13 @@ class AlertAnalyzer:
if not deployment:
deployment = alert.target_resource
# alertname 優先取 labels,fallback 到 alert_type(非 Alertmanager 來源)
alertname = (
alert.labels.get("alertname", "") if alert.labels else ""
) or alert.alert_type
# 組合指紋來源
fingerprint_source = f"{alert.namespace}:{deployment}:{alert.alert_type}:{alert.target_resource}"
fingerprint_source = f"{alert.namespace}:{deployment}:{alertname}:{alert.target_resource}"
# SHA256 Hash
return hashlib.sha256(fingerprint_source.encode()).hexdigest()[:32]

View File

@@ -130,11 +130,14 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
return "config_drift", "TYPE-4D"
# 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷)
# 2026-04-12 ogt: 補入 NoAlertsReceived + PrometheusNotConnectedToAlertmanager
if alertname in (
"AlertChainBroken_Alertmanager",
"AlertChainBroken_Sentry",
"NoAlertsReceived",
"NoAlertsReceived2Hours",
"AlertChainUnhealthy",
"PrometheusNotConnectedToAlertmanager",
):
return "alertchain_health", "TYPE-8M"
@@ -171,8 +174,10 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
"HostBackupFailed", "HostBackupStale", "HostBackupMissing",
"BackupRestoreTestFailed", "BackupRestoreTestStale",
}
# 2026-04-12 ogt: 補入 DeadMansSwitch(HEARTBEAT_ALERT_NAMES 中但之前漏掉)
if (
"watchdog" in alertname_lower
or "deadmansswitch" in alertname_lower
or alertname == "Heartbeat"
or alertname in _BACKUP_TYPE1_NAMES
or alertname.startswith("HostBackup")

View File

@@ -66,6 +66,12 @@ class TestInfoAlerts:
ac, nt = classify_alert_early("Watchdog", "none", {})
assert nt == "TYPE-1"
def test_deadmansswitch_heartbeat(self):
    """Check that a DeadMansSwitch heartbeat classifies as ("backup", "TYPE-1").

    Regression test added 2026-04-12 (ogt). The alert is sent with
    severity "warning" and an empty label dict.
    """
    category, type_code = classify_alert_early("DeadMansSwitch", "warning", {})
    # Both halves of the classification must match.
    assert (category, type_code) == ("backup", "TYPE-1")
def test_backup_critical_not_type1(self):
# critical backup 告警應走各自 prefix,不是純資訊
ac, nt = classify_alert_early("BACKUP_MISSING", "critical", {})
@@ -115,6 +121,8 @@ class TestAlertchainHealth:
"AlertChainBroken_Sentry",
"NoAlertsReceived2Hours",
"AlertChainUnhealthy",
"NoAlertsReceived",
"PrometheusNotConnectedToAlertmanager",
])
def test_alertchain_alerts(self, alertname):
ac, nt = classify_alert_early(alertname, "critical", {})