fix(alert): fingerprint 加 alertname 防跨告警指紋衝突 + 補入缺漏心跳分類
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
問題根因:
1. generate_fingerprint 用 alert_type(大量 alertname 落入 "custom") → 不同告警名稱同目標共用指紋 → 30 分鐘 debounce 互相擋截
2. classify_alert_early 漏掉 DeadMansSwitch / NoAlertsReceived / PrometheusNotConnectedToAlertmanager → 落入 TYPE-3 一般告警

修復:
- alert_analyzer_service.py: 指紋改為 namespace:deployment:alertname:target_resource
  alertname 取自 labels(Alertmanager),fallback 到 alert_type(其他來源)
- incident_service.py: DeadMansSwitch → backup/TYPE-1; NoAlertsReceived + PrometheusNotConnectedToAlertmanager → alertchain_health/TYPE-8M
- 補 2 個測試,全套 627 passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -182,10 +182,14 @@ class AlertAnalyzer:
|
||||
"""
|
||||
生成告警唯一指紋 (SHA256 Hash)
|
||||
|
||||
指紋組成: namespace:deployment:alert_type:target_resource
|
||||
指紋組成: namespace:deployment:alertname:target_resource
|
||||
|
||||
同一個告警模式(相同位置、相同類型)會產生相同指紋,
|
||||
用於識別重複告警並進行聚合。
|
||||
使用 alertname(而非 alert_type)確保不同告警名稱不共用指紋。
|
||||
原本用 alert_type 導致許多不同 alertname 都落入 "custom",
|
||||
造成同目標上的不同告警互相擋截(ADR-073 修復 2026-04-12 ogt)。
|
||||
|
||||
同一個告警名稱 + 同一目標 → 相同指紋 → 觸發 debounce 去重。
|
||||
不同告警名稱(即使同目標)→ 不同指紋 → 各自建立 Incident。
|
||||
"""
|
||||
# 從 labels 取得 deployment,如果沒有則用 target_resource
|
||||
deployment = ""
|
||||
@@ -194,8 +198,13 @@ class AlertAnalyzer:
|
||||
if not deployment:
|
||||
deployment = alert.target_resource
|
||||
|
||||
# alertname 優先取 labels,fallback 到 alert_type(非 Alertmanager 來源)
|
||||
alertname = (
|
||||
alert.labels.get("alertname", "") if alert.labels else ""
|
||||
) or alert.alert_type
|
||||
|
||||
# 組合指紋來源
|
||||
fingerprint_source = f"{alert.namespace}:{deployment}:{alert.alert_type}:{alert.target_resource}"
|
||||
fingerprint_source = f"{alert.namespace}:{deployment}:{alertname}:{alert.target_resource}"
|
||||
|
||||
# SHA256 Hash
|
||||
return hashlib.sha256(fingerprint_source.encode()).hexdigest()[:32]
|
||||
|
||||
@@ -130,11 +130,14 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
|
||||
return "config_drift", "TYPE-4D"
|
||||
|
||||
# 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷)
|
||||
# 2026-04-12 ogt: 補入 NoAlertsReceived + PrometheusNotConnectedToAlertmanager
|
||||
if alertname in (
|
||||
"AlertChainBroken_Alertmanager",
|
||||
"AlertChainBroken_Sentry",
|
||||
"NoAlertsReceived",
|
||||
"NoAlertsReceived2Hours",
|
||||
"AlertChainUnhealthy",
|
||||
"PrometheusNotConnectedToAlertmanager",
|
||||
):
|
||||
return "alertchain_health", "TYPE-8M"
|
||||
|
||||
@@ -171,8 +174,10 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
|
||||
"HostBackupFailed", "HostBackupStale", "HostBackupMissing",
|
||||
"BackupRestoreTestFailed", "BackupRestoreTestStale",
|
||||
}
|
||||
# 2026-04-12 ogt: 補入 DeadMansSwitch(HEARTBEAT_ALERT_NAMES 中但之前漏掉)
|
||||
if (
|
||||
"watchdog" in alertname_lower
|
||||
or "deadmansswitch" in alertname_lower
|
||||
or alertname == "Heartbeat"
|
||||
or alertname in _BACKUP_TYPE1_NAMES
|
||||
or alertname.startswith("HostBackup")
|
||||
|
||||
@@ -66,6 +66,12 @@ class TestInfoAlerts:
|
||||
ac, nt = classify_alert_early("Watchdog", "none", {})
|
||||
assert nt == "TYPE-1"
|
||||
|
||||
def test_deadmansswitch_heartbeat(self):
|
||||
# DeadMansSwitch 心跳 → TYPE-1(補入 2026-04-12 ogt)
|
||||
ac, nt = classify_alert_early("DeadMansSwitch", "warning", {})
|
||||
assert ac == "backup"
|
||||
assert nt == "TYPE-1"
|
||||
|
||||
def test_backup_critical_not_type1(self):
|
||||
# critical backup 告警應走各自 prefix,不是純資訊
|
||||
ac, nt = classify_alert_early("BACKUP_MISSING", "critical", {})
|
||||
@@ -115,6 +121,8 @@ class TestAlertchainHealth:
|
||||
"AlertChainBroken_Sentry",
|
||||
"NoAlertsReceived2Hours",
|
||||
"AlertChainUnhealthy",
|
||||
"NoAlertsReceived",
|
||||
"PrometheusNotConnectedToAlertmanager",
|
||||
])
|
||||
def test_alertchain_alerts(self, alertname):
|
||||
ac, nt = classify_alert_early(alertname, "critical", {})
|
||||
|
||||
Reference in New Issue
Block a user