From 2cef2098d3e24389c67e7a7e36710b487192b36d Mon Sep 17 00:00:00 2001
From: OG T
Date: Sun, 12 Apr 2026 18:35:56 +0800
Subject: [PATCH] =?UTF-8?q?feat(adr-075):=20=E4=BF=AE=E5=BE=A9=20Telegram?=
 =?UTF-8?q?=20=E5=8B=95=E6=85=8B=E6=8C=89=E9=88=95=204=20=E5=80=8B?=
 =?UTF-8?q?=E6=96=B7=E9=BB=9E=20+=20=E6=96=B0=E5=A2=9E=207=20=E7=A8=AE?=
 =?UTF-8?q?=E5=91=8A=E8=AD=A6=E5=88=86=E9=A1=9E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

斷點 A: decision_manager 提取 alert_category/notification_type 傳入 send_approval_card
斷點 B: send_approval_card 新增參數並傳遞至 _build_inline_keyboard
斷點 C: 互動型通知 (TYPE-3/4/4D/8M) 禁止發 SRE 群組,防 nonce 洩漏
斷點 D: _CATEGORY_BUTTONS k8s_workload → kubernetes + 新增 6 類按鈕組

classify_alert_early 新增: alertchain_health, flywheel_health, storage,
devops_tool, external_site, ssl_cert, host_resource (從 infrastructure 分離)

Test: 52 classify + 664 total passed

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/src/services/decision_manager.py   |   6 +
 apps/api/src/services/incident_service.py   |  59 ++++++++-
 apps/api/src/services/telegram_gateway.py   |  35 ++++-
 apps/api/tests/test_classify_alert_early.py | 139 +++++++++++++++++++-
 4 files changed, 227 insertions(+), 12 deletions(-)

diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py
index df1085d1..8f397e3d 100644
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -246,6 +246,10 @@ async def _push_decision_to_telegram(
             )
         else:
             # TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card(按鈕組合由 alert_category 決定)
+            # 2026-04-12 ogt: ADR-075 斷點 A 修復 — 從 Incident 提取 alert_category/notification_type
+            _alert_category = getattr(incident, "alert_category", "") or ""
+            # Parens required: without them a falsy _notif_type discards incident.notification_type.
+            _notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "")
             tg_result = await gateway.send_approval_card(
                 approval_id=approval_id,
                 risk_level=risk_level,
@@ -265,6 +269,8 @@ async def _push_decision_to_telegram(
                 nemotron_tool_model=nemotron_tool_model,
                 nemotron_tool_backend=nemotron_tool_backend,
                 incident_id=incident.incident_id,
+                alert_category=_alert_category,
+                notification_type=_notification_type,
             )
 
             # 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續
diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py
index fc98b337..4c06e474 100644
--- a/apps/api/src/services/incident_service.py
+++ b/apps/api/src/services/incident_service.py
@@ -121,20 +121,73 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
     Returns:
         tuple[str, str]: (alert_category, notification_type)
     """
+    # ADR-075 (2026-04-12): 完整重寫,修正 category 命名對齊 _build_inline_keyboard。
+    # 統帥決議:kubernetes(非 k8s_workload)、host_resource 從 infrastructure 分離。
     alertname_lower = alertname.lower()
+
+    # 1. Config Drift(最高優先,類型特殊不受 severity 影響)
     if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
         return "config_drift", "TYPE-4D"
+
+    # 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷)
+    if alertname in (
+        "AlertChainBroken_Alertmanager",
+        "AlertChainBroken_Sentry",
+        "NoAlertsReceived2Hours",
+        "AlertChainUnhealthy",
+    ):
+        return "alertchain_health", "TYPE-8M"
+
+    # 3. 飛輪/AI 系統健康(優先於 severity 判斷)
+    if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or alertname.startswith("Flywheel"):
+        return "flywheel_health", "TYPE-8M"
+
+    # 4. 純資訊
     if severity in ("info", "none"):
         return "info", "TYPE-1"
-    # Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳)
-    if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
+
+    # 5. Backup / Heartbeat(Velero 已由 K8s prefix 規則接管)
+    if "watchdog" in alertname_lower or alertname == "Heartbeat":
         return "backup", "TYPE-1"
-    if alertname.startswith(("Docker", "Host")):
+
+    # 6. 主機資源(從 infrastructure 分離,ADR-075 統帥決議)
+    if alertname.startswith("Host"):
+        return "host_resource", "TYPE-3"
+
+    # 7. Docker 容器
+    if alertname.startswith("Docker"):
         return "infrastructure", "TYPE-3"
+
+    # 8. K8s(統帥決議:統一用 kubernetes,不用 k8s_workload)
     if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
         return "kubernetes", "TYPE-3"
+
+    # 9. 資料庫
     if alertname.startswith(("Postgres", "Redis")):
         return "database", "TYPE-3"
+
+    # 10. 物件儲存
+    if alertname == "MinIODown":
+        return "storage", "TYPE-3"
+
+    # 11. DevOps 工具(ADR-075 修正:從 general 分離)
+    if alertname in (
+        "OpenClawDown", "SignOzDown", "GiteaDown", "HarborDown",
+        "SentryDown", "AlertmanagerDown", "KaliScannerDown", "GiteaCIPipelineFailed",
+    ):
+        return "devops_tool", "TYPE-3"
+
+    # 12. 外部網站(ADR-075 修正:從 general 分離)
+    if alertname in (
+        "MoWoooWorkDown", "TsenyangWebsiteDown",
+        "StockWoooWorkDown", "BitanWoooWorkDown",
+    ):
+        return "external_site", "TYPE-3"
+
+    # 13. SSL 憑證(ADR-075 修正:從 general 分離)
+    if alertname.startswith(("ExternalSiteSSL", "TLSCert")):
+        return "ssl_cert", "TYPE-3"
+
     return "general", "TYPE-3"
 
 
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index d7c7d1aa..489b73a4 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -1355,8 +1355,9 @@ class TelegramGateway:
             notification_type: 通知類型 (TYPE-1/2/3/4/4D)
         """
         # TYPE-3 動態操作按鈕 (ADR-071-E)
+        # ADR-075: 統一用 kubernetes(移除舊 k8s_workload),新增 storage/external_site/alertchain_health/flywheel_health
         _CATEGORY_BUTTONS: dict[str, list[tuple[str, str]]] = {
-            "k8s_workload": [
+            "kubernetes": [
                 ("🔄 重啟", f"action:restart:{incident_id}"),
                 ("📈 擴容", f"action:scale_up:{incident_id}"),
                 ("📉 縮容", f"action:scale_down:{incident_id}"),
@@ -1379,6 +1380,22 @@ class TelegramGateway:
                 ("🔄 重啟服務", f"action:restart_service:{incident_id}"),
                 ("📋 查 Log", f"action:check_log:{incident_id}"),
             ],
+            "storage": [
+                ("🔄 重啟 MinIO", f"action:restart_service:{incident_id}"),
+                ("📋 查 Log", f"action:check_log:{incident_id}"),
+            ],
+            "external_site": [
+                ("🔍 查健康狀態", f"action:check_health:{incident_id}"),
+                ("📋 查 Log", f"action:check_log:{incident_id}"),
+            ],
+            "alertchain_health": [
+                ("🔍 查 Alertmanager", f"action:check_log:{incident_id}"),
+                ("🔄 重載設定", f"action:reload_config:{incident_id}"),
+            ],
+            "flywheel_health": [
+                ("🔍 查飛輪狀態", f"action:check_health:{incident_id}"),
+                ("📋 查 Log", f"action:check_log:{incident_id}"),
+            ],
             "ai_system": [
                 ("🔀 切換 Provider", f"action:switch_provider:{incident_id}"),
             ],
@@ -1473,6 +1490,9 @@ class TelegramGateway:
         nemotron_tool_backend: str = "",
         # 2026-04-05 Claude Code: incident_id 用於 detail/reanalyze/history 按鈕
         incident_id: str = "",
+        # 2026-04-12 ogt: ADR-075 告警分類與通知類型(斷點 B 修復)
+        alert_category: str = "",
+        notification_type: str = "",
     ) -> dict:
         """
         推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合)
@@ -1553,11 +1573,14 @@ class TelegramGateway:
 
         # 建立按鈕 (含自動調優)
         # 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕
+        # ADR-075: 傳入 alert_category/notification_type 以啟用分類動態按鈕(斷點 B 修復)
         keyboard = self._build_inline_keyboard(
             approval_id=approval_id,
             include_auto_tuning=bool(auto_tuning_command),
             auto_tuning_command=auto_tuning_command,
             incident_id=incident_id,
+            alert_category=alert_category,
+            notification_type=notification_type,
         )
 
         # 發送訊息
@@ -1597,8 +1620,14 @@ class TelegramGateway:
 
         # 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
         # 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
-        # 非同步執行,失敗不影響告警主流程
-        if settings.SRE_GROUP_CHAT_ID:
+        # 2026-04-12 ogt: ADR-075 斷點 C 修復 — 含按鈕的互動型通知禁止發群組(nonce 洩漏)
+        # TYPE-1/TYPE-2 → 可發群組(純資訊,無 callback 按鈕)
+        # TYPE-3/TYPE-4/TYPE-4D/TYPE-8M → 僅 DM,不發群組
+        _interactive_types = {"TYPE-3", "TYPE-4", "TYPE-4D", "TYPE-8M"}
+        _is_interactive = notification_type in _interactive_types or (
+            not notification_type and alert_category  # 有分類但無明確型別 → 視為互動型
+        )
+        if settings.SRE_GROUP_CHAT_ID and not _is_interactive:
             asyncio.create_task(
                 self._send_approval_card_to_group(
                     approval_id=approval_id,
diff --git a/apps/api/tests/test_classify_alert_early.py b/apps/api/tests/test_classify_alert_early.py
index 22393bda..8f747767 100644
--- a/apps/api/tests/test_classify_alert_early.py
+++ b/apps/api/tests/test_classify_alert_early.py
@@ -76,17 +76,144 @@ class TestInfrastructure:
     def test_docker_prefix(self):
         ac, nt = classify_alert_early("DockerContainerOOM", "critical", {})
         assert nt == "TYPE-3"
-        assert ac == "infrastructure"
-
-    def test_host_prefix(self):
-        ac, nt = classify_alert_early("HostHighCpuLoad", "warning", {})
-        assert nt == "TYPE-3"
-        assert ac == "infrastructure"
+        assert ac == "infrastructure"  # Docker → 保留 infrastructure
 
     def test_docker_restart(self):
         ac, nt = classify_alert_early("DockerContainerRestarting", "warning", {})
         assert ac == "infrastructure"
 
+    # ADR-075: Host* 從 infrastructure 分離為 host_resource
+    def test_host_prefix_is_host_resource(self):
+        ac, nt = classify_alert_early("HostHighCpuLoad", "warning", {})
+        assert nt == "TYPE-3"
+        assert ac == "host_resource"
+
+    def test_host_down(self):
+        ac, nt = classify_alert_early("HostDown", "critical", {})
+        assert ac == "host_resource"
+
+    def test_host_memory(self):
+        ac, nt = classify_alert_early("HostOutOfMemory", "warning", {})
+        assert ac == "host_resource"
+
+    def test_host_disk(self):
+        ac, nt = classify_alert_early("HostOutOfDiskSpace", "warning", {})
+        assert ac == "host_resource"
+
+
+# --------------------------------------------------------------------------- #
+# ADR-075: alertchain_health (TYPE-8M)
+# --------------------------------------------------------------------------- #
+
+class TestAlertchainHealth:
+    @pytest.mark.parametrize("alertname", [
+        "AlertChainBroken_Alertmanager",
+        "AlertChainBroken_Sentry",
+        "NoAlertsReceived2Hours",
+        "AlertChainUnhealthy",
+    ])
+    def test_alertchain_alerts(self, alertname):
+        ac, nt = classify_alert_early(alertname, "critical", {})
+        assert ac == "alertchain_health"
+        assert nt == "TYPE-8M"
+
+    def test_alertchain_beats_severity_info(self):
+        # 即使 severity=info,AlertChainBroken 也必須是 alertchain_health
+        ac, nt = classify_alert_early("AlertChainBroken_Alertmanager", "info", {})
+        assert ac == "alertchain_health"
+        assert nt == "TYPE-8M"
+
+
+# --------------------------------------------------------------------------- #
+# ADR-075: flywheel_health (TYPE-8M)
+# --------------------------------------------------------------------------- #
+
+class TestFlywheelHealth:
+    def test_auto_repair_low_success(self):
+        ac, nt = classify_alert_early("AutoRepairLowSuccessRate", "warning", {})
+        assert ac == "flywheel_health"
+        assert nt == "TYPE-8M"
+
+    def test_permanent_fix_required(self):
+        ac, nt = classify_alert_early("PermanentFixRequired", "warning", {})
+        assert ac == "flywheel_health"
+        assert nt == "TYPE-8M"
+
+    def test_flywheel_prefix(self):
+        ac, nt = classify_alert_early("FlywheelPlaybookZero", "critical", {})
+        assert ac == "flywheel_health"
+        assert nt == "TYPE-8M"
+
+    def test_flywheel_beats_severity_info(self):
+        ac, nt = classify_alert_early("AutoRepairLowSuccessRate", "info", {})
+        assert ac == "flywheel_health"
+        assert nt == "TYPE-8M"
+
+
+# --------------------------------------------------------------------------- #
+# ADR-075: storage (TYPE-3)
+# --------------------------------------------------------------------------- #
+
+class TestStorage:
+    def test_minio_down(self):
+        ac, nt = classify_alert_early("MinIODown", "critical", {})
+        assert ac == "storage"
+        assert nt == "TYPE-3"
+
+
+# --------------------------------------------------------------------------- #
+# ADR-075: devops_tool (TYPE-3)
+# --------------------------------------------------------------------------- #
+
+class TestDevopsTool:
+    @pytest.mark.parametrize("alertname", [
+        "OpenClawDown",
+        "SignOzDown",
+        "GiteaDown",
+        "HarborDown",
+        "SentryDown",
+        "AlertmanagerDown",
+        "KaliScannerDown",
+        "GiteaCIPipelineFailed",
+    ])
+    def test_devops_tools(self, alertname):
+        ac, nt = classify_alert_early(alertname, "critical", {})
+        assert ac == "devops_tool"
+        assert nt == "TYPE-3"
+
+
+# --------------------------------------------------------------------------- #
+# ADR-075: external_site (TYPE-3)
+# --------------------------------------------------------------------------- #
+
+class TestExternalSite:
+    @pytest.mark.parametrize("alertname", [
+        "MoWoooWorkDown",
+        "TsenyangWebsiteDown",
+        "StockWoooWorkDown",
+        "BitanWoooWorkDown",
+    ])
+    def test_external_sites(self, alertname):
+        ac, nt = classify_alert_early(alertname, "critical", {})
+        assert ac == "external_site"
+        assert nt == "TYPE-3"
+
+
+# --------------------------------------------------------------------------- #
+# ADR-075: ssl_cert (TYPE-3)
+# --------------------------------------------------------------------------- #
+
+class TestSslCert:
+    def test_external_site_ssl(self):
+        ac, nt = classify_alert_early("ExternalSiteSSLExpiringSoon", "warning", {})
+        assert ac == "ssl_cert"
+        assert nt == "TYPE-3"
+
+    def test_tls_cert(self):
+        ac, nt = classify_alert_early("TLSCertExpiryCritical", "critical", {})
+        assert ac == "ssl_cert"
+        assert nt == "TYPE-3"
+
 
 # --------------------------------------------------------------------------- #
 # TYPE-3: Kubernetes