feat(adr-075): 修復 Telegram 動態按鈕 4 個斷點 + 新增 7 種告警分類
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

斷點 A: decision_manager 提取 alert_category/notification_type 傳入 send_approval_card
斷點 B: send_approval_card 新增參數並傳遞至 _build_inline_keyboard
斷點 C: 互動型通知 (TYPE-3/4/4D/8M) 禁止發 SRE 群組,防 nonce 洩漏
斷點 D: _CATEGORY_BUTTONS k8s_workload → kubernetes + 新增 6 類按鈕組

classify_alert_early 新增: alertchain_health, flywheel_health, storage,
devops_tool, external_site, ssl_cert, host_resource (從 infrastructure 分離)
Test: 52 classify + 664 total passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 18:35:56 +08:00
parent db282cd0e9
commit 2cef2098d3
4 changed files with 226 additions and 12 deletions

View File

@@ -246,6 +246,9 @@ async def _push_decision_to_telegram(
) )
else: else:
# TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card按鈕組合由 alert_category 決定) # TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card按鈕組合由 alert_category 決定)
# 2026-04-12 ogt: ADR-075 breakpoint A fix — read alert_category / notification_type
# off the Incident so they can be forwarded to send_approval_card.
_alert_category = getattr(incident, "alert_category", "") or ""
# The fallback must be parenthesised: a conditional expression binds looser than
# `or`, so without the parentheses the whole right-hand side collapses to ""
# whenever _notif_type is falsy, discarding the Incident's own notification_type.
_notification_type = getattr(incident, "notification_type", "") or (
    _notif_type.value if _notif_type else ""
)
tg_result = await gateway.send_approval_card( tg_result = await gateway.send_approval_card(
approval_id=approval_id, approval_id=approval_id,
risk_level=risk_level, risk_level=risk_level,
@@ -265,6 +268,8 @@ async def _push_decision_to_telegram(
nemotron_tool_model=nemotron_tool_model, nemotron_tool_model=nemotron_tool_model,
nemotron_tool_backend=nemotron_tool_backend, nemotron_tool_backend=nemotron_tool_backend,
incident_id=incident.incident_id, incident_id=incident.incident_id,
alert_category=_alert_category,
notification_type=_notification_type,
) )
# 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續 # 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續

View File

@@ -121,20 +121,73 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
Returns: Returns:
tuple[str, str]: (alert_category, notification_type) tuple[str, str]: (alert_category, notification_type)
""" """
# ADR-075 (2026-04-12): 完整重寫,修正 category 命名對齊 _build_inline_keyboard。
# 統帥決議:kubernetes(非 k8s_workload)、host_resource 從 infrastructure 分離。
alertname_lower = alertname.lower() alertname_lower = alertname.lower()
# 1. Config Drift(最高優先,類型特殊,不受 severity 影響)
if alertname in ("ConfigurationDrift", "KubeConfigDrift"): if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
return "config_drift", "TYPE-4D" return "config_drift", "TYPE-4D"
# 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷)
if alertname in (
"AlertChainBroken_Alertmanager",
"AlertChainBroken_Sentry",
"NoAlertsReceived2Hours",
"AlertChainUnhealthy",
):
return "alertchain_health", "TYPE-8M"
# 3. 飛輪/AI 系統健康(優先於 severity 判斷)
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or alertname.startswith("Flywheel"):
return "flywheel_health", "TYPE-8M"
# 4. 純資訊
if severity in ("info", "none"): if severity in ("info", "none"):
return "info", "TYPE-1" return "info", "TYPE-1"
# Watchdog/Heartbeat 永遠是 TYPE-1Alertmanager 心跳)
if "watchdog" in alertname_lower or alertname in ("Heartbeat",): # 5. Backup / HeartbeatVelero 已由 K8s prefix 規則接管)
if "watchdog" in alertname_lower or alertname == "Heartbeat":
return "backup", "TYPE-1" return "backup", "TYPE-1"
if alertname.startswith(("Docker", "Host")):
# 6. 主機資源(從 infrastructure 分離,ADR-075 統帥決議)
if alertname.startswith("Host"):
return "host_resource", "TYPE-3"
# 7. Docker 容器
if alertname.startswith("Docker"):
return "infrastructure", "TYPE-3" return "infrastructure", "TYPE-3"
# 8. K8s(統帥決議:統一用 kubernetes,不用 k8s_workload)
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")): if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
return "kubernetes", "TYPE-3" return "kubernetes", "TYPE-3"
# 9. 資料庫
if alertname.startswith(("Postgres", "Redis")): if alertname.startswith(("Postgres", "Redis")):
return "database", "TYPE-3" return "database", "TYPE-3"
# 10. 物件儲存
if alertname == "MinIODown":
return "storage", "TYPE-3"
# 11. DevOps 工具(ADR-075 修正:從 general 分離)
if alertname in (
"OpenClawDown", "SignOzDown", "GiteaDown", "HarborDown",
"SentryDown", "AlertmanagerDown", "KaliScannerDown", "GiteaCIPipelineFailed",
):
return "devops_tool", "TYPE-3"
# 12. 外部網站(ADR-075 修正:從 general 分離)
if alertname in (
"MoWoooWorkDown", "TsenyangWebsiteDown",
"StockWoooWorkDown", "BitanWoooWorkDown",
):
return "external_site", "TYPE-3"
# 13. SSL 憑證(ADR-075 修正:從 general 分離)
if alertname.startswith(("ExternalSiteSSL", "TLSCert")):
return "ssl_cert", "TYPE-3"
return "general", "TYPE-3" return "general", "TYPE-3"

View File

@@ -1355,8 +1355,9 @@ class TelegramGateway:
notification_type: 通知類型 (TYPE-1/2/3/4/4D) notification_type: 通知類型 (TYPE-1/2/3/4/4D)
""" """
# TYPE-3 動態操作按鈕 (ADR-071-E) # TYPE-3 動態操作按鈕 (ADR-071-E)
# ADR-075: 統一用 kubernetes,移除舊 k8s_workload,新增 storage/external_site/alertchain_health/flywheel_health
_CATEGORY_BUTTONS: dict[str, list[tuple[str, str]]] = { _CATEGORY_BUTTONS: dict[str, list[tuple[str, str]]] = {
"k8s_workload": [ "kubernetes": [
("🔄 重啟", f"action:restart:{incident_id}"), ("🔄 重啟", f"action:restart:{incident_id}"),
("📈 擴容", f"action:scale_up:{incident_id}"), ("📈 擴容", f"action:scale_up:{incident_id}"),
("📉 縮容", f"action:scale_down:{incident_id}"), ("📉 縮容", f"action:scale_down:{incident_id}"),
@@ -1379,6 +1380,22 @@ class TelegramGateway:
("🔄 重啟服務", f"action:restart_service:{incident_id}"), ("🔄 重啟服務", f"action:restart_service:{incident_id}"),
("📋 查 Log", f"action:check_log:{incident_id}"), ("📋 查 Log", f"action:check_log:{incident_id}"),
], ],
"storage": [
("🔄 重啟 MinIO", f"action:restart_service:{incident_id}"),
("📋 查 Log", f"action:check_log:{incident_id}"),
],
"external_site": [
("🔍 查健康狀態", f"action:check_health:{incident_id}"),
("📋 查 Log", f"action:check_log:{incident_id}"),
],
"alertchain_health": [
("🔍 查 Alertmanager", f"action:check_log:{incident_id}"),
("🔄 重載設定", f"action:reload_config:{incident_id}"),
],
"flywheel_health": [
("🔍 查飛輪狀態", f"action:check_health:{incident_id}"),
("📋 查 Log", f"action:check_log:{incident_id}"),
],
"ai_system": [ "ai_system": [
("🔀 切換 Provider", f"action:switch_provider:{incident_id}"), ("🔀 切換 Provider", f"action:switch_provider:{incident_id}"),
], ],
@@ -1473,6 +1490,9 @@ class TelegramGateway:
nemotron_tool_backend: str = "", nemotron_tool_backend: str = "",
# 2026-04-05 Claude Code: incident_id 用於 detail/reanalyze/history 按鈕 # 2026-04-05 Claude Code: incident_id 用於 detail/reanalyze/history 按鈕
incident_id: str = "", incident_id: str = "",
# 2026-04-12 ogt: ADR-075 告警分類與通知類型(斷點 B 修復)
alert_category: str = "",
notification_type: str = "",
) -> dict: ) -> dict:
""" """
推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合) 推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合)
@@ -1553,11 +1573,14 @@ class TelegramGateway:
# 建立按鈕 (含自動調優) # 建立按鈕 (含自動調優)
# 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕 # 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕
# ADR-075: 傳入 alert_category/notification_type 以啟用分類動態按鈕(斷點 B 修復)
keyboard = self._build_inline_keyboard( keyboard = self._build_inline_keyboard(
approval_id=approval_id, approval_id=approval_id,
include_auto_tuning=bool(auto_tuning_command), include_auto_tuning=bool(auto_tuning_command),
auto_tuning_command=auto_tuning_command, auto_tuning_command=auto_tuning_command,
incident_id=incident_id, incident_id=incident_id,
alert_category=alert_category,
notification_type=notification_type,
) )
# 發送訊息 # 發送訊息
@@ -1597,8 +1620,14 @@ class TelegramGateway:
# 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053) # 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
# 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致 # 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
# 非同步執行,失敗不影響告警主流程 # 2026-04-12 ogt: ADR-075 斷點 C 修復 — 含按鈕的互動型通知禁止發群組nonce 洩漏)
if settings.SRE_GROUP_CHAT_ID: # TYPE-1/TYPE-2 → 可發群組(純資訊,無 callback 按鈕)
# TYPE-3/TYPE-4/TYPE-4D/TYPE-8M → 僅 DM,不發群組
_interactive_types = {"TYPE-3", "TYPE-4", "TYPE-4D", "TYPE-8M"}
_is_interactive = notification_type in _interactive_types or (
not notification_type and alert_category # 有分類但無明確型別 → 視為互動型
)
if settings.SRE_GROUP_CHAT_ID and not _is_interactive:
asyncio.create_task( asyncio.create_task(
self._send_approval_card_to_group( self._send_approval_card_to_group(
approval_id=approval_id, approval_id=approval_id,

View File

@@ -76,17 +76,144 @@ class TestInfrastructure:
def test_docker_prefix(self): def test_docker_prefix(self):
ac, nt = classify_alert_early("DockerContainerOOM", "critical", {}) ac, nt = classify_alert_early("DockerContainerOOM", "critical", {})
assert nt == "TYPE-3" assert nt == "TYPE-3"
assert ac == "infrastructure" assert ac == "infrastructure" # Docker → 保留 infrastructure
def test_host_prefix(self):
ac, nt = classify_alert_early("HostHighCpuLoad", "warning", {})
assert nt == "TYPE-3"
assert ac == "infrastructure"
def test_docker_restart(self): def test_docker_restart(self):
ac, nt = classify_alert_early("DockerContainerRestarting", "warning", {}) ac, nt = classify_alert_early("DockerContainerRestarting", "warning", {})
assert ac == "infrastructure" assert ac == "infrastructure"
# ADR-075: Host* 從 infrastructure 分離為 host_resource
def test_host_prefix_is_host_resource(self):
    """A ``Host``-prefixed warning alert yields host_resource / TYPE-3."""
    category, notif_type = classify_alert_early("HostHighCpuLoad", "warning", {})
    assert (notif_type, category) == ("TYPE-3", "host_resource")
def test_host_down(self):
    """``HostDown`` at critical severity is categorised as host_resource."""
    category, _notif_type = classify_alert_early("HostDown", "critical", {})
    assert category == "host_resource"
def test_host_memory(self):
    """``HostOutOfMemory`` at warning severity is categorised as host_resource."""
    category, _notif_type = classify_alert_early("HostOutOfMemory", "warning", {})
    assert category == "host_resource"
def test_host_disk(self):
    """``HostOutOfDiskSpace`` at warning severity is categorised as host_resource."""
    category, _notif_type = classify_alert_early("HostOutOfDiskSpace", "warning", {})
    assert category == "host_resource"
# --------------------------------------------------------------------------- #
# ADR-075: alertchain_health (TYPE-8M)
# --------------------------------------------------------------------------- #
class TestAlertchainHealth:
    """Alert-chain health alerts must classify as alertchain_health / TYPE-8M."""

    @pytest.mark.parametrize(
        "alertname",
        [
            "AlertChainBroken_Alertmanager",
            "AlertChainBroken_Sentry",
            "NoAlertsReceived2Hours",
            "AlertChainUnhealthy",
        ],
    )
    def test_alertchain_alerts(self, alertname):
        """Each listed alert name, sent as critical, yields alertchain_health / TYPE-8M."""
        category, notif_type = classify_alert_early(alertname, "critical", {})
        assert (category, notif_type) == ("alertchain_health", "TYPE-8M")

    def test_alertchain_beats_severity_info(self):
        """Even with severity=info, AlertChainBroken must stay alertchain_health."""
        category, notif_type = classify_alert_early(
            "AlertChainBroken_Alertmanager", "info", {}
        )
        assert (category, notif_type) == ("alertchain_health", "TYPE-8M")
# --------------------------------------------------------------------------- #
# ADR-075: flywheel_health (TYPE-8M)
# --------------------------------------------------------------------------- #
class TestFlywheelHealth:
    """Flywheel / auto-repair alerts must classify as flywheel_health / TYPE-8M."""

    def test_auto_repair_low_success(self):
        """``AutoRepairLowSuccessRate`` at warning severity."""
        category, notif_type = classify_alert_early(
            "AutoRepairLowSuccessRate", "warning", {}
        )
        assert (category, notif_type) == ("flywheel_health", "TYPE-8M")

    def test_permanent_fix_required(self):
        """``PermanentFixRequired`` at warning severity."""
        category, notif_type = classify_alert_early(
            "PermanentFixRequired", "warning", {}
        )
        assert (category, notif_type) == ("flywheel_health", "TYPE-8M")

    def test_flywheel_prefix(self):
        """An alert name starting with ``Flywheel``, at critical severity."""
        category, notif_type = classify_alert_early(
            "FlywheelPlaybookZero", "critical", {}
        )
        assert (category, notif_type) == ("flywheel_health", "TYPE-8M")

    def test_flywheel_beats_severity_info(self):
        """severity=info must not downgrade a flywheel alert."""
        category, notif_type = classify_alert_early(
            "AutoRepairLowSuccessRate", "info", {}
        )
        assert (category, notif_type) == ("flywheel_health", "TYPE-8M")
# --------------------------------------------------------------------------- #
# ADR-075: storage (TYPE-3)
# --------------------------------------------------------------------------- #
class TestStorage:
    """Object-storage alerts must classify as storage / TYPE-3."""

    def test_minio_down(self):
        """``MinIODown`` at critical severity yields storage / TYPE-3."""
        category, notif_type = classify_alert_early("MinIODown", "critical", {})
        assert (category, notif_type) == ("storage", "TYPE-3")
# --------------------------------------------------------------------------- #
# ADR-075: devops_tool (TYPE-3)
# --------------------------------------------------------------------------- #
class TestDevopsTool:
    """DevOps tooling alerts must classify as devops_tool / TYPE-3."""

    @pytest.mark.parametrize(
        "alertname",
        [
            "OpenClawDown",
            "SignOzDown",
            "GiteaDown",
            "HarborDown",
            "SentryDown",
            "AlertmanagerDown",
            "KaliScannerDown",
            "GiteaCIPipelineFailed",
        ],
    )
    def test_devops_tools(self, alertname):
        """Each listed tool alert, sent as critical, yields devops_tool / TYPE-3."""
        category, notif_type = classify_alert_early(alertname, "critical", {})
        assert (category, notif_type) == ("devops_tool", "TYPE-3")
# --------------------------------------------------------------------------- #
# ADR-075: external_site (TYPE-3)
# --------------------------------------------------------------------------- #
class TestExternalSite:
    """External website alerts must classify as external_site / TYPE-3."""

    @pytest.mark.parametrize(
        "alertname",
        [
            "MoWoooWorkDown",
            "TsenyangWebsiteDown",
            "StockWoooWorkDown",
            "BitanWoooWorkDown",
        ],
    )
    def test_external_sites(self, alertname):
        """Each listed site alert, sent as critical, yields external_site / TYPE-3."""
        category, notif_type = classify_alert_early(alertname, "critical", {})
        assert (category, notif_type) == ("external_site", "TYPE-3")
# --------------------------------------------------------------------------- #
# ADR-075: ssl_cert (TYPE-3)
# --------------------------------------------------------------------------- #
class TestSslCert:
    """Certificate alerts must classify as ssl_cert / TYPE-3."""

    def test_external_site_ssl(self):
        """An ``ExternalSiteSSL``-prefixed warning alert yields ssl_cert / TYPE-3."""
        category, notif_type = classify_alert_early(
            "ExternalSiteSSLExpiringSoon", "warning", {}
        )
        assert (category, notif_type) == ("ssl_cert", "TYPE-3")

    def test_tls_cert(self):
        """A ``TLSCert``-prefixed critical alert yields ssl_cert / TYPE-3."""
        category, notif_type = classify_alert_early(
            "TLSCertExpiryCritical", "critical", {}
        )
        assert (category, notif_type) == ("ssl_cert", "TYPE-3")
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
# TYPE-3: Kubernetes # TYPE-3: Kubernetes