feat(adr-075): 修復 Telegram 動態按鈕 4 個斷點 + 新增 7 種告警分類
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
斷點 A: decision_manager 提取 alert_category/notification_type 傳入 send_approval_card
斷點 B: send_approval_card 新增參數並傳遞至 _build_inline_keyboard
斷點 C: 互動型通知 (TYPE-3/4/4D/8M) 禁止發 SRE 群組,防 nonce 洩漏
斷點 D: _CATEGORY_BUTTONS k8s_workload → kubernetes + 新增 6 類按鈕組
classify_alert_early 新增: alertchain_health, flywheel_health, storage, devops_tool, external_site, ssl_cert, host_resource (從 infrastructure 分離)
Test: 52 classify + 664 total passed
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -246,6 +246,9 @@ async def _push_decision_to_telegram(
|
||||
)
|
||||
else:
|
||||
# TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card(按鈕組合由 alert_category 決定)
|
||||
# 2026-04-12 ogt: ADR-075 斷點 A 修復 — 從 Incident 提取 alert_category/notification_type
|
||||
_alert_category = getattr(incident, "alert_category", "") or ""
|
||||
# Prefer the notification type carried by the incident; otherwise fall back to
# the locally resolved _notif_type, or "" when neither is available.
# The fallback is parenthesised on purpose: a conditional expression binds
# looser than `or`, so without the parentheses a falsy _notif_type would also
# discard the value read from the incident.
_notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "")
|
||||
tg_result = await gateway.send_approval_card(
|
||||
approval_id=approval_id,
|
||||
risk_level=risk_level,
|
||||
@@ -265,6 +268,8 @@ async def _push_decision_to_telegram(
|
||||
nemotron_tool_model=nemotron_tool_model,
|
||||
nemotron_tool_backend=nemotron_tool_backend,
|
||||
incident_id=incident.incident_id,
|
||||
alert_category=_alert_category,
|
||||
notification_type=_notification_type,
|
||||
)
|
||||
|
||||
# 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續
|
||||
|
||||
@@ -121,20 +121,73 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
|
||||
Returns:
|
||||
tuple[str, str]: (alert_category, notification_type)
|
||||
"""
|
||||
# ADR-075 (2026-04-12): 完整重寫,修正 category 命名對齊 _build_inline_keyboard。
|
||||
# 統帥決議:kubernetes(非 k8s_workload)、host_resource 從 infrastructure 分離。
|
||||
alertname_lower = alertname.lower()
|
||||
|
||||
# 1. Config Drift(最高優先,類型特殊不受 severity 影響)
|
||||
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
|
||||
return "config_drift", "TYPE-4D"
|
||||
|
||||
# 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷)
|
||||
if alertname in (
|
||||
"AlertChainBroken_Alertmanager",
|
||||
"AlertChainBroken_Sentry",
|
||||
"NoAlertsReceived2Hours",
|
||||
"AlertChainUnhealthy",
|
||||
):
|
||||
return "alertchain_health", "TYPE-8M"
|
||||
|
||||
# 3. 飛輪/AI 系統健康(優先於 severity 判斷)
|
||||
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or alertname.startswith("Flywheel"):
|
||||
return "flywheel_health", "TYPE-8M"
|
||||
|
||||
# 4. 純資訊
|
||||
if severity in ("info", "none"):
|
||||
return "info", "TYPE-1"
|
||||
# Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳)
|
||||
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
|
||||
|
||||
# 5. Backup / Heartbeat(Velero 已由 K8s prefix 規則接管)
|
||||
if "watchdog" in alertname_lower or alertname == "Heartbeat":
|
||||
return "backup", "TYPE-1"
|
||||
if alertname.startswith(("Docker", "Host")):
|
||||
|
||||
# 6. 主機資源(從 infrastructure 分離,ADR-075 統帥決議)
|
||||
if alertname.startswith("Host"):
|
||||
return "host_resource", "TYPE-3"
|
||||
|
||||
# 7. Docker 容器
|
||||
if alertname.startswith("Docker"):
|
||||
return "infrastructure", "TYPE-3"
|
||||
|
||||
# 8. K8s(統帥決議:統一用 kubernetes,不用 k8s_workload)
|
||||
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
||||
return "kubernetes", "TYPE-3"
|
||||
|
||||
# 9. 資料庫
|
||||
if alertname.startswith(("Postgres", "Redis")):
|
||||
return "database", "TYPE-3"
|
||||
|
||||
# 10. 物件儲存
|
||||
if alertname == "MinIODown":
|
||||
return "storage", "TYPE-3"
|
||||
|
||||
# 11. DevOps 工具(ADR-075 修正:從 general 分離)
|
||||
if alertname in (
|
||||
"OpenClawDown", "SignOzDown", "GiteaDown", "HarborDown",
|
||||
"SentryDown", "AlertmanagerDown", "KaliScannerDown", "GiteaCIPipelineFailed",
|
||||
):
|
||||
return "devops_tool", "TYPE-3"
|
||||
|
||||
# 12. 外部網站(ADR-075 修正:從 general 分離)
|
||||
if alertname in (
|
||||
"MoWoooWorkDown", "TsenyangWebsiteDown",
|
||||
"StockWoooWorkDown", "BitanWoooWorkDown",
|
||||
):
|
||||
return "external_site", "TYPE-3"
|
||||
|
||||
# 13. SSL 憑證(ADR-075 修正:從 general 分離)
|
||||
if alertname.startswith(("ExternalSiteSSL", "TLSCert")):
|
||||
return "ssl_cert", "TYPE-3"
|
||||
|
||||
return "general", "TYPE-3"
|
||||
|
||||
|
||||
|
||||
@@ -1355,8 +1355,9 @@ class TelegramGateway:
|
||||
notification_type: 通知類型 (TYPE-1/2/3/4/4D)
|
||||
"""
|
||||
# TYPE-3 動態操作按鈕 (ADR-071-E)
|
||||
# ADR-075: 統一用 kubernetes(移除舊 k8s_workload),新增 storage/external_site/alertchain_health/flywheel_health
|
||||
_CATEGORY_BUTTONS: dict[str, list[tuple[str, str]]] = {
|
||||
"k8s_workload": [
|
||||
"kubernetes": [
|
||||
("🔄 重啟", f"action:restart:{incident_id}"),
|
||||
("📈 擴容", f"action:scale_up:{incident_id}"),
|
||||
("📉 縮容", f"action:scale_down:{incident_id}"),
|
||||
@@ -1379,6 +1380,22 @@ class TelegramGateway:
|
||||
("🔄 重啟服務", f"action:restart_service:{incident_id}"),
|
||||
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||
],
|
||||
"storage": [
|
||||
("🔄 重啟 MinIO", f"action:restart_service:{incident_id}"),
|
||||
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||
],
|
||||
"external_site": [
|
||||
("🔍 查健康狀態", f"action:check_health:{incident_id}"),
|
||||
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||
],
|
||||
"alertchain_health": [
|
||||
("🔍 查 Alertmanager", f"action:check_log:{incident_id}"),
|
||||
("🔄 重載設定", f"action:reload_config:{incident_id}"),
|
||||
],
|
||||
"flywheel_health": [
|
||||
("🔍 查飛輪狀態", f"action:check_health:{incident_id}"),
|
||||
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||
],
|
||||
"ai_system": [
|
||||
("🔀 切換 Provider", f"action:switch_provider:{incident_id}"),
|
||||
],
|
||||
@@ -1473,6 +1490,9 @@ class TelegramGateway:
|
||||
nemotron_tool_backend: str = "",
|
||||
# 2026-04-05 Claude Code: incident_id 用於 detail/reanalyze/history 按鈕
|
||||
incident_id: str = "",
|
||||
# 2026-04-12 ogt: ADR-075 告警分類與通知類型(斷點 B 修復)
|
||||
alert_category: str = "",
|
||||
notification_type: str = "",
|
||||
) -> dict:
|
||||
"""
|
||||
推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合)
|
||||
@@ -1553,11 +1573,14 @@ class TelegramGateway:
|
||||
|
||||
# 建立按鈕 (含自動調優)
|
||||
# 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕
|
||||
# ADR-075: 傳入 alert_category/notification_type 以啟用分類動態按鈕(斷點 B 修復)
|
||||
keyboard = self._build_inline_keyboard(
|
||||
approval_id=approval_id,
|
||||
include_auto_tuning=bool(auto_tuning_command),
|
||||
auto_tuning_command=auto_tuning_command,
|
||||
incident_id=incident_id,
|
||||
alert_category=alert_category,
|
||||
notification_type=notification_type,
|
||||
)
|
||||
|
||||
# 發送訊息
|
||||
@@ -1597,8 +1620,14 @@ class TelegramGateway:
|
||||
|
||||
# 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
|
||||
# 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
|
||||
# 非同步執行,失敗不影響告警主流程
|
||||
if settings.SRE_GROUP_CHAT_ID:
|
||||
# 2026-04-12 ogt: ADR-075 斷點 C 修復 — 含按鈕的互動型通知禁止發群組(nonce 洩漏)
|
||||
# TYPE-1/TYPE-2 → 可發群組(純資訊,無 callback 按鈕)
|
||||
# TYPE-3/TYPE-4/TYPE-4D/TYPE-8M → 僅 DM,不發群組
|
||||
_interactive_types = {"TYPE-3", "TYPE-4", "TYPE-4D", "TYPE-8M"}
|
||||
_is_interactive = notification_type in _interactive_types or (
|
||||
not notification_type and alert_category # 有分類但無明確型別 → 視為互動型
|
||||
)
|
||||
if settings.SRE_GROUP_CHAT_ID and not _is_interactive:
|
||||
asyncio.create_task(
|
||||
self._send_approval_card_to_group(
|
||||
approval_id=approval_id,
|
||||
|
||||
@@ -76,17 +76,144 @@ class TestInfrastructure:
|
||||
def test_docker_prefix(self):
|
||||
ac, nt = classify_alert_early("DockerContainerOOM", "critical", {})
|
||||
assert nt == "TYPE-3"
|
||||
assert ac == "infrastructure"
|
||||
|
||||
def test_host_prefix(self):
|
||||
ac, nt = classify_alert_early("HostHighCpuLoad", "warning", {})
|
||||
assert nt == "TYPE-3"
|
||||
assert ac == "infrastructure"
|
||||
assert ac == "infrastructure" # Docker → 保留 infrastructure
|
||||
|
||||
def test_docker_restart(self):
|
||||
ac, nt = classify_alert_early("DockerContainerRestarting", "warning", {})
|
||||
assert ac == "infrastructure"
|
||||
|
||||
# ADR-075: Host* 從 infrastructure 分離為 host_resource
|
||||
def test_host_prefix_is_host_resource(self):
|
||||
ac, nt = classify_alert_early("HostHighCpuLoad", "warning", {})
|
||||
assert nt == "TYPE-3"
|
||||
assert ac == "host_resource"
|
||||
|
||||
def test_host_down(self):
|
||||
ac, nt = classify_alert_early("HostDown", "critical", {})
|
||||
assert ac == "host_resource"
|
||||
|
||||
def test_host_memory(self):
|
||||
ac, nt = classify_alert_early("HostOutOfMemory", "warning", {})
|
||||
assert ac == "host_resource"
|
||||
|
||||
def test_host_disk(self):
|
||||
ac, nt = classify_alert_early("HostOutOfDiskSpace", "warning", {})
|
||||
assert ac == "host_resource"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# ADR-075: alertchain_health (TYPE-8M)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
class TestAlertchainHealth:
|
||||
@pytest.mark.parametrize("alertname", [
|
||||
"AlertChainBroken_Alertmanager",
|
||||
"AlertChainBroken_Sentry",
|
||||
"NoAlertsReceived2Hours",
|
||||
"AlertChainUnhealthy",
|
||||
])
|
||||
def test_alertchain_alerts(self, alertname):
|
||||
ac, nt = classify_alert_early(alertname, "critical", {})
|
||||
assert ac == "alertchain_health"
|
||||
assert nt == "TYPE-8M"
|
||||
|
||||
def test_alertchain_beats_severity_info(self):
|
||||
# 即使 severity=info,AlertChainBroken 也必須是 alertchain_health
|
||||
ac, nt = classify_alert_early("AlertChainBroken_Alertmanager", "info", {})
|
||||
assert ac == "alertchain_health"
|
||||
assert nt == "TYPE-8M"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# ADR-075: flywheel_health (TYPE-8M)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
class TestFlywheelHealth:
|
||||
def test_auto_repair_low_success(self):
|
||||
ac, nt = classify_alert_early("AutoRepairLowSuccessRate", "warning", {})
|
||||
assert ac == "flywheel_health"
|
||||
assert nt == "TYPE-8M"
|
||||
|
||||
def test_permanent_fix_required(self):
|
||||
ac, nt = classify_alert_early("PermanentFixRequired", "warning", {})
|
||||
assert ac == "flywheel_health"
|
||||
assert nt == "TYPE-8M"
|
||||
|
||||
def test_flywheel_prefix(self):
|
||||
ac, nt = classify_alert_early("FlywheelPlaybookZero", "critical", {})
|
||||
assert ac == "flywheel_health"
|
||||
assert nt == "TYPE-8M"
|
||||
|
||||
def test_flywheel_beats_severity_info(self):
|
||||
ac, nt = classify_alert_early("AutoRepairLowSuccessRate", "info", {})
|
||||
assert ac == "flywheel_health"
|
||||
assert nt == "TYPE-8M"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# ADR-075: storage (TYPE-3)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
class TestStorage:
|
||||
def test_minio_down(self):
|
||||
ac, nt = classify_alert_early("MinIODown", "critical", {})
|
||||
assert ac == "storage"
|
||||
assert nt == "TYPE-3"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# ADR-075: devops_tool (TYPE-3)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
class TestDevopsTool:
|
||||
@pytest.mark.parametrize("alertname", [
|
||||
"OpenClawDown",
|
||||
"SignOzDown",
|
||||
"GiteaDown",
|
||||
"HarborDown",
|
||||
"SentryDown",
|
||||
"AlertmanagerDown",
|
||||
"KaliScannerDown",
|
||||
"GiteaCIPipelineFailed",
|
||||
])
|
||||
def test_devops_tools(self, alertname):
|
||||
ac, nt = classify_alert_early(alertname, "critical", {})
|
||||
assert ac == "devops_tool"
|
||||
assert nt == "TYPE-3"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# ADR-075: external_site (TYPE-3)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
class TestExternalSite:
|
||||
@pytest.mark.parametrize("alertname", [
|
||||
"MoWoooWorkDown",
|
||||
"TsenyangWebsiteDown",
|
||||
"StockWoooWorkDown",
|
||||
"BitanWoooWorkDown",
|
||||
])
|
||||
def test_external_sites(self, alertname):
|
||||
ac, nt = classify_alert_early(alertname, "critical", {})
|
||||
assert ac == "external_site"
|
||||
assert nt == "TYPE-3"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# ADR-075: ssl_cert (TYPE-3)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
class TestSslCert:
|
||||
def test_external_site_ssl(self):
|
||||
ac, nt = classify_alert_early("ExternalSiteSSLExpiringSoon", "warning", {})
|
||||
assert ac == "ssl_cert"
|
||||
assert nt == "TYPE-3"
|
||||
|
||||
def test_tls_cert(self):
|
||||
ac, nt = classify_alert_early("TLSCertExpiryCritical", "critical", {})
|
||||
assert ac == "ssl_cert"
|
||||
assert nt == "TYPE-3"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# TYPE-3: Kubernetes
|
||||
|
||||
Reference in New Issue
Block a user