feat(adr-075): 修復 Telegram 動態按鈕 4 個斷點 + 新增 7 種告警分類
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
斷點 A: decision_manager 提取 alert_category/notification_type 傳入 send_approval_card
斷點 B: send_approval_card 新增參數並傳遞至 _build_inline_keyboard
斷點 C: 互動型通知 (TYPE-3/4/4D/8M) 禁止發 SRE 群組,防 nonce 洩漏
斷點 D: _CATEGORY_BUTTONS k8s_workload → kubernetes + 新增 6 類按鈕組
classify_alert_early 新增: alertchain_health, flywheel_health, storage, devops_tool, external_site, ssl_cert, host_resource (從 infrastructure 分離)
Test: 52 classify + 664 total passed
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -246,6 +246,9 @@ async def _push_decision_to_telegram(
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card(按鈕組合由 alert_category 決定)
|
# TYPE-2 / TYPE-3 / TYPE-4 都走 send_approval_card(按鈕組合由 alert_category 決定)
|
||||||
|
# 2026-04-12 ogt: ADR-075 斷點 A 修復 — 從 Incident 提取 alert_category/notification_type
|
||||||
|
_alert_category = getattr(incident, "alert_category", "") or ""
|
||||||
|
# ADR-075 breakpoint A: prefer the notification type stored on the incident and
# fall back to the locally resolved _notif_type only when the incident has none.
# The fallback must be parenthesized: a conditional expression binds looser than
# `or`, so without parentheses a falsy _notif_type would discard the incident's
# own notification_type and pass "" downstream.
_notification_type = getattr(incident, "notification_type", "") or (_notif_type.value if _notif_type else "")
|
||||||
tg_result = await gateway.send_approval_card(
|
tg_result = await gateway.send_approval_card(
|
||||||
approval_id=approval_id,
|
approval_id=approval_id,
|
||||||
risk_level=risk_level,
|
risk_level=risk_level,
|
||||||
@@ -265,6 +268,8 @@ async def _push_decision_to_telegram(
|
|||||||
nemotron_tool_model=nemotron_tool_model,
|
nemotron_tool_model=nemotron_tool_model,
|
||||||
nemotron_tool_backend=nemotron_tool_backend,
|
nemotron_tool_backend=nemotron_tool_backend,
|
||||||
incident_id=incident.incident_id,
|
incident_id=incident.incident_id,
|
||||||
|
alert_category=_alert_category,
|
||||||
|
notification_type=_notification_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續
|
# 2026-04-09 Claude Sonnet 4.6: 存 message_id → 後續狀態更新在原訊息延續
|
||||||
|
|||||||
@@ -121,20 +121,73 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
|
|||||||
Returns:
|
Returns:
|
||||||
tuple[str, str]: (alert_category, notification_type)
|
tuple[str, str]: (alert_category, notification_type)
|
||||||
"""
|
"""
|
||||||
|
# ADR-075 (2026-04-12): 完整重寫,修正 category 命名對齊 _build_inline_keyboard。
|
||||||
|
# 統帥決議:kubernetes(非 k8s_workload)、host_resource 從 infrastructure 分離。
|
||||||
alertname_lower = alertname.lower()
|
alertname_lower = alertname.lower()
|
||||||
|
|
||||||
|
# 1. Config Drift(最高優先,類型特殊不受 severity 影響)
|
||||||
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
|
if alertname in ("ConfigurationDrift", "KubeConfigDrift"):
|
||||||
return "config_drift", "TYPE-4D"
|
return "config_drift", "TYPE-4D"
|
||||||
|
|
||||||
|
# 2. 告警鏈路健康(meta-monitoring,優先於 severity 判斷)
|
||||||
|
if alertname in (
|
||||||
|
"AlertChainBroken_Alertmanager",
|
||||||
|
"AlertChainBroken_Sentry",
|
||||||
|
"NoAlertsReceived2Hours",
|
||||||
|
"AlertChainUnhealthy",
|
||||||
|
):
|
||||||
|
return "alertchain_health", "TYPE-8M"
|
||||||
|
|
||||||
|
# 3. 飛輪/AI 系統健康(優先於 severity 判斷)
|
||||||
|
if alertname in ("AutoRepairLowSuccessRate", "PermanentFixRequired") or alertname.startswith("Flywheel"):
|
||||||
|
return "flywheel_health", "TYPE-8M"
|
||||||
|
|
||||||
|
# 4. 純資訊
|
||||||
if severity in ("info", "none"):
|
if severity in ("info", "none"):
|
||||||
return "info", "TYPE-1"
|
return "info", "TYPE-1"
|
||||||
# Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳)
|
|
||||||
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
|
# 5. Backup / Heartbeat(Velero 已由 K8s prefix 規則接管)
|
||||||
|
if "watchdog" in alertname_lower or alertname == "Heartbeat":
|
||||||
return "backup", "TYPE-1"
|
return "backup", "TYPE-1"
|
||||||
if alertname.startswith(("Docker", "Host")):
|
|
||||||
|
# 6. 主機資源(從 infrastructure 分離,ADR-075 統帥決議)
|
||||||
|
if alertname.startswith("Host"):
|
||||||
|
return "host_resource", "TYPE-3"
|
||||||
|
|
||||||
|
# 7. Docker 容器
|
||||||
|
if alertname.startswith("Docker"):
|
||||||
return "infrastructure", "TYPE-3"
|
return "infrastructure", "TYPE-3"
|
||||||
|
|
||||||
|
# 8. K8s(統帥決議:統一用 kubernetes,不用 k8s_workload)
|
||||||
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
if alertname.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
||||||
return "kubernetes", "TYPE-3"
|
return "kubernetes", "TYPE-3"
|
||||||
|
|
||||||
|
# 9. 資料庫
|
||||||
if alertname.startswith(("Postgres", "Redis")):
|
if alertname.startswith(("Postgres", "Redis")):
|
||||||
return "database", "TYPE-3"
|
return "database", "TYPE-3"
|
||||||
|
|
||||||
|
# 10. 物件儲存
|
||||||
|
if alertname == "MinIODown":
|
||||||
|
return "storage", "TYPE-3"
|
||||||
|
|
||||||
|
# 11. DevOps 工具(ADR-075 修正:從 general 分離)
|
||||||
|
if alertname in (
|
||||||
|
"OpenClawDown", "SignOzDown", "GiteaDown", "HarborDown",
|
||||||
|
"SentryDown", "AlertmanagerDown", "KaliScannerDown", "GiteaCIPipelineFailed",
|
||||||
|
):
|
||||||
|
return "devops_tool", "TYPE-3"
|
||||||
|
|
||||||
|
# 12. 外部網站(ADR-075 修正:從 general 分離)
|
||||||
|
if alertname in (
|
||||||
|
"MoWoooWorkDown", "TsenyangWebsiteDown",
|
||||||
|
"StockWoooWorkDown", "BitanWoooWorkDown",
|
||||||
|
):
|
||||||
|
return "external_site", "TYPE-3"
|
||||||
|
|
||||||
|
# 13. SSL 憑證(ADR-075 修正:從 general 分離)
|
||||||
|
if alertname.startswith(("ExternalSiteSSL", "TLSCert")):
|
||||||
|
return "ssl_cert", "TYPE-3"
|
||||||
|
|
||||||
return "general", "TYPE-3"
|
return "general", "TYPE-3"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1355,8 +1355,9 @@ class TelegramGateway:
|
|||||||
notification_type: 通知類型 (TYPE-1/2/3/4/4D)
|
notification_type: 通知類型 (TYPE-1/2/3/4/4D)
|
||||||
"""
|
"""
|
||||||
# TYPE-3 動態操作按鈕 (ADR-071-E)
|
# TYPE-3 動態操作按鈕 (ADR-071-E)
|
||||||
|
# ADR-075: 統一用 kubernetes(移除舊 k8s_workload),新增 storage/external_site/alertchain_health/flywheel_health
|
||||||
_CATEGORY_BUTTONS: dict[str, list[tuple[str, str]]] = {
|
_CATEGORY_BUTTONS: dict[str, list[tuple[str, str]]] = {
|
||||||
"k8s_workload": [
|
"kubernetes": [
|
||||||
("🔄 重啟", f"action:restart:{incident_id}"),
|
("🔄 重啟", f"action:restart:{incident_id}"),
|
||||||
("📈 擴容", f"action:scale_up:{incident_id}"),
|
("📈 擴容", f"action:scale_up:{incident_id}"),
|
||||||
("📉 縮容", f"action:scale_down:{incident_id}"),
|
("📉 縮容", f"action:scale_down:{incident_id}"),
|
||||||
@@ -1379,6 +1380,22 @@ class TelegramGateway:
|
|||||||
("🔄 重啟服務", f"action:restart_service:{incident_id}"),
|
("🔄 重啟服務", f"action:restart_service:{incident_id}"),
|
||||||
("📋 查 Log", f"action:check_log:{incident_id}"),
|
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||||
],
|
],
|
||||||
|
"storage": [
|
||||||
|
("🔄 重啟 MinIO", f"action:restart_service:{incident_id}"),
|
||||||
|
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||||
|
],
|
||||||
|
"external_site": [
|
||||||
|
("🔍 查健康狀態", f"action:check_health:{incident_id}"),
|
||||||
|
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||||
|
],
|
||||||
|
"alertchain_health": [
|
||||||
|
("🔍 查 Alertmanager", f"action:check_log:{incident_id}"),
|
||||||
|
("🔄 重載設定", f"action:reload_config:{incident_id}"),
|
||||||
|
],
|
||||||
|
"flywheel_health": [
|
||||||
|
("🔍 查飛輪狀態", f"action:check_health:{incident_id}"),
|
||||||
|
("📋 查 Log", f"action:check_log:{incident_id}"),
|
||||||
|
],
|
||||||
"ai_system": [
|
"ai_system": [
|
||||||
("🔀 切換 Provider", f"action:switch_provider:{incident_id}"),
|
("🔀 切換 Provider", f"action:switch_provider:{incident_id}"),
|
||||||
],
|
],
|
||||||
@@ -1473,6 +1490,9 @@ class TelegramGateway:
|
|||||||
nemotron_tool_backend: str = "",
|
nemotron_tool_backend: str = "",
|
||||||
# 2026-04-05 Claude Code: incident_id 用於 detail/reanalyze/history 按鈕
|
# 2026-04-05 Claude Code: incident_id 用於 detail/reanalyze/history 按鈕
|
||||||
incident_id: str = "",
|
incident_id: str = "",
|
||||||
|
# 2026-04-12 ogt: ADR-075 告警分類與通知類型(斷點 B 修復)
|
||||||
|
alert_category: str = "",
|
||||||
|
notification_type: str = "",
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合)
|
推送待簽核卡片到 Telegram (v7.0 含 SignOz 整合)
|
||||||
@@ -1553,11 +1573,14 @@ class TelegramGateway:
|
|||||||
|
|
||||||
# 建立按鈕 (含自動調優)
|
# 建立按鈕 (含自動調優)
|
||||||
# 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕
|
# 2026-04-05 Claude Code: 傳入 incident_id 以啟用 detail/reanalyze/history 按鈕
|
||||||
|
# ADR-075: 傳入 alert_category/notification_type 以啟用分類動態按鈕(斷點 B 修復)
|
||||||
keyboard = self._build_inline_keyboard(
|
keyboard = self._build_inline_keyboard(
|
||||||
approval_id=approval_id,
|
approval_id=approval_id,
|
||||||
include_auto_tuning=bool(auto_tuning_command),
|
include_auto_tuning=bool(auto_tuning_command),
|
||||||
auto_tuning_command=auto_tuning_command,
|
auto_tuning_command=auto_tuning_command,
|
||||||
incident_id=incident_id,
|
incident_id=incident_id,
|
||||||
|
alert_category=alert_category,
|
||||||
|
notification_type=notification_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 發送訊息
|
# 發送訊息
|
||||||
@@ -1597,8 +1620,14 @@ class TelegramGateway:
|
|||||||
|
|
||||||
# 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
|
# 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
|
||||||
# 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
|
# 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
|
||||||
# 非同步執行,失敗不影響告警主流程
|
# 2026-04-12 ogt: ADR-075 斷點 C 修復 — 含按鈕的互動型通知禁止發群組(nonce 洩漏)
|
||||||
if settings.SRE_GROUP_CHAT_ID:
|
# TYPE-1/TYPE-2 → 可發群組(純資訊,無 callback 按鈕)
|
||||||
|
# TYPE-3/TYPE-4/TYPE-4D/TYPE-8M → 僅 DM,不發群組
|
||||||
|
_interactive_types = {"TYPE-3", "TYPE-4", "TYPE-4D", "TYPE-8M"}
|
||||||
|
_is_interactive = notification_type in _interactive_types or (
|
||||||
|
not notification_type and alert_category # 有分類但無明確型別 → 視為互動型
|
||||||
|
)
|
||||||
|
if settings.SRE_GROUP_CHAT_ID and not _is_interactive:
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self._send_approval_card_to_group(
|
self._send_approval_card_to_group(
|
||||||
approval_id=approval_id,
|
approval_id=approval_id,
|
||||||
|
|||||||
@@ -76,17 +76,144 @@ class TestInfrastructure:
|
|||||||
def test_docker_prefix(self):
|
def test_docker_prefix(self):
|
||||||
ac, nt = classify_alert_early("DockerContainerOOM", "critical", {})
|
ac, nt = classify_alert_early("DockerContainerOOM", "critical", {})
|
||||||
assert nt == "TYPE-3"
|
assert nt == "TYPE-3"
|
||||||
assert ac == "infrastructure"
|
assert ac == "infrastructure" # Docker → 保留 infrastructure
|
||||||
|
|
||||||
def test_host_prefix(self):
|
|
||||||
ac, nt = classify_alert_early("HostHighCpuLoad", "warning", {})
|
|
||||||
assert nt == "TYPE-3"
|
|
||||||
assert ac == "infrastructure"
|
|
||||||
|
|
||||||
def test_docker_restart(self):
|
def test_docker_restart(self):
|
||||||
ac, nt = classify_alert_early("DockerContainerRestarting", "warning", {})
|
ac, nt = classify_alert_early("DockerContainerRestarting", "warning", {})
|
||||||
assert ac == "infrastructure"
|
assert ac == "infrastructure"
|
||||||
|
|
||||||
|
# ADR-075: Host* 從 infrastructure 分離為 host_resource
|
||||||
|
def test_host_prefix_is_host_resource(self):
|
||||||
|
ac, nt = classify_alert_early("HostHighCpuLoad", "warning", {})
|
||||||
|
assert nt == "TYPE-3"
|
||||||
|
assert ac == "host_resource"
|
||||||
|
|
||||||
|
def test_host_down(self):
|
||||||
|
ac, nt = classify_alert_early("HostDown", "critical", {})
|
||||||
|
assert ac == "host_resource"
|
||||||
|
|
||||||
|
def test_host_memory(self):
|
||||||
|
ac, nt = classify_alert_early("HostOutOfMemory", "warning", {})
|
||||||
|
assert ac == "host_resource"
|
||||||
|
|
||||||
|
def test_host_disk(self):
|
||||||
|
ac, nt = classify_alert_early("HostOutOfDiskSpace", "warning", {})
|
||||||
|
assert ac == "host_resource"
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# ADR-075: alertchain_health (TYPE-8M)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
class TestAlertchainHealth:
|
||||||
|
@pytest.mark.parametrize("alertname", [
|
||||||
|
"AlertChainBroken_Alertmanager",
|
||||||
|
"AlertChainBroken_Sentry",
|
||||||
|
"NoAlertsReceived2Hours",
|
||||||
|
"AlertChainUnhealthy",
|
||||||
|
])
|
||||||
|
def test_alertchain_alerts(self, alertname):
|
||||||
|
ac, nt = classify_alert_early(alertname, "critical", {})
|
||||||
|
assert ac == "alertchain_health"
|
||||||
|
assert nt == "TYPE-8M"
|
||||||
|
|
||||||
|
def test_alertchain_beats_severity_info(self):
|
||||||
|
# 即使 severity=info,AlertChainBroken 也必須是 alertchain_health
|
||||||
|
ac, nt = classify_alert_early("AlertChainBroken_Alertmanager", "info", {})
|
||||||
|
assert ac == "alertchain_health"
|
||||||
|
assert nt == "TYPE-8M"
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# ADR-075: flywheel_health (TYPE-8M)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
class TestFlywheelHealth:
|
||||||
|
def test_auto_repair_low_success(self):
|
||||||
|
ac, nt = classify_alert_early("AutoRepairLowSuccessRate", "warning", {})
|
||||||
|
assert ac == "flywheel_health"
|
||||||
|
assert nt == "TYPE-8M"
|
||||||
|
|
||||||
|
def test_permanent_fix_required(self):
|
||||||
|
ac, nt = classify_alert_early("PermanentFixRequired", "warning", {})
|
||||||
|
assert ac == "flywheel_health"
|
||||||
|
assert nt == "TYPE-8M"
|
||||||
|
|
||||||
|
def test_flywheel_prefix(self):
|
||||||
|
ac, nt = classify_alert_early("FlywheelPlaybookZero", "critical", {})
|
||||||
|
assert ac == "flywheel_health"
|
||||||
|
assert nt == "TYPE-8M"
|
||||||
|
|
||||||
|
def test_flywheel_beats_severity_info(self):
|
||||||
|
ac, nt = classify_alert_early("AutoRepairLowSuccessRate", "info", {})
|
||||||
|
assert ac == "flywheel_health"
|
||||||
|
assert nt == "TYPE-8M"
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# ADR-075: storage (TYPE-3)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
class TestStorage:
|
||||||
|
def test_minio_down(self):
|
||||||
|
ac, nt = classify_alert_early("MinIODown", "critical", {})
|
||||||
|
assert ac == "storage"
|
||||||
|
assert nt == "TYPE-3"
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# ADR-075: devops_tool (TYPE-3)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
class TestDevopsTool:
|
||||||
|
@pytest.mark.parametrize("alertname", [
|
||||||
|
"OpenClawDown",
|
||||||
|
"SignOzDown",
|
||||||
|
"GiteaDown",
|
||||||
|
"HarborDown",
|
||||||
|
"SentryDown",
|
||||||
|
"AlertmanagerDown",
|
||||||
|
"KaliScannerDown",
|
||||||
|
"GiteaCIPipelineFailed",
|
||||||
|
])
|
||||||
|
def test_devops_tools(self, alertname):
|
||||||
|
ac, nt = classify_alert_early(alertname, "critical", {})
|
||||||
|
assert ac == "devops_tool"
|
||||||
|
assert nt == "TYPE-3"
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# ADR-075: external_site (TYPE-3)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
class TestExternalSite:
|
||||||
|
@pytest.mark.parametrize("alertname", [
|
||||||
|
"MoWoooWorkDown",
|
||||||
|
"TsenyangWebsiteDown",
|
||||||
|
"StockWoooWorkDown",
|
||||||
|
"BitanWoooWorkDown",
|
||||||
|
])
|
||||||
|
def test_external_sites(self, alertname):
|
||||||
|
ac, nt = classify_alert_early(alertname, "critical", {})
|
||||||
|
assert ac == "external_site"
|
||||||
|
assert nt == "TYPE-3"
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# ADR-075: ssl_cert (TYPE-3)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
|
||||||
|
class TestSslCert:
|
||||||
|
def test_external_site_ssl(self):
|
||||||
|
ac, nt = classify_alert_early("ExternalSiteSSLExpiringSoon", "warning", {})
|
||||||
|
assert ac == "ssl_cert"
|
||||||
|
assert nt == "TYPE-3"
|
||||||
|
|
||||||
|
def test_tls_cert(self):
|
||||||
|
ac, nt = classify_alert_early("TLSCertExpiryCritical", "critical", {})
|
||||||
|
assert ac == "ssl_cert"
|
||||||
|
assert nt == "TYPE-3"
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
# TYPE-3: Kubernetes
|
# TYPE-3: Kubernetes
|
||||||
|
|||||||
Reference in New Issue
Block a user