From ca22ec2fd2c3717d408adef08da7e961752082df Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 1 May 2026 10:11:10 +0800 Subject: [PATCH] fix(aiops): route backup failures rule-first --- apps/api/src/api/v1/webhooks.py | 10 +++++----- .../tests/test_alertmanager_rule_bypass.py | 20 +++++++++++++++++++ docs/LOGBOOK.md | 14 +++++++++++++ ops/monitoring/alerts-unified.yml | 2 +- 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index e963ee36..23774b49 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -101,13 +101,13 @@ def _should_bypass_alertmanager_llm( rule_response: dict | None, alert_category: str, ) -> bool: - """Host 類告警命中 YAML NO_ACTION 時,直接走人工排查卡片。""" + """主機/備份類告警命中 YAML NO_ACTION 時,直接走人工排查卡片。""" return ( rule_response is not None and rule_response.get("suggested_action") == "NO_ACTION" and not str(rule_response.get("kubectl_command", "")).strip() and rule_response.get("rule_id", "") not in ("generic_fallback", "") - and alert_category == "host_resource" + and alert_category in {"host_resource", "backup_failure"} ) @@ -115,9 +115,9 @@ def _should_use_alertmanager_rule_first( rule_response: dict | None, alert_category: str, ) -> bool: - """Host 類告警命中權威 YAML 規則時,避免再被 LLM 污染成 K8s 動作。""" + """主機/備份類告警命中權威 YAML 規則時,避免再被 LLM 污染成 K8s 動作。""" - if not rule_response or alert_category != "host_resource": + if not rule_response or alert_category not in {"host_resource", "backup_failure"}: return False if rule_response.get("rule_id", "") in ("generic_fallback", ""): return False @@ -1363,7 +1363,7 @@ async def _process_new_alert_background( alertname=alertname, rule_id=rule_response.get("rule_id", ""), alert_category=alert_category, - reason="host_resource YAML 權威規則命中,跳過 LLM 避免產生錯誤 K8s 動作", + reason="host/backup YAML 權威規則命中,跳過 LLM 避免產生錯誤 K8s 動作", ) risk_mapping = { "low": RiskLevel.LOW, diff --git a/apps/api/tests/test_alertmanager_rule_bypass.py b/apps/api/tests/test_alertmanager_rule_bypass.py index 0f58f655..4b10a46d 100644 --- a/apps/api/tests/test_alertmanager_rule_bypass.py +++ b/apps/api/tests/test_alertmanager_rule_bypass.py @@ -38,6 +38,16 @@ def test_non_host_category_does_not_bypass_llm(): assert _should_bypass_alertmanager_llm(rule_response, "kubernetes") is False +def test_backup_failure_yaml_no_action_bypasses_llm(): + rule_response = { + "rule_id": "host_backup_failed", + "suggested_action": "NO_ACTION", + "kubectl_command": "", + } + + assert _should_bypass_alertmanager_llm(rule_response, "backup_failure") is True + + def test_host_resource_ssh_rule_uses_rule_first(): rule_response = { "rule_id": "host_resource_alert", @@ -48,6 +58,16 @@ def test_host_resource_ssh_rule_uses_rule_first(): assert _should_use_alertmanager_rule_first(rule_response, "host_resource") is True +def test_backup_failure_ssh_rule_uses_rule_first(): + rule_response = { + "rule_id": "host_backup_failed", + "suggested_action": "SSH_DIAGNOSE", + "kubectl_command": "ssh {host} 'tail -80 backup.log'", + } + + assert _should_use_alertmanager_rule_first(rule_response, "backup_failure") is True + + def test_generic_fallback_does_not_use_rule_first(): rule_response = { "rule_id": "generic_fallback", diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 4833d81a..e71e30ea 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,20 @@ --- +## 2026-05-01 | HostBackupFailed rule-first e2e 補洞 + +Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會被分類成 `backup_failure`,未命中原本只允許 `host_resource` 的 rule-first gate,導致又進 OpenClaw LLM。 + +### 完成 +- `_should_use_alertmanager_rule_first()` / `_should_bypass_alertmanager_llm()` 納入 `backup_failure`,備份失敗 YAML `SSH_DIAGNOSE` 不再被 LLM 覆蓋成 K8s 動作。 +- `NodeExporterDown` Prometheus rule `auto_repair` 改為 `true`,與 YAML rule catalog 的 exporter restart 策略一致。 +- 補 `backup_failure` NO_ACTION / SSH_DIAGNOSE 單元測試。 + +### 驗證 +- `python3 -m py_compile apps/api/src/api/v1/webhooks.py` 通過。 +- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_telegram_ai_automation_block.py tests/test_ai_router_diagnose_fallback.py -q` → 24 passed。 +- YAML parse `ops/monitoring/alerts-unified.yml`、`apps/api/alert_rules.yaml` 通過。 + ## 2026-04-30 | ADR-104 Playbook 版本化 lineage 承接「自動建立 Playbook」第二段,讓 LLM 生成的改良 Playbook 不覆蓋舊知識,而是形成可審核、可追溯、可替換的版本鏈。 diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index cba08d8f..d4e3db3a 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -1022,7 +1022,7 @@ groups: team: ops alert_category: infrastructure notification_type: TYPE-3 - auto_repair: "false" + auto_repair: "true" annotations: summary: "node-exporter ({{ $labels.instance }}) 停擺" description: "主機 {{ $labels.instance }} node-exporter 已停擺 5 分鐘,主機 metrics 中斷。"