From da772a1605bdb2b8e98ea2c7fb943649fefdb69d Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 2 May 2026 17:41:28 +0800 Subject: [PATCH] fix(decision): block kubectl actions on bare_metal host alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When HostHighCpuLoad / HostOutOfMemory fire on a bare-metal host (192.168.0.110 et al., where Sentry / ClickHouse / Snuba are eating CPU), the LLM kept proposing "kubectl rollout restart awoooi-api", which is a wrong-domain action — restarting awoooi cannot fix a third-party process's CPU usage on the host. Auto-execute would then either run the no-op kubectl restart (wasted) or escalate after ssh_diagnose because no safe action was found, producing the "AI 自動修復失敗" Telegram noise the user just complained about. Adds a guard at the top of DecisionManager._auto_execute: if the incident's first signal carries host_type=bare_metal AND the proposed action starts with "kubectl", refuse to execute. The decision token is set to READY with auto_executed=False and a clear blocked_reason so human operators see why automation declined, and emergency_escalation records the event in AOL for audit. Also patches /home/wooo/monitoring/alerts.yml on 110 (and the new ops/monitoring/alerts.yml in the repo) to add explicit auto_repair_action and runbook annotations on HostHighCpuLoad / HostOutOfMemory that hint the LLM toward `ssh ... ps aux` rather than kubectl restart. Prometheus reload returned 200. Tests: apps/api/tests/test_decision_manager_bare_metal_kubectl_guard.py covers (1) bare_metal+kubectl blocked, (2) kubectl get also blocked, (3) bare_metal+ssh NOT blocked, (4) k8s host_type+kubectl NOT blocked, (5) missing host_type label NOT blocked. 
Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/decision_manager.py | 36 +++++ ...cision_manager_bare_metal_kubectl_guard.py | 148 ++++++++++++++++++ ops/monitoring/alerts.yml | 6 + 3 files changed, 190 insertions(+) create mode 100644 apps/api/tests/test_decision_manager_bare_metal_kubectl_guard.py diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 542db95a..8d7dd87b 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1753,6 +1753,42 @@ class DecisionManager: """ action = token.proposal_data.get("kubectl_command", "") + # 2026-05-02 ogt + Claude Sonnet 4.6: bare_metal × kubectl 拒絕守衛 + # 根因:HostHighCpuLoad / HostOutOfMemory 等主機層告警 fire 在實體機(如 110, + # 上面跑 Sentry/ClickHouse/Snuba 等第三方),但 LLM 看到 instance 後容易 + # 亂提「kubectl rollout restart awoooi-api」這種對症錯誤的 K8s action, + # 重啟 awoooi 服務根本解不了第三方 CPU 燒爆,只是拖累自己。 + # 修法:偵測到 alert host_type=bare_metal 且 action 是 kubectl 類,立即降級人工, + # Telegram 明示「跨 domain 動作被攔下」。auto_repair 走 SSH 診斷或人工。 + _alert_labels = incident.signals[0].labels if incident.signals else {} + _host_type = (_alert_labels.get("host_type") or "").lower() + _action_stripped = action.lstrip().lower() + if _host_type == "bare_metal" and _action_stripped.startswith("kubectl"): + logger.warning( + "auto_execute_blocked_bare_metal_kubectl", + incident_id=incident.incident_id, + alertname=_alert_labels.get("alertname", ""), + instance=_alert_labels.get("instance", ""), + proposed_action=action[:120], + reason="bare_metal host alert + kubectl action = wrong domain", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["blocked_reason"] = ( + f"host_type=bare_metal 但 LLM 提案 kubectl 動作 ({action[:60]})。" + " 主機層告警的根因常在第三方服務(如 Sentry / ClickHouse)," + " 重啟 K8s deployment 解不了,已降級人工。" + ) + await self._save_token(token) + _fire_and_forget(_escalate_decision_auto_repair_unavailable( + 
incident=incident, + token=token, + failure_reason="bare_metal alert routed to kubectl action (wrong domain)", + attempted_actions=f"action={action[:120]}", + )) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + # 2026-04-15 ogt: YAML 規則引擎優先 — 架構斷點修復 # 根因:LLM 生成的 kubectl_command 與 YAML 規則引擎的 NO_ACTION / SSH 指令完全脫節 # YAML 規則是人工審閱的權威來源,LLM 只是輔助 diff --git a/apps/api/tests/test_decision_manager_bare_metal_kubectl_guard.py b/apps/api/tests/test_decision_manager_bare_metal_kubectl_guard.py new file mode 100644 index 00000000..2b3778e2 --- /dev/null +++ b/apps/api/tests/test_decision_manager_bare_metal_kubectl_guard.py @@ -0,0 +1,148 @@ +""" +DecisionManager._auto_execute bare_metal × kubectl 守衛測試 +========================================================= +2026-05-02 ogt + Claude Sonnet 4.6: + +當 alert label `host_type=bare_metal` 而 LLM 提案的 action 是 kubectl 類, +代表 LLM 弄錯責任域(host 層問題不該動 K8s deployment),守衛應立刻擋下、 +降級人工,並透過 emergency_escalation 留痕,不執行 kubectl 動作。 + +回歸場景:HostHighCpuLoad on 192.168.0.110(Sentry/ClickHouse 燒 CPU), +LLM 亂提「kubectl rollout restart awoooi-api」,要被擋下。 +""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from src.services.decision_manager import DecisionManager, DecisionState + + +def _fake_incident(host_type: str = "bare_metal", alertname: str = "HostHighCpuLoad") -> SimpleNamespace: + signal = SimpleNamespace(labels={ + "host_type": host_type, + "alertname": alertname, + "instance": "192.168.0.110:9100", + }) + return SimpleNamespace( + incident_id="INC-TEST-GUARD", + signals=[signal], + affected_services=["awoooi-api"], + ) + + +def _fake_token(action: str = "kubectl rollout restart deployment awoooi-api") -> SimpleNamespace: + return SimpleNamespace( + state=None, + proposal_data={"kubectl_command": action}, + error=None, + ) + + +@pytest.fixture +def manager(monkeypatch): + with 
patch("src.services.decision_manager.get_openclaw"), \ + patch("src.services.knowledge_service.get_knowledge_service"), \ + patch("src.plugins.mcp.providers.k8s_provider.K8sProvider"), \ + patch("src.plugins.mcp.providers.ssh_provider.SSHProvider"): + mgr = DecisionManager() + mgr._save_token = AsyncMock() + monkeypatch.setattr("src.services.decision_manager._fire_and_forget", lambda *a, **k: None) + monkeypatch.setattr( + "src.services.decision_manager._push_decision_to_telegram", + lambda *a, **k: None, + ) + monkeypatch.setattr( + "src.services.decision_manager._escalate_decision_auto_repair_unavailable", + lambda **k: None, + ) + return mgr + + +class TestBareMetalKubectlGuard: + @pytest.mark.asyncio + async def test_bare_metal_kubectl_action_is_blocked(self, manager): + incident = _fake_incident(host_type="bare_metal") + token = _fake_token("kubectl rollout restart deployment awoooi-api") + await manager._auto_execute(incident, token) + + assert token.state == DecisionState.READY + assert token.proposal_data["auto_executed"] is False + assert "host_type=bare_metal" in token.proposal_data["blocked_reason"] + assert "kubectl" in token.proposal_data["blocked_reason"] + + @pytest.mark.asyncio + async def test_bare_metal_kubectl_get_also_blocked(self, manager): + """kubectl get 也是 K8s 域,host_type=bare_metal 一樣不該走""" + incident = _fake_incident(host_type="bare_metal") + token = _fake_token("kubectl get pods -n awoooi-prod") + await manager._auto_execute(incident, token) + assert token.proposal_data["auto_executed"] is False + + @pytest.mark.asyncio + async def test_bare_metal_ssh_action_is_NOT_blocked_here(self, manager, monkeypatch): + """bare_metal + ssh action 不該被本守衛攔下(守衛只擋 kubectl) + + 後續 YAML 規則或 SSH 路由會處理;本 test 確認沒誤殺 SSH path。 + 實際執行會在守衛後繼續走 YAML 規則 / NEMOTRON 路由,這裡用 mock 攔住確保 + 守衛沒提早 return。 + """ + incident = _fake_incident(host_type="bare_metal") + token = _fake_token("ssh 192.168.0.110 'ps aux --sort=-%cpu'") + + # 提早 mock 後續流程的入口,讓我們可確認沒被守衛 short-circuit 
+ monkeypatch.setattr( + "src.services.decision_manager.parse_kubectl_action", + lambda *a, **k: SimpleNamespace(operation=None, target=None), + ) + # 守衛若沒擋下,會繼續執行 → token 不會立刻被設成 READY+blocked + try: + await manager._auto_execute(incident, token) + except Exception: + pass # 後續路徑可能因 mock 不全而 raise,重點是沒被守衛 early-return + # blocked_reason 不應該是「host_type=bare_metal」(那是守衛 message) + br = token.proposal_data.get("blocked_reason", "") + assert "host_type=bare_metal" not in br + + @pytest.mark.asyncio + async def test_k8s_alert_kubectl_NOT_blocked(self, manager, monkeypatch): + """non-bare_metal alert(如 K8s pod 告警)+ kubectl action → 守衛不該攔""" + incident = _fake_incident(host_type="kubernetes", alertname="KubePodCrashLooping") + token = _fake_token("kubectl rollout restart deployment awoooi-api") + + monkeypatch.setattr( + "src.services.decision_manager.parse_kubectl_action", + lambda *a, **k: SimpleNamespace(operation=None, target=None), + ) + try: + await manager._auto_execute(incident, token) + except Exception: + pass + br = token.proposal_data.get("blocked_reason", "") + # 不該是 bare_metal 守衛擋的(其他守衛擋是另回事) + assert "host_type=bare_metal" not in br + + @pytest.mark.asyncio + async def test_no_host_type_label_NOT_blocked(self, manager, monkeypatch): + """沒 host_type label(舊規則)+ kubectl action → 守衛不該攔(保留現狀)""" + signal = SimpleNamespace(labels={"alertname": "PodCrashLoop"}) + incident = SimpleNamespace( + incident_id="INC-NO-HOST-TYPE", + signals=[signal], + affected_services=["awoooi-api"], + ) + token = _fake_token("kubectl rollout restart deployment awoooi-api") + + monkeypatch.setattr( + "src.services.decision_manager.parse_kubectl_action", + lambda *a, **k: SimpleNamespace(operation=None, target=None), + ) + try: + await manager._auto_execute(incident, token) + except Exception: + pass + br = token.proposal_data.get("blocked_reason", "") + assert "host_type=bare_metal" not in br diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index bd078ede..d5e39905 
100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -47,6 +47,9 @@ groups: annotations: summary: "主機 {{ $labels.host }} CPU 高負載" description: "CPU 使用率超過 80%" + # 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷而非 kubectl + auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)" + runbook: "host CPU 高負載排查:先 SSH ps aux 看 top 進程;若為第三方服務(Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit,禁止 kubectl restart 跨 domain" - alert: HostOutOfMemory expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 @@ -62,6 +65,9 @@ groups: annotations: summary: "主機 {{ $labels.host }} 記憶體不足" description: "記憶體使用率超過 85%" + # 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷 + auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%mem | head -20' (host 記憶體診斷;禁 kubectl restart — 主因常為第三方服務)" + runbook: "host 記憶體不足排查:SSH 看 top 進程;若為第三方服務需擴容或調 limit" - alert: HostOutOfDiskSpace expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85