From 3696fb5938ba779d0ae84732b5a1d54a7c1dffeb Mon Sep 17 00:00:00 2001 From: OG T Date: Wed, 15 Apr 2026 21:45:46 +0800 Subject: [PATCH] =?UTF-8?q?fix(prod):=20=E4=BF=AE=E5=BE=A9=20host=5Fresour?= =?UTF-8?q?ce=20=E8=AA=A4=E7=99=BC=20K8s=20kubectl=20+=20=E8=87=AA?= =?UTF-8?q?=E5=8B=95=E5=9F=B7=E8=A1=8C=E9=87=8D=E8=A4=87=E9=A2=A8=E6=9A=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. decision_manager: host_resource 告警(HostHighCpuLoad 等) 不得執行 kubectl 操作 → 降級人工審核 根因:原本只擋 infrastructure,host_resource 漏進 K8s 路徑 → 導致 kubectl rollout restart deployment/HostHighCpuLoad 被真實執行 2. decision_manager: auto_execute 路徑補入 Redis cooldown 同一 target 5 分鐘內最多自動執行 2 次,防止 awoooi-worker 3x 風暴 根因:decision_manager 自動執行路徑完全無冷卻保護 2026-04-15 ogt + Claude Sonnet 4.6(亞太): 生產緊急修復第二批 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/decision_manager.py | 48 ++++++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 999a7064..d774a080 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1527,14 +1527,32 @@ class DecisionManager: logger.debug("target_rescue_skipped", error=str(_rescue_err)) # ADR-073 Phase 3-2: infrastructure 告警 (Docker/Host) → SSH MCP routing (2026-04-12 ogt) - # alert_category = "infrastructure" 表示 Docker/Host 告警,不走 K8s executor - # action 格式應為 "docker restart " 或 "systemctl restart " + # alert_category = "infrastructure" 表示 Docker 告警,非 kubectl action → SSH # P1-1 fix 2026-04-12: 必須在 kubectl safety guard 之前 routing,否則 docker 指令被 _action_safe=False 攔截 _alert_category = getattr(incident, "alert_category", None) or "" if _alert_category == "infrastructure" and action and not action.startswith("kubectl"): await self._ssh_execute(incident, token, action, _target) return + # 2026-04-15 ogt: host_resource 告警(HostHighCpuLoad 等)不是 K8s workload 問題 + # 不得執行 kubectl 操作,改降級人工審核 + # 根因:原本只擋了 infrastructure,忘記 host_resource 也不走 K8s + if _alert_category == "host_resource" and action and action.startswith("kubectl"): + logger.warning( + "auto_execute_blocked_host_resource_no_k8s", + incident_id=incident.incident_id, + alert_category=_alert_category, + action=action[:80], + reason="host_resource 告警不應執行 K8s kubectl 操作,降級人工審核", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["mcp_all_failed"] = True + token.proposal_data["blocked_reason"] = "host_resource 告警禁止 K8s kubectl,請人工排查主機" + await self._save_token(token) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + # 安全守衛: 替換後仍含 "unknown" 或未替換的 <...>/{...} → 拒絕執行 # 另外:若 target 等於 alertname,代表 LLM 把告警名稱填入 deployment_name,也拒絕 _alertname = incident.signals[0].labels.get("alertname", "") if incident.signals else "" @@ -1667,6 +1685,32 @@ class DecisionManager: _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) return + # 2026-04-15 ogt: 同一 target 5 分鐘內最多執行 2 次,防止修復風暴 + # 根因:多個 incident 共享同一 target 時,各自獨立自動執行 → 重複重啟 + try: + from src.core.redis_client import get_redis as _get_redis_dm + _redis_dm = _get_redis_dm() + _dm_cooldown_key = f"awoooi:auto_execute_cooldown:{_ns}:{_target}" + _dm_exec_count = await _redis_dm.get(_dm_cooldown_key) + if _dm_exec_count and int(_dm_exec_count) >= 2: + logger.warning( + "auto_execute_cooldown_blocked", + incident_id=incident.incident_id, + target=_target, + namespace=_ns, + exec_count=int(_dm_exec_count), + reason="同一 target 5 分鐘內已自動執行 2 次,冷卻中", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["cooldown_blocked"] = True + await self._save_token(token) + return + await _redis_dm.incr(_dm_cooldown_key) + await _redis_dm.expire(_dm_cooldown_key, 300) # 5 分鐘 + except Exception as _cd_err: + logger.debug("auto_execute_cooldown_check_error", error=str(_cd_err)) + try: # 延遲導入避免循環依賴 from src.models.approval import ApprovalRequest, ApprovalStatus