diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 999a7064..d774a080 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1527,14 +1527,32 @@ class DecisionManager: logger.debug("target_rescue_skipped", error=str(_rescue_err)) # ADR-073 Phase 3-2: infrastructure 告警 (Docker/Host) → SSH MCP routing (2026-04-12 ogt) - # alert_category = "infrastructure" 表示 Docker/Host 告警,不走 K8s executor - # action 格式應為 "docker restart " 或 "systemctl restart " + # alert_category = "infrastructure" 表示 Docker 告警,非 kubectl action → SSH # P1-1 fix 2026-04-12: 必須在 kubectl safety guard 之前 routing,否則 docker 指令被 _action_safe=False 攔截 _alert_category = getattr(incident, "alert_category", None) or "" if _alert_category == "infrastructure" and action and not action.startswith("kubectl"): await self._ssh_execute(incident, token, action, _target) return + # 2026-04-15 ogt: host_resource 告警(HostHighCpuLoad 等)不是 K8s workload 問題 + # 不得執行 kubectl 操作,改降級人工審核 + # 根因:原本只擋了 infrastructure,忘記 host_resource 也不走 K8s + if _alert_category == "host_resource" and action and action.startswith("kubectl"): + logger.warning( + "auto_execute_blocked_host_resource_no_k8s", + incident_id=incident.incident_id, + alert_category=_alert_category, + action=action[:80], + reason="host_resource 告警不應執行 K8s kubectl 操作,降級人工審核", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["mcp_all_failed"] = True + token.proposal_data["blocked_reason"] = "host_resource 告警禁止 K8s kubectl,請人工排查主機" + await self._save_token(token) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + # 安全守衛: 替換後仍含 "unknown" 或未替換的 <...>/{...} → 拒絕執行 # 另外:若 target 等於 alertname,代表 LLM 把告警名稱填入 deployment_name,也拒絕 _alertname = incident.signals[0].labels.get("alertname", "") if incident.signals else "" @@ -1667,6 +1685,32 @@ class DecisionManager: _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) return + # 2026-04-15 ogt: 同一 target 5 分鐘內最多執行 2 次,防止修復風暴 + # 根因:多個 incident 共享同一 target 時,各自獨立自動執行 → 重複重啟 + try: + from src.core.redis_client import get_redis as _get_redis_dm + _redis_dm = _get_redis_dm() + _dm_cooldown_key = f"awoooi:auto_execute_cooldown:{_ns}:{_target}" + _dm_exec_count = await _redis_dm.get(_dm_cooldown_key) + if _dm_exec_count and int(_dm_exec_count) >= 2: + logger.warning( + "auto_execute_cooldown_blocked", + incident_id=incident.incident_id, + target=_target, + namespace=_ns, + exec_count=int(_dm_exec_count), + reason="同一 target 5 分鐘內已自動執行 2 次,冷卻中", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["cooldown_blocked"] = True + await self._save_token(token) + return + await _redis_dm.incr(_dm_cooldown_key) + await _redis_dm.expire(_dm_cooldown_key, 300) # 5 分鐘 + except Exception as _cd_err: + logger.debug("auto_execute_cooldown_check_error", error=str(_cd_err)) + try: # 延遲導入避免循環依賴 from src.models.approval import ApprovalRequest, ApprovalStatus