fix(prod): 修復 host_resource 誤發 K8s kubectl + 自動執行重複風暴
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
1. decision_manager: host_resource 告警(HostHighCpuLoad 等) 不得執行 kubectl 操作 → 降級人工審核
   根因:原本只擋 infrastructure,host_resource 漏進 K8s 路徑 → 導致 kubectl rollout restart deployment/HostHighCpuLoad 被真實執行
2. decision_manager: auto_execute 路徑補入 Redis cooldown
   同一 target 5 分鐘內最多自動執行 2 次,防止 awoooi-worker 3x 風暴
   根因:decision_manager 自動執行路徑完全無冷卻保護

2026-04-15 ogt + Claude Sonnet 4.6(亞太): 生產緊急修復第二批

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1527,14 +1527,32 @@ class DecisionManager:
|
||||
logger.debug("target_rescue_skipped", error=str(_rescue_err))
|
||||
|
||||
# ADR-073 Phase 3-2: infrastructure 告警 (Docker/Host) → SSH MCP routing (2026-04-12 ogt)
|
||||
# alert_category = "infrastructure" 表示 Docker/Host 告警,不走 K8s executor
|
||||
# action 格式應為 "docker restart <container>" 或 "systemctl restart <service>"
|
||||
# alert_category = "infrastructure" 表示 Docker 告警,非 kubectl action → SSH
|
||||
# P1-1 fix 2026-04-12: 必須在 kubectl safety guard 之前 routing,否則 docker 指令被 _action_safe=False 攔截
|
||||
_alert_category = getattr(incident, "alert_category", None) or ""
|
||||
if _alert_category == "infrastructure" and action and not action.startswith("kubectl"):
|
||||
await self._ssh_execute(incident, token, action, _target)
|
||||
return
|
||||
|
||||
# 2026-04-15 ogt: host_resource 告警(HostHighCpuLoad 等)不是 K8s workload 問題
|
||||
# 不得執行 kubectl 操作,改降級人工審核
|
||||
# 根因:原本只擋了 infrastructure,忘記 host_resource 也不走 K8s
|
||||
if _alert_category == "host_resource" and action and action.startswith("kubectl"):
|
||||
logger.warning(
|
||||
"auto_execute_blocked_host_resource_no_k8s",
|
||||
incident_id=incident.incident_id,
|
||||
alert_category=_alert_category,
|
||||
action=action[:80],
|
||||
reason="host_resource 告警不應執行 K8s kubectl 操作,降級人工審核",
|
||||
)
|
||||
token.state = DecisionState.READY
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["mcp_all_failed"] = True
|
||||
token.proposal_data["blocked_reason"] = "host_resource 告警禁止 K8s kubectl,請人工排查主機"
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
# 安全守衛: 替換後仍含 "unknown" 或未替換的 <...>/{...} → 拒絕執行
|
||||
# 另外:若 target 等於 alertname,代表 LLM 把告警名稱填入 deployment_name,也拒絕
|
||||
_alertname = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
|
||||
@@ -1667,6 +1685,32 @@ class DecisionManager:
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data))
|
||||
return
|
||||
|
||||
# 2026-04-15 ogt: 同一 target 5 分鐘內最多執行 2 次,防止修復風暴
|
||||
# 根因:多個 incident 共享同一 target 時,各自獨立自動執行 → 重複重啟
|
||||
try:
|
||||
from src.core.redis_client import get_redis as _get_redis_dm
|
||||
_redis_dm = _get_redis_dm()
|
||||
_dm_cooldown_key = f"awoooi:auto_execute_cooldown:{_ns}:{_target}"
|
||||
_dm_exec_count = await _redis_dm.get(_dm_cooldown_key)
|
||||
if _dm_exec_count and int(_dm_exec_count) >= 2:
|
||||
logger.warning(
|
||||
"auto_execute_cooldown_blocked",
|
||||
incident_id=incident.incident_id,
|
||||
target=_target,
|
||||
namespace=_ns,
|
||||
exec_count=int(_dm_exec_count),
|
||||
reason="同一 target 5 分鐘內已自動執行 2 次,冷卻中",
|
||||
)
|
||||
token.state = DecisionState.READY
|
||||
token.proposal_data["auto_executed"] = False
|
||||
token.proposal_data["cooldown_blocked"] = True
|
||||
await self._save_token(token)
|
||||
return
|
||||
await _redis_dm.incr(_dm_cooldown_key)
|
||||
await _redis_dm.expire(_dm_cooldown_key, 300) # 5 分鐘
|
||||
except Exception as _cd_err:
|
||||
logger.debug("auto_execute_cooldown_check_error", error=str(_cd_err))
|
||||
|
||||
try:
|
||||
# 延遲導入避免循環依賴
|
||||
from src.models.approval import ApprovalRequest, ApprovalStatus
|
||||
|
||||
Reference in New Issue
Block a user