diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index f0c9b02e..0843add9 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1312,6 +1312,56 @@ class DecisionManager: action = _re.sub(r"", _target, action) action = _re.sub(r"<[^>]+>", _target, action) + # GAP-A4 Phase 2 (2026-04-14 Claude Sonnet 4.6): LLM 路徑 target 救援 + # 真兇:LLM 直接產出 `kubectl scale deployment HostHighCpuLoad` (target=alertname) + # GAP-A4 Phase 1 只修了 rule_engine._extract_vars,LLM 路徑沒檢查 + # 結果:12 次 auto_execute_blocked_unresolved_placeholder 直接攔下 → 飛輪 0 + try: + from src.services.alert_rule_engine import ( + _extract_vars as _rule_extract_vars, + _is_bad_target as _rule_is_bad_target, + ) + _alertname_for_rescue = ( + incident.signals[0].labels.get("alertname", "") if incident.signals else "" + ) + # 從 action 提取 deployment 名稱(kubectl scale/restart deployment XXX 或 deployment/XXX) + _kubectl_target_match = _re.search( + r"deployment[/\s]+([\w.\-]+)", action + ) + if _kubectl_target_match: + _llm_target = _kubectl_target_match.group(1) + if _rule_is_bad_target(_llm_target, _alertname_for_rescue): + # LLM 把垃圾當 deployment 名(alertname/unknown/IP)→ 重推 + _alert_ctx = { + "labels": incident.signals[0].labels if incident.signals else {}, + "target_resource": _target, + "namespace": _ns, + "alert_type": _alertname_for_rescue, + } + _good_target = _rule_extract_vars(_alert_ctx).get("target", "") + if _good_target and _good_target != "unknown" and not _rule_is_bad_target(_good_target, _alertname_for_rescue): + _old_action = action + action = action.replace(_llm_target, _good_target) + logger.info( + "auto_execute_target_rescued", + incident_id=incident.incident_id, + llm_target=_llm_target, + rescued_target=_good_target, + old_action=_old_action[:120], + new_action=action[:120], + reason="LLM 產出垃圾 target,從 labels 重推 deployment 名", + ) + else: + logger.warning( + "auto_execute_target_rescue_failed", + incident_id=incident.incident_id, + llm_target=_llm_target, + alertname=_alertname_for_rescue, + reason="labels 也找不到合法 deployment,將進 safety guard 攔截 → 人工", + ) + except Exception as _rescue_err: + logger.debug("target_rescue_skipped", error=str(_rescue_err)) + # ADR-073 Phase 3-2: infrastructure 告警 (Docker/Host) → SSH MCP routing (2026-04-12 ogt) # alert_category = "infrastructure" 表示 Docker/Host 告警,不走 K8s executor # action 格式應為 "docker restart " 或 "systemctl restart "