diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index d774a080..0c2bca8b 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1318,6 +1318,50 @@ class DecisionManager: """ action = token.proposal_data.get("kubectl_command", "") + # 2026-04-15 ogt: YAML 規則引擎優先 — 架構斷點修復 + # 根因:LLM 生成的 kubectl_command 與 YAML 規則引擎的 NO_ACTION / SSH 指令完全脫節 + # YAML 規則是人工審閱的權威來源,LLM 只是輔助 + # 修復策略: + # 1. YAML → NO_ACTION → 立即返回,不執行任何操作 + # 2. YAML → SSH 指令(非 kubectl)→ 覆蓋 LLM 生成的 action,讓 SSH 路由生效 + _alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else "" + if _alertname_for_yaml: + try: + from src.services.alert_rule_engine import match_rule as _yaml_match + _yaml_r = _yaml_match({ + "labels": incident.signals[0].labels if incident.signals else {}, + "alert_type": _alertname_for_yaml, + "message": "", + "target_resource": incident.affected_services[0] if incident.affected_services else "unknown", + "namespace": "awoooi-prod", + }) + if _yaml_r: + if _yaml_r.get("suggested_action") == "NO_ACTION": + logger.info( + "auto_execute_yaml_no_action", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + reason="YAML 規則明確標記 NO_ACTION,不執行自動修復", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}" + await self._save_token(token) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip() + if _yaml_cmd and not _yaml_cmd.startswith("kubectl"): + # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action + action = _yaml_cmd + logger.info( + "auto_execute_yaml_cmd_override", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + yaml_cmd=_yaml_cmd[:80], + ) + except Exception as _yaml_err: + logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err)) + # Phase 6 ADR-087: 自我降級守衛(AIOPS_P6_SELF_DEMOTION 控制) # SLO 違反 → 全域信心閾值調高;連續違反 → 保守模式,所有自動執行降為人工 # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立