From ecfb7148bf63681ff0ba389d211de9987f380d96 Mon Sep 17 00:00:00 2001 From: OG T Date: Wed, 15 Apr 2026 21:50:25 +0800 Subject: [PATCH] =?UTF-8?q?fix(prod):=20=E6=8E=A5=E9=80=9A=20YAML=20?= =?UTF-8?q?=E8=A6=8F=E5=89=87=E5=BC=95=E6=93=8E=E8=88=87=E8=87=AA=E5=8B=95?= =?UTF-8?q?=E5=9F=B7=E8=A1=8C=E8=B7=AF=E5=BE=91=20=E2=80=94=20=E6=9E=B6?= =?UTF-8?q?=E6=A7=8B=E6=A0=B8=E5=BF=83=E6=96=B7=E9=BB=9E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 架構斷點根因: YAML 規則引擎(alert_rules.yaml)是人工審閱的權威動作來源, 但自動執行路徑只讀 proposal_data["kubectl_command"](LLM 生成), 兩者完全脫節 → HostHighCpuLoad 得到 kubectl restart,DockerContainerUnhealthy 的 SSH 指令被 LLM 的 kubectl 覆蓋。 修復策略: 在 auto_execute 入口,先查 YAML match_rule: 1. YAML → NO_ACTION(如 HostHighCpuLoad)→ 立即返回,不執行任何操作 2. YAML → 非 kubectl 指令(如 ssh docker restart)→ 覆蓋 LLM action, 後續 infrastructure SSH 路由才能生效 影響: - HostHighCpuLoad / NodeCPUUsageHigh → 停止自動執行,降級人工審核 - DockerContainerUnhealthy → SSH docker restart(若 labels 有 host/container) 2026-04-15 ogt + Claude Sonnet 4.6(亞太): 生產緊急修復第三批 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/decision_manager.py | 44 +++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index d774a080..0c2bca8b 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1318,6 +1318,50 @@ class DecisionManager: """ action = token.proposal_data.get("kubectl_command", "") + # 2026-04-15 ogt: YAML 規則引擎優先 — 架構斷點修復 + # 根因:LLM 生成的 kubectl_command 與 YAML 規則引擎的 NO_ACTION / SSH 指令完全脫節 + # YAML 規則是人工審閱的權威來源,LLM 只是輔助 + # 修復策略: + # 1. YAML → NO_ACTION → 立即返回,不執行任何操作 + # 2. YAML → SSH 指令(非 kubectl)→ 覆蓋 LLM 生成的 action,讓 SSH 路由生效 + _alertname_for_yaml = incident.signals[0].labels.get("alertname", "") if incident.signals else "" + if _alertname_for_yaml: + try: + from src.services.alert_rule_engine import match_rule as _yaml_match + _yaml_r = _yaml_match({ + "labels": incident.signals[0].labels if incident.signals else {}, + "alert_type": _alertname_for_yaml, + "message": "", + "target_resource": incident.affected_services[0] if incident.affected_services else "unknown", + "namespace": "awoooi-prod", + }) + if _yaml_r: + if _yaml_r.get("suggested_action") == "NO_ACTION": + logger.info( + "auto_execute_yaml_no_action", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + reason="YAML 規則明確標記 NO_ACTION,不執行自動修復", + ) + token.state = DecisionState.READY + token.proposal_data["auto_executed"] = False + token.proposal_data["blocked_reason"] = f"YAML: NO_ACTION for {_alertname_for_yaml}" + await self._save_token(token) + _fire_and_forget(_push_decision_to_telegram(incident, token.proposal_data)) + return + _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip() + if _yaml_cmd and not _yaml_cmd.startswith("kubectl"): + # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 kubectl action + action = _yaml_cmd + logger.info( + "auto_execute_yaml_cmd_override", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + yaml_cmd=_yaml_cmd[:80], + ) + except Exception as _yaml_err: + logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err)) + # Phase 6 ADR-087: 自我降級守衛(AIOPS_P6_SELF_DEMOTION 控制) # SLO 違反 → 全域信心閾值調高;連續違反 → 保守模式,所有自動執行降為人工 # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立