From c439277fc3ddec4ce207e9ce3b82dbac87f7da6e Mon Sep 17 00:00:00 2001
From: OG T
Date: Sat, 11 Apr 2026 21:39:52 +0800
Subject: [PATCH] =?UTF-8?q?feat(aiops):=20ADR-070=20=E5=85=A8=E8=87=AA?=
 =?UTF-8?q?=E5=8B=95=E5=8C=96=E6=96=B9=E5=90=91=20=E2=80=94=20=E4=B8=89?=
 =?UTF-8?q?=E5=A4=A7=E4=BF=AE=E5=BE=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. auto_approve.py: 允許 high risk 自動執行 (low/medium/high 全開放)
   - min_confidence 0.65→0.50 (信心門檻降低)
   - 新增 DESTRUCTIVE_PATTERNS 攔截真正危險指令
     (scale=0, delete deployment/pvc/namespace, drop table)
   - 核心: critical + 破壞性操作 → 人工; 其他 → 全自動

2. decision_manager.py: 新增 _collect_mcp_context()
   - LLM 分析前先收集真實環境狀態 (SSH/K8s MCP)
   - Host/Docker 告警 → ssh_get_container_status + ssh_get_top_processes
   - K8s 告警 → k8s_get_events
   - 注入 diagnosis_context "當前環境狀態 (MCP 實時查詢)" 區段

3. webhooks.py: 修復 target_resource 提取
   - 新增 name/container/job label 提取
   - DockerContainerUnhealthy 不再 target=alertname
   - IP 位址自動排除 (192.x 開頭不作為 target)

🔴 Tier 3 紅區 — 需首席架構師批准

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/src/api/v1/webhooks.py           | 14 +++-
 apps/api/src/services/auto_approve.py     | 37 ++++++++-
 apps/api/src/services/decision_manager.py | 97 ++++++++++++++++++++---
 3 files changed, 132 insertions(+), 16 deletions(-)

diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py
index a3c6ab5e..54b80cfc 100644
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -1109,13 +1109,19 @@ async def alertmanager_webhook(
             "warning"
         )
 
-        # 優先用 component label(Docker 層告警用 component,如 SentryDown → "sentry")
-        # 次優 pod(K8s 告警),再次 instance(blackbox probe),最後 alertname
-        # (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, Bug #5 修正 — affected_services 匹配 Playbook)
+        # target_resource extraction priority (2026-04-11 Claude Sonnet 4.6, full-automation fix)
+        # component (Docker service name) > pod (K8s) > name/container (cAdvisor container name) > job > instance host part (skipped when it starts with "192.") > alertname
+        # Key point: for Docker alerts (DockerContainerUnhealthy/DockerContainerExited) the container name is carried in the name label
+        # Previously the fallback ended up using alertname, so target_resource="DockerContainerUnhealthy" polluted the whole remediation flow
+        _instance = alert.labels.get("instance", "")
+        _instance_clean = _instance.split(":")[0] if _instance and ":" in _instance else _instance
         target_resource = (
             alert.labels.get("component")
             or alert.labels.get("pod")
-            or alert.labels.get("instance")
+            or alert.labels.get("name")  # cAdvisor container name
+            or alert.labels.get("container")  # K8s container name
+            or alert.labels.get("job")  # Prometheus job name (second choice)
+            or (_instance_clean if _instance_clean and not _instance_clean.startswith("192.") else None)
             or alertname
         )
         namespace = alert.labels.get("namespace", "default")
diff --git a/apps/api/src/services/auto_approve.py b/apps/api/src/services/auto_approve.py
index b54956dc..51ca10fe 100644
--- a/apps/api/src/services/auto_approve.py
+++ b/apps/api/src/services/auto_approve.py
@@ -59,9 +59,11 @@ class AutoApproveConfig:
     """自動執行配置"""
 
     # 風險等級閾值
-    # 2026-04-01 ogt: 開放 low + medium,讓常見 restart 操作可自動執行
+    # 2026-04-11 Claude Sonnet 4.6: ADR-070 full automation — low/medium/high are all allowed
+    # Operations that truly need a human are intercepted by DESTRUCTIVE_PATTERNS (scale=0, delete, drop)
+    # Was: ["low", "medium"] → every high-risk alert always went through manual review
     allowed_risk_levels: list[str] = field(
-        default_factory=lambda: ["low", "medium"]
+        default_factory=lambda: ["low", "medium", "high"]
     )
 
     # 信任度閾值
@@ -69,7 +71,9 @@ class AutoApproveConfig:
     # → 改為 0,讓 medium risk + confidence >= 0.65 的操作直接自動執行
     # 歷史原因: min_trust_score=1 導致所有告警永遠走審批,從未自動修復
     min_trust_score: int = 0  # 不要求執行歷史 (原: 1)
-    min_confidence: float = 0.65  # AI 有合理把握即可 (原: 0.90)
+    # 2026-04-11 Claude Sonnet 4.6: ADR-070 full automation — 0.5 is enough to execute
+    # The real risk is gated by DESTRUCTIVE_PATTERNS + risk_level=critical
+    min_confidence: float = 0.50  # basic AI confidence suffices (was: 0.90, then: 0.65)
 
     # Playbook 閾值
     # 2026-04-01 ogt: 降低啟動門檻,1次成功記錄即可
@@ -219,6 +223,33 @@ class AutoApprovePolicy:
                 confidence=confidence,
             )
 
+        # Condition 1b: destructive-command interception (ADR-070: 2026-04-11 Claude Sonnet 4.6)
+        # Even when the risk level is allowed (low/medium/high), these operations still need human confirmation
+        # Principle: recoverable operation → auto-execute; irreversible / business impact → human
+        _DESTRUCTIVE_PATTERNS = [
+            "--replicas=0",  # scale to zero — equivalent to an outage
+            "--replicas 0",  # same flag, space-separated form
+            "scale deployment",  # any scale operation needs its target replica count confirmed
+            "delete pod",  # force-delete a pod
+            "delete deployment",  # delete a deployment (not a restart)
+            "delete pvc",  # delete a PVC (data loss)
+            "delete namespace",  # delete a namespace
+            "drop table",  # DB DDL
+            "drop database",  # DB DDL
+            "truncate table",  # DB DDL
+        ]
+        # Collapse whitespace runs so that e.g. "delete   pod" cannot slip past the substring match
+        action_lower = " ".join(action.lower().split())
+        for pattern in _DESTRUCTIVE_PATTERNS:
+            if pattern in action_lower:
+                return self._reject(
+                    reason=AutoApproveReason.CRITICAL_OPERATION,
+                    detail=f"Destructive pattern detected: '{pattern}' in action — requires human approval",
+                    risk_level=risk_level,
+                    trust_score=trust_score,
+                    confidence=confidence,
+                )
+
         # 條件 2: 風險等級必須在允許列表中
         if risk_level not in self.config.allowed_risk_levels:
             return self._reject(
diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py
index d062cbe6..e0cf1948 100644
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -1384,20 +1384,93 @@ class DecisionManager:
             logger.error("kb_rag_unexpected_error", incident_id=incident.incident_id, error=str(e))
             return ""
 
+    async def _collect_mcp_context(self, incident: Incident) -> str:
+        """
+        ADR-070 (2026-04-11 Claude Sonnet 4.6 Asia/Taipei): collect live state via
+        MCP before analysis so the LLM decides on real data, not just alert labels.
+        Best-effort: provider errors are logged at debug level and skipped.
+
+        - K8s alerts (or any alert with a pod label): K8s MCP events for that pod
+        - Host/Docker alerts on allow-listed hosts: SSH MCP container status / top processes
+
+        Returns newline-joined context lines, or "" if nothing was collected.
+        """
+        if not incident.signals:
+            return ""
+
+        labels = incident.signals[0].labels
+        alertname = labels.get("alertname", "")
+        host = labels.get("instance", "").split(":")[0] or labels.get("host", "")
+        # The empty-list guard covers only this fallback, so name/container labels are never discarded
+        service_fallback = incident.affected_services[0] if incident.affected_services else ""
+        container = labels.get("name") or labels.get("container") or service_fallback
+        ns = labels.get("namespace", "awoooi-prod")
+
+        ctx_parts: list[str] = []
+
+        # Host/Docker alerts → SSH MCP diagnostics
+        _HOST_ALERT_PREFIXES = ("Host", "Docker", "Sentry", "Harbor", "Ollama", "Backup")
+        if alertname.startswith(_HOST_ALERT_PREFIXES) and host:
+            try:
+                from src.plugins.mcp.providers.ssh_provider import SSHProvider
+                ssh = SSHProvider()
+                if ssh.enabled and host in ("192.168.0.188", "192.168.0.110"):
+                    # Query container status (skipped when the only "container" is the alert name itself)
+                    if container and container != alertname:
+                        status_result = await ssh.execute(
+                            tool_name="ssh_get_container_status",
+                            params={"host": host, "container_name": container},
+                        )
+                        if status_result.get("success"):
+                            ctx_parts.append(f"[SSH] 容器 {container} 狀態: {str(status_result.get('output') or '')[:300]}")
+                    # Query host resources
+                    if "CpuLoad" in alertname or "Memory" in alertname:
+                        top_result = await ssh.execute(
+                            tool_name="ssh_get_top_processes",
+                            params={"host": host, "top_n": 5},
+                        )
+                        if top_result.get("success"):
+                            ctx_parts.append(f"[SSH] 主機 {host} Top processes: {str(top_result.get('output') or '')[:300]}")
+            except Exception as e:
+                logger.debug("mcp_context_ssh_failed", alertname=alertname, error=str(e))
+
+        # K8s alerts → K8s MCP pod events
+        if alertname.startswith(("Kube", "K3s")) or labels.get("pod"):
+            try:
+                from src.plugins.mcp.providers.k8s_provider import K8sProvider
+                k8s = K8sProvider()
+                if k8s.enabled:
+                    pod = labels.get("pod", "")
+                    if pod:
+                        events_result = await k8s.execute(
+                            tool_name="k8s_get_events",
+                            params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"},
+                        )
+                        if events_result.get("success"):
+                            ctx_parts.append(f"[K8s] Pod {pod} 事件: {str(events_result.get('output') or '')[:300]}")
+            except Exception as e:
+                logger.debug("mcp_context_k8s_failed", alertname=alertname, error=str(e))
+
+        return "\n".join(ctx_parts)
+
     async def _dual_engine_analyze(
         self,
         incident: Incident,
     ) -> dict[str, Any]:
         """
-        三軌決策分析 (Phase 7.5 升級 + KB Phase 2 RAG 整合)
+        Three-track decision analysis (Phase 7.5 upgrade + KB Phase 2 RAG integration + ADR-070 MCP pre-collection)
 
         策略:
-        1. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
-        2. Playbook 命中則直接使用 (最快、經驗驗證)
-        3. 否則 LLM + Expert System 雙軌 + KB RAG context 注入
+        1. Collect live environment state via MCP first (ADR-070)
+        2. Check whether a Playbook matches closely (similarity >= 85%)
+        3. On a Playbook hit, use it directly (fastest, proven by experience)
+        4. Otherwise LLM + Expert System dual track, with KB RAG context + MCP context injected
 
         優先順序: Playbook > LLM > Expert System
         """
+        # ADR-070: collect live environment state via MCP before analysis
+        mcp_context = await self._collect_mcp_context(incident)
+
         # Phase 7.5: 先嘗試 Playbook 匹配
         playbook_result = await self._try_playbook_match(incident)
         if playbook_result:
@@ -1418,13 +1491,19 @@ class DecisionManager:
         try:
             signals_dict = [s.model_dump() for s in incident.signals]
 
-            # 將 KB context 注入 expert_context 傳給 LLM
+            # Inject KB context + live MCP state into the expert_context passed to the LLM
+            # ADR-070: MCP context goes first so the LLM sees the real environment state before deciding
             llm_expert_context: dict[str, Any] = {**expert_result} if expert_result else {}
+            existing = str(llm_expert_context.get("diagnosis_context", ""))
+            context_parts = []
+            if mcp_context:
+                context_parts.append(f"## 當前環境狀態 (MCP 實時查詢)\n{mcp_context}")
             if kb_context:
-                existing = str(llm_expert_context.get("diagnosis_context", ""))
-                llm_expert_context["diagnosis_context"] = (
-                    f"{kb_context}\n\n{existing}" if existing else kb_context
-                )
+                context_parts.append(f"## 相關歷史知識\n{kb_context}")
+            if existing:
+                context_parts.append(existing)
+            if context_parts:
+                llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)
 
             llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
                 incident_id=incident.incident_id,