feat(aiops): ADR-070 全自動化方向 — 三大修復

1. auto_approve.py: 允許 high risk 自動執行 (low/medium/high 全開放)
   - min_confidence 0.65→0.50 (信心門檻降低)
   - 新增 DESTRUCTIVE_PATTERNS 攔截真正危險指令 (--replicas=0, scale deployment, delete pod/deployment/pvc/namespace, drop table/database, truncate table)
   - 核心: critical + 破壞性操作 → 人工; 其他 → 全自動

2. decision_manager.py: 新增 _collect_mcp_context()
   - LLM 分析前先收集真實環境狀態 (SSH/K8s MCP)
   - Host/Docker 告警 → ssh_get_container_status + ssh_get_top_processes
   - K8s 告警 → k8s_get_events
   - 注入 diagnosis_context "當前環境狀態 (MCP 實時查詢)" 區段

3. webhooks.py: 修復 target_resource 提取
   - 新增 name/container/job label 提取
   - DockerContainerUnhealthy 不再 target=alertname
   - IP 位址自動排除 (192. 開頭的 instance 不作為 target)

🔴 Tier 3 紅區 — 需首席架構師批准

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 21:39:52 +08:00
parent 99cc420429
commit c439277fc3
3 changed files with 128 additions and 16 deletions
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -1109,13 +1109,19 @@ async def alertmanager_webhook(
        "warning"
    )

-    # 優先用 component label（Docker 層告警用 component，如 SentryDown → "sentry"）
-    # 次優 pod（K8s 告警），再次 instance（blackbox probe），最後 alertname
-    # (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, Bug #5 修正 — affected_services 匹配 Playbook)
+    # target_resource 提取優先順序 (2026-04-11 Claude Sonnet 4.6 全自動化修正)
+    # component (Docker 服務名) > pod (K8s) > name/container (cAdvisor 容器名 / K8s container) > job > instance host（去除 port，排除 192. 開頭的 IP）> alertname
+    # 關鍵：Docker 告警 (DockerContainerUnhealthy/DockerContainerExited) 的容器名在 name label
+    # 過去 fallback 直接用 alertname，導致 target_resource="DockerContainerUnhealthy" 污染整個修復流程
+    _instance = alert.labels.get("instance", "")
+    _instance_clean = _instance.split(":")[0] if _instance and ":" in _instance else _instance
    target_resource = (
        alert.labels.get("component")
        or alert.labels.get("pod")
-        or alert.labels.get("instance")
+        or alert.labels.get("name")          # Cadvisor/cAdvisor 容器名
+        or alert.labels.get("container")     # K8s container name
+        or alert.labels.get("job")           # Prometheus job name（次優）
+        or (_instance_clean if _instance_clean and not _instance_clean.startswith("192.") else None)
        or alertname
    )
    namespace = alert.labels.get("namespace", "default")
--- a/apps/api/src/services/auto_approve.py
+++ b/apps/api/src/services/auto_approve.py
@@ -59,9 +59,11 @@ class AutoApproveConfig:
    """自動執行配置"""

    # 風險等級閾值
-    # 2026-04-01 ogt: 開放 low + medium，讓常見 restart 操作可自動執行
+    # 2026-04-11 Claude Sonnet 4.6: ADR-070 全自動化方向 — low/medium/high 全開放
+    # 真正需要人工的由 DESTRUCTIVE_PATTERNS 攔截（scale=0, delete, drop）
+    # 原: ["low", "medium"] → 導致所有 high risk 告警永遠走人工審核
    allowed_risk_levels: list[str] = field(
-        default_factory=lambda: ["low", "medium"]
+        default_factory=lambda: ["low", "medium", "high"]
    )

    # 信任度閾值
@@ -69,7 +71,9 @@ class AutoApproveConfig:
    # → 改為 0，讓 medium risk + confidence >= 0.65 的操作直接自動執行
    # 歷史原因: min_trust_score=1 導致所有告警永遠走審批，從未自動修復
    min_trust_score: int = 0   # 不要求執行歷史 (原: 1)
-    min_confidence: float = 0.65  # AI 有合理把握即可 (原: 0.90)
+    # 2026-04-11 Claude Sonnet 4.6: ADR-070 全自動化 — 0.5 即可執行
+    # 真正風險由 DESTRUCTIVE_PATTERNS + risk_level=critical 把關
+    min_confidence: float = 0.50  # AI 有基本把握即可 (原: 0.90, 後: 0.65)

    # Playbook 閾值
    # 2026-04-01 ogt: 降低啟動門檻，1次成功記錄即可
@@ -219,6 +223,31 @@ class AutoApprovePolicy:
                confidence=confidence,
            )

+        # 條件 1b: 破壞性指令攔截 (ADR-070: 2026-04-11 Claude Sonnet 4.6)
+        # 即使是 low/medium/high risk，以下操作仍需人工確認
+        # 原則: 可恢復操作 → 自動執行; 不可逆 / 業務衝擊 → 人工
+        _DESTRUCTIVE_PATTERNS = [
+            "--replicas=0",        # scale to zero — 等同停機
+            "scale deployment",    # 任何 scale 操作需確認目標副本數
+            "delete pod",          # 強制刪除 pod
+            "delete deployment",   # 刪除 deployment (不是 restart)
+            "delete pvc",          # 刪除 PVC (資料丟失)
+            "delete namespace",    # 刪除 namespace
+            "drop table",          # DB DDL
+            "drop database",       # DB DDL
+            "truncate table",      # DB DDL
+        ]
+        action_lower = action.lower()
+        for pattern in _DESTRUCTIVE_PATTERNS:
+            if pattern in action_lower:
+                return self._reject(
+                    reason=AutoApproveReason.CRITICAL_OPERATION,
+                    detail=f"Destructive pattern detected: '{pattern}' in action — requires human approval",
+                    risk_level=risk_level,
+                    trust_score=trust_score,
+                    confidence=confidence,
+                )
+
        # 條件 2: 風險等級必須在允許列表中
        if risk_level not in self.config.allowed_risk_levels:
            return self._reject(
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -1384,20 +1384,91 @@ class DecisionManager:
            logger.error("kb_rag_unexpected_error", incident_id=incident.incident_id, error=str(e))
            return ""

+    async def _collect_mcp_context(self, incident: Incident) -> str:
+        """Collect live environment state via MCP before LLM analysis (ADR-070).
+
+        Best-effort: a provider failure is logged at debug level and skipped.
+        - Host/Docker alerts -> SSH MCP container status (+ top processes for
+          CpuLoad/Memory alerts), only for the two allow-listed hosts.
+        - K8s alerts (Kube*/K3s* prefix or a ``pod`` label) -> K8s MCP pod events.
+        Returns the collected lines joined by newlines, or "" if none / no signals.
+        """
+        if not incident.signals:
+            return ""
+
+        labels = incident.signals[0].labels
+        alertname = labels.get("alertname", "")
+        host = labels.get("instance", "").split(":")[0] or labels.get("host", "")
+        # Computed separately so the label lookups still win when affected_services is empty.
+        fallback_service = incident.affected_services[0] if incident.affected_services else ""
+        container = labels.get("name") or labels.get("container") or fallback_service
+        ns = labels.get("namespace", "awoooi-prod")
+
+        ctx_parts: list[str] = []
+
+        # Host/Docker alerts -> diagnose through the SSH MCP provider
+        _HOST_ALERT_PREFIXES = ("Host", "Docker", "Sentry", "Harbor", "Ollama", "Backup")
+        if alertname.startswith(_HOST_ALERT_PREFIXES) and host:
+            try:
+                from src.plugins.mcp.providers.ssh_provider import SSHProvider
+                ssh = SSHProvider()
+                if ssh.enabled and host in ("192.168.0.188", "192.168.0.110"):
+                    # Container status (skipped when the name is only the alertname)
+                    if container and container != alertname:
+                        status_result = await ssh.execute(
+                            tool_name="ssh_get_container_status",
+                            params={"host": host, "container_name": container},
+                        )
+                        if status_result.get("success"):
+                            ctx_parts.append(f"[SSH] 容器 {container} 狀態: {status_result.get('output', '')[:300]}")
+                    # Host resource usage, only for CPU-load / memory alerts
+                    if "CpuLoad" in alertname or "Memory" in alertname:
+                        top_result = await ssh.execute(
+                            tool_name="ssh_get_top_processes",
+                            params={"host": host, "top_n": 5},
+                        )
+                        if top_result.get("success"):
+                            ctx_parts.append(f"[SSH] 主機 {host} Top processes: {top_result.get('output', '')[:300]}")
+            except Exception as e:
+                logger.debug("mcp_context_ssh_failed", alertname=alertname, error=str(e))
+
+        # K8s alerts -> query pod events through the K8s MCP provider
+        if alertname.startswith(("Kube", "K3s")) or labels.get("pod"):
+            try:
+                from src.plugins.mcp.providers.k8s_provider import K8sProvider
+                k8s = K8sProvider()
+                if k8s.enabled:
+                    pod = labels.get("pod", "")
+                    if pod:
+                        events_result = await k8s.execute(
+                            tool_name="k8s_get_events",
+                            params={"namespace": ns, "field_selector": f"involvedObject.name={pod}"},
+                        )
+                        if events_result.get("success"):
+                            ctx_parts.append(f"[K8s] Pod {pod} 事件: {events_result.get('output', '')[:300]}")
+            except Exception as e:
+                logger.debug("mcp_context_k8s_failed", alertname=alertname, error=str(e))
+
+        return "\n".join(ctx_parts)
+
    async def _dual_engine_analyze(
        self,
        incident: Incident,
    ) -> dict[str, Any]:
        """
-        三軌決策分析 (Phase 7.5 升級 + KB Phase 2 RAG 整合)
+        三軌決策分析 (Phase 7.5 升級 + KB Phase 2 RAG 整合 + ADR-070 MCP 前置收集)

        策略:
-        1. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
-        2. Playbook 命中則直接使用 (最快、經驗驗證)
-        3. 否則 LLM + Expert System 雙軌 + KB RAG context 注入
+        1. MCP 前置收集真實環境狀態（ADR-070）
+        2. 先檢查 Playbook 是否有高度匹配 (similarity >= 85%)
+        3. Playbook 命中則直接使用 (最快、經驗驗證)
+        4. 否則 LLM + Expert System 雙軌 + KB RAG context + MCP context 注入

        優先順序: Playbook > LLM > Expert System
        """
+        # ADR-070: 分析前用 MCP 收集真實環境狀態
+        mcp_context = await self._collect_mcp_context(incident)
+
        # Phase 7.5: 先嘗試 Playbook 匹配
        playbook_result = await self._try_playbook_match(incident)
        if playbook_result:
@@ -1418,13 +1489,19 @@ class DecisionManager:
        try:
            signals_dict = [s.model_dump() for s in incident.signals]

-            # 將 KB context 注入 expert_context 傳給 LLM
+            # 將 KB context + MCP 實時狀態 注入 expert_context 傳給 LLM
+            # ADR-070: MCP context 優先放最前面，讓 LLM 看到真實環境狀態再做決策
            llm_expert_context: dict[str, Any] = {**expert_result} if expert_result else {}
+            existing = str(llm_expert_context.get("diagnosis_context", ""))
+            context_parts = []
+            if mcp_context:
+                context_parts.append(f"## 當前環境狀態 (MCP 實時查詢)\n{mcp_context}")
            if kb_context:
-                existing = str(llm_expert_context.get("diagnosis_context", ""))
-                llm_expert_context["diagnosis_context"] = (
-                    f"{kb_context}\n\n{existing}" if existing else kb_context
-                )
+                context_parts.append(f"## 相關歷史知識\n{kb_context}")
+            if existing:
+                context_parts.append(existing)
+            if context_parts:
+                llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)

            llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
                incident_id=incident.incident_id,