# Pod-name keywords to prefer for each known host-level alert name.
# An empty list means "no preference": fall straight through to the first
# non-infrastructure pod.
_HOST_ALERT_POD_KEYWORDS: dict[str, list[str]] = {
    "HostHighCpuLoad": ["api", "web"],
    "HostOutOfMemory": ["api", "web"],
    "DockerContainerUnhealthy": [],
    "HostHighDiskUsage": [],
}

# Substrings identifying monitoring-stack pods that must never be picked as an
# automatic repair target.
_INFRA_POD_MARKERS: tuple[str, ...] = ("prometheus", "alertmanager", "grafana")


def _deployment_name_from_pod(pod: str) -> str:
    """Derive the owning workload name from a pod name.

    ``<name>-<9|10 char hash>-<5 char suffix>`` is treated as a Deployment pod
    and yields ``<name>``. Any other dashed name loses only its last segment
    (e.g. ``web-0`` -> ``web``). A name without dashes is returned unchanged.
    """
    parts = pod.rsplit("-", 2)
    if len(parts) == 3 and len(parts[2]) == 5 and len(parts[1]) in (9, 10):
        return parts[0]
    if len(parts) >= 2:
        return "-".join(parts[:-1])
    return pod


def _select_target_from_pods(pod_lines: list[str], alertname: str) -> str | None:
    """Pick the most plausible workload name from ``kubectl get -o name`` lines.

    A pod whose name contains one of the keywords mapped to ``alertname`` wins.
    Otherwise the first pod that is not part of the monitoring stack is used.
    Returns ``None`` when no candidate remains.
    """
    # Strip before removing the prefix so lines with leading whitespace are
    # still recognised as "pod/<name>".
    pods = [name for line in pod_lines if (name := line.strip().removeprefix("pod/"))]

    keywords = _HOST_ALERT_POD_KEYWORDS.get(alertname, [])
    if keywords:
        for pod in pods:
            if any(kw in pod for kw in keywords):
                return _deployment_name_from_pod(pod)

    # No keyword preference, or no keyword hit: take the first non-infra pod.
    # This branch must also run when `keywords` is empty; otherwise an infra
    # pod listed first would be returned as the repair target.
    for pod in pods:
        if not any(marker in pod for marker in _INFRA_POD_MARKERS):
            return _deployment_name_from_pod(pod)

    return None


async def _resolve_target_from_k8s(incident: "Incident", namespace: str) -> str | None:
    """Resolve a repair target for host-level alerts that carry no workload label.

    BUG-002 mitigation: host-level alerts have no component/job/pod label, so
    the target would otherwise stay "unknown". This lists the pods in
    ``namespace`` through the K8s MCP provider (``kubectl_get`` with
    ``output=name``) and guesses the affected workload from the ``alertname``
    label of the incident's first signal.

    Args:
        incident: Incident whose first signal's labels supply ``alertname``.
        namespace: Kubernetes namespace to list pods in.

    Returns:
        The derived workload name (pod name with its generated suffix removed),
        or ``None`` when the provider is disabled, the query fails, no suitable
        pod exists, or any exception occurs (logged at debug level).
    """
    try:
        from src.plugins.mcp.providers.k8s_provider import K8sProvider

        k8s = K8sProvider()
        if not k8s.enabled:
            return None

        alertname = ""
        if incident.signals:
            alertname = incident.signals[0].labels.get("alertname", "")

        result = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "pods", "namespace": namespace, "output": "name"},
        )
        if not result.get("success"):
            return None

        # `output` may be present but None; normalise to an empty string.
        pod_lines = (result.get("output", "") or "").splitlines()
        return _select_target_from_pods(pod_lines, alertname)

    except Exception as e:
        # Best effort: a failed lookup must not break the decision flow.
        logger.debug("resolve_target_from_k8s_failed", error=str(e))

    return None


async def _verify_k8s_deployment_exists(target: str, namespace: str) -> bool:
    """Check through the K8s MCP provider that ``target`` exists in ``namespace``.

    BUG-003 mitigation: first looks for a deployment named ``target``. If that
    fails, looks for pods selected by ``app=<target>``, since some alerts map
    to pods rather than deployments.

    Args:
        target: Deployment name (or ``app`` label value) to verify.
        namespace: Kubernetes namespace to query.

    Returns:
        ``True`` when the deployment or a matching pod is found. Also ``True``
        when the provider is disabled or an exception occurs: verification is
        deliberately fail-open so kubectl itself reports the error. ``False``
        only when both lookups complete and find nothing.
    """
    try:
        from src.plugins.mcp.providers.k8s_provider import K8sProvider

        k8s = K8sProvider()
        if not k8s.enabled:
            # MCP unavailable: let the request through and let kubectl fail.
            return True

        result = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "deployment", "name": target, "namespace": namespace},
        )
        if result.get("success"):
            return True

        # Fall back to pods; some alerts correspond to a pod, not a deployment.
        result_pod = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "pod", "namespace": namespace, "selector": f"app={target}"},
        )
        # `output` can be None. Calling .strip() on it would raise, and the
        # fail-open handler below would then report "exists" in exactly the
        # case where nothing was found.
        pod_output = result_pod.get("output") or ""
        return bool(result_pod.get("success") and pod_output.strip())

    except Exception as e:
        logger.debug("verify_k8s_deployment_exists_failed", target=target, error=str(e))
        # Fail open on unexpected errors.
        return True
async def _call_nemotron(self, prompt: str) -> DriftInterpretation:
    """Run intent analysis on ``prompt`` with a local Ollama model.

    BUG-001 fix: the previous implementation unpacked a 4-tuple from
    ``nvidia_provider``, which no longer matches that provider's return type.
    This now posts directly to Ollama's ``/api/generate`` endpoint with httpx.
    The method name is kept for callers even though Nemotron is no longer used.

    Args:
        prompt: Fully rendered prompt text sent to the model.

    Returns:
        The result of ``self._parse_response`` on the model's text. On an empty
        response, a timeout, or any other error, returns
        ``self._unknown_result(...)`` with a short reason instead.
    """
    import httpx

    # NOTE(review): endpoint and model are hard-coded; consider moving them to
    # application settings so other environments can override them.
    OLLAMA_URL = "http://192.168.0.111:11434"
    MODEL = "qwen2.5:7b-instruct"
    TIMEOUT = 45.0

    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": MODEL,
                    "prompt": prompt,
                    "stream": False,
                    "options": {"temperature": 0.2, "num_predict": 200},
                },
            )
            resp.raise_for_status()
            data = resp.json()

        # The body may be a non-dict, or carry `"response": null`. Treat both
        # as an empty answer rather than letting .strip() raise.
        raw_text = data.get("response") if isinstance(data, dict) else None
        response_text = raw_text.strip() if isinstance(raw_text, str) else ""

        if not response_text:
            return self._unknown_result("Ollama 回傳空值")

        return self._parse_response(response_text)

    except httpx.TimeoutException:
        logger.warning("drift_interpreter_timeout", model=MODEL)
        return self._unknown_result("Ollama 超時")
    except Exception as e:
        # Best effort: drift interpretation must never raise to the caller.
        logger.warning("drift_interpreter_error", model=MODEL, error=str(e))
        return self._unknown_result(str(e))