fix(aiops): ADR-072 P0 Bug 修復 — BUG-001/002/003
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running

BUG-001 drift_interpreter: nvidia_provider 已重構為 NvidiaProviderResult 物件(非 4-tuple)
  → 改用 Ollama httpx 直接呼叫 qwen2.5:7b-instruct,繞過 nvidia_provider
  → 消除所有 K8s config drift 告警的 "too many values to unpack" 永久失敗

BUG-002 deployment_name="unknown": 主機層告警(HostHighCpuLoad 等)無 component/job/pod label
  → _auto_execute() 新增 _resolve_target_from_k8s() 補救
  → K8s MCP kubectl get pods 動態查詢受影響 Pod,去掉 hash suffix 得到 deployment name

BUG-003 無效 deployment 通過 safety guard:
  → _auto_execute() safety guard 通過後加入 _verify_k8s_deployment_exists() 存在性確認
  → K8s 中找不到 deployment/pod → 拒絕執行,寫入 DecisionToken.error
  → K8s MCP 不可用時保守放行(不阻塞主流程)

2026-04-11 Claude Sonnet 4.6 Asia/Taipei

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-11 20:20:25 +08:00
parent 2ad2a7ba45
commit 2185e1755c
2 changed files with 166 additions and 18 deletions

View File

@@ -441,6 +441,113 @@ async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
_sl.get_logger(__name__).debug("playbook_draft_failed", error=str(e))
def _deployment_name_from_pod(pod: str) -> str:
    """Reduce a pod name to a best-effort owning workload name.

    A name whose last two dash-separated segments are 9-10 and 5 characters
    long is treated as ``<deployment>-<replicaset-hash>-<pod-suffix>`` and
    both segments are dropped. Any other dashed name loses only its final
    segment (e.g. an ordinal such as ``redis-0``). A name without a dash is
    returned unchanged.
    """
    parts = pod.rsplit("-", 2)
    if len(parts) == 3 and len(parts[2]) == 5 and len(parts[1]) in (9, 10):
        return parts[0]
    if len(parts) >= 2:
        return "-".join(parts[:-1])
    return pod


async def _resolve_target_from_k8s(incident: "Incident", namespace: str) -> str | None:
    """Guess the affected deployment for an alert that carries no workload label.

    BUG-002 remediation: host-level alerts (HostHighCpuLoad, ...) have no
    component/job/pod label, so the target would otherwise stay "unknown".
    This lists the pods in ``namespace`` through the K8s MCP provider and
    picks one:

    1. If the alert name has keywords configured, the first pod whose name
       contains any keyword.
    2. Otherwise (or when no keyword matched), the first pod that is not part
       of the monitoring stack (prometheus / alertmanager / grafana).

    Args:
        incident: Incident whose first signal's ``alertname`` label drives
            the keyword lookup.
        namespace: Namespace to list pods in.

    Returns:
        The derived deployment name, or ``None`` when the provider is
        disabled, the query fails, no suitable pod exists, or any exception
        is raised (best-effort; never propagates).
    """
    try:
        from src.plugins.mcp.providers.k8s_provider import K8sProvider

        k8s = K8sProvider()
        if not k8s.enabled:
            return None

        alertname = ""
        if incident.signals:
            alertname = incident.signals[0].labels.get("alertname", "")

        result = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "pods", "namespace": namespace, "output": "name"},
        )
        if not result.get("success"):
            return None

        # "-o name" output is one "pod/<name>" entry per line; drop blanks.
        pods = [
            name
            for line in (result.get("output", "") or "").splitlines()
            if (name := line.removeprefix("pod/").strip())
        ]
        if not pods:
            return None

        # alertname -> pod-name keywords for common host-level alerts.
        alertname_keywords: dict[str, list[str]] = {
            "HostHighCpuLoad": ["api", "web"],
            "HostOutOfMemory": ["api", "web"],
            "DockerContainerUnhealthy": [],
            "HostHighDiskUsage": [],
        }
        keywords = alertname_keywords.get(alertname, [])

        # Keyword pass only runs when there is something to match; with an
        # empty keyword list it must not short-circuit to the first pod,
        # otherwise the infra exclusion below would never apply.
        if keywords:
            for pod in pods:
                if any(kw in pod for kw in keywords):
                    return _deployment_name_from_pod(pod)

        # Fallback: first pod that does not belong to the monitoring stack.
        infra_markers = ("prometheus", "alertmanager", "grafana")
        for pod in pods:
            if not any(marker in pod for marker in infra_markers):
                return _deployment_name_from_pod(pod)
    except Exception as e:
        logger.debug("resolve_target_from_k8s_failed", error=str(e))
    return None
async def _verify_k8s_deployment_exists(target: str, namespace: str) -> bool:
    """Check through the K8s MCP provider that ``target`` really exists.

    BUG-003 remediation: run before auto-execution so that an invalid
    deployment name does not result in a command being sent to the cluster.

    The lookup first asks for a deployment named ``target``; if that fails it
    asks for pods selected by ``app=<target>``, because some alerts map to a
    pod rather than a deployment.

    Args:
        target: Deployment name (or ``app`` label value) to look up.
        namespace: Namespace to search in.

    Returns:
        ``True`` when the deployment lookup succeeds or the pod lookup
        succeeds with non-empty output. ``False`` when neither does.
        Also ``True`` (fail-open, so the main flow is not blocked) when the
        provider is disabled or an exception is raised.
    """
    try:
        from src.plugins.mcp.providers.k8s_provider import K8sProvider

        k8s = K8sProvider()
        if not k8s.enabled:
            # MCP unavailable: let it through and let kubectl report the error.
            return True

        result = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "deployment", "name": target, "namespace": namespace},
        )
        if result.get("success"):
            return True

        # Fall back to pods labelled app=<target>.
        result_pod = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "pod", "namespace": namespace, "selector": f"app={target}"},
        )
        if not result_pod.get("success"):
            return False
        # "output" may be present but None; without the guard .strip() would
        # raise and the except branch below would wrongly report "exists".
        return bool((result_pod.get("output") or "").strip())
    except Exception as e:
        logger.debug("verify_k8s_deployment_exists_failed", target=target, error=str(e))
        # Fail open on unexpected errors.
        return True
async def _fetch_metrics_snapshot(incident: Incident) -> dict:
"""
ADR-071-I: 從 Prometheus 抓取與此 incident 相關的指標快照
@@ -989,6 +1096,13 @@ class DecisionManager:
if incident.signals:
_ns = incident.signals[0].labels.get("namespace", "awoooi-prod")
import re as _re
# BUG-002 修復 2026-04-11: 主機層告警(HostHighCpuLoad 等)無 component/job/pod label
# → affected_services=[] → target="unknown" → safety guard 攔截
# 補救:用 K8s MCP 依 alertname/host label 動態查詢受影響 Pod
if _target == "unknown":
_target = await _resolve_target_from_k8s(incident, _ns) or "unknown"
action = action.replace("{target}", _target).replace("{namespace}", _ns)
# <xxx> 格式佔位符 → 用 target 替換
action = _re.sub(r"<deployment_name>", _target, action)
@@ -1017,6 +1131,28 @@ class DecisionManager:
)
return
# BUG-003 修復 2026-04-11: 加入 K8s deployment 存在性驗證,
# 避免 LLM 產生的無效 deployment name(<placeholder>/alertname/unknown)通過 safety guard,
# 但仍對 K8s 發出錯誤指令
if _target and _target != "unknown":
_k8s_verified = await _verify_k8s_deployment_exists(_target, _ns)
if not _k8s_verified:
logger.warning(
"auto_execute_blocked_deployment_not_found",
incident_id=incident.incident_id,
target=_target,
namespace=_ns,
reason="K8s 中找不到此 deployment/pod拒絕執行",
)
token.state = DecisionState.ERROR
token.error = f"Auto-execute blocked: deployment '{_target}' not found in K8s namespace '{_ns}'"
await self._save_token(token)
_fire_and_forget(
_push_auto_repair_result(incident, action, success=False,
error=f"K8s 中找不到 deployment '{_target}',請人工確認後手動執行")
)
return
try:
# 延遲導入避免循環依賴
from src.models.approval import ApprovalRequest, ApprovalStatus

View File

@@ -16,7 +16,6 @@ Drift Interpreter - Phase 25 P2 Config Drift Detection
from __future__ import annotations
import asyncio
import json
from typing import TYPE_CHECKING
@@ -101,29 +100,42 @@ class NemotronDriftInterpreter:
return "\n".join(lines) if lines else "(均為白名單欄位)"
async def _call_nemotron(self, prompt: str) -> DriftInterpretation:
    """Run intent analysis on ``prompt`` with the local Ollama model.

    BUG-001 fix (2026-04-11): nvidia_provider was refactored to return an
    NvidiaProviderResult object instead of a 4-tuple, so the old unpacking
    call always failed. This method now posts directly to Ollama's
    ``/api/generate`` endpoint over httpx and bypasses nvidia_provider.
    The method name is kept for existing callers.

    Args:
        prompt: Full prompt text sent to the model.

    Returns:
        The parsed interpretation on success; otherwise the result of
        ``self._unknown_result(...)`` describing an empty response, a
        timeout, or any other error. Never raises.
    """
    import httpx

    OLLAMA_URL = "http://192.168.0.111:11434"
    MODEL = "qwen2.5:7b-instruct"
    TIMEOUT = 45.0

    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": MODEL,
                    "prompt": prompt,
                    "stream": False,
                    "options": {"temperature": 0.2, "num_predict": 200},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            # "response" may be missing or null; treat both as empty text.
            response_text = (data.get("response") or "").strip()

        if not response_text:
            return self._unknown_result("Ollama 回傳空值")

        return self._parse_response(response_text)

    except httpx.TimeoutException:
        logger.warning("drift_interpreter_timeout", model=MODEL)
        return self._unknown_result("Ollama 超時")
    except Exception as e:
        logger.warning("drift_interpreter_error", error=str(e))
        return self._unknown_result(str(e))
def _parse_response(self, text: str) -> DriftInterpretation: