fix(aiops): ADR-072 P0 Bug 修復 — BUG-001/002/003
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
BUG-001 drift_interpreter: nvidia_provider 已重構為回傳 NvidiaProviderResult 物件(非 4-tuple)
→ 改用 Ollama httpx 直接呼叫 qwen2.5:7b-instruct,繞過 nvidia_provider
→ 消除所有 K8s config drift 告警的 "too many values to unpack" 永久失敗

BUG-002 deployment_name="unknown": 主機層告警(HostHighCpuLoad 等)無 component/job/pod label
→ _auto_execute() 新增 _resolve_target_from_k8s() 補救
→ 透過 K8s MCP kubectl get pods 動態查詢受影響 Pod,去掉 hash suffix 得到 deployment name

BUG-003 無效 deployment 通過 safety guard:
→ _auto_execute() 在 safety guard 通過後加入 _verify_k8s_deployment_exists() 存在性確認
→ K8s 中找不到 deployment/pod → 拒絕執行,寫入 DecisionToken.error
→ K8s MCP 不可用時保守放行(不阻塞主流程)

2026-04-11 Claude Sonnet 4.6 Asia/Taipei
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -441,6 +441,113 @@ async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
|
||||
_sl.get_logger(__name__).debug("playbook_draft_failed", error=str(e))
|
||||
|
||||
|
||||
async def _resolve_target_from_k8s(incident: "Incident", namespace: str) -> str | None:
    """
    BUG-002 remedy: guess the affected workload for a host-level alert.

    Host-level alerts (HostHighCpuLoad, ...) carry no component/job/pod label,
    so the pods of ``namespace`` are listed through the K8s MCP provider
    (``kubectl_get`` on ``pods`` with ``output=name``) and one of them is
    picked:

    1. If the alertname of the first signal has keyword hints, the first pod
       whose name contains any hint wins.
    2. Otherwise (no hints, or no pod matched) the first pod that is not part
       of the monitoring stack (prometheus / alertmanager / grafana) wins.

    Args:
        incident: Incident whose first signal's ``alertname`` label is read.
        namespace: Namespace whose pods are listed.

    Returns:
        The workload name derived from the chosen pod name, or ``None`` when
        the provider is disabled, the query fails, no suitable pod exists, or
        any exception occurs (best-effort: failures are logged at debug level
        and never propagate).

    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
    """
    # alertname -> name fragments of the workloads most likely responsible.
    # An empty list means "no hint": go straight to the non-infra fallback.
    alert_keywords: dict[str, list[str]] = {
        "HostHighCpuLoad": ["api", "web"],
        "HostOutOfMemory": ["api", "web"],
        "DockerContainerUnhealthy": [],
        "HostHighDiskUsage": [],
    }
    infra_fragments = ("prometheus", "alertmanager", "grafana")

    try:
        # Imported lazily, as in the rest of this function's call chain.
        from src.plugins.mcp.providers.k8s_provider import K8sProvider

        k8s = K8sProvider()
        if not k8s.enabled:
            return None

        alertname = ""
        if incident.signals:
            alertname = incident.signals[0].labels.get("alertname", "")

        result = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "pods", "namespace": namespace, "output": "name"},
        )
        if not result.get("success"):
            return None

        # `-o name` style output is expected ("pod/<name>" per line); blank
        # lines are dropped and a missing/None output is treated as empty.
        pods = [
            name
            for line in (result.get("output") or "").splitlines()
            if (name := line.strip().removeprefix("pod/").strip())
        ]
        if not pods:
            return None

        # Pass 1: keyword match. Only runs when there ARE keywords; previously
        # an empty keyword list made this pass return the very first pod,
        # which skipped the infra filter below.
        keywords = alert_keywords.get(alertname, [])
        if keywords:
            for pod in pods:
                if any(kw in pod for kw in keywords):
                    return _deployment_name_from_pod(pod)

        # Pass 2: first pod that does not belong to the monitoring stack.
        for pod in pods:
            if not any(fragment in pod for fragment in infra_fragments):
                return _deployment_name_from_pod(pod)

    except Exception as e:
        # Best-effort lookup: the caller falls back to "unknown" on None.
        logger.debug("resolve_target_from_k8s_failed", error=str(e))

    return None


def _deployment_name_from_pod(pod: str) -> str:
    """
    Strip the generated suffix from a pod name to get its workload name.

    - ``<name>-<9|10 chars>-<5 chars>`` (Deployment-style: ReplicaSet hash
      plus pod suffix) -> ``<name>``
    - any other name containing ``-`` -> the last segment is dropped
      (e.g. ``redis-master-0`` -> ``redis-master``)
    - a name without ``-`` is returned unchanged

    NOTE(review): a bare pod whose real name contains ``-`` also loses its
    last segment; callers should verify the result exists before acting on it.
    """
    parts = pod.rsplit("-", 2)
    if len(parts) == 3 and len(parts[2]) == 5 and len(parts[1]) in (9, 10):
        return parts[0]
    if len(parts) >= 2:
        return "-".join(parts[:-1])
    return pod
|
||||
|
||||
|
||||
async def _verify_k8s_deployment_exists(target: str, namespace: str) -> bool:
    """
    BUG-003 remedy: ask the K8s MCP provider whether ``target`` really exists.

    Checks, in order:

    1. ``kubectl_get`` on ``deployment`` named ``target`` in ``namespace``.
    2. ``kubectl_get`` on ``pod`` with selector ``app=<target>`` (some alerts
       map to pods rather than deployments); this counts only when the call
       succeeds AND lists at least one pod.

    Args:
        target: Deployment name (or ``app`` label value) to look up.
        namespace: Namespace to look in.

    Returns:
        ``True`` when the target was found, and also when the check could not
        be performed (provider disabled or an exception was raised): this is
        a deliberate fail-open so an unavailable MCP does not block the main
        flow. ``False`` when the target is blank or was looked up and not
        found.

    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
    """
    # A blank name can never identify a deployment; without this guard the
    # first query would be issued with an empty name.
    if not target or not target.strip():
        return False

    try:
        from src.plugins.mcp.providers.k8s_provider import K8sProvider

        k8s = K8sProvider()
        if not k8s.enabled:
            # MCP unavailable -> let it through and let kubectl report errors.
            return True

        result = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "deployment", "name": target, "namespace": namespace},
        )
        if result.get("success"):
            return True

        # Fall back to pods labelled app=<target>.
        result_pod = await k8s.execute(
            tool_name="kubectl_get",
            params={"resource": "pod", "namespace": namespace, "selector": f"app={target}"},
        )
        if not result_pod.get("success"):
            return False

        # `output` may be missing or None; calling .strip() on None used to
        # raise, and the broad handler below then turned that into an
        # accidental "exists".
        pod_output = (result_pod.get("output") or "").strip()
        # An empty selector match is not an error for kubectl, so success
        # alone proves nothing. The "No resources found" check covers the case
        # where the provider folds that message into `output` — not confirmed
        # from this file, kept as a harmless safeguard.
        return bool(pod_output) and "No resources found" not in pod_output

    except Exception as e:
        logger.debug("verify_k8s_deployment_exists_failed", target=target, error=str(e))
        # Fail open on unexpected errors (documented conservative policy).
        return True
|
||||
|
||||
|
||||
async def _fetch_metrics_snapshot(incident: Incident) -> dict:
|
||||
"""
|
||||
ADR-071-I: 從 Prometheus 抓取與此 incident 相關的指標快照
|
||||
@@ -989,6 +1096,13 @@ class DecisionManager:
|
||||
if incident.signals:
|
||||
_ns = incident.signals[0].labels.get("namespace", "awoooi-prod")
|
||||
import re as _re
|
||||
|
||||
# BUG-002 修復 2026-04-11: 主機層告警(HostHighCpuLoad 等)無 component/job/pod label
|
||||
# → affected_services=[] → target="unknown" → safety guard 攔截
|
||||
# 補救:用 K8s MCP 依 alertname/host label 動態查詢受影響 Pod
|
||||
if _target == "unknown":
|
||||
_target = await _resolve_target_from_k8s(incident, _ns) or "unknown"
|
||||
|
||||
action = action.replace("{target}", _target).replace("{namespace}", _ns)
|
||||
# <xxx> 格式佔位符 → 用 target 替換
|
||||
action = _re.sub(r"<deployment_name>", _target, action)
|
||||
@@ -1017,6 +1131,28 @@ class DecisionManager:
|
||||
)
|
||||
return
|
||||
|
||||
# BUG-003 修復 2026-04-11: 加入 K8s deployment 存在性驗證,
|
||||
# 避免 LLM 產生的無效 deployment name(<placeholder>/alertname/unknown)通過 safety guard
|
||||
# 但仍對 K8s 發出錯誤指令
|
||||
if _target and _target != "unknown":
|
||||
_k8s_verified = await _verify_k8s_deployment_exists(_target, _ns)
|
||||
if not _k8s_verified:
|
||||
logger.warning(
|
||||
"auto_execute_blocked_deployment_not_found",
|
||||
incident_id=incident.incident_id,
|
||||
target=_target,
|
||||
namespace=_ns,
|
||||
reason="K8s 中找不到此 deployment/pod,拒絕執行",
|
||||
)
|
||||
token.state = DecisionState.ERROR
|
||||
token.error = f"Auto-execute blocked: deployment '{_target}' not found in K8s namespace '{_ns}'"
|
||||
await self._save_token(token)
|
||||
_fire_and_forget(
|
||||
_push_auto_repair_result(incident, action, success=False,
|
||||
error=f"K8s 中找不到 deployment '{_target}',請人工確認後手動執行")
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
# 延遲導入避免循環依賴
|
||||
from src.models.approval import ApprovalRequest, ApprovalStatus
|
||||
|
||||
@@ -16,7 +16,6 @@ Drift Interpreter - Phase 25 P2 Config Drift Detection
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -101,29 +100,42 @@ class NemotronDriftInterpreter:
|
||||
return "\n".join(lines) if lines else "(均為白名單欄位)"
|
||||
|
||||
async def _call_nemotron(self, prompt: str) -> DriftInterpretation:
    """
    Run intent analysis on ``prompt`` with the local Ollama model.

    BUG-001 fix 2026-04-11: nvidia_provider was refactored to return a
    NvidiaProviderResult object instead of a 4-tuple, so the old
    ``text, success, tokens, cost = await nvidia.chat(...)`` unpacking failed
    with "too many values to unpack" on every call. This method now posts
    directly to Ollama's ``/api/generate`` endpoint via httpx and no longer
    touches nvidia_provider. The method name is kept so existing callers
    keep working.

    Args:
        prompt: Fully rendered prompt text sent to the model.

    Returns:
        The result of ``self._parse_response`` on the model's text, or
        ``self._unknown_result(<reason>)`` when the response is empty, the
        request times out, or any other error occurs. Never raises for
        request/parse failures.
    """
    import httpx

    # NOTE(review): endpoint and model are hard-coded here; consider moving
    # them to settings if other environments need different values.
    OLLAMA_URL = "http://192.168.0.111:11434"
    MODEL = "qwen2.5:7b-instruct"
    TIMEOUT = 45.0

    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": MODEL,
                    "prompt": prompt,
                    "stream": False,
                    "options": {"temperature": 0.2, "num_predict": 200},
                },
            )
            resp.raise_for_status()
            data = resp.json()

        # Treat a missing or null "response" field the same as an empty one.
        response_text = (data.get("response") or "").strip()
        if not response_text:
            return self._unknown_result("Ollama 回傳空值")

        return self._parse_response(response_text)

    except httpx.TimeoutException:
        logger.warning("drift_interpreter_timeout", model=MODEL)
        return self._unknown_result("Ollama 超時")
    except Exception as e:
        # Boundary handler: drift interpretation is advisory, so any failure
        # degrades to an "unknown" interpretation instead of propagating.
        logger.warning("drift_interpreter_error", error=str(e))
        return self._unknown_result(str(e))
|
||||
|
||||
def _parse_response(self, text: str) -> DriftInterpretation:
|
||||
|
||||
Reference in New Issue
Block a user