Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20% 根因雙修(統帥批准 A+B):
A1 — 三段 Agent step timeout 拆分(北極星 §1.2 Observable by Default):
- diagnostician_agent.py: PHASE2_STEP_TIMEOUT_SEC=20.0 共用值 → 拆三段
· AGENT_DIAGNOSTICIAN_TIMEOUT_SEC=30.0(NIM 主吃口,最大 prompt + 多假設)
· AGENT_SOLVER_TIMEOUT_SEC=20.0(後續 commit 接線)
· AGENT_CRITIC_TIMEOUT_SEC=15.0(後續 commit 接線)
· env override 支援,K8s ConfigMap 動態調整不需 rebuild
· 保留 PHASE2_STEP_TIMEOUT_SEC alias(DEPRECATED,下 sprint 移除)
- observability/agent_step_metrics.py (58 行) — 新模組:
· aiops_agent_step_duration_seconds Histogram
· observe_agent_step() helper 統一三 Agent 呼叫點
· outcome label ∈ {success, timeout, error}
· agent label ∈ {diagnostician, solver, critic}
A2 — ai_router DIAGNOSE chain 移除 Ollama:
- ai_router.py v4.4 by Claude Sonnet 4.6
· 新增 _diagnose_fallback_chain: NEMO → GEMINI → CLAUDE
· Ollama 永久排除於此 chain(CPU-only 實測 238s,二次 timeout 必爆)
· 新增 aiops_diagnose_fallback_total Prometheus metric
- 根因: NIM timeout 後 fallback 到 Ollama deepseek-r1:14b CPU 238s
→ 二次 timeout → degraded confidence=0.2
Wave8-X2 整合測試補正:
- test_ollama_failover_manager.py: TestSelectProvider 補 mock _check_gemini_quota
原 test 期望 OFFLINE→Gemini,但 quota fail-closed 後沒 mock 會被切到 188
繞過 quota check 後驗純路由邏輯 → 37/37 PASS
Tests: 37 passed (test_ollama_failover_manager 全部)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 (Wave 8 INC-20260425) <noreply@anthropic.com>
337 lines
14 KiB
Python
337 lines
14 KiB
Python
"""
|
||
AWOOOI AIOps Phase 2 — Diagnostician Agent(偵探)
|
||
==================================================
|
||
職責:RCA 根因分析
|
||
|
||
輸入:EvidenceSnapshot(8D 感官情報)
|
||
輸出:DiagnosisReport(多根因假設,含 confidence + evidence_chain)
|
||
|
||
設計原則:
|
||
1. 只做診斷,不提解法(Solver 的工作)
|
||
2. top-1 confidence < 0.4 → vote = ABSTAIN(情報不足,回傳 Coordinator 判斷)
|
||
3. 熔斷降級:LLM 失敗 / 超時 → rule-based mock(以 alert_category 作簡單假設)
|
||
4. 所有 LLM 輸出過 SanitizationService(防 Prompt Injection)
|
||
|
||
ADR-082: Phase 2 多 Agent 協作
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import hashlib
|
||
import json
|
||
import os
|
||
import time
|
||
from typing import TYPE_CHECKING, Any
|
||
|
||
import structlog
|
||
|
||
from src.agents.base import BaseAgent, AgentResult, AgentStatus
|
||
from src.agents.protocol import (
|
||
AgentRole,
|
||
AgentVote,
|
||
DiagnosisReport,
|
||
Hypothesis,
|
||
)
|
||
from src.observability.agent_step_metrics import observe_agent_step
|
||
from src.services.sanitization_service import sanitize
|
||
|
||
if TYPE_CHECKING:
|
||
from src.services.evidence_snapshot import EvidenceSnapshot
|
||
|
||
logger = structlog.get_logger(__name__)

# Maximum evidence-chain length kept per hypothesis (guards against token overflow).
MAX_EVIDENCE_CHAIN = 5

# Confidence threshold — a top hypothesis below this value yields vote = ABSTAIN.
ABSTAIN_CONFIDENCE_THRESHOLD = 0.4

# Built-in default for the Diagnostician LLM step timeout (seconds).
_DEFAULT_DIAGNOSTICIAN_TIMEOUT_SEC = 30.0


def _timeout_from_env(name: str, default: float) -> float:
    """Read a positive, finite timeout (seconds) from environment variable *name*.

    A malformed override must not crash the module at import time, and a
    zero/negative value would make ``asyncio.wait_for`` time out immediately
    (every diagnosis would then be degraded). In both cases a warning is
    logged and *default* is used instead.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, blank or invalid.

    Returns:
        The parsed timeout, or *default*.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = float(raw)
    except ValueError:
        value = float("nan")
    # NaN fails both comparisons; inf would silently disable the timeout.
    if not (0.0 < value < float("inf")):
        logger.warning(
            "agent_timeout_env_invalid",
            env_var=name,
            raw_value=raw,
            fallback_sec=default,
        )
        return default
    return value


# 2026-04-27 Claude Sonnet 4.6: A1 — split the shared step timeout into three
# per-agent timeouts and add a step metric (North Star §1.2 Observable by Default).
# Background: INC-20260425-8D17BB / 3B6C39 — two alerts had AI confidence drop to 20%.
# OpenClaw NIM (192.168.0.188:8088) was measured at 2-27s while the shared
# PHASE2_STEP_TIMEOUT_SEC was 20.0.
# The Diagnostician is the main NIM consumer (largest prompt + multi-hypothesis
# output), so it gets the highest timeout: 30s.
# Solver = 20s (smaller prompt), Critic = 15s (critique only, shortest output).
# Env override: can be tuned at deploy time through a K8s ConfigMap, no image rebuild.
AGENT_DIAGNOSTICIAN_TIMEOUT_SEC: float = _timeout_from_env(
    "AGENT_DIAGNOSTICIAN_TIMEOUT_SEC",
    _DEFAULT_DIAGNOSTICIAN_TIMEOUT_SEC,
)

# Backward-compatible alias kept for external importers.
# DEPRECATED (2026-04-27): use AGENT_DIAGNOSTICIAN_TIMEOUT_SEC; this alias is
# scheduled for removal in the next sprint.
PHASE2_STEP_TIMEOUT_SEC = AGENT_DIAGNOSTICIAN_TIMEOUT_SEC
|
||
|
||
|
||
class DiagnosticianAgent(BaseAgent):
    """Diagnostician agent — the root-cause-analysis (RCA) "detective".

    Turns an EvidenceSnapshot into a DiagnosisReport that carries ranked
    root-cause hypotheses. It only diagnoses; it never proposes fixes.

    Usage:
        agent = DiagnosticianAgent()
        report = await agent.run(snapshot)
    """

    AGENT_NAME = AgentRole.DIAGNOSTICIAN.value
    AGENT_DESCRIPTION = "Root Cause Analysis specialist. Produces multiple hypotheses with confidence scores."

    async def run(
        self,
        snapshot: "EvidenceSnapshot",
        timeout_sec: float = 0.0,  # noqa: ARG002 — deprecated, kept for signature compatibility
    ) -> DiagnosisReport:
        """Run root-cause analysis and never raise to the caller.

        Args:
            snapshot: Phase 1 evidence snapshot.
            timeout_sec: Deprecated and ignored. The LLM step is bounded by
                AGENT_DIAGNOSTICIAN_TIMEOUT_SEC inside ``_analyze``. Per the
                original note, the overall flow is additionally guarded by
                the Orchestrator's GLOBAL_TIMEOUT_SEC (not visible in this
                file — confirm there).

        Returns:
            The DiagnosisReport from ``_analyze`` with ``latency_ms`` filled
            in. If ``_analyze`` raises, a degraded report
            (``degraded=True``, ``vote=ABSTAIN``) is returned instead.
        """
        start_ms = int(time.monotonic() * 1000)

        try:
            report = await self._analyze(snapshot)
            # _analyze leaves latency at 0 (also on its degraded paths); set it here.
            report.latency_ms = int(time.monotonic() * 1000) - start_ms
            logger.info(
                "diagnostician_done",
                snapshot_id=snapshot.snapshot_id,
                hypotheses=len(report.hypotheses),
                top_confidence=report.top_confidence,
                vote=report.vote,
                latency_ms=report.latency_ms,
            )
            return report

        except Exception:
            # Top-level boundary: log with traceback, then degrade instead of raising.
            latency = int(time.monotonic() * 1000) - start_ms
            logger.exception(
                "diagnostician_error",
                snapshot_id=getattr(snapshot, "snapshot_id", None),
            )
            return self._degraded_report(snapshot, latency, reason="error")

    async def _analyze(self, snapshot: "EvidenceSnapshot") -> DiagnosisReport:
        """Call the LLM once, record the step metric, and build the report.

        The step metric is observed exactly once per call with outcome
        ``success``, ``timeout`` or ``error``.

        Returns:
            A DiagnosisReport with ``latency_ms=0`` (``run`` overwrites it).
            On timeout or on a failed/empty LLM response a degraded report
            is returned.

        Raises:
            Exception: anything raised by the LLM call other than a timeout
                is re-raised after the ``error`` metric is recorded.
        """
        prompt = self._build_prompt({
            "evidence_summary": snapshot.evidence_summary or "",
            "anomaly_context": snapshot.anomaly_context,
        })

        # 2026-04-16 ogt + Claude Sonnet 4.6: pass structured snapshot data to OPENCLAW_NEMO.
        # Root cause: call(prompt) used to pass no context, so the nemo fallback took
        # prompt[:500] (the system instructions) as the signal description and the LLM
        # answered with garbage echoing the instructions.
        # Fix: put snapshot.evidence_summary into alert_context.signals so nemo sees real data.
        _evidence = (snapshot.evidence_summary or "(待感應器資料)")[:800]
        alert_context = {
            "incident_id": snapshot.snapshot_id or "UNKNOWN",
            "severity": "P3",
            "signals": [{"alert_name": "evidence_snapshot", "description": _evidence}],
            "affected_services": [],
            "intent_hint": "diagnose",
        }

        # Function-scope import kept from the original (presumably avoids an
        # import cycle — confirm before hoisting).
        from src.services.openclaw import get_openclaw

        openclaw = get_openclaw()
        _step_start = time.monotonic()
        try:
            response_text, _provider, success = await asyncio.wait_for(
                openclaw.call(prompt, alert_context=alert_context),
                timeout=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
            )
        except asyncio.TimeoutError:
            # 2026-04-27 Claude Sonnet 4.6: A1 — timeout path metric observe
            observe_agent_step("diagnostician", "timeout", time.monotonic() - _step_start)
            logger.warning(
                "diagnostician_step_timeout",
                snapshot_id=snapshot.snapshot_id,
                timeout_sec=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
            )
            return self._degraded_report(snapshot, 0, reason="step_timeout")
        except Exception:
            # Record the failed step before run() turns the exception into a
            # degraded report; without this the "error" outcome is never emitted.
            observe_agent_step("diagnostician", "error", time.monotonic() - _step_start)
            raise

        _step_elapsed = time.monotonic() - _step_start

        if not success or not response_text:
            # The provider answered but reported failure or returned nothing:
            # that is an error outcome, not a success.
            observe_agent_step("diagnostician", "error", _step_elapsed)
            logger.warning(
                "diagnostician_llm_failed",
                snapshot_id=snapshot.snapshot_id,
                provider=_provider,
            )
            return self._degraded_report(snapshot, 0, reason="llm_failed")

        # 2026-04-27 Claude Sonnet 4.6: A1 — success path metric observe
        observe_agent_step("diagnostician", "success", _step_elapsed)

        # All LLM output goes through sanitization before it is parsed.
        parsed = self._parse_response(sanitize(response_text, "diagnostician_output"))
        hypotheses = _extract_hypotheses(parsed)

        vote = AgentVote.APPROVE
        if not hypotheses or hypotheses[0].confidence < ABSTAIN_CONFIDENCE_THRESHOLD:
            vote = AgentVote.ABSTAIN

        return DiagnosisReport(
            hypotheses=hypotheses,
            evidence_snapshot_id=snapshot.snapshot_id or "",
            latency_ms=0,  # overwritten by run()
            vote=vote,
        )

    def _build_prompt(self, context: dict[str, Any]) -> str:
        """Render the RCA prompt.

        Args:
            context: Reads ``evidence_summary`` (text; a placeholder is used
                when missing or empty) and ``anomaly_context`` (optional,
                rendered as an extra JSON section when truthy).

        Returns:
            The complete prompt text.
        """
        # `or` rather than a .get() default: the caller always passes the key,
        # possibly as "", and an empty evidence block should show the placeholder.
        evidence = context.get("evidence_summary") or "(無感官情報)"
        anomaly_context = context.get("anomaly_context")

        # Phase 4 ADR-084: dynamic anomaly section (appended only when data
        # exists, to avoid empty noise).
        # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 4 8D upgrade
        anomaly_section = ""
        if anomaly_context:
            anomaly_section = f"""
---
Phase 4 動態異常偵測(AI 主動巡檢結果,可作為高信心佐證):
{json.dumps(anomaly_context, ensure_ascii=False, indent=2)}
---"""

        return f"""你是 AWOOOI SRE 系統的偵探 Agent,專職根因分析(Root Cause Analysis)。

你的唯一工作:根據以下感官情報,提出 2-3 個根因假設(hypotheses)。
不要提修復方案,那是 Solver 的工作。
每個假設必須:
1. 有 confidence(0.0-1.0)
2. 列出支持此假設的 evidence key(限 {MAX_EVIDENCE_CHAIN} 個)
3. 有 category(K8s Pod / HostDisk / NetworkLatency / DatabaseConnection / 等)

如果感官情報嚴重不足(所有假設 confidence < 0.4),說明原因。

---
感官情報:
{evidence}
---{anomaly_section}

以 JSON 回覆(不要加任何解釋):
{{
"hypotheses": [
{{
"description": "假設描述",
"confidence": 0.85,
"evidence_chain": ["k8s_state.pod_status", "recent_logs.oom_signal"],
"category": "KubePodOOM"
}}
]
}}"""

    def _parse_response(self, response: str) -> dict[str, Any]:
        """Parse the sanitized LLM text by delegating to ``self._extract_json``."""
        return self._extract_json(response)

    def analyze(self, context: dict[str, Any]) -> Any:
        """BaseAgent abstract method — Phase 2 agents use ``run()`` instead.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Use run() for Phase 2 agents")

    def _degraded_report(
        self,
        snapshot: "EvidenceSnapshot",
        latency_ms: int,
        reason: str = "unknown",
    ) -> DiagnosisReport:
        """Circuit-breaker fallback: a rule-based report without the LLM.

        Args:
            snapshot: Snapshot used to guess a coarse category.
            latency_ms: Latency to record on the report.
            reason: Short failure tag embedded in the hypothesis description.

        Returns:
            A report with one hypothesis at confidence 0.2, ``vote=ABSTAIN``
            and ``degraded=True``.
        """
        category = _guess_category_from_snapshot(snapshot)
        return DiagnosisReport(
            hypotheses=[
                Hypothesis(
                    description=f"[降級] 無法完成 LLM 分析(原因: {reason})。基於告警類別推測: {category}",
                    confidence=0.2,
                    evidence_chain=[],
                    category=category,
                )
            ],
            evidence_snapshot_id=snapshot.snapshot_id or "",
            latency_ms=latency_ms,
            vote=AgentVote.ABSTAIN,
            degraded=True,
        )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _coerce_confidence(value: Any, fallback: float = 0.0) -> float:
    """Convert an LLM-supplied confidence to a float in [0.0, 1.0].

    Non-numeric values, ``None`` and NaN yield *fallback*; out-of-range
    numbers are clamped.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return fallback
    if number != number:  # NaN is the only value not equal to itself
        return fallback
    return min(1.0, max(0.0, number))


def _extract_hypotheses(parsed: dict[str, Any]) -> list[Hypothesis]:
    """Extract hypotheses from the parsed LLM output, sorted by confidence (descending).

    Two input shapes are supported:
      1. Standard: {"hypotheses": [{description, confidence, evidence_chain, category}]}
      2. OpenClaw Nemo: {"action_title": "...", "risk_level": "...", "confidence": 0.85}
         (what openclaw_nemo returns from ClawBot /api/v1/analyze/incident)

    2026-04-16 ogt + Claude Sonnet 4.6: fixed the openclaw_nemo incompatibility.
    Root cause: ai_router DIAGNOSE -> openclaw_nemo returned the action_title
    shape, the diagnostician only parsed the hypotheses shape, so it always
    got 0 hypotheses and voted ABSTAIN.

    The input is untrusted model output, so malformed pieces are tolerated
    instead of raising: a non-dict payload or a non-list ``hypotheses`` gives
    ``[]``, non-dict items are skipped, an unparseable confidence counts as
    0.0, confidences are clamped to [0.0, 1.0], and a non-list
    ``evidence_chain`` becomes ``[]``.

    Args:
        parsed: Decoded JSON object from the LLM response.

    Returns:
        Hypotheses ordered by confidence, highest first; possibly empty.
    """
    if not isinstance(parsed, dict):
        return []

    # OpenClaw Nemo shape (has action_title but no hypotheses).
    if "action_title" in parsed and "hypotheses" not in parsed:
        # `or ""` keeps a JSON null from turning into the literal text "None".
        action_title = str(parsed.get("action_title") or "")
        # A missing confidence keeps the 0.5 default; an unparseable one becomes 0.0.
        confidence = _coerce_confidence(parsed.get("confidence", 0.5))
        risk_level = str(parsed.get("risk_level", "medium"))
        # risk_level -> category mapping
        risk_to_cat = {
            "critical": "CriticalFailure",
            "high": "HighRisk",
            "medium": "ModerateIssue",
            "low": "LowRisk",
        }
        category = risk_to_cat.get(risk_level.lower(), "Unknown")
        if not action_title or confidence <= 0:
            return []
        # 2026-04-16 ogt + Claude Sonnet 4.6: prefer `reasoning` as the description.
        # reasoning explains *why* an action is taken and is closer to a root cause
        # than action_title (*what* to do),
        # e.g. reasoning="CPU 95%, system overloaded" vs action_title="restart Pod".
        nemo_reasoning = str(parsed.get("reasoning", "")).strip()
        description = nemo_reasoning[:500] if len(nemo_reasoning) > 20 else action_title[:500]
        return [
            Hypothesis(
                description=description,
                confidence=confidence,
                evidence_chain=[],
                category=category,
            )
        ]

    raw = parsed.get("hypotheses")
    if not isinstance(raw, list):
        return []

    hypotheses = []
    for item in raw:
        if not isinstance(item, dict):
            continue
        chain = item.get("evidence_chain")
        if not isinstance(chain, list):
            # null would break slicing; a bare string would be cut into characters.
            chain = []
        hypotheses.append(
            Hypothesis(
                description=str(item.get("description", ""))[:500],
                confidence=_coerce_confidence(item.get("confidence", 0.0)),
                evidence_chain=chain[:MAX_EVIDENCE_CHAIN],
                category=str(item.get("category", "")),
            )
        )
    hypotheses.sort(key=lambda h: h.confidence, reverse=True)
    return hypotheses
|
||
|
||
|
||
def _guess_category_from_snapshot(snapshot: "EvidenceSnapshot") -> str:
    """Guess a coarse alert category from the snapshot summary (degraded-mode fallback).

    The summary is lower-cased and checked against keyword groups in a fixed
    order; the first group with a keyword present decides the category.

    Returns:
        The matched category name, or "Unknown" when nothing matches or the
        summary is empty/None.
    """
    text = (snapshot.evidence_summary or "").lower()
    # Order matters: earlier rules take precedence over later ones.
    keyword_rules = (
        (("oom", "memory"), "KubePodOOM"),
        (("crashloop",), "KubePodCrashLoop"),
        (("disk",), "HostDiskUsage"),
        (("cpu",), "HostCpuHigh"),
        (("network", "timeout"), "NetworkLatency"),
    )
    for keywords, category in keyword_rules:
        if any(word in text for word in keywords):
            return category
    return "Unknown"
|
||
|
||
|
||
def compute_input_hash(snapshot: "EvidenceSnapshot") -> str:
    """Fingerprint the Diagnostician input (used as the AgentSession input_hash).

    The fingerprint is the first 16 hex characters of the SHA-256 of the
    snapshot id followed by the first 100 characters of the evidence
    summary; a missing id or summary counts as an empty string.
    """
    snapshot_id = snapshot.snapshot_id or ""
    summary_head = (snapshot.evidence_summary or "")[:100]
    fingerprint_source = "".join((snapshot_id, summary_head))
    digest = hashlib.sha256(fingerprint_source.encode())
    return digest.hexdigest()[:16]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Module-wide agent instance, created on first use.
_agent: DiagnosticianAgent | None = None


def get_diagnostician_agent() -> DiagnosticianAgent:
    """Return the module-level DiagnosticianAgent, creating it on the first call."""
    global _agent
    if _agent is not None:
        return _agent
    _agent = DiagnosticianAgent()
    return _agent
|