# awoooi/apps/api/src/agents/diagnostician_agent.py

"""
AWOOOI AIOps Phase 2 — Diagnostician Agent（偵探）
==================================================
職責：RCA 根因分析

輸入：EvidenceSnapshot（8D 感官情報）
輸出：DiagnosisReport（多根因假設，含 confidence + evidence_chain）

設計原則：
1. 只做診斷，不提解法（Solver 的工作）
2. top-1 confidence < 0.4 → vote = ABSTAIN（情報不足，回傳 Coordinator 判斷）
3. 熔斷降級：LLM 失敗 / 超時 → rule-based mock（以 alert_category 作簡單假設）
4. 所有 LLM 輸出過 SanitizationService（防 Prompt Injection）

ADR-082: Phase 2 多 Agent 協作
2026-04-15 ogt + Claude Sonnet 4.6（亞太）: Phase 2 初始建立
"""

from __future__ import annotations

import asyncio
import hashlib
import json
import os
import time
from typing import TYPE_CHECKING, Any

import structlog

from src.agents.base import BaseAgent, AgentResult, AgentStatus
from src.agents.protocol import (
    AgentRole,
    AgentVote,
    DiagnosisReport,
    Hypothesis,
)
from src.observability.agent_step_metrics import observe_agent_step
from src.services.sanitization_service import sanitize

if TYPE_CHECKING:
    from src.services.evidence_snapshot import EvidenceSnapshot

logger = structlog.get_logger(__name__)

# Upper bound on the evidence-chain length of each hypothesis (guards against exceeding token limits)
MAX_EVIDENCE_CHAIN = 5

# Confidence threshold — a top hypothesis scoring below this value yields vote = ABSTAIN
ABSTAIN_CONFIDENCE_THRESHOLD = 0.4

# 2026-04-27 Claude Sonnet 4.6: A1 — three-way timeout split + step metric (North Star §1.2 Observable by Default)
# Background: for alerts INC-20260425-8D17BB / 3B6C39 the AI confidence dropped to 20%.
#   OpenClaw NIM (192.168.0.188:8088) was measured at 2-27s, while all agents shared PHASE2_STEP_TIMEOUT_SEC=20.0
#   Diagnostician is the heaviest NIM consumer (largest prompt + multi-hypothesis output), so it gets the highest timeout=30s
#   Solver=20s (smaller prompt), Critic=15s (critique only, shortest output)
# Env override: can be tuned at deploy time through a K8s ConfigMap, without rebuilding the image
#
# Compatibility alias (2026-04-27): PHASE2_STEP_TIMEOUT_SEC is kept for external importers (deprecated)
AGENT_DIAGNOSTICIAN_TIMEOUT_SEC: float = float(
    os.getenv("AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", "30.0")
)

# Backward-compatible alias, flagged as deprecated
# DEPRECATED (2026-04-27): use AGENT_DIAGNOSTICIAN_TIMEOUT_SEC; this alias is slated for removal in the next sprint
PHASE2_STEP_TIMEOUT_SEC = AGENT_DIAGNOSTICIAN_TIMEOUT_SEC


class DiagnosticianAgent(BaseAgent):
    """
    Diagnostician Agent — the root-cause-analysis (RCA) "detective".

    Consumes an EvidenceSnapshot and produces a DiagnosisReport holding
    root-cause hypotheses. It diagnoses only; per the module design notes,
    proposing remediation is the Solver's job.

    Usage:
        agent = DiagnosticianAgent()
        report = await agent.run(snapshot)
    """

    AGENT_NAME = AgentRole.DIAGNOSTICIAN.value
    AGENT_DESCRIPTION = "Root Cause Analysis specialist. Produces multiple hypotheses with confidence scores."

    async def run(
        self,
        snapshot: "EvidenceSnapshot",
        timeout_sec: float = 0.0,  # noqa: ARG002 — deprecated, kept for signature compatibility
    ) -> DiagnosisReport:
        """
        Run root-cause analysis for one snapshot.

        Exceptions raised by the analysis are logged and converted into a
        degraded report instead of propagating to the caller.

        Args:
            snapshot: Phase 1 sensory snapshot.
            timeout_sec: Deprecated and ignored (2026-04-16 ogt + Claude Sonnet 4.6).
                The LLM step is bounded by AGENT_DIAGNOSTICIAN_TIMEOUT_SEC inside
                _analyze(); per the original note, the overall flow is additionally
                guarded by the Orchestrator's GLOBAL_TIMEOUT_SEC.

        Returns:
            DiagnosisReport with latency_ms filled in. On a step timeout, an LLM
            failure or an unexpected exception the report has degraded=True and
            vote=ABSTAIN.
        """
        start_ms = int(time.monotonic() * 1000)

        try:
            report = await self._analyze(snapshot)
            # _analyze() leaves latency at 0; the wall-clock latency is measured here.
            report.latency_ms = int(time.monotonic() * 1000) - start_ms
            logger.info(
                "diagnostician_done",
                snapshot_id=snapshot.snapshot_id,
                hypotheses=len(report.hypotheses),
                top_confidence=report.top_confidence,
                vote=report.vote,
                latency_ms=report.latency_ms,
            )
            return report

        except Exception:
            latency = int(time.monotonic() * 1000) - start_ms
            # Include the snapshot id so the failure can be correlated with the
            # incident; getattr keeps the handler itself from raising here.
            logger.exception(
                "diagnostician_error",
                snapshot_id=getattr(snapshot, "snapshot_id", None),
            )
            return self._degraded_report(snapshot, latency, reason="error")

    async def _analyze(self, snapshot: "EvidenceSnapshot") -> DiagnosisReport:
        """Core LLM analysis: build the prompt, call OpenClaw, parse the reply, vote.

        Returns a degraded report when the call exceeds
        AGENT_DIAGNOSTICIAN_TIMEOUT_SEC, reports failure, or yields an empty
        response. Other exceptions propagate to run(), which degrades them.
        """
        prompt = self._build_prompt({
            "evidence_summary": snapshot.evidence_summary or "",
            "anomaly_context": snapshot.anomaly_context,
        })

        # 2026-04-16 ogt + Claude Sonnet 4.6: pass the snapshot's structured data to OPENCLAW_NEMO
        # Root cause: call(prompt) used to be sent without context, so the nemo fallback took
        #       prompt[:500] (the system instructions) as the signal description and the LLM
        #       echoed the agent's own role description back as garbage.
        # Fix: put snapshot.evidence_summary into alert_context.signals so nemo sees real data.
        _evidence = (snapshot.evidence_summary or "（待感應器資料）")[:800]
        alert_context = {
            "incident_id": snapshot.snapshot_id or "UNKNOWN",
            "severity": "P3",
            "signals": [{"alert_name": "evidence_snapshot", "description": _evidence}],
            "affected_services": [],
            "intent_hint": "diagnose",
        }

        from src.services.openclaw import get_openclaw
        openclaw = get_openclaw()
        _step_start = time.monotonic()
        try:
            response_text, _provider, success = await asyncio.wait_for(
                openclaw.call(prompt, alert_context=alert_context),
                timeout=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
            )
            # 2026-04-27 Claude Sonnet 4.6: A1 — success path metric observe
            # NOTE(review): this is recorded whenever the call returns in time, even
            # if the returned `success` flag is False — confirm that is intended.
            observe_agent_step("diagnostician", "success", time.monotonic() - _step_start)
        except asyncio.TimeoutError:
            # 2026-04-27 Claude Sonnet 4.6: A1 — timeout path metric observe
            observe_agent_step("diagnostician", "timeout", time.monotonic() - _step_start)
            logger.warning(
                "diagnostician_step_timeout",
                snapshot_id=snapshot.snapshot_id,
                timeout_sec=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
            )
            return self._degraded_report(snapshot, 0, reason="step_timeout")

        if not success or not response_text:
            return self._degraded_report(snapshot, 0, reason="llm_failed")

        # LLM output is sanitized before it is parsed.
        parsed = self._parse_response(sanitize(response_text, "diagnostician_output"))
        hypotheses = _extract_hypotheses(parsed)

        # Written as a positive `>=` test so a NaN confidence (every comparison
        # with NaN is False) abstains instead of slipping through as APPROVE.
        top_is_confident = (
            bool(hypotheses)
            and hypotheses[0].confidence >= ABSTAIN_CONFIDENCE_THRESHOLD
        )
        vote = AgentVote.APPROVE if top_is_confident else AgentVote.ABSTAIN

        return DiagnosisReport(
            hypotheses=hypotheses,
            evidence_snapshot_id=snapshot.snapshot_id or "",
            latency_ms=0,  # overwritten by run()
            vote=vote,
        )

    def _build_prompt(self, context: dict[str, Any]) -> str:
        """Render the RCA prompt.

        Args:
            context: Reads "evidence_summary" (text shown as the sensory evidence)
                and the optional "anomaly_context" (rendered as a JSON section
                only when truthy).

        Returns:
            The prompt text asking for 2-3 hypotheses in a JSON envelope.
        """
        # `or` rather than a .get() default: _analyze() always supplies the key and
        # passes "" when there is no summary, so a default alone would never apply.
        evidence = context.get("evidence_summary") or "（無感官情報）"
        anomaly_context = context.get("anomaly_context")

        # Phase 4 ADR-084: dynamic anomaly section (appended only when data exists, to avoid blank noise)
        # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 4 8D upgrade
        anomaly_section = ""
        if anomaly_context:
            # default=str: the section is read by the LLM, so values json cannot
            # encode natively are stringified rather than failing the whole diagnosis.
            anomaly_json = json.dumps(
                anomaly_context, ensure_ascii=False, indent=2, default=str
            )
            anomaly_section = f"""
---
Phase 4 動態異常偵測（AI 主動巡檢結果，可作為高信心佐證）：
{anomaly_json}
---"""

        return f"""你是 AWOOOI SRE 系統的偵探 Agent，專職根因分析（Root Cause Analysis）。

你的唯一工作：根據以下感官情報，提出 2-3 個根因假設（hypotheses）。
不要提修復方案，那是 Solver 的工作。
每個假設必須：
1. 有 confidence（0.0-1.0）
2. 列出支持此假設的 evidence key（限 {MAX_EVIDENCE_CHAIN} 個）
3. 有 category（K8s Pod / HostDisk / NetworkLatency / DatabaseConnection / 等）

如果感官情報嚴重不足（所有假設 confidence < 0.4），說明原因。

---
感官情報：
{evidence}
---{anomaly_section}

以 JSON 回覆（不要加任何解釋）：
{{
  "hypotheses": [
    {{
      "description": "假設描述",
      "confidence": 0.85,
      "evidence_chain": ["k8s_state.pod_status", "recent_logs.oom_signal"],
      "category": "KubePodOOM"
    }}
  ]
}}"""

    def _parse_response(self, response: str) -> dict[str, Any]:
        """Parse the (already sanitized) LLM reply by delegating to self._extract_json."""
        return self._extract_json(response)

    def analyze(self, context: dict[str, Any]) -> Any:
        """BaseAgent abstract method — Phase 2 agents use run() as the entry point.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Use run() for Phase 2 agents")

    def _degraded_report(
        self,
        snapshot: "EvidenceSnapshot",
        latency_ms: int,
        reason: str = "unknown",
    ) -> DiagnosisReport:
        """Circuit-breaker fallback: a rule-based report built without the LLM.

        Args:
            snapshot: Snapshot used to guess a coarse alert category.
            latency_ms: Latency to record on the report.
            reason: Short failure reason embedded in the hypothesis description.

        Returns:
            A DiagnosisReport with a single low-confidence (0.2) hypothesis,
            vote=ABSTAIN and degraded=True.
        """
        category = _guess_category_from_snapshot(snapshot)
        return DiagnosisReport(
            hypotheses=[
                Hypothesis(
                    description=f"[降級] 無法完成 LLM 分析（原因: {reason}）。基於告警類別推測: {category}",
                    confidence=0.2,
                    evidence_chain=[],
                    category=category,
                )
            ],
            evidence_snapshot_id=snapshot.snapshot_id or "",
            latency_ms=latency_ms,
            vote=AgentVote.ABSTAIN,
            degraded=True,
        )


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────

def _coerce_confidence(value: Any) -> float:
    """Convert an LLM-supplied confidence into a finite float within [0.0, 1.0].

    Unparseable, missing (None) or non-finite (NaN / inf) values become 0.0,
    the conservative choice for a score that gates the agent's vote.
    """
    import math  # local import: keeps this helper self-contained

    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if not math.isfinite(number):
        return 0.0
    return min(1.0, max(0.0, number))


def _coerce_evidence_chain(value: Any) -> list[str]:
    """Normalize an LLM-supplied evidence chain into at most MAX_EVIDENCE_CHAIN strings."""
    if isinstance(value, str):
        # A bare string is a single evidence key, not a sequence of characters.
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    return [str(entry) for entry in value if entry is not None][:MAX_EVIDENCE_CHAIN]


def _extract_hypotheses(parsed: dict[str, Any]) -> list[Hypothesis]:
    """Extract hypotheses from the parsed LLM output, sorted by confidence (descending).

    Two payload shapes are supported:
    1. Standard: {"hypotheses": [{description, confidence, evidence_chain, category}]}
    2. OpenClaw Nemo: {"action_title": "...", "risk_level": "...", "confidence": 0.85}
       (per the original note, returned by openclaw_nemo via ClawBot
       /api/v1/analyze/incident)

    Malformed fields are tolerated rather than raised on: a bad confidence
    counts as 0.0, values are clamped to [0.0, 1.0], a non-list "hypotheses"
    yields no hypotheses, and non-dict entries are skipped. One bad field
    therefore no longer discards the whole diagnosis.

    2026-04-16 ogt + Claude Sonnet 4.6: fixed the openclaw_nemo format incompatibility
        Root cause: ai_router DIAGNOSE→openclaw_nemo returned the action_title shape while
                    the diagnostician only parsed the hypotheses shape → always 0
                    hypotheses → ABSTAIN

    Args:
        parsed: Decoded JSON object from the LLM reply.

    Returns:
        Hypotheses ordered from highest to lowest confidence; empty when
        nothing usable was found.
    """
    if not isinstance(parsed, dict):
        return []

    # OpenClaw Nemo shape (has action_title but no hypotheses) → one hypothesis
    if "action_title" in parsed and "hypotheses" not in parsed:
        action_title = str(parsed.get("action_title") or "")
        # Only a truly absent key gets the 0.5 default; a present-but-invalid
        # value is treated as no confidence.
        if "confidence" in parsed:
            confidence = _coerce_confidence(parsed["confidence"])
        else:
            confidence = 0.5
        risk_level = str(parsed.get("risk_level", "medium"))
        # risk_level → category mapping
        risk_to_cat = {"critical": "CriticalFailure", "high": "HighRisk",
                       "medium": "ModerateIssue", "low": "LowRisk"}
        category = risk_to_cat.get(risk_level.lower(), "Unknown")
        if not action_title or confidence <= 0:
            return []
        # 2026-04-16 ogt + Claude Sonnet 4.6: prefer `reasoning` as the description.
        # reasoning (why the action is taken) is closer to a root cause than
        # action_title (what to do),
        # e.g. reasoning="CPU 95%, system overloaded" vs action_title="restart Pod"
        nemo_reasoning = str(parsed.get("reasoning", "")).strip()
        description = nemo_reasoning[:500] if len(nemo_reasoning) > 20 else action_title[:500]
        return [Hypothesis(
            description=description,
            confidence=confidence,
            evidence_chain=[],
            category=category,
        )]

    raw = parsed.get("hypotheses")
    if not isinstance(raw, (list, tuple)):
        return []

    hypotheses = [
        Hypothesis(
            description=str(item.get("description") or "")[:500],
            confidence=_coerce_confidence(item.get("confidence")),
            evidence_chain=_coerce_evidence_chain(item.get("evidence_chain")),
            category=str(item.get("category") or ""),
        )
        for item in raw
        if isinstance(item, dict)
    ]
    hypotheses.sort(key=lambda h: h.confidence, reverse=True)
    return hypotheses


def _guess_category_from_snapshot(snapshot: "EvidenceSnapshot") -> str:
    """Guess a coarse alert category from the snapshot summary (degraded-mode fallback).

    The lower-cased evidence summary is scanned for keywords by plain substring
    match; rules are tried in priority order and the first hit wins.

    Returns:
        The matched category name, or "Unknown" when no keyword is present.
    """
    text = (snapshot.evidence_summary or "").lower()
    # Ordered by priority — earlier rules shadow later ones.
    keyword_rules = (
        (("oom", "memory"), "KubePodOOM"),
        (("crashloop",), "KubePodCrashLoop"),
        (("disk",), "HostDiskUsage"),
        (("cpu",), "HostCpuHigh"),
        (("network", "timeout"), "NetworkLatency"),
    )
    for keywords, category in keyword_rules:
        if any(word in text for word in keywords):
            return category
    return "Unknown"


def compute_input_hash(snapshot: "EvidenceSnapshot") -> str:
    """Fingerprint the Diagnostician input (used as the AgentSession input_hash).

    The hashed material is the snapshot id followed by the first 100 characters
    of the evidence summary; a missing value counts as an empty string.

    Returns:
        The first 16 hex characters of the SHA-256 digest of that material.
    """
    snapshot_id = snapshot.snapshot_id or ""
    summary_prefix = (snapshot.evidence_summary or "")[:100]
    material = snapshot_id + summary_prefix
    digest = hashlib.sha256(material.encode())
    return digest.hexdigest()[:16]


# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────

# Module-level cache for the shared agent instance; populated on first use.
_agent: DiagnosticianAgent | None = None


def get_diagnostician_agent() -> DiagnosticianAgent:
    """Return the shared DiagnosticianAgent, creating it on the first call."""
    global _agent
    if _agent is not None:
        return _agent
    _agent = DiagnosticianAgent()
    return _agent