Files
awoooi/apps/api/src/agents/critic_agent.py
Your Name fefe4c21cd
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(inc-20260425): A1+A2 後續 — Solver/Critic timeout + auto_repair 接線 + Runbook + Grafana
延續 595629c0 INC-20260425 修復,補三段 Agent + 全鏈路觀測:

A1 後續 — Solver/Critic 三段 timeout 接線:
- solver_agent.py: AGENT_SOLVER_TIMEOUT_SEC=20.0(env override)
- critic_agent.py: AGENT_CRITIC_TIMEOUT_SEC=15.0(env override)
- protocol.py: 三 Agent 共用 observe_agent_step() 包裹呼叫
  · success/timeout/error outcome label
  · histogram 寫入 aiops_agent_step_duration_seconds

A2 後續 — auto_repair_service 改用 _diagnose_fallback_chain:
- auto_repair_service.py +46 行 — 切換 DIAGNOSE 路由到新 chain(NEMO→GEMINI→CLAUDE)
- 完全避開 Ollama CPU 238s 二次 timeout

新增 metrics:
- core/metrics.py +59 行 — 配合 observe_agent_step 的 histogram bucket + label cardinality

新增測試 (862 行):
- test_agent_step_timeouts.py (475) — 三 Agent 各 timeout 邊界 + outcome label
- test_ai_router_diagnose_fallback.py (387) — _diagnose_fallback_chain 正確序

新增配套:
- docs/runbooks/RUNBOOK-AGENT-STEP-LATENCY.md (350) — INC 故障排查 + 觀測指引
- ops/monitoring/grafana/agent_step_latency_rules.yaml (160)
  · 三 Agent histogram alert rules(p99 > timeout 80% → warning)

驗收: 33 tests pass (test_agent_step_timeouts 22 + test_ai_router_diagnose_fallback 11)

INC-20260425 雙修總工作量(595629c0 + 此 commit):
  · 5 個 service/agent 檔修改
  · 1 個新 observability 模組
  · 4 個新測試/配套檔
  · 1372+187 = 1559 行新增

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 (INC-20260425 後續) <noreply@anthropic.com>
2026-04-27 08:15:53 +08:00

273 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
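AWOOOI AIOps Phase 2 — Critic Agent (the challenger)
====================================================
Responsibility: deliberately play devil's advocate to prevent hallucination and echo chambers.
Input: DiagnosisReport + ActionPlan (both are reviewed).
Output: CriticReport (a challenges[] list + overall_assessment).
Design principles:
1. The Critic's job is to find holes, not to say nice things (guards against sycophancy).
2. The prompt forces critical thinking: "If the diagnosis is wrong, what are 3 other possibilities?"
3. challenge_count > 0 is one of the Phase 2 exit conditions.
4. If the Critic finds a serious flaw in the Diagnostician 3 times in a row → flag the Diagnostician as unstable (to be implemented in Phase 4).
5. Circuit-breaker degradation: on LLM failure → emit empty challenges without blocking the Coordinator.
6. Critic and Reviewer run in parallel (neither blocks the other).
ADR-082: Phase 2 multi-agent collaboration
2026-04-15 ogt + Claude Sonnet 4.6 (Asia-Pacific): initial Phase 2 version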
"""
from __future__ import annotations
import asyncio
import hashlib
import os
import time
from typing import Any
import structlog
from src.agents.base import BaseAgent
from src.agents.protocol import (
ActionPlan,
AgentRole,
AgentVote,
Challenge,
CriticReport,
DiagnosisReport,
)
from src.observability.agent_step_metrics import observe_agent_step
from src.services.sanitization_service import sanitize
logger = structlog.get_logger(__name__)
# Upper bound on the number of challenges kept from a single Critic response
# (guards against the LLM generating an unbounded list of objections).
MAX_CHALLENGES = 5
# 2026-04-27 Claude Sonnet 4.6: A1 — per-agent timeout split + step metric
# (North Star §1.2 "Observable by Default").
# Background: alerts INC-20260425-8D17BB / 3B6C39 saw AI confidence drop to 20%.
# OpenClaw NIM (192.168.0.188:8088) was measured at 2-27s; previously every step
# shared PHASE2_STEP_TIMEOUT_SEC=20.0.
# The Critic only performs a critical review (shortest prompt, simplest output), so
# it gets the smallest timeout (15s) to leave global budget for Diagnostician/Solver.
# Env override: can be tuned at deploy time through a K8s ConfigMap, no image rebuild.
_DEFAULT_CRITIC_TIMEOUT_SEC = 15.0


def _timeout_from_env(name: str, default: float) -> float:
    """Read a timeout in seconds from environment variable *name*.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, blank, or invalid.

    Returns:
        The parsed value when it is a finite number greater than zero,
        otherwise *default*.

    A malformed override must not crash the module at import time, and a
    zero / negative / NaN / infinite value must not reach ``asyncio.wait_for``
    (a non-positive timeout makes every call time out immediately), so such
    values are rejected with a ``RuntimeWarning`` and the default is used.
    """
    # Local imports keep this block self-contained (no file-level import change).
    import math
    import warnings

    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = float(raw)
    except ValueError:
        value = math.nan
    if math.isfinite(value) and value > 0:
        return value
    warnings.warn(
        f"Ignoring invalid {name}={raw!r}; falling back to {default}s",
        RuntimeWarning,
        stacklevel=2,
    )
    return default


AGENT_CRITIC_TIMEOUT_SEC: float = _timeout_from_env(
    "AGENT_CRITIC_TIMEOUT_SEC", _DEFAULT_CRITIC_TIMEOUT_SEC
)
# Backward-compatible alias, kept only for existing importers.
# DEPRECATED (2026-04-27): use AGENT_CRITIC_TIMEOUT_SEC; this alias is scheduled
# for removal in the next sprint.
PHASE2_STEP_TIMEOUT_SEC = AGENT_CRITIC_TIMEOUT_SEC
class CriticAgent(BaseAgent):
    """
    Critic Agent — the systematic skeptic.

    Builds a critique prompt from the diagnosis' top hypothesis and the plan's
    top candidate action, sends it through OpenClaw, and turns the reply into a
    CriticReport. The vote is REJECT when any challenge is rated "critical",
    APPROVE otherwise, and ABSTAIN (with degraded=True) when the LLM step
    fails, times out, or raises.

    Usage:
        agent = CriticAgent()
        report = await agent.run(diagnosis, plan)
    """

    AGENT_NAME = AgentRole.CRITIC.value
    AGENT_DESCRIPTION = (
        "Devil's advocate. Challenges diagnosis and proposed actions to prevent "
        "hallucination and echo chamber effects."
    )

    async def run(
        self,
        diagnosis: DiagnosisReport,
        plan: ActionPlan,
        timeout_sec: float = 0.0,  # noqa: ARG002 — deprecated, kept for signature compatibility
    ) -> CriticReport:
        """
        Critically review the diagnosis and the proposed plan.

        Args:
            diagnosis: Output of the Diagnostician.
            plan: Output of the Solver.
            timeout_sec: Deprecated (2026-04-16 ogt) and ignored. The LLM step
                is bounded by the module-level AGENT_CRITIC_TIMEOUT_SEC instead.

        Returns:
            A CriticReport with latency_ms filled in. Any exception raised
            while critiquing is logged and converted into a degraded report
            (degraded=True, vote=ABSTAIN), so this method does not raise for
            ordinary errors.
        """
        start_ms = int(time.monotonic() * 1000)
        try:
            report = await self._critique(diagnosis, plan)
            # _critique leaves latency at 0; the wall-clock value is set here so
            # that degraded reports returned by _critique are timed as well.
            report.latency_ms = int(time.monotonic() * 1000) - start_ms
            logger.info(
                "critic_done",
                challenges=report.challenge_count,
                has_critical=report.has_critical_challenge,
                vote=report.vote,
                latency_ms=report.latency_ms,
            )
            return report
        except Exception:
            latency = int(time.monotonic() * 1000) - start_ms
            logger.exception("critic_error")
            return self._degraded_report(latency, "error")

    async def _critique(
        self,
        diagnosis: DiagnosisReport,
        plan: ActionPlan,
    ) -> CriticReport:
        """
        Run the LLM critique step and parse its answer.

        Exactly one step metric is recorded per call: "success" when the LLM
        returned a usable response, "timeout" when AGENT_CRITIC_TIMEOUT_SEC
        elapsed, and "error" when the call raised or reported failure.

        Returns:
            A CriticReport with latency_ms=0 (the caller fills it in), or a
            degraded report on timeout / LLM failure.

        Raises:
            Exception: anything raised by the OpenClaw call other than a
                timeout, and anything raised while parsing; run() converts
                these into a degraded report.
        """
        top_hypothesis = diagnosis.top_hypothesis
        top_candidate = plan.top_candidate
        prompt = self._build_prompt({
            "hypothesis": top_hypothesis.description if top_hypothesis else "(無假設)",
            "action": top_candidate.action if top_candidate else "(無方案)",
            "confidence": top_hypothesis.confidence if top_hypothesis else 0.0,
        })
        # Compact summary (each part capped at 300 chars) passed as the alert signal.
        _critic_signal = (
            f"hypothesis={top_hypothesis.description[:300] if top_hypothesis else 'none'}; "
            f"action={top_candidate.action[:300] if top_candidate else 'none'}"
        )
        alert_context = {
            "incident_id": diagnosis.evidence_snapshot_id or "UNKNOWN",
            "severity": "P3",
            "signals": [{"alert_name": "critic_review", "description": _critic_signal}],
            "affected_services": [],
            "intent_hint": "diagnose",
        }
        # Imported at call time, as in the original code.
        from src.services.openclaw import get_openclaw

        openclaw = get_openclaw()
        _step_start = time.monotonic()
        try:
            response_text, _provider, success = await asyncio.wait_for(
                openclaw.call(prompt, alert_context=alert_context),
                timeout=AGENT_CRITIC_TIMEOUT_SEC,
            )
        except asyncio.TimeoutError:
            # 2026-04-27 Claude Sonnet 4.6: A1 — timeout path metric observe
            observe_agent_step("critic", "timeout", time.monotonic() - _step_start)
            logger.warning(
                "critic_step_timeout",
                snapshot_id=diagnosis.evidence_snapshot_id,
                timeout_sec=AGENT_CRITIC_TIMEOUT_SEC,
            )
            return self._degraded_report(0, "step_timeout")
        except Exception:
            # Previously a raising call left no metric at all.
            # NOTE(review): "error" is taken from the success/timeout/error
            # outcome set — confirm observe_agent_step accepts this label.
            observe_agent_step("critic", "error", time.monotonic() - _step_start)
            raise
        elapsed = time.monotonic() - _step_start

        if not success or not response_text:
            # A completed call that reports failure (or returns nothing) must
            # not be counted as a successful step.
            observe_agent_step("critic", "error", elapsed)
            return self._degraded_report(0, "llm_failed")
        # 2026-04-27 Claude Sonnet 4.6: A1 — success path metric observe
        observe_agent_step("critic", "success", elapsed)

        parsed = self._parse_response(sanitize(response_text, "critic_output"))
        challenges = _extract_challenges(parsed)
        # Any critical challenge → vote = REJECT
        vote = AgentVote.REJECT if any(c.severity == "critical" for c in challenges) else AgentVote.APPROVE
        return CriticReport(
            challenges=challenges,
            overall_assessment=str(parsed.get("overall_assessment", ""))[:1000],
            latency_ms=0,
            vote=vote,
        )

    def _build_prompt(self, context: dict[str, Any]) -> str:
        """Render the critique prompt from "hypothesis", "action" and "confidence".

        The prompt text is a runtime string sent to the LLM and is kept verbatim.
        """
        return f"""你是 AWOOOI SRE 系統的質疑者 AgentCritic
你的工作是:找出診斷和方案的弱點。不是說好話,是找漏洞。
當前診斷:{context.get("hypothesis", "")}
當前方案:{context.get("action", "")}
診斷信心:{context.get("confidence", 0.0):.0%}
必須回答以下問題(每個問題產出一個 challenge
1. 如果這個診斷是錯的,還有哪些可能的根因?
2. 這個方案有什麼副作用或風險?
3. 是否有更好的替代方案被忽略了?
每個 challenge 標記嚴重度:
- "minor":小瑕疵,不影響執行
- "major":值得 Coordinator 考慮,但不是阻擋條件
- "critical":嚴重邏輯漏洞,必須阻止此方案執行
以 JSON 回覆:
{{
"challenges": [
{{
"target": "diagnosis",
"argument": "可能是 OOM 但也可能是 code bug需要看 GC logs 確認",
"severity": "major"
}}
],
"overall_assessment": "診斷可信但方案風險偏高"
}}"""

    def _parse_response(self, response: str) -> dict[str, Any]:
        """Parse the LLM reply by delegating to self._extract_json."""
        return self._extract_json(response)

    def analyze(self, context: dict[str, Any]) -> Any:
        """Not supported by this agent; always raises NotImplementedError."""
        raise NotImplementedError("Use run() for Phase 2 agents")

    def _degraded_report(
        self,
        latency_ms: int,
        reason: str = "unknown",
    ) -> CriticReport:
        """Circuit-breaker fallback: empty challenges, ABSTAIN vote, degraded=True.

        Args:
            latency_ms: Latency to record on the report.
            reason: Short failure tag embedded in overall_assessment.
        """
        return CriticReport(
            challenges=[],
            overall_assessment=f"[降級] Critic LLM 失敗({reason}),跳過批判性審查",
            latency_ms=latency_ms,
            vote=AgentVote.ABSTAIN,
            degraded=True,
        )
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _extract_challenges(parsed: dict[str, Any]) -> list[Challenge]:
    """Build Challenge objects from the parsed LLM reply, most severe first.

    Args:
        parsed: Parsed LLM response; its "challenges" entry is expected to be
            a list of objects with "target", "argument" and "severity".

    Returns:
        At most MAX_CHALLENGES challenges ordered critical → major → minor.
        A missing, null or non-list "challenges" value yields an empty list,
        and entries that are not objects are skipped.

    Severity is matched case-insensitively and with surrounding whitespace
    removed, so a reply such as "Critical" or "CRITICAL " is not silently
    downgraded to "minor" (which would turn a REJECT into an APPROVE).
    Anything that is not one of the three known levels falls back to "minor".
    """
    raw = parsed.get("challenges", [])
    if not isinstance(raw, list):
        # e.g. `"challenges": null` — treat exactly like a missing key.
        return []
    severity_order = {"critical": 0, "major": 1, "minor": 2}
    challenges = []
    for item in raw:
        if not isinstance(item, dict):
            continue
        raw_severity = item.get("severity")
        # Only strings can name a level; this also avoids a TypeError when the
        # LLM emits an unhashable value (list/object) for severity.
        severity = raw_severity.strip().lower() if isinstance(raw_severity, str) else "minor"
        if severity not in severity_order:
            severity = "minor"
        challenges.append(
            Challenge(
                target=str(item.get("target", "unknown"))[:50],
                argument=str(item.get("argument", ""))[:500],
                severity=severity,
            )
        )
    # Sort before truncating so the cap never drops a more severe challenge.
    challenges.sort(key=lambda c: severity_order.get(c.severity, 2))
    return challenges[:MAX_CHALLENGES]
def compute_input_hash(diagnosis: DiagnosisReport, plan: ActionPlan) -> str:
    """Return a 16-hex-character fingerprint of the Critic's inputs.

    The fingerprint is the first 16 hex characters of the SHA-256 of the
    snapshot id, the top hypothesis description and the top candidate action,
    concatenated in that order with no separator (kept as-is so existing
    hashes stay valid).

    Args:
        diagnosis: Supplies evidence_snapshot_id and top_hypothesis.
        plan: Supplies top_candidate.

    Returns:
        A 16-character lowercase hex string.

    A missing (None) snapshot id, description or action contributes an empty
    string instead of raising TypeError; the snapshot id is treated as
    optional elsewhere in this module too.
    """
    hypothesis = diagnosis.top_hypothesis
    candidate = plan.top_candidate
    parts = (
        diagnosis.evidence_snapshot_id or "",
        (hypothesis.description or "") if hypothesis else "",
        (candidate.action or "") if candidate else "",
    )
    return hashlib.sha256("".join(parts).encode()).hexdigest()[:16]
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
# Module-wide CriticAgent instance; stays None until the first
# get_critic_agent() call creates it.
_agent: CriticAgent | None = None


def get_critic_agent() -> CriticAgent:
    """Return the shared CriticAgent, creating it on the first call."""
    global _agent
    if _agent is not None:
        return _agent
    _agent = CriticAgent()
    return _agent