Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
延續 595629c0 INC-20260425 修復,補三段 Agent + 全鏈路觀測:
A1 後續 — Solver/Critic 三段 timeout 接線:
- solver_agent.py: AGENT_SOLVER_TIMEOUT_SEC=20.0(env override)
- critic_agent.py: AGENT_CRITIC_TIMEOUT_SEC=15.0(env override)
- protocol.py: 三 Agent 共用 observe_agent_step() 包裹呼叫
· success/timeout/error outcome label
· histogram 寫入 aiops_agent_step_duration_seconds
A2 後續 — auto_repair_service 改用 _diagnose_fallback_chain:
- auto_repair_service.py +46 行 — 切換 DIAGNOSE 路由到新 chain(NEMO→GEMINI→CLAUDE)
- 完全避開 Ollama CPU 238s 二次 timeout
新增 metrics:
- core/metrics.py +59 行 — 配合 observe_agent_step 的 histogram bucket + label cardinality
新增測試 (862 行):
- test_agent_step_timeouts.py (475) — 三 Agent 各 timeout 邊界 + outcome label
- test_ai_router_diagnose_fallback.py (387) — _diagnose_fallback_chain 正確序
新增配套:
- docs/runbooks/RUNBOOK-AGENT-STEP-LATENCY.md (350) — INC 故障排查 + 觀測指引
- ops/monitoring/grafana/agent_step_latency_rules.yaml (160)
· 三 Agent histogram alert rules(p99 > timeout 80% → warning)
驗收: 33 tests pass (test_agent_step_timeouts 22 + test_ai_router_diagnose_fallback 11)
INC-20260425 雙修總工作量(595629c0 + 此 commit):
· 5 個 service/agent 檔修改
· 1 個新 observability 模組
· 4 個新測試/配套檔
· 1372+187 = 1559 行新增
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 (INC-20260425 後續) <noreply@anthropic.com>
273 lines
10 KiB
Python
273 lines
10 KiB
Python
"""
|
||
AWOOOI AIOps Phase 2 — Critic Agent
===================================
Role: deliberately play devil's advocate to prevent hallucination and echo chambers.

Input:  DiagnosisReport + ActionPlan (both are reviewed)
Output: CriticReport (challenges[] list + overall_assessment)

Design principles:
1. The Critic's job is to find holes, not to flatter (anti-sycophancy).
2. The prompt forces critical thinking: "If the diagnosis is wrong, what are 3 other possibilities?"
3. challenge_count > 0 is one of the Phase 2 exit criteria.
4. Three consecutive severe flaws found in the Diagnostician's output → Diagnostician is flagged as unstable (to be implemented in Phase 4).
5. Circuit-breaker degradation: on LLM failure, emit empty challenges (never block the Coordinator).
6. Critic and Reviewer run in parallel (neither blocks the other).

ADR-082: Phase 2 multi-agent collaboration
2026-04-15 ogt + Claude Sonnet 4.6 (APAC): initial Phase 2 implementation
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import hashlib
|
||
import os
|
||
import time
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.agents.base import BaseAgent
|
||
from src.agents.protocol import (
|
||
ActionPlan,
|
||
AgentRole,
|
||
AgentVote,
|
||
Challenge,
|
||
CriticReport,
|
||
DiagnosisReport,
|
||
)
|
||
from src.observability.agent_step_metrics import observe_agent_step
|
||
from src.services.sanitization_service import sanitize
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
# Upper bound on the number of challenges kept per review
# (guards against an LLM emitting an unbounded list of objections).
MAX_CHALLENGES = 5

# 2026-04-27 Claude Sonnet 4.6: A1 — split the shared step timeout into
# per-agent timeouts + step metric (North Star §1.2 Observable by Default).
# Background: for incidents INC-20260425-8D17BB / 3B6C39 the AI confidence of
# two alerts dropped to 20%.
# OpenClaw NIM (192.168.0.188:8088) was measured at 2-27s, while all agents
# previously shared PHASE2_STEP_TIMEOUT_SEC=20.0.
# The Critic only performs a critical review (shortest prompt, simplest output),
# so it gets the smallest timeout (15s) to leave more of the global budget to
# the Diagnostician/Solver.
# Env override: can be tuned at deploy time through a K8s ConfigMap without
# rebuilding the image.
AGENT_CRITIC_TIMEOUT_SEC: float = float(os.getenv("AGENT_CRITIC_TIMEOUT_SEC", "15.0"))

# Backward-compatible alias, marked as deprecated.
# DEPRECATED (2026-04-27): use AGENT_CRITIC_TIMEOUT_SEC; this alias will be
# removed in the next sprint.
PHASE2_STEP_TIMEOUT_SEC = AGENT_CRITIC_TIMEOUT_SEC
|
||
|
||
|
||
class CriticAgent(BaseAgent):
    """
    Critic Agent — the systematic sceptic.

    Takes the top hypothesis of a DiagnosisReport and the top candidate of an
    ActionPlan, asks the LLM to challenge both, and condenses the answer into
    a CriticReport. Timeouts, LLM failures and unexpected exceptions all end in
    a degraded, abstaining report instead of an exception.

    Usage:
        agent = CriticAgent()
        report = await agent.run(diagnosis, plan)
    """

    AGENT_NAME = AgentRole.CRITIC.value
    AGENT_DESCRIPTION = (
        "Devil's advocate. Challenges diagnosis and proposed actions to prevent "
        "hallucination and echo chamber effects."
    )

    async def run(
        self,
        diagnosis: DiagnosisReport,
        plan: ActionPlan,
        timeout_sec: float = 0.0,  # noqa: ARG002 — deprecated, kept for signature compatibility
    ) -> CriticReport:
        """
        Critically review a diagnosis and its action plan.

        Args:
            diagnosis: Diagnostician output.
            plan: Solver output.
            timeout_sec: Deprecated and ignored (2026-04-16 ogt). The LLM step
                is bounded by the module-level AGENT_CRITIC_TIMEOUT_SEC.

        Returns:
            A CriticReport whose latency_ms covers the whole review. When the
            LLM step times out or reports failure, or when any Exception is
            raised, the report comes from _degraded_report (degraded=True,
            vote=ABSTAIN, no challenges).
        """
        start_ms = int(time.monotonic() * 1000)

        try:
            report = await self._critique(diagnosis, plan)
        except Exception:
            elapsed_ms = int(time.monotonic() * 1000) - start_ms
            logger.exception("critic_error")
            return self._degraded_report(elapsed_ms, "error")

        # _critique leaves latency_ms at 0 (also on its degraded paths);
        # the wall-clock latency of the whole review is stamped here.
        report.latency_ms = int(time.monotonic() * 1000) - start_ms
        logger.info(
            "critic_done",
            challenges=report.challenge_count,
            has_critical=report.has_critical_challenge,
            vote=report.vote,
            latency_ms=report.latency_ms,
        )
        return report

    async def _critique(
        self,
        diagnosis: DiagnosisReport,
        plan: ActionPlan,
    ) -> CriticReport:
        """
        Run one LLM-backed critical review.

        Builds the prompt from the top hypothesis / top candidate, calls
        OpenClaw under AGENT_CRITIC_TIMEOUT_SEC, records one step-duration
        sample labelled success / timeout / error, and converts the sanitized
        response into a CriticReport.

        Returns:
            A CriticReport with latency_ms=0 (the caller fills it in). A
            degraded report is returned on timeout ("step_timeout") or when
            the call reports failure / an empty response ("llm_failed").

        Raises:
            Exception: anything other than a timeout raised by the LLM call or
                by response parsing is propagated to the caller.
        """
        top_hypothesis = diagnosis.top_hypothesis
        top_candidate = plan.top_candidate

        prompt = self._build_prompt({
            "hypothesis": top_hypothesis.description if top_hypothesis else "(無假設)",
            "action": top_candidate.action if top_candidate else "(無方案)",
            "confidence": top_hypothesis.confidence if top_hypothesis else 0.0,
        })

        # Compact summary (each part capped at 300 chars) sent as the signal description.
        hypothesis_text = top_hypothesis.description[:300] if top_hypothesis else "none"
        action_text = top_candidate.action[:300] if top_candidate else "none"
        critic_signal = f"hypothesis={hypothesis_text}; action={action_text}"

        # NOTE(review): the severity / intent_hint values are fixed for every
        # critic call; their routing semantics live in OpenClaw — confirm there.
        alert_context = {
            "incident_id": diagnosis.evidence_snapshot_id or "UNKNOWN",
            "severity": "P3",
            "signals": [{"alert_name": "critic_review", "description": critic_signal}],
            "affected_services": [],
            "intent_hint": "diagnose",
        }

        # Imported at call time rather than at module top (presumably to avoid
        # an import cycle or an import-time side effect — TODO confirm).
        from src.services.openclaw import get_openclaw

        openclaw = get_openclaw()
        step_start = time.monotonic()
        try:
            response_text, _provider, success = await asyncio.wait_for(
                openclaw.call(prompt, alert_context=alert_context),
                timeout=AGENT_CRITIC_TIMEOUT_SEC,
            )
        except asyncio.TimeoutError:
            # 2026-04-27 Claude Sonnet 4.6: A1 — timeout path metric observe
            observe_agent_step("critic", "timeout", time.monotonic() - step_start)
            logger.warning(
                "critic_step_timeout",
                snapshot_id=diagnosis.evidence_snapshot_id,
                timeout_sec=AGENT_CRITIC_TIMEOUT_SEC,
            )
            return self._degraded_report(0, "step_timeout")
        except Exception:
            # Record the failed step under the "error" outcome before letting
            # the exception propagate; without this, non-timeout failures
            # would leave no sample in the step histogram at all.
            observe_agent_step("critic", "error", time.monotonic() - step_start)
            raise
        else:
            # 2026-04-27 Claude Sonnet 4.6: A1 — success path metric observe
            # NOTE(review): "success" means the call returned before the
            # timeout; a response with success=False is still counted under
            # this label — confirm that is the intended metric semantics.
            observe_agent_step("critic", "success", time.monotonic() - step_start)

        if not success or not response_text:
            return self._degraded_report(0, "llm_failed")

        parsed = self._parse_response(sanitize(response_text, "critic_output"))
        challenges = _extract_challenges(parsed)

        # Any critical challenge turns the vote into REJECT.
        has_critical = any(c.severity == "critical" for c in challenges)
        vote = AgentVote.REJECT if has_critical else AgentVote.APPROVE

        return CriticReport(
            challenges=challenges,
            overall_assessment=str(parsed.get("overall_assessment", ""))[:1000],
            latency_ms=0,
            vote=vote,
        )

    def _build_prompt(self, context: dict[str, Any]) -> str:
        """
        Render the critic prompt.

        Args:
            context: mapping with "hypothesis", "action" and "confidence";
                missing keys fall back to "" / "" / 0.0. "confidence" is
                rendered with the percent format, so it must be numeric.

        Returns:
            The prompt text, ending with the JSON reply template.
        """
        return f"""你是 AWOOOI SRE 系統的質疑者 Agent(Critic)。

你的工作是:找出診斷和方案的弱點。不是說好話,是找漏洞。

當前診斷:{context.get("hypothesis", "")}
當前方案:{context.get("action", "")}
診斷信心:{context.get("confidence", 0.0):.0%}

必須回答以下問題(每個問題產出一個 challenge):
1. 如果這個診斷是錯的,還有哪些可能的根因?
2. 這個方案有什麼副作用或風險?
3. 是否有更好的替代方案被忽略了?

每個 challenge 標記嚴重度:
- "minor":小瑕疵,不影響執行
- "major":值得 Coordinator 考慮,但不是阻擋條件
- "critical":嚴重邏輯漏洞,必須阻止此方案執行

以 JSON 回覆:
{{
"challenges": [
{{
"target": "diagnosis",
"argument": "可能是 OOM 但也可能是 code bug,需要看 GC logs 確認",
"severity": "major"
}}
],
"overall_assessment": "診斷可信但方案風險偏高"
}}"""

    def _parse_response(self, response: str) -> dict[str, Any]:
        """Parse the LLM reply by delegating to self._extract_json."""
        return self._extract_json(response)

    def analyze(self, context: dict[str, Any]) -> Any:
        """Unsupported entry point; always raises NotImplementedError (use run())."""
        raise NotImplementedError("Use run() for Phase 2 agents")

    def _degraded_report(
        self,
        latency_ms: int,
        reason: str = "unknown",
    ) -> CriticReport:
        """
        Circuit-breaker fallback: a report with no challenges.

        Args:
            latency_ms: latency to store on the report.
            reason: short failure tag embedded in overall_assessment.

        Returns:
            CriticReport with challenges=[], vote=ABSTAIN and degraded=True.
        """
        return CriticReport(
            challenges=[],
            overall_assessment=f"[降級] Critic LLM 失敗({reason}),跳過批判性審查",
            latency_ms=latency_ms,
            vote=AgentVote.ABSTAIN,
            degraded=True,
        )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _extract_challenges(parsed: dict[str, Any]) -> list[Challenge]:
    """
    Build the list of challenges from the parsed LLM reply.

    Args:
        parsed: decoded reply; its "challenges" entry is expected to be a list
            of dicts with "target", "argument" and "severity".

    Returns:
        At most MAX_CHALLENGES Challenge objects, most severe first
        (critical, major, minor). A missing or non-list "challenges" entry
        yields an empty list; non-dict items are skipped.
    """
    severity_order = {"critical": 0, "major": 1, "minor": 2}

    raw = parsed.get("challenges")
    if not isinstance(raw, list):
        # Covers a missing key, an explicit null and any other malformed shape.
        return []

    challenges: list[Challenge] = []
    for item in raw:
        if not isinstance(item, dict):
            continue

        # Normalise case/whitespace so e.g. "Critical" is not silently
        # downgraded to "minor"; anything unrecognised (including non-string,
        # possibly unhashable values) falls back to "minor".
        raw_severity = item.get("severity")
        severity = raw_severity.strip().lower() if isinstance(raw_severity, str) else "minor"
        if severity not in severity_order:
            severity = "minor"

        challenges.append(
            Challenge(
                target=str(item.get("target", "unknown"))[:50],
                argument=str(item.get("argument", ""))[:500],
                severity=severity,
            )
        )

    # Sort before truncating so the cap drops the least severe entries.
    challenges.sort(key=lambda c: severity_order.get(c.severity, 2))
    return challenges[:MAX_CHALLENGES]
|
||
|
||
|
||
def compute_input_hash(diagnosis: DiagnosisReport, plan: ActionPlan) -> str:
    """
    Return a short, deterministic fingerprint of the Critic's input.

    The fingerprint is the first 16 hex characters of the SHA-256 digest of
    the evidence snapshot id, the top hypothesis description and the top
    candidate action, concatenated in that order.

    Args:
        diagnosis: supplies evidence_snapshot_id and top_hypothesis.
        plan: supplies top_candidate.

    Returns:
        A 16-character hexadecimal string. A missing snapshot id, hypothesis
        or candidate contributes an empty string.
    """
    hypothesis = diagnosis.top_hypothesis
    candidate = plan.top_candidate
    parts = (
        # `or ""` keeps a None snapshot id from raising TypeError; the hash
        # for every non-empty id is unchanged.
        diagnosis.evidence_snapshot_id or "",
        hypothesis.description if hypothesis else "",
        candidate.action if candidate else "",
    )
    return hashlib.sha256("".join(parts).encode()).hexdigest()[:16]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Lazily created module-wide instance; None until get_critic_agent() is first called.
_agent: CriticAgent | None = None


def get_critic_agent() -> CriticAgent:
    """Return the shared CriticAgent, constructing it on the first call."""
    global _agent
    if _agent is not None:
        return _agent
    _agent = CriticAgent()
    return _agent
|