""" AWOOOI AIOps — Agent Step Latency Metrics ========================================== # 2026-04-27 Claude Sonnet 4.6: A1 — 三段 timeout 拆分 + step metric # (北極星 §1.2 Observable by Default) # # 背景:INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20% # 原因:OpenClaw NIM (192.168.0.188:8088) 實測 2-27s, # 但 Diagnostician/Solver/Critic 共用 PHASE2_STEP_TIMEOUT_SEC=20.0, # 命中尾巴 latency 必爆 → degraded confidence=0.2 # # 此模組提供: # 1. aiops_agent_step_duration_seconds Histogram — 記錄每段 Agent 呼叫耗時 # 2. observe_agent_step() helper — 統一呼叫點,三 Agent 共用 # # outcome label ∈ {success, timeout, error} # agent label ∈ {diagnostician, solver, critic} # # 用法(在 agent try/except 區塊): # from src.observability.agent_step_metrics import observe_agent_step # observe_agent_step("diagnostician", "timeout", elapsed_sec) # # ADR-082: Phase 2 多 Agent 協作 # 建立者: Claude Sonnet 4.6 (fullstack-engineer, A1) """ from __future__ import annotations from prometheus_client import Histogram # Buckets 對齊 NIM 實測分佈(2-27s),並覆蓋三段 timeout 30/20/15s 邊界 # 低端(0.5-5s):快速路徑(Ollama provider pool) # 中端(5-20s):NIM + Gemini fallback # 高端(20-60s):超時 / 慢速 Provider _AGENT_STEP_BUCKETS = [0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0] AGENT_STEP_DURATION = Histogram( "aiops_agent_step_duration_seconds", "Duration of each Phase 2 agent LLM step in seconds", ["agent", "outcome"], # agent: diagnostician/solver/critic; outcome: success/timeout/error buckets=_AGENT_STEP_BUCKETS, ) def observe_agent_step(agent: str, outcome: str, duration_sec: float) -> None: """ 記錄一次 Phase 2 Agent LLM 步驟的耗時與結果。 # 2026-04-27 Claude Sonnet 4.6: A1 統一呼叫點 # 三個 agent(diagnostician/solver/critic)的 try/except 區塊都必須呼叫此函式, # 確保 Observable by Default(北極星 §1.2):任何成功/超時/錯誤都留下可觀測指標。 Args: agent: Agent 名稱,必須是 "diagnostician" / "solver" / "critic" outcome: 結果,必須是 "success" / "timeout" / "error" duration_sec: 本次 LLM 呼叫耗時(秒),使用 time.monotonic() 差值 """ AGENT_STEP_DURATION.labels(agent=agent, outcome=outcome).observe(duration_sec)