59 lines
2.4 KiB
Python
59 lines
2.4 KiB
Python
"""
AWOOOI AIOps — Agent Step Latency Metrics
==========================================
# 2026-04-27 Claude Sonnet 4.6: A1 — three-stage timeout split + step metric
#   (North Star §1.2 Observable by Default)
#
# Background: for two alerts, INC-20260425-8D17BB / 3B6C39, AI confidence dropped to 20%.
# Cause: OpenClaw NIM (192.168.0.188:8088) was measured at 2-27s,
#   but Diagnostician/Solver/Critic shared PHASE2_STEP_TIMEOUT_SEC=20.0,
#   so tail latency inevitably hit the timeout → degraded confidence=0.2
#
# This module provides:
#   1. aiops_agent_step_duration_seconds Histogram — records the duration of each agent call
#   2. observe_agent_step() helper — a single call site shared by all three agents
#
# outcome label ∈ {success, timeout, error}
# agent label ∈ {diagnostician, solver, critic}
#
# Usage (inside an agent's try/except block):
#   from src.observability.agent_step_metrics import observe_agent_step
#   observe_agent_step("diagnostician", "timeout", elapsed_sec)
#
# ADR-082: Phase 2 multi-agent collaboration
# Author: Claude Sonnet 4.6 (fullstack-engineer, A1)
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from prometheus_client import Histogram
|
||
|
||
# Histogram bucket upper bounds, in seconds. They follow the measured NIM
# latency distribution (2-27s) and include the three per-stage timeout
# boundaries (30 / 20 / 15s).
_AGENT_STEP_BUCKETS = [
    # Low end (0.5-5s): fast path (Ollama provider pool)
    0.5,
    1.0,
    2.0,
    5.0,
    # Mid range (5-20s): NIM + Gemini fallback
    10.0,
    15.0,
    20.0,
    # High end (20-60s): timeouts / slow providers
    30.0,
    45.0,
    60.0,
]

# One time series per (agent, outcome) pair:
#   agent   — diagnostician / solver / critic
#   outcome — success / timeout / error
AGENT_STEP_DURATION = Histogram(
    "aiops_agent_step_duration_seconds",
    "Duration of each Phase 2 agent LLM step in seconds",
    labelnames=["agent", "outcome"],
    buckets=_AGENT_STEP_BUCKETS,
)
|
||
|
||
|
||
def observe_agent_step(agent: str, outcome: str, duration_sec: float) -> None:
    """Record the duration and outcome of one Phase 2 agent LLM step.

    2026-04-27 (A1): single shared call site. The try/except blocks of all
    three agents (diagnostician / solver / critic) are expected to call this
    function so that every success, timeout, or error leaves an observable
    metric ("Observable by Default", North Star §1.2).

    The label values are forwarded to the histogram as given; this function
    does not validate them.

    Args:
        agent: Agent name; expected to be "diagnostician", "solver" or
            "critic".
        outcome: Step result; expected to be "success", "timeout" or "error".
        duration_sec: Elapsed time of this LLM call in seconds, intended to
            be computed as a difference of time.monotonic() readings.
    """
    # Resolve the labelled child series first, then record the sample on it.
    step_series = AGENT_STEP_DURATION.labels(agent=agent, outcome=outcome)
    step_series.observe(duration_sec)
|