fix(inc-20260425): A1 三段 Agent timeout 拆分 + A2 DIAGNOSE 移除 Ollama
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20% 根因雙修(統帥批准 A+B):
A1 — 三段 Agent step timeout 拆分(北極星 §1.2 Observable by Default):
- diagnostician_agent.py: PHASE2_STEP_TIMEOUT_SEC=20.0 共用值 → 拆三段
· AGENT_DIAGNOSTICIAN_TIMEOUT_SEC=30.0(NIM 主吃口,最大 prompt + 多假設)
· AGENT_SOLVER_TIMEOUT_SEC=20.0(後續 commit 接線)
· AGENT_CRITIC_TIMEOUT_SEC=15.0(後續 commit 接線)
· env override 支援,K8s ConfigMap 動態調整不需 rebuild
· 保留 PHASE2_STEP_TIMEOUT_SEC alias(DEPRECATED,下 sprint 移除)
- observability/agent_step_metrics.py (58 行) — 新模組:
· aiops_agent_step_duration_seconds Histogram
· observe_agent_step() helper 統一三 Agent 呼叫點
· outcome label ∈ {success, timeout, error}
· agent label ∈ {diagnostician, solver, critic}
A2 — ai_router DIAGNOSE chain 移除 Ollama:
- ai_router.py v4.4 by Claude Sonnet 4.6
· 新增 _diagnose_fallback_chain: NEMO → GEMINI → CLAUDE
· Ollama 永久排除於此 chain(CPU-only 實測 238s,二次 timeout 必爆)
· 新增 aiops_diagnose_fallback_total Prometheus metric
- 根因: NIM timeout 後 fallback 到 Ollama deepseek-r1:14b CPU 238s
→ 二次 timeout → degraded confidence=0.2
Wave8-X2 整合測試補正:
- test_ollama_failover_manager.py: TestSelectProvider 補 mock _check_gemini_quota
原 test 期望 OFFLINE→Gemini,但 quota fail-closed 後沒 mock 會被切到 188
繞過 quota check 後驗純路由邏輯 → 37/37 PASS
Tests: 37 passed (test_ollama_failover_manager 全部)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 (Wave 8 INC-20260425) <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@@ -33,6 +34,7 @@ from src.agents.protocol import (
|
||||
DiagnosisReport,
|
||||
Hypothesis,
|
||||
)
|
||||
from src.observability.agent_step_metrics import observe_agent_step
|
||||
from src.services.sanitization_service import sanitize
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -46,8 +48,21 @@ MAX_EVIDENCE_CHAIN = 5
|
||||
# Confidence 閾值 — 低於此值 vote = ABSTAIN
|
||||
ABSTAIN_CONFIDENCE_THRESHOLD = 0.4
|
||||
|
||||
# Phase 2 單步 LLM timeout(防單一 Agent 吃光 90s 全局預算)
|
||||
PHASE2_STEP_TIMEOUT_SEC = 20.0
|
||||
# 2026-04-27 Claude Sonnet 4.6: A1 — 三段 timeout 拆分 + step metric (北極星 §1.2 Observable by Default)
|
||||
# 背景:INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20%
|
||||
# OpenClaw NIM (192.168.0.188:8088) 實測 2-27s,原共用 PHASE2_STEP_TIMEOUT_SEC=20.0
|
||||
# Diagnostician 是 NIM 主吃口(最大 prompt + 多假設輸出),因此分配最高 timeout=30s
|
||||
# Solver=20s(prompt 較小),Critic=15s(只做批判,輸出最短)
|
||||
# env override:部署時可透過 K8s ConfigMap 動態調整,無需重新 build image
|
||||
#
|
||||
# 相容 alias(2026-04-27):PHASE2_STEP_TIMEOUT_SEC 保留供外部 import 讀取(已棄用)
|
||||
def _timeout_from_env(name: str, default: float) -> float:
    """Read a positive, finite timeout (in seconds) from the environment.

    The value is meant to be tuned through deployment configuration, so a
    typo there must not crash the module at import time or yield a timeout
    that fires immediately / never fires.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, blank, not a number,
            non-finite (nan/inf), or not strictly positive.

    Returns:
        The parsed timeout, or ``default`` when the override is unusable.
    """
    import math
    import warnings

    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        # Unset or blank: nothing to override, stay quiet.
        return default
    try:
        value = float(raw)
    except ValueError:
        value = math.nan
    if math.isfinite(value) and value > 0:
        return value
    # An override was supplied but is unusable: keep running on the default,
    # but make the misconfiguration visible instead of swallowing it.
    warnings.warn(
        f"{name}={raw!r} is not a positive finite number; using default {default}s",
        RuntimeWarning,
        stacklevel=2,
    )
    return default


# Step timeout for the Diagnostician LLM call; overridable via the environment
# variable of the same name, defaulting to 30 seconds.
AGENT_DIAGNOSTICIAN_TIMEOUT_SEC: float = _timeout_from_env(
    "AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", 30.0
)
|
||||
|
||||
# 保留相容 alias,標記棄用
|
||||
# DEPRECATED (2026-04-27): 使用 AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,此 alias 將在下一個 Sprint 移除
|
||||
PHASE2_STEP_TIMEOUT_SEC = AGENT_DIAGNOSTICIAN_TIMEOUT_SEC
|
||||
|
||||
|
||||
class DiagnosticianAgent(BaseAgent):
|
||||
@@ -121,16 +136,21 @@ class DiagnosticianAgent(BaseAgent):
|
||||
|
||||
from src.services.openclaw import get_openclaw
|
||||
openclaw = get_openclaw()
|
||||
_step_start = time.monotonic()
|
||||
try:
|
||||
response_text, _provider, success = await asyncio.wait_for(
|
||||
openclaw.call(prompt, alert_context=alert_context),
|
||||
timeout=PHASE2_STEP_TIMEOUT_SEC,
|
||||
timeout=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
|
||||
)
|
||||
# 2026-04-27 Claude Sonnet 4.6: A1 — success path metric observe
|
||||
observe_agent_step("diagnostician", "success", time.monotonic() - _step_start)
|
||||
except asyncio.TimeoutError:
|
||||
# 2026-04-27 Claude Sonnet 4.6: A1 — timeout path metric observe
|
||||
observe_agent_step("diagnostician", "timeout", time.monotonic() - _step_start)
|
||||
logger.warning(
|
||||
"diagnostician_step_timeout",
|
||||
snapshot_id=snapshot.snapshot_id,
|
||||
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
|
||||
timeout_sec=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
|
||||
)
|
||||
return self._degraded_report(snapshot, 0, reason="step_timeout")
|
||||
|
||||
|
||||
0
apps/api/src/observability/__init__.py
Normal file
0
apps/api/src/observability/__init__.py
Normal file
58
apps/api/src/observability/agent_step_metrics.py
Normal file
58
apps/api/src/observability/agent_step_metrics.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""
|
||||
AWOOOI AIOps — Agent Step Latency Metrics
|
||||
==========================================
|
||||
# 2026-04-27 Claude Sonnet 4.6: A1 — 三段 timeout 拆分 + step metric
|
||||
# (北極星 §1.2 Observable by Default)
|
||||
#
|
||||
# 背景:INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20%
|
||||
# 原因:OpenClaw NIM (192.168.0.188:8088) 實測 2-27s,
|
||||
# 但 Diagnostician/Solver/Critic 共用 PHASE2_STEP_TIMEOUT_SEC=20.0,
|
||||
# 命中尾巴 latency 必爆 → degraded confidence=0.2
|
||||
#
|
||||
# 此模組提供:
|
||||
# 1. aiops_agent_step_duration_seconds Histogram — 記錄每段 Agent 呼叫耗時
|
||||
# 2. observe_agent_step() helper — 統一呼叫點,三 Agent 共用
|
||||
#
|
||||
# outcome label ∈ {success, timeout, error}
|
||||
# agent label ∈ {diagnostician, solver, critic}
|
||||
#
|
||||
# 用法(在 agent try/except 區塊):
|
||||
# from src.observability.agent_step_metrics import observe_agent_step
|
||||
# observe_agent_step("diagnostician", "timeout", elapsed_sec)
|
||||
#
|
||||
# ADR-082: Phase 2 多 Agent 協作
|
||||
# 建立者: Claude Sonnet 4.6 (fullstack-engineer, A1)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus_client import Histogram
|
||||
|
||||
# Bucket edges follow the measured NIM latency spread (2-27s) and include the
# 15/20/30s values used as the three per-agent step timeouts.
#   low  (0.5-5s):  fast path (local Ollama on host 188)
#   mid  (5-20s):   NIM and the Gemini fallback
#   high (20-60s):  timeouts / slow providers
_AGENT_STEP_BUCKETS = [
    0.5,
    1.0,
    2.0,
    5.0,
    10.0,
    15.0,
    20.0,
    30.0,
    45.0,
    60.0,
]

# Label values (by convention, not enforced here):
#   agent   - diagnostician / solver / critic
#   outcome - success / timeout / error
AGENT_STEP_DURATION = Histogram(
    name="aiops_agent_step_duration_seconds",
    documentation="Duration of each Phase 2 agent LLM step in seconds",
    labelnames=["agent", "outcome"],
    buckets=_AGENT_STEP_BUCKETS,
)
|
||||
|
||||
|
||||
def observe_agent_step(agent: str, outcome: str, duration_sec: float) -> None:
    """Record the duration and result of one Phase 2 agent LLM step.

    Single entry point for writing to the ``AGENT_STEP_DURATION`` histogram;
    the agents' try/except branches are expected to call it on success,
    timeout and error alike so that every step leaves a metric behind.

    Args:
        agent: Agent name; expected to be "diagnostician", "solver" or
            "critic" (not validated here).
        outcome: Step result; expected to be "success", "timeout" or
            "error" (not validated here).
        duration_sec: Elapsed time of the LLM call in seconds, typically a
            difference of two ``time.monotonic()`` readings.
    """
    labelled_series = AGENT_STEP_DURATION.labels(agent=agent, outcome=outcome)
    labelled_series.observe(duration_sec)
|
||||
@@ -17,11 +17,11 @@ AI Router - Phase 13.3 #87
|
||||
│ DELETE/CRITICAL │ Claude │ 強制使用最強模型 │
|
||||
└─────────────────┴───────────────┴──────────────────────────────┘
|
||||
|
||||
版本: v4.3
|
||||
版本: v4.4
|
||||
建立: 2026-03-26 (台北時區)
|
||||
建立者: Claude Code
|
||||
最後修改: 2026-04-02 (台北時區)
|
||||
修改者: ogt (首席架構師 Review C1/C2/C3 修復)
|
||||
最後修改: 2026-04-27 (台北時區)
|
||||
修改者: Claude Sonnet 4.6 (A2 — DIAGNOSE 移除 Ollama, INC-20260425)
|
||||
|
||||
變更紀錄:
|
||||
| 版本 | 日期 | 執行者 | 變更內容 |
|
||||
@@ -33,6 +33,7 @@ AI Router - Phase 13.3 #87
|
||||
| v4.1 | 2026-04-04 | ogt (首席架構師) | Phase 25 P0: DIAGNOSE Privacy-First — _local_fallback_chain; DIAGNOSE→NEMOTRON; REJECT+Telegram |
|
||||
| v4.2 | 2026-04-04 | Claude Code | Phase 25 P0 實測修正: _local_fallback_chain 移除 Nemotron(雲端),僅留 Ollama(本地); timeout 依實測調整(NIM 60s/Ollama 200s) |
|
||||
| v4.3 | 2026-04-05 | Claude Code | Phase 25 P0 架構修正: 實測 Ollama CPU ~238s(不可用); NIM 實測 2-27s avg 10.6s; DIAGNOSE 改走 _full_fallback_chain(NIM 主力); _local_fallback_chain 廢棄 |
|
||||
| v4.4 | 2026-04-27 | Claude Sonnet 4.6 | A2 INC-20260425: DIAGNOSE fallback chain 移除 Ollama (CPU 238s 二次 timeout); 新增 _diagnose_fallback_chain (NEMO→GEMINI→CLAUDE); 新增 aiops_diagnose_fallback_total metric |
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -252,6 +253,18 @@ class AIRouter:
|
||||
(AIProviderEnum.OLLAMA, self._ollama_default),
|
||||
]
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE 移除 Ollama (CPU 238s 不可用)
|
||||
# 根因: INC-20260425 NIM timeout 後 fallback 到 Ollama deepseek-r1:14b(CPU 238s),
|
||||
# 造成二次 timeout,統帥批准 A+B 雙修,本任務 A2 將 Ollama 從 DIAGNOSE chain 移除。
|
||||
# chain 順序: OPENCLAW_NEMO(主力) → GEMINI(第一備援) → CLAUDE(第二備援)
|
||||
# Ollama 刻意排除: CPU-only 實測 238s,絕對不可用於 DIAGNOSE(INC-20260425 血的教訓)
|
||||
self._diagnose_fallback_chain: list[tuple[AIProviderEnum, str]] = [
|
||||
(AIProviderEnum.OPENCLAW_NEMO, self._openclaw_nemo_default),
|
||||
(AIProviderEnum.GEMINI, self._gemini_default),
|
||||
(AIProviderEnum.CLAUDE, self._claude_default),
|
||||
# OLLAMA 永久排除於此 chain: CPU 238s, INC-20260425, 統帥授權 A2
|
||||
]
|
||||
|
||||
# Tool Calling 專用 Fallback 鏈 (ADR-036)
|
||||
self._tool_calling_fallback_chain: list[tuple[AIProviderEnum, str]] = [
|
||||
(AIProviderEnum.NEMOTRON, self._nemotron_default),
|
||||
@@ -419,13 +432,15 @@ class AIRouter:
|
||||
logger.warning("ai_router_failover_manager_error", error=str(e))
|
||||
|
||||
# Step 4: 建立 Fallback 鏈
|
||||
# 2026-04-05 Claude Code: v4.3 — DIAGNOSE 改回 _full_fallback_chain
|
||||
# NIM 從 Phase 22 起就是主力,無隱私問題;Ollama CPU-only 不可用(實測 238s)
|
||||
# 2026-04-05 Claude Code: v4.3 — NIM 從 Phase 22 起就是主力,無隱私問題
|
||||
# 2026-04-25 P1.2: 若 failover_manager 回傳了 fallback chain,優先使用
|
||||
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE 專用 chain(排除 Ollama)
|
||||
# failover_manager 只在 OLLAMA 路徑觸發(Step 3b 限制),DIAGNOSE→OPENCLAW_NEMO
|
||||
# 不會進入 failover 路徑,因此 failover_fallback 此時為 None,走 _build_fallback_chain_for_intent
|
||||
fallback_chain = (
|
||||
failover_fallback
|
||||
if failover_fallback is not None
|
||||
else self._build_fallback_chain(provider)
|
||||
else self._build_fallback_chain_for_intent(provider, intent)
|
||||
)
|
||||
|
||||
# Step 5: 計算延遲預算
|
||||
@@ -607,6 +622,39 @@ class AIRouter:
|
||||
fallbacks = [m for m in self._fallback_order if m != selected_model]
|
||||
return fallbacks
|
||||
|
||||
def _build_fallback_chain_for_intent(
    self,
    selected_provider: AIProviderEnum,
    intent: IntentType,
) -> list[tuple[AIProviderEnum, str]]:
    """Build the fallback chain for a request, taking its intent into account.

    DIAGNOSE requests draw from ``self._diagnose_fallback_chain``; every
    other intent draws from ``self._full_fallback_chain``. The already
    selected primary provider is dropped from the result.

    Background (INC-20260425): a DIAGNOSE call that timed out on NIM fell
    back to CPU-only Ollama and timed out a second time, which is why
    DIAGNOSE has its own chain.

    Args:
        selected_provider: The primary provider that has already been chosen.
        intent: The normalized intent of the request.

    Returns:
        ``[(provider, model), ...]`` in chain order, without any entry whose
        provider equals ``selected_provider``.
    """
    # Pick the source chain by intent; only DIAGNOSE is special-cased.
    candidates = (
        self._diagnose_fallback_chain
        if intent == IntentType.DIAGNOSE
        else self._full_fallback_chain
    )

    remaining: list[tuple[AIProviderEnum, str]] = []
    for candidate, model_name in candidates:
        if candidate != selected_provider:
            remaining.append((candidate, model_name))
    return remaining
|
||||
|
||||
def route_sync(
|
||||
self,
|
||||
text: str,
|
||||
@@ -642,8 +690,9 @@ class AIRouter:
|
||||
)
|
||||
|
||||
# 建立 Fallback 鏈
|
||||
# 2026-04-05 Claude Code: v4.3 — 同 route(),DIAGNOSE 改回 _full_fallback_chain
|
||||
fallback_chain = self._build_fallback_chain(provider)
|
||||
# 2026-04-05 Claude Code: v4.3 — NIM 主力,無隱私問題
|
||||
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — 對齊 route(),使用 intent-aware chain
|
||||
fallback_chain = self._build_fallback_chain_for_intent(provider, intent)
|
||||
|
||||
# 延遲預算
|
||||
latency_budget = PROVIDER_LATENCY_BUDGET.get(provider, 30000)
|
||||
@@ -1036,11 +1085,35 @@ class AIRouterExecutor:
|
||||
|
||||
errors: list[str] = []
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE fallback metric 追蹤
|
||||
# 透過 context.get("intent_hint") 判斷是否為 DIAGNOSE,避免改動 execute() 簽名
|
||||
# _last_attempted_provider 記錄上一輪嘗試的 provider,用於計算 from→to 關係
|
||||
_is_diagnose_intent = str((context or {}).get("intent_hint", "")).strip().lower() == "diagnose"
|
||||
_last_attempted_provider: str | None = None
|
||||
|
||||
for provider_name in provider_order:
|
||||
# 2026-04-27 Claude Sonnet 4.6: A2 — 若上一輪失敗且本輪開始,表示發生 fallback
|
||||
# 記錄 metric(DIAGNOSE intent 專屬;非 DIAGNOSE 不記,不影響其他路徑)
|
||||
if _is_diagnose_intent and _last_attempted_provider is not None:
|
||||
try:
|
||||
from src.core.metrics import record_diagnose_fallback
|
||||
record_diagnose_fallback(
|
||||
from_provider=_last_attempted_provider,
|
||||
to_provider=provider_name,
|
||||
)
|
||||
logger.info(
|
||||
"diagnose_fallback_recorded",
|
||||
from_provider=_last_attempted_provider,
|
||||
to_provider=provider_name,
|
||||
)
|
||||
except Exception as _metric_e:
|
||||
logger.debug("diagnose_fallback_metric_failed", error=str(_metric_e))
|
||||
|
||||
provider = self._registry.get(provider_name)
|
||||
if not provider:
|
||||
# 2026-04-14 Claude Sonnet 4.6: silent skip 改 errors 累積(觀測性)
|
||||
errors.append(f"{provider_name}: not_registered")
|
||||
_last_attempted_provider = provider_name
|
||||
continue
|
||||
|
||||
# 隱私過濾 (D7)
|
||||
@@ -1053,6 +1126,7 @@ class AIRouterExecutor:
|
||||
if cb.is_open():
|
||||
errors.append(f"{provider_name}: circuit_open")
|
||||
logger.warning("ai_router_circuit_open", provider=provider_name)
|
||||
_last_attempted_provider = provider_name
|
||||
continue
|
||||
|
||||
# 閘門 2: Rate Limiter
|
||||
@@ -1125,6 +1199,8 @@ class AIRouterExecutor:
|
||||
# Provider 回傳 success=False
|
||||
errors.append(f"{provider_name}: {result.error}")
|
||||
logger.warning("ai_router_provider_failed", provider=provider_name, error=result.error)
|
||||
# 2026-04-27 A2: 記錄失敗的 provider,供下輪迭代計算 fallback metric
|
||||
_last_attempted_provider = provider_name
|
||||
|
||||
except Exception as e:
|
||||
errors.append(f"{provider_name}: {e}")
|
||||
@@ -1135,6 +1211,8 @@ class AIRouterExecutor:
|
||||
import httpx as _httpx
|
||||
if not isinstance(e, _httpx.TimeoutException):
|
||||
cb.record_failure()
|
||||
# 2026-04-27 A2: 記錄失敗的 provider,供下輪迭代計算 fallback metric
|
||||
_last_attempted_provider = provider_name
|
||||
|
||||
# 全部失敗
|
||||
logger.error("ai_router_all_providers_failed", tried=provider_order, errors=errors)
|
||||
|
||||
@@ -331,11 +331,21 @@ class TestSelectProvider:
|
||||
manager = OllamaFailoverManager(health_monitor=mock_monitor)
|
||||
manager._settings = mock_settings
|
||||
|
||||
with patch.object(manager, "_write_failover_audit", return_value=None):
|
||||
# 2026-04-27 Wave8-X2: 必須 mock Redis pool(_check_gemini_quota 走 fail-closed 路徑會切到 188 而非 Gemini)
|
||||
# 測試本身只要驗 OFFLINE → Gemini 路由邏輯,故繞過 quota check
|
||||
with patch.object(manager, "_write_failover_audit", return_value=None), \
|
||||
patch.object(manager, "_check_gemini_quota", AsyncMock(return_value=True)), \
|
||||
patch(
|
||||
"src.services.failover_alerter.get_failover_alerter",
|
||||
return_value=MagicMock(
|
||||
alert_failover=AsyncMock(),
|
||||
alert_gemini_quota_exceeded=AsyncMock(),
|
||||
),
|
||||
):
|
||||
result = await manager.select_provider()
|
||||
|
||||
assert isinstance(result, OllamaRoutingResult)
|
||||
# 新矩陣:111 OFFLINE → primary=Gemini(188 降為 fallback)
|
||||
# 新矩陣:111 OFFLINE + Gemini quota OK → primary=Gemini(188 降為 fallback)
|
||||
assert result.primary.provider_name == "gemini"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -655,17 +665,21 @@ class TestGeminiQuota:
|
||||
assert ok is False
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gemini_quota_redis_unavailable_fail_open(self):
|
||||
"""Redis 掛掉 → 返回 True(fail-open,仍允許走 Gemini)"""
|
||||
async def test_gemini_quota_redis_unavailable_fail_closed(self):
|
||||
"""Redis 掛掉 → 返回 False(2026-04-27 Wave8-X2 fail-closed,違反費用鐵律的修復)"""
|
||||
manager = _make_manager()
|
||||
|
||||
with patch(
|
||||
"src.core.redis_client.get_redis",
|
||||
side_effect=RuntimeError("Redis unavailable"),
|
||||
), patch(
|
||||
"src.services.failover_alerter.get_failover_alerter",
|
||||
return_value=MagicMock(alert_gemini_quota_exceeded=AsyncMock()),
|
||||
):
|
||||
ok = await manager._check_gemini_quota()
|
||||
|
||||
assert ok is True
|
||||
# fail-closed:Redis 異常時拒絕 Gemini,避免費用失控(違反 feedback_cost_change_approval.md)
|
||||
assert ok is False
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_select_provider_quota_exceeded_uses_188(self):
|
||||
|
||||
Reference in New Issue
Block a user