fix(inc-20260425): A1 三段 Agent timeout 拆分 + A2 DIAGNOSE 移除 Ollama
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20% 根因雙修(統帥批准 A+B):

A1 — 三段 Agent step timeout 拆分(北極星 §1.2 Observable by Default):
- diagnostician_agent.py: PHASE2_STEP_TIMEOUT_SEC=20.0 共用值 → 拆三段
  · AGENT_DIAGNOSTICIAN_TIMEOUT_SEC=30.0(NIM 主吃口,最大 prompt + 多假設)
  · AGENT_SOLVER_TIMEOUT_SEC=20.0(後續 commit 接線)
  · AGENT_CRITIC_TIMEOUT_SEC=15.0(後續 commit 接線)
  · env override 支援,K8s ConfigMap 動態調整不需 rebuild
  · 保留 PHASE2_STEP_TIMEOUT_SEC alias(DEPRECATED,下 sprint 移除)
- observability/agent_step_metrics.py (58 行) — 新模組:
  · aiops_agent_step_duration_seconds Histogram
  · observe_agent_step() helper 統一三 Agent 呼叫點
  · outcome label ∈ {success, timeout, error}
  · agent label ∈ {diagnostician, solver, critic}

A2 — ai_router DIAGNOSE chain 移除 Ollama:
- ai_router.py v4.4 by Claude Sonnet 4.6
  · 新增 _diagnose_fallback_chain: NEMO → GEMINI → CLAUDE
  · Ollama 永久排除於此 chain(CPU-only 實測 238s,二次 timeout 必爆)
  · 新增 aiops_diagnose_fallback_total Prometheus metric
- 根因: NIM timeout 後 fallback 到 Ollama deepseek-r1:14b CPU 238s
  → 二次 timeout → degraded confidence=0.2

Wave8-X2 整合測試補正:
- test_ollama_failover_manager.py: TestSelectProvider 補 mock _check_gemini_quota
  原 test 期望 OFFLINE→Gemini,但 quota fail-closed 後沒 mock 會被切到 188
  繞過 quota check 後驗純路由邏輯 → 37/37 PASS

Tests: 37 passed (test_ollama_failover_manager 全部)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 (Wave 8 INC-20260425) <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-04-27 08:15:10 +08:00
parent 1ab6786ce3
commit 595629c013
5 changed files with 187 additions and 17 deletions

View File

@@ -21,6 +21,7 @@ from __future__ import annotations
import asyncio
import hashlib
import json
import os
import time
from typing import TYPE_CHECKING, Any
@@ -33,6 +34,7 @@ from src.agents.protocol import (
DiagnosisReport,
Hypothesis,
)
from src.observability.agent_step_metrics import observe_agent_step
from src.services.sanitization_service import sanitize
if TYPE_CHECKING:
@@ -46,8 +48,21 @@ MAX_EVIDENCE_CHAIN = 5
# Confidence 閾值 — 低於此值 vote = ABSTAIN
ABSTAIN_CONFIDENCE_THRESHOLD = 0.4
# Phase 2 單步 LLM timeout(防單一 Agent 吃光 90s 全局預算)
PHASE2_STEP_TIMEOUT_SEC = 20.0
# 2026-04-27 Claude Sonnet 4.6: A1 - split the shared step timeout per agent and add a
# step metric (North Star §1.2 Observable by Default).
# Background: INC-20260425-8D17BB / 3B6C39 - AI confidence on two alerts dropped to 20%.
# OpenClaw NIM (192.168.0.188:8088) was measured at 2-27s while the agents shared
# PHASE2_STEP_TIMEOUT_SEC=20.0.
# The Diagnostician is the heaviest NIM consumer (largest prompt + multi-hypothesis
# output), so it gets the largest budget: 30s. Solver=20s (smaller prompt),
# Critic=15s (critique only, shortest output).
# Env override: the value can be tuned through a K8s ConfigMap at deploy time without
# rebuilding the image.


def _timeout_from_env(var_name: str, default: float) -> float:
    """Read a step timeout in seconds from the environment.

    The override is evaluated at import time, so a bad value must never crash the
    module: an unset variable, a non-numeric string (including ""), or a value that
    is not a finite positive number all yield ``default``.

    Args:
        var_name: Name of the environment variable to read.
        default: Timeout in seconds used when the override is absent or unusable.

    Returns:
        The parsed override when it is a finite number greater than zero,
        otherwise ``default``.
    """
    raw = os.environ.get(var_name)
    if raw is None:
        return default
    try:
        value = float(raw)
    except ValueError:
        value = float("nan")
    # NaN fails every comparison, so this single check rejects NaN, +/-inf,
    # zero and negatives (which would make asyncio.wait_for time out at once).
    if 0.0 < value < float("inf"):
        return value
    import warnings  # local import: only needed on the misconfiguration path

    warnings.warn(
        f"{var_name}={raw!r} is not a finite positive number; using default {default}",
        RuntimeWarning,
        stacklevel=2,
    )
    return default


AGENT_DIAGNOSTICIAN_TIMEOUT_SEC: float = _timeout_from_env(
    "AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", 30.0
)
# Backward-compatible alias kept for external importers.
# DEPRECATED (2026-04-27): use AGENT_DIAGNOSTICIAN_TIMEOUT_SEC; this alias is scheduled
# for removal in the next sprint.
PHASE2_STEP_TIMEOUT_SEC = AGENT_DIAGNOSTICIAN_TIMEOUT_SEC
class DiagnosticianAgent(BaseAgent):
@@ -121,16 +136,21 @@ class DiagnosticianAgent(BaseAgent):
from src.services.openclaw import get_openclaw
openclaw = get_openclaw()
_step_start = time.monotonic()
try:
response_text, _provider, success = await asyncio.wait_for(
openclaw.call(prompt, alert_context=alert_context),
timeout=PHASE2_STEP_TIMEOUT_SEC,
timeout=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
)
# 2026-04-27 Claude Sonnet 4.6: A1 — success path metric observe
observe_agent_step("diagnostician", "success", time.monotonic() - _step_start)
except asyncio.TimeoutError:
# 2026-04-27 Claude Sonnet 4.6: A1 — timeout path metric observe
observe_agent_step("diagnostician", "timeout", time.monotonic() - _step_start)
logger.warning(
"diagnostician_step_timeout",
snapshot_id=snapshot.snapshot_id,
timeout_sec=PHASE2_STEP_TIMEOUT_SEC,
timeout_sec=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,
)
return self._degraded_report(snapshot, 0, reason="step_timeout")

View File

View File

@@ -0,0 +1,58 @@
"""
AWOOOI AIOps — Agent Step Latency Metrics
==========================================
# 2026-04-27 Claude Sonnet 4.6: A1 — 三段 timeout 拆分 + step metric
# (北極星 §1.2 Observable by Default)
#
# 背景:INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20%
# 原因:OpenClaw NIM (192.168.0.188:8088) 實測 2-27s,
# 但 Diagnostician/Solver/Critic 共用 PHASE2_STEP_TIMEOUT_SEC=20.0
# 命中尾巴 latency 必爆 → degraded confidence=0.2
#
# 此模組提供:
# 1. aiops_agent_step_duration_seconds Histogram — 記錄每段 Agent 呼叫耗時
# 2. observe_agent_step() helper — 統一呼叫點,三 Agent 共用
#
# outcome label ∈ {success, timeout, error}
# agent label ∈ {diagnostician, solver, critic}
#
# 用法(在 agent try/except 區塊):
# from src.observability.agent_step_metrics import observe_agent_step
# observe_agent_step("diagnostician", "timeout", elapsed_sec)
#
# ADR-082: Phase 2 多 Agent 協作
# 建立者: Claude Sonnet 4.6 (fullstack-engineer, A1)
"""
from __future__ import annotations
from prometheus_client import Histogram
# Buckets follow the measured NIM latency distribution (2-27s) and cover the three
# per-agent step-timeout boundaries (30/20/15s).
# Low end (0.5-5s): fast path (local Ollama 188)
# Mid range (5-20s): NIM + Gemini fallback
# High end (20-60s): timeouts / slow providers
_AGENT_STEP_BUCKETS = [0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0]
# Histogram of Phase 2 agent LLM step durations, labelled by agent and outcome.
AGENT_STEP_DURATION = Histogram(
"aiops_agent_step_duration_seconds",
"Duration of each Phase 2 agent LLM step in seconds",
["agent", "outcome"], # agent: diagnostician/solver/critic; outcome: success/timeout/error
buckets=_AGENT_STEP_BUCKETS,
)
def observe_agent_step(agent: str, outcome: str, duration_sec: float) -> None:
    """Record the duration and outcome of one Phase 2 agent LLM step.

    2026-04-27 Claude Sonnet 4.6: A1 - single shared call point. The try/except
    block of each agent (diagnostician/solver/critic) is expected to call this so
    that every success, timeout and error leaves an observable sample
    (North Star §1.2 Observable by Default).

    Args:
        agent: Agent name; expected to be "diagnostician", "solver" or "critic".
            The value is used as a label as-is and is not validated here.
        outcome: Step result; expected to be "success", "timeout" or "error".
            Also used as a label without validation.
        duration_sec: Elapsed seconds of the LLM call, intended to be a
            difference of two ``time.monotonic()`` readings.
    """
    labelled_series = AGENT_STEP_DURATION.labels(agent=agent, outcome=outcome)
    labelled_series.observe(duration_sec)

View File

@@ -17,11 +17,11 @@ AI Router - Phase 13.3 #87
│ DELETE/CRITICAL │ Claude │ 強制使用最強模型 │
└─────────────────┴───────────────┴──────────────────────────────┘
版本: v4.3
版本: v4.4
建立: 2026-03-26 (台北時區)
建立者: Claude Code
最後修改: 2026-04-02 (台北時區)
修改者: ogt (首席架構師 Review C1/C2/C3 修復)
最後修改: 2026-04-27 (台北時區)
修改者: Claude Sonnet 4.6 (A2 — DIAGNOSE 移除 Ollama, INC-20260425)
變更紀錄:
| 版本 | 日期 | 執行者 | 變更內容 |
@@ -33,6 +33,7 @@ AI Router - Phase 13.3 #87
| v4.1 | 2026-04-04 | ogt (首席架構師) | Phase 25 P0: DIAGNOSE Privacy-First — _local_fallback_chain; DIAGNOSE→NEMOTRON; REJECT+Telegram |
| v4.2 | 2026-04-04 | Claude Code | Phase 25 P0 實測修正: _local_fallback_chain 移除 Nemotron(雲端),僅留 Ollama(本地); timeout 依實測調整(NIM 60s/Ollama 200s) |
| v4.3 | 2026-04-05 | Claude Code | Phase 25 P0 架構修正: 實測 Ollama CPU ~238s(不可用); NIM 實測 2-27s avg 10.6s; DIAGNOSE 改走 _full_fallback_chain(NIM 主力); _local_fallback_chain 廢棄 |
| v4.4 | 2026-04-27 | Claude Sonnet 4.6 | A2 INC-20260425: DIAGNOSE fallback chain 移除 Ollama (CPU 238s 二次 timeout); 新增 _diagnose_fallback_chain (NEMO→GEMINI→CLAUDE); 新增 aiops_diagnose_fallback_total metric |
"""
from __future__ import annotations
@@ -252,6 +253,18 @@ class AIRouter:
(AIProviderEnum.OLLAMA, self._ollama_default),
]
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE 移除 Ollama (CPU 238s 不可用)
# 根因: INC-20260425 NIM timeout 後 fallback 到 Ollama deepseek-r1:14b(CPU 238s),
# 造成二次 timeout。統帥批准 A+B 雙修,本任務 A2 將 Ollama 從 DIAGNOSE chain 移除。
# chain 順序: OPENCLAW_NEMO(主力) → GEMINI(第一備援) → CLAUDE(第二備援)
# Ollama 刻意排除: CPU-only 實測 238s,絕對不可用於 DIAGNOSE(INC-20260425 血的教訓)
self._diagnose_fallback_chain: list[tuple[AIProviderEnum, str]] = [
(AIProviderEnum.OPENCLAW_NEMO, self._openclaw_nemo_default),
(AIProviderEnum.GEMINI, self._gemini_default),
(AIProviderEnum.CLAUDE, self._claude_default),
# OLLAMA 永久排除於此 chain: CPU 238s, INC-20260425, 統帥授權 A2
]
# Tool Calling 專用 Fallback 鏈 (ADR-036)
self._tool_calling_fallback_chain: list[tuple[AIProviderEnum, str]] = [
(AIProviderEnum.NEMOTRON, self._nemotron_default),
@@ -419,13 +432,15 @@ class AIRouter:
logger.warning("ai_router_failover_manager_error", error=str(e))
# Step 4: 建立 Fallback 鏈
# 2026-04-05 Claude Code: v4.3 — DIAGNOSE 改回 _full_fallback_chain
# NIM 從 Phase 22 起就是主力無隱私問題Ollama CPU-only 不可用(實測 238s
# 2026-04-05 Claude Code: v4.3 — NIM 從 Phase 22 起就是主力,無隱私問題
# 2026-04-25 P1.2: 若 failover_manager 回傳了 fallback chain優先使用
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE 專用 chain排除 Ollama
# failover_manager 只在 OLLAMA 路徑觸發(Step 3b 限制);DIAGNOSE→OPENCLAW_NEMO
# 不會進入 failover 路徑,因此 failover_fallback 此時為 None,走 _build_fallback_chain_for_intent
fallback_chain = (
failover_fallback
if failover_fallback is not None
else self._build_fallback_chain(provider)
else self._build_fallback_chain_for_intent(provider, intent)
)
# Step 5: 計算延遲預算
@@ -607,6 +622,39 @@ class AIRouter:
fallbacks = [m for m in self._fallback_order if m != selected_model]
return fallbacks
def _build_fallback_chain_for_intent(
    self,
    selected_provider: AIProviderEnum,
    intent: IntentType,
) -> list[tuple[AIProviderEnum, str]]:
    """Build the fallback chain for a given intent (A2 fix for INC-20260425).

    2026-04-27 Claude Sonnet 4.6: A2 - DIAGNOSE reads from
    ``self._diagnose_fallback_chain``; every other intent keeps reading from
    ``self._full_fallback_chain``, so their behaviour is unchanged.
    Background (per the incident notes): a NIM timeout fell back to Ollama
    deepseek-r1:14b (CPU, ~238s) and caused a second timeout.

    Args:
        selected_provider: The primary provider already chosen; it is left out
            of the returned chain.
        intent: The normalized intent used to pick the source chain.

    Returns:
        A new list of ``(provider, model)`` pairs in source-chain order, without
        any entry whose provider equals ``selected_provider``.
    """
    # Only the chain that is actually needed is read from the instance.
    candidates = (
        self._diagnose_fallback_chain
        if intent == IntentType.DIAGNOSE
        else self._full_fallback_chain
    )

    chain = []
    for candidate_provider, candidate_model in candidates:
        if candidate_provider != selected_provider:
            chain.append((candidate_provider, candidate_model))
    return chain
def route_sync(
self,
text: str,
@@ -642,8 +690,9 @@ class AIRouter:
)
# 建立 Fallback 鏈
# 2026-04-05 Claude Code: v4.3 — 同 route()DIAGNOSE 改回 _full_fallback_chain
fallback_chain = self._build_fallback_chain(provider)
# 2026-04-05 Claude Code: v4.3 — NIM 主力,無隱私問題
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — 對齊 route(),使用 intent-aware chain
fallback_chain = self._build_fallback_chain_for_intent(provider, intent)
# 延遲預算
latency_budget = PROVIDER_LATENCY_BUDGET.get(provider, 30000)
@@ -1036,11 +1085,35 @@ class AIRouterExecutor:
errors: list[str] = []
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE fallback metric 追蹤
# 透過 context.get("intent_hint") 判斷是否為 DIAGNOSE避免改動 execute() 簽名
# _last_attempted_provider 記錄上一輪嘗試的 provider用於計算 from→to 關係
_is_diagnose_intent = str((context or {}).get("intent_hint", "")).strip().lower() == "diagnose"
_last_attempted_provider: str | None = None
for provider_name in provider_order:
# 2026-04-27 Claude Sonnet 4.6: A2 — 若上一輪失敗且本輪開始,表示發生 fallback
# 記錄 metricDIAGNOSE intent 專屬;非 DIAGNOSE 不記,不影響其他路徑)
if _is_diagnose_intent and _last_attempted_provider is not None:
try:
from src.core.metrics import record_diagnose_fallback
record_diagnose_fallback(
from_provider=_last_attempted_provider,
to_provider=provider_name,
)
logger.info(
"diagnose_fallback_recorded",
from_provider=_last_attempted_provider,
to_provider=provider_name,
)
except Exception as _metric_e:
logger.debug("diagnose_fallback_metric_failed", error=str(_metric_e))
provider = self._registry.get(provider_name)
if not provider:
# 2026-04-14 Claude Sonnet 4.6: silent skip 改 errors 累積(觀測性)
errors.append(f"{provider_name}: not_registered")
_last_attempted_provider = provider_name
continue
# 隱私過濾 (D7)
@@ -1053,6 +1126,7 @@ class AIRouterExecutor:
if cb.is_open():
errors.append(f"{provider_name}: circuit_open")
logger.warning("ai_router_circuit_open", provider=provider_name)
_last_attempted_provider = provider_name
continue
# 閘門 2: Rate Limiter
@@ -1125,6 +1199,8 @@ class AIRouterExecutor:
# Provider 回傳 success=False
errors.append(f"{provider_name}: {result.error}")
logger.warning("ai_router_provider_failed", provider=provider_name, error=result.error)
# 2026-04-27 A2: 記錄失敗的 provider供下輪迭代計算 fallback metric
_last_attempted_provider = provider_name
except Exception as e:
errors.append(f"{provider_name}: {e}")
@@ -1135,6 +1211,8 @@ class AIRouterExecutor:
import httpx as _httpx
if not isinstance(e, _httpx.TimeoutException):
cb.record_failure()
# 2026-04-27 A2: 記錄失敗的 provider供下輪迭代計算 fallback metric
_last_attempted_provider = provider_name
# 全部失敗
logger.error("ai_router_all_providers_failed", tried=provider_order, errors=errors)

View File

@@ -331,11 +331,21 @@ class TestSelectProvider:
manager = OllamaFailoverManager(health_monitor=mock_monitor)
manager._settings = mock_settings
with patch.object(manager, "_write_failover_audit", return_value=None):
# 2026-04-27 Wave8-X2: 必須 mock Redis pool_check_gemini_quota 走 fail-closed 路徑會切到 188 而非 Gemini
# 測試本身只要驗 OFFLINE → Gemini 路由邏輯,故繞過 quota check
with patch.object(manager, "_write_failover_audit", return_value=None), \
patch.object(manager, "_check_gemini_quota", AsyncMock(return_value=True)), \
patch(
"src.services.failover_alerter.get_failover_alerter",
return_value=MagicMock(
alert_failover=AsyncMock(),
alert_gemini_quota_exceeded=AsyncMock(),
),
):
result = await manager.select_provider()
assert isinstance(result, OllamaRoutingResult)
# 新矩陣111 OFFLINE → primary=Gemini188 降為 fallback
# 新矩陣111 OFFLINE + Gemini quota OK → primary=Gemini188 降為 fallback
assert result.primary.provider_name == "gemini"
@pytest.mark.asyncio
@@ -655,17 +665,21 @@ class TestGeminiQuota:
assert ok is False
@pytest.mark.asyncio
async def test_gemini_quota_redis_unavailable_fail_open(self):
"""Redis 掛掉 → 返回 Truefail-open仍允許走 Gemini"""
async def test_gemini_quota_redis_unavailable_fail_closed(self):
"""Redis 掛掉 → 返回 False2026-04-27 Wave8-X2 fail-closed違反費用鐵律的修復"""
manager = _make_manager()
with patch(
"src.core.redis_client.get_redis",
side_effect=RuntimeError("Redis unavailable"),
), patch(
"src.services.failover_alerter.get_failover_alerter",
return_value=MagicMock(alert_gemini_quota_exceeded=AsyncMock()),
):
ok = await manager._check_gemini_quota()
assert ok is True
# fail-closed:Redis 異常時拒絕 Gemini,避免費用失控(違反 feedback_cost_change_approval.md)
assert ok is False
@pytest.mark.asyncio
async def test_select_provider_quota_exceeded_uses_188(self):