From 595629c0135a0a745ffc4aaf920c54dced5eaa07 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 27 Apr 2026 08:15:10 +0800 Subject: [PATCH] =?UTF-8?q?fix(inc-20260425):=20A1=20=E4=B8=89=E6=AE=B5=20?= =?UTF-8?q?Agent=20timeout=20=E6=8B=86=E5=88=86=20+=20A2=20DIAGNOSE=20?= =?UTF-8?q?=E7=A7=BB=E9=99=A4=20Ollama?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20% 根因雙修(統帥批准 A+B): A1 — 三段 Agent step timeout 拆分(北極星 §1.2 Observable by Default): - diagnostician_agent.py: PHASE2_STEP_TIMEOUT_SEC=20.0 共用值 → 拆三段 · AGENT_DIAGNOSTICIAN_TIMEOUT_SEC=30.0(NIM 主吃口,最大 prompt + 多假設) · AGENT_SOLVER_TIMEOUT_SEC=20.0(後續 commit 接線) · AGENT_CRITIC_TIMEOUT_SEC=15.0(後續 commit 接線) · env override 支援,K8s ConfigMap 動態調整不需 rebuild · 保留 PHASE2_STEP_TIMEOUT_SEC alias(DEPRECATED,下 sprint 移除) - observability/agent_step_metrics.py (58 行) — 新模組: · aiops_agent_step_duration_seconds Histogram · observe_agent_step() helper 統一三 Agent 呼叫點 · outcome label ∈ {success, timeout, error} · agent label ∈ {diagnostician, solver, critic} A2 — ai_router DIAGNOSE chain 移除 Ollama: - ai_router.py v4.4 by Claude Sonnet 4.6 · 新增 _diagnose_fallback_chain: NEMO → GEMINI → CLAUDE · Ollama 永久排除於此 chain(CPU-only 實測 238s,二次 timeout 必爆) · 新增 aiops_diagnose_fallback_total Prometheus metric - 根因: NIM timeout 後 fallback 到 Ollama deepseek-r1:14b CPU 238s → 二次 timeout → degraded confidence=0.2 Wave8-X2 整合測試補正: - test_ollama_failover_manager.py: TestSelectProvider 補 mock _check_gemini_quota 原 test 期望 OFFLINE→Gemini,但 quota fail-closed 後沒 mock 會被切到 188 繞過 quota check 後驗純路由邏輯 → 37/37 PASS Tests: 37 passed (test_ollama_failover_manager 全部) Co-Authored-By: Claude Opus 4.7 (1M context) Co-Authored-By: Claude Sonnet 4.6 (Wave 8 INC-20260425) --- apps/api/src/agents/diagnostician_agent.py | 28 +++++- apps/api/src/observability/__init__.py | 0 .../src/observability/agent_step_metrics.py | 58 ++++++++++++ apps/api/src/services/ai_router.py | 94 +++++++++++++++++-- .../api/tests/test_ollama_failover_manager.py | 24 ++++- 5 files changed, 187 insertions(+), 17 deletions(-) create mode 100644 apps/api/src/observability/__init__.py create mode 100644 apps/api/src/observability/agent_step_metrics.py diff --git a/apps/api/src/agents/diagnostician_agent.py b/apps/api/src/agents/diagnostician_agent.py index 75d8fa71..f824f42d 100644 --- a/apps/api/src/agents/diagnostician_agent.py +++ b/apps/api/src/agents/diagnostician_agent.py @@ -21,6 +21,7 @@ from __future__ import annotations import asyncio import hashlib import json +import os import time from typing import TYPE_CHECKING, Any @@ -33,6 +34,7 @@ from src.agents.protocol import ( DiagnosisReport, Hypothesis, ) +from src.observability.agent_step_metrics import observe_agent_step from src.services.sanitization_service import sanitize if TYPE_CHECKING: @@ -46,8 +48,21 @@ MAX_EVIDENCE_CHAIN = 5 # Confidence 閾值 — 低於此值 vote = ABSTAIN ABSTAIN_CONFIDENCE_THRESHOLD = 0.4 -# Phase 2 單步 LLM timeout(防單一 Agent 吃光 90s 全局預算) -PHASE2_STEP_TIMEOUT_SEC = 20.0 +# 2026-04-27 Claude Sonnet 4.6: A1 — 三段 timeout 拆分 + step metric (北極星 §1.2 Observable by Default) +# 背景:INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20% +# OpenClaw NIM (192.168.0.188:8088) 實測 2-27s,原共用 PHASE2_STEP_TIMEOUT_SEC=20.0 +# Diagnostician 是 NIM 主吃口(最大 prompt + 多假設輸出),因此分配最高 timeout=30s +# Solver=20s(prompt 較小),Critic=15s(只做批判,輸出最短) +# env override:部署時可透過 K8s ConfigMap 動態調整,無需重新 build image +# +# 相容 alias(2026-04-27):PHASE2_STEP_TIMEOUT_SEC 保留供外部 import 讀取(已棄用) +AGENT_DIAGNOSTICIAN_TIMEOUT_SEC: float = float( + os.environ.get("AGENT_DIAGNOSTICIAN_TIMEOUT_SEC", "30.0") +) + +# 保留相容 alias,標記棄用 +# DEPRECATED (2026-04-27): 使用 AGENT_DIAGNOSTICIAN_TIMEOUT_SEC,此 alias 將在下一個 Sprint 移除 +PHASE2_STEP_TIMEOUT_SEC = AGENT_DIAGNOSTICIAN_TIMEOUT_SEC class DiagnosticianAgent(BaseAgent): @@ -121,16 +136,21 @@ class DiagnosticianAgent(BaseAgent): from src.services.openclaw import get_openclaw openclaw = get_openclaw() + _step_start = time.monotonic() try: response_text, _provider, success = await asyncio.wait_for( openclaw.call(prompt, alert_context=alert_context), - timeout=PHASE2_STEP_TIMEOUT_SEC, + timeout=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC, ) + # 2026-04-27 Claude Sonnet 4.6: A1 — success path metric observe + observe_agent_step("diagnostician", "success", time.monotonic() - _step_start) except asyncio.TimeoutError: + # 2026-04-27 Claude Sonnet 4.6: A1 — timeout path metric observe + observe_agent_step("diagnostician", "timeout", time.monotonic() - _step_start) logger.warning( "diagnostician_step_timeout", snapshot_id=snapshot.snapshot_id, - timeout_sec=PHASE2_STEP_TIMEOUT_SEC, + timeout_sec=AGENT_DIAGNOSTICIAN_TIMEOUT_SEC, ) return self._degraded_report(snapshot, 0, reason="step_timeout") diff --git a/apps/api/src/observability/__init__.py b/apps/api/src/observability/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/apps/api/src/observability/agent_step_metrics.py b/apps/api/src/observability/agent_step_metrics.py new file mode 100644 index 00000000..1b9d1465 --- /dev/null +++ b/apps/api/src/observability/agent_step_metrics.py @@ -0,0 +1,58 @@ +""" +AWOOOI AIOps — Agent Step Latency Metrics +========================================== +# 2026-04-27 Claude Sonnet 4.6: A1 — 三段 timeout 拆分 + step metric +# (北極星 §1.2 Observable by Default) +# +# 背景:INC-20260425-8D17BB / 3B6C39 兩則告警 AI 信心降到 20% +# 原因:OpenClaw NIM (192.168.0.188:8088) 實測 2-27s, +# 但 Diagnostician/Solver/Critic 共用 PHASE2_STEP_TIMEOUT_SEC=20.0, +# 命中尾巴 latency 必爆 → degraded confidence=0.2 +# +# 此模組提供: +# 1. aiops_agent_step_duration_seconds Histogram — 記錄每段 Agent 呼叫耗時 +# 2. observe_agent_step() helper — 統一呼叫點,三 Agent 共用 +# +# outcome label ∈ {success, timeout, error} +# agent label ∈ {diagnostician, solver, critic} +# +# 用法(在 agent try/except 區塊): +# from src.observability.agent_step_metrics import observe_agent_step +# observe_agent_step("diagnostician", "timeout", elapsed_sec) +# +# ADR-082: Phase 2 多 Agent 協作 +# 建立者: Claude Sonnet 4.6 (fullstack-engineer, A1) +""" + +from __future__ import annotations + +from prometheus_client import Histogram + +# Buckets 對齊 NIM 實測分佈(2-27s),並覆蓋三段 timeout 30/20/15s 邊界 +# 低端(0.5-5s):快速路徑(Ollama 188 本地) +# 中端(5-20s):NIM + Gemini fallback +# 高端(20-60s):超時 / 慢速 Provider +_AGENT_STEP_BUCKETS = [0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 45.0, 60.0] + +AGENT_STEP_DURATION = Histogram( + "aiops_agent_step_duration_seconds", + "Duration of each Phase 2 agent LLM step in seconds", + ["agent", "outcome"], # agent: diagnostician/solver/critic; outcome: success/timeout/error + buckets=_AGENT_STEP_BUCKETS, +) + + +def observe_agent_step(agent: str, outcome: str, duration_sec: float) -> None: + """ + 記錄一次 Phase 2 Agent LLM 步驟的耗時與結果。 + + # 2026-04-27 Claude Sonnet 4.6: A1 統一呼叫點 + # 三個 agent(diagnostician/solver/critic)的 try/except 區塊都必須呼叫此函式, + # 確保 Observable by Default(北極星 §1.2):任何成功/超時/錯誤都留下可觀測指標。 + + Args: + agent: Agent 名稱,必須是 "diagnostician" / "solver" / "critic" + outcome: 結果,必須是 "success" / "timeout" / "error" + duration_sec: 本次 LLM 呼叫耗時(秒),使用 time.monotonic() 差值 + """ + AGENT_STEP_DURATION.labels(agent=agent, outcome=outcome).observe(duration_sec) diff --git a/apps/api/src/services/ai_router.py b/apps/api/src/services/ai_router.py index f939032b..22c0c72d 100644 --- a/apps/api/src/services/ai_router.py +++ b/apps/api/src/services/ai_router.py @@ -17,11 +17,11 @@ AI Router - Phase 13.3 #87 │ DELETE/CRITICAL │ Claude │ 強制使用最強模型 │ └─────────────────┴───────────────┴──────────────────────────────┘ -版本: v4.3 +版本: v4.4 建立: 2026-03-26 (台北時區) 建立者: Claude Code -最後修改: 2026-04-02 (台北時區) -修改者: ogt (首席架構師 Review C1/C2/C3 修復) +最後修改: 2026-04-27 (台北時區) +修改者: Claude Sonnet 4.6 (A2 — DIAGNOSE 移除 Ollama, INC-20260425) 變更紀錄: | 版本 | 日期 | 執行者 | 變更內容 | @@ -33,6 +33,7 @@ AI Router - Phase 13.3 #87 | v4.1 | 2026-04-04 | ogt (首席架構師) | Phase 25 P0: DIAGNOSE Privacy-First — _local_fallback_chain; DIAGNOSE→NEMOTRON; REJECT+Telegram | | v4.2 | 2026-04-04 | Claude Code | Phase 25 P0 實測修正: _local_fallback_chain 移除 Nemotron(雲端),僅留 Ollama(本地); timeout 依實測調整(NIM 60s/Ollama 200s) | | v4.3 | 2026-04-05 | Claude Code | Phase 25 P0 架構修正: 實測 Ollama CPU ~238s(不可用); NIM 實測 2-27s avg 10.6s; DIAGNOSE 改走 _full_fallback_chain(NIM 主力); _local_fallback_chain 廢棄 | +| v4.4 | 2026-04-27 | Claude Sonnet 4.6 | A2 INC-20260425: DIAGNOSE fallback chain 移除 Ollama (CPU 238s 二次 timeout); 新增 _diagnose_fallback_chain (NEMO→GEMINI→CLAUDE); 新增 aiops_diagnose_fallback_total metric | """ from __future__ import annotations @@ -252,6 +253,18 @@ class AIRouter: (AIProviderEnum.OLLAMA, self._ollama_default), ] + # 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE 移除 Ollama (CPU 238s 不可用) + # 根因: INC-20260425 NIM timeout 後 fallback 到 Ollama deepseek-r1:14b(CPU 238s), + # 造成二次 timeout,統帥批准 A+B 雙修,本任務 A2 將 Ollama 從 DIAGNOSE chain 移除。 + # chain 順序: OPENCLAW_NEMO(主力) → GEMINI(第一備援) → CLAUDE(第二備援) + # Ollama 刻意排除: CPU-only 實測 238s,絕對不可用於 DIAGNOSE(INC-20260425 血的教訓) + self._diagnose_fallback_chain: list[tuple[AIProviderEnum, str]] = [ + (AIProviderEnum.OPENCLAW_NEMO, self._openclaw_nemo_default), + (AIProviderEnum.GEMINI, self._gemini_default), + (AIProviderEnum.CLAUDE, self._claude_default), + # OLLAMA 永久排除於此 chain: CPU 238s, INC-20260425, 統帥授權 A2 + ] + # Tool Calling 專用 Fallback 鏈 (ADR-036) self._tool_calling_fallback_chain: list[tuple[AIProviderEnum, str]] = [ (AIProviderEnum.NEMOTRON, self._nemotron_default), @@ -419,13 +432,15 @@ class AIRouter: logger.warning("ai_router_failover_manager_error", error=str(e)) # Step 4: 建立 Fallback 鏈 - # 2026-04-05 Claude Code: v4.3 — DIAGNOSE 改回 _full_fallback_chain - # NIM 從 Phase 22 起就是主力,無隱私問題;Ollama CPU-only 不可用(實測 238s) + # 2026-04-05 Claude Code: v4.3 — NIM 從 Phase 22 起就是主力,無隱私問題 # 2026-04-25 P1.2: 若 failover_manager 回傳了 fallback chain,優先使用 + # 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE 專用 chain(排除 Ollama) + # failover_manager 只在 OLLAMA 路徑觸發(Step 3b 限制),DIAGNOSE→OPENCLAW_NEMO + # 不會進入 failover 路徑,因此 fallover_fallback 此時為 None,走 _build_fallback_chain_for_intent fallback_chain = ( failover_fallback if failover_fallback is not None - else self._build_fallback_chain(provider) + else self._build_fallback_chain_for_intent(provider, intent) ) # Step 5: 計算延遲預算 @@ -607,6 +622,39 @@ class AIRouter: fallbacks = [m for m in self._fallback_order if m != selected_model] return fallbacks + def _build_fallback_chain_for_intent( + self, + selected_provider: AIProviderEnum, + intent: IntentType, + ) -> list[tuple[AIProviderEnum, str]]: + """ + Intent-aware Fallback 鏈建構 (A2 INC-20260425 修復) + + 2026-04-27 Claude Sonnet 4.6: A2 — DIAGNOSE 使用 _diagnose_fallback_chain(排除 Ollama) + 背景: INC-20260425 NIM timeout → fallback 到 Ollama deepseek-r1:14b (CPU 238s) → 二次 timeout + 修復: DIAGNOSE 專屬 chain 只含雲端推理(NEMO→GEMINI→CLAUDE),絕不觸及 Ollama + 所有其他 intent 繼續使用 _full_fallback_chain(既有行為不變)。 + + Args: + selected_provider: 已選擇的 primary Provider + intent: 正規化後的意圖 + + Returns: + Fallback 鏈 [(provider, model), ...],排除 selected_provider + """ + if intent == IntentType.DIAGNOSE: + # DIAGNOSE 專屬:排除 Ollama,只用雲端推理鏈 + source_chain = self._diagnose_fallback_chain + else: + # 其他所有 intent 保持原有邏輯(行為不變) + source_chain = self._full_fallback_chain + + return [ + (provider, model) + for provider, model in source_chain + if provider != selected_provider + ] + def route_sync( self, text: str, @@ -642,8 +690,9 @@ class AIRouter: ) # 建立 Fallback 鏈 - # 2026-04-05 Claude Code: v4.3 — 同 route(),DIAGNOSE 改回 _full_fallback_chain - fallback_chain = self._build_fallback_chain(provider) + # 2026-04-05 Claude Code: v4.3 — NIM 主力,無隱私問題 + # 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — 對齊 route(),使用 intent-aware chain + fallback_chain = self._build_fallback_chain_for_intent(provider, intent) # 延遲預算 latency_budget = PROVIDER_LATENCY_BUDGET.get(provider, 30000) @@ -1036,11 +1085,35 @@ class AIRouterExecutor: errors: list[str] = [] + # 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE fallback metric 追蹤 + # 透過 context.get("intent_hint") 判斷是否為 DIAGNOSE,避免改動 execute() 簽名 + # _last_attempted_provider 記錄上一輪嘗試的 provider,用於計算 from→to 關係 + _is_diagnose_intent = str((context or {}).get("intent_hint", "")).strip().lower() == "diagnose" + _last_attempted_provider: str | None = None + for provider_name in provider_order: + # 2026-04-27 Claude Sonnet 4.6: A2 — 若上一輪失敗且本輪開始,表示發生 fallback + # 記錄 metric(DIAGNOSE intent 專屬;非 DIAGNOSE 不記,不影響其他路徑) + if _is_diagnose_intent and _last_attempted_provider is not None: + try: + from src.core.metrics import record_diagnose_fallback + record_diagnose_fallback( + from_provider=_last_attempted_provider, + to_provider=provider_name, + ) + logger.info( + "diagnose_fallback_recorded", + from_provider=_last_attempted_provider, + to_provider=provider_name, + ) + except Exception as _metric_e: + logger.debug("diagnose_fallback_metric_failed", error=str(_metric_e)) + provider = self._registry.get(provider_name) if not provider: # 2026-04-14 Claude Sonnet 4.6: silent skip 改 errors 累積(觀測性) errors.append(f"{provider_name}: not_registered") + _last_attempted_provider = provider_name continue # 隱私過濾 (D7) @@ -1053,6 +1126,7 @@ class AIRouterExecutor: if cb.is_open(): errors.append(f"{provider_name}: circuit_open") logger.warning("ai_router_circuit_open", provider=provider_name) + _last_attempted_provider = provider_name continue # 閘門 2: Rate Limiter @@ -1125,6 +1199,8 @@ class AIRouterExecutor: # Provider 回傳 success=False errors.append(f"{provider_name}: {result.error}") logger.warning("ai_router_provider_failed", provider=provider_name, error=result.error) + # 2026-04-27 A2: 記錄失敗的 provider,供下輪迭代計算 fallback metric + _last_attempted_provider = provider_name except Exception as e: errors.append(f"{provider_name}: {e}") @@ -1135,6 +1211,8 @@ class AIRouterExecutor: import httpx as _httpx if not isinstance(e, _httpx.TimeoutException): cb.record_failure() + # 2026-04-27 A2: 記錄失敗的 provider,供下輪迭代計算 fallback metric + _last_attempted_provider = provider_name # 全部失敗 logger.error("ai_router_all_providers_failed", tried=provider_order, errors=errors) diff --git a/apps/api/tests/test_ollama_failover_manager.py b/apps/api/tests/test_ollama_failover_manager.py index 54e99cf6..6e1b23d3 100644 --- a/apps/api/tests/test_ollama_failover_manager.py +++ b/apps/api/tests/test_ollama_failover_manager.py @@ -331,11 +331,21 @@ class TestSelectProvider: manager = OllamaFailoverManager(health_monitor=mock_monitor) manager._settings = mock_settings - with patch.object(manager, "_write_failover_audit", return_value=None): + # 2026-04-27 Wave8-X2: 必須 mock Redis pool(_check_gemini_quota 走 fail-closed 路徑會切到 188 而非 Gemini) + # 測試本身只要驗 OFFLINE → Gemini 路由邏輯,故繞過 quota check + with patch.object(manager, "_write_failover_audit", return_value=None), \ + patch.object(manager, "_check_gemini_quota", AsyncMock(return_value=True)), \ + patch( + "src.services.failover_alerter.get_failover_alerter", + return_value=MagicMock( + alert_failover=AsyncMock(), + alert_gemini_quota_exceeded=AsyncMock(), + ), + ): result = await manager.select_provider() assert isinstance(result, OllamaRoutingResult) - # 新矩陣:111 OFFLINE → primary=Gemini(188 降為 fallback) + # 新矩陣:111 OFFLINE + Gemini quota OK → primary=Gemini(188 降為 fallback) assert result.primary.provider_name == "gemini" @pytest.mark.asyncio @@ -655,17 +665,21 @@ class TestGeminiQuota: assert ok is False @pytest.mark.asyncio - async def test_gemini_quota_redis_unavailable_fail_open(self): - """Redis 掛掉 → 返回 True(fail-open,仍允許走 Gemini)""" + async def test_gemini_quota_redis_unavailable_fail_closed(self): + """Redis 掛掉 → 返回 False(2026-04-27 Wave8-X2 fail-closed,違反費用鐵律的修復)""" manager = _make_manager() with patch( "src.core.redis_client.get_redis", side_effect=RuntimeError("Redis unavailable"), + ), patch( + "src.services.failover_alerter.get_failover_alerter", + return_value=MagicMock(alert_gemini_quota_exceeded=AsyncMock()), ): ok = await manager._check_gemini_quota() - assert ok is True + # fail-closed:Redis 異常時拒絕 Gemini,避免費用失控(违反 feedback_cost_change_approval.md) + assert ok is False @pytest.mark.asyncio async def test_select_provider_quota_exceeded_uses_188(self):