Files
awoooi/apps/api/tests/test_ai_router_diagnose_fallback.py
Your Name fefe4c21cd
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(inc-20260425): A1+A2 後續 — Solver/Critic timeout + auto_repair 接線 + Runbook + Grafana
延續 595629c0 INC-20260425 修復,補三段 Agent + 全鏈路觀測:

A1 後續 — Solver/Critic 三段 timeout 接線:
- solver_agent.py: AGENT_SOLVER_TIMEOUT_SEC=20.0(env override)
- critic_agent.py: AGENT_CRITIC_TIMEOUT_SEC=15.0(env override)
- protocol.py: 三 Agent 共用 observe_agent_step() 包裹呼叫
  · success/timeout/error outcome label
  · histogram 寫入 aiops_agent_step_duration_seconds

A2 後續 — auto_repair_service 改用 _diagnose_fallback_chain:
- auto_repair_service.py +46 行 — 切換 DIAGNOSE 路由到新 chain(NEMO→GEMINI→CLAUDE)
- 完全避開 Ollama CPU 238s 二次 timeout

新增 metrics:
- core/metrics.py +59 行 — 配合 observe_agent_step 的 histogram bucket + label cardinality

新增測試 (862 行):
- test_agent_step_timeouts.py (475) — 三 Agent 各 timeout 邊界 + outcome label
- test_ai_router_diagnose_fallback.py (387) — _diagnose_fallback_chain 正確序

新增配套:
- docs/runbooks/RUNBOOK-AGENT-STEP-LATENCY.md (350) — INC 故障排查 + 觀測指引
- ops/monitoring/grafana/agent_step_latency_rules.yaml (160)
  · 三 Agent histogram alert rules(p99 > timeout 80% → warning)

驗收: 33 tests pass (test_agent_step_timeouts 22 + test_ai_router_diagnose_fallback 11)

INC-20260425 雙修總工作量(595629c0 + 此 commit):
  · 5 個 service/agent 檔修改
  · 1 個新 observability 模組
  · 4 個新測試/配套檔
  · 1372+187 = 1559 行新增

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 (INC-20260425 後續) <noreply@anthropic.com>
2026-04-27 08:15:53 +08:00

388 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# apps/api/tests/test_ai_router_diagnose_fallback.py
# 2026-04-27 Claude Sonnet 4.6: A2 INC-20260425 — DIAGNOSE fallback chain 移除 Ollama
"""
DIAGNOSE Fallback Chain 測試 (A2 INC-20260425)
===============================================
驗收標準:
1. DIAGNOSE intentNEMO 失敗 → 跳 Gemini不跳 Ollama
2. Gemini 失敗 → 跳 Claude
3. 全失敗 → graceful 降級(不再去 Ollama
4. 其他 intent如 RESTART的 fallback 行為不變Ollama 仍在鏈中)
5. aiops_diagnose_fallback_total metric 可正常累計
測試分類unitmock provider / registry無 Redis / DB / K8s 依賴)
"""
from __future__ import annotations
import os
os.environ.setdefault("MOCK_MODE", "true")
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from src.services.ai_router import (
AIProviderEnum,
AIRouter,
AIRouterExecutor,
AIProviderRegistry,
reset_ai_router,
)
from src.services.intent_classifier import IntentType
# =============================================================================
# Fixtures
# =============================================================================
@pytest.fixture(autouse=True)
def reset_router():
    """Reset the AI router singleton before and after every test.

    The fixture is autouse, so each test in this module is wrapped by it.
    Resetting on both sides keeps mocks from leaking between tests: the
    pre-test reset protects against state left behind by code that ran
    earlier without this fixture, and the post-test reset cleans up after
    the test that just ran.
    """
    # Setup: start from a clean singleton.
    reset_ai_router()
    yield
    # Teardown: drop whatever the test installed.
    reset_ai_router()
def _make_router() -> AIRouter:
    """Return an AIRouter whose failover manager is a mock (no Redis needed).

    The mocked ``select_provider`` raises ``RuntimeError("not needed")`` when
    awaited.
    """
    router = AIRouter()

    failover_stub = MagicMock()
    failover_stub.select_provider = AsyncMock(
        side_effect=RuntimeError("not needed"),
    )
    router._failover_manager = failover_stub

    return router
def _make_registry_with_providers(
    *,
    nemo_success: bool = True,
    gemini_success: bool = True,
    claude_success: bool = True,
) -> AIProviderRegistry:
    """Build a registry holding only openclaw_nemo, gemini and claude mocks.

    No Ollama provider is registered.

    Args:
        nemo_success: Whether the openclaw_nemo mock reports success.
        gemini_success: Whether the gemini mock reports success.
        claude_success: Whether the claude mock reports success.

    Returns:
        An ``AIProviderRegistry`` whose ``_providers`` mapping has been
        replaced with the three mocked providers, in that order.
    """
    from src.services.ai_providers.interfaces import AIResult

    registry = AIProviderRegistry()

    def _make_provider(name: str, privacy: str, success: bool, response: str = "") -> MagicMock:
        # Canned result returned by every analyze() call on this mock.
        canned_result = AIResult(
            raw_response=response or f"{name}_response",
            success=success,
            provider=name,
            error="" if success else f"{name}_timeout",
        )

        stub = MagicMock()
        stub.name = name
        stub.privacy_level = privacy
        stub.is_enabled = True
        stub.capabilities = {"rca", "chat"}
        stub.analyze = AsyncMock(return_value=canned_result)
        # health_check mirrors the success flag.
        stub.health_check = AsyncMock(return_value=success)
        return stub

    # (provider name, success flag, explicit raw response or "" for the default)
    provider_specs = (
        ("openclaw_nemo", nemo_success, ""),
        ("gemini", gemini_success, ""),
        ("claude", claude_success, "claude_diagnosis_result"),
    )
    registry._providers = {
        name: _make_provider(name, "cloud", succeeded, canned_text)
        for name, succeeded, canned_text in provider_specs
    }
    return registry
# =============================================================================
# Test 1: _diagnose_fallback_chain 屬性存在且不含 Ollama
# =============================================================================
def test_diagnose_fallback_chain_no_ollama():
    """The router exposes _diagnose_fallback_chain and it has no OLLAMA variant."""
    router = _make_router()

    has_chain = hasattr(router, "_diagnose_fallback_chain")
    assert has_chain, "_diagnose_fallback_chain 屬性不存在"

    # Each chain entry is a pair; only the provider half matters here.
    chain_members = [provider for provider, _ in router._diagnose_fallback_chain]

    assert AIProviderEnum.OLLAMA not in chain_members, (
        f"OLLAMA 不應出現在 _diagnose_fallback_chain: {chain_members}"
    )
    assert AIProviderEnum.OLLAMA_188 not in chain_members, (
        f"OLLAMA_188 不應出現在 _diagnose_fallback_chain: {chain_members}"
    )
def test_diagnose_fallback_chain_contains_cloud_providers():
    """_diagnose_fallback_chain includes OPENCLAW_NEMO, GEMINI and CLAUDE."""
    chain = _make_router()._diagnose_fallback_chain
    chain_members = [provider for provider, _ in chain]

    assert AIProviderEnum.OPENCLAW_NEMO in chain_members
    assert AIProviderEnum.GEMINI in chain_members
    assert AIProviderEnum.CLAUDE in chain_members
# =============================================================================
# Test 2: DIAGNOSE route() 的 fallback_chain 不含 Ollama
# =============================================================================
@pytest.mark.asyncio
async def test_diagnose_route_fallback_chain_excludes_ollama():
    """route() for a DIAGNOSE intent returns a fallback_chain without OLLAMA."""
    router = _make_router()

    routing = await router.route(
        "pod crash loop detected",
        context={"intent_hint": "diagnose"},
    )

    assert routing.selected_provider == AIProviderEnum.OPENCLAW_NEMO, (
        f"primary 應為 OPENCLAW_NEMO實際: {routing.selected_provider}"
    )

    fallback_members = [provider for provider, _ in routing.fallback_chain]
    assert AIProviderEnum.OLLAMA not in fallback_members, (
        f"OLLAMA 不應在 DIAGNOSE fallback_chain: {fallback_members}"
    )
    assert AIProviderEnum.OLLAMA_188 not in fallback_members, (
        f"OLLAMA_188 不應在 DIAGNOSE fallback_chain: {fallback_members}"
    )
@pytest.mark.asyncio
async def test_diagnose_route_sync_fallback_chain_excludes_ollama():
    """route_sync() for a DIAGNOSE intent returns a fallback_chain without Ollama.

    Both Ollama variants (OLLAMA and OLLAMA_188) are checked, matching the
    assertions made for the async ``route()`` path and for
    ``_diagnose_fallback_chain`` itself.
    """
    router = _make_router()
    # route_sync is called without await; the asyncio marker is kept so the
    # test still runs the same way under pytest.
    decision = router.route_sync(
        "pod crash loop detected",
        context={"intent_hint": "diagnose"},
    )
    fb_providers = [p for p, _ in decision.fallback_chain]
    assert AIProviderEnum.OLLAMA not in fb_providers, (
        f"OLLAMA 不應在 DIAGNOSE route_sync fallback_chain: {fb_providers}"
    )
    # Previously missing: the sync path must exclude the second Ollama variant too.
    assert AIProviderEnum.OLLAMA_188 not in fb_providers, (
        f"OLLAMA_188 不應在 DIAGNOSE route_sync fallback_chain: {fb_providers}"
    )
# =============================================================================
# Test 3: DIAGNOSE, NEMO fails -> fall back to Gemini (not Ollama)
# =============================================================================
@pytest.mark.asyncio
async def test_diagnose_nemo_fail_fallback_to_gemini_not_ollama():
    """DIAGNOSE: when NEMO fails the executor lands on Gemini, not Ollama."""
    registry = _make_registry_with_providers(nemo_success=False, gemini_success=True)
    executor = AIRouterExecutor(registry)

    with patch("src.services.ai_router._settings") as settings_stub:
        # Run with MOCK_MODE off on the patched settings object.
        settings_stub.MOCK_MODE = False
        outcome = await executor.execute(
            prompt="RCA: pod OOMKilled",
            provider_order=["openclaw_nemo", "gemini", "claude"],
            context={"intent_hint": "diagnose"},
        )

    assert outcome.success is True
    assert outcome.provider == "gemini", (
        f"應 fallback 到 gemini實際: {outcome.provider}"
    )

    # Confirm no Ollama provider was added to the registry.
    registered_ollama = registry._providers.get("ollama")
    assert registered_ollama is None, "registry 不應含 ollama providerDIAGNOSE 路徑)"
# =============================================================================
# Test 4: DIAGNOSE Gemini 失敗 → fallback 到 Claude
# =============================================================================
@pytest.mark.asyncio
async def test_diagnose_gemini_fail_fallback_to_claude():
    """DIAGNOSE: with NEMO and Gemini both failing, the executor lands on Claude."""
    registry = _make_registry_with_providers(
        nemo_success=False, gemini_success=False, claude_success=True
    )
    executor = AIRouterExecutor(registry)

    with patch("src.services.ai_router._settings") as settings_stub:
        # Run with MOCK_MODE off on the patched settings object.
        settings_stub.MOCK_MODE = False
        outcome = await executor.execute(
            prompt="RCA: pod crash",
            provider_order=["openclaw_nemo", "gemini", "claude"],
            context={"intent_hint": "diagnose"},
        )

    assert outcome.success is True
    assert outcome.provider == "claude", (
        f"應 fallback 到 claude實際: {outcome.provider}"
    )
# =============================================================================
# Test 5: DIAGNOSE, every provider fails -> graceful degradation (no Ollama)
# =============================================================================
@pytest.mark.asyncio
async def test_diagnose_all_fail_graceful_no_ollama():
    """DIAGNOSE: NEMO, Gemini and Claude all fail -> graceful error, no Ollama."""
    registry = _make_registry_with_providers(
        nemo_success=False, gemini_success=False, claude_success=False
    )
    executor = AIRouterExecutor(registry)

    with patch("src.services.ai_router._settings") as settings_stub:
        # Run with MOCK_MODE off on the patched settings object.
        settings_stub.MOCK_MODE = False
        outcome = await executor.execute(
            prompt="RCA: cascading failure",
            provider_order=["openclaw_nemo", "gemini", "claude"],
            context={"intent_hint": "diagnose"},
        )

    # Total failure is reported as an unsuccessful result rather than raised.
    assert outcome.success is False
    assert outcome.provider == "none"

    # The registry never had an ollama entry, so none was available to try.
    assert "ollama" not in registry._providers
# =============================================================================
# Test 6: other intents (e.g. RESTART) keep their fallback behaviour (Ollama stays in the chain)
# =============================================================================
@pytest.mark.asyncio
async def test_restart_intent_still_has_ollama_in_fallback():
    """RESTART routing still includes OLLAMA (behaviour unchanged by A2)."""
    router = _make_router()

    # Original author's note: RESTART maps to no dedicated provider and goes
    # through complexity routing, where low complexity picks OLLAMA as primary.
    # The intent_hint is passed directly so no LLM classification is needed.
    routing = await router.route(
        "restart the api service",
        context={"intent_hint": "restart"},
    )

    # OLLAMA may be the primary or a fallback entry, so inspect both.
    providers_seen = [
        routing.selected_provider,
        *(provider for provider, _ in routing.fallback_chain),
    ]
    assert AIProviderEnum.OLLAMA in providers_seen, (
        f"RESTART 路徑應仍含 OLLAMA行為不變實際: {providers_seen}"
    )
def test_build_fallback_chain_for_intent_diagnose_no_ollama():
    """_build_fallback_chain_for_intent(DIAGNOSE) yields a chain without OLLAMA."""
    router = _make_router()
    primary = AIProviderEnum.OPENCLAW_NEMO

    built_chain = router._build_fallback_chain_for_intent(primary, IntentType.DIAGNOSE)
    chain_members = [provider for provider, _ in built_chain]

    # Neither Ollama variant may appear.
    assert AIProviderEnum.OLLAMA not in chain_members
    assert AIProviderEnum.OLLAMA_188 not in chain_members

    # With the primary excluded, GEMINI and CLAUDE are what remain.
    assert AIProviderEnum.GEMINI in chain_members
    assert AIProviderEnum.CLAUDE in chain_members
    assert AIProviderEnum.OPENCLAW_NEMO not in chain_members  # primary is excluded
def test_build_fallback_chain_for_intent_restart_has_ollama():
    """_build_fallback_chain_for_intent(RESTART) still yields a chain with OLLAMA."""
    router = _make_router()

    built_chain = router._build_fallback_chain_for_intent(
        AIProviderEnum.OPENCLAW_NEMO,
        IntentType.RESTART,
    )
    chain_members = [provider for provider, _ in built_chain]

    assert AIProviderEnum.OLLAMA in chain_members, (
        f"RESTART fallback 應含 OLLAMA實際: {chain_members}"
    )
# =============================================================================
# Test 7: aiops_diagnose_fallback_total metric 正常累計
# =============================================================================
@pytest.mark.asyncio
async def test_diagnose_fallback_metric_incremented():
    """DIAGNOSE: a NEMO -> Gemini fallback records aiops_diagnose_fallback_total."""
    registry = _make_registry_with_providers(nemo_success=False, gemini_success=True)
    executor = AIRouterExecutor(registry)

    with patch("src.services.ai_router._settings") as settings_stub:
        # Run with MOCK_MODE off on the patched settings object.
        settings_stub.MOCK_MODE = False
        with patch("src.core.metrics.record_diagnose_fallback") as metric_spy:
            await executor.execute(
                prompt="RCA: high error rate",
                provider_order=["openclaw_nemo", "gemini", "claude"],
                context={"intent_hint": "diagnose"},
            )
            # Exactly one hop is expected: openclaw_nemo -> gemini.
            metric_spy.assert_called_once_with(
                from_provider="openclaw_nemo",
                to_provider="gemini",
            )
@pytest.mark.asyncio
async def test_non_diagnose_intent_no_fallback_metric():
    """A fallback under a non-DIAGNOSE intent must not record the diagnose metric."""
    from src.services.ai_providers.interfaces import AIResult

    registry = AIProviderRegistry()

    # Ollama mock: analyze() reports a failure.
    failing_ollama = MagicMock()
    failing_ollama.name = "ollama"
    failing_ollama.privacy_level = "local"
    failing_ollama.is_enabled = True
    failing_ollama.capabilities = {"chat"}
    ollama_result = AIResult(
        raw_response="", success=False, provider="ollama", error="timeout"
    )
    failing_ollama.analyze = AsyncMock(return_value=ollama_result)

    # Gemini mock: analyze() reports success.
    working_gemini = MagicMock()
    working_gemini.name = "gemini"
    working_gemini.privacy_level = "cloud"
    working_gemini.is_enabled = True
    gemini_result = AIResult(raw_response="ok", success=True, provider="gemini")
    working_gemini.analyze = AsyncMock(return_value=gemini_result)

    registry._providers = {"ollama": failing_ollama, "gemini": working_gemini}
    executor = AIRouterExecutor(registry)

    with patch("src.services.ai_router._settings") as settings_stub:
        # Run with MOCK_MODE off on the patched settings object.
        settings_stub.MOCK_MODE = False
        with patch("src.core.metrics.record_diagnose_fallback") as metric_spy:
            await executor.execute(
                prompt="restart service",
                provider_order=["ollama", "gemini"],
                context={"intent_hint": "restart"},  # not DIAGNOSE
            )
            # Non-DIAGNOSE intent: the metric hook must stay untouched.
            metric_spy.assert_not_called()