Files
awoooi/apps/api/tests/test_ai_router_failover_integration.py
Your Name 55c6b4e2d9 feat(p1): Ollama 多層容災系統 — P1.1 健康檢測 + P1.2 ai_router 整合 + P1.5 容災告警
ADR-092 P1 飛輪閉環的 Ollama 失敗轉移子系統,全部 Engineer-A2/C/C2 補上。

新服務 (1581 行):
- ollama_health_monitor.py (356):3 層健康檢測(TCP/HTTP/推理)
- ollama_failover_manager.py (571):111→188 自動切換 + Redis 持久化 + recovery callback
- ollama_auto_recovery.py (436):30s 背景監控 + 連續 3 次 HEALTHY → 切回 + clear_cache
- failover_alerter.py (218):P1.5 Telegram 容災告警

服務整合:
- ai_router.py: AIProviderEnum.OLLAMA_188 + 120s budget + failover fallback chain
- main.py lifespan: 啟動時 wire callback + start recovery,關閉時優雅 stop
- config.py: OLLAMA_FALLBACK_URL / OLLAMA_HEALTH_CHECK_MODEL / GEMINI_DAILY_QUOTA(帳單熔斷)

K8s 配置:
- 04-configmap.yaml.patch-188-fallback:注入 OLLAMA_FALLBACK_URL=http://192.168.0.188:11434

測試 (2082 行):
- test_ollama_health_monitor.py (402)
- test_ollama_failover_manager.py (707)
- test_ollama_auto_recovery.py (580)
- test_ai_router_failover_integration.py (257)
- test_lifespan_failover_wiring.py (136)

依賴鏈:service 三件套 + ai_router + main.py 一起 commit,缺一就 ImportError。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 20:18:33 +08:00

258 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# apps/api/tests/test_ai_router_failover_integration.py | 2026-04-25 @ Asia/Taipei
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
"""
AIRouter × OllamaFailoverManager 整合測試
==========================================
測試覆蓋:
1. 初步路由選 OLLAMA → failover_manager 重評 → decision 使用 failover 結果
2. failover 回傳 GEMINI primary → decision.selected_provider == GEMINI
3. failover 的 fallback_chain 正確轉換到 decision.fallback_chain
4. 初步路由選非 OLLAMA provider(CRITICAL 風險 → CLAUDE)→ failover_manager 不被呼叫
5. 初步路由選 OPENCLAW_NEMO → failover_manager 不被呼叫
6. failover_manager 發生例外 → fail-open,保留原始 provider
測試分類:unit(mock OllamaFailoverManager,無 Redis / DB 依賴)
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from src.services.ai_router import AIProviderEnum, AIRouter, reset_ai_router
from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
from src.services.ollama_health_monitor import HealthReport, HealthStatus
# =============================================================================
# Fixtures / Helpers
# =============================================================================
@pytest.fixture(autouse=True)
def reset_router_singleton():
    """Reset the AIRouter singleton before and after every test.

    Applied automatically to each test in this module so that a router (and
    any mocked ``_failover_manager`` attached to it) left behind by an earlier
    test or test module does not leak into the next one.
    """
    # Drop whatever state tests that ran before this one may have left behind.
    reset_ai_router()
    yield
    # Drop the state created by the test that just finished.
    reset_ai_router()
def _make_health(status: HealthStatus) -> HealthReport:
    """Build a HealthReport with the given status for the 192.168.0.111 host.

    Host and latency are fixed test values; only ``status`` varies.
    """
    report_fields = {
        "status": status,
        "host": "http://192.168.0.111:11434",
        "latency_ms": 500.0,
    }
    return HealthReport(**report_fields)
def _make_failover_result(
    primary_provider: str,
    primary_model: str,
    fallback: list[tuple[str, str]] | None = None,
) -> OllamaRoutingResult:
    """Build an OllamaRoutingResult test object.

    Args:
        primary_provider: Provider name of the primary endpoint.
        primary_model: Model name of the primary endpoint.
        fallback: Ordered ``(provider_name, model)`` pairs for the fallback
            chain. ``None`` or an empty list yields an empty chain.

    Returns:
        A routing result whose endpoints all have an empty URL, whose
        ``health_111`` is an OFFLINE report and whose ``health_188`` is None.
    """

    def _endpoint(provider_name: str, model: str) -> OllamaEndpoint:
        # URLs are irrelevant to these tests, so every endpoint gets "".
        return OllamaEndpoint(url="", provider_name=provider_name, model=model)

    chain = []
    for provider_name, model in fallback or ():
        chain.append(_endpoint(provider_name, model))

    return OllamaRoutingResult(
        primary=_endpoint(primary_provider, primary_model),
        fallback_chain=chain,
        routing_reason=f"test: {primary_provider}",
        health_111=_make_health(HealthStatus.OFFLINE),
        health_188=None,
    )
def _make_router_with_mock_failover(mock_failover_manager) -> AIRouter:
    """Create a fresh AIRouter whose ``_failover_manager`` is the given mock.

    Args:
        mock_failover_manager: Stand-in object assigned to the router's
            private ``_failover_manager`` attribute.

    Returns:
        The newly constructed router with the mock installed.
    """
    patched_router = AIRouter()
    # Swap the manager after construction so no constructor changes are needed.
    patched_router._failover_manager = mock_failover_manager
    return patched_router
# =============================================================================
# Test 1: OLLAMA 路由 → failover_manager 重評 → 使用 GEMINI
# =============================================================================
@pytest.mark.asyncio
async def test_router_uses_failover_when_ollama_initial_provider():
    """Initial route picks OLLAMA, so the failover manager re-evaluates it.

    The mocked failover manager answers with GEMINI as primary; the final
    decision must carry that provider and model, and the manager must have
    been awaited exactly once.
    """
    # Function-scope imports, as in the rest of this module (only the names
    # this test actually uses).
    from src.services.complexity_scorer import ComplexityScore
    from src.services.intent_classifier import IntentResult, IntentType

    mock_fm = MagicMock()
    mock_fm.select_provider = AsyncMock(
        return_value=_make_failover_result(
            primary_provider="gemini",
            primary_model="gemini-1.5-flash",
            fallback=[
                ("ollama_188", "qwen2.5:7b-instruct"),
                ("nemotron", "nvidia/nemotron-mini-4b-instruct"),
            ],
        )
    )
    router = _make_router_with_mock_failover(mock_fm)

    # Pin the intent classifier and complexity scorer so routing takes the
    # sync fast path: ALERT_TRIAGE -> OLLAMA.
    with patch.object(router._intent_classifier, "classify") as mock_classify:
        mock_classify.return_value = IntentResult(
            intent=IntentType.ALERT_TRIAGE,
            confidence=0.9,
            method="keyword",
            matched_keywords=["alert"],
            detected_resources=[],
            reasoning="test",
        )
        with patch.object(router._complexity_scorer, "score") as mock_score:
            mock_score.return_value = ComplexityScore(score=1, features={})
            decision = await router.route("test alert message")

    assert decision.selected_provider == AIProviderEnum.GEMINI
    assert decision.selected_model == "gemini-1.5-flash"
    mock_fm.select_provider.assert_awaited_once()
# =============================================================================
# Test 2: fallback_chain 正確轉換
# =============================================================================
@pytest.mark.asyncio
async def test_router_failover_fallback_chain_converted():
    """The failover manager's fallback chain is carried into the decision.

    The mocked manager returns ollama_188, nemotron and claude as fallbacks;
    each must appear as the matching AIProviderEnum member in
    ``decision.fallback_chain``.
    """
    from src.services.complexity_scorer import ComplexityScore
    from src.services.intent_classifier import IntentResult, IntentType

    routing_result = _make_failover_result(
        primary_provider="gemini",
        primary_model="gemini-1.5-flash",
        fallback=[
            ("ollama_188", "qwen2.5:7b-instruct"),
            ("nemotron", "nvidia/nemotron-mini-4b-instruct"),
            ("claude", "claude-3-5-haiku-20241022"),
        ],
    )
    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock(return_value=routing_result)
    router = _make_router_with_mock_failover(failover_mock)

    triage_intent = IntentResult(
        intent=IntentType.ALERT_TRIAGE,
        confidence=0.9,
        method="keyword",
        matched_keywords=["alert"],
        detected_resources=[],
        reasoning="test",
    )
    low_complexity = ComplexityScore(score=1, features={})

    # Both collaborators are pinned for the duration of the route() call.
    with patch.object(
        router._intent_classifier, "classify"
    ) as classify_stub, patch.object(
        router._complexity_scorer, "score"
    ) as score_stub:
        classify_stub.return_value = triage_intent
        score_stub.return_value = low_complexity
        decision = await router.route("test alert message")

    # Each chain entry is a (provider, model) pair; only providers matter here.
    fb_providers = [provider for provider, _model in decision.fallback_chain]
    assert AIProviderEnum.OLLAMA_188 in fb_providers, (
        f"OLLAMA_188 not in fallback_chain: {fb_providers}"
    )
    assert AIProviderEnum.NEMOTRON in fb_providers
    assert AIProviderEnum.CLAUDE in fb_providers
# =============================================================================
# Test 3: 非 OLLAMA 路由(CRITICAL → CLAUDE)→ failover_manager 不被呼叫
# =============================================================================
@pytest.mark.asyncio
async def test_router_does_not_use_failover_for_nemotron():
    """A non-OLLAMA initial route must not consult the failover manager.

    Despite the test name, this exercises a CRITICAL-risk DELETE intent that
    is expected to route to CLAUDE. Per the original author's note, NEMOTRON
    is only selected via ``route_tool_calling()`` and ``route()`` goes no
    further than OPENCLAW_NEMO, so the CLAUDE path stands in for the
    "not OLLAMA" case here.
    """
    from src.services.complexity_scorer import ComplexityScore
    from src.services.intent_classifier import IntentResult, IntentType, RiskLevel

    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock()
    router = _make_router_with_mock_failover(failover_mock)

    critical_delete = IntentResult(
        intent=IntentType.DELETE,
        confidence=1.0,
        method="keyword",
        matched_keywords=["delete"],
        detected_resources=[],
        reasoning="test",
        risk_level=RiskLevel.CRITICAL,
    )
    high_complexity = ComplexityScore(score=5, features={})

    with patch.object(
        router._intent_classifier, "classify"
    ) as classify_stub, patch.object(
        router._complexity_scorer, "score"
    ) as score_stub:
        classify_stub.return_value = critical_delete
        score_stub.return_value = high_complexity
        decision = await router.route("delete this service")

    # CRITICAL risk -> CLAUDE; the failover manager must stay untouched.
    assert decision.selected_provider == AIProviderEnum.CLAUDE
    failover_mock.select_provider.assert_not_awaited()
# =============================================================================
# Test 4: OPENCLAW_NEMO 路由 → failover_manager 不被呼叫
# =============================================================================
@pytest.mark.asyncio
async def test_router_does_not_use_failover_for_openclaw_nemo():
    """DIAGNOSE intent routes to OPENCLAW_NEMO without consulting failover."""
    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock()
    router = _make_router_with_mock_failover(failover_mock)

    # The intent hint in the context is what steers the router to
    # OPENCLAW_NEMO (the original comment calls this the "rule 3" override).
    hint_context = {"intent_hint": "diagnose"}
    decision = await router.route("diagnose service crash", context=hint_context)

    assert decision.selected_provider == AIProviderEnum.OPENCLAW_NEMO
    failover_mock.select_provider.assert_not_awaited()
# =============================================================================
# Test 5: failover_manager 發生例外 → fail-open,保留原始 OLLAMA
# =============================================================================
@pytest.mark.asyncio
async def test_router_failopen_when_failover_manager_raises():
    """An exception from the failover manager must not break routing.

    ``select_provider`` raises RuntimeError; ``route()`` is expected to fail
    open and return a decision that keeps the initial OLLAMA provider and a
    non-empty fallback chain.
    """
    from src.services.complexity_scorer import ComplexityScore
    from src.services.intent_classifier import IntentResult, IntentType

    failing_manager = MagicMock()
    failing_manager.select_provider = AsyncMock(
        side_effect=RuntimeError("redis timeout")
    )
    router = _make_router_with_mock_failover(failing_manager)

    triage_intent = IntentResult(
        intent=IntentType.ALERT_TRIAGE,
        confidence=0.9,
        method="keyword",
        matched_keywords=["alert"],
        detected_resources=[],
        reasoning="test",
    )
    low_complexity = ComplexityScore(score=1, features={})

    with patch.object(
        router._intent_classifier, "classify"
    ) as classify_stub, patch.object(
        router._complexity_scorer, "score"
    ) as score_stub:
        classify_stub.return_value = triage_intent
        score_stub.return_value = low_complexity
        # Must not raise: the router is expected to fail open.
        decision = await router.route("test alert message")

    # Fail-open keeps the initial OLLAMA decision.
    assert decision.selected_provider == AIProviderEnum.OLLAMA
    # The fallback chain is still populated (the original comment attributes
    # it to _build_fallback_chain).
    assert len(decision.fallback_chain) > 0