Files
awoooi/apps/api/tests/test_ai_router_failover_integration.py
Your Name b1ef05fa8c
Some checks failed
Code Review / ai-code-review (push) Successful in 50s
CD Pipeline / tests (push) Failing after 1m14s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
feat(ollama): ADR-110 GCP 三層容災架構(GCP-A → GCP-B → Local → Gemini)
## 變更摘要
- Primary: http://34.143.170.20:11434 (GCP-A SSD, 9x 載速 + 2x 推理)
- Secondary: http://34.21.145.224:11434 (GCP-B SSD)
- Fallback: http://192.168.0.111:11434 (M1 Pro Local HDD,最後防線)
- 廢止 ADR-105「111 唯一鐵律」,新建 ADR-110

## 核心改動
- config.py: 新增 OLLAMA_SECONDARY_URL;validator 加 GCP IP 白名單(34.143.170.20, 34.21.145.224)
- ollama_failover_manager.py: 三層 Ollama 決策矩陣;並行健康檢查三台;health_111 → health_gcp_a
- ollama_health_monitor.py: host label 萃取改為通用版(支援 GCP 公網 IP)
- failover_alerter.py: 故障/恢復主機動態顯示,不再硬編碼「Ollama 111 (GPU)」
- ollama_auto_recovery.py: notify_recovery 改為 ollama_gcp_a;recovered_host 動態
- k8s/awoooi-prod: configmap + deployment + network-policy 同步更新(egress 加 GCP /32)
- 服務層: 10 個服務檔案硬編碼 192.168.0.111 改為讀 settings.OLLAMA_URL
- 測試: URL 常數更新,新增三層容災場景,GCP IP 白名單驗證測試

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 22:49:23 +08:00

245 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# apps/api/tests/test_ai_router_failover_integration.py | 2026-04-25 @ Asia/Taipei
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
# 2026-04-26 Wave4 P1.2-tests-fix by Claude Engineer-A3 — fixed the intent mock (ALERT_TRIAGE → DIAGNOSE via the normalize_intent mapping); switched to UNKNOWN (no override, score=1 → OLLAMA → failover triggered)
# 2026-04-26 Wave4 P1.2-tests-fix-v2 by Claude Opus 4.7 — the UNKNOWN intent is still reclassified inside the router as DIAGNOSE → openclaw_nemo;
# switched to patch.object(router, "_select_provider_and_model") to force the initial route to OLLAMA directly, bypassing the normalize / alert-detection logic
# 2026-05-03 ogt: ADR-110 GCP three-tier disaster recovery; _make_health host updated to the GCP-A primary
"""
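AIRouter × OllamaFailoverManager integration tests
==================================================
Coverage:
1. Initial route selects OLLAMA → failover_manager re-evaluates → the decision uses the failover result
2. failover returns a GEMINI primary → decision.selected_provider == GEMINI
3. The failover fallback_chain is converted correctly into decision.fallback_chain
4. Initial route is not OLLAMA → failover_manager is not called (originally listed as the NEMOTRON case; the test forces CRITICAL risk → CLAUDE)
5. DIAGNOSE intent → OLLAMA primary → failover_manager is called (originally listed as "OPENCLAW_NEMO → failover_manager not called"; superseded by the 2026-04-29 change)
6. failover_manager raises an exception → fail-open (the original provider is kept)
Test category: unit (OllamaFailoverManager is mocked; no Redis / DB dependency)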
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from src.services.ai_router import AIProviderEnum, AIRouter, reset_ai_router
from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
from src.services.ollama_health_monitor import HealthReport, HealthStatus
# =============================================================================
# Fixtures / Helpers
# =============================================================================
@pytest.fixture(autouse=True)
def reset_router_singleton():
    """Reset the AIRouter singleton before and after every test.

    Autouse fixture. Resetting on both sides of the test keeps a mocked
    failover_manager (or a singleton left behind by an earlier test) from
    leaking into the next test.
    """
    # Setup: start from a clean singleton, as the fixture's contract promises.
    reset_ai_router()
    yield
    # Teardown: drop whatever the test installed.
    reset_ai_router()
def _make_health(status: HealthStatus) -> HealthReport:
    """Build a HealthReport with the given status for the GCP-A primary host (ADR-110)."""
    report_fields = {
        "status": status,
        "host": "http://34.143.170.20:11434",  # GCP-A primary (ADR-110)
        "latency_ms": 500.0,
    }
    return HealthReport(**report_fields)
def _make_failover_result(
    primary_provider: str,
    primary_model: str,
    fallback: list[tuple[str, str]] | None = None,
) -> OllamaRoutingResult:
    """Build an OllamaRoutingResult test object.

    Args:
        primary_provider: provider_name of the primary endpoint.
        primary_model: model of the primary endpoint.
        fallback: optional (provider_name, model) pairs, in order, for the
            fallback chain; None yields an empty chain.

    Returns:
        A routing result whose endpoints all use an empty URL and whose
        health_gcp_a report is OFFLINE.
    """

    def _endpoint(provider: str, model: str) -> OllamaEndpoint:
        # Every endpoint built by this helper carries an empty URL.
        return OllamaEndpoint(url="", provider_name=provider, model=model)

    chain = [_endpoint(provider, model) for provider, model in (fallback or ())]
    return OllamaRoutingResult(
        primary=_endpoint(primary_provider, primary_model),
        fallback_chain=chain,
        routing_reason=f"test: {primary_provider}",
        health_gcp_a=_make_health(HealthStatus.OFFLINE),  # ADR-110: field is named health_gcp_a
    )
def _make_router_with_mock_failover(mock_failover_manager) -> AIRouter:
    """Create a fresh AIRouter and replace its _failover_manager with the given mock."""
    patched_router = AIRouter()
    # Overwrite the private attribute directly so the router uses the mock.
    patched_router._failover_manager = mock_failover_manager
    return patched_router
# =============================================================================
# Test 1: OLLAMA 路由 → failover_manager 重評 → 使用 GEMINI
# =============================================================================
@pytest.mark.asyncio
async def test_router_uses_failover_when_ollama_initial_provider():
    """An initial OLLAMA route is re-evaluated by the failover manager.

    The mocked failover manager returns a Gemini primary, so the final
    decision is expected to carry GEMINI / gemini-1.5-flash, and
    select_provider must have been awaited exactly once.
    """
    failover_result = _make_failover_result(
        primary_provider="gemini",
        primary_model="gemini-1.5-flash",
        fallback=[
            ("ollama_188", "qwen2.5:7b-instruct"),
            ("nemotron", "nvidia/nemotron-mini-4b-instruct"),
        ],
    )
    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock(return_value=failover_result)
    router = _make_router_with_mock_failover(failover_mock)

    # Patch _select_provider_and_model so the initial route is forced to OLLAMA
    # regardless of how the router would classify the message.
    forced_route = (AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama")
    with patch.object(router, "_select_provider_and_model", return_value=forced_route):
        decision = await router.route("test alert message")

    assert decision.selected_provider == AIProviderEnum.GEMINI
    assert decision.selected_model == "gemini-1.5-flash"
    failover_mock.select_provider.assert_awaited_once()
# =============================================================================
# Test 2: fallback_chain 正確轉換
# =============================================================================
@pytest.mark.asyncio
async def test_router_failover_fallback_chain_converted():
    """The failover manager's fallback_chain is carried into decision.fallback_chain.

    The mocked chain names ollama_188, nemotron and claude; the decision's
    chain is expected to contain the matching AIProviderEnum members.
    """
    failover_result = _make_failover_result(
        primary_provider="gemini",
        primary_model="gemini-1.5-flash",
        fallback=[
            ("ollama_188", "qwen2.5:7b-instruct"),
            ("nemotron", "nvidia/nemotron-mini-4b-instruct"),
            ("claude", "claude-haiku-4-5-20251001"),
        ],
    )
    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock(return_value=failover_result)
    router = _make_router_with_mock_failover(failover_mock)

    # Patch _select_provider_and_model so the initial route is forced to OLLAMA,
    # which is the case in which the failover manager is consulted.
    forced_route = (AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama")
    with patch.object(router, "_select_provider_and_model", return_value=forced_route):
        decision = await router.route("test alert message")

    # decision.fallback_chain is unpacked as (provider, <second item>) pairs.
    fb_providers = [provider for provider, _ in decision.fallback_chain]
    assert AIProviderEnum.OLLAMA_188 in fb_providers, (
        f"OLLAMA_188 not in fallback_chain: {fb_providers}"
    )
    assert AIProviderEnum.NEMOTRON in fb_providers
    assert AIProviderEnum.CLAUDE in fb_providers
# =============================================================================
# Test 3: non-OLLAMA route (CRITICAL risk → CLAUDE; originally titled "NEMOTRON route") → failover_manager is not called
# =============================================================================
@pytest.mark.asyncio
async def test_router_does_not_use_failover_for_nemotron():
    """A non-OLLAMA route must not consult the failover manager.

    Despite its name, this test does not route to NEMOTRON. The classifier
    is mocked to return a CRITICAL-risk DELETE intent and the complexity
    scorer a fixed score; the decision is expected to select CLAUDE, and
    select_provider must never be awaited.
    """
    from src.services.intent_classifier import IntentResult, IntentType, RiskLevel
    from src.services.complexity_scorer import ComplexityScore

    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock()
    router = _make_router_with_mock_failover(failover_mock)

    critical_delete = IntentResult(
        intent=IntentType.DELETE,
        confidence=1.0,
        method="keyword",
        matched_keywords=["delete"],
        detected_resources=[],
        reasoning="test",
        risk_level=RiskLevel.CRITICAL,
    )
    fixed_complexity = ComplexityScore(score=5, features={})

    with patch.object(router._intent_classifier, "classify", return_value=critical_delete):
        with patch.object(router._complexity_scorer, "score", return_value=fixed_complexity):
            decision = await router.route("delete this service")

    # CRITICAL risk → CLAUDE; the failover manager must stay untouched.
    assert decision.selected_provider == AIProviderEnum.CLAUDE
    failover_mock.select_provider.assert_not_awaited()
# =============================================================================
# Test 4: DIAGNOSE route → OLLAMA primary → failover_manager is called (originally titled "OPENCLAW_NEMO route → failover_manager not called")
# =============================================================================
@pytest.mark.asyncio
async def test_router_uses_failover_for_diagnose_ollama_primary():
    """2026-04-29: a DIAGNOSE intent routes to OLLAMA, so the failover manager is awaited.

    After the A2 decision was overturned, the DIAGNOSE primary is OLLAMA
    (local-first rule). The mocked failover manager returns the same
    provider, i.e. no switch-over happens.

    NOTE(review): the original docstring described the health check as
    "111 vs 188 CPU standby switching"; that wording predates the ADR-110
    change mentioned in the file header — confirm it still applies.
    """
    # Routing result that keeps the original provider (no switch-over).
    unchanged_primary = MagicMock(provider_name="ollama", model="qwen2.5:7b-instruct")
    unchanged_result = MagicMock(primary=unchanged_primary, fallback_chain=[])

    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock(return_value=unchanged_result)
    router = _make_router_with_mock_failover(failover_mock)

    decision = await router.route(
        "diagnose service crash",
        context={"intent_hint": "diagnose"},
    )

    # The DIAGNOSE primary is OLLAMA ...
    assert decision.selected_provider == AIProviderEnum.OLLAMA
    # ... and an OLLAMA primary triggers the failover manager.
    failover_mock.select_provider.assert_awaited()
# =============================================================================
# Test 5: failover_manager raises an exception → fail-open (the original OLLAMA provider is kept)
# =============================================================================
@pytest.mark.asyncio
async def test_router_failopen_when_failover_manager_raises():
    """If select_provider raises, routing fails open and keeps the original OLLAMA.

    The mocked failover manager raises RuntimeError("redis timeout");
    router.route() must still return a decision with a non-empty
    fallback_chain instead of propagating the error.
    """
    failover_mock = MagicMock()
    failover_mock.select_provider = AsyncMock(side_effect=RuntimeError("redis timeout"))
    router = _make_router_with_mock_failover(failover_mock)

    # Force the initial route to OLLAMA so the failover manager is consulted:
    # failover triggered → RuntimeError → fail-open → OLLAMA is kept.
    forced_route = (AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama")
    with patch.object(router, "_select_provider_and_model", return_value=forced_route):
        # Must not raise; the router is expected to fail open.
        decision = await router.route("test alert message")

    # Fail-open keeps the original initial decision (OLLAMA).
    assert decision.selected_provider == AIProviderEnum.OLLAMA
    # A fallback chain is still present (per the original comment, it comes from _build_fallback_chain).
    assert len(decision.fallback_chain) > 0