## 變更摘要 - Primary: http://34.143.170.20:11434 (GCP-A SSD, 9x 載速 + 2x 推理) - Secondary: http://34.21.145.224:11434 (GCP-B SSD) - Fallback: http://192.168.0.111:11434 (M1 Pro Local HDD,最後防線) - 廢止 ADR-105「111 唯一鐵律」,新建 ADR-110 ## 核心改動 - config.py: 新增 OLLAMA_SECONDARY_URL;validator 加 GCP IP 白名單(34.143.170.20, 34.21.145.224) - ollama_failover_manager.py: 三層 Ollama 決策矩陣;並行健康檢查三台;health_111 → health_gcp_a - ollama_health_monitor.py: host label 萃取改為通用版(支援 GCP 公網 IP) - failover_alerter.py: 故障/恢復主機動態顯示,不再硬編碼「Ollama 111 (GPU)」 - ollama_auto_recovery.py: notify_recovery 改為 ollama_gcp_a;recovered_host 動態 - k8s/awoooi-prod: configmap + deployment + network-policy 同步更新(egress 加 GCP /32) - 服務層: 10 個服務檔案硬編碼 192.168.0.111 改為讀 settings.OLLAMA_URL - 測試: URL 常數更新,新增三層容災場景,GCP IP 白名單驗證測試 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
245 lines
10 KiB
Python
245 lines
10 KiB
Python
# apps/api/tests/test_ai_router_failover_integration.py | 2026-04-25 @ Asia/Taipei
|
||
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
|
||
# 2026-04-26 Wave4 P1.2-tests-fix by Claude Engineer-A3 — 修正 intent mock:ALERT_TRIAGE→DIAGNOSE(normalize_intent 映射),改用 UNKNOWN(無 override,score=1 → OLLAMA → failover 觸發)
|
||
# 2026-04-26 Wave4 P1.2-tests-fix-v2 by Claude Opus 4.7 — UNKNOWN intent 在 router 內仍被 reclassify 成 DIAGNOSE → openclaw_nemo
|
||
# 改用 patch.object(router, "_select_provider_and_model") 直接強制初始路由為 OLLAMA,繞過 normalize / alert detection 邏輯
|
||
# 2026-05-03 ogt: ADR-110 GCP 三層容災,_make_health host 更新為 GCP-A Primary
|
||
"""
|
||
AIRouter × OllamaFailoverManager 整合測試
|
||
==========================================
|
||
測試覆蓋:
|
||
1. 初步路由選 OLLAMA → failover_manager 重評 → decision 使用 failover 結果
|
||
2. failover 回傳 GEMINI primary → decision.selected_provider == GEMINI
|
||
3. failover 的 fallback_chain 正確轉換到 decision.fallback_chain
|
||
4. Initial route selects CLAUDE (CRITICAL risk) → failover_manager is not called
|
||
5. Initial route selects OLLAMA for a DIAGNOSE intent → failover_manager is called
|
||
6. failover_manager 發生例外 → fail-open,保留原始 provider
|
||
|
||
測試分類:unit(mock OllamaFailoverManager,無 Redis / DB 依賴)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from unittest.mock import AsyncMock, MagicMock, patch
|
||
|
||
import pytest
|
||
|
||
from src.services.ai_router import AIProviderEnum, AIRouter, reset_ai_router
|
||
from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
|
||
from src.services.ollama_health_monitor import HealthReport, HealthStatus
|
||
|
||
|
||
# =============================================================================
|
||
# Fixtures / Helpers
|
||
# =============================================================================
|
||
|
||
|
||
@pytest.fixture(autouse=True)
def reset_router_singleton():
    """Reset the AIRouter singleton before and after every test.

    Autouse fixture: it applies to every test in this module so that a
    mocked ``_failover_manager`` attached in one test cannot leak into the
    next. The reset runs on both sides of the ``yield`` so a singleton left
    behind by an earlier test (in this or another module) is also cleared
    before the test body starts.
    """
    # Setup: start from a clean singleton.
    reset_ai_router()
    yield
    # Teardown: drop whatever the test created.
    reset_ai_router()
|
||
|
||
|
||
def _make_health(status: HealthStatus) -> HealthReport:
    """Build a HealthReport with the given status for the GCP-A primary host (ADR-110)."""
    gcp_a_primary_host = "http://34.143.170.20:11434"
    report = HealthReport(
        status=status,
        host=gcp_a_primary_host,
        latency_ms=500.0,
    )
    return report
|
||
|
||
|
||
def _make_failover_result(
    primary_provider: str,
    primary_model: str,
    fallback: list[tuple[str, str]] | None = None,
) -> OllamaRoutingResult:
    """Build an OllamaRoutingResult test object.

    Args:
        primary_provider: Provider name placed on the primary endpoint.
        primary_model: Model name placed on the primary endpoint.
        fallback: Optional ``(provider_name, model)`` pairs, in order, that
            become the fallback chain. ``None`` or an empty list yields an
            empty chain.

    Returns:
        A routing result whose endpoints all carry an empty URL and whose
        ``health_gcp_a`` report (field name per ADR-110) is OFFLINE.
    """

    def _endpoint(provider_name: str, model: str) -> OllamaEndpoint:
        # Endpoints in these tests never carry a real URL.
        return OllamaEndpoint(url="", provider_name=provider_name, model=model)

    chain = []
    for provider_name, model in fallback if fallback else []:
        chain.append(_endpoint(provider_name, model))

    offline_health = _make_health(HealthStatus.OFFLINE)
    return OllamaRoutingResult(
        primary=_endpoint(primary_provider, primary_model),
        fallback_chain=chain,
        routing_reason=f"test: {primary_provider}",
        health_gcp_a=offline_health,
    )
|
||
|
||
|
||
def _make_router_with_mock_failover(mock_failover_manager) -> AIRouter:
    """Create a fresh AIRouter and swap in the given object as its ``_failover_manager``."""
    patched_router = AIRouter()
    # Overwrite the private attribute directly so route() consults the mock.
    patched_router._failover_manager = mock_failover_manager
    return patched_router
|
||
|
||
|
||
# =============================================================================
|
||
# Test 1: OLLAMA 路由 → failover_manager 重評 → 使用 GEMINI
|
||
# =============================================================================
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_router_uses_failover_when_ollama_initial_provider():
    """Initial route OLLAMA → failover_manager re-evaluates → decision uses GEMINI."""
    failover_result = _make_failover_result(
        primary_provider="gemini",
        primary_model="gemini-1.5-flash",
        fallback=[
            ("ollama_188", "qwen2.5:7b-instruct"),
            ("nemotron", "nvidia/nemotron-mini-4b-instruct"),
        ],
    )
    failover_stub = MagicMock()
    failover_stub.select_provider = AsyncMock(return_value=failover_result)

    router = _make_router_with_mock_failover(failover_stub)

    # Force the initial routing step to pick OLLAMA by patching
    # _select_provider_and_model directly; this bypasses intent
    # normalization / alert detection, which otherwise reroute the message.
    forced_initial_route = (AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama")
    with patch.object(
        router,
        "_select_provider_and_model",
        return_value=forced_initial_route,
    ):
        decision = await router.route("test alert message")

    # The failover result's primary must replace the initial OLLAMA choice.
    assert decision.selected_provider == AIProviderEnum.GEMINI
    assert decision.selected_model == "gemini-1.5-flash"
    failover_stub.select_provider.assert_awaited_once()
|
||
|
||
|
||
# =============================================================================
|
||
# Test 2: fallback_chain 正確轉換
|
||
# =============================================================================
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_router_failover_fallback_chain_converted():
    """The failover_manager's fallback_chain is converted into decision.fallback_chain."""
    fallback_pairs = [
        ("ollama_188", "qwen2.5:7b-instruct"),
        ("nemotron", "nvidia/nemotron-mini-4b-instruct"),
        ("claude", "claude-haiku-4-5-20251001"),
    ]
    failover_result = _make_failover_result(
        primary_provider="gemini",
        primary_model="gemini-1.5-flash",
        fallback=fallback_pairs,
    )
    failover_stub = MagicMock()
    failover_stub.select_provider = AsyncMock(return_value=failover_result)

    router = _make_router_with_mock_failover(failover_stub)

    # Force the initial routing step to pick OLLAMA so the failover path runs.
    forced_initial_route = (AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama")
    with patch.object(
        router,
        "_select_provider_and_model",
        return_value=forced_initial_route,
    ):
        decision = await router.route("test alert message")

    # decision.fallback_chain holds (provider, model) pairs; keep the providers.
    fb_providers = []
    for provider, _model in decision.fallback_chain:
        fb_providers.append(provider)

    assert AIProviderEnum.OLLAMA_188 in fb_providers, (
        f"OLLAMA_188 not in fallback_chain: {fb_providers}"
    )
    assert AIProviderEnum.NEMOTRON in fb_providers
    assert AIProviderEnum.CLAUDE in fb_providers
|
||
|
||
|
||
# =============================================================================
|
||
# Test 3: CRITICAL-risk route (CLAUDE) → failover_manager is not called
|
||
# =============================================================================
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_router_does_not_use_failover_for_nemotron():
    """A non-OLLAMA initial route must not consult failover_manager.

    Despite the function name, NEMOTRON is not exercised here. Per the
    original author's notes, route() does not select NEMOTRON (only
    route_tool_calling() does), so a forced CRITICAL-risk intent that routes
    to CLAUDE stands in as the non-OLLAMA case.
    """
    from src.services.complexity_scorer import ComplexityScore
    from src.services.intent_classifier import IntentResult, IntentType, RiskLevel

    failover_stub = MagicMock()
    failover_stub.select_provider = AsyncMock()

    router = _make_router_with_mock_failover(failover_stub)

    critical_intent = IntentResult(
        intent=IntentType.DELETE,
        confidence=1.0,
        method="keyword",
        matched_keywords=["delete"],
        detected_resources=[],
        reasoning="test",
        risk_level=RiskLevel.CRITICAL,
    )
    fixed_complexity = ComplexityScore(score=5, features={})

    # Pin both the intent classifier and the complexity scorer so the route
    # is decided purely by the CRITICAL risk level.
    classify_patch = patch.object(
        router._intent_classifier, "classify", return_value=critical_intent
    )
    score_patch = patch.object(
        router._complexity_scorer, "score", return_value=fixed_complexity
    )
    with classify_patch, score_patch:
        decision = await router.route("delete this service")

    # CRITICAL risk → CLAUDE; failover_manager must stay untouched.
    assert decision.selected_provider == AIProviderEnum.CLAUDE
    failover_stub.select_provider.assert_not_awaited()
|
||
|
||
|
||
# =============================================================================
|
||
# Test 4: DIAGNOSE route (OLLAMA primary) → failover_manager is called
|
||
# =============================================================================
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_router_uses_failover_for_diagnose_ollama_primary():
    """2026-04-29: DIAGNOSE intent → OLLAMA → failover_manager must be called.

    After the A2 decision was overturned, the DIAGNOSE primary is OLLAMA
    (local-first rule), so the router is expected to ask failover_manager to
    health-check the OLLAMA primary.
    """
    # Routing result that keeps the original provider (no switch-over).
    routing_stub = MagicMock()
    routing_stub.configure_mock(
        **{
            "primary.provider_name": "ollama",
            "primary.model": "qwen2.5:7b-instruct",
            "fallback_chain": [],
        }
    )

    failover_stub = MagicMock()
    failover_stub.select_provider = AsyncMock(return_value=routing_stub)

    router = _make_router_with_mock_failover(failover_stub)

    route_context = {"intent_hint": "diagnose"}
    decision = await router.route("diagnose service crash", context=route_context)

    # A2 overturned: the DIAGNOSE primary is OLLAMA.
    assert decision.selected_provider == AIProviderEnum.OLLAMA
    # An OLLAMA primary triggers the failover_manager health evaluation.
    failover_stub.select_provider.assert_awaited()
|
||
|
||
|
||
# =============================================================================
|
||
# Test 5: failover_manager 發生例外 → fail-open,保留原始 OLLAMA
|
||
# =============================================================================
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_router_failopen_when_failover_manager_raises():
    """select_provider raising → router fails open and keeps the original OLLAMA decision."""
    redis_failure = RuntimeError("redis timeout")
    failover_stub = MagicMock()
    failover_stub.select_provider = AsyncMock(side_effect=redis_failure)

    router = _make_router_with_mock_failover(failover_stub)

    # Force the initial route to OLLAMA so the failover path runs; the stub
    # then raises RuntimeError, and route() is expected to swallow it
    # (fail-open) instead of propagating.
    forced_initial_route = (AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama")
    with patch.object(
        router,
        "_select_provider_and_model",
        return_value=forced_initial_route,
    ):
        # Must not raise.
        decision = await router.route("test alert message")

    # Fail-open keeps the initial OLLAMA decision.
    assert decision.selected_provider == AIProviderEnum.OLLAMA
    # A fallback chain is still present (original note: built by _build_fallback_chain).
    assert len(decision.fallback_chain) > 0
|