Files
awoooi/apps/api/tests/test_ollama_auto_recovery.py
Your Name 869646459c
Some checks failed
CD Pipeline / tests (push) Failing after 1m48s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 28s
fix(ollama): treat legacy primary as ollama
2026-05-05 13:25:27 +08:00

593 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# apps/api/tests/test_ollama_auto_recovery.py | 2026-04-25 @ Asia/Taipei
# Created 2026-04-25 P1.1d by Claude Engineer-C
# 2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復
# 2026-05-03 ogt: ADR-110 GCP 三層容災，URL_111 改名為 URL_GCP_A
"""
OllamaAutoRecoveryService 單元測試 - P1.1d
==========================================
測試覆蓋:
1. 111 OFFLINE → HEALTHY × 3 → 觸發 _switch_back_to_ollama
2. 111 OFFLINE → HEALTHY × 2 → OFFLINE → counter 歸零
3. 中途 DEGRADED/OFFLINE → counter 歸零；SLOW 視為可用（GCP 高負載仍優於 Gemini）
4. stop() 優雅取消 task
5. start() 重複呼叫不重複建立 task
6. _switch_back_to_ollama：clear_cache + notify_recovery + Telegram alerter
7. alerter = None 時不 crash
8. clear_cache 失敗時不 crash（best-effort）
9. set_current_primary / is_running / consecutive_healthy 屬性
測試分類：unit（mock OllamaHealthMonitor + OllamaFailoverManager）
"""
from __future__ import annotations
import asyncio
from unittest.mock import AsyncMock, MagicMock, call, patch
import pytest
from src.services.ollama_auto_recovery import (
OllamaAutoRecoveryService,
get_ollama_auto_recovery_service,
reset_ollama_auto_recovery_service,
)
from src.services.ollama_health_monitor import HealthReport, HealthStatus
# =============================================================================
# Fixtures / Helpers
# =============================================================================
URL_GCP_A = "http://34.143.170.20:11434"  # GCP-A primary (ADR-110, 2026-05-03)
URL_111 = URL_GCP_A  # Backward-compatible alias: old name kept, corresponds to settings.OLLAMA_URL
@pytest.fixture(autouse=True)
def reset_singleton():
    """Reset the auto-recovery service singleton after every test.

    Declared autouse, so it wraps each test in this module without being
    requested explicitly. Nothing happens before the test body; the reset
    runs as teardown.
    """
    yield
    # Teardown phase: executed once the test body has finished.
    reset_ollama_auto_recovery_service()
def _make_health(status: HealthStatus) -> HealthReport:
    """Build a HealthReport carrying ``status`` for the primary host.

    The host is always ``URL_111`` and the latency is fixed at 500.0 ms;
    only the status differs between callers.
    """
    report = HealthReport(
        host=URL_111,
        status=status,
        latency_ms=500.0,
    )
    return report
def _make_service(
    *,
    current_primary: str = "gemini",
    stable_count: int = 3,
    check_interval: int = 1,  # short interval so tests stay fast
    alerter=None,
) -> tuple[OllamaAutoRecoveryService, MagicMock, MagicMock]:
    """Build a service wired to a mock health monitor and mock failover manager.

    Args:
        current_primary: Value written to ``svc._current_primary`` after
            construction.
        stable_count: Passed to the service as ``stable_count_required``.
        check_interval: Passed to the service as
            ``recovery_check_interval_sec``.
        alerter: Passed to the service as ``telegram_alerter``; ``None``
            means no alerter.

    Returns:
        ``(service, mock_monitor, mock_failover_manager)``. Both mocks are
        ``MagicMock`` instances; only ``mock_monitor.check`` and
        ``mock_failover.clear_cache`` are ``AsyncMock`` attributes, while
        ``mock_failover.notify_recovery`` is a plain ``MagicMock``.
    """
    # The monitor itself is a MagicMock; only its check() coroutine is async.
    mock_monitor = MagicMock()
    mock_monitor.check = AsyncMock()

    mock_failover = MagicMock()
    mock_failover.clear_cache = AsyncMock()
    mock_failover.notify_recovery = MagicMock()

    mock_settings = MagicMock()
    mock_settings.OLLAMA_URL = URL_111

    svc = OllamaAutoRecoveryService(
        health_monitor=mock_monitor,
        failover_manager=mock_failover,
        telegram_alerter=alerter,
        recovery_check_interval_sec=check_interval,
        stable_count_required=stable_count,
    )
    # Override private state directly so the tests control settings and the
    # starting primary without going through the real configuration.
    svc._settings = mock_settings
    svc._current_primary = current_primary
    return svc, mock_monitor, mock_failover
# =============================================================================
# _check_and_recover():核心防抖邏輯
# =============================================================================
class TestCheckAndRecover:
    """Behaviour of a single ``_check_and_recover`` invocation."""

    @pytest.mark.asyncio
    async def test_healthy_increments_counter(self):
        """HEALTHY bumps the consecutive-healthy counter by one."""
        service, monitor, _ = _make_service(current_primary="gemini")
        monitor.check.return_value = _make_health(HealthStatus.HEALTHY)

        await service._check_and_recover()

        assert service.consecutive_healthy == 1

    @pytest.mark.asyncio
    async def test_non_healthy_resets_counter(self):
        """OFFLINE drops the counter back to zero."""
        service, monitor, _ = _make_service(current_primary="gemini")
        monitor.check.return_value = _make_health(HealthStatus.OFFLINE)
        service._consecutive_healthy = 2

        await service._check_and_recover()

        assert service.consecutive_healthy == 0

    @pytest.mark.asyncio
    async def test_slow_increments_counter(self):
        """SLOW still counts as usable (a loaded GCP host beats the Gemini fallback)."""
        service, monitor, _ = _make_service(current_primary="gemini")
        monitor.check.return_value = _make_health(HealthStatus.SLOW)
        service._consecutive_healthy = 1

        await service._check_and_recover()

        assert service.consecutive_healthy == 2

    @pytest.mark.asyncio
    async def test_degraded_resets_counter(self):
        """DEGRADED drops the counter back to zero."""
        service, monitor, _ = _make_service(current_primary="gemini")
        monitor.check.return_value = _make_health(HealthStatus.DEGRADED)
        service._consecutive_healthy = 2

        await service._check_and_recover()

        assert service.consecutive_healthy == 0

    @pytest.mark.asyncio
    async def test_three_healthy_triggers_switch_back(self):
        """Third consecutive HEALTHY while primary is gemini triggers the switch back."""
        service, monitor, _ = _make_service(current_primary="gemini", stable_count=3)
        # Two healthy checks already recorded; the call below is the third.
        service._consecutive_healthy = 2
        monitor.check.return_value = _make_health(HealthStatus.HEALTHY)

        with patch.object(
            service, "_switch_back_to_ollama", new_callable=AsyncMock
        ) as switch_back:
            await service._check_and_recover()

        switch_back.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_two_healthy_not_yet_switch(self):
        """Only two consecutive HEALTHY results (threshold 3) must not switch back."""
        service, monitor, _ = _make_service(current_primary="gemini", stable_count=3)
        # One healthy check already recorded; the call below is the second.
        service._consecutive_healthy = 1
        monitor.check.return_value = _make_health(HealthStatus.HEALTHY)

        with patch.object(
            service, "_switch_back_to_ollama", new_callable=AsyncMock
        ) as switch_back:
            await service._check_and_recover()

        switch_back.assert_not_awaited()
        assert service.consecutive_healthy == 2

    @pytest.mark.asyncio
    async def test_already_ollama_primary_no_switch(self):
        """When the primary is already ollama, no (duplicate) switch back happens."""
        service, monitor, _ = _make_service(current_primary="ollama", stable_count=3)
        service._consecutive_healthy = 2
        monitor.check.return_value = _make_health(HealthStatus.HEALTHY)

        with patch.object(
            service, "_switch_back_to_ollama", new_callable=AsyncMock
        ) as switch_back:
            await service._check_and_recover()

        switch_back.assert_not_awaited()

    @pytest.mark.asyncio
    async def test_check_exception_resets_counter(self):
        """An exception from the health check resets the counter and is not propagated."""
        service, monitor, _ = _make_service(current_primary="gemini")
        service._consecutive_healthy = 2
        monitor.check.side_effect = RuntimeError("network error")

        # Must not raise.
        await service._check_and_recover()

        assert service.consecutive_healthy == 0
# =============================================================================
# 防抖場景OFFLINE → HEALTHY × 2 → OFFLINE → counter 歸零
# =============================================================================
class TestDebounce:
    """Debounce behaviour: an interruption mid-streak resets the counter."""

    @pytest.mark.asyncio
    async def test_counter_resets_on_intermittent_offline(self):
        """HEALTHY x 2 -> OFFLINE -> HEALTHY: the counter restarts from 1."""
        svc, mock_monitor, _ = _make_service(current_primary="gemini", stable_count=3)
        statuses = [
            HealthStatus.HEALTHY,
            HealthStatus.HEALTHY,
            HealthStatus.OFFLINE,  # interruption -> counter back to zero
            HealthStatus.HEALTHY,  # streak starts over
        ]
        mock_monitor.check.side_effect = [_make_health(s) for s in statuses]

        with patch.object(svc, "_switch_back_to_ollama", new_callable=AsyncMock) as mock_switch:
            # One check per prepared report.
            for _ in statuses:
                await svc._check_and_recover()

        # Three HEALTHY results in total (1st, 2nd, 4th), but the OFFLINE in
        # between reset the streak, so after the 4th check the counter is 1
        # and the threshold of 3 has not been reached.
        mock_switch.assert_not_awaited()
        assert svc.consecutive_healthy == 1

    @pytest.mark.asyncio
    async def test_full_recovery_flow(self):
        """Full scenario: OFFLINE -> HEALTHY x 3 -> switch back to ollama.

        Runs the real ``_switch_back_to_ollama`` (nothing is patched) and
        checks every collaborator it is expected to touch: the primary
        flag, the failover cache, the recovery notification and the
        Telegram alerter.
        """
        mock_alerter = AsyncMock()
        mock_alerter.alert_recovery = AsyncMock()
        svc, mock_monitor, mock_failover = _make_service(
            current_primary="gemini", stable_count=3, alerter=mock_alerter
        )
        statuses = [
            HealthStatus.OFFLINE,  # the documented starting point of the scenario
            HealthStatus.HEALTHY,
            HealthStatus.HEALTHY,
            HealthStatus.HEALTHY,  # third consecutive HEALTHY -> switch back
        ]
        mock_monitor.check.side_effect = [_make_health(s) for s in statuses]

        for _ in statuses:
            await svc._check_and_recover()

        # After the switch the primary must be ollama again.
        assert svc.current_primary == "ollama"
        mock_failover.clear_cache.assert_awaited_once()
        # 2026-05-03 ogt: ADR-110 - notify_recovery now receives "ollama_gcp_a" (GCP-A primary)
        mock_failover.notify_recovery.assert_called_once_with("ollama_gcp_a")
        # The injected alerter must have been notified exactly once as well.
        mock_alerter.alert_recovery.assert_awaited_once()
# =============================================================================
# _switch_back_to_ollama()
# =============================================================================
class TestSwitchBackToOllama:
    """Behaviour of ``_switch_back_to_ollama``."""

    @pytest.mark.asyncio
    async def test_sets_current_primary_to_ollama(self):
        """The primary flag flips to "ollama"."""
        service, _, _ = _make_service(current_primary="gemini")

        await service._switch_back_to_ollama()

        assert service.current_primary == "ollama"

    @pytest.mark.asyncio
    async def test_calls_clear_cache(self):
        """The failover manager's cache is cleared exactly once."""
        service, _, failover = _make_service(current_primary="gemini")

        await service._switch_back_to_ollama()

        failover.clear_cache.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_calls_notify_recovery(self):
        """The failover manager is told about the recovery exactly once."""
        service, _, failover = _make_service(current_primary="gemini")

        await service._switch_back_to_ollama()

        # 2026-05-03 ogt: ADR-110 - notify_recovery now receives "ollama_gcp_a" (GCP-A primary)
        failover.notify_recovery.assert_called_once_with("ollama_gcp_a")

    @pytest.mark.asyncio
    async def test_calls_telegram_alerter_when_set(self):
        """With an alerter configured, ``alert_recovery`` is awaited with the payload."""
        alerter = AsyncMock()
        alerter.alert_recovery = AsyncMock()
        service, _, _ = _make_service(current_primary="gemini", alerter=alerter)
        service._consecutive_healthy = 3

        await service._switch_back_to_ollama()

        alerter.alert_recovery.assert_awaited_once()
        # First positional argument of the recorded call is the payload dict.
        payload = alerter.alert_recovery.call_args.args[0]
        assert payload["from"] == "gemini"
        # 2026-05-03 ogt: ADR-110 - "to" is now plain "ollama" (no longer names host 111)
        assert payload["to"] == "ollama"
        assert payload["stable_count"] == 3

    @pytest.mark.asyncio
    async def test_no_alerter_does_not_crash(self):
        """With ``alerter=None`` the switch still completes."""
        service, _, _ = _make_service(current_primary="gemini", alerter=None)

        # Must not raise.
        await service._switch_back_to_ollama()

        assert service.current_primary == "ollama"

    @pytest.mark.asyncio
    async def test_clear_cache_failure_does_not_crash(self):
        """A failing ``clear_cache`` is tolerated and the switch still completes."""
        service, _, failover = _make_service(current_primary="gemini")
        failover.clear_cache.side_effect = RuntimeError("Redis down")

        # Must not raise.
        await service._switch_back_to_ollama()

        # Even though clear_cache failed, the primary must have been updated.
        assert service.current_primary == "ollama"

    @pytest.mark.asyncio
    async def test_alerter_failure_does_not_crash(self):
        """A failing Telegram alerter is tolerated and the switch still completes."""
        alerter = AsyncMock()
        alerter.alert_recovery = AsyncMock(side_effect=RuntimeError("TG timeout"))
        service, _, _ = _make_service(current_primary="gemini", alerter=alerter)

        # Must not raise.
        await service._switch_back_to_ollama()

        assert service.current_primary == "ollama"
# =============================================================================
# start() / stop() 生命週期
# =============================================================================
class TestLifecycle:
    """Lifecycle of the background task driven by start() / stop().

    Every test that calls ``start()`` guarantees a matching ``stop()`` via
    try/finally, so a failing assertion cannot leave the task running.
    """

    @pytest.mark.asyncio
    async def test_start_creates_task(self):
        """After start() the service reports is_running."""
        svc, _, _ = _make_service()
        try:
            await svc.start()
            assert svc.is_running is True
        finally:
            await svc.stop()

    @pytest.mark.asyncio
    async def test_stop_cancels_task(self):
        """stop() flips is_running back to False."""
        svc, _, _ = _make_service()
        await svc.start()
        try:
            assert svc.is_running is True
        finally:
            # Runs whether or not the assertion above passed, so the started
            # task is never leaked; stop() is still called exactly once.
            await svc.stop()
        assert svc.is_running is False

    @pytest.mark.asyncio
    async def test_stop_idempotent_when_not_started(self):
        """Calling stop() without a prior start() does not raise."""
        svc, _, _ = _make_service()
        await svc.stop()  # must not raise

    @pytest.mark.asyncio
    async def test_start_idempotent_second_call(self):
        """A second start() does not create a second task."""
        svc, _, _ = _make_service()
        try:
            await svc.start()
            task_1 = svc._task
            await svc.start()  # second call
            task_2 = svc._task
            assert task_1 is task_2  # same task object
        finally:
            await svc.stop()

    @pytest.mark.asyncio
    async def test_monitor_loop_continues_after_exception(self):
        """An exception inside the monitor loop does not end monitoring.

        The first health check raises and every later one succeeds; the
        loop must reach at least a second check.
        """
        svc, mock_monitor, _ = _make_service(
            current_primary="gemini",
            check_interval=0,  # 0 s interval keeps the test fast
            stable_count=3,
        )
        call_count = 0

        async def _check_side_effect(host):
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                raise RuntimeError("transient error")
            return _make_health(HealthStatus.HEALTHY)

        mock_monitor.check.side_effect = _check_side_effect

        await svc.start()
        try:
            # Poll rather than rely on one fixed sleep: returns as soon as
            # the second check has happened and waits at most ~1 s on a slow
            # runner before giving up.
            for _ in range(100):
                if call_count >= 2:
                    break
                await asyncio.sleep(0.01)
        finally:
            await svc.stop()

        # At least two calls: the failing one and one successful one.
        assert call_count >= 2
# =============================================================================
# set_current_primary() / 屬性
# =============================================================================
class TestStateManagement:
    """``set_current_primary`` and the read-only state properties."""

    @pytest.mark.asyncio
    async def test_set_current_primary_updates_state(self):
        """The new primary is visible through ``current_primary``."""
        # H5+H6 fix: set_current_primary is now async and must be awaited.
        service, _, _ = _make_service(current_primary="ollama")
        redis_unavailable = RuntimeError("no redis")

        with patch("src.core.redis_client.get_redis", side_effect=redis_unavailable):
            await service.set_current_primary("gemini")

        assert service.current_primary == "gemini"

    @pytest.mark.asyncio
    async def test_set_current_primary_non_ollama_resets_counter(self):
        """Switching to a non-ollama primary zeroes the counter (recovery wait starts)."""
        service, _, _ = _make_service(current_primary="ollama")
        service._consecutive_healthy = 5
        redis_unavailable = RuntimeError("no redis")

        with patch("src.core.redis_client.get_redis", side_effect=redis_unavailable):
            await service.set_current_primary("gemini")

        assert service.consecutive_healthy == 0

    @pytest.mark.asyncio
    async def test_set_current_primary_to_ollama_no_counter_reset(self):
        """Switching to ollama (normal routing) just marks the primary."""
        service, _, _ = _make_service(current_primary="gemini")
        service._consecutive_healthy = 3
        redis_unavailable = RuntimeError("no redis")

        with patch("src.core.redis_client.get_redis", side_effect=redis_unavailable):
            await service.set_current_primary("ollama")

        # Per the original note, set_current_primary resets the counter only
        # when switching to a non-ollama primary.
        # NOTE(review): the counter value itself is not asserted here; consider
        # asserting consecutive_healthy == 3 once confirmed against the service.
        assert service.current_primary == "ollama"

    def test_is_running_false_before_start(self):
        """A freshly built service is not running."""
        service, _, _ = _make_service()
        assert service.is_running is False
# =============================================================================
# H5+H6: Redis 持久化 + Bootstrap + 立刻 check
# 2026-04-25 critic-fix Part2 by Claude Engineer-C2
# =============================================================================
class TestRedisPersistence:
    """H5+H6 fix verification: persistence in set_current_primary and bootstrap in start()."""

    @pytest.mark.asyncio
    async def test_set_current_primary_persists_to_redis(self):
        """set_current_primary("gemini") dual-writes the new and the legacy Redis key (Phase A)."""
        service, _, _ = _make_service(current_primary="ollama")
        redis = AsyncMock()
        redis.set = AsyncMock()
        expected_writes = [
            call("platform:ollama:current_primary", "gemini"),
            call("ollama:current_primary", "gemini"),
        ]

        with patch("src.core.redis_client.get_redis", return_value=redis):
            await service.set_current_primary("gemini")

        redis.set.assert_has_awaits(expected_writes)

    @pytest.mark.asyncio
    async def test_set_current_primary_same_value_no_redis_write(self):
        """Setting the primary to its current value does not write to Redis again."""
        service, _, _ = _make_service(current_primary="gemini")
        redis = AsyncMock()
        redis.set = AsyncMock()

        with patch("src.core.redis_client.get_redis", return_value=redis):
            await service.set_current_primary("gemini")

        # An unchanged value must not trigger persistence.
        redis.set.assert_not_awaited()

    @pytest.mark.asyncio
    async def test_load_primary_from_redis(self):
        """_load_primary() returns the value stored in Redis."""
        service, _, _ = _make_service()
        redis = AsyncMock()
        redis.get = AsyncMock(return_value=b"gemini")

        with patch("src.core.redis_client.get_redis", return_value=redis):
            loaded = await service._load_primary()

        assert loaded == "gemini"

    @pytest.mark.asyncio
    async def test_load_primary_redis_empty_returns_ollama(self):
        """With no value in Redis, _load_primary() defaults to 'ollama'."""
        service, _, _ = _make_service()
        redis = AsyncMock()
        redis.get = AsyncMock(return_value=None)

        with patch("src.core.redis_client.get_redis", return_value=redis):
            loaded = await service._load_primary()

        assert loaded == "ollama"

    @pytest.mark.asyncio
    async def test_load_primary_redis_error_returns_ollama(self):
        """With Redis unavailable, _load_primary() falls back to "ollama" (fail-safe)."""
        service, _, _ = _make_service()
        redis_failure = RuntimeError("Redis down")

        with patch("src.core.redis_client.get_redis", side_effect=redis_failure):
            loaded = await service._load_primary()

        assert loaded == "ollama"

    @pytest.mark.asyncio
    async def test_start_loads_primary_from_redis(self):
        """start() bootstraps current_primary from Redis."""
        service, monitor, _ = _make_service(current_primary="ollama")
        monitor.check = AsyncMock(return_value=_make_health(HealthStatus.HEALTHY))
        redis = AsyncMock()
        redis.get = AsyncMock(return_value=b"gemini")

        with patch("src.core.redis_client.get_redis", return_value=redis):
            try:
                await service.start()
                # After bootstrap the primary must be the value read from Redis.
                assert service.current_primary == "gemini"
            finally:
                await service.stop()

    @pytest.mark.asyncio
    async def test_start_immediate_check_when_primary_not_ollama(self):
        """start() with primary=gemini runs _check_and_recover right away.

        The original note says this avoids waiting 30 s for the first
        interval.
        """
        service, monitor, _ = _make_service(current_primary="gemini")
        monitor.check = AsyncMock(return_value=_make_health(HealthStatus.HEALTHY))
        redis = AsyncMock()
        redis.get = AsyncMock(return_value=b"gemini")

        check_ran = False
        # Capture the real method before it gets patched so the spy can delegate to it.
        real_check = service._check_and_recover

        async def _spy_check():
            nonlocal check_ran
            check_ran = True
            await real_check()

        with patch.object(service, "_check_and_recover", side_effect=_spy_check):
            with patch("src.core.redis_client.get_redis", return_value=redis):
                try:
                    await service.start()
                    assert check_ran is True
                finally:
                    await service.stop()

    @pytest.mark.asyncio
    async def test_start_no_immediate_check_when_primary_is_ollama(self):
        """start() with primary=ollama (normal state) does not run a check right away."""
        service, _, _ = _make_service(current_primary="gemini")
        redis = AsyncMock()
        redis.get = AsyncMock(return_value=b"ollama")  # Redis reports primary=ollama

        check_ran = False

        async def _spy_check():
            nonlocal check_ran
            check_ran = True

        with patch.object(service, "_check_and_recover", side_effect=_spy_check):
            with patch("src.core.redis_client.get_redis", return_value=redis):
                try:
                    await service.start()
                    # primary=ollama -> no immediate check
                    assert check_ran is False
                finally:
                    await service.stop()
# =============================================================================
# Singleton
# =============================================================================
def test_singleton_returns_same_instance():
    """Two consecutive accessor calls hand back the very same object."""
    first = get_ollama_auto_recovery_service()
    second = get_ollama_auto_recovery_service()
    assert first is second
def test_reset_singleton_gives_new_instance():
    """After a reset, the accessor builds a different object."""
    before_reset = get_ollama_auto_recovery_service()
    reset_ollama_auto_recovery_service()
    after_reset = get_ollama_auto_recovery_service()
    assert before_reset is not after_reset