fix(api): avoid local ollama health blocking gcp route

2026-05-19 12:22:46 +08:00
parent 1d285dd9d4
commit 36aeea80a3
2 changed files with 100 additions and 43 deletions
--- a/apps/api/src/services/ollama_failover_manager.py
+++ b/apps/api/src/services/ollama_failover_manager.py
@@ -33,19 +33,12 @@ from __future__ import annotations

 import asyncio
 import datetime
-from dataclasses import dataclass, field
-# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
-# 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo，保證一定有 +8 時區
-# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
-from datetime import timezone, timedelta
+from dataclasses import dataclass
+from datetime import timedelta, timezone

 import structlog

 from src.core.config import get_settings
-
-# 台北時區 +8（標準庫保險絲，100% 可用）
-# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
-TAIPEI_TZ = timezone(timedelta(hours=8))
 from src.services.ollama_health_monitor import (
    HealthReport,
    HealthStatus,
@@ -55,6 +48,12 @@ from src.services.ollama_health_monitor import (

 logger = structlog.get_logger(__name__)

+# 台北時區 +8（標準庫保險絲，100% 可用）
+# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
+# 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo，保證一定有 +8 時區
+# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
+TAIPEI_TZ = timezone(timedelta(hours=8))
+

 # =============================================================================
 # 路由結果模型（輕量，P1.2 整合時轉換為 RoutingDecision）
@@ -203,31 +202,59 @@ class OllamaFailoverManager:
        url_secondary = self._settings.OLLAMA_SECONDARY_URL  # 110:11436 → GCP-B (nginx proxy)
        url_tertiary  = self._settings.OLLAMA_FALLBACK_URL   # 110:11437 → Local 111 (nginx proxy)

-        # 並行檢查三台 Ollama 主機（asyncio.gather 提升效率）
-        results_raw = await asyncio.gather(
-            self._monitor.check(url_primary),
-            self._monitor.check(url_secondary),
-            self._monitor.check(url_tertiary),
-            return_exceptions=True,
-        )
-
        def _to_health(r, label: str) -> HealthReport:
            if isinstance(r, Exception):
                return HealthReport(status=HealthStatus.OFFLINE, reason=f"{label} check error: {r}")
            return r

-        health_gcp_a = _to_health(results_raw[0], f"primary({url_primary})")
-        health_gcp_b = _to_health(results_raw[1], f"secondary({url_secondary})")
-        health_local = _to_health(results_raw[2], f"tertiary({url_tertiary})")
+        def _short(url: str) -> str:
+            from urllib.parse import urlparse
+            return urlparse(url).hostname or url

-        result = self._decide_route(
-            health_gcp_a=health_gcp_a,
-            health_gcp_b=health_gcp_b,
-            health_local=health_local,
-            url_gcp_a=url_primary,
-            url_gcp_b=url_secondary,
-            url_local=url_tertiary,
-        )
+        # 2026-05-19 Codex: the alert-fast path must not wait for the slow Local 111
+        # lane when GCP-A is already healthy. The old gather(GCP-A/GCP-B/111) path
+        # let the 45s health timeout of 111 dominate every routing decision.
+        try:
+            primary_raw = await self._monitor.check(url_primary)
+        except Exception as exc:
+            primary_raw = exc
+        health_gcp_a = _to_health(primary_raw, f"primary({url_primary})")
+        health_gcp_b: HealthReport | None = None
+        health_local: HealthReport | None = None
+
+        if health_gcp_a.status == HealthStatus.HEALTHY:
+            model = self._settings.OLLAMA_HEALTH_CHECK_MODEL
+            fallback_chain = [
+                OllamaEndpoint(url=url_secondary, provider_name="ollama_gcp_b", model=model),
+                OllamaEndpoint(url=url_tertiary, provider_name="ollama_local", model=model),
+                _GEMINI_ENDPOINT,
+            ]
+            result = OllamaRoutingResult(
+                primary=OllamaEndpoint(url=url_primary, provider_name="ollama_gcp_a", model=model),
+                fallback_chain=fallback_chain,
+                routing_reason=f"primary({_short(url_primary)}) HEALTHY",
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=None,
+                health_local=None,
+            )
+        else:
+            # Primary 不健康時才並行檢查後兩層，保留 GCP-B/Local 容災。
+            results_raw = await asyncio.gather(
+                self._monitor.check(url_secondary),
+                self._monitor.check(url_tertiary),
+                return_exceptions=True,
+            )
+            health_gcp_b = _to_health(results_raw[0], f"secondary({url_secondary})")
+            health_local = _to_health(results_raw[1], f"tertiary({url_tertiary})")
+
+            result = self._decide_route(
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=health_gcp_b,
+                health_local=health_local,
+                url_gcp_a=url_primary,
+                url_gcp_b=url_secondary,
+                url_local=url_tertiary,
+            )

        # Gemini 帳單熔斷（quota gate）
        # 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2
@@ -243,8 +270,8 @@ class OllamaFailoverManager:
                result = self._build_quota_exceeded_route(health_gcp_a=health_gcp_a)
                # Quota 耗盡 Telegram 告警（24h dedup）
                try:
-                    from src.services.failover_alerter import get_failover_alerter
                    from src.core.redis_client import get_redis
+                    from src.services.failover_alerter import get_failover_alerter
                    _current_count = quota
                    try:
                        _redis = get_redis()
@@ -267,6 +294,9 @@ class OllamaFailoverManager:
        # 寫入 audit_log（best-effort）
        await self._write_failover_audit(result)

+        def _status(report: HealthReport | None) -> str:
+            return report.status.value if report else "not_checked"
+
        logger.info(
            "ollama_failover_decision",
            primary=result.primary.provider_name,
@@ -274,8 +304,8 @@ class OllamaFailoverManager:
            reason=result.routing_reason,
            fallback_count=len(result.fallback_chain),
            health_gcp_a=health_gcp_a.status.value,
-            health_gcp_b=health_gcp_b.status.value,
-            health_local=health_local.status.value,
+            health_gcp_b=_status(health_gcp_b),
+            health_local=_status(health_local),
        )

        # 通知 recovery service 當前 primary（跨重啟持久化）
@@ -589,8 +619,8 @@ class OllamaFailoverManager:
        # 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — 記錄 failover Prometheus metric
        try:
            from src.core.metrics import (
-                OLLAMA_FAILOVER_TRIGGERED_TOTAL,
                OLLAMA_CURRENT_PRIMARY_IS_OLLAMA,
+                OLLAMA_FAILOVER_TRIGGERED_TOTAL,
            )
            OLLAMA_FAILOVER_TRIGGERED_TOTAL.labels(
                from_provider="ollama",
--- a/apps/api/tests/test_ollama_failover_manager.py
+++ b/apps/api/tests/test_ollama_failover_manager.py
@@ -3,6 +3,7 @@
 # 2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復（路由矩陣更新）
 # 2026-04-27 波次對齊 by Claude Sonnet 4.6 — 統帥鐵律：唯一 Ollama=111，188 完全移出
 # 2026-05-03 ogt: ADR-110 GCP 三層容災架構，URL 常數更新為 GCP-A/B/Local，新增三層容災場景
+# 2026-05-19 Codex: GCP-A healthy fast path 不等待 Local 111 health timeout
 """
 OllamaFailoverManager 單元測試 - P1.1c v4.0
 =============================================
@@ -25,14 +26,13 @@ from unittest.mock import AsyncMock, MagicMock, patch

 import pytest

-from src.services.ollama_health_monitor import HealthReport, HealthStatus
 from src.services.ollama_failover_manager import (
    OllamaFailoverManager,
    OllamaRoutingResult,
    get_ollama_failover_manager,
    reset_ollama_failover_manager,
 )
-
+from src.services.ollama_health_monitor import HealthReport, HealthStatus

 # =============================================================================
 # Fixtures
@@ -266,12 +266,12 @@ class TestDecideRoute:


 # =============================================================================
-# select_provider()：只 check 111
+# select_provider()：GCP-A healthy fast path
 # =============================================================================


 class TestSelectProvider:
-    """select_provider() 三層容災健康檢查（ADR-110：並行 check GCP-A / GCP-B / Local）"""
+    """select_provider() 三層容災健康檢查。"""

    def _make_three_layer_mock(
        self,
@@ -303,16 +303,37 @@ class TestSelectProvider:
        return manager, mock_monitor

    @pytest.mark.asyncio
-    async def test_select_provider_checks_all_three_hosts(self):
-        """ADR-110：select_provider 並行 check 三台 Ollama 主機"""
+    async def test_select_provider_gcp_a_healthy_checks_primary_only(self):
+        """GCP-A healthy 時不等待 GCP-B / Local 111，避免 routing 被 111 timeout 拖慢。"""
        manager, mock_monitor = self._make_three_layer_mock(
            gcp_a_status=HealthStatus.HEALTHY,
        )

        with patch.object(manager, "_write_failover_audit", return_value=None):
-            await manager.select_provider()
+            result = await manager.select_provider()

-        # 並行 check 三台主機（GCP-A / GCP-B / Local）
+        assert result.primary.provider_name == "ollama_gcp_a"
+        assert result.health_gcp_b is None
+        assert result.health_local is None
+        assert mock_monitor.check.call_count == 1
+        called_urls = {call.args[0] for call in mock_monitor.check.call_args_list}
+        assert URL_GCP_A in called_urls
+        assert URL_GCP_B not in called_urls
+        assert URL_LOCAL not in called_urls
+
+    @pytest.mark.asyncio
+    async def test_select_provider_checks_fallback_hosts_when_gcp_a_not_healthy(self):
+        """GCP-A 不健康時仍檢查 GCP-B / Local，保留三層容災。"""
+        manager, mock_monitor = self._make_three_layer_mock(
+            gcp_a_status=HealthStatus.OFFLINE,
+            gcp_b_status=HealthStatus.HEALTHY,
+            local_status=HealthStatus.OFFLINE,
+        )
+
+        with patch.object(manager, "_write_failover_audit", return_value=None):
+            result = await manager.select_provider()
+
+        assert result.primary.provider_name == "ollama_gcp_b"
        assert mock_monitor.check.call_count == 3
        called_urls = {call.args[0] for call in mock_monitor.check.call_args_list}
        assert URL_GCP_A in called_urls
@@ -626,7 +647,10 @@ class TestWriteFailoverAudit:
    async def test_audit_uses_structlog_not_db(self):
        """_write_failover_audit 應呼叫 structlog，不呼叫 DB"""
        manager = _make_manager()
-        from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
+        from src.services.ollama_failover_manager import (
+            OllamaEndpoint,
+            OllamaRoutingResult,
+        )

        result = OllamaRoutingResult(
            primary=OllamaEndpoint(url="", provider_name="gemini", model="gemini-1.5-flash"),
@@ -642,7 +666,10 @@ class TestWriteFailoverAudit:
    async def test_audit_skipped_when_gcp_a_healthy(self):
        """GCP-A HEALTHY 時 early return，不記錄 failover"""
        manager = _make_manager()
-        from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
+        from src.services.ollama_failover_manager import (
+            OllamaEndpoint,
+            OllamaRoutingResult,
+        )

        result = OllamaRoutingResult(
            primary=OllamaEndpoint(url=URL_GCP_A, provider_name="ollama_gcp_a", model="qwen"),
@@ -669,7 +696,7 @@ class TestAIProviderEnumOllamaLocal:
        assert AIProviderEnum.OLLAMA_LOCAL.value == "ollama_local"

    def test_ollama_local_in_latency_budget(self):
-        from src.services.ai_router import AIProviderEnum, PROVIDER_LATENCY_BUDGET
+        from src.services.ai_router import PROVIDER_LATENCY_BUDGET, AIProviderEnum
        assert AIProviderEnum.OLLAMA_LOCAL in PROVIDER_LATENCY_BUDGET
        assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_LOCAL] == 90000