From 36aeea80a36f01e58e3cd9c2089b6befa8dcd1df Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Tue, 19 May 2026 12:22:46 +0800
Subject: [PATCH] fix(api): avoid local ollama health blocking gcp route

---
 .../src/services/ollama_failover_manager.py   | 94 ++++++++++++-------
 .../api/tests/test_ollama_failover_manager.py | 49 +++++++---
 2 files changed, 100 insertions(+), 43 deletions(-)

diff --git a/apps/api/src/services/ollama_failover_manager.py b/apps/api/src/services/ollama_failover_manager.py
index 20f71e69..148e6e92 100644
--- a/apps/api/src/services/ollama_failover_manager.py
+++ b/apps/api/src/services/ollama_failover_manager.py
@@ -33,19 +33,12 @@ from __future__ import annotations
 
 import asyncio
 import datetime
-from dataclasses import dataclass, field
-# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
-# 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo，保證一定有 +8 時區
-# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
-from datetime import timezone, timedelta
+from dataclasses import dataclass
+from datetime import timedelta, timezone
 
 import structlog
 
 from src.core.config import get_settings
-
-# 台北時區 +8（標準庫保險絲，100% 可用）
-# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
-TAIPEI_TZ = timezone(timedelta(hours=8))
 from src.services.ollama_health_monitor import (
     HealthReport,
     HealthStatus,
@@ -55,6 +48,12 @@ from src.services.ollama_health_monitor import (
 
 logger = structlog.get_logger(__name__)
 
+# 台北時區 +8（標準庫保險絲，100% 可用）
+# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
+# 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo，保證一定有 +8 時區
+# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
+TAIPEI_TZ = timezone(timedelta(hours=8))
+
 
 # =============================================================================
 # 路由結果模型（輕量，P1.2 整合時轉換為 RoutingDecision）
@@ -203,31 +202,59 @@ class OllamaFailoverManager:
         url_secondary = self._settings.OLLAMA_SECONDARY_URL  # 110:11436 → GCP-B (nginx proxy)
         url_tertiary  = self._settings.OLLAMA_FALLBACK_URL   # 110:11437 → Local 111 (nginx proxy)
 
-        # 並行檢查三台 Ollama 主機（asyncio.gather 提升效率）
-        results_raw = await asyncio.gather(
-            self._monitor.check(url_primary),
-            self._monitor.check(url_secondary),
-            self._monitor.check(url_tertiary),
-            return_exceptions=True,
-        )
-
         def _to_health(r, label: str) -> HealthReport:
             if isinstance(r, Exception):
                 return HealthReport(status=HealthStatus.OFFLINE, reason=f"{label} check error: {r}")
             return r
 
-        health_gcp_a = _to_health(results_raw[0], f"primary({url_primary})")
-        health_gcp_b = _to_health(results_raw[1], f"secondary({url_secondary})")
-        health_local = _to_health(results_raw[2], f"tertiary({url_tertiary})")
+        def _short(url: str) -> str:
+            from urllib.parse import urlparse
+            return urlparse(url).hostname or url
 
-        result = self._decide_route(
-            health_gcp_a=health_gcp_a,
-            health_gcp_b=health_gcp_b,
-            health_local=health_local,
-            url_gcp_a=url_primary,
-            url_gcp_b=url_secondary,
-            url_local=url_tertiary,
-        )
+        # 2026-05-19 Codex: the alert-fast path must not wait for the slow local lane when
+        # GCP-A is already healthy; the old gather(GCP-A/GCP-B/111) let 111's 45s health timeout
+        # dominate every routing decision. GCP-B/Local are not probed here (health stays None).
+        try:
+            primary_raw = await self._monitor.check(url_primary)
+        except Exception as exc:
+            primary_raw = exc
+        health_gcp_a = _to_health(primary_raw, f"primary({url_primary})")
+        health_gcp_b: HealthReport | None = None
+        health_local: HealthReport | None = None
+
+        if health_gcp_a.status == HealthStatus.HEALTHY:
+            model = self._settings.OLLAMA_HEALTH_CHECK_MODEL
+            fallback_chain = [
+                OllamaEndpoint(url=url_secondary, provider_name="ollama_gcp_b", model=model),
+                OllamaEndpoint(url=url_tertiary, provider_name="ollama_local", model=model),
+                _GEMINI_ENDPOINT,
+            ]
+            result = OllamaRoutingResult(
+                primary=OllamaEndpoint(url=url_primary, provider_name="ollama_gcp_a", model=model),
+                fallback_chain=fallback_chain,
+                routing_reason=f"primary({_short(url_primary)}) HEALTHY",
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=None,
+                health_local=None,
+            )
+        else:
+            # Primary 不健康時才並行檢查後兩層，保留 GCP-B/Local 容災。
+            results_raw = await asyncio.gather(
+                self._monitor.check(url_secondary),
+                self._monitor.check(url_tertiary),
+                return_exceptions=True,
+            )
+            health_gcp_b = _to_health(results_raw[0], f"secondary({url_secondary})")
+            health_local = _to_health(results_raw[1], f"tertiary({url_tertiary})")
+
+            result = self._decide_route(
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=health_gcp_b,
+                health_local=health_local,
+                url_gcp_a=url_primary,
+                url_gcp_b=url_secondary,
+                url_local=url_tertiary,
+            )
 
         # Gemini 帳單熔斷（quota gate）
         # 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2
@@ -243,8 +270,8 @@ class OllamaFailoverManager:
                 result = self._build_quota_exceeded_route(health_gcp_a=health_gcp_a)
                 # Quota 耗盡 Telegram 告警（24h dedup）
                 try:
-                    from src.services.failover_alerter import get_failover_alerter
                     from src.core.redis_client import get_redis
+                    from src.services.failover_alerter import get_failover_alerter
                     _current_count = quota
                     try:
                         _redis = get_redis()
@@ -267,6 +294,9 @@ class OllamaFailoverManager:
         # 寫入 audit_log（best-effort）
         await self._write_failover_audit(result)
 
+        def _status(report: HealthReport | None) -> str:
+            return report.status.value if report else "not_checked"
+
         logger.info(
             "ollama_failover_decision",
             primary=result.primary.provider_name,
@@ -274,8 +304,8 @@ class OllamaFailoverManager:
             reason=result.routing_reason,
             fallback_count=len(result.fallback_chain),
             health_gcp_a=health_gcp_a.status.value,
-            health_gcp_b=health_gcp_b.status.value,
-            health_local=health_local.status.value,
+            health_gcp_b=_status(health_gcp_b),
+            health_local=_status(health_local),
         )
 
         # 通知 recovery service 當前 primary（跨重啟持久化）
@@ -589,8 +619,8 @@ class OllamaFailoverManager:
         # 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — 記錄 failover Prometheus metric
         try:
             from src.core.metrics import (
-                OLLAMA_FAILOVER_TRIGGERED_TOTAL,
                 OLLAMA_CURRENT_PRIMARY_IS_OLLAMA,
+                OLLAMA_FAILOVER_TRIGGERED_TOTAL,
             )
             OLLAMA_FAILOVER_TRIGGERED_TOTAL.labels(
                 from_provider="ollama",
diff --git a/apps/api/tests/test_ollama_failover_manager.py b/apps/api/tests/test_ollama_failover_manager.py
index fe8a8aaf..5c52c248 100644
--- a/apps/api/tests/test_ollama_failover_manager.py
+++ b/apps/api/tests/test_ollama_failover_manager.py
@@ -3,6 +3,7 @@
 # 2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復（路由矩陣更新）
 # 2026-04-27 波次對齊 by Claude Sonnet 4.6 — 統帥鐵律：唯一 Ollama=111，188 完全移出
 # 2026-05-03 ogt: ADR-110 GCP 三層容災架構，URL 常數更新為 GCP-A/B/Local，新增三層容災場景
+# 2026-05-19 Codex: GCP-A healthy fast path 不等待 Local 111 health timeout
 """
 OllamaFailoverManager 單元測試 - P1.1c v4.0
 =============================================
@@ -25,14 +26,13 @@ from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
-from src.services.ollama_health_monitor import HealthReport, HealthStatus
 from src.services.ollama_failover_manager import (
     OllamaFailoverManager,
     OllamaRoutingResult,
     get_ollama_failover_manager,
     reset_ollama_failover_manager,
 )
-
+from src.services.ollama_health_monitor import HealthReport, HealthStatus
 
 # =============================================================================
 # Fixtures
@@ -266,12 +266,12 @@ class TestDecideRoute:
 
 
 # =============================================================================
-# select_provider()：只 check 111
+# select_provider()：GCP-A healthy fast path
 # =============================================================================
 
 
 class TestSelectProvider:
-    """select_provider() 三層容災健康檢查（ADR-110：並行 check GCP-A / GCP-B / Local）"""
+    """select_provider() 三層容災健康檢查。"""
 
     def _make_three_layer_mock(
         self,
@@ -303,16 +303,37 @@ class TestSelectProvider:
         return manager, mock_monitor
 
     @pytest.mark.asyncio
-    async def test_select_provider_checks_all_three_hosts(self):
-        """ADR-110：select_provider 並行 check 三台 Ollama 主機"""
+    async def test_select_provider_gcp_a_healthy_checks_primary_only(self):
+        """GCP-A healthy 時不等待 GCP-B / Local 111，避免 routing 被 111 timeout 拖慢。"""
         manager, mock_monitor = self._make_three_layer_mock(
             gcp_a_status=HealthStatus.HEALTHY,
         )
 
         with patch.object(manager, "_write_failover_audit", return_value=None):
-            await manager.select_provider()
+            result = await manager.select_provider()
 
-        # 並行 check 三台主機（GCP-A / GCP-B / Local）
+        assert result.primary.provider_name == "ollama_gcp_a"
+        assert result.health_gcp_b is None
+        assert result.health_local is None
+        assert mock_monitor.check.call_count == 1
+        called_urls = {call.args[0] for call in mock_monitor.check.call_args_list}
+        assert URL_GCP_A in called_urls
+        assert URL_GCP_B not in called_urls
+        assert URL_LOCAL not in called_urls
+
+    @pytest.mark.asyncio
+    async def test_select_provider_checks_fallback_hosts_when_gcp_a_not_healthy(self):
+        """GCP-A 不健康時仍檢查 GCP-B / Local，保留三層容災。"""
+        manager, mock_monitor = self._make_three_layer_mock(
+            gcp_a_status=HealthStatus.OFFLINE,
+            gcp_b_status=HealthStatus.HEALTHY,
+            local_status=HealthStatus.OFFLINE,
+        )
+
+        with patch.object(manager, "_write_failover_audit", return_value=None):
+            result = await manager.select_provider()
+
+        assert result.primary.provider_name == "ollama_gcp_b"
         assert mock_monitor.check.call_count == 3
         called_urls = {call.args[0] for call in mock_monitor.check.call_args_list}
         assert URL_GCP_A in called_urls
@@ -626,7 +647,10 @@ class TestWriteFailoverAudit:
     async def test_audit_uses_structlog_not_db(self):
         """_write_failover_audit 應呼叫 structlog，不呼叫 DB"""
         manager = _make_manager()
-        from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
+        from src.services.ollama_failover_manager import (
+            OllamaEndpoint,
+            OllamaRoutingResult,
+        )
 
         result = OllamaRoutingResult(
             primary=OllamaEndpoint(url="", provider_name="gemini", model="gemini-1.5-flash"),
@@ -642,7 +666,10 @@ class TestWriteFailoverAudit:
     async def test_audit_skipped_when_gcp_a_healthy(self):
         """GCP-A HEALTHY 時 early return，不記錄 failover"""
         manager = _make_manager()
-        from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingResult
+        from src.services.ollama_failover_manager import (
+            OllamaEndpoint,
+            OllamaRoutingResult,
+        )
 
         result = OllamaRoutingResult(
             primary=OllamaEndpoint(url=URL_GCP_A, provider_name="ollama_gcp_a", model="qwen"),
@@ -669,7 +696,7 @@ class TestAIProviderEnumOllamaLocal:
         assert AIProviderEnum.OLLAMA_LOCAL.value == "ollama_local"
 
     def test_ollama_local_in_latency_budget(self):
-        from src.services.ai_router import AIProviderEnum, PROVIDER_LATENCY_BUDGET
+        from src.services.ai_router import PROVIDER_LATENCY_BUDGET, AIProviderEnum
         assert AIProviderEnum.OLLAMA_LOCAL in PROVIDER_LATENCY_BUDGET
         assert PROVIDER_LATENCY_BUDGET[AIProviderEnum.OLLAMA_LOCAL] == 90000