fix(api): avoid local ollama health blocking gcp route

2026-05-19 12:22:46 +08:00
parent 1d285dd9d4
commit 36aeea80a3
2 changed files with 100 additions and 43 deletions
--- a/apps/api/src/services/ollama_failover_manager.py
+++ b/apps/api/src/services/ollama_failover_manager.py
@@ -33,19 +33,12 @@ from __future__ import annotations

 import asyncio
 import datetime
-from dataclasses import dataclass, field
-# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
-# 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo，保證一定有 +8 時區
-# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
-from datetime import timezone, timedelta
+from dataclasses import dataclass
+from datetime import timedelta, timezone

 import structlog

 from src.core.config import get_settings
-
-# 台北時區 +8（標準庫保險絲，100% 可用）
-# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
-TAIPEI_TZ = timezone(timedelta(hours=8))
 from src.services.ollama_health_monitor import (
    HealthReport,
    HealthStatus,
@@ -55,6 +48,12 @@ from src.services.ollama_health_monitor import (

 logger = structlog.get_logger(__name__)

+# 台北時區 +8（標準庫保險絲，100% 可用）
+# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
+# 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo，保證一定有 +8 時區
+# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
+TAIPEI_TZ = timezone(timedelta(hours=8))
+

 # =============================================================================
 # 路由結果模型（輕量，P1.2 整合時轉換為 RoutingDecision）
@@ -203,31 +202,59 @@ class OllamaFailoverManager:
        url_secondary = self._settings.OLLAMA_SECONDARY_URL  # 110:11436 → GCP-B (nginx proxy)
        url_tertiary  = self._settings.OLLAMA_FALLBACK_URL   # 110:11437 → Local 111 (nginx proxy)

-        # 並行檢查三台 Ollama 主機（asyncio.gather 提升效率）
-        results_raw = await asyncio.gather(
-            self._monitor.check(url_primary),
-            self._monitor.check(url_secondary),
-            self._monitor.check(url_tertiary),
-            return_exceptions=True,
-        )
-
        def _to_health(r, label: str) -> HealthReport:
            if isinstance(r, Exception):
                return HealthReport(status=HealthStatus.OFFLINE, reason=f"{label} check error: {r}")
            return r

-        health_gcp_a = _to_health(results_raw[0], f"primary({url_primary})")
-        health_gcp_b = _to_health(results_raw[1], f"secondary({url_secondary})")
-        health_local = _to_health(results_raw[2], f"tertiary({url_tertiary})")
+        def _short(url: str) -> str:
+            from urllib.parse import urlparse
+            return urlparse(url).hostname or url

-        result = self._decide_route(
-            health_gcp_a=health_gcp_a,
-            health_gcp_b=health_gcp_b,
-            health_local=health_local,
-            url_gcp_a=url_primary,
-            url_gcp_b=url_secondary,
-            url_local=url_tertiary,
-        )
+        # 2026-05-19 Codex: the alert-fast path must not wait on the slow local lane
+        # when GCP-A is already healthy. The old gather(GCP-A/GCP-B/111) awaited all
+        # three checks, so host 111's 45s health timeout dominated every routing decision.
+        try:
+            primary_raw = await self._monitor.check(url_primary)
+        except Exception as exc:
+            primary_raw = exc
+        health_gcp_a = _to_health(primary_raw, f"primary({url_primary})")
+        health_gcp_b: HealthReport | None = None
+        health_local: HealthReport | None = None
+
+        if health_gcp_a.status == HealthStatus.HEALTHY:
+            model = self._settings.OLLAMA_HEALTH_CHECK_MODEL
+            fallback_chain = [
+                OllamaEndpoint(url=url_secondary, provider_name="ollama_gcp_b", model=model),
+                OllamaEndpoint(url=url_tertiary, provider_name="ollama_local", model=model),
+                _GEMINI_ENDPOINT,
+            ]
+            result = OllamaRoutingResult(
+                primary=OllamaEndpoint(url=url_primary, provider_name="ollama_gcp_a", model=model),
+                fallback_chain=fallback_chain,
+                routing_reason=f"primary({_short(url_primary)}) HEALTHY",
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=None,
+                health_local=None,
+            )
+        else:
+            # Only when the primary is unhealthy are the two remaining tiers checked in parallel, preserving GCP-B/Local failover.
+            results_raw = await asyncio.gather(
+                self._monitor.check(url_secondary),
+                self._monitor.check(url_tertiary),
+                return_exceptions=True,
+            )
+            health_gcp_b = _to_health(results_raw[0], f"secondary({url_secondary})")
+            health_local = _to_health(results_raw[1], f"tertiary({url_tertiary})")
+
+            result = self._decide_route(
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=health_gcp_b,
+                health_local=health_local,
+                url_gcp_a=url_primary,
+                url_gcp_b=url_secondary,
+                url_local=url_tertiary,
+            )

        # Gemini 帳單熔斷（quota gate）
        # 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2
@@ -243,8 +270,8 @@ class OllamaFailoverManager:
                result = self._build_quota_exceeded_route(health_gcp_a=health_gcp_a)
                # Quota 耗盡 Telegram 告警（24h dedup）
                try:
-                    from src.services.failover_alerter import get_failover_alerter
                    from src.core.redis_client import get_redis
+                    from src.services.failover_alerter import get_failover_alerter
                    _current_count = quota
                    try:
                        _redis = get_redis()
@@ -267,6 +294,9 @@ class OllamaFailoverManager:
        # 寫入 audit_log（best-effort）
        await self._write_failover_audit(result)

+        def _status(report: HealthReport | None) -> str:
+            return report.status.value if report else "not_checked"
+
        logger.info(
            "ollama_failover_decision",
            primary=result.primary.provider_name,
@@ -274,8 +304,8 @@ class OllamaFailoverManager:
            reason=result.routing_reason,
            fallback_count=len(result.fallback_chain),
            health_gcp_a=health_gcp_a.status.value,
-            health_gcp_b=health_gcp_b.status.value,
-            health_local=health_local.status.value,
+            health_gcp_b=_status(health_gcp_b),
+            health_local=_status(health_local),
        )

        # 通知 recovery service 當前 primary（跨重啟持久化）
@@ -589,8 +619,8 @@ class OllamaFailoverManager:
        # 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — 記錄 failover Prometheus metric
        try:
            from src.core.metrics import (
-                OLLAMA_FAILOVER_TRIGGERED_TOTAL,
                OLLAMA_CURRENT_PRIMARY_IS_OLLAMA,
+                OLLAMA_FAILOVER_TRIGGERED_TOTAL,
            )
            OLLAMA_FAILOVER_TRIGGERED_TOTAL.labels(
                from_provider="ollama",