From 855819652ec8a98c3c24ee65e9f2299ce79d3b4b Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Mon, 4 May 2026 19:01:27 +0800
Subject: [PATCH] =?UTF-8?q?fix(ollama):=20=E4=BF=AE=E5=BE=A9=E5=AE=B9?=
 =?UTF-8?q?=E7=81=BD=E9=8F=88=E5=9B=9B=E5=A4=A7=20bug=20=E2=80=94=20OFFLIN?=
 =?UTF-8?q?E=20cache=20=E6=94=BE=E5=A4=A7=20+=20SLOW=20=E8=B7=AF=E7=94=B1?=
 =?UTF-8?q?=E7=BC=BA=E5=A4=B1=20+=20recovery=20=E5=91=BD=E5=90=8D=E4=B8=8D?=
 =?UTF-8?q?=E4=B8=80=E8=87=B4=20+=20=E5=91=8A=E8=AD=A6=E9=A1=AF=E7=A4=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

根因：NetworkPolicy reload/CNI 瞬態抖動導致三台 Ollama 同時 OFFLINE，被 30s Redis cache 放大
  → 後續 30s 所有請求誤走 Gemini，燒 quota

B1 ollama_health_monitor: OFFLINE TTL 從 30s 縮短至 5s，儘速重試
B3 ollama_health_monitor: inference ConnectError 改判 DEGRADED（connectivity 通了不算 OFFLINE）
B5/B6 ollama_auto_recovery: _current_primary 預設改 "ollama_gcp_a"，比對改 startswith("ollama_")
SLOW 修復: failover_manager SLOW 節點視為可用（優於 Gemini quota 耗盡）
SLOW 修復: auto_recovery SLOW 也計入 recovery counter（GCP 高負載仍可切回）
告警顯示: _provider_display 加入 GCP-A/B/Local 具體伺服器識別
告警顯示: _format_automation_block 加入 Token 用量行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/api/src/services/ollama_auto_recovery.py | 23 +++++++++---
 .../src/services/ollama_failover_manager.py   | 37 ++++++++++++++++++-
 .../api/src/services/ollama_health_monitor.py | 25 ++++++++++---
 apps/api/src/services/telegram_gateway.py     | 19 +++++++++-
 4 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/apps/api/src/services/ollama_auto_recovery.py b/apps/api/src/services/ollama_auto_recovery.py
index d4b218a3..e46fadc5 100644
--- a/apps/api/src/services/ollama_auto_recovery.py
+++ b/apps/api/src/services/ollama_auto_recovery.py
@@ -110,7 +110,11 @@ class OllamaAutoRecoveryService:
         self._settings = get_settings()
 
         # 狀態追蹤
-        self._current_primary: str = "ollama"   # "ollama" / "gemini" / "fallback"
+        # 2026-05-04 ogt: B5 修復 — 改用與 failover_manager callback 一致的命名
+        # failover_manager 傳入的是 "ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"/"gemini"
+        # 原設計用 "ollama" 造成 != "ollama" 永遠成立 → recovery loop 永遠以為在 Gemini
+        # 修法：判斷改為直接以 startswith("ollama_") 比對（不再與固定字串 "ollama" 比較）
+        self._current_primary: str = "ollama_gcp_a"   # failover_manager 傳入的 provider_name
         self._consecutive_healthy: int = 0
         self._task: asyncio.Task | None = None
 
@@ -213,8 +217,10 @@ class OllamaAutoRecoveryService:
             # Redis 持久化（跨重啟恢復）
             await self._persist_primary(provider)
 
-        if provider != "ollama":
-            # 切換到非 Ollama → 重置 counter，開始監控恢復
+        # 2026-05-04 ogt: B5/B6 修復 — 判斷改用 startswith("ollama_")
+        # 原設計：provider != "ollama"，但 callback 傳 "ollama_gcp_a" → 永遠觸發 tracking
+        if not provider.startswith("ollama_"):
+            # 切換到非 Ollama（gemini/nemotron/claude）→ 重置 counter，開始監控恢復
             self._consecutive_healthy = 0
             logger.info(
                 "ollama_auto_recovery_tracking_started",
@@ -336,7 +342,11 @@ class OllamaAutoRecoveryService:
             self._consecutive_healthy = 0
             return
 
-        if health.status == HealthStatus.HEALTHY:
+        # 2026-05-04 ogt: SLOW 視為可用（GCP 高負載 10-30s 仍優於 Gemini quota 限制）
+        # 原設計只接受 HEALTHY，GCP 若因負載偏高落入 SLOW 區會永久卡在 Gemini
+        is_usable = health.status in (HealthStatus.HEALTHY, HealthStatus.SLOW)
+
+        if is_usable:
             self._consecutive_healthy += 1
             logger.debug(
                 "ollama_auto_recovery_healthy_tick",
@@ -344,15 +354,16 @@ class OllamaAutoRecoveryService:
                 consecutive=self._consecutive_healthy,
                 required=self._stable_count_required,
                 current_primary=self._current_primary,
+                actual_status=health.status.value,
             )
 
             if (
                 self._consecutive_healthy >= self._stable_count_required
-                and self._current_primary != "ollama"
+                and not self._current_primary.startswith("ollama_")
             ):
                 await self._switch_back_to_ollama()
         else:
-            # 非 HEALTHY → counter 歸零，繼續等
+            # DEGRADED / OFFLINE → counter 歸零，繼續等
             if self._consecutive_healthy > 0:
                 logger.debug(
                     "ollama_auto_recovery_counter_reset",
diff --git a/apps/api/src/services/ollama_failover_manager.py b/apps/api/src/services/ollama_failover_manager.py
index dc0e7c86..fbd00ab1 100644
--- a/apps/api/src/services/ollama_failover_manager.py
+++ b/apps/api/src/services/ollama_failover_manager.py
@@ -352,7 +352,42 @@ class OllamaFailoverManager:
                 health_local=health_local,
             )
 
-        # 全部 Ollama 不健康 → Gemini
+        # 2026-05-04 ogt: SLOW 容災備援（外網同時抖動時，SLOW Ollama 仍優於 Gemini quota 耗盡）
+        # 原設計：三層全部非 HEALTHY 直接切 Gemini
+        # 問題：111 關機 + GCP 雙外網抖動 → 三節點同時非 HEALTHY（GCP 雙 SLOW、Local OFFLINE）→ 誤飛 Gemini → 燒 quota
+        # 修法：SLOW 節點視為可用，按優先序選最佳 SLOW 節點
+        if health_gcp_a.status == HealthStatus.SLOW:
+            return OllamaRoutingResult(
+                primary=ep_gcp_a,
+                fallback_chain=[ep_gcp_b, ep_local, _GEMINI_ENDPOINT],
+                routing_reason=f"GCP-A SLOW（降級可用）→ primary GCP-A at {now_ts}",
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=health_gcp_b,
+                health_local=health_local,
+            )
+        if health_gcp_b.status == HealthStatus.SLOW:
+            return OllamaRoutingResult(
+                primary=ep_gcp_b,
+                fallback_chain=[ep_local, _GEMINI_ENDPOINT],
+                routing_reason=f"GCP-A {health_gcp_a.status.value} + GCP-B SLOW（降級可用）→ 切 GCP-B at {now_ts}",
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=health_gcp_b,
+                health_local=health_local,
+            )
+        if health_local.status == HealthStatus.SLOW:
+            return OllamaRoutingResult(
+                primary=ep_local,
+                fallback_chain=[_GEMINI_ENDPOINT],
+                routing_reason=(
+                    f"GCP-A {health_gcp_a.status.value} + GCP-B {health_gcp_b.status.value}"
+                    f" + Local SLOW（降級可用）→ 切 Local(111) at {now_ts}"
+                ),
+                health_gcp_a=health_gcp_a,
+                health_gcp_b=health_gcp_b,
+                health_local=health_local,
+            )
+
+        # 全部 Ollama 不可用（DEGRADED/OFFLINE）→ Gemini
         return OllamaRoutingResult(
             primary=_GEMINI_ENDPOINT,
             fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
diff --git a/apps/api/src/services/ollama_health_monitor.py b/apps/api/src/services/ollama_health_monitor.py
index ce52ae82..e7e3e182 100644
--- a/apps/api/src/services/ollama_health_monitor.py
+++ b/apps/api/src/services/ollama_health_monitor.py
@@ -37,7 +37,11 @@ logger = structlog.get_logger(__name__)
 # =============================================================================
 
 REDIS_CACHE_KEY_PREFIX = "ollama_health:"
-REDIS_CACHE_TTL_SECONDS = 30  # 防 health check storm
+REDIS_CACHE_TTL_SECONDS = 30        # HEALTHY/SLOW/DEGRADED 快取 30s 防 check storm
+# 2026-05-04 ogt: B1 修復 — OFFLINE 快取 TTL 縮短至 5s
+# 根因：NetworkPolicy reload/CNI 瞬態抖動導致三台同時 OFFLINE，被 30s cache 放大
+# OFFLINE 狀態快取越短越好，讓系統儘快重新評估並切回
+REDIS_CACHE_TTL_OFFLINE_SECONDS = 5  # OFFLINE 只快取 5s，儘速重試
 
 CONNECTIVITY_TIMEOUT_SECONDS = 5.0
 # 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到
@@ -289,9 +293,13 @@ class OllamaHealthMonitor:
                 reason="推理超時 >35s",
             )
         except (httpx.ConnectError, httpx.NetworkError) as e:
+            # 2026-05-04 ogt: B3 修復 — connectivity 已通過，推理階段 ConnectError 改判 DEGRADED
+            # 原設計：ConnectError → OFFLINE，但 /api/tags 已成功，表示主機存活
+            # 根因：socket 半開（GCP LB 回收 idle conn）或 Ollama 進程重啟，屬瞬態
+            # DEGRADED 不套用 OFFLINE 判定；注意 _set_cached 對非 OFFLINE 狀態使用 30s TTL（比 OFFLINE 的 5s 更久），能否「下次請求立刻重試」待確認
             return HealthReport(
-                status=HealthStatus.OFFLINE,
-                reason=f"推理連接失敗：{e}",
+                status=HealthStatus.DEGRADED,
+                reason=f"推理連接失敗（主機可達，socket 瞬斷）：{e}",
             )
         except Exception as e:
             logger.warning("ollama_inference_check_error", host=host, error=str(e))
@@ -329,12 +337,19 @@ class OllamaHealthMonitor:
             return None
 
     async def _set_cached(self, host: str, report: HealthReport) -> None:
-        """寫入 Redis 快取，失敗靜默（不影響功能）"""
+        """寫入 Redis 快取，失敗靜默（不影響功能）
+        2026-05-04 ogt: OFFLINE 結果快取 5s（縮短），其他狀態快取 30s
+        """
         try:
             from src.core.redis_client import get_redis
             redis = get_redis()
             data = json.dumps(report.to_dict())
-            await redis.set(self._cache_key(host), data, ex=REDIS_CACHE_TTL_SECONDS)
+            ttl = (
+                REDIS_CACHE_TTL_OFFLINE_SECONDS
+                if report.status == HealthStatus.OFFLINE
+                else REDIS_CACHE_TTL_SECONDS
+            )
+            await redis.set(self._cache_key(host), data, ex=ttl)
         except Exception as e:
             logger.debug("ollama_health_cache_set_failed", host=host, error=str(e))
 
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index bcb6ba98..2510edbb 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -218,9 +218,15 @@ class TelegramMessage:
     nemotron_latency_ms: float = 0.0  # Nemotron 呼叫延遲 (ms)
 
     def _provider_display(self) -> tuple[str, str]:
-        """Return display provider and optional model suffix."""
+        """Return display provider and optional model suffix.
+        2026-05-04 ogt: 加入具體 Ollama 伺服器顯示（GCP-A/B/Local）
+        """
         provider_names = {
             "ollama": "Ollama",
+            # 2026-05-04 ogt: ADR-110 三層容災具體伺服器識別
+            "ollama_gcp_a": "Ollama GCP-A (34.143.170.20)",
+            "ollama_gcp_b": "Ollama GCP-B (34.21.145.224)",
+            "ollama_local": "Ollama Local (111)",
             "gemini": "Gemini",
             "claude": "Claude",
             "nvidia": "Nemotron",
@@ -249,7 +255,9 @@ class TelegramMessage:
         return "safe_gate_pending"
 
     def _format_automation_block(self) -> str:
-        """Visible AI automation chain for every ACTION REQUIRED card."""
+        """Visible AI automation chain for every ACTION REQUIRED card.
+        2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
+        """
         provider_display, model_suffix = self._provider_display()
         mode = self._automation_mode()
         openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded"
@@ -258,6 +266,12 @@ class TelegramMessage:
         elephant_state = "timeline_km_pending"
         flow = "webhook&gt;investigator&gt;router&gt;llm/rule&gt;safe&gt;approval"
 
+        # 2026-05-04 ogt: Token 用量顯示（有資料才顯示）
+        token_line = ""
+        if self.ai_tokens > 0:
+            cost_str = f" / ${self.ai_cost:.4f}" if self.ai_cost > 0 else ""
+            token_line = f"├ Tokens：<code>{self.ai_tokens:,}{cost_str}</code>\n"
+
         return (
             f"🤖 <b>AI 自動化鏈路</b>\n"
             f"├ Router：<code>{html.escape(provider_display)}{model_suffix}</code>\n"
@@ -266,6 +280,7 @@ class TelegramMessage:
             f"NemoTron：<code>{html.escape(nemotron_state)}</code>\n"
             f"├ Hermes：<code>{html.escape(hermes_state)}</code> | "
             f"ElephantAlpha：<code>{html.escape(elephant_state)}</code>\n"
+            f"{token_line}"
             f"└ Flow：<code>{flow}</code>\n"
         )