fix(ollama): 修復容災鏈四大 bug — OFFLINE cache 放大 + SLOW 路由缺失 + recovery 命名不一致 + 告警顯示

根因：NetworkPolicy reload/CNI 瞬態抖動導致三台 Ollama 同時 OFFLINE，被 30s Redis cache 放大 → 後續 30s 所有請求誤走 Gemini，燒 quota

B1 ollama_health_monitor: OFFLINE TTL 從 30s 縮短至 5s，儘速重試
B3 ollama_health_monitor: inference ConnectError 改判 DEGRADED（connectivity 通了不算 OFFLINE）
B5/B6 ollama_auto_recovery: _current_primary 預設改 "ollama_gcp_a"，比對改 startswith("ollama_")
SLOW 修復: failover_manager SLOW 節點視為可用（優於 Gemini quota 耗盡）
SLOW 修復: auto_recovery SLOW 也計入 recovery counter（GCP 高負載仍可切回）
告警顯示: _provider_display 加入 GCP-A/B/Local 具體伺服器識別
告警顯示: _format_automation_block 加入 Token 用量行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 19:01:27 +08:00
parent f6b698c873
commit 855819652e
4 changed files with 90 additions and 14 deletions
--- a/apps/api/src/services/ollama_auto_recovery.py
+++ b/apps/api/src/services/ollama_auto_recovery.py
@@ -110,7 +110,11 @@ class OllamaAutoRecoveryService:
        self._settings = get_settings()
        # 狀態追蹤
-        self._current_primary: str = "ollama"   # "ollama" / "gemini" / "fallback"
+        # 2026-05-04 ogt: B5 修復 — 改用與 failover_manager callback 一致的命名
        # failover_manager 傳入的是 "ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"/"gemini"
        # 原設計用 "ollama" 造成 != "ollama" 永遠成立 → recovery loop 永遠以為在 Gemini
        # 修法：判斷改用 _is_ollama_primary()，比對 startswith("ollama_")
        self._current_primary: str = "ollama_gcp_a"   # failover_manager 傳入的 provider_name
        self._consecutive_healthy: int = 0
        self._task: asyncio.Task | None = None
@@ -213,8 +217,10 @@ class OllamaAutoRecoveryService:
            # Redis 持久化（跨重啟恢復）
            await self._persist_primary(provider)
-        if provider != "ollama":
+        # 2026-05-04 ogt: B5/B6 修復 — 判斷改用 startswith("ollama_")
-            # 切換到非 Ollama → 重置 counter，開始監控恢復
+        # 原設計：provider != "ollama"，但 callback 傳 "ollama_gcp_a" → 永遠觸發 tracking
        if not provider.startswith("ollama_"):
            # 切換到非 Ollama（gemini/nemotron/claude）→ 重置 counter，開始監控恢復
            self._consecutive_healthy = 0
            logger.info(
                "ollama_auto_recovery_tracking_started",
@@ -336,7 +342,11 @@ class OllamaAutoRecoveryService:
            self._consecutive_healthy = 0
            return
-        if health.status == HealthStatus.HEALTHY:
+        # 2026-05-04 ogt: SLOW 視為可用（GCP 高負載 10-30s 仍優於 Gemini quota 限制）
        # 原設計只接受 HEALTHY，GCP 若因負載偏高落入 SLOW 區會永久卡在 Gemini
        is_usable = health.status in (HealthStatus.HEALTHY, HealthStatus.SLOW)
        if is_usable:
            self._consecutive_healthy += 1
            logger.debug(
                "ollama_auto_recovery_healthy_tick",
@@ -344,15 +354,16 @@ class OllamaAutoRecoveryService:
                consecutive=self._consecutive_healthy,
                required=self._stable_count_required,
                current_primary=self._current_primary,
                actual_status=health.status.value,
            )
            if (
                self._consecutive_healthy >= self._stable_count_required
-                and self._current_primary != "ollama"
+                and not self._current_primary.startswith("ollama_")
            ):
                await self._switch_back_to_ollama()
        else:
-            # 非 HEALTHY → counter 歸零，繼續等
+            # DEGRADED / OFFLINE → counter 歸零，繼續等
            if self._consecutive_healthy > 0:
                logger.debug(
                    "ollama_auto_recovery_counter_reset",
--- a/apps/api/src/services/ollama_failover_manager.py
+++ b/apps/api/src/services/ollama_failover_manager.py
@@ -352,7 +352,42 @@ class OllamaFailoverManager:
                health_local=health_local,
            )
-        # 全部 Ollama 不健康 → Gemini
+        # 2026-05-04 ogt: SLOW 容災備援（外網同時抖動時，SLOW Ollama 仍優於 Gemini quota 耗盡）
        # 原設計：三層全部非 HEALTHY 直接切 Gemini
        # 問題：111 關機（Local OFFLINE）+ GCP 雙外網抖動（GCP-A/B 同時 SLOW）→ 無任何 HEALTHY 節點 → 誤飛 Gemini → 燒 quota
        # 修法：SLOW 節點視為可用，按優先序選最佳 SLOW 節點
        if health_gcp_a.status == HealthStatus.SLOW:
            return OllamaRoutingResult(
                primary=ep_gcp_a,
                fallback_chain=[ep_gcp_b, ep_local, _GEMINI_ENDPOINT],
                routing_reason=f"GCP-A SLOW（降級可用）→ primary GCP-A at {now_ts}",
                health_gcp_a=health_gcp_a,
                health_gcp_b=health_gcp_b,
                health_local=health_local,
            )
        if health_gcp_b.status == HealthStatus.SLOW:
            return OllamaRoutingResult(
                primary=ep_gcp_b,
                fallback_chain=[ep_local, _GEMINI_ENDPOINT],
                routing_reason=f"GCP-A {health_gcp_a.status.value} + GCP-B SLOW（降級可用）→ 切 GCP-B at {now_ts}",
                health_gcp_a=health_gcp_a,
                health_gcp_b=health_gcp_b,
                health_local=health_local,
            )
        if health_local.status == HealthStatus.SLOW:
            return OllamaRoutingResult(
                primary=ep_local,
                fallback_chain=[_GEMINI_ENDPOINT],
                routing_reason=(
                    f"GCP-A {health_gcp_a.status.value} + GCP-B {health_gcp_b.status.value}"
                    f" + Local SLOW（降級可用）→ 切 Local(111) at {now_ts}"
                ),
                health_gcp_a=health_gcp_a,
                health_gcp_b=health_gcp_b,
                health_local=health_local,
            )
        # 全部 Ollama 不可用（DEGRADED/OFFLINE）→ Gemini
        return OllamaRoutingResult(
            primary=_GEMINI_ENDPOINT,
            fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
--- a/apps/api/src/services/ollama_health_monitor.py
+++ b/apps/api/src/services/ollama_health_monitor.py
@@ -37,7 +37,11 @@ logger = structlog.get_logger(__name__)
 # =============================================================================
 REDIS_CACHE_KEY_PREFIX = "ollama_health:"
-REDIS_CACHE_TTL_SECONDS = 30  # 防 health check storm
+REDIS_CACHE_TTL_SECONDS = 30        # HEALTHY/SLOW/DEGRADED 快取 30s 防 check storm
 # 2026-05-04 ogt: B1 修復 — OFFLINE 快取 TTL 縮短至 5s
 # 根因：NetworkPolicy reload/CNI 瞬態抖動導致三台同時 OFFLINE，被 30s cache 放大
 # OFFLINE 狀態快取越短越好，讓系統儘快重新評估並切回
 REDIS_CACHE_TTL_OFFLINE_SECONDS = 5  # OFFLINE 只快取 5s，儘速重試
 CONNECTIVITY_TIMEOUT_SECONDS = 5.0
 # 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到
@@ -289,9 +293,13 @@ class OllamaHealthMonitor:
                reason="推理超時 >35s",
            )
        except (httpx.ConnectError, httpx.NetworkError) as e:
            # 2026-05-04 ogt: B3 修復 — connectivity 已通過，推理階段 ConnectError 改判 DEGRADED
            # 原設計：ConnectError → OFFLINE，但 /api/tags 已成功，表示主機存活
            # 根因：socket 半開（GCP LB 回收 idle conn）或 Ollama 進程重啟，屬瞬態
            # 注意：依 _set_cached，僅 OFFLINE 使用 5s 短快取；DEGRADED 與 HEALTHY/SLOW 同樣快取 30s
            # 因此改判 DEGRADED 只是避免節點被標記為 OFFLINE，並非「下次請求立刻重試」（待確認此結果是否寫入快取）
            return HealthReport(
-                status=HealthStatus.OFFLINE,
+                status=HealthStatus.DEGRADED,
-                reason=f"推理連接失敗：{e}",
+                reason=f"推理連接失敗（主機可達，socket 瞬斷）：{e}",
            )
        except Exception as e:
            logger.warning("ollama_inference_check_error", host=host, error=str(e))
@@ -329,12 +337,19 @@ class OllamaHealthMonitor:
            return None
    async def _set_cached(self, host: str, report: HealthReport) -> None:
-        """寫入 Redis 快取，失敗靜默（不影響功能）"""
+        """寫入 Redis 快取，失敗靜默（不影響功能）
        2026-05-04 ogt: OFFLINE 結果快取 5s（縮短），其他狀態快取 30s
        """
        try:
            from src.core.redis_client import get_redis
            redis = get_redis()
            data = json.dumps(report.to_dict())
-            await redis.set(self._cache_key(host), data, ex=REDIS_CACHE_TTL_SECONDS)
+            ttl = (
                REDIS_CACHE_TTL_OFFLINE_SECONDS
                if report.status == HealthStatus.OFFLINE
                else REDIS_CACHE_TTL_SECONDS
            )
            await redis.set(self._cache_key(host), data, ex=ttl)
        except Exception as e:
            logger.debug("ollama_health_cache_set_failed", host=host, error=str(e))
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -218,9 +218,15 @@ class TelegramMessage:
    nemotron_latency_ms: float = 0.0  # Nemotron 呼叫延遲 (ms)
    def _provider_display(self) -> tuple[str, str]:
-        """Return display provider and optional model suffix."""
+        """Return display provider and optional model suffix.
        2026-05-04 ogt: 加入具體 Ollama 伺服器顯示（GCP-A/B/Local）
        """
        provider_names = {
            "ollama": "Ollama",
            # 2026-05-04 ogt: ADR-110 三層容災具體伺服器識別
            "ollama_gcp_a": "Ollama GCP-A (34.143.170.20)",
            "ollama_gcp_b": "Ollama GCP-B (34.21.145.224)",
            "ollama_local": "Ollama Local (111)",
            "gemini": "Gemini",
            "claude": "Claude",
            "nvidia": "Nemotron",
@@ -249,7 +255,9 @@ class TelegramMessage:
        return "safe_gate_pending"
    def _format_automation_block(self) -> str:
-        """Visible AI automation chain for every ACTION REQUIRED card."""
+        """Visible AI automation chain for every ACTION REQUIRED card.
        2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
        """
        provider_display, model_suffix = self._provider_display()
        mode = self._automation_mode()
        openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded"
@@ -258,6 +266,12 @@ class TelegramMessage:
        elephant_state = "timeline_km_pending"
        flow = "webhook&gt;investigator&gt;router&gt;llm/rule&gt;safe&gt;approval"
        # 2026-05-04 ogt: Token 用量顯示（有資料才顯示）
        token_line = ""
        if self.ai_tokens > 0:
            cost_str = f" / ${self.ai_cost:.4f}" if self.ai_cost > 0 else ""
            token_line = f"├ Tokens：<code>{self.ai_tokens:,}{cost_str}</code>\n"
        return (
            f"🤖 <b>AI 自動化鏈路</b>\n"
            f"├ Router：<code>{html.escape(provider_display)}{model_suffix}</code>\n"
@@ -266,6 +280,7 @@ class TelegramMessage:
            f"NemoTron：<code>{html.escape(nemotron_state)}</code>\n"
            f"├ Hermes：<code>{html.escape(hermes_state)}</code> | "
            f"ElephantAlpha：<code>{html.escape(elephant_state)}</code>\n"
            f"{token_line}"
            f"└ Flow：<code>{flow}</code>\n"
        )