From 855819652ec8a98c3c24ee65e9f2299ce79d3b4b Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 4 May 2026 19:01:27 +0800 Subject: [PATCH] =?UTF-8?q?fix(ollama):=20=E4=BF=AE=E5=BE=A9=E5=AE=B9?= =?UTF-8?q?=E7=81=BD=E9=8F=88=E5=9B=9B=E5=A4=A7=20bug=20=E2=80=94=20OFFLIN?= =?UTF-8?q?E=20cache=20=E6=94=BE=E5=A4=A7=20+=20SLOW=20=E8=B7=AF=E7=94=B1?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=20+=20recovery=20=E5=91=BD=E5=90=8D=E4=B8=8D?= =?UTF-8?q?=E4=B8=80=E8=87=B4=20+=20=E5=91=8A=E8=AD=A6=E9=A1=AF=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因:NetworkPolicy reload/CNI 瞬態抖動導致三台 Ollama 同時 OFFLINE,被 30s Redis cache 放大 → 後續 30s 所有請求誤走 Gemini,燒 quota B1 ollama_health_monitor: OFFLINE TTL 從 30s 縮短至 5s,儘速重試 B3 ollama_health_monitor: inference ConnectError 改判 DEGRADED(connectivity 通了不算 OFFLINE) B5/B6 ollama_auto_recovery: _current_primary 預設改 "ollama_gcp_a",比對改 startswith("ollama_") SLOW 修復: failover_manager SLOW 節點視為可用(優於 Gemini quota 耗盡) SLOW 修復: auto_recovery SLOW 也計入 recovery counter(GCP 高負載仍可切回) 告警顯示: _provider_display 加入 GCP-A/B/Local 具體伺服器識別 告警顯示: _format_automation_block 加入 Token 用量行 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/ollama_auto_recovery.py | 23 +++++++++--- .../src/services/ollama_failover_manager.py | 37 ++++++++++++++++++- .../api/src/services/ollama_health_monitor.py | 25 ++++++++++--- apps/api/src/services/telegram_gateway.py | 19 +++++++++- 4 files changed, 90 insertions(+), 14 deletions(-) diff --git a/apps/api/src/services/ollama_auto_recovery.py b/apps/api/src/services/ollama_auto_recovery.py index d4b218a3..e46fadc5 100644 --- a/apps/api/src/services/ollama_auto_recovery.py +++ b/apps/api/src/services/ollama_auto_recovery.py @@ -110,7 +110,11 @@ class OllamaAutoRecoveryService: self._settings = get_settings() # 狀態追蹤 - self._current_primary: str = "ollama" # "ollama" / "gemini" / "fallback" + # 2026-05-04 ogt: B5 修復 — 改用與 failover_manager callback 一致的命名 + # failover_manager 傳入的是 
"ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"/"gemini" + # 原設計用 "ollama" 造成 != "ollama" 永遠成立 → recovery loop 永遠以為在 Gemini + # 修法:判斷改用 startswith("ollama_") 前綴比對 + self._current_primary: str = "ollama_gcp_a" # failover_manager 傳入的 provider_name self._consecutive_healthy: int = 0 self._task: asyncio.Task | None = None @@ -213,8 +217,10 @@ class OllamaAutoRecoveryService: # Redis 持久化(跨重啟恢復) await self._persist_primary(provider) - if provider != "ollama": - # 切換到非 Ollama → 重置 counter,開始監控恢復 + # 2026-05-04 ogt: B5/B6 修復 — 判斷改用 startswith("ollama_") + # 原設計:provider != "ollama",但 callback 傳 "ollama_gcp_a" → 永遠觸發 tracking + if not provider.startswith("ollama_"): + # 切換到非 Ollama(gemini/nemotron/claude)→ 重置 counter,開始監控恢復 self._consecutive_healthy = 0 logger.info( "ollama_auto_recovery_tracking_started", @@ -336,7 +342,11 @@ class OllamaAutoRecoveryService: self._consecutive_healthy = 0 return - if health.status == HealthStatus.HEALTHY: + # 2026-05-04 ogt: SLOW 視為可用(GCP 高負載 10-30s 仍優於 Gemini quota 限制) + # 原設計只接受 HEALTHY,GCP 若因負載偏高落入 SLOW 區會永久卡在 Gemini + is_usable = health.status in (HealthStatus.HEALTHY, HealthStatus.SLOW) + + if is_usable: self._consecutive_healthy += 1 logger.debug( "ollama_auto_recovery_healthy_tick", @@ -344,15 +354,16 @@ class OllamaAutoRecoveryService: consecutive=self._consecutive_healthy, required=self._stable_count_required, current_primary=self._current_primary, + actual_status=health.status.value, ) if ( self._consecutive_healthy >= self._stable_count_required - and self._current_primary != "ollama" + and not self._current_primary.startswith("ollama_") ): await self._switch_back_to_ollama() else: - # 非 HEALTHY → counter 歸零,繼續等 + # DEGRADED / OFFLINE → counter 歸零,繼續等 if self._consecutive_healthy > 0: logger.debug( "ollama_auto_recovery_counter_reset", diff --git a/apps/api/src/services/ollama_failover_manager.py b/apps/api/src/services/ollama_failover_manager.py index dc0e7c86..fbd00ab1 100644 --- 
a/apps/api/src/services/ollama_failover_manager.py +++ b/apps/api/src/services/ollama_failover_manager.py @@ -352,7 +352,42 @@ class OllamaFailoverManager: health_local=health_local, ) - # 全部 Ollama 不健康 → Gemini + # 2026-05-04 ogt: SLOW 容災備援(外網同時抖動時,SLOW Ollama 仍優於 Gemini quota 耗盡) + # 原設計:三層全部非 HEALTHY 直接切 Gemini + # 問題:111 關機 + GCP 雙外網抖動 → 三節點同時 SLOW → 誤飛 Gemini → 燒 quota + # 修法:SLOW 節點視為可用,按優先序選最佳 SLOW 節點 + if health_gcp_a.status == HealthStatus.SLOW: + return OllamaRoutingResult( + primary=ep_gcp_a, + fallback_chain=[ep_gcp_b, ep_local, _GEMINI_ENDPOINT], + routing_reason=f"GCP-A SLOW(降級可用)→ primary GCP-A at {now_ts}", + health_gcp_a=health_gcp_a, + health_gcp_b=health_gcp_b, + health_local=health_local, + ) + if health_gcp_b.status == HealthStatus.SLOW: + return OllamaRoutingResult( + primary=ep_gcp_b, + fallback_chain=[ep_local, _GEMINI_ENDPOINT], + routing_reason=f"GCP-A {health_gcp_a.status.value} + GCP-B SLOW(降級可用)→ 切 GCP-B at {now_ts}", + health_gcp_a=health_gcp_a, + health_gcp_b=health_gcp_b, + health_local=health_local, + ) + if health_local.status == HealthStatus.SLOW: + return OllamaRoutingResult( + primary=ep_local, + fallback_chain=[_GEMINI_ENDPOINT], + routing_reason=( + f"GCP-A {health_gcp_a.status.value} + GCP-B {health_gcp_b.status.value}" + f" + Local SLOW(降級可用)→ 切 Local(111) at {now_ts}" + ), + health_gcp_a=health_gcp_a, + health_gcp_b=health_gcp_b, + health_local=health_local, + ) + + # 全部 Ollama 不可用(DEGRADED/OFFLINE)→ Gemini return OllamaRoutingResult( primary=_GEMINI_ENDPOINT, fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT], diff --git a/apps/api/src/services/ollama_health_monitor.py b/apps/api/src/services/ollama_health_monitor.py index ce52ae82..e7e3e182 100644 --- a/apps/api/src/services/ollama_health_monitor.py +++ b/apps/api/src/services/ollama_health_monitor.py @@ -37,7 +37,11 @@ logger = structlog.get_logger(__name__) # ============================================================================= REDIS_CACHE_KEY_PREFIX = 
"ollama_health:" -REDIS_CACHE_TTL_SECONDS = 30 # 防 health check storm +REDIS_CACHE_TTL_SECONDS = 30 # HEALTHY/SLOW/DEGRADED 快取 30s 防 check storm +# 2026-05-04 ogt: B1 修復 — OFFLINE 快取 TTL 縮短至 5s +# 根因:NetworkPolicy reload/CNI 瞬態抖動導致三台同時 OFFLINE,被 30s cache 放大 +# OFFLINE 狀態快取越短越好,讓系統儘快重新評估並切回 +REDIS_CACHE_TTL_OFFLINE_SECONDS = 5 # OFFLINE 只快取 5s,儘速重試 CONNECTIVITY_TIMEOUT_SECONDS = 5.0 # 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到 @@ -289,9 +293,13 @@ class OllamaHealthMonitor: reason="推理超時 >35s", ) except (httpx.ConnectError, httpx.NetworkError) as e: + # 2026-05-04 ogt: B3 修復 — connectivity 已通過,推理階段 ConnectError 改判 DEGRADED + # 原設計:ConnectError → OFFLINE,但 /api/tags 已成功,表示主機存活 + # 根因:socket 半開(GCP LB 回收 idle conn)或 Ollama 進程重啟,屬瞬態 + # NOTE(review): DEGRADED avoids the OFFLINE verdict, but _set_cached stores every non-OFFLINE status for REDIS_CACHE_TTL_SECONDS (30s; only OFFLINE gets 5s), so a cached DEGRADED result is NOT retried immediately — confirm this is intended return HealthReport( - status=HealthStatus.OFFLINE, - reason=f"推理連接失敗:{e}", + status=HealthStatus.DEGRADED, + reason=f"推理連接失敗(主機可達,socket 瞬斷):{e}", ) except Exception as e: logger.warning("ollama_inference_check_error", host=host, error=str(e)) @@ -329,12 +337,19 @@ class OllamaHealthMonitor: return None async def _set_cached(self, host: str, report: HealthReport) -> None: - """寫入 Redis 快取,失敗靜默(不影響功能)""" + """寫入 Redis 快取,失敗靜默(不影響功能) + 2026-05-04 ogt: OFFLINE 結果快取 5s(縮短),其他狀態快取 30s + """ try: from src.core.redis_client import get_redis redis = get_redis() data = json.dumps(report.to_dict()) - await redis.set(self._cache_key(host), data, ex=REDIS_CACHE_TTL_SECONDS) + ttl = ( + REDIS_CACHE_TTL_OFFLINE_SECONDS + if report.status == HealthStatus.OFFLINE + else REDIS_CACHE_TTL_SECONDS + ) + await redis.set(self._cache_key(host), data, ex=ttl) except Exception as e: logger.debug("ollama_health_cache_set_failed", host=host, error=str(e)) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index bcb6ba98..2510edbb 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -218,9 
+218,15 @@ class TelegramMessage: nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms) def _provider_display(self) -> tuple[str, str]: - """Return display provider and optional model suffix.""" + """Return display provider and optional model suffix. + 2026-05-04 ogt: 加入具體 Ollama 伺服器顯示(GCP-A/B/Local) + """ provider_names = { "ollama": "Ollama", + # 2026-05-04 ogt: ADR-110 三層容災具體伺服器識別 + "ollama_gcp_a": "Ollama GCP-A (34.143.170.20)", + "ollama_gcp_b": "Ollama GCP-B (34.21.145.224)", + "ollama_local": "Ollama Local (111)", "gemini": "Gemini", "claude": "Claude", "nvidia": "Nemotron", @@ -249,7 +255,9 @@ class TelegramMessage: return "safe_gate_pending" def _format_automation_block(self) -> str: - """Visible AI automation chain for every ACTION REQUIRED card.""" + """Visible AI automation chain for every ACTION REQUIRED card. + 2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示 + """ provider_display, model_suffix = self._provider_display() mode = self._automation_mode() openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded" @@ -258,6 +266,12 @@ class TelegramMessage: elephant_state = "timeline_km_pending" flow = "webhook>investigator>router>llm/rule>safe>approval" + # 2026-05-04 ogt: Token 用量顯示(有資料才顯示) + token_line = "" + if self.ai_tokens > 0: + cost_str = f" / ${self.ai_cost:.4f}" if self.ai_cost > 0 else "" + token_line = f"├ Tokens:{self.ai_tokens:,}{cost_str}\n" + return ( f"🤖 AI 自動化鏈路\n" f"├ Router:{html.escape(provider_display)}{model_suffix}\n" @@ -266,6 +280,7 @@ class TelegramMessage: f"NemoTron:{html.escape(nemotron_state)}\n" f"├ Hermes:{html.escape(hermes_state)} | " f"ElephantAlpha:{html.escape(elephant_state)}\n" + f"{token_line}" f"└ Flow:{flow}\n" )