diff --git a/apps/api/src/services/ollama_auto_recovery.py b/apps/api/src/services/ollama_auto_recovery.py
index d4b218a3..e46fadc5 100644
--- a/apps/api/src/services/ollama_auto_recovery.py
+++ b/apps/api/src/services/ollama_auto_recovery.py
@@ -110,7 +110,11 @@ class OllamaAutoRecoveryService:
self._settings = get_settings()
# 狀態追蹤
- self._current_primary: str = "ollama" # "ollama" / "gemini" / "fallback"
+ # 2026-05-04 ogt: B5 修復 — 改用與 failover_manager callback 一致的命名
+ # failover_manager 傳入的是 "ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"/"gemini"
+ # 原設計用 "ollama" 造成 != "ollama" 永遠成立 → recovery loop 永遠以為在 Gemini
+ # 修法:判斷改為直接以 startswith("ollama_") 比對 provider 名稱(涵蓋 ollama_gcp_a/b、ollama_local)
+ self._current_primary: str = "ollama_gcp_a" # failover_manager 傳入的 provider_name
self._consecutive_healthy: int = 0
self._task: asyncio.Task | None = None
@@ -213,8 +217,10 @@ class OllamaAutoRecoveryService:
# Redis 持久化(跨重啟恢復)
await self._persist_primary(provider)
- if provider != "ollama":
- # 切換到非 Ollama → 重置 counter,開始監控恢復
+ # 2026-05-04 ogt: B5/B6 修復 — 判斷改用 startswith("ollama_")
+ # 原設計:provider != "ollama",但 callback 傳 "ollama_gcp_a" → 永遠觸發 tracking
+ if not provider.startswith("ollama_"):
+ # 切換到非 Ollama(gemini/nemotron/claude)→ 重置 counter,開始監控恢復
self._consecutive_healthy = 0
logger.info(
"ollama_auto_recovery_tracking_started",
@@ -336,7 +342,11 @@ class OllamaAutoRecoveryService:
self._consecutive_healthy = 0
return
- if health.status == HealthStatus.HEALTHY:
+ # 2026-05-04 ogt: SLOW 視為可用(GCP 高負載 10-30s 仍優於 Gemini quota 限制)
+ # 原設計只接受 HEALTHY,GCP 若因負載偏高落入 SLOW 區會永久卡在 Gemini
+ is_usable = health.status in (HealthStatus.HEALTHY, HealthStatus.SLOW)
+
+ if is_usable:
self._consecutive_healthy += 1
logger.debug(
"ollama_auto_recovery_healthy_tick",
@@ -344,15 +354,16 @@ class OllamaAutoRecoveryService:
consecutive=self._consecutive_healthy,
required=self._stable_count_required,
current_primary=self._current_primary,
+ actual_status=health.status.value,
)
if (
self._consecutive_healthy >= self._stable_count_required
- and self._current_primary != "ollama"
+ and not self._current_primary.startswith("ollama_")
):
await self._switch_back_to_ollama()
else:
- # 非 HEALTHY → counter 歸零,繼續等
+ # DEGRADED / OFFLINE → counter 歸零,繼續等
if self._consecutive_healthy > 0:
logger.debug(
"ollama_auto_recovery_counter_reset",
diff --git a/apps/api/src/services/ollama_failover_manager.py b/apps/api/src/services/ollama_failover_manager.py
index dc0e7c86..fbd00ab1 100644
--- a/apps/api/src/services/ollama_failover_manager.py
+++ b/apps/api/src/services/ollama_failover_manager.py
@@ -352,7 +352,42 @@ class OllamaFailoverManager:
health_local=health_local,
)
- # 全部 Ollama 不健康 → Gemini
+ # 2026-05-04 ogt: SLOW 容災備援(外網同時抖動時,SLOW Ollama 仍優於 Gemini quota 耗盡)
+ # 原設計:三層全部非 HEALTHY 直接切 Gemini
+ # 問題:111 關機(OFFLINE)+ GCP 雙外網抖動(SLOW)→ 三節點皆非 HEALTHY → 誤飛 Gemini → 燒 quota
+ # 修法:SLOW 節點視為可用,按優先序選最佳 SLOW 節點
+ if health_gcp_a.status == HealthStatus.SLOW:
+ return OllamaRoutingResult(
+ primary=ep_gcp_a,
+ fallback_chain=[ep_gcp_b, ep_local, _GEMINI_ENDPOINT],
+ routing_reason=f"GCP-A SLOW(降級可用)→ primary GCP-A at {now_ts}",
+ health_gcp_a=health_gcp_a,
+ health_gcp_b=health_gcp_b,
+ health_local=health_local,
+ )
+ if health_gcp_b.status == HealthStatus.SLOW:
+ return OllamaRoutingResult(
+ primary=ep_gcp_b,
+ fallback_chain=[ep_local, _GEMINI_ENDPOINT],
+ routing_reason=f"GCP-A {health_gcp_a.status.value} + GCP-B SLOW(降級可用)→ 切 GCP-B at {now_ts}",
+ health_gcp_a=health_gcp_a,
+ health_gcp_b=health_gcp_b,
+ health_local=health_local,
+ )
+ if health_local.status == HealthStatus.SLOW:
+ return OllamaRoutingResult(
+ primary=ep_local,
+ fallback_chain=[_GEMINI_ENDPOINT],
+ routing_reason=(
+ f"GCP-A {health_gcp_a.status.value} + GCP-B {health_gcp_b.status.value}"
+ f" + Local SLOW(降級可用)→ 切 Local(111) at {now_ts}"
+ ),
+ health_gcp_a=health_gcp_a,
+ health_gcp_b=health_gcp_b,
+ health_local=health_local,
+ )
+
+ # 全部 Ollama 不可用(DEGRADED/OFFLINE)→ Gemini
return OllamaRoutingResult(
primary=_GEMINI_ENDPOINT,
fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
diff --git a/apps/api/src/services/ollama_health_monitor.py b/apps/api/src/services/ollama_health_monitor.py
index ce52ae82..e7e3e182 100644
--- a/apps/api/src/services/ollama_health_monitor.py
+++ b/apps/api/src/services/ollama_health_monitor.py
@@ -37,7 +37,11 @@ logger = structlog.get_logger(__name__)
# =============================================================================
REDIS_CACHE_KEY_PREFIX = "ollama_health:"
-REDIS_CACHE_TTL_SECONDS = 30 # 防 health check storm
+REDIS_CACHE_TTL_SECONDS = 30 # HEALTHY/SLOW/DEGRADED 快取 30s 防 check storm
+# 2026-05-04 ogt: B1 修復 — OFFLINE 快取 TTL 縮短至 5s
+# 根因:NetworkPolicy reload/CNI 瞬態抖動導致三台同時 OFFLINE,被 30s cache 放大
+# OFFLINE 狀態快取越短越好,讓系統儘快重新評估並切回
+REDIS_CACHE_TTL_OFFLINE_SECONDS = 5 # OFFLINE 只快取 5s,儘速重試
CONNECTIVITY_TIMEOUT_SECONDS = 5.0
# 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到
@@ -289,9 +293,13 @@ class OllamaHealthMonitor:
reason="推理超時 >35s",
)
except (httpx.ConnectError, httpx.NetworkError) as e:
+ # 2026-05-04 ogt: B3 修復 — connectivity 已通過,推理階段 ConnectError 改判 DEGRADED
+ # 原設計:ConnectError → OFFLINE,但 /api/tags 已成功,表示主機存活
+ # 根因:socket 半開(GCP LB 回收 idle conn)或 Ollama 進程重啟,屬瞬態
+ # 注意:DEGRADED 與 HEALTHY/SLOW 同樣快取 30s(僅 OFFLINE 縮短為 5s),並非立刻重試 — TODO 確認瞬斷是否需更短 TTL
return HealthReport(
- status=HealthStatus.OFFLINE,
- reason=f"推理連接失敗:{e}",
+ status=HealthStatus.DEGRADED,
+ reason=f"推理連接失敗(主機可達,socket 瞬斷):{e}",
)
except Exception as e:
logger.warning("ollama_inference_check_error", host=host, error=str(e))
@@ -329,12 +337,19 @@ class OllamaHealthMonitor:
return None
async def _set_cached(self, host: str, report: HealthReport) -> None:
- """寫入 Redis 快取,失敗靜默(不影響功能)"""
+ """寫入 Redis 快取,失敗靜默(不影響功能)
+ 2026-05-04 ogt: OFFLINE 結果快取 5s(縮短),其他狀態快取 30s
+ """
try:
from src.core.redis_client import get_redis
redis = get_redis()
data = json.dumps(report.to_dict())
- await redis.set(self._cache_key(host), data, ex=REDIS_CACHE_TTL_SECONDS)
+ ttl = (
+ REDIS_CACHE_TTL_OFFLINE_SECONDS
+ if report.status == HealthStatus.OFFLINE
+ else REDIS_CACHE_TTL_SECONDS
+ )
+ await redis.set(self._cache_key(host), data, ex=ttl)
except Exception as e:
logger.debug("ollama_health_cache_set_failed", host=host, error=str(e))
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index bcb6ba98..2510edbb 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -218,9 +218,15 @@ class TelegramMessage:
nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms)
def _provider_display(self) -> tuple[str, str]:
- """Return display provider and optional model suffix."""
+ """Return display provider and optional model suffix.
+ 2026-05-04 ogt: 加入具體 Ollama 伺服器顯示(GCP-A/B/Local)
+ """
provider_names = {
"ollama": "Ollama",
+ # 2026-05-04 ogt: ADR-110 三層容災具體伺服器識別
+ "ollama_gcp_a": "Ollama GCP-A (34.143.170.20)",
+ "ollama_gcp_b": "Ollama GCP-B (34.21.145.224)",
+ "ollama_local": "Ollama Local (111)",
"gemini": "Gemini",
"claude": "Claude",
"nvidia": "Nemotron",
@@ -249,7 +255,9 @@ class TelegramMessage:
return "safe_gate_pending"
def _format_automation_block(self) -> str:
- """Visible AI automation chain for every ACTION REQUIRED card."""
+ """Visible AI automation chain for every ACTION REQUIRED card.
+ 2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
+ """
provider_display, model_suffix = self._provider_display()
mode = self._automation_mode()
openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded"
@@ -258,6 +266,12 @@ class TelegramMessage:
elephant_state = "timeline_km_pending"
flow = "webhook>investigator>router>llm/rule>safe>approval"
+ # 2026-05-04 ogt: Token 用量顯示(有資料才顯示)
+ token_line = ""
+ if self.ai_tokens > 0:
+ cost_str = f" / ${self.ai_cost:.4f}" if self.ai_cost > 0 else ""
+ token_line = f"├ Tokens:{self.ai_tokens:,}{cost_str}\n"
+
return (
f"🤖 AI 自動化鏈路\n"
f"├ Router:{html.escape(provider_display)}{model_suffix}\n"
@@ -266,6 +280,7 @@ class TelegramMessage:
f"NemoTron:{html.escape(nemotron_state)}\n"
f"├ Hermes:{html.escape(hermes_state)} | "
f"ElephantAlpha:{html.escape(elephant_state)}\n"
+ f"{token_line}"
f"└ Flow:{flow}\n"
)