fix(ollama): 修復容災鏈四大 bug — OFFLINE cache 放大 + SLOW 路由缺失 + recovery 命名不一致 + 告警顯示
All checks were successful
Code Review / ai-code-review (push) Successful in 48s

根因:NetworkPolicy reload/CNI 瞬態抖動導致三台 Ollama 同時 OFFLINE,被 30s Redis cache 放大
  → 後續 30s 所有請求誤走 Gemini,燒 quota

B1 ollama_health_monitor: OFFLINE TTL 從 30s 縮短至 5s,儘速重試
B3 ollama_health_monitor: inference ConnectError 改判 DEGRADED(connectivity 通了不算 OFFLINE)
B5/B6 ollama_auto_recovery: _current_primary 預設改 "ollama_gcp_a",比對改 startswith("ollama_")
SLOW 修復: failover_manager SLOW 節點視為可用(優於 Gemini quota 耗盡)
SLOW 修復: auto_recovery SLOW 也計入 recovery counter(GCP 高負載仍可切回)
告警顯示: _provider_display 加入 GCP-A/B/Local 具體伺服器識別
告警顯示: _format_automation_block 加入 Token 用量行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-05-04 19:01:27 +08:00
parent f6b698c873
commit 855819652e
4 changed files with 90 additions and 14 deletions

View File

@@ -110,7 +110,11 @@ class OllamaAutoRecoveryService:
self._settings = get_settings() self._settings = get_settings()
# 狀態追蹤 # 狀態追蹤
self._current_primary: str = "ollama" # "ollama" / "gemini" / "fallback" # 2026-05-04 ogt: B5 修復 — 改用與 failover_manager callback 一致的命名
# failover_manager 傳入的是 "ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"/"gemini"
# 原設計用 "ollama" 造成 != "ollama" 永遠成立 → recovery loop 永遠以為在 Gemini
# 修法:判斷改用 _is_ollama_primary(),比對 startswith("ollama_")
self._current_primary: str = "ollama_gcp_a" # failover_manager 傳入的 provider_name
self._consecutive_healthy: int = 0 self._consecutive_healthy: int = 0
self._task: asyncio.Task | None = None self._task: asyncio.Task | None = None
@@ -213,8 +217,10 @@ class OllamaAutoRecoveryService:
# Redis 持久化(跨重啟恢復) # Redis 持久化(跨重啟恢復)
await self._persist_primary(provider) await self._persist_primary(provider)
if provider != "ollama": # 2026-05-04 ogt: B5/B6 修復 — 判斷改用 startswith("ollama_")
# 切換到非 Ollama → 重置 counter開始監控恢復 # 原設計provider != "ollama",但 callback 傳 "ollama_gcp_a" → 永遠觸發 tracking
if not provider.startswith("ollama_"):
# 切換到非 Ollama(gemini/nemotron/claude)→ 重置 counter,開始監控恢復
self._consecutive_healthy = 0 self._consecutive_healthy = 0
logger.info( logger.info(
"ollama_auto_recovery_tracking_started", "ollama_auto_recovery_tracking_started",
@@ -336,7 +342,11 @@ class OllamaAutoRecoveryService:
self._consecutive_healthy = 0 self._consecutive_healthy = 0
return return
if health.status == HealthStatus.HEALTHY: # 2026-05-04 ogt: SLOW 視為可用GCP 高負載 10-30s 仍優於 Gemini quota 限制)
# 原設計只接受 HEALTHY;GCP 若因負載偏高落入 SLOW 區,會永久卡在 Gemini
is_usable = health.status in (HealthStatus.HEALTHY, HealthStatus.SLOW)
if is_usable:
self._consecutive_healthy += 1 self._consecutive_healthy += 1
logger.debug( logger.debug(
"ollama_auto_recovery_healthy_tick", "ollama_auto_recovery_healthy_tick",
@@ -344,15 +354,16 @@ class OllamaAutoRecoveryService:
consecutive=self._consecutive_healthy, consecutive=self._consecutive_healthy,
required=self._stable_count_required, required=self._stable_count_required,
current_primary=self._current_primary, current_primary=self._current_primary,
actual_status=health.status.value,
) )
if ( if (
self._consecutive_healthy >= self._stable_count_required self._consecutive_healthy >= self._stable_count_required
and self._current_primary != "ollama" and not self._current_primary.startswith("ollama_")
): ):
await self._switch_back_to_ollama() await self._switch_back_to_ollama()
else: else:
# 非 HEALTHY → counter 歸零,繼續等 # DEGRADED / OFFLINE → counter 歸零,繼續等
if self._consecutive_healthy > 0: if self._consecutive_healthy > 0:
logger.debug( logger.debug(
"ollama_auto_recovery_counter_reset", "ollama_auto_recovery_counter_reset",

View File

@@ -352,7 +352,42 @@ class OllamaFailoverManager:
health_local=health_local, health_local=health_local,
) )
# 全部 Ollama 不健康 → Gemini # 2026-05-04 ogt: SLOW 容災備援外網同時抖動時SLOW Ollama 仍優於 Gemini quota 耗盡)
# 原設計:三層全部非 HEALTHY 直接切 Gemini
# 問題:111 關機 + GCP 雙外網抖動 → 三節點同時 SLOW → 誤飛 Gemini → 燒 quota
# 修法:SLOW 節點視為可用,按優先序選最佳 SLOW 節點
if health_gcp_a.status == HealthStatus.SLOW:
return OllamaRoutingResult(
primary=ep_gcp_a,
fallback_chain=[ep_gcp_b, ep_local, _GEMINI_ENDPOINT],
routing_reason=f"GCP-A SLOW降級可用→ primary GCP-A at {now_ts}",
health_gcp_a=health_gcp_a,
health_gcp_b=health_gcp_b,
health_local=health_local,
)
if health_gcp_b.status == HealthStatus.SLOW:
return OllamaRoutingResult(
primary=ep_gcp_b,
fallback_chain=[ep_local, _GEMINI_ENDPOINT],
routing_reason=f"GCP-A {health_gcp_a.status.value} + GCP-B SLOW降級可用→ 切 GCP-B at {now_ts}",
health_gcp_a=health_gcp_a,
health_gcp_b=health_gcp_b,
health_local=health_local,
)
if health_local.status == HealthStatus.SLOW:
return OllamaRoutingResult(
primary=ep_local,
fallback_chain=[_GEMINI_ENDPOINT],
routing_reason=(
f"GCP-A {health_gcp_a.status.value} + GCP-B {health_gcp_b.status.value}"
f" + Local SLOW降級可用→ 切 Local(111) at {now_ts}"
),
health_gcp_a=health_gcp_a,
health_gcp_b=health_gcp_b,
health_local=health_local,
)
# 全部 Ollama 不可用(DEGRADED/OFFLINE)→ Gemini
return OllamaRoutingResult( return OllamaRoutingResult(
primary=_GEMINI_ENDPOINT, primary=_GEMINI_ENDPOINT,
fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT], fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],

View File

@@ -37,7 +37,11 @@ logger = structlog.get_logger(__name__)
# ============================================================================= # =============================================================================
REDIS_CACHE_KEY_PREFIX = "ollama_health:" REDIS_CACHE_KEY_PREFIX = "ollama_health:"
REDIS_CACHE_TTL_SECONDS = 30 # 防 health check storm REDIS_CACHE_TTL_SECONDS = 30 # HEALTHY/SLOW/DEGRADED 快取 30s 防 check storm
# 2026-05-04 ogt: B1 修復 — OFFLINE 快取 TTL 縮短至 5s
# 根因:NetworkPolicy reload/CNI 瞬態抖動導致三台同時 OFFLINE,被 30s cache 放大
# OFFLINE 狀態快取越短越好,讓系統儘快重新評估並切回
REDIS_CACHE_TTL_OFFLINE_SECONDS = 5 # OFFLINE 只快取 5s儘速重試
CONNECTIVITY_TIMEOUT_SECONDS = 5.0 CONNECTIVITY_TIMEOUT_SECONDS = 5.0
# 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到 # 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到
@@ -289,9 +293,13 @@ class OllamaHealthMonitor:
reason="推理超時 >35s", reason="推理超時 >35s",
) )
except (httpx.ConnectError, httpx.NetworkError) as e: except (httpx.ConnectError, httpx.NetworkError) as e:
# 2026-05-04 ogt: B3 修復 — connectivity 已通過,推理階段 ConnectError 改判 DEGRADED
# 原設計:ConnectError → OFFLINE,但 /api/tags 已成功,表示主機存活
# 根因:socket 半開(GCP LB 回收 idle conn)或 Ollama 進程重啟,屬瞬態
# 注意:DEGRADED 仍以 REDIS_CACHE_TTL_SECONDS(30s)快取,比 OFFLINE 的 5s 更久;若要儘速重試,需另外為 DEGRADED 設定較短的 TTL
return HealthReport( return HealthReport(
status=HealthStatus.OFFLINE, status=HealthStatus.DEGRADED,
reason=f"推理連接失敗:{e}", reason=f"推理連接失敗主機可達socket 瞬斷){e}",
) )
except Exception as e: except Exception as e:
logger.warning("ollama_inference_check_error", host=host, error=str(e)) logger.warning("ollama_inference_check_error", host=host, error=str(e))
@@ -329,12 +337,19 @@ class OllamaHealthMonitor:
return None return None
async def _set_cached(self, host: str, report: HealthReport) -> None: async def _set_cached(self, host: str, report: HealthReport) -> None:
"""寫入 Redis 快取,失敗靜默(不影響功能)""" """寫入 Redis 快取,失敗靜默(不影響功能)
2026-05-04 ogt: OFFLINE 結果快取 5s(縮短),其他狀態快取 30s
"""
try: try:
from src.core.redis_client import get_redis from src.core.redis_client import get_redis
redis = get_redis() redis = get_redis()
data = json.dumps(report.to_dict()) data = json.dumps(report.to_dict())
await redis.set(self._cache_key(host), data, ex=REDIS_CACHE_TTL_SECONDS) ttl = (
REDIS_CACHE_TTL_OFFLINE_SECONDS
if report.status == HealthStatus.OFFLINE
else REDIS_CACHE_TTL_SECONDS
)
await redis.set(self._cache_key(host), data, ex=ttl)
except Exception as e: except Exception as e:
logger.debug("ollama_health_cache_set_failed", host=host, error=str(e)) logger.debug("ollama_health_cache_set_failed", host=host, error=str(e))

View File

@@ -218,9 +218,15 @@ class TelegramMessage:
nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms) nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms)
def _provider_display(self) -> tuple[str, str]: def _provider_display(self) -> tuple[str, str]:
"""Return display provider and optional model suffix.""" """Return display provider and optional model suffix.
2026-05-04 ogt: 加入具體 Ollama 伺服器顯示(GCP-A/B/Local)
"""
provider_names = { provider_names = {
"ollama": "Ollama", "ollama": "Ollama",
# 2026-05-04 ogt: ADR-110 三層容災具體伺服器識別
"ollama_gcp_a": "Ollama GCP-A (34.143.170.20)",
"ollama_gcp_b": "Ollama GCP-B (34.21.145.224)",
"ollama_local": "Ollama Local (111)",
"gemini": "Gemini", "gemini": "Gemini",
"claude": "Claude", "claude": "Claude",
"nvidia": "Nemotron", "nvidia": "Nemotron",
@@ -249,7 +255,9 @@ class TelegramMessage:
return "safe_gate_pending" return "safe_gate_pending"
def _format_automation_block(self) -> str: def _format_automation_block(self) -> str:
"""Visible AI automation chain for every ACTION REQUIRED card.""" """Visible AI automation chain for every ACTION REQUIRED card.
2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
"""
provider_display, model_suffix = self._provider_display() provider_display, model_suffix = self._provider_display()
mode = self._automation_mode() mode = self._automation_mode()
openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded" openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded"
@@ -258,6 +266,12 @@ class TelegramMessage:
elephant_state = "timeline_km_pending" elephant_state = "timeline_km_pending"
flow = "webhook&gt;investigator&gt;router&gt;llm/rule&gt;safe&gt;approval" flow = "webhook&gt;investigator&gt;router&gt;llm/rule&gt;safe&gt;approval"
# 2026-05-04 ogt: Token 用量顯示(有資料才顯示)
token_line = ""
if self.ai_tokens > 0:
cost_str = f" / ${self.ai_cost:.4f}" if self.ai_cost > 0 else ""
token_line = f"├ Tokens<code>{self.ai_tokens:,}{cost_str}</code>\n"
return ( return (
f"🤖 <b>AI 自動化鏈路</b>\n" f"🤖 <b>AI 自動化鏈路</b>\n"
f"├ Router<code>{html.escape(provider_display)}{model_suffix}</code>\n" f"├ Router<code>{html.escape(provider_display)}{model_suffix}</code>\n"
@@ -266,6 +280,7 @@ class TelegramMessage:
f"NemoTron<code>{html.escape(nemotron_state)}</code>\n" f"NemoTron<code>{html.escape(nemotron_state)}</code>\n"
f"├ Hermes<code>{html.escape(hermes_state)}</code> | " f"├ Hermes<code>{html.escape(hermes_state)}</code> | "
f"ElephantAlpha<code>{html.escape(elephant_state)}</code>\n" f"ElephantAlpha<code>{html.escape(elephant_state)}</code>\n"
f"{token_line}"
f"└ Flow<code>{flow}</code>\n" f"└ Flow<code>{flow}</code>\n"
) )