diff --git a/apps/api/src/services/ollama_failover_manager.py b/apps/api/src/services/ollama_failover_manager.py index 55018dd6..c9f53116 100644 --- a/apps/api/src/services/ollama_failover_manager.py +++ b/apps/api/src/services/ollama_failover_manager.py @@ -3,17 +3,17 @@ Ollama 自動容災管理 - P1.1b ============================ 依 OllamaHealthMonitor 健康狀態決定 Ollama 路由方案。 -路由邏輯(2026-04-25 統帥指令:Gemini 優先,188 最後備援): - 111 HEALTHY → 主 111,fallback [Gemini, 188, Nemotron] - 111 SLOW → 主 Gemini,fallback [111, 188] - 111 DEGRADED → 主 Gemini,fallback [188, Nemotron, Claude] - 111 OFFLINE → 主 Gemini,fallback [188, Nemotron, Claude] - 111 OFFLINE + 188 OFFLINE → 主 Gemini,fallback [Nemotron, Claude] +路由邏輯(2026-04-26 統帥鐵律:111 = 唯一 Ollama,備援只用 Gemini): + 111 HEALTHY → 主 111,fallback [Gemini] + 111 SLOW/DEGRADED/OFFLINE → 主 Gemini,fallback [Nemotron, Claude] + Gemini quota 超過 → 主 Nemotron,fallback [Claude] 設計說明: +- 188 CPU-only 禁止用於即時回應(0.45 tok/s),完全移出 routing chain +- 唯一 Ollama 主機:192.168.0.111(M1 Pro, Metal 加速) - 不直接依賴 AIProviderEnum(P1.2 Engineer-A 整合時再對齊) - 返回輕量 OllamaRoutingResult,含主 endpoint + fallback 清單 -- 並行檢查 111 + 188(asyncio.gather) +- 只檢查 111(不再並行檢查 188) - 切換觸發時寫 audit_logs service="ollama_failover" - clear_cache() 方法供 OllamaAutoRecoveryService 切回後清空路由快取 @@ -26,8 +26,9 @@ Ollama 自動容災管理 - P1.1b from __future__ import annotations +import asyncio import datetime -from dataclasses import dataclass +from dataclasses import dataclass, field # 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2 # 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo,保證一定有 +8 時區 # 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC @@ -82,7 +83,6 @@ class OllamaRoutingResult: fallback_chain: list[OllamaEndpoint] routing_reason: str health_111: HealthReport - health_188: HealthReport | None = None def all_endpoints_in_order(self) -> list[OllamaEndpoint]: """返回完整的優先序端點列表(primary 在前)""" @@ -96,11 +96,12 @@ class OllamaRoutingResult: "model": self.primary.model, }, "fallback_chain": [ - {"url": e.url, "provider": e.provider_name, "model": e.model} + {"url": e.url, "provider": e.provider_name, "model": e.model} # noqa: E501 for e in self.fallback_chain ], "routing_reason": self.routing_reason, "health_111": self.health_111.to_dict(), + "health_188": self.health_188.to_dict() if self.health_188 else None, } @@ -165,22 +166,34 @@ class OllamaFailoverManager: async def select_provider( self, - task_type: str = "", # noqa: ARG002 - context: dict | None = None, # noqa: ARG002 + task_type: str = "", + context: dict | None = None, ) -> OllamaRoutingResult: """ 檢查 111 健康狀態,返回路由結果。 2026-04-26 統帥鐵律:唯一 Ollama = 111,188 禁止用於即時回應。 + 只檢查 111,不再並行檢查 188。 + + Args: + task_type: 任務類型(預留,目前未影響路由邏輯) + context: 額外上下文(預留) + + Returns: + OllamaRoutingResult """ url_111 = self._settings.OLLAMA_URL + # 只檢查 111(188 移出 routing chain) try: health_111 = await self._monitor.check(url_111) except Exception as e: health_111 = HealthReport(status=HealthStatus.OFFLINE, reason=f"check error: {e}") - result = self._decide_route(health_111=health_111, url_111=url_111) + result = self._decide_route( + health_111=health_111, + url_111=url_111, + ) # Gemini 帳單熔斷(quota gate) # 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2 @@ -189,18 +202,17 @@ class OllamaFailoverManager: if not quota_ok: quota = getattr(self._settings, "GEMINI_DAILY_QUOTA", 1000) logger.warning( - "gemini_quota_exceeded_falling_to_188", + "gemini_quota_exceeded_fallback_to_nemotron", quota=quota, health_111=health_111.status.value, ) + # 2026-04-26 統帥鐵律:188 移出,quota 超過 → Nemotron → Claude result = self._build_quota_exceeded_route(health_111=health_111) - # 2026-04-26 P1.5 整合點 3 by Claude Opus 4.7 — 配額耗盡 Telegram 告警 - # alerter 內部 24h dedup(QUOTA_DEDUP_TTL_SEC),即使每次 quota exceeded - # 都呼叫,當日只會發送一次告警。失敗 fail-open(不阻擋 routing)。 + # Quota 耗盡 Telegram 告警(24h dedup) try: from src.services.failover_alerter import get_failover_alerter from src.core.redis_client import get_redis - _current_count = quota # 預設為 quota 值(已超過則 ≥ quota) + _current_count = quota try: _redis = get_redis() if _redis is not None: @@ -261,10 +273,12 @@ class OllamaFailoverManager: 111 DEGRADED → primary=Gemini, fallback=[Nemotron, Claude] 111 OFFLINE → primary=Gemini, fallback=[Nemotron, Claude] - 188 完全移出(CPU-only 0.45 tok/s,禁止即時回應)。 + 188 完全移出 routing chain(CPU-only 0.45 tok/s,禁止即時回應)。 + Gemini quota 超過由 _build_quota_exceeded_route() 接管。 """ model_111 = self._settings.OLLAMA_HEALTH_CHECK_MODEL ep_111 = OllamaEndpoint(url=url_111, provider_name="ollama", model=model_111) + now_ts = datetime.datetime.now(TAIPEI_TZ).isoformat() if health_111.status == HealthStatus.HEALTHY: @@ -283,6 +297,7 @@ class OllamaFailoverManager: health_111=health_111, ) + # DEGRADED / OFFLINE status_label = health_111.status.value return OllamaRoutingResult( primary=_GEMINI_ENDPOINT, @@ -368,7 +383,10 @@ class OllamaFailoverManager: self, health_111: HealthReport, ) -> OllamaRoutingResult: - """Gemini 配額耗盡 → Nemotron 備援。2026-04-26 統帥鐵律:188 移出。""" + """ + Gemini 配額耗盡時的備援路由:primary=Nemotron, fallback=[Claude] + 2026-04-26 統帥鐵律:188 移出,quota 超過直接走 Nemotron → Claude。 + """ return OllamaRoutingResult( primary=_NEMOTRON_ENDPOINT, fallback_chain=[_CLAUDE_ENDPOINT],