feat(p1): Ollama 多層容災系統 — P1.1 健康檢測 + P1.2 ai_router 整合 + P1.5 容災告警
ADR-092 P1 飛輪閉環的 Ollama 失敗轉移子系統,全部 Engineer-A2/C/C2 補上。 新服務 (1581 行): - ollama_health_monitor.py (356):3 層健康檢測(TCP/HTTP/推理) - ollama_failover_manager.py (571):111→188 自動切換 + Redis 持久化 + recovery callback - ollama_auto_recovery.py (436):30s 背景監控 + 連續 3 次 HEALTHY → 切回 + clear_cache - failover_alerter.py (218):P1.5 Telegram 容災告警 服務整合: - ai_router.py: AIProviderEnum.OLLAMA_188 + 120s budget + failover fallback chain - main.py lifespan: 啟動時 wire callback + start recovery,關閉時優雅 stop - config.py: OLLAMA_FALLBACK_URL / OLLAMA_HEALTH_CHECK_MODEL / GEMINI_DAILY_QUOTA(帳單熔斷) K8s 配置: - 04-configmap.yaml.patch-188-fallback:注入 OLLAMA_FALLBACK_URL=http://192.168.0.188:11434 測試 (2082 行): - test_ollama_health_monitor.py (402) - test_ollama_failover_manager.py (707) - test_ollama_auto_recovery.py (580) - test_ai_router_failover_integration.py (257) - test_lifespan_failover_wiring.py (136) 依賴鏈:service 三件套 + ai_router + main.py 一起 commit,缺一就 ImportError。 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -189,11 +189,29 @@ class Settings(BaseSettings):
|
||||
default="http://192.168.0.111:11434", # 2026-04-08 ogt: 切換至 M1 Pro (40+ tok/s vs 0.45 tok/s)
|
||||
description="Ollama LLM service URL",
|
||||
)
|
||||
# 2026-04-25 Claude Engineer-C (P1.1): Ollama 188 CPU-only 備援 (方案 C)
|
||||
# 若空字串則 OllamaFailoverManager 僅使用 OLLAMA_URL(單節點模式)
|
||||
OLLAMA_FALLBACK_URL: str = Field(
|
||||
default="",
|
||||
description="Ollama CPU-only fallback URL (188 備援,P1.1),空字串=停用",
|
||||
)
|
||||
# 2026-04-25 Claude Engineer-C (P1.1): Ollama 健康檢測推理測試模型
|
||||
OLLAMA_HEALTH_CHECK_MODEL: str = Field(
|
||||
default="qwen2.5:7b-instruct",
|
||||
description="OllamaHealthMonitor 推理測試使用模型(P1.1)",
|
||||
)
|
||||
# 2026-04-12 ogt: 心跳必須確認載入的 Ollama 模型清單
|
||||
OLLAMA_REQUIRED_MODELS: list[str] = Field(
|
||||
default=["nomic-embed-text", "qwen2.5:7b-instruct", "deepseek-r1:14b"],
|
||||
description="HeartbeatReportService 探測必要模型是否載入",
|
||||
)
|
||||
# 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2
|
||||
# Gemini 帳單熔斷:每日呼叫上限,超過改走 188+Nemotron
|
||||
# 超過上限後寫 Redis key ollama:gemini_daily_count:{date},TTL 86400s
|
||||
GEMINI_DAILY_QUOTA: int = Field(
|
||||
default=1000,
|
||||
description="每日 Gemini 呼叫上限,超過切到 188+Nemotron(P1.1 帳單熔斷)",
|
||||
)
|
||||
# Deprecated: use OPENCLAW_URL instead
|
||||
CLAWBOT_URL: str = Field(
|
||||
default="http://192.168.0.188:8088", # 🔧 修正: OpenClaw 實際 port 是 8088
|
||||
|
||||
@@ -546,9 +546,39 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
except Exception as e:
|
||||
logger.warning("ai_slo_watchdog_schedule_failed", error=str(e))
|
||||
|
||||
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
|
||||
# OllamaFailoverManager + OllamaAutoRecoveryService 飛輪接線:
|
||||
# failover 切換時 → recovery_callback → set_current_primary → Redis 持久化
|
||||
# recovery service 每 30s 檢查 → 111 連續 3 次 HEALTHY → 自動切回 → clear_cache
|
||||
# 順序:先取 singleton → wire callback → 啟動 recovery service(才能接收 callback)
|
||||
try:
|
||||
from src.services.ollama_failover_manager import get_ollama_failover_manager
|
||||
from src.services.ollama_auto_recovery import get_ollama_auto_recovery_service
|
||||
|
||||
_failover_mgr = get_ollama_failover_manager()
|
||||
_recovery_svc = get_ollama_auto_recovery_service()
|
||||
|
||||
# wire callback:failover 切換時通知 recovery service 更新 current_primary
|
||||
_failover_mgr.set_recovery_callback(_recovery_svc.set_current_primary)
|
||||
|
||||
# 啟動 recovery service(從 Redis bootstrap current_primary,並啟動背景監控)
|
||||
await _recovery_svc.start()
|
||||
logger.info("ollama_failover_system_started")
|
||||
except Exception as e:
|
||||
logger.warning("ollama_failover_system_start_failed", error=str(e))
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown
|
||||
# 2026-04-25 P1.2 by Claude Engineer-A2 — 優雅關閉 Ollama failover 背景監控
|
||||
# 必須在 Redis pool 關閉之前停止(recovery service 可能仍在寫 Redis)
|
||||
try:
|
||||
from src.services.ollama_auto_recovery import get_ollama_auto_recovery_service
|
||||
await get_ollama_auto_recovery_service().stop()
|
||||
logger.info("ollama_failover_system_stopped")
|
||||
except Exception as e:
|
||||
logger.warning("ollama_failover_system_stop_failed", error=str(e))
|
||||
|
||||
# Phase 6.1: 關閉 Signal Worker (先關閉 Consumer)
|
||||
await close_signal_worker()
|
||||
await publisher.stop()
|
||||
|
||||
@@ -68,10 +68,14 @@ logger = structlog.get_logger(__name__)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class AIProviderEnum(Enum):
|
||||
class AIProviderEnum(str, Enum):
|
||||
"""AI 提供者"""
|
||||
|
||||
OLLAMA = "ollama"
|
||||
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2
|
||||
# P1.1b OllamaFailoverManager 使用 provider_name="ollama_188",
|
||||
# 但 AIProviderEnum 沒有此值 → P1.2 整合時 lookup 失敗
|
||||
OLLAMA_188 = "ollama_188" # 188 CPU-only 備援節點(P1.1b)
|
||||
GEMINI = "gemini"
|
||||
CLAUDE = "claude"
|
||||
# 2026-04-02 ogt: C1 修復 — 對齊 Registry 實際名稱
|
||||
@@ -85,6 +89,8 @@ class AIProviderEnum(Enum):
|
||||
# Provider 對應延遲預算 (ms)
|
||||
PROVIDER_LATENCY_BUDGET: dict[AIProviderEnum, int] = {
|
||||
AIProviderEnum.OLLAMA: 60000, # 本地,允許較長處理時間
|
||||
# 2026-04-25 critic-fix Part2 B2 by Claude Engineer-C2 — 188 CPU-only 推理較慢
|
||||
AIProviderEnum.OLLAMA_188: 120000, # 120s budget for CPU inference
|
||||
AIProviderEnum.GEMINI: 30000, # 雲端,較低延遲
|
||||
AIProviderEnum.CLAUDE: 30000, # 雲端,較低延遲
|
||||
# 2026-04-02 ogt: C1 修復 — 對齊 Registry 名稱
|
||||
@@ -212,6 +218,10 @@ class AIRouter:
|
||||
self._intent_classifier = get_intent_classifier()
|
||||
self._complexity_scorer = get_complexity_scorer()
|
||||
self._model_registry = get_model_registry()
|
||||
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
|
||||
# 延遲 import 避免循環依賴(ollama_failover_manager 不 import ai_router)
|
||||
from src.services.ollama_failover_manager import get_ollama_failover_manager
|
||||
self._failover_manager = get_ollama_failover_manager()
|
||||
|
||||
# 從 ModelRegistry 取得模型配置
|
||||
self._ollama_default = self._model_registry.get_model("ollama", "default")
|
||||
@@ -372,10 +382,51 @@ class AIRouter:
|
||||
intent, intent_result, complexity
|
||||
)
|
||||
|
||||
# Step 3b: 若 initial decision 選到 OLLAMA,交由 FailoverManager 重評
|
||||
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
|
||||
# 只在 OLLAMA 時觸發,不干擾 NEMOTRON / OPENCLAW_NEMO / CLAUDE / GEMINI 路由
|
||||
failover_fallback: list[tuple[AIProviderEnum, str]] | None = None
|
||||
if provider == AIProviderEnum.OLLAMA:
|
||||
try:
|
||||
failover_result = await self._failover_manager.select_provider(
|
||||
task_type=intent.value if intent else "general"
|
||||
)
|
||||
primary_str = failover_result.primary.provider_name
|
||||
try:
|
||||
provider = AIProviderEnum(primary_str)
|
||||
model = failover_result.primary.model
|
||||
reason = f"{reason} [failover→{primary_str}]"
|
||||
except ValueError:
|
||||
# provider_name 無法對應已知 enum(理論上不應發生,OLLAMA_188 已加)
|
||||
logger.warning(
|
||||
"ai_router_unknown_failover_provider",
|
||||
provider=primary_str,
|
||||
)
|
||||
# 重建 fallback chain:從 failover_result 轉換
|
||||
fb_list: list[tuple[AIProviderEnum, str]] = []
|
||||
for ep in failover_result.fallback_chain:
|
||||
try:
|
||||
fb_provider = AIProviderEnum(ep.provider_name)
|
||||
fb_list.append((fb_provider, ep.model))
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"ai_router_unknown_failover_fallback_provider",
|
||||
provider=ep.provider_name,
|
||||
)
|
||||
failover_fallback = fb_list
|
||||
except Exception as e:
|
||||
# failover_manager 異常 → 保留原始 provider(fail-open)
|
||||
logger.warning("ai_router_failover_manager_error", error=str(e))
|
||||
|
||||
# Step 4: 建立 Fallback 鏈
|
||||
# 2026-04-05 Claude Code: v4.3 — DIAGNOSE 改回 _full_fallback_chain
|
||||
# NIM 從 Phase 22 起就是主力,無隱私問題;Ollama CPU-only 不可用(實測 238s)
|
||||
fallback_chain = self._build_fallback_chain(provider)
|
||||
# 2026-04-25 P1.2: 若 failover_manager 回傳了 fallback chain,優先使用
|
||||
fallback_chain = (
|
||||
failover_fallback
|
||||
if failover_fallback is not None
|
||||
else self._build_fallback_chain(provider)
|
||||
)
|
||||
|
||||
# Step 5: 計算延遲預算
|
||||
latency_budget = PROVIDER_LATENCY_BUDGET.get(provider, 30000)
|
||||
|
||||
218
apps/api/src/services/failover_alerter.py
Normal file
218
apps/api/src/services/failover_alerter.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""Ollama 容災 / 自動恢復 / Gemini 帳單 Telegram 告警
|
||||
|
||||
設計原則:
|
||||
- 每次 failover 觸發都通知(用戶明確指示)
|
||||
- 但用 10min Redis dedup TTL 防同樣狀態重複告警
|
||||
- 三種告警類型:failover_triggered / recovery_succeeded / gemini_quota_exceeded
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — Telegram Alerter for Ollama Failover
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
TAIPEI_TZ = timezone(timedelta(hours=8))
|
||||
DEDUP_TTL_SEC = 600 # 10 min
|
||||
QUOTA_DEDUP_TTL_SEC = 86400 # 24h(每日 quota 告警只發一次)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
class FailoverAlerter:
|
||||
"""Ollama 容災 Telegram 告警
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — Telegram Alerter for Ollama Failover
|
||||
"""
|
||||
|
||||
def __init__(self, redis_client=None) -> None:
|
||||
# telegram_gateway 從 singleton 取,不注入(lifespan 已確保初始化)
|
||||
self._redis = redis_client
|
||||
|
||||
async def alert_failover(self, event: dict[str, Any]) -> None:
|
||||
"""111 故障切換到 Gemini/188 — 10min dedup"""
|
||||
to_provider = event.get("to_provider", "unknown")
|
||||
dedup_key = f"alert:failover:{to_provider}"
|
||||
if not await self._check_dedup(dedup_key, ttl=DEDUP_TTL_SEC):
|
||||
logger.debug("failover_alert_dedup_skipped", to_provider=to_provider)
|
||||
return
|
||||
|
||||
reason = event.get("reason", "unknown")
|
||||
model = event.get("model", "?")
|
||||
timestamp = event.get("timestamp", datetime.now(TAIPEI_TZ).isoformat())
|
||||
fallback_chain_str = event.get("fallback_chain_str", "?")
|
||||
|
||||
msg = (
|
||||
f"*Ollama 容災激活*\n\n"
|
||||
f"故障主機:Ollama 111 \\(GPU\\)\n"
|
||||
f"故障狀態:{_escape_md(reason)}\n"
|
||||
f"切換目標:{_escape_md(to_provider)} \\(model: {_escape_md(model)}\\)\n"
|
||||
f"切換時間:{_escape_md(timestamp)}\n\n"
|
||||
f"Fallback 鏈:{_escape_md(fallback_chain_str)}\n\n"
|
||||
f"自動恢復服務持續監控,3 次 HEALTHY 後自動切回"
|
||||
)
|
||||
await self._send(msg)
|
||||
logger.info("failover_alert_sent", to_provider=to_provider)
|
||||
|
||||
async def alert_recovery(self, event: dict[str, Any]) -> None:
|
||||
"""111 恢復後自動切回 — 10min dedup
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — 用戶明確要求要通知
|
||||
"""
|
||||
dedup_key = "alert:recovery"
|
||||
if not await self._check_dedup(dedup_key, ttl=DEDUP_TTL_SEC):
|
||||
logger.debug("recovery_alert_dedup_skipped")
|
||||
return
|
||||
|
||||
stable_count = event.get("stable_count", event.get("to", "?"))
|
||||
# 相容 auto_recovery 傳入的 {"from": ..., "to": ...} 格式
|
||||
from_provider = event.get("from_provider", event.get("from", "?"))
|
||||
to_provider = event.get("to_provider", event.get("to", "ollama_111"))
|
||||
recovery_time = event.get("recovery_time", datetime.now(TAIPEI_TZ).isoformat())
|
||||
|
||||
msg = (
|
||||
f"*Ollama 自動恢復*\n\n"
|
||||
f"恢復主機:Ollama 111 \\(GPU\\)\n"
|
||||
f"穩定計數:連續 {stable_count} 次 HEALTHY\n"
|
||||
f"切回時間:{_escape_md(str(recovery_time))}\n"
|
||||
f"切換路徑:{_escape_md(str(from_provider))} → {_escape_md(str(to_provider))}\n\n"
|
||||
f"自動化飛輪已恢復至 GPU 推理模式"
|
||||
)
|
||||
await self._send(msg)
|
||||
logger.info("recovery_alert_sent", from_provider=from_provider)
|
||||
|
||||
async def alert_gemini_quota_exceeded(self, event: dict[str, Any]) -> None:
|
||||
"""Gemini 每日上限觸發,降級到 188 CPU 備援 — 24h dedup"""
|
||||
dedup_key = "alert:gemini_quota_exceeded"
|
||||
if not await self._check_dedup(dedup_key, ttl=QUOTA_DEDUP_TTL_SEC):
|
||||
logger.debug("quota_alert_dedup_skipped")
|
||||
return
|
||||
|
||||
quota = event.get("quota", "?")
|
||||
current_count = event.get("current_count", "?")
|
||||
date_str = datetime.now(TAIPEI_TZ).date().isoformat()
|
||||
|
||||
msg = (
|
||||
f"*Gemini 每日配額耗盡*\n\n"
|
||||
f"日期:{date_str}\n"
|
||||
f"上限:{quota} calls/day\n"
|
||||
f"當前用量:{current_count}\n"
|
||||
f"降級目標:OLLAMA\\_188 \\(CPU,推理較慢\\)\n\n"
|
||||
f"進入慢速模式至明日 0:00\n"
|
||||
f"建議檢查是否有異常流量,評估是否升級 Gemini 配額"
|
||||
)
|
||||
await self._send(msg)
|
||||
logger.info("quota_alert_sent", quota=quota, current_count=current_count)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Dedup(Redis SET NX EX)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _check_dedup(self, key: str, ttl: int) -> bool:
|
||||
"""
|
||||
Redis SET NX EX 防止重複告警。
|
||||
True = 第一次(應送出),False = 已送過(跳過)。
|
||||
|
||||
fail-open:Redis 不可用時允許送出(不阻擋通知)
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — Telegram dedup 鐵律 10min/24h TTL
|
||||
"""
|
||||
if self._redis is None:
|
||||
return True # fail-open
|
||||
try:
|
||||
ok = await self._redis.set(f"{key}:dedup", "1", ex=ttl, nx=True)
|
||||
return bool(ok)
|
||||
except Exception as e:
|
||||
logger.warning("dedup_check_failed", error=str(e))
|
||||
return True # fail-open
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# 發送(透過 TelegramGateway singleton)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _send(self, message: str) -> None:
|
||||
"""發送至 Telegram OPENCLAW_TG_CHAT_ID
|
||||
|
||||
使用現有 TelegramGateway singleton(不另建 HTTP client),
|
||||
parse_mode=MarkdownV2 對應 MarkdownV2 escape 規則。
|
||||
|
||||
alerter 失敗不阻斷主路由邏輯(exception 吞掉只 log)
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — 告警失敗不能阻斷主流程
|
||||
"""
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
from src.core.config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
chat_id = getattr(settings, "OPENCLAW_TG_CHAT_ID", None)
|
||||
if not chat_id:
|
||||
logger.warning("telegram_chat_id_missing_failover_alert")
|
||||
return
|
||||
|
||||
gateway = get_telegram_gateway()
|
||||
await gateway.send_notification(text=message, parse_mode="MarkdownV2")
|
||||
logger.info("telegram_failover_alert_sent", message_len=len(message))
|
||||
except Exception as e:
|
||||
# 不 raise — 告警失敗不該阻斷主流程(鐵律)
|
||||
logger.exception("telegram_failover_send_failed", error=str(e))
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# MarkdownV2 escape 工具
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
_MD2_SPECIAL = r"\_*[]()~`>#+-=|{}.!"
|
||||
|
||||
|
||||
def _escape_md(text: str) -> str:
|
||||
"""Escape MarkdownV2 特殊字元,防止 Telegram parse error。
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — MarkdownV2 安全逸出
|
||||
"""
|
||||
for ch in _MD2_SPECIAL:
|
||||
text = text.replace(ch, f"\\{ch}")
|
||||
return text
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
_alerter_instance: FailoverAlerter | None = None
|
||||
|
||||
|
||||
def get_failover_alerter() -> FailoverAlerter:
|
||||
"""取得 FailoverAlerter singleton(lifespan 中注入依賴前也可安全呼叫,僅 fail-open)
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — Singleton 取得 alerter
|
||||
"""
|
||||
global _alerter_instance
|
||||
if _alerter_instance is None:
|
||||
_alerter_instance = FailoverAlerter()
|
||||
return _alerter_instance
|
||||
|
||||
|
||||
def configure_alerter(redis_client) -> None:
|
||||
"""Lifespan 注入:redis_client 就緒後呼叫,讓 dedup 功能生效。
|
||||
|
||||
telegram 不另外注入,直接從 get_telegram_gateway() singleton 取得
|
||||
(lifespan startup 已保證 TelegramGateway 初始化完成)。
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D — Lifespan 注入
|
||||
"""
|
||||
global _alerter_instance
|
||||
_alerter_instance = FailoverAlerter(redis_client=redis_client)
|
||||
logger.info("failover_alerter_configured")
|
||||
|
||||
|
||||
def reset_failover_alerter() -> None:
|
||||
"""重置 singleton(測試用)
|
||||
|
||||
2026-04-25 P1.5 by Claude Engineer-D
|
||||
"""
|
||||
global _alerter_instance
|
||||
_alerter_instance = None
|
||||
436
apps/api/src/services/ollama_auto_recovery.py
Normal file
436
apps/api/src/services/ollama_auto_recovery.py
Normal file
@@ -0,0 +1,436 @@
|
||||
"""
|
||||
Ollama 自動恢復服務 - P1.1d
|
||||
============================
|
||||
背景監控:偵測 111 從 OFFLINE/SLOW/DEGRADED 恢復為 HEALTHY 後,立刻切回 Ollama。
|
||||
|
||||
核心設計:
|
||||
- 30s 輪詢 111 健康狀態
|
||||
- 防抖:連續 3 次 HEALTHY 才觸發切回(30s × 3 = 90s 穩定視窗)
|
||||
- 中途若又 OFFLINE,counter 歸零,重新計數
|
||||
- 切回後呼叫 failover_manager.clear_cache(),讓下次路由重新評估
|
||||
- 預留 telegram_alerter 介面(P1.5 Engineer 接入)
|
||||
- structlog audit service="ollama_auto_recovery"
|
||||
|
||||
整合方式(FastAPI lifespan):
|
||||
from src.services.ollama_auto_recovery import OllamaAutoRecoveryService
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
recovery_svc = OllamaAutoRecoveryService()
|
||||
await recovery_svc.start()
|
||||
yield
|
||||
await recovery_svc.stop()
|
||||
|
||||
版本: v1.0
|
||||
建立: 2026-04-25 (台北時區)
|
||||
建立者: Claude Engineer-C (P1.1d)
|
||||
# 2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
from datetime import timezone, timedelta
|
||||
from typing import Any, Protocol, runtime_checkable
|
||||
|
||||
import structlog
|
||||
|
||||
# 台北時區 +8(標準庫保險絲,100% 可用)
|
||||
# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
|
||||
# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
|
||||
TAIPEI_TZ = timezone(timedelta(hours=8))
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.services.ollama_health_monitor import (
|
||||
HealthStatus,
|
||||
OllamaHealthMonitor,
|
||||
get_ollama_health_monitor,
|
||||
)
|
||||
from src.services.ollama_failover_manager import (
|
||||
OllamaFailoverManager,
|
||||
get_ollama_failover_manager,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# 常數
|
||||
# =============================================================================
|
||||
|
||||
RECOVERY_CHECK_INTERVAL_SEC = 30 # 每 30s 檢查一次
|
||||
STABLE_COUNT_REQUIRED = 3 # 連續 3 次 HEALTHY 才切回(防抖)
|
||||
|
||||
# =============================================================================
|
||||
# Telegram Alerter Protocol(預留介面,P1.5 Engineer 實作)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class TelegramAlerter(Protocol):
|
||||
"""Telegram 通知介面(預留,P1.5 Engineer 接入)"""
|
||||
|
||||
async def alert_recovery(self, payload: dict[str, Any]) -> None:
|
||||
"""111 恢復通知"""
|
||||
...
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OllamaAutoRecoveryService
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class OllamaAutoRecoveryService:
|
||||
"""
|
||||
Ollama 111 自動恢復服務
|
||||
|
||||
偵測 111 從 OFFLINE/SLOW/DEGRADED 恢復為 HEALTHY 後,立刻切回 Ollama 路由。
|
||||
|
||||
防抖邏輯:
|
||||
- 連續 3 次 HEALTHY(90s 穩定視窗)才觸發切回
|
||||
- 中途任一次非 HEALTHY → counter 歸零
|
||||
- 已是 ollama primary 時不重複觸發
|
||||
|
||||
2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
health_monitor: OllamaHealthMonitor | None = None,
|
||||
failover_manager: OllamaFailoverManager | None = None,
|
||||
telegram_alerter: TelegramAlerter | None = None,
|
||||
recovery_check_interval_sec: int = RECOVERY_CHECK_INTERVAL_SEC,
|
||||
stable_count_required: int = STABLE_COUNT_REQUIRED,
|
||||
) -> None:
|
||||
self._monitor = health_monitor or get_ollama_health_monitor()
|
||||
self._failover_manager = failover_manager or get_ollama_failover_manager()
|
||||
self._alerter = telegram_alerter
|
||||
self._recovery_check_interval_sec = recovery_check_interval_sec
|
||||
self._stable_count_required = stable_count_required
|
||||
self._settings = get_settings()
|
||||
|
||||
# 狀態追蹤
|
||||
self._current_primary: str = "ollama" # "ollama" / "gemini" / "fallback"
|
||||
self._consecutive_healthy: int = 0
|
||||
self._task: asyncio.Task | None = None
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def start(self) -> None:
|
||||
"""
|
||||
啟動背景監控任務。
|
||||
|
||||
在 FastAPI lifespan 的 startup 階段呼叫:
|
||||
recovery_svc = OllamaAutoRecoveryService()
|
||||
await recovery_svc.start()
|
||||
|
||||
# 2026-04-25 critic-fix Part2 H5+H6 by Claude Engineer-C2
|
||||
# 啟動時從 Redis 載入狀態(跨重啟持久化),
|
||||
# 若 primary != "ollama" 立刻執行一次 check(不等 30s)。
|
||||
"""
|
||||
if self._task is not None and not self._task.done():
|
||||
logger.warning(
|
||||
"ollama_auto_recovery_already_running",
|
||||
service="ollama_auto_recovery",
|
||||
)
|
||||
return
|
||||
|
||||
# 從 Redis 載入持久化狀態(跨重啟恢復)
|
||||
# 2026-04-25 critic-fix Part2 H5+H6 by Claude Engineer-C2
|
||||
self._current_primary = await self._load_primary()
|
||||
logger.info(
|
||||
"ollama_auto_recovery_bootstrap",
|
||||
service="ollama_auto_recovery",
|
||||
current_primary=self._current_primary,
|
||||
)
|
||||
|
||||
# 若 primary 不是 ollama,立刻評估一次(不等 30s interval)
|
||||
if self._current_primary != "ollama":
|
||||
try:
|
||||
await self._check_and_recover()
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"ollama_auto_recovery_immediate_check_error",
|
||||
service="ollama_auto_recovery",
|
||||
)
|
||||
|
||||
self._task = asyncio.create_task(
|
||||
self._monitor_loop(),
|
||||
name="ollama_auto_recovery_loop",
|
||||
)
|
||||
logger.info(
|
||||
"ollama_auto_recovery_started",
|
||||
service="ollama_auto_recovery",
|
||||
check_interval_sec=self._recovery_check_interval_sec,
|
||||
stable_count_required=self._stable_count_required,
|
||||
)
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""
|
||||
優雅關閉背景任務。
|
||||
|
||||
在 FastAPI lifespan 的 shutdown 階段呼叫:
|
||||
await recovery_svc.stop()
|
||||
"""
|
||||
if self._task is None or self._task.done():
|
||||
return
|
||||
|
||||
self._task.cancel()
|
||||
try:
|
||||
await self._task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
finally:
|
||||
self._task = None
|
||||
|
||||
logger.info(
|
||||
"ollama_auto_recovery_stopped",
|
||||
service="ollama_auto_recovery",
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# 狀態控制(供整合層設定當前 primary,例如 failover 觸發時更新)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def set_current_primary(self, provider: str) -> None:
|
||||
"""
|
||||
通知 recovery service 當前路由 primary 是哪個 provider。
|
||||
OllamaFailoverManager.select_provider() 觸發 failover 後呼叫此方法更新狀態。
|
||||
|
||||
# 2026-04-25 critic-fix Part2 H5+H6 by Claude Engineer-C2
|
||||
# 改為 async,切換時同步寫 Redis 持久化(跨重啟恢復)
|
||||
"""
|
||||
if self._current_primary != provider:
|
||||
logger.info(
|
||||
"ollama_auto_recovery_primary_changed",
|
||||
service="ollama_auto_recovery",
|
||||
from_primary=self._current_primary,
|
||||
to_primary=provider,
|
||||
)
|
||||
self._current_primary = provider
|
||||
# Redis 持久化(跨重啟恢復)
|
||||
await self._persist_primary(provider)
|
||||
|
||||
if provider != "ollama":
|
||||
# 切換到非 Ollama → 重置 counter,開始監控恢復
|
||||
self._consecutive_healthy = 0
|
||||
logger.info(
|
||||
"ollama_auto_recovery_tracking_started",
|
||||
service="ollama_auto_recovery",
|
||||
current_primary=provider,
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Redis 持久化(跨重啟恢復)
|
||||
# 2026-04-25 critic-fix Part2 H5+H6 by Claude Engineer-C2
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
_REDIS_PRIMARY_KEY = "ollama:current_primary"
|
||||
|
||||
async def _persist_primary(self, primary: str) -> None:
|
||||
"""持久化 current_primary 到 Redis(無 TTL,跨重啟恢復)"""
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
await redis.set(self._REDIS_PRIMARY_KEY, primary)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"ollama_auto_recovery_persist_failed",
|
||||
service="ollama_auto_recovery",
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
async def _load_primary(self) -> str:
|
||||
"""從 Redis 載入 current_primary(找不到時預設 "ollama")"""
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
val = await redis.get(self._REDIS_PRIMARY_KEY)
|
||||
if val:
|
||||
decoded = val.decode() if isinstance(val, bytes) else val
|
||||
logger.info(
|
||||
"ollama_auto_recovery_loaded_from_redis",
|
||||
service="ollama_auto_recovery",
|
||||
current_primary=decoded,
|
||||
)
|
||||
return decoded
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"ollama_auto_recovery_load_failed",
|
||||
service="ollama_auto_recovery",
|
||||
error=str(e),
|
||||
)
|
||||
return "ollama" # 預設:正常路由
|
||||
|
||||
@property
|
||||
def current_primary(self) -> str:
|
||||
return self._current_primary
|
||||
|
||||
@property
|
||||
def consecutive_healthy(self) -> int:
|
||||
return self._consecutive_healthy
|
||||
|
||||
@property
|
||||
def is_running(self) -> bool:
|
||||
return self._task is not None and not self._task.done()
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# 背景監控迴圈
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _monitor_loop(self) -> None:
|
||||
"""
|
||||
每 30s 檢查 111 健康狀態。
|
||||
連續 3 次 HEALTHY 且 current_primary != "ollama" → 觸發切回。
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
await asyncio.sleep(self._recovery_check_interval_sec)
|
||||
await self._check_and_recover()
|
||||
except asyncio.CancelledError:
|
||||
raise # 必須重新 raise,讓 stop() 的 await 正常結束
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"ollama_auto_recovery_loop_error",
|
||||
service="ollama_auto_recovery",
|
||||
)
|
||||
# 不 break,繼續監控(異常不應停止監控迴圈)
|
||||
|
||||
async def _check_and_recover(self) -> None:
|
||||
"""
|
||||
單次健康狀態檢查 + 切回邏輯。
|
||||
抽取為獨立方法,方便單元測試。
|
||||
"""
|
||||
host = self._settings.OLLAMA_URL
|
||||
|
||||
try:
|
||||
health = await self._monitor.check(host)
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"ollama_auto_recovery_check_failed",
|
||||
service="ollama_auto_recovery",
|
||||
host=host,
|
||||
)
|
||||
self._consecutive_healthy = 0
|
||||
return
|
||||
|
||||
if health.status == HealthStatus.HEALTHY:
|
||||
self._consecutive_healthy += 1
|
||||
logger.debug(
|
||||
"ollama_auto_recovery_healthy_tick",
|
||||
service="ollama_auto_recovery",
|
||||
consecutive=self._consecutive_healthy,
|
||||
required=self._stable_count_required,
|
||||
current_primary=self._current_primary,
|
||||
)
|
||||
|
||||
if (
|
||||
self._consecutive_healthy >= self._stable_count_required
|
||||
and self._current_primary != "ollama"
|
||||
):
|
||||
await self._switch_back_to_ollama()
|
||||
else:
|
||||
# 非 HEALTHY → counter 歸零,繼續等
|
||||
if self._consecutive_healthy > 0:
|
||||
logger.debug(
|
||||
"ollama_auto_recovery_counter_reset",
|
||||
service="ollama_auto_recovery",
|
||||
previous_count=self._consecutive_healthy,
|
||||
status=health.status.value,
|
||||
)
|
||||
self._consecutive_healthy = 0
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# 切回 Ollama 111
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _switch_back_to_ollama(self) -> None:
|
||||
"""
|
||||
切回 Ollama 111:
|
||||
1. 更新 _current_primary
|
||||
2. 呼叫 failover_manager.clear_cache() 清空路由快取
|
||||
3. 通知 failover_manager(notify_recovery)
|
||||
4. Telegram 通知(若 alerter 已設定)
|
||||
5. structlog audit
|
||||
"""
|
||||
from_provider = self._current_primary
|
||||
self._current_primary = "ollama"
|
||||
stable_count = self._consecutive_healthy
|
||||
# counter 不立即清零:保留供 audit,下次 loop 開始後自然計數
|
||||
self._consecutive_healthy = 0
|
||||
|
||||
# 取台北時間(標準庫保證 +8,禁 UTC)
|
||||
# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
|
||||
recovery_time = datetime.datetime.now(TAIPEI_TZ)
|
||||
|
||||
# 清空 failover manager 快取(讓下次 select_provider 重新評估)
|
||||
try:
|
||||
await self._failover_manager.clear_cache()
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"ollama_auto_recovery_clear_cache_error",
|
||||
service="ollama_auto_recovery",
|
||||
)
|
||||
|
||||
# 通知 failover_manager
|
||||
try:
|
||||
self._failover_manager.notify_recovery("ollama_111")
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"ollama_auto_recovery_notify_error",
|
||||
service="ollama_auto_recovery",
|
||||
)
|
||||
|
||||
# Telegram 通知(P1.5 Engineer-D 接入)
|
||||
# 2026-04-25 P1.5 by Claude Engineer-D — 接 FailoverAlerter,self._alerter 優先,
|
||||
# 無則 fallback 到 get_failover_alerter() singleton
|
||||
try:
|
||||
alerter = self._alerter
|
||||
if alerter is None:
|
||||
from src.services.failover_alerter import get_failover_alerter
|
||||
alerter = get_failover_alerter()
|
||||
await alerter.alert_recovery({
|
||||
"from": from_provider,
|
||||
"to": "ollama_111",
|
||||
"stable_count": stable_count,
|
||||
"recovery_time": recovery_time.isoformat(),
|
||||
})
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"ollama_auto_recovery_telegram_alert_error",
|
||||
service="ollama_auto_recovery",
|
||||
)
|
||||
|
||||
# structlog audit(必須記錄)
|
||||
logger.info(
|
||||
"ollama_auto_recovery_switched_back",
|
||||
service="ollama_auto_recovery",
|
||||
action="switched_back",
|
||||
from_provider=from_provider,
|
||||
to_provider="ollama_111",
|
||||
stable_count=stable_count,
|
||||
recovery_time=recovery_time.isoformat(),
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
_recovery_service: OllamaAutoRecoveryService | None = None
|
||||
|
||||
|
||||
def get_ollama_auto_recovery_service() -> OllamaAutoRecoveryService:
|
||||
"""取得 OllamaAutoRecoveryService singleton"""
|
||||
global _recovery_service
|
||||
if _recovery_service is None:
|
||||
_recovery_service = OllamaAutoRecoveryService()
|
||||
return _recovery_service
|
||||
|
||||
|
||||
def reset_ollama_auto_recovery_service() -> None:
|
||||
"""重置 singleton(測試用)"""
|
||||
global _recovery_service
|
||||
_recovery_service = None
|
||||
571
apps/api/src/services/ollama_failover_manager.py
Normal file
571
apps/api/src/services/ollama_failover_manager.py
Normal file
@@ -0,0 +1,571 @@
|
||||
"""
|
||||
Ollama 自動容災管理 - P1.1b
|
||||
============================
|
||||
依 OllamaHealthMonitor 健康狀態決定 Ollama 路由方案。
|
||||
|
||||
路由邏輯(2026-04-25 統帥指令:Gemini 優先,188 最後備援):
|
||||
111 HEALTHY → 主 111,fallback [Gemini, 188, Nemotron]
|
||||
111 SLOW → 主 Gemini,fallback [111, 188]
|
||||
111 DEGRADED → 主 Gemini,fallback [188, Nemotron, Claude]
|
||||
111 OFFLINE → 主 Gemini,fallback [188, Nemotron, Claude]
|
||||
111 OFFLINE + 188 OFFLINE → 主 Gemini,fallback [Nemotron, Claude]
|
||||
|
||||
設計說明:
|
||||
- 不直接依賴 AIProviderEnum(P1.2 Engineer-A 整合時再對齊)
|
||||
- 返回輕量 OllamaRoutingResult,含主 endpoint + fallback 清單
|
||||
- 並行檢查 111 + 188(asyncio.gather)
|
||||
- 切換觸發時寫 audit_logs service="ollama_failover"
|
||||
- clear_cache() 方法供 OllamaAutoRecoveryService 切回後清空路由快取
|
||||
|
||||
版本: v2.0
|
||||
建立: 2026-04-25 (台北時區)
|
||||
建立者: Claude Engineer-C (P1.1b)
|
||||
# Created 2026-04-25 P1.1 by Claude Engineer-C
|
||||
# 2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
from dataclasses import dataclass, field
|
||||
# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
|
||||
# 用標準庫 timezone(timedelta(hours=8)) 取代 zoneinfo,保證一定有 +8 時區
|
||||
# 原 zoneinfo.ZoneInfo("Asia/Taipei") 失敗時 = None → datetime.now(None) 為 UTC
|
||||
from datetime import timezone, timedelta
|
||||
|
||||
import structlog
|
||||
|
||||
from src.core.config import get_settings
|
||||
|
||||
# 台北時區 +8(標準庫保險絲,100% 可用)
|
||||
# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
|
||||
TAIPEI_TZ = timezone(timedelta(hours=8))
|
||||
from src.services.ollama_health_monitor import (
|
||||
HealthReport,
|
||||
HealthStatus,
|
||||
OllamaHealthMonitor,
|
||||
get_ollama_health_monitor,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 路由結果模型(輕量,P1.2 整合時轉換為 RoutingDecision)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
|
||||
class OllamaEndpoint:
|
||||
"""Ollama 端點描述"""
|
||||
|
||||
url: str
|
||||
provider_name: str # 給 AIRouterExecutor 用的 provider 名稱
|
||||
model: str
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {"url": self.url, "provider_name": self.provider_name, "model": self.model}
|
||||
|
||||
|
||||
@dataclass
|
||||
class OllamaRoutingResult:
|
||||
"""
|
||||
Ollama 容災路由結果
|
||||
|
||||
P1.2 Engineer-A 整合時,將此結果轉換為 ai_router.RoutingDecision:
|
||||
- selected_provider = AIProviderEnum[result.primary.provider_name.upper()] or 新的 OLLAMA_188
|
||||
- selected_model = result.primary.model
|
||||
- fallback_chain = [(AIProviderEnum[p.provider_name.upper()], p.model) for p in result.fallback_chain]
|
||||
"""
|
||||
|
||||
primary: OllamaEndpoint
|
||||
fallback_chain: list[OllamaEndpoint]
|
||||
routing_reason: str
|
||||
health_111: HealthReport
|
||||
health_188: HealthReport | None = None
|
||||
|
||||
def all_endpoints_in_order(self) -> list[OllamaEndpoint]:
|
||||
"""返回完整的優先序端點列表(primary 在前)"""
|
||||
return [self.primary, *self.fallback_chain]
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"primary": {
|
||||
"url": self.primary.url,
|
||||
"provider": self.primary.provider_name,
|
||||
"model": self.primary.model,
|
||||
},
|
||||
"fallback_chain": [
|
||||
{"url": e.url, "provider": e.provider_name, "model": e.model}
|
||||
for e in self.fallback_chain
|
||||
],
|
||||
"routing_reason": self.routing_reason,
|
||||
"health_111": self.health_111.to_dict(),
|
||||
"health_188": self.health_188.to_dict() if self.health_188 else None,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 已知 Fallback 端點定義(Nemotron / Gemini / Claude)
|
||||
# =============================================================================
|
||||
|
||||
# 以 provider_name 對應 ai_router.AIProviderEnum 的 value
|
||||
_NEMOTRON_ENDPOINT = OllamaEndpoint(
|
||||
url="", # Nemotron 不是 HTTP URL,由 AIRouterExecutor 從 Registry 取得
|
||||
provider_name="nemotron",
|
||||
model="nvidia/nemotron-mini-4b-instruct",
|
||||
)
|
||||
_GEMINI_ENDPOINT = OllamaEndpoint(
|
||||
url="",
|
||||
provider_name="gemini",
|
||||
model="gemini-1.5-flash",
|
||||
)
|
||||
_CLAUDE_ENDPOINT = OllamaEndpoint(
|
||||
url="",
|
||||
provider_name="claude",
|
||||
model="claude-3-5-haiku-20241022",
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OllamaFailoverManager
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class OllamaFailoverManager:
|
||||
"""
|
||||
Ollama 自動容災管理器
|
||||
|
||||
並行檢查 111 + 188,依健康狀態選擇最佳路由。
|
||||
|
||||
使用方式:
|
||||
manager = OllamaFailoverManager()
|
||||
result = await manager.select_provider()
|
||||
# result.primary.url → 使用的 Ollama URL
|
||||
# result.fallback_chain → 依序 fallback
|
||||
|
||||
2026-04-25 Claude Engineer-C (P1.1b)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
health_monitor: OllamaHealthMonitor | None = None,
|
||||
recovery_callback=None,
|
||||
) -> None:
|
||||
self._monitor = health_monitor or get_ollama_health_monitor()
|
||||
self._settings = get_settings()
|
||||
# 2026-04-25 critic-fix Part2 H5+H6 by Claude Engineer-C2
|
||||
# recovery_callback: async callable(provider_name: str) → None
|
||||
# OllamaAutoRecoveryService.set_current_primary 在 failover 時被通知,
|
||||
# 避免重啟後 _current_primary 停留在 "ollama" 而永不啟動恢復監控
|
||||
self._recovery_callback = recovery_callback
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Public API
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def select_provider(
|
||||
self,
|
||||
task_type: str = "",
|
||||
context: dict | None = None,
|
||||
) -> OllamaRoutingResult:
|
||||
"""
|
||||
並行檢查 111 + 188,返回路由結果
|
||||
|
||||
Args:
|
||||
task_type: 任務類型(預留,目前未影響路由邏輯)
|
||||
context: 額外上下文(預留)
|
||||
|
||||
Returns:
|
||||
OllamaRoutingResult
|
||||
"""
|
||||
url_111 = self._settings.OLLAMA_URL
|
||||
url_188 = self._settings.OLLAMA_FALLBACK_URL or ""
|
||||
|
||||
# 並行檢查
|
||||
# 2026-04-25 critic-fix Part2 H4 by Claude Engineer-C2
|
||||
# return_exceptions=True 防止任一 check 例外導致整個 select_provider 炸
|
||||
if url_188:
|
||||
results = await asyncio.gather(
|
||||
self._monitor.check(url_111),
|
||||
self._monitor.check(url_188),
|
||||
return_exceptions=True,
|
||||
)
|
||||
# 處理 exception — 任一失敗視為 OFFLINE
|
||||
health_111_raw, health_188_raw = results
|
||||
health_111: HealthReport = (
|
||||
HealthReport(status=HealthStatus.OFFLINE, reason=f"check error: {health_111_raw}")
|
||||
if isinstance(health_111_raw, Exception)
|
||||
else health_111_raw
|
||||
)
|
||||
health_188: HealthReport | None = (
|
||||
HealthReport(status=HealthStatus.OFFLINE, reason=f"check error: {health_188_raw}")
|
||||
if isinstance(health_188_raw, Exception)
|
||||
else health_188_raw
|
||||
)
|
||||
else:
|
||||
health_111 = await self._monitor.check(url_111)
|
||||
health_188 = None
|
||||
|
||||
result = self._decide_route(
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
url_111=url_111,
|
||||
url_188=url_188,
|
||||
)
|
||||
|
||||
# Gemini 帳單熔斷(quota gate)
|
||||
# 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2
|
||||
if result.primary.provider_name == "gemini":
|
||||
quota_ok = await self._check_gemini_quota()
|
||||
if not quota_ok:
|
||||
quota = getattr(self._settings, "GEMINI_DAILY_QUOTA", 1000)
|
||||
logger.warning(
|
||||
"gemini_quota_exceeded_falling_to_188",
|
||||
quota=quota,
|
||||
health_111=health_111.status.value,
|
||||
)
|
||||
result = self._build_quota_exceeded_route(
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
url_111=url_111,
|
||||
url_188=url_188,
|
||||
)
|
||||
|
||||
# 寫入 audit_log(best-effort)
|
||||
await self._write_failover_audit(result)
|
||||
|
||||
logger.info(
|
||||
"ollama_failover_decision",
|
||||
primary=result.primary.provider_name,
|
||||
primary_url=result.primary.url,
|
||||
reason=result.routing_reason,
|
||||
fallback_count=len(result.fallback_chain),
|
||||
health_111=health_111.status.value,
|
||||
health_188=health_188.status.value if health_188 else "not_configured",
|
||||
)
|
||||
|
||||
# 通知 recovery service 當前 primary(跨重啟持久化)
|
||||
# 2026-04-25 critic-fix Part2 H5+H6 by Claude Engineer-C2
|
||||
if self._recovery_callback is not None:
|
||||
try:
|
||||
await self._recovery_callback(result.primary.provider_name)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"ollama_failover_recovery_callback_failed",
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# 路由決策邏輯
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def _decide_route(
|
||||
self,
|
||||
health_111: HealthReport,
|
||||
health_188: HealthReport | None,
|
||||
url_111: str,
|
||||
url_188: str,
|
||||
) -> OllamaRoutingResult:
|
||||
"""
|
||||
決策矩陣(2026-04-25 統帥指令:Gemini 優先,188 最後備援):
|
||||
|
||||
111 HEALTHY → primary=111, fallback=[Gemini, 188, Nemotron]
|
||||
111 SLOW → primary=Gemini, fallback=[111, 188]
|
||||
111 DEGRADED → primary=Gemini, fallback=[188, Nemotron, Claude]
|
||||
111 OFFLINE → primary=Gemini, fallback=[188, Nemotron, Claude]
|
||||
111 OFFLINE + 188 OFFLINE → primary=Gemini, fallback=[Nemotron, Claude]
|
||||
|
||||
關鍵原則:
|
||||
- 111 非 HEALTHY 時,primary 必為 Gemini(快速雲端,不等 188 慢推理)
|
||||
- 188 永遠在 fallback chain,作為 Gemini 額度耗盡的最後備援
|
||||
- degradation_reason 記錄切換原因 + 時間戳
|
||||
|
||||
2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復
|
||||
"""
|
||||
model_111 = self._settings.OLLAMA_HEALTH_CHECK_MODEL
|
||||
model_188 = "qwen2.5:7b-instruct" # 188 CPU-only 備援推薦模型(plan 方案 C)
|
||||
|
||||
ep_111 = OllamaEndpoint(url=url_111, provider_name="ollama", model=model_111)
|
||||
ep_188 = (
|
||||
OllamaEndpoint(url=url_188, provider_name="ollama_188", model=model_188)
|
||||
if url_188
|
||||
else None
|
||||
)
|
||||
|
||||
# 188 可用性判斷(僅供 fallback 使用)
|
||||
has_188 = ep_188 is not None and (
|
||||
health_188 is not None and health_188.status != HealthStatus.OFFLINE
|
||||
)
|
||||
|
||||
# 切換時間戳(台北時區 +8,標準庫保證)
|
||||
# 2026-04-25 critic-fix Part2 B4 by Claude Engineer-C2
|
||||
now_ts = datetime.datetime.now(TAIPEI_TZ).isoformat()
|
||||
|
||||
# ==========================================================
|
||||
# 111 HEALTHY → 主 111,Gemini 作為第一 fallback(快速雲端)
|
||||
# ==========================================================
|
||||
if health_111.status == HealthStatus.HEALTHY:
|
||||
fallback: list[OllamaEndpoint] = [_GEMINI_ENDPOINT]
|
||||
if has_188 and ep_188:
|
||||
fallback.append(ep_188)
|
||||
fallback.append(_NEMOTRON_ENDPOINT)
|
||||
return OllamaRoutingResult(
|
||||
primary=ep_111,
|
||||
fallback_chain=fallback,
|
||||
routing_reason="111 HEALTHY → 主 111",
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
)
|
||||
|
||||
# ==========================================================
|
||||
# 111 SLOW → primary=Gemini,fallback=[111, 188]
|
||||
# 111 實測 eval rate 0.09 token/s,~111s 推理,Gemini 更快
|
||||
# ==========================================================
|
||||
if health_111.status == HealthStatus.SLOW:
|
||||
fallback_slow: list[OllamaEndpoint] = [ep_111]
|
||||
if has_188 and ep_188:
|
||||
fallback_slow.append(ep_188)
|
||||
degradation_reason = (
|
||||
f"111 SLOW(eval ~0.09 token/s, ~111s)→ 切 Gemini at {now_ts}"
|
||||
)
|
||||
return OllamaRoutingResult(
|
||||
primary=_GEMINI_ENDPOINT,
|
||||
fallback_chain=fallback_slow,
|
||||
routing_reason=degradation_reason,
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
)
|
||||
|
||||
# ==========================================================
|
||||
# 111 DEGRADED 或 OFFLINE → primary=Gemini,188 在 fallback
|
||||
# ==========================================================
|
||||
status_label = health_111.status.value # "degraded" / "offline"
|
||||
degradation_reason = f"111 {status_label} → 切 Gemini at {now_ts}"
|
||||
if has_188 and ep_188:
|
||||
return OllamaRoutingResult(
|
||||
primary=_GEMINI_ENDPOINT,
|
||||
fallback_chain=[ep_188, _NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
|
||||
routing_reason=degradation_reason,
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
)
|
||||
|
||||
# 188 也不可用 → Gemini 主力,最後備援 Nemotron / Claude
|
||||
degradation_reason = f"111 {status_label} + 188 不可用 → 切 Gemini at {now_ts}"
|
||||
return OllamaRoutingResult(
|
||||
primary=_GEMINI_ENDPOINT,
|
||||
fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
|
||||
routing_reason=degradation_reason,
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Gemini 帳單熔斷(quota gate)
|
||||
# 2026-04-25 critic-fix Part2 H7 by Claude Engineer-C2
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _check_gemini_quota(self) -> bool:
|
||||
"""
|
||||
檢查每日 Gemini call 配額,超過上限則禁用。
|
||||
|
||||
Redis key: ollama:gemini_daily_count:{YYYY-MM-DD},TTL 86400s
|
||||
計數 atomic(incr)。
|
||||
|
||||
Returns:
|
||||
True → 仍在配額內,可使用 Gemini
|
||||
False → 已超配額,應切到 188+Nemotron
|
||||
|
||||
fail-open:Redis 不可用時允許走 Gemini(不阻擋服務)
|
||||
"""
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
if redis is None:
|
||||
return True # fail-open
|
||||
quota = getattr(self._settings, "GEMINI_DAILY_QUOTA", 1000)
|
||||
key = f"ollama:gemini_daily_count:{datetime.date.today().isoformat()}"
|
||||
count_raw = await redis.get(key)
|
||||
count = int(count_raw or 0)
|
||||
if count >= quota:
|
||||
return False
|
||||
# atomic incr + 設定 TTL(確保跨日自動重置)
|
||||
await redis.incr(key)
|
||||
await redis.expire(key, 86400)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning("gemini_quota_check_failed", error=str(e))
|
||||
return True # fail-open
|
||||
|
||||
def _build_quota_exceeded_route(
|
||||
self,
|
||||
health_111: HealthReport,
|
||||
health_188: HealthReport | None,
|
||||
url_111: str, # noqa: ARG002 — 保留供 OllamaRoutingResult 結構完整性(health_111 對應)
|
||||
url_188: str,
|
||||
) -> OllamaRoutingResult:
|
||||
"""
|
||||
Gemini 配額耗盡時的備援路由:primary=OLLAMA_188,fallback=[Nemotron, Claude]
|
||||
若 188 也不可用,則 primary=Nemotron。
|
||||
"""
|
||||
model_188 = "qwen2.5:7b-instruct"
|
||||
ep_188 = (
|
||||
OllamaEndpoint(url=url_188, provider_name="ollama_188", model=model_188)
|
||||
if url_188
|
||||
else None
|
||||
)
|
||||
has_188 = ep_188 is not None and (
|
||||
health_188 is not None and health_188.status != HealthStatus.OFFLINE
|
||||
)
|
||||
|
||||
if has_188 and ep_188:
|
||||
return OllamaRoutingResult(
|
||||
primary=ep_188,
|
||||
fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
|
||||
routing_reason="Gemini quota exceeded → 188 CPU-only 備援",
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
)
|
||||
|
||||
# 188 也不可用
|
||||
return OllamaRoutingResult(
|
||||
primary=_NEMOTRON_ENDPOINT,
|
||||
fallback_chain=[_CLAUDE_ENDPOINT],
|
||||
routing_reason="Gemini quota exceeded + 188 不可用 → Nemotron 備援",
|
||||
health_111=health_111,
|
||||
health_188=health_188,
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Recovery API(供 OllamaAutoRecoveryService 呼叫)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def set_recovery_callback(self, callback) -> None:
|
||||
"""
|
||||
設定 recovery callback(供 lifespan wiring 使用)。
|
||||
callback signature: async (provider_name: str) -> None
|
||||
|
||||
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
|
||||
"""
|
||||
self._recovery_callback = callback
|
||||
|
||||
async def clear_cache(self) -> None:
|
||||
"""
|
||||
清空路由決策快取,讓下次 select_provider 重新評估健康狀態。
|
||||
OllamaAutoRecoveryService 在偵測 111 恢復後呼叫此方法。
|
||||
|
||||
2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復
|
||||
# 2026-04-25 P1.2 by Claude Engineer-A2 — 改用 make_cache_key 動態組 key,消除硬編碼 IP
|
||||
"""
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
from src.services.ollama_health_monitor import make_cache_key
|
||||
redis = get_redis()
|
||||
if redis is None:
|
||||
return
|
||||
# 動態由 settings URL 組 cache key,避免硬編碼 IP
|
||||
keys = [
|
||||
make_cache_key(self._settings.OLLAMA_URL),
|
||||
make_cache_key(self._settings.OLLAMA_FALLBACK_URL or ""),
|
||||
]
|
||||
for k in keys:
|
||||
if k and k != "ollama_health:": # 空 URL 會產生無意義的 key,跳過
|
||||
await redis.delete(k)
|
||||
logger.info(
|
||||
"ollama_failover_cache_cleared",
|
||||
service="ollama_failover",
|
||||
reason="recovery_triggered",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("ollama_failover_clear_cache_failed", error=str(e))
|
||||
|
||||
def notify_recovery(self, provider: str) -> None:
|
||||
"""
|
||||
預留:P1.5 Engineer 接入 Telegram alerter 時使用。
|
||||
目前僅寫 structlog audit。
|
||||
|
||||
2026-04-25 統帥指令 by Claude Engineer-C — 自動切 Gemini + 自動恢復
|
||||
"""
|
||||
logger.info(
|
||||
"ollama_recovery_notified",
|
||||
service="ollama_failover",
|
||||
provider=provider,
|
||||
action="recovery_received",
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Audit Log
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _write_failover_audit(self, result: OllamaRoutingResult) -> None:
|
||||
"""
|
||||
切換觸發時寫 structlog audit(best-effort)+ Telegram 告警
|
||||
|
||||
# 2026-04-25 critic-fix Part2 B1 by Claude Engineer-C2
|
||||
# 原 AuditLog DB 寫入使用不存在的欄位(service/action/target/status/metadata)
|
||||
# → SQLAlchemy crash → except 吃掉 → 零稽核
|
||||
# 修法:刪除 DB 寫入路徑,改用 structlog only(audit 不依賴 DB schema)
|
||||
|
||||
# 2026-04-25 P1.5 by Claude Engineer-D — 新增 Telegram 告警(dedup 10min)
|
||||
|
||||
service="ollama_failover"(per 任務規格)
|
||||
僅在 primary 非 111 時記錄(真正發生切換)
|
||||
"""
|
||||
if result.primary.provider_name == "ollama":
|
||||
# 111 正常,無切換事件
|
||||
return
|
||||
|
||||
logger.info(
|
||||
"ollama_failover_triggered",
|
||||
service="ollama_failover",
|
||||
action="failover_triggered",
|
||||
from_provider="ollama",
|
||||
to_provider=result.primary.provider_name,
|
||||
reason=result.routing_reason,
|
||||
primary_url=result.primary.url or result.primary.provider_name,
|
||||
health_111=result.health_111.status.value,
|
||||
health_188=result.health_188.status.value if result.health_188 else "not_configured",
|
||||
)
|
||||
|
||||
# Telegram 告警(首次切換才通知,dedup 10min 內建)
|
||||
# 2026-04-25 P1.5 by Claude Engineer-D — 告警失敗不阻斷主路由邏輯
|
||||
try:
|
||||
from src.services.failover_alerter import get_failover_alerter
|
||||
fallback_chain_str = " → ".join(
|
||||
p.provider_name for p in result.fallback_chain
|
||||
)
|
||||
alerter = get_failover_alerter()
|
||||
await alerter.alert_failover({
|
||||
"to_provider": result.primary.provider_name,
|
||||
"model": result.primary.model,
|
||||
"reason": result.routing_reason,
|
||||
"timestamp": datetime.datetime.now(TAIPEI_TZ).isoformat(),
|
||||
"fallback_chain_str": fallback_chain_str,
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning("failover_alert_failed", error=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
_failover_manager: OllamaFailoverManager | None = None
|
||||
|
||||
|
||||
def get_ollama_failover_manager() -> OllamaFailoverManager:
|
||||
"""取得 OllamaFailoverManager singleton"""
|
||||
global _failover_manager
|
||||
if _failover_manager is None:
|
||||
_failover_manager = OllamaFailoverManager()
|
||||
return _failover_manager
|
||||
|
||||
|
||||
def reset_ollama_failover_manager() -> None:
|
||||
"""重置 singleton(測試用)"""
|
||||
global _failover_manager
|
||||
_failover_manager = None
|
||||
356
apps/api/src/services/ollama_health_monitor.py
Normal file
356
apps/api/src/services/ollama_health_monitor.py
Normal file
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
Ollama 健康檢測模組 - P1.1a
|
||||
============================
|
||||
三層健康檢查:連通性 → 推理測試 → (optional) GPU 記憶體
|
||||
|
||||
設計原則:
|
||||
- Redis 30s 快取防 health check storm
|
||||
- 降級安全:快取失敗不影響功能,直接執行檢查
|
||||
- 用 settings 取 host 配置(禁硬編碼 IP)
|
||||
- 用 httpx.AsyncClient(非 requests)
|
||||
|
||||
版本: v1.1
|
||||
建立: 2026-04-25 (台北時區)
|
||||
建立者: Claude Engineer-C (P1.1a)
|
||||
# Created 2026-04-25 P1.1 by Claude Engineer-C
|
||||
# 2026-04-25 critic-fix by Claude Engineer-C — 修 BLOCKER B3(make_cache_key 公開) + H3(timeout 45s)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import get_settings
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# 常數
|
||||
# =============================================================================
|
||||
|
||||
REDIS_CACHE_KEY_PREFIX = "ollama_health:"
|
||||
REDIS_CACHE_TTL_SECONDS = 30 # 防 health check storm
|
||||
|
||||
CONNECTIVITY_TIMEOUT_SECONDS = 5.0
|
||||
# 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到
|
||||
INFERENCE_TIMEOUT_SECONDS = 45.0
|
||||
|
||||
# 推理延遲分級門檻(毫秒)
|
||||
LATENCY_HEALTHY_THRESHOLD_MS = 10_000 # <10s → HEALTHY
|
||||
LATENCY_SLOW_THRESHOLD_MS = 30_000 # 10-30s → SLOW
|
||||
# >30s → DEGRADED
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 公開工具函數
|
||||
# =============================================================================
|
||||
|
||||
def make_cache_key(url: str) -> str:
|
||||
"""
|
||||
由 URL 產生 Redis cache key。
|
||||
|
||||
取 host:port 部分(netloc)作為 key,避免 scheme 差異干擾。
|
||||
例:http://192.168.0.111:11434 → "ollama_health:192.168.0.111:11434"
|
||||
|
||||
公開函數供 OllamaFailoverManager.clear_cache() 動態組 key,
|
||||
確保與內部 _cache_key() 使用相同邏輯。
|
||||
|
||||
# 2026-04-25 critic-fix B3 by Claude Engineer-C — 避免 clear_cache 硬編碼 IP
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
netloc = parsed.netloc or url
|
||||
return f"{REDIS_CACHE_KEY_PREFIX}{netloc}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 資料模型
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class HealthStatus(Enum):
|
||||
"""Ollama 健康狀態分級"""
|
||||
|
||||
HEALTHY = "healthy" # <10s 推理,正常使用
|
||||
SLOW = "slow" # 10-30s,降級使用
|
||||
DEGRADED = "degraded" # >30s 或推理超時,優先切換
|
||||
OFFLINE = "offline" # 連通性失敗,必須切換
|
||||
|
||||
|
||||
@dataclass
|
||||
class HealthReport:
|
||||
"""Ollama 健康檢測報告"""
|
||||
|
||||
status: HealthStatus
|
||||
host: str = ""
|
||||
latency_ms: float = 0.0
|
||||
reason: str = ""
|
||||
checked_at: float = field(default_factory=time.time)
|
||||
from_cache: bool = False
|
||||
mcp_health: dict[str, bool] = field(default_factory=dict)
|
||||
|
||||
def is_usable(self) -> bool:
|
||||
"""是否可用(不含 OFFLINE)"""
|
||||
return self.status != HealthStatus.OFFLINE
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"status": self.status.value,
|
||||
"host": self.host,
|
||||
"latency_ms": round(self.latency_ms, 1),
|
||||
"reason": self.reason,
|
||||
"checked_at": self.checked_at,
|
||||
"from_cache": self.from_cache,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OllamaHealthMonitor
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class OllamaHealthMonitor:
|
||||
"""
|
||||
Ollama 三層健康檢測
|
||||
|
||||
層 1:連通性(/api/tags,5s timeout)
|
||||
層 2:推理測試(/api/generate,35s timeout)
|
||||
層 3:GPU 記憶體(optional,via SSH MCP)
|
||||
|
||||
結果 Redis 快取 30s,防 health check storm。
|
||||
|
||||
2026-04-25 Claude Engineer-C (P1.1a)
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._settings = get_settings()
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Public API
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def check(self, host: str) -> HealthReport:
|
||||
"""
|
||||
執行健康檢測(含 Redis 快取)
|
||||
|
||||
Args:
|
||||
host: Ollama 主機 URL,e.g. "http://192.168.0.111:11434"
|
||||
|
||||
Returns:
|
||||
HealthReport
|
||||
"""
|
||||
# 嘗試從快取取得
|
||||
cached = await self._get_cached(host)
|
||||
if cached is not None:
|
||||
cached.from_cache = True
|
||||
logger.debug("ollama_health_cache_hit", host=host, status=cached.status.value)
|
||||
return cached
|
||||
|
||||
# 執行實際健康檢測
|
||||
report = await self._run_checks(host)
|
||||
report.host = host
|
||||
|
||||
# 寫入快取(失敗不影響功能,外部 exception 也靜默)
|
||||
try:
|
||||
await self._set_cached(host, report)
|
||||
except Exception as e:
|
||||
logger.debug("ollama_health_set_cached_outer_failed", host=host, error=str(e))
|
||||
|
||||
logger.info(
|
||||
"ollama_health_checked",
|
||||
host=host,
|
||||
status=report.status.value,
|
||||
latency_ms=round(report.latency_ms, 1),
|
||||
reason=report.reason,
|
||||
)
|
||||
|
||||
# 寫入 audit_log(best-effort)
|
||||
await self._write_audit_log(host, report)
|
||||
|
||||
return report
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# 三層檢查
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _run_checks(self, host: str) -> HealthReport:
|
||||
"""依序執行三層檢查"""
|
||||
# 層 1:連通性
|
||||
connectivity_ok = await self._check_connectivity(host)
|
||||
if not connectivity_ok:
|
||||
return HealthReport(
|
||||
status=HealthStatus.OFFLINE,
|
||||
host=host,
|
||||
reason="連通性失敗:/api/tags 無回應",
|
||||
)
|
||||
|
||||
# 層 2:推理測試
|
||||
report = await self._check_inference(host)
|
||||
return report
|
||||
|
||||
async def _check_connectivity(self, host: str) -> bool:
|
||||
"""
|
||||
層 1:連通性檢查
|
||||
GET /api/tags,5s timeout,回傳 200 即通過
|
||||
"""
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=httpx.Timeout(CONNECTIVITY_TIMEOUT_SECONDS)
|
||||
) as client:
|
||||
resp = await client.get(f"{host}/api/tags")
|
||||
return resp.status_code == 200
|
||||
except (httpx.TimeoutException, httpx.ConnectError, httpx.NetworkError):
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning("ollama_connectivity_check_error", host=host, error=str(e))
|
||||
return False
|
||||
|
||||
async def _check_inference(self, host: str) -> HealthReport:
|
||||
"""
|
||||
層 2:推理測試
|
||||
POST /api/generate,35s timeout,依延遲分級
|
||||
"""
|
||||
model = self._settings.OLLAMA_HEALTH_CHECK_MODEL
|
||||
start = time.perf_counter()
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=httpx.Timeout(INFERENCE_TIMEOUT_SECONDS)
|
||||
) as client:
|
||||
resp = await client.post(
|
||||
f"{host}/api/generate",
|
||||
json={
|
||||
"model": model,
|
||||
"prompt": "hi",
|
||||
"stream": False,
|
||||
"options": {"num_predict": 1},
|
||||
},
|
||||
)
|
||||
latency_ms = (time.perf_counter() - start) * 1000
|
||||
|
||||
if resp.status_code != 200:
|
||||
return HealthReport(
|
||||
status=HealthStatus.DEGRADED,
|
||||
latency_ms=latency_ms,
|
||||
reason=f"推理回傳 HTTP {resp.status_code}",
|
||||
)
|
||||
|
||||
# 分級判斷
|
||||
if latency_ms < LATENCY_HEALTHY_THRESHOLD_MS:
|
||||
return HealthReport(
|
||||
status=HealthStatus.HEALTHY,
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
elif latency_ms < LATENCY_SLOW_THRESHOLD_MS:
|
||||
return HealthReport(
|
||||
status=HealthStatus.SLOW,
|
||||
latency_ms=latency_ms,
|
||||
reason=f"推理延遲 {latency_ms:.0f}ms(slow zone)",
|
||||
)
|
||||
else:
|
||||
return HealthReport(
|
||||
status=HealthStatus.DEGRADED,
|
||||
latency_ms=latency_ms,
|
||||
reason=f"推理延遲 {latency_ms:.0f}ms(>30s)",
|
||||
)
|
||||
|
||||
except (httpx.TimeoutException, asyncio.TimeoutError):
|
||||
latency_ms = (time.perf_counter() - start) * 1000
|
||||
return HealthReport(
|
||||
status=HealthStatus.DEGRADED,
|
||||
latency_ms=latency_ms,
|
||||
reason="推理超時 >35s",
|
||||
)
|
||||
except (httpx.ConnectError, httpx.NetworkError) as e:
|
||||
return HealthReport(
|
||||
status=HealthStatus.OFFLINE,
|
||||
reason=f"推理連接失敗:{e}",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("ollama_inference_check_error", host=host, error=str(e))
|
||||
return HealthReport(
|
||||
status=HealthStatus.DEGRADED,
|
||||
reason=f"推理測試例外:{e}",
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Redis 快取
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def _cache_key(self, host: str) -> str:
|
||||
"""生成 Redis cache key(內部使用,委派至 make_cache_key)"""
|
||||
return make_cache_key(host)
|
||||
|
||||
async def _get_cached(self, host: str) -> HealthReport | None:
|
||||
"""從 Redis 取得快取的 HealthReport,失敗返回 None"""
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
raw = await redis.get(self._cache_key(host))
|
||||
if not raw:
|
||||
return None
|
||||
data = json.loads(raw)
|
||||
return HealthReport(
|
||||
status=HealthStatus(data["status"]),
|
||||
host=data.get("host", host),
|
||||
latency_ms=data.get("latency_ms", 0.0),
|
||||
reason=data.get("reason", ""),
|
||||
checked_at=data.get("checked_at", time.time()),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("ollama_health_cache_get_failed", host=host, error=str(e))
|
||||
return None
|
||||
|
||||
async def _set_cached(self, host: str, report: HealthReport) -> None:
|
||||
"""寫入 Redis 快取,失敗靜默(不影響功能)"""
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
data = json.dumps(report.to_dict())
|
||||
await redis.set(self._cache_key(host), data, ex=REDIS_CACHE_TTL_SECONDS)
|
||||
except Exception as e:
|
||||
logger.debug("ollama_health_cache_set_failed", host=host, error=str(e))
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Audit Log (structured log,AuditLog 表設計用於 K8s 操作審計,不適合此場景)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
async def _write_audit_log(self, host: str, report: HealthReport) -> None:
|
||||
"""記錄健康檢測結果(structlog,service=ollama_health_monitor)"""
|
||||
logger.info(
|
||||
"ollama_health_monitor_audit",
|
||||
service="ollama_health_monitor",
|
||||
host=host,
|
||||
status=report.status.value,
|
||||
latency_ms=round(report.latency_ms, 1),
|
||||
reason=report.reason,
|
||||
is_usable=report.is_usable(),
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
_monitor: OllamaHealthMonitor | None = None
|
||||
|
||||
|
||||
def get_ollama_health_monitor() -> OllamaHealthMonitor:
|
||||
"""取得 OllamaHealthMonitor singleton"""
|
||||
global _monitor
|
||||
if _monitor is None:
|
||||
_monitor = OllamaHealthMonitor()
|
||||
return _monitor
|
||||
|
||||
|
||||
def reset_ollama_health_monitor() -> None:
|
||||
"""重置 singleton(測試用)"""
|
||||
global _monitor
|
||||
_monitor = None
|
||||
Reference in New Issue
Block a user