fix(ollama): 修復容災鏈四大 bug — OFFLINE cache 放大 + SLOW 路由缺失 + recovery 命名不一致 + 告警顯示
All checks were successful
Code Review / ai-code-review (push) Successful in 48s
All checks were successful
Code Review / ai-code-review (push) Successful in 48s
根因:NetworkPolicy reload/CNI 瞬態抖動導致三台 Ollama 同時 OFFLINE,被 30s Redis cache 放大
→ 後續 30s 所有請求誤走 Gemini,燒 quota
B1 ollama_health_monitor: OFFLINE TTL 從 30s 縮短至 5s,儘速重試
B3 ollama_health_monitor: inference ConnectError 改判 DEGRADED(connectivity 通了不算 OFFLINE)
B5/B6 ollama_auto_recovery: _current_primary 預設改 "ollama_gcp_a",比對改 startswith("ollama_")
SLOW 修復: failover_manager SLOW 節點視為可用(優於 Gemini quota 耗盡)
SLOW 修復: auto_recovery SLOW 也計入 recovery counter(GCP 高負載仍可切回)
告警顯示: _provider_display 加入 GCP-A/B/Local 具體伺服器識別
告警顯示: _format_automation_block 加入 Token 用量行
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -110,7 +110,11 @@ class OllamaAutoRecoveryService:
|
|||||||
self._settings = get_settings()
|
self._settings = get_settings()
|
||||||
|
|
||||||
# 狀態追蹤
|
# 狀態追蹤
|
||||||
self._current_primary: str = "ollama" # "ollama" / "gemini" / "fallback"
|
# 2026-05-04 ogt: B5 修復 — 改用與 failover_manager callback 一致的命名
|
||||||
|
# failover_manager 傳入的是 "ollama_gcp_a"/"ollama_gcp_b"/"ollama_local"/"gemini"
|
||||||
|
# 原設計用 "ollama" 造成 != "ollama" 永遠成立 → recovery loop 永遠以為在 Gemini
|
||||||
|
# 修法:判斷改用 _is_ollama_primary(),比對 startswith("ollama_")
|
||||||
|
self._current_primary: str = "ollama_gcp_a" # failover_manager 傳入的 provider_name
|
||||||
self._consecutive_healthy: int = 0
|
self._consecutive_healthy: int = 0
|
||||||
self._task: asyncio.Task | None = None
|
self._task: asyncio.Task | None = None
|
||||||
|
|
||||||
@@ -213,8 +217,10 @@ class OllamaAutoRecoveryService:
|
|||||||
# Redis 持久化(跨重啟恢復)
|
# Redis 持久化(跨重啟恢復)
|
||||||
await self._persist_primary(provider)
|
await self._persist_primary(provider)
|
||||||
|
|
||||||
if provider != "ollama":
|
# 2026-05-04 ogt: B5/B6 修復 — 判斷改用 startswith("ollama_")
|
||||||
# 切換到非 Ollama → 重置 counter,開始監控恢復
|
# 原設計:provider != "ollama",但 callback 傳 "ollama_gcp_a" → 永遠觸發 tracking
|
||||||
|
if not provider.startswith("ollama_"):
|
||||||
|
# 切換到非 Ollama(gemini/nemotron/claude)→ 重置 counter,開始監控恢復
|
||||||
self._consecutive_healthy = 0
|
self._consecutive_healthy = 0
|
||||||
logger.info(
|
logger.info(
|
||||||
"ollama_auto_recovery_tracking_started",
|
"ollama_auto_recovery_tracking_started",
|
||||||
@@ -336,7 +342,11 @@ class OllamaAutoRecoveryService:
|
|||||||
self._consecutive_healthy = 0
|
self._consecutive_healthy = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
if health.status == HealthStatus.HEALTHY:
|
# 2026-05-04 ogt: SLOW 視為可用(GCP 高負載 10-30s 仍優於 Gemini quota 限制)
|
||||||
|
# 原設計只接受 HEALTHY,GCP 若因負載偏高落入 SLOW 區會永久卡在 Gemini
|
||||||
|
is_usable = health.status in (HealthStatus.HEALTHY, HealthStatus.SLOW)
|
||||||
|
|
||||||
|
if is_usable:
|
||||||
self._consecutive_healthy += 1
|
self._consecutive_healthy += 1
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"ollama_auto_recovery_healthy_tick",
|
"ollama_auto_recovery_healthy_tick",
|
||||||
@@ -344,15 +354,16 @@ class OllamaAutoRecoveryService:
|
|||||||
consecutive=self._consecutive_healthy,
|
consecutive=self._consecutive_healthy,
|
||||||
required=self._stable_count_required,
|
required=self._stable_count_required,
|
||||||
current_primary=self._current_primary,
|
current_primary=self._current_primary,
|
||||||
|
actual_status=health.status.value,
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self._consecutive_healthy >= self._stable_count_required
|
self._consecutive_healthy >= self._stable_count_required
|
||||||
and self._current_primary != "ollama"
|
and not self._current_primary.startswith("ollama_")
|
||||||
):
|
):
|
||||||
await self._switch_back_to_ollama()
|
await self._switch_back_to_ollama()
|
||||||
else:
|
else:
|
||||||
# 非 HEALTHY → counter 歸零,繼續等
|
# DEGRADED / OFFLINE → counter 歸零,繼續等
|
||||||
if self._consecutive_healthy > 0:
|
if self._consecutive_healthy > 0:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"ollama_auto_recovery_counter_reset",
|
"ollama_auto_recovery_counter_reset",
|
||||||
|
|||||||
@@ -352,7 +352,42 @@ class OllamaFailoverManager:
|
|||||||
health_local=health_local,
|
health_local=health_local,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 全部 Ollama 不健康 → Gemini
|
# 2026-05-04 ogt: SLOW 容災備援(外網同時抖動時,SLOW Ollama 仍優於 Gemini quota 耗盡)
|
||||||
|
# 原設計:三層全部非 HEALTHY 直接切 Gemini
|
||||||
|
# 問題:111 關機 + GCP 雙外網抖動 → 三節點同時 SLOW → 誤飛 Gemini → 燒 quota
|
||||||
|
# 修法:SLOW 節點視為可用,按優先序選最佳 SLOW 節點
|
||||||
|
if health_gcp_a.status == HealthStatus.SLOW:
|
||||||
|
return OllamaRoutingResult(
|
||||||
|
primary=ep_gcp_a,
|
||||||
|
fallback_chain=[ep_gcp_b, ep_local, _GEMINI_ENDPOINT],
|
||||||
|
routing_reason=f"GCP-A SLOW(降級可用)→ primary GCP-A at {now_ts}",
|
||||||
|
health_gcp_a=health_gcp_a,
|
||||||
|
health_gcp_b=health_gcp_b,
|
||||||
|
health_local=health_local,
|
||||||
|
)
|
||||||
|
if health_gcp_b.status == HealthStatus.SLOW:
|
||||||
|
return OllamaRoutingResult(
|
||||||
|
primary=ep_gcp_b,
|
||||||
|
fallback_chain=[ep_local, _GEMINI_ENDPOINT],
|
||||||
|
routing_reason=f"GCP-A {health_gcp_a.status.value} + GCP-B SLOW(降級可用)→ 切 GCP-B at {now_ts}",
|
||||||
|
health_gcp_a=health_gcp_a,
|
||||||
|
health_gcp_b=health_gcp_b,
|
||||||
|
health_local=health_local,
|
||||||
|
)
|
||||||
|
if health_local.status == HealthStatus.SLOW:
|
||||||
|
return OllamaRoutingResult(
|
||||||
|
primary=ep_local,
|
||||||
|
fallback_chain=[_GEMINI_ENDPOINT],
|
||||||
|
routing_reason=(
|
||||||
|
f"GCP-A {health_gcp_a.status.value} + GCP-B {health_gcp_b.status.value}"
|
||||||
|
f" + Local SLOW(降級可用)→ 切 Local(111) at {now_ts}"
|
||||||
|
),
|
||||||
|
health_gcp_a=health_gcp_a,
|
||||||
|
health_gcp_b=health_gcp_b,
|
||||||
|
health_local=health_local,
|
||||||
|
)
|
||||||
|
|
||||||
|
# 全部 Ollama 不可用(DEGRADED/OFFLINE)→ Gemini
|
||||||
return OllamaRoutingResult(
|
return OllamaRoutingResult(
|
||||||
primary=_GEMINI_ENDPOINT,
|
primary=_GEMINI_ENDPOINT,
|
||||||
fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
|
fallback_chain=[_NEMOTRON_ENDPOINT, _CLAUDE_ENDPOINT],
|
||||||
|
|||||||
@@ -37,7 +37,11 @@ logger = structlog.get_logger(__name__)
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
REDIS_CACHE_KEY_PREFIX = "ollama_health:"
|
REDIS_CACHE_KEY_PREFIX = "ollama_health:"
|
||||||
REDIS_CACHE_TTL_SECONDS = 30 # 防 health check storm
|
REDIS_CACHE_TTL_SECONDS = 30 # HEALTHY/SLOW/DEGRADED 快取 30s 防 check storm
|
||||||
|
# 2026-05-04 ogt: B1 修復 — OFFLINE 快取 TTL 縮短至 5s
|
||||||
|
# 根因:NetworkPolicy reload/CNI 瞬態抖動導致三台同時 OFFLINE,被 30s cache 放大
|
||||||
|
# OFFLINE 狀態快取越短越好,讓系統儘快重新評估並切回
|
||||||
|
REDIS_CACHE_TTL_OFFLINE_SECONDS = 5 # OFFLINE 只快取 5s,儘速重試
|
||||||
|
|
||||||
CONNECTIVITY_TIMEOUT_SECONDS = 5.0
|
CONNECTIVITY_TIMEOUT_SECONDS = 5.0
|
||||||
# 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到
|
# 2026-04-25 critic-fix H3 by Claude Engineer-C — 45s 讓 SLOW 門檻(30s)真的能觀察到
|
||||||
@@ -289,9 +293,13 @@ class OllamaHealthMonitor:
|
|||||||
reason="推理超時 >35s",
|
reason="推理超時 >35s",
|
||||||
)
|
)
|
||||||
except (httpx.ConnectError, httpx.NetworkError) as e:
|
except (httpx.ConnectError, httpx.NetworkError) as e:
|
||||||
|
# 2026-05-04 ogt: B3 修復 — connectivity 已通過,推理階段 ConnectError 改判 DEGRADED
|
||||||
|
# 原設計:ConnectError → OFFLINE,但 /api/tags 已成功,表示主機存活
|
||||||
|
# 根因:socket 半開(GCP LB 回收 idle conn)或 Ollama 進程重啟,屬瞬態
|
||||||
|
# 注意:依 _set_cached 的 TTL 規則,只有 OFFLINE 使用 5s 快取;DEGRADED 仍以 REDIS_CACHE_TTL_SECONDS(30s)快取,並非下次請求立刻重試
|
||||||
return HealthReport(
|
return HealthReport(
|
||||||
status=HealthStatus.OFFLINE,
|
status=HealthStatus.DEGRADED,
|
||||||
reason=f"推理連接失敗:{e}",
|
reason=f"推理連接失敗(主機可達,socket 瞬斷):{e}",
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("ollama_inference_check_error", host=host, error=str(e))
|
logger.warning("ollama_inference_check_error", host=host, error=str(e))
|
||||||
@@ -329,12 +337,19 @@ class OllamaHealthMonitor:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
async def _set_cached(self, host: str, report: HealthReport) -> None:
|
async def _set_cached(self, host: str, report: HealthReport) -> None:
|
||||||
"""寫入 Redis 快取,失敗靜默(不影響功能)"""
|
"""寫入 Redis 快取,失敗靜默(不影響功能)
|
||||||
|
2026-05-04 ogt: OFFLINE 結果快取 5s(縮短),其他狀態快取 30s
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from src.core.redis_client import get_redis
|
from src.core.redis_client import get_redis
|
||||||
redis = get_redis()
|
redis = get_redis()
|
||||||
data = json.dumps(report.to_dict())
|
data = json.dumps(report.to_dict())
|
||||||
await redis.set(self._cache_key(host), data, ex=REDIS_CACHE_TTL_SECONDS)
|
ttl = (
|
||||||
|
REDIS_CACHE_TTL_OFFLINE_SECONDS
|
||||||
|
if report.status == HealthStatus.OFFLINE
|
||||||
|
else REDIS_CACHE_TTL_SECONDS
|
||||||
|
)
|
||||||
|
await redis.set(self._cache_key(host), data, ex=ttl)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug("ollama_health_cache_set_failed", host=host, error=str(e))
|
logger.debug("ollama_health_cache_set_failed", host=host, error=str(e))
|
||||||
|
|
||||||
|
|||||||
@@ -218,9 +218,15 @@ class TelegramMessage:
|
|||||||
nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms)
|
nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms)
|
||||||
|
|
||||||
def _provider_display(self) -> tuple[str, str]:
|
def _provider_display(self) -> tuple[str, str]:
|
||||||
"""Return display provider and optional model suffix."""
|
"""Return display provider and optional model suffix.
|
||||||
|
2026-05-04 ogt: 加入具體 Ollama 伺服器顯示(GCP-A/B/Local)
|
||||||
|
"""
|
||||||
provider_names = {
|
provider_names = {
|
||||||
"ollama": "Ollama",
|
"ollama": "Ollama",
|
||||||
|
# 2026-05-04 ogt: ADR-110 三層容災具體伺服器識別
|
||||||
|
"ollama_gcp_a": "Ollama GCP-A (34.143.170.20)",
|
||||||
|
"ollama_gcp_b": "Ollama GCP-B (34.21.145.224)",
|
||||||
|
"ollama_local": "Ollama Local (111)",
|
||||||
"gemini": "Gemini",
|
"gemini": "Gemini",
|
||||||
"claude": "Claude",
|
"claude": "Claude",
|
||||||
"nvidia": "Nemotron",
|
"nvidia": "Nemotron",
|
||||||
@@ -249,7 +255,9 @@ class TelegramMessage:
|
|||||||
return "safe_gate_pending"
|
return "safe_gate_pending"
|
||||||
|
|
||||||
def _format_automation_block(self) -> str:
|
def _format_automation_block(self) -> str:
|
||||||
"""Visible AI automation chain for every ACTION REQUIRED card."""
|
"""Visible AI automation chain for every ACTION REQUIRED card.
|
||||||
|
2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
|
||||||
|
"""
|
||||||
provider_display, model_suffix = self._provider_display()
|
provider_display, model_suffix = self._provider_display()
|
||||||
mode = self._automation_mode()
|
mode = self._automation_mode()
|
||||||
openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded"
|
openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded"
|
||||||
@@ -258,6 +266,12 @@ class TelegramMessage:
|
|||||||
elephant_state = "timeline_km_pending"
|
elephant_state = "timeline_km_pending"
|
||||||
flow = "webhook>investigator>router>llm/rule>safe>approval"
|
flow = "webhook>investigator>router>llm/rule>safe>approval"
|
||||||
|
|
||||||
|
# 2026-05-04 ogt: Token 用量顯示(有資料才顯示)
|
||||||
|
token_line = ""
|
||||||
|
if self.ai_tokens > 0:
|
||||||
|
cost_str = f" / ${self.ai_cost:.4f}" if self.ai_cost > 0 else ""
|
||||||
|
token_line = f"├ Tokens:<code>{self.ai_tokens:,}{cost_str}</code>\n"
|
||||||
|
|
||||||
return (
|
return (
|
||||||
f"🤖 <b>AI 自動化鏈路</b>\n"
|
f"🤖 <b>AI 自動化鏈路</b>\n"
|
||||||
f"├ Router:<code>{html.escape(provider_display)}{model_suffix}</code>\n"
|
f"├ Router:<code>{html.escape(provider_display)}{model_suffix}</code>\n"
|
||||||
@@ -266,6 +280,7 @@ class TelegramMessage:
|
|||||||
f"NemoTron:<code>{html.escape(nemotron_state)}</code>\n"
|
f"NemoTron:<code>{html.escape(nemotron_state)}</code>\n"
|
||||||
f"├ Hermes:<code>{html.escape(hermes_state)}</code> | "
|
f"├ Hermes:<code>{html.escape(hermes_state)}</code> | "
|
||||||
f"ElephantAlpha:<code>{html.escape(elephant_state)}</code>\n"
|
f"ElephantAlpha:<code>{html.escape(elephant_state)}</code>\n"
|
||||||
|
f"{token_line}"
|
||||||
f"└ Flow:<code>{flow}</code>\n"
|
f"└ Flow:<code>{flow}</code>\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user