From bd942e9427f4d7ce38eb7d12cb236e1e95d4a3af Mon Sep 17 00:00:00 2001 From: OoO Date: Thu, 18 Jun 2026 13:46:30 +0800 Subject: [PATCH] fix: skip unhealthy direct ollama probes --- .env.example | 2 + config.py | 2 +- docs/AI_INTELLIGENCE_MODULE_SOT.md | 5 +- .../current_execution_queue_20260524.md | 6 ++ services/ollama_service.py | 84 ++++++++++++++++++- tests/test_ollama_resolve.py | 38 +++++++++ 6 files changed, 132 insertions(+), 5 deletions(-) diff --git a/.env.example b/.env.example index 237d9bb..1759e95 100644 --- a/.env.example +++ b/.env.example @@ -430,6 +430,8 @@ OLLAMA_HOST_SECONDARY=http://34.21.145.224:11434 OLLAMA_HOST_FALLBACK=http://192.168.0.111:11434 OLLAMA_HOST_PRIMARY_PROXY=http://192.168.0.110:11435 OLLAMA_HOST_SECONDARY_PROXY=http://192.168.0.110:11436 +OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED=true +OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES=20 OLLAMA_MODEL=gemma3:4b OLLAMA_TIMEOUT=120 OLLAMA_COPY_TIMEOUT=180 diff --git a/config.py b/config.py index e3e626c..78ca06e 100644 --- a/config.py +++ b/config.py @@ -402,7 +402,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.626" +SYSTEM_VERSION = "V10.627" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index d725378..8794af3 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -1,8 +1,8 @@ # PChome 業績成長自動化作戰系統 — AI 競價情報模組 Single Source of Truth > **最後更新**: 2026-06-18 (台北時間) -> **狀態**: 🟢 四 AI Agent 自動化閉環已落地;LLM 路由紅線升級為 Ollama-first 三主機級聯;PChome 後台業績匯入韌性已補強;產品定位正名為「PChome 業績成長自動化作戰系統」;外部市場來源正規化層、自動同步、作戰清單與價格參考表優先讀取、CSV 備援預檢、前台操作入口、高可見頁面繁中化守門、比價/作戰 UI 工作台化、GCP embedding 熔斷延後處理與 110 proxy rescue 已建立 -> **適用版本**: V10.626 +> **狀態**: 🟢 四 AI Agent 自動化閉環已落地;LLM 路由紅線升級為 Ollama-first 三主機級聯;PChome 
後台業績匯入韌性已補強;產品定位正名為「PChome 業績成長自動化作戰系統」;外部市場來源正規化層、自動同步、作戰清單與價格參考表優先讀取、CSV 備援預檢、前台操作入口、高可見頁面繁中化守門、比價/作戰 UI 工作台化、GCP embedding 熔斷延後處理、110 proxy rescue 與 direct host health skip 已建立 +> **適用版本**: V10.627 --- @@ -11,6 +11,7 @@ - 所有 AI Agent、LLM 推理與 embedding 預設必須走 Ollama 三主機級聯:GCP-A `34.143.170.20:11434` → GCP-B `34.21.145.224:11434` → 111 `192.168.0.111:11434`。 - `services/ollama_service.resolve_ollama_host()` 是主機解析契約;`OLLAMA_HOST`、`HERMES_URL`、`EMBEDDING_HOST`、`OLLAMA_API_BASE` 只接受 GCP-A / GCP-B / 111 或 110 的核准轉發端口。 - 188 直連 GCP-A / GCP-B timeout 時,resolver 可先使用同順位 110 proxy rescue:GCP-A direct → `192.168.0.110:11435` → GCP-B direct → `192.168.0.110:11436` → 111。proxy rescue 只是同一順位的可用入口,不代表 GCP direct host 已恢復。 +- `OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED=true` 時,resolver 會讀最近 `host_health_probes`;若 direct GCP-A/GCP-B 在視窗內已被判定不健康,會直接略過該 direct endpoint,先試同順位 proxy rescue,避免每 120 秒 cache refresh 都等待 direct timeout。此 skip 只套用 direct GCP,不套用 110 proxy。 - `config.OLLAMA_HOST`、`config.HERMES_URL`、`config.EMBEDDING_HOST` 只保留為舊 caller 相容常數;import-time 不得 probe network,也不得因 GCP-A/GCP-B 短暫不可用而 freeze 到 111。需要即時路由時一律呼叫 `get_ollama_host()`、`get_hermes_url()`、`get_embedding_host()` 或 `OllamaService`。 - Gemini 只能作為 Ollama 主路徑失敗後的備援;MCP Grounding、PPT/vision、週/月報、Code Review、EA HITL、複雜 SKU 升級等舊鎖定場景也必須先走 GCP-A → GCP-B → 111。 - 188 `192.168.0.188` 僅是 App / DB / scheduler / Telegram bot 容器宿主與 AutoHeal target,不可作為 Ollama 節點。 diff --git a/docs/memory/current_execution_queue_20260524.md b/docs/memory/current_execution_queue_20260524.md index d29272f..d971367 100644 --- a/docs/memory/current_execution_queue_20260524.md +++ b/docs/memory/current_execution_queue_20260524.md @@ -324,3 +324,9 @@ - V10.626 新增 `OLLAMA_HOST_PRIMARY_PROXY` / `OLLAMA_HOST_SECONDARY_PROXY`,預設為 `http://192.168.0.110:11435` / `http://192.168.0.110:11436`。 - `resolve_ollama_host()` 順序調整為 GCP-A direct → GCP-A via 110 proxy → GCP-B direct → GCP-B via 110 proxy → 111;proxy rescue 是同順位入口救援,不代表 direct GCP host 已恢復。 - 近 24 
小時 `ai_calls` 只有 `ollama_secondary=51`、`gcp_ollama=3`、`nim=1`,沒有 Gemini provider;Gemini hard disabled / fallback disabled 的紅線仍有效。 + +## 30. 2026-06-18 V10.627 Resolver 讀 host_health 跳過 direct timeout + +- V10.626 已能在 GCP-A direct timeout 後走 110 proxy,但 cache refresh 仍會先等一次 direct `/api/version` timeout。 +- V10.627 新增 direct-only host health skip:`resolve_ollama_host()` 會讀最近 `host_health_probes`,若 GCP-A/GCP-B direct 在視窗內已 unhealthy,先跳過 direct endpoint,改試同順位 110 proxy;proxy rescue 不吃這個 skip,避免因 direct unhealthy 誤跳過可用 proxy。 +- 新增 `OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED=true` 與 `OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES=20`;DB 讀取失敗 fail-open,回到原本網路探測。 diff --git a/services/ollama_service.py b/services/ollama_service.py index b69c15f..40ae909 100644 --- a/services/ollama_service.py +++ b/services/ollama_service.py @@ -211,6 +211,84 @@ def _host_label_for_embedding_health(host: str) -> str: return '' +def _host_label_for_direct_health(host: str) -> str: + """Map only direct GCP Ollama URLs to host_health_probes labels.""" + if not host: + return '' + if '34.143.170.20:11434' in host: + return 'Primary (GCP)' + if '34.21.145.224:11434' in host: + return 'Secondary (GCP)' + return '' + + +def _recent_direct_host_unhealthy(host: str) -> bool: + """Skip recent unhealthy direct GCP endpoints before trying proxy rescue. + + This is deliberately direct-host only. Proxy rescue URLs must still be + probed even when the direct GCP endpoint is unhealthy. 
+ """ + if not _env_flag('OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED', True): + return False + + host_label = _host_label_for_direct_health(host) + if not host_label: + return False + + try: + window_minutes = int(os.getenv('OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES', '20')) + except (TypeError, ValueError): + window_minutes = 20 + window_minutes = max(1, window_minutes) + + try: + from sqlalchemy import text as sa_text + from database.manager import get_session + + session = get_session() + try: + row = session.execute( + sa_text(""" + SELECT healthy, error_msg, probed_at + FROM host_health_probes + WHERE host_label = :host_label + ORDER BY probed_at DESC + LIMIT 1 + """), + {'host_label': host_label}, + ).fetchone() + finally: + session.close() + except Exception: + logger.debug("[OllamaHost] direct host health skip fail-open for host=%s", host, exc_info=True) + return False + + if not row: + return False + + healthy, error_msg, probed_at = row[0], row[1], row[2] + if probed_at: + try: + now = datetime.now(probed_at.tzinfo) if getattr(probed_at, 'tzinfo', None) else datetime.now() + if now - probed_at > timedelta(minutes=window_minutes): + return False + except Exception: + logger.debug("[OllamaHost] could not evaluate host health probe age for host=%s", host, exc_info=True) + return False + + if bool(healthy): + return False + + logger.warning( + "[OllamaHost] skip recent unhealthy direct host=%s label=%s window=%sm error=%s", + host, + host_label, + window_minutes, + (error_msg or '')[:180], + ) + return True + + def _recent_embedding_host_unhealthy(host: str) -> bool: """Skip known-bad GCP embedding runtimes using recent host_health_probes rows. 
def test_resolve_skips_recent_unhealthy_direct_primary_and_uses_proxy(monkeypatch):
    """Once host_health marks GCP-A direct unhealthy, skip the direct timeout wait."""
    from datetime import datetime
    from services import ollama_service as oss

    class _ProbeRow:
        # Newest probe row: unhealthy, timed out, probed just now.
        def fetchone(self):
            return (False, "ConnectTimeout", datetime.now())

    class _StubSession:
        def execute(self, *args, **kwargs):
            return _ProbeRow()

        def close(self):
            pass

    direct_probe = f"{oss.OLLAMA_HOST_PRIMARY}/api/version"
    proxy_probe = f"{oss.OLLAMA_HOST_PRIMARY_PROXY}/api/version"
    ok_response = MagicMock(status_code=200)
    probed_urls = []

    def fake_get(url, timeout=None):
        probed_urls.append(url)
        # The direct endpoint must never be contacted in this scenario.
        if url == direct_probe:
            raise AssertionError("recent unhealthy direct host should be skipped")
        if url == proxy_probe:
            return ok_response
        raise AssertionError(f"should not reach {url}")

    monkeypatch.setenv("OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED", "true")
    monkeypatch.setenv("OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES", "20")
    # The stub class itself serves as the zero-argument session factory.
    monkeypatch.setattr("database.manager.get_session", _StubSession)

    with patch('services.ollama_service.requests.get', side_effect=fake_get):
        resolved = oss.resolve_ollama_host()

    assert resolved == oss.OLLAMA_HOST_PRIMARY_PROXY
    assert probed_urls == [proxy_probe]