fix: skip unhealthy direct ollama probes
All checks were successful
CD Pipeline / deploy (push) Successful in 1m10s

This commit is contained in:
OoO
2026-06-18 13:46:30 +08:00
parent ee50e440ce
commit bd942e9427
6 changed files with 132 additions and 5 deletions

View File

@@ -430,6 +430,8 @@ OLLAMA_HOST_SECONDARY=http://34.21.145.224:11434
OLLAMA_HOST_FALLBACK=http://192.168.0.111:11434
OLLAMA_HOST_PRIMARY_PROXY=http://192.168.0.110:11435
OLLAMA_HOST_SECONDARY_PROXY=http://192.168.0.110:11436
OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED=true
OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES=20
OLLAMA_MODEL=gemma3:4b
OLLAMA_TIMEOUT=120
OLLAMA_COPY_TIMEOUT=180

View File

@@ -402,7 +402,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
# ==========================================
# 系統版本與路徑
# ==========================================
SYSTEM_VERSION = "V10.626"
SYSTEM_VERSION = "V10.627"
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
public_url = PUBLIC_URL # 用於模板顯示

View File

@@ -1,8 +1,8 @@
# PChome 業績成長自動化作戰系統 — AI 競價情報模組 Single Source of Truth
> **最後更新**: 2026-06-18 (台北時間)
> **狀態**: 🟢 四 AI Agent 自動化閉環已落地LLM 路由紅線升級為 Ollama-first 三主機級聯PChome 後台業績匯入韌性已補強產品定位正名為「PChome 業績成長自動化作戰系統」外部市場來源正規化層、自動同步、作戰清單與價格參考表優先讀取、CSV 備援預檢、前台操作入口、高可見頁面繁中化守門、比價/作戰 UI 工作台化、GCP embedding 熔斷延後處理110 proxy rescue 已建立
> **適用版本**: V10.626
> **狀態**: 🟢 四 AI Agent 自動化閉環已落地；LLM 路由紅線升級為 Ollama-first 三主機級聯；PChome 後台業績匯入韌性已補強；產品定位正名為「PChome 業績成長自動化作戰系統」；外部市場來源正規化層、自動同步、作戰清單與價格參考表優先讀取、CSV 備援預檢、前台操作入口、高可見頁面繁中化守門、比價/作戰 UI 工作台化、GCP embedding 熔斷延後處理、110 proxy rescue 與 direct host health skip 已建立
> **適用版本**: V10.627
---
@@ -11,6 +11,7 @@
- 所有 AI Agent、LLM 推理與 embedding 預設必須走 Ollama 三主機級聯：GCP-A `34.143.170.20:11434` → GCP-B `34.21.145.224:11434` → 111 `192.168.0.111:11434`。
- `services/ollama_service.resolve_ollama_host()` 是主機解析契約；`OLLAMA_HOST`、`HERMES_URL`、`EMBEDDING_HOST`、`OLLAMA_API_BASE` 只接受 GCP-A / GCP-B / 111 或 110 的核准轉發端口。
- 188 直連 GCP-A / GCP-B timeout 時，resolver 可先使用同順位 110 proxy rescue：GCP-A direct → `192.168.0.110:11435` → GCP-B direct → `192.168.0.110:11436` → 111。proxy rescue 只是同一順位的可用入口，不代表 GCP direct host 已恢復。
- `OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED=true`：resolver 會讀最近 `host_health_probes`；若 direct GCP-A/GCP-B 在視窗內已被判定不健康，會直接略過該 direct endpoint，先試同順位 proxy rescue，避免每 120 秒 cache refresh 都等待 direct timeout。此 skip 只套用 direct GCP，不套用 110 proxy。
- `config.OLLAMA_HOST`、`config.HERMES_URL`、`config.EMBEDDING_HOST` 只保留為舊 caller 相容常數；import-time 不得 probe network，也不得因 GCP-A/GCP-B 短暫不可用而 freeze 到 111。需要即時路由時一律呼叫 `get_ollama_host()`、`get_hermes_url()`、`get_embedding_host()`、`OllamaService`。
- Gemini 只能作為 Ollama 主路徑失敗後的備援；MCP Grounding、PPT/vision、週/月報、Code Review、EA HITL、複雜 SKU 升級等舊鎖定場景也必須先走 GCP-A → GCP-B → 111。
- 188 `192.168.0.188` 僅是 App / DB / scheduler / Telegram bot 容器宿主與 AutoHeal target不可作為 Ollama 節點。

View File

@@ -324,3 +324,9 @@
- V10.626 新增 `OLLAMA_HOST_PRIMARY_PROXY` / `OLLAMA_HOST_SECONDARY_PROXY`,預設為 `http://192.168.0.110:11435` / `http://192.168.0.110:11436`
- `resolve_ollama_host()` 順序調整為 GCP-A direct → GCP-A via 110 proxy → GCP-B direct → GCP-B via 110 proxy → 111；proxy rescue 是同順位入口救援，不代表 direct GCP host 已恢復。
- 近 24 小時 `ai_calls` 只有 `ollama_secondary=51`、`gcp_ollama=3`、`nim=1`，沒有 Gemini provider；Gemini hard disabled / fallback disabled 的紅線仍有效。
## 30. 2026-06-18 V10.627 Resolver 讀 host_health 跳過 direct timeout
- V10.626 已能在 GCP-A direct timeout 後走 110 proxy，但 cache refresh 仍會先等一次 direct `/api/version` timeout。
- V10.627 新增 direct-only host health skip：`resolve_ollama_host()` 會讀最近 `host_health_probes`，若 GCP-A/GCP-B direct 在視窗內已 unhealthy，先跳過 direct endpoint，改試同順位 110 proxy；proxy rescue 不吃這個 skip，避免因 direct unhealthy 誤跳過可用 proxy。
- 新增 `OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED=true`、`OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES=20`；DB 讀取失敗 fail-open，回到原本網路探測。

View File

@@ -211,6 +211,84 @@ def _host_label_for_embedding_health(host: str) -> str:
return ''
def _host_label_for_direct_health(host: str) -> str:
"""Map only direct GCP Ollama URLs to host_health_probes labels."""
if not host:
return ''
if '34.143.170.20:11434' in host:
return 'Primary (GCP)'
if '34.21.145.224:11434' in host:
return 'Secondary (GCP)'
return ''
def _recent_direct_host_unhealthy(host: str) -> bool:
    """Report whether a direct GCP endpoint was recently probed as unhealthy.

    This is deliberately direct-host only. Proxy rescue URLs must still be
    probed even when the direct GCP endpoint is unhealthy, so any host that
    ``_host_label_for_direct_health`` does not recognise returns ``False``.

    The newest ``host_health_probes`` row for the host's label is read. The
    host counts as "recently unhealthy" only when that row is unhealthy AND
    carries a timestamp no older than
    ``OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES`` (default 20, minimum 1).

    Every uncertain situation fails open (returns ``False``) so the caller
    falls back to a normal network probe: feature flag off, DB read failure,
    no row, missing or unusable timestamp.

    Args:
        host: Ollama base URL to check.

    Returns:
        ``True`` when the direct endpoint should be skipped, else ``False``.
    """
    if not _env_flag('OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED', True):
        return False

    host_label = _host_label_for_direct_health(host)
    if not host_label:
        return False

    try:
        window_minutes = int(os.getenv('OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES', '20'))
    except (TypeError, ValueError):
        window_minutes = 20
    # A zero or negative window would make every probe look stale.
    window_minutes = max(1, window_minutes)

    try:
        # Imported lazily so a missing DB layer also lands in the fail-open path.
        from sqlalchemy import text as sa_text
        from database.manager import get_session

        session = get_session()
        try:
            row = session.execute(
                sa_text("""
                    SELECT healthy, error_msg, probed_at
                    FROM host_health_probes
                    WHERE host_label = :host_label
                    ORDER BY probed_at DESC
                    LIMIT 1
                """),
                {'host_label': host_label},
            ).fetchone()
        finally:
            session.close()
    except Exception:
        logger.debug("[OllamaHost] direct host health skip fail-open for host=%s", host, exc_info=True)
        return False

    if not row:
        return False

    healthy, error_msg, probed_at = row[0], row[1], row[2]

    if not probed_at:
        # Without a timestamp the row cannot be shown to fall inside the
        # window. Treating it as recent could skip the direct host forever.
        logger.debug("[OllamaHost] host health probe has no timestamp for host=%s", host)
        return False

    try:
        # Match the row's awareness so naive/aware datetimes are never mixed.
        now = datetime.now(probed_at.tzinfo) if getattr(probed_at, 'tzinfo', None) else datetime.now()
        if now - probed_at > timedelta(minutes=window_minutes):
            return False
    except Exception:
        # e.g. a driver that returns the timestamp as a string.
        logger.debug("[OllamaHost] could not evaluate host health probe age for host=%s", host, exc_info=True)
        return False

    if bool(healthy):
        return False

    logger.warning(
        "[OllamaHost] skip recent unhealthy direct host=%s label=%s window=%sm error=%s",
        host,
        host_label,
        window_minutes,
        (error_msg or '')[:180],
    )
    return True
def _recent_embedding_host_unhealthy(host: str) -> bool:
"""Skip known-bad GCP embedding runtimes using recent host_health_probes rows.
@@ -507,9 +585,11 @@ def resolve_ollama_host(primary: str = OLLAMA_HOST_PRIMARY,
primary_proxy = _proxy_rescue_for_primary(primary)
secondary_proxy = _proxy_rescue_for_secondary(secondary)
primary_recent_unhealthy = _recent_direct_host_unhealthy(primary)
secondary_recent_unhealthy = _recent_direct_host_unhealthy(secondary)
# B4: primary 若被標 unhealthy先嘗試同順位 110 proxy再嘗試 secondary
if not _is_unhealthy(primary) and _is_reachable(primary):
if not _is_unhealthy(primary) and not primary_recent_unhealthy and _is_reachable(primary):
selected = primary
logger.info(f"[OllamaHost] Primary 主機可用: {primary}")
elif primary_proxy and not _is_unhealthy(primary_proxy) and _is_reachable(primary_proxy):
@@ -518,7 +598,7 @@ def resolve_ollama_host(primary: str = OLLAMA_HOST_PRIMARY,
"[OllamaHost] Primary direct 不可用,使用 110 primary proxy: %s",
primary_proxy,
)
elif not _is_unhealthy(secondary) and _is_reachable(secondary):
elif not _is_unhealthy(secondary) and not secondary_recent_unhealthy and _is_reachable(secondary):
selected = secondary
logger.info(f"[OllamaHost] Primary 不可用,使用 Secondary: {secondary}")
elif secondary_proxy and not _is_unhealthy(secondary_proxy) and _is_reachable(secondary_proxy):

View File

@@ -100,6 +100,44 @@ def test_resolve_uses_primary_proxy_rescue_before_secondary():
]
def test_resolve_skips_recent_unhealthy_direct_primary_and_uses_proxy(monkeypatch):
    """When host_health already marks GCP-A direct unhealthy, do not wait for the direct timeout."""
    from datetime import datetime
    from services import ollama_service as oss

    direct_probe_url = f"{oss.OLLAMA_HOST_PRIMARY}/api/version"
    proxy_probe_url = f"{oss.OLLAMA_HOST_PRIMARY_PROXY}/api/version"

    class _StubResult:
        # Newest probe row: unhealthy, timestamped now, so inside the window.
        def fetchone(self):
            return (False, "ConnectTimeout", datetime.now())

    class _StubSession:
        def execute(self, *args, **kwargs):
            return _StubResult()

        def close(self):
            pass

    ok_response = MagicMock(status_code=200)
    probed_urls = []

    def fake_get(url, timeout=None):
        probed_urls.append(url)
        if url == direct_probe_url:
            raise AssertionError("recent unhealthy direct host should be skipped")
        if url != proxy_probe_url:
            raise AssertionError(f"should not reach {url}")
        return ok_response

    monkeypatch.setenv("OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED", "true")
    monkeypatch.setenv("OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES", "20")
    monkeypatch.setattr("database.manager.get_session", lambda: _StubSession())

    with patch('services.ollama_service.requests.get', side_effect=fake_get):
        resolved = oss.resolve_ollama_host()

    assert resolved == oss.OLLAMA_HOST_PRIMARY_PROXY
    # Only the proxy was probed; the direct endpoint was never contacted.
    assert probed_urls == [proxy_probe_url]
# ═══════════════════════════════════════════════════════════════════════════
# B4 — mark_unhealthy 行為
# ═══════════════════════════════════════════════════════════════════════════