From a00f34ce8712bed92ae8d99b4b832a62853c8c2f Mon Sep 17 00:00:00 2001 From: OoO Date: Mon, 25 May 2026 14:16:51 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5=20Ollama=20GCP=20failover=20?= =?UTF-8?q?=E8=A8=BA=E6=96=B7=E8=88=87=20unhealthy=20skip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 5 + TODO_NEXT_STEPS.txt | 2 + config.py | 2 +- docs/AI_INTELLIGENCE_MODULE_SOT.md | 3 +- docs/guides/devops_handbook.md | 17 +++ .../code_modularization_inventory_20260430.md | 3 +- .../current_execution_queue_20260524.md | 3 + docs/memory/history_logs.md | 2 + scripts/ops/diagnose_ollama_gcp_failover.sh | 136 ++++++++++++++++++ services/ollama_service.py | 98 ++++++++++++- tests/test_ollama_retry_chain.py | 78 ++++++++++ 11 files changed, 341 insertions(+), 8 deletions(-) create mode 100755 scripts/ops/diagnose_ollama_gcp_failover.sh diff --git a/.env.example b/.env.example index d067dc6..cda8b80 100644 --- a/.env.example +++ b/.env.example @@ -162,6 +162,9 @@ OLLAMA_EMBED_KEEP_ALIVE=1m OLLAMA_EMBED_MAX_CHARS=4000 OLLAMA_EMBED_GCP_FAILURE_COOLDOWN_SEC=60 OLLAMA_EMBED_GCP_FAILURE_NOTICE_SEC=30 +# [預設 true] 背景 embedding 會讀 host_health_probes,跳過最近 runtime unhealthy 的 GCP 節點 +OLLAMA_EMBED_HOST_HEALTH_SKIP_ENABLED=true +OLLAMA_EMBED_HOST_HEALTH_SKIP_WINDOW_MINUTES=20 # 111 Mac final fallback guardrail and allowlist proxy OLLAMA_111_CIRCUIT_BREAKER_ENABLED=true @@ -416,6 +419,8 @@ OLLAMA_EMBED_KEEP_ALIVE=1m OLLAMA_EMBED_MAX_CHARS=4000 OLLAMA_EMBED_GCP_FAILURE_COOLDOWN_SEC=60 OLLAMA_EMBED_GCP_FAILURE_NOTICE_SEC=30 +OLLAMA_EMBED_HOST_HEALTH_SKIP_ENABLED=true +OLLAMA_EMBED_HOST_HEALTH_SKIP_WINDOW_MINUTES=20 OLLAMA_HOST_HEALTH_MODEL_PROBE_ENABLED=true OLLAMA_HOST_HEALTH_MODEL_PROBE_INCLUDE_111=false OLLAMA_HOST_HEALTH_EMBED_MODEL=bge-m3:latest diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index 15c2d0f..8e51c38 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -4,6 +4,8 @@ ================================================================================ 【已完成】 + - V10.473 補背景 embedding host_health skip:`allow_111_fallback=false` 會讀最近 `host_health_probes`,跳過 runtime unhealthy 的 GCP 節點(預設 20 分鐘,DB fail-open),避免每筆任務都等待已知壞節點 timeout;路由安全不變,不把背景 embedding 落 111。 + - V10.472 補 GCP Ollama failover rootless 診斷:新增 `scripts/ops/diagnose_ollama_gcp_failover.sh`,可一鍵檢查 GCP-A direct、GCP-B direct、111、110:11435、110:11436 與 GCP-B `bge-m3` runtime;目前輸出確認 GCP-A `11434` refused、GCP-B direct/embed OK、110:11435 502、110:11436 OK。110 無免密 sudo、GCP-A 22 refused、GCP-B SSH key denied,因此 primary 修復需 GCP/SSH 或 110 root 權限;應用層維持 GCP-A → GCP-B → 111,不把背景 embedding 落 111。 - V10.471 依 GCP-B `bge-m3` 實測 latency 調整 embedding timeout,已部署正式環境並確認 `/health=V10.471`:GCP-B `/api/embed` 三次實測約 6.4s / 7.3s / 23.5s,原本 `OLLAMA_EMBED_MAX_TIMEOUT=15` 與 host health `8s` 會誤殺慢但成功的 embedding;已將背景 embedding cap 與 host health model probe timeout 預設調為 30s。正式 smoke 顯示容器內 embedding 回 1024 維、耗時 10.07s;手動 host health probe 後最新狀態為 GCP-A unhealthy、GCP-B healthy、111 healthy。路由安全不變:背景 embedding 仍只跑 GCP-A/GCP-B,不落 111。 - V10.470 強化 Ollama host health probe,已部署正式環境並確認 `/health=V10.470`:scheduler 與觀測台 host health 對 GCP-A / GCP-B 除 `/api/tags` 外,再做短 `bge-m3` `/api/embed` 實作探針;可抓出 GCP-B「tags/version 正常但 embedding runner 8s timeout」這類假健康。111 預設不做背景 embedding probe,避免監控任務把 `bge-m3` 載入 fallback Mac。正式 smoke 後 `host_health_probes` 最新狀態為 GCP-A unhealthy、GCP-B unhealthy、111 healthy。 - V10.469 將背景 embedding 的 GCP-only 全失敗改為專業降級語意,已部署正式環境並確認 `/health=V10.469`:`allow_111_fallback=False` 時若 GCP-A/GCP-B 都不可用,開啟 failure circuit 並記 WARNING,不再把可預期的背景熔斷每分鐘打成 ERROR;同步 / 允許 fallback 的 embedding 全失敗仍保留 ERROR。Smoke 顯示 GCP-B `/api/version` 可用,但 `/api/embed` 仍可能 15s timeout,下一步需修 GCP-A primary 與 GCP-B runner/model 負載。 diff --git a/config.py b/config.py index 355e137..1e512e3 100644 --- a/config.py +++ b/config.py @@ -350,7 +350,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.471" +SYSTEM_VERSION = "V10.473" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 685cba7..285c380 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -2,7 +2,7 @@ > **最後更新**: 2026-05-25 (台北時間) > **狀態**: 🟢 四 AI Agent 自動化閉環已落地;LLM 路由紅線升級為 Ollama-first 三主機級聯,Gemini 備援預設關閉 -> **適用版本**: V10.471 +> **適用版本**: V10.473 --- @@ -26,6 +26,7 @@ - `allow_111_fallback=False` 且 GCP-A / GCP-B 皆失敗時,背景 embedding 會開啟短暫 GCP failure circuit(預設 60 秒),期間不重複打兩台 GCP、不落 111,避免 worker 與 log 被連續失敗拖慢;GCP 恢復後會自然再試。 - 背景 embedding 的 GCP-only 熔斷屬於可降級背景能力,應記錄為明確 WARNING 與 circuit 狀態,不應每次污染 ERROR 通道;真正允許三主機 fallback 的同步 embedding 全失敗仍保留 ERROR。 - Scheduler host health probe 不只看 `/api/tags`;GCP-A / GCP-B 節點必須再通過 `bge-m3` `/api/embed` 實作探針,才算 healthy。探針 timeout 預設 30s,111 預設不納入這個背景 embedding 探針,避免監測任務把 fallback Mac 載入 `bge-m3`。 +- 背景 embedding 會讀取最近 `host_health_probes` runtime 結果;若 GCP-A / GCP-B 在 `OLLAMA_EMBED_HOST_HEALTH_SKIP_WINDOW_MINUTES=20` 視窗內已被標為 unhealthy,`OllamaService.generate_embedding(..., allow_111_fallback=False)` 會先跳過該節點並開啟短暫 GCP circuit,不再等待 30 秒 timeout,也仍不落 111。此功能由 `OLLAMA_EMBED_HOST_HEALTH_SKIP_ENABLED=true` 控制,DB 讀取失敗時 fail-open 回到原本網路 retry。 - BGE-M3 一致性檢查是監測任務,不是 fallback 壓測;預設只比對 GCP-A / GCP-B。111 Mac fallback 只有 `EMBED_CONSISTENCY_INCLUDE_111=true` 時才納入,避免每週背景檢查把 `bge-m3` 載入 111。 - OpenClaw Telegram Q&A 主路徑也不得綁單一 host:`_call_qwen3_qa()` 必須透過 `OllamaService` 跑 GCP-A → GCP-B → 111,並把實際落點寫入 `ai_calls.provider`。 - OpenClaw Telegram 圖片商品辨識也必須 Ollama-first:`_identify_product_name_with_ollama_vision()` 透過 `OllamaService` 嘗試 GCP-A → GCP-B → 111;Gemini 只允許以 `openclaw_bot_image_gemini` caller 作為失敗後備援。 diff --git a/docs/guides/devops_handbook.md b/docs/guides/devops_handbook.md index 5696acc..cbd3e2f 100644 --- a/docs/guides/devops_handbook.md +++ b/docs/guides/devops_handbook.md @@ -122,3 +122,20 @@ - **原因**: Blackbox 或外部探測打 Dashboard 首頁 `/`,會觸發商品看板與 PChome 比價重型查詢;少量 sync worker 被長請求佔滿時,輕量 `/health` 也會排隊逾時。 - **檢查**: `docker logs momo-pro-system --since 20m | grep 'Blackbox-Exporter'` 應只看到 `GET /health`;`docker stats momo-db` 若接近多核心滿載,需同步看 `pg_stat_activity` 的 `latest_momo` 類查詢。 - **修復**: 188 的 `monitoring/prometheus.yml` 與 110 的 `/home/wooo/monitoring/prometheus.yml` blackbox HTTP targets 必須使用 `/health`;Gunicorn 保持 `worker_class=gthread`、`GUNICORN_THREADS=4`、`preload_app=False`。 + +### 12. GCP-A Ollama refused / 110:11435 502 +- **快速診斷**: 在 repo 根目錄執行 `scripts/ops/diagnose_ollama_gcp_failover.sh`。此腳本不需要 root,也不會修改 nginx、Docker、GCP 或正式服務。 +- **判讀**: + - `GCP-A direct /api/version` 失敗且 `GCP-B direct` 成功:primary VM、防火牆或 Ollama 服務異常;應用層會走 GCP-A → GCP-B → 111,但仍需修 primary。 + - `110 proxy primary` 502 且 `110 proxy secondary` 成功:110 的 `11435` 固定代理 GCP-A,所以 primary 掛時舊 proxy 入口會失敗;需 110 root 才能改 nginx 或 reload。 + - `GCP-B embed` 成功但耗時接近 30s:表示 `bge-m3` runner 慢但可用;若經常超過 30s,應處理 GCP-B runner/CPU/模型併發,不要把 111 納入背景 embedding。 +- **GCP-A 修復方向**: + - 有 GCP/SSH 權限時,先確認 VM 是否開機、Firewall 是否開 `22` 與 `11434`、Ollama process 是否在 listen。 + - 110 現況若 `ssh gcp-a` 回 `port 22: Connection refused`,代表目前跳板無法進主機,不能靠 momo-pro app 修復。 +- **110 proxy failover 方向**: + - 需要 root:`sudo nginx -t`、修改 `/etc/nginx/sites-enabled/110-ollama-proxy.conf`、`sudo systemctl reload nginx`。 + - 若要讓 `11435` 在 GCP-A 掛時 fallback 到 GCP-B,必須明確標註這是 proxy failover,不代表 GCP-A 已恢復;host health 仍應以 direct GCP-A 探針為準。 +- **紅線**: + - 不要把背景 `bge-m3` 任務改落 111。 + - 不要用更長 timeout 掩蓋 GCP-A refused;GCP-A 是 primary infra blocker。 + - 沒有 110 root 或 GCP SSH 權限時,只能完成診斷、文件與應用層降級,不能假裝已修復 primary。 diff --git a/docs/memory/code_modularization_inventory_20260430.md b/docs/memory/code_modularization_inventory_20260430.md index 61446f6..3babeee 100644 --- a/docs/memory/code_modularization_inventory_20260430.md +++ b/docs/memory/code_modularization_inventory_20260430.md @@ -54,6 +54,7 @@ - 2026-05-24 追記:同步 PChome 覆核頁 fast-count、輕量 render 與重算可採用指標後的 `routes/dashboard_routes.py` 行數;此處只更新 inventory,不變更 dashboard 行為。 - 2026-05-24 追記:同步 PChome rescore audit 最新狀態口徑與單位價 multiplier 修正後的 `services/marketplace_product_matcher.py` 行數;此處只更新 inventory,不變更拆分策略。 - 2026-05-24 追記:同步 PChome review queue 決策信封合併後的 `services/competitor_intel_repository.py` 行數;此處只更新 inventory,不變更拆分策略。 +- 2026-05-25 追記:同步背景 embedding 讀取 `host_health_probes` skip guard 後的 `services/ollama_service.py` 行數;此處只更新 inventory,不變更 Ollama 路由決策。 ## 達到或超過 800 行檔案清單 @@ -82,7 +83,7 @@ | 867 | `services/token_report_service.py` | P2 token report service | query / aggregation / chart payload / notification formatting | | 3786 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization | | 865 | `routes/daily_sales_routes.py` | P2 Daily Sales Blueprint | route glue / export helpers / daily query and formatting service | -| 1117 | `services/ollama_service.py` | P2 Ollama client | host health / request client / fallback policy / response parsing | +| 1266 | `services/ollama_service.py` | P2 Ollama client | host health / request client / fallback policy / response parsing | | 849 | `services/pchome_crawler.py` | P2 PChome crawler | search fetch / parsing / fallback source handling / rate limit policy | | 1100 | `services/code_review_pipeline_service.py` | P2 Code review pipeline service | scan orchestration / finding normalization / persistence adapter | | 953 | `routes/export_routes.py` | P2 Export flow | export command/router glue / file path / download orchestration | diff --git a/docs/memory/current_execution_queue_20260524.md b/docs/memory/current_execution_queue_20260524.md index 0cd45b3..2b5b755 100644 --- a/docs/memory/current_execution_queue_20260524.md +++ b/docs/memory/current_execution_queue_20260524.md @@ -25,6 +25,8 @@ - 2026-05-25 12:39 CST 狀態:已部署 `V10.469` 到 188,正式 `/health` 為 `V10.469`。本輪 recreate `momo-app`、`scheduler`、`telegram-bot`;未使用 `--remove-orphans`,未碰 `momo-db`。Smoke 通過:三個 app 容器 healthy、首頁 / daily / growth / PChome review queue HTTP 200、Gemini hard disabled;`allow_111_fallback=False` 時 GCP-only embedding 全失敗會開啟 failure circuit 並記 WARNING,不再把預期內的背景熔斷打進 ERROR 通道。觀測到 GCP-B `/api/version` 200,但 `/api/embed` 仍可能 15s timeout,下一步需修 GCP-A primary 與 GCP-B runner/model 負載。 - 2026-05-25 12:53 CST 狀態:已部署 `V10.470` 到 188,正式 `/health` 為 `V10.470`。本輪 recreate `momo-app`、`scheduler`、`telegram-bot`;未使用 `--remove-orphans`,未碰 `momo-db`。Smoke 通過:三容器 healthy、host health page HTTP 200 並顯示 Runtime 狀態、scheduler probe 寫入 DB。最新 `host_health_probes`:GCP-A unhealthy(11434 refused)、GCP-B unhealthy(`EmbedProbe ReadTimeout`, `/api/tags` 仍可列出 4 模型)、111 healthy;這補上「HTTP API 活著但模型 runtime 卡住」的假健康監控缺口。 - 2026-05-25 13:38 CST 狀態:已部署 `V10.471` 到 188,正式 `/health` 為 `V10.471`。本輪 recreate `momo-app`、`scheduler`、`telegram-bot`;未使用 `--remove-orphans`,未碰 `momo-db`。Smoke 通過:三容器 healthy、首頁 / daily / growth / host_health / ppt_audit_history / PChome review queue HTTP 200。GCP-B `bge-m3` `/api/embed` 直接實測約 6.4s、7.3s、23.5s,原 `OLLAMA_EMBED_MAX_TIMEOUT=15` 與 host health `OLLAMA_HOST_HEALTH_EMBED_TIMEOUT=8` 會誤殺慢但成功的 embedding;預設改為 30s。正式容器內 embedding smoke 回 1024 維、耗時 10.07s;手動 host health probe 後最新狀態為 GCP-A unhealthy、GCP-B healthy、111 healthy。背景 embedding 路由安全不變:GCP-A → GCP-B,不落 111。 +- 2026-05-25 14:10 CST 起,`V10.472` 補 rootless GCP Ollama failover 診斷腳本與 DevOps SOP:`scripts/ops/diagnose_ollama_gcp_failover.sh` 會檢查 direct GCP-A/GCP-B/111、110 proxy `11435/11436` 與 GCP-B `bge-m3` runtime。現況輸出:GCP-A direct `/api/version` failed/refused、GCP-B direct OK、111 OK、110:11435 502、110:11436 OK、GCP-B embed OK;110 無免密 sudo,`ssh gcp-a` 22 refused、`ssh gcp-b` publickey denied,因此 primary 修復需 GCP/SSH 或 110 root 權限。 +- 2026-05-25 14:12 CST 起,`V10.473` 進行背景 embedding host_health skip:`allow_111_fallback=false` 的背景 embedding 會讀最近 `host_health_probes`,若 GCP-A/GCP-B runtime 已被標 unhealthy,直接跳過該節點並開 GCP circuit,不等待 30 秒 timeout、不落 111;DB 讀取失敗 fail-open。 - 2026-05-25 12:05 CST 狀態:`main` 已部署到 188,正式 `/health` 為 `V10.467`,待推 Gitea。兩段變更已合併驗證:V10.466 rescore duplicate 改看 latest-state,7 筆 SKU 最新 attempt 全為 `rescore_accepted_current`,`competitor_prices` / `competitor_price_history` 目標計數未變;V10.467 focused exact matcher 在容器內回 `exact / total_price / price_alert_exact`。本輪 recreate `momo-app`、`scheduler`、`telegram-bot`;未使用 `--remove-orphans`,未碰 `momo-db`。Smoke 通過:三容器 healthy、PChome rescore queue API HTTP 200、Gemini 24 小時無 provider 紀錄、Ollama env 順序維持 GCP-A → GCP-B → 111、3 分鐘三容器 log 未見 Traceback / ERROR / CRITICAL / IntegrityError。 ## 1. MOMO / PChome 核心比價準確率 @@ -81,6 +83,7 @@ - 2026-05-25 12:27 CST 起,背景 embedding 在 GCP-A/GCP-B 全掛時開啟短暫 failure circuit;這是降載保護,不代表 primary 已恢復。部署 smoke 時 GCP-B `/api/version` 已恢復 200,下一步仍需恢復 GCP-A Ollama 或更新 110 的可用 SSH/GCP 操作憑證。 - 2026-05-25 12:37 CST 起,背景 embedding GCP-only failure circuit 改用 WARNING 記錄,避免可預期降級污染 ERROR 告警通道。 - 2026-05-25 13:35 CST 起,GCP-B `bge-m3` 實測 P95 波動已超過 15s,背景 embedding / host health model probe timeout 預設改 30s;若 30s 仍常 timeout,需進一步處理 GCP-B runner/CPU/模型併發,而不是再把 111 納入背景任務。 +- 2026-05-25 14:10 CST 起,GCP-A refused 已明確歸類為 infra blocker;應用層不得改成 111 背景 fallback,也不得把 110:11435 502 當成 momo-app 故障。背景 embedding 可依 `host_health_probes` 跳過近期 unhealthy GCP host,但查 DB 必須 fail-open。 ## 4. 業績分析資料與圖表修復 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 4f3ded4..b5f4a20 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -13,6 +13,8 @@ ## 📅 詳細更新日誌 (考古存檔) ### 2026-05-24:PChome 近門檻身份回收第二輪 +- **V10.473 背景 embedding 讀取 host_health skip**: `OllamaService.generate_embedding(..., allow_111_fallback=False)` 會先查最近 `host_health_probes`;若 GCP-A/GCP-B 在 20 分鐘視窗內已由 runtime probe 標成 unhealthy,背景 embedding 直接跳過該節點並開短暫 GCP circuit,不等待 30 秒 timeout、不落 111。DB 讀取失敗時 fail-open 回原本 retry,避免觀測層阻斷 embedding。 +- **V10.472 GCP Ollama failover rootless 診斷**: 新增 `scripts/ops/diagnose_ollama_gcp_failover.sh` 與 DevOps SOP,可不需 root 檢查 GCP-A/GCP-B/111 direct、110 proxy `11435/11436` 與 GCP-B `bge-m3` runtime。現況確認:GCP-A `22/11434` refused、GCP-B `22/11434` open 但 SSH key denied、GCP-B embed OK、110:11435 502、110:11436 OK;primary 修復需 GCP/SSH 或 110 root 權限。 - **V10.471 GCP-B embedding timeout 校準**: GCP-B `bge-m3` `/api/embed` 直接實測約 6.4s / 7.3s / 23.5s,原 `OLLAMA_EMBED_MAX_TIMEOUT=15` 與 host health `OLLAMA_HOST_HEALTH_EMBED_TIMEOUT=8` 會誤判慢但成功的 embedding;預設改為 30s。背景 embedding 仍只跑 GCP-A/GCP-B,不落 111。 - **V10.470 Ollama host health 實作探針**: `run_host_health_probe()` 對 GCP-A / GCP-B 在 `/api/tags` 成功後追加短 `bge-m3` `/api/embed` probe,避免 GCP-B 出現 tags/version 正常、但實際 embedding runner 20s timeout 時仍被標 healthy;111 預設不做背景 embedding probe,避免監測任務把 fallback Mac 載入 `bge-m3`。 - **V10.469 Background embedding 降級語意修正**: `OllamaService.generate_embedding(..., allow_111_fallback=False)` 在 GCP-A/GCP-B 全失敗時會開啟短暫 failure circuit 並記 WARNING,不再把背景 `bge-m3` 降級熔斷每分鐘寫成 ERROR;同步或允許三主機 fallback 的 embedding 全失敗仍維持 ERROR,保留真正阻塞型故障訊號。 diff --git a/scripts/ops/diagnose_ollama_gcp_failover.sh b/scripts/ops/diagnose_ollama_gcp_failover.sh new file mode 100755 index 0000000..a3dc6cd --- /dev/null +++ b/scripts/ops/diagnose_ollama_gcp_failover.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +set -u + +# Rootless Ollama GCP failover diagnostic. +# It verifies the direct GCP-A/GCP-B/111 endpoints plus the 110 proxy ports. +# It does not modify nginx, Docker, GCP, or any production service. + +PRIMARY_URL="${OLLAMA_HOST_PRIMARY:-http://34.143.170.20:11434}" +SECONDARY_URL="${OLLAMA_HOST_SECONDARY:-http://34.21.145.224:11434}" +FALLBACK_URL="${OLLAMA_HOST_FALLBACK:-http://192.168.0.111:11434}" +PROXY_PRIMARY_URL="${OLLAMA_PROXY_PRIMARY:-http://192.168.0.110:11435}" +PROXY_SECONDARY_URL="${OLLAMA_PROXY_SECONDARY:-http://192.168.0.110:11436}" +EMBED_MODEL="${OLLAMA_DIAG_EMBED_MODEL:-bge-m3:latest}" +EMBED_TIMEOUT="${OLLAMA_DIAG_EMBED_TIMEOUT:-30}" +CONNECT_TIMEOUT="${OLLAMA_DIAG_CONNECT_TIMEOUT:-3}" +RUN_EMBED_PROBE="${OLLAMA_DIAG_RUN_EMBED_PROBE:-1}" +INCLUDE_111_EMBED="${OLLAMA_DIAG_INCLUDE_111_EMBED:-0}" + +ok_count=0 +warn_count=0 +fail_count=0 + +say() { + printf '%s\n' "$*" +} + +status_line() { + local state="$1" + local label="$2" + local detail="$3" + printf '%-6s %-24s %s\n' "$state" "$label" "$detail" + case "$state" in + OK) ok_count=$((ok_count + 1)) ;; + WARN) warn_count=$((warn_count + 1)) ;; + FAIL) fail_count=$((fail_count + 1)) ;; + esac +} + +curl_probe() { + local label="$1" + local url="$2" + local path="${3:-/api/version}" + local max_time="${4:-10}" + local output + local code + output="$(mktemp)" + code="$(curl -sS --connect-timeout "$CONNECT_TIMEOUT" --max-time "$max_time" \ + -o "$output" -w '%{http_code} %{time_total}' "${url%/}${path}" 2>"$output.err")" + local rc=$? + if [[ "$rc" -eq 0 && "${code%% *}" == "200" ]]; then + status_line OK "$label" "${path} ${code}" + rm -f "$output" "$output.err" + return 0 + fi + local err + err="$(head -c 180 "$output.err" 2>/dev/null || true)" + [[ -z "$err" ]] && err="$(head -c 180 "$output" 2>/dev/null || true)" + status_line FAIL "$label" "${path} ${code:-curl_rc=$rc} ${err}" + rm -f "$output" "$output.err" + return 1 +} + +embed_probe() { + local label="$1" + local url="$2" + local output + local code + output="$(mktemp)" + code="$(curl -sS --connect-timeout "$CONNECT_TIMEOUT" --max-time "$EMBED_TIMEOUT" \ + -H 'Content-Type: application/json' \ + -d "{\"model\":\"${EMBED_MODEL}\",\"input\":\"health\",\"keep_alive\":\"1m\"}" \ + -o "$output" -w '%{http_code} %{time_total} %{size_download}' \ + "${url%/}/api/embed" 2>"$output.err")" + local rc=$? + if [[ "$rc" -eq 0 && "${code%% *}" == "200" ]]; then + status_line OK "$label" "/api/embed ${code}" + rm -f "$output" "$output.err" + return 0 + fi + local err + err="$(head -c 180 "$output.err" 2>/dev/null || true)" + [[ -z "$err" ]] && err="$(head -c 180 "$output" 2>/dev/null || true)" + status_line FAIL "$label" "/api/embed ${code:-curl_rc=$rc} ${err}" + rm -f "$output" "$output.err" + return 1 +} + +say "Ollama GCP failover diagnostic" +say "time=$(date -u '+%Y-%m-%dT%H:%M:%SZ')" +say "primary=${PRIMARY_URL}" +say "secondary=${SECONDARY_URL}" +say "fallback=${FALLBACK_URL}" +say "proxy_primary=${PROXY_PRIMARY_URL}" +say "proxy_secondary=${PROXY_SECONDARY_URL}" +say "" + +primary_version_ok=0 +secondary_version_ok=0 +proxy_primary_ok=0 +proxy_secondary_ok=0 + +curl_probe "GCP-A direct" "$PRIMARY_URL" "/api/version" 10 && primary_version_ok=1 +curl_probe "GCP-B direct" "$SECONDARY_URL" "/api/version" 10 && secondary_version_ok=1 +curl_probe "111 fallback" "$FALLBACK_URL" "/api/version" 10 || true +curl_probe "110 proxy primary" "$PROXY_PRIMARY_URL" "/api/version" 10 && proxy_primary_ok=1 +curl_probe "110 proxy secondary" "$PROXY_SECONDARY_URL" "/api/version" 10 && proxy_secondary_ok=1 + +if [[ "$RUN_EMBED_PROBE" == "1" ]]; then + say "" + say "Embedding runtime probe model=${EMBED_MODEL} timeout=${EMBED_TIMEOUT}s" + [[ "$primary_version_ok" == "1" ]] && embed_probe "GCP-A embed" "$PRIMARY_URL" || status_line WARN "GCP-A embed" "skipped because /api/version is not healthy" + [[ "$secondary_version_ok" == "1" ]] && embed_probe "GCP-B embed" "$SECONDARY_URL" || status_line WARN "GCP-B embed" "skipped because /api/version is not healthy" + if [[ "$INCLUDE_111_EMBED" == "1" ]]; then + embed_probe "111 embed" "$FALLBACK_URL" || true + else + status_line WARN "111 embed" "skipped by default; 111 must not carry background bge-m3 probes" + fi +fi + +say "" +say "Summary: OK=${ok_count} WARN=${warn_count} FAIL=${fail_count}" + +if [[ "$primary_version_ok" != "1" && "$secondary_version_ok" == "1" ]]; then + say "Diagnosis: GCP-A primary is down/refusing, but GCP-B is available. Keep app routing on GCP-A -> GCP-B -> 111; repair GCP-A with GCP/SSH access." +fi +if [[ "$proxy_primary_ok" != "1" && "$proxy_secondary_ok" == "1" ]]; then + say "Diagnosis: 110:11435 primary proxy is unhealthy while 110:11436 works. Root on 110 is required for nginx failover config or reload." +fi + +if [[ "$secondary_version_ok" != "1" ]]; then + exit 2 +fi +if [[ "$primary_version_ok" != "1" || "$proxy_primary_ok" != "1" ]]; then + exit 1 +fi +exit 0 diff --git a/services/ollama_service.py b/services/ollama_service.py index f52ee20..087aa4c 100644 --- a/services/ollama_service.py +++ b/services/ollama_service.py @@ -10,6 +10,7 @@ import requests import json import logging import fnmatch +from datetime import datetime, timedelta from typing import Optional, Dict, Any, List, Tuple from dataclasses import dataclass @@ -172,6 +173,85 @@ def _reset_embedding_gcp_circuit() -> None: _embedding_gcp_failure_circuit.update({'blocked_until': 0.0, 'notice_ts': 0.0, 'tried': ()}) +def _host_label_for_embedding_health(host: str) -> str: + """Map an Ollama host URL to the host_health_probes label used by scheduler.""" + if not host: + return '' + if '34.143.170.20:11434' in host or '192.168.0.110:11435' in host: + return 'Primary (GCP)' + if '34.21.145.224:11434' in host or '192.168.0.110:11436' in host: + return 'Secondary (GCP)' + return '' + + +def _recent_embedding_host_unhealthy(host: str) -> bool: + """Skip known-bad GCP embedding runtimes using recent host_health_probes rows. + + This guard is used only by background embedding paths that already disallow + 111 fallback. It is deliberately fail-open: DB/read errors must not block + Ollama calls. + """ + if not _env_flag('OLLAMA_EMBED_HOST_HEALTH_SKIP_ENABLED', True): + return False + + host_label = _host_label_for_embedding_health(host) + if not host_label: + return False + + try: + window_minutes = int(os.getenv('OLLAMA_EMBED_HOST_HEALTH_SKIP_WINDOW_MINUTES', '20')) + except (TypeError, ValueError): + window_minutes = 20 + window_minutes = max(1, window_minutes) + + try: + from sqlalchemy import text as sa_text + from database.manager import get_session + + session = get_session() + try: + row = session.execute( + sa_text(""" + SELECT healthy, error_msg, probed_at + FROM host_health_probes + WHERE host_label = :host_label + ORDER BY probed_at DESC + LIMIT 1 + """), + {'host_label': host_label}, + ).fetchone() + finally: + session.close() + except Exception: + logger.debug("[Embed] host health skip fail-open for host=%s", host, exc_info=True) + return False + + if not row: + return False + + healthy, error_msg, probed_at = row[0], row[1], row[2] + if probed_at: + try: + now = datetime.now(probed_at.tzinfo) if getattr(probed_at, 'tzinfo', None) else datetime.now() + if now - probed_at > timedelta(minutes=window_minutes): + return False + except Exception: + logger.debug("[Embed] could not evaluate host health probe age for host=%s", host, exc_info=True) + return False + + if bool(healthy): + return False + + logger.warning( + "[Embed] skip recent unhealthy GCP embedding host=%s label=%s window=%sm error=%s", + host, + host_label, + window_minutes, + (error_msg or '')[:180], + ) + return True + + def _fallback_111_block_reason(host: str) -> Tuple[bool, str]: """Return whether 111 fallback should be skipped for this request. @@ -1086,6 +1166,7 @@ class OllamaService: # HOTFIX 三主機 retry 鏈(與 generate() 同模式) attempted_hosts: List[str] = [] + skipped_hosts: List[str] = [] canonical_hosts = _canonical_host_chain() allowed_hosts = [ candidate for candidate in canonical_hosts @@ -1101,7 +1182,8 @@ class OllamaService: logger.warning("[Embed] 111 fallback disabled; ignoring EMBEDDING_HOST=%s", configured_host) target_host = resolve_ollama_host().rstrip("/") if not allow_111_fallback and _is_111_fallback_host(target_host): - next_host = next((candidate for candidate in allowed_hosts if candidate not in attempted_hosts), None) + visited_hosts = attempted_hosts + skipped_hosts + next_host = next((candidate for candidate in allowed_hosts if candidate not in visited_hosts), None) if not next_host: logger.warning("[Embed] 111 fallback disabled; no approved GCP embedding host available") break @@ -1110,10 +1192,11 @@ class OllamaService: next_host, ) target_host = next_host - if target_host in attempted_hosts: + visited_hosts = attempted_hosts + skipped_hosts + if target_host in visited_hosts: next_host = None if target_host in allowed_hosts: - next_host = next((candidate for candidate in allowed_hosts if candidate not in attempted_hosts), None) + next_host = next((candidate for candidate in allowed_hosts if candidate not in visited_hosts), None) if not next_host: break # cache 還沒過期或同主機,避免無限迴圈 logger.info( @@ -1127,6 +1210,10 @@ class OllamaService: logger.warning("[Embed] skip 111 fallback: %s", block_reason) _clear_resolved_host_cache() break + if not allow_111_fallback and _recent_embedding_host_unhealthy(target_host): + skipped_hosts.append(target_host) + _clear_resolved_host_cache() + continue attempted_hosts.append(target_host) vec = _embed_one(target_host) @@ -1136,11 +1223,12 @@ class OllamaService: logger.info(f"[Embed] retry #{attempt+1}/{max_attempts} — {target_host} failed, mark_unhealthy + 取新主機") if not allow_111_fallback: - _open_embedding_gcp_circuit(attempted_hosts) + _open_embedding_gcp_circuit(attempted_hosts or skipped_hosts) logger.warning( - "[Embed] background GCP embedding unavailable; circuit open %ss; tried=%s", + "[Embed] background GCP embedding unavailable; circuit open %ss; tried=%s skipped=%s", EMBED_GCP_FAILURE_COOLDOWN_SEC, attempted_hosts, + skipped_hosts, ) else: logger.error(f"[Embed] all {len(attempted_hosts)} hosts failed; tried={attempted_hosts}") diff --git a/tests/test_ollama_retry_chain.py b/tests/test_ollama_retry_chain.py index 5768dcb..72408cf 100644 --- a/tests/test_ollama_retry_chain.py +++ b/tests/test_ollama_retry_chain.py @@ -440,6 +440,84 @@ def test_embedding_fallback_disabled_opens_short_gcp_failure_circuit(): assert oss._embedding_gcp_failure_circuit['blocked_until'] > 0 +def test_embedding_health_label_maps_direct_and_proxy_gcp_hosts(): + """host_health skip 要對齊 scheduler 寫入的 host_label。""" + from services import ollama_service as oss + + assert oss._host_label_for_embedding_health("http://34.143.170.20:11434") == "Primary (GCP)" + assert oss._host_label_for_embedding_health("http://192.168.0.110:11435") == "Primary (GCP)" + assert oss._host_label_for_embedding_health("http://34.21.145.224:11434") == "Secondary (GCP)" + assert oss._host_label_for_embedding_health("http://192.168.0.110:11436") == "Secondary (GCP)" + assert oss._host_label_for_embedding_health("http://192.168.0.111:11434") == "" + + +def test_recent_embedding_host_unhealthy_reads_fresh_host_health_probe(monkeypatch): + """最新 host_health_probes 若顯示 GCP embedding runtime unhealthy,背景 embedding 可先跳過。""" + from datetime import datetime + from services import ollama_service as oss + + class FakeResult: + def fetchone(self): + return (False, "EmbedProbe ReadTimeout", datetime.now()) + + class FakeSession: + def execute(self, *args, **kwargs): + return FakeResult() + + def close(self): + pass + + monkeypatch.setenv("OLLAMA_EMBED_HOST_HEALTH_SKIP_ENABLED", "true") + monkeypatch.setenv("OLLAMA_EMBED_HOST_HEALTH_SKIP_WINDOW_MINUTES", "20") + monkeypatch.setattr("database.manager.get_session", lambda: FakeSession()) + + assert oss._recent_embedding_host_unhealthy(oss.OLLAMA_HOST_SECONDARY) is True + + +def test_recent_embedding_host_unhealthy_fails_open_when_db_is_unavailable(monkeypatch): + """host health 查詢失敗不可阻斷 embedding;最多回到原本網路 retry。""" + from services import ollama_service as oss + + monkeypatch.setattr( + "database.manager.get_session", + lambda: (_ for _ in ()).throw(RuntimeError("db down")), + ) + + assert oss._recent_embedding_host_unhealthy(oss.OLLAMA_HOST_SECONDARY) is False + + +def test_embedding_fallback_disabled_skips_recent_unhealthy_gcp_hosts(): + """背景 embedding 會直接跳過 host_health 最近標成 unhealthy 的 GCP,不打 111。""" + from services import ollama_service as oss + from services.ollama_service import OllamaService + + svc = OllamaService() + + def fake_recent_unhealthy(host): + return host in {oss.OLLAMA_HOST_PRIMARY, oss.OLLAMA_HOST_SECONDARY} + + with patch('services.ollama_service.EMBED_GCP_FAILURE_COOLDOWN_SEC', 60), \ + patch('services.ollama_service.resolve_ollama_host', side_effect=[ + oss.OLLAMA_HOST_PRIMARY, + oss.OLLAMA_HOST_PRIMARY, + oss.OLLAMA_HOST_FALLBACK, + ]), \ + patch('services.ollama_service._recent_embedding_host_unhealthy', side_effect=fake_recent_unhealthy), \ + patch.dict('os.environ', {}, clear=False), \ + patch('services.ollama_service.requests.post') as mock_post: + import os + os.environ.pop('EMBEDDING_HOST', None) + vec = svc.generate_embedding('test text', allow_111_fallback=False) + + assert vec == [] + mock_post.assert_not_called() + assert oss._embedding_gcp_failure_circuit['blocked_until'] > 0 + assert oss._embedding_gcp_failure_circuit['tried'] == ( + oss.OLLAMA_HOST_PRIMARY, + oss.OLLAMA_HOST_SECONDARY, + ) + + def test_embedding_ignores_111_embedding_host_when_fallback_disabled(): """EMBEDDING_HOST 若誤設 111,背景 embedding 仍回 GCP resolver,不直接棄跑。""" from services import ollama_service as oss