From 106c1935f4e486bb90e09b58719886c3554eb106 Mon Sep 17 00:00:00 2001 From: OoO Date: Thu, 21 May 2026 18:06:09 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=B6=E7=B7=8A=20111=20Ollama=20fallback=20?= =?UTF-8?q?=E8=B3=87=E6=BA=90=E4=B8=8A=E9=99=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 5 +++-- config.py | 2 +- docs/AI_INTELLIGENCE_MODULE_SOT.md | 4 ++-- docs/memory/history_logs.md | 1 + services/ollama_service.py | 11 +++++++++-- tests/test_ollama_resolve.py | 22 ++++++++++++++++------ 6 files changed, 32 insertions(+), 13 deletions(-) diff --git a/.env.example b/.env.example index 1bd557b..2281c23 100644 --- a/.env.example +++ b/.env.example @@ -363,12 +363,13 @@ OLLAMA_MODEL=gemma3:4b OLLAMA_TIMEOUT=120 OLLAMA_COPY_TIMEOUT=180 OLLAMA_EMBED_TIMEOUT=45 -# 111 是 Mac final fallback,不承接 7B+ / vision / long-context 模型長駐;落到 111 時自動降級與縮短常駐。 +# 111 是 Mac final fallback,不承接 7B+ / vision / long-context / 長輸出任務;落到 111 時自動降級與縮短常駐。 OLLAMA_111_MODEL_FALLBACK=llama3.2:latest OLLAMA_111_MODEL_DOWNGRADE_PATTERNS=qwen3:*,deepseek-r1:*,hermes3:*,llama3.1:*,qwen2.5:*,qwen2.5-coder:*,gemma3:*,minicpm-v:*,llava:*,*:7b*,*:8b*,*:14b*,*:32b*,*:70b* OLLAMA_111_KEEP_ALIVE=5m -OLLAMA_111_MAX_TIMEOUT=45 +OLLAMA_111_MAX_TIMEOUT=20 OLLAMA_111_NUM_CTX=4096 +OLLAMA_111_NUM_PREDICT=512 # [預設 true] OpenClaw Q&A 先走 Ollama,品質不足或失敗時才 fallback Gemini/NIM # 主機不提供單 caller override;一律走 OLLAMA_HOST_PRIMARY → OLLAMA_HOST_SECONDARY → OLLAMA_HOST_FALLBACK diff --git a/config.py b/config.py index 3c03fc0..8abbfce 100644 --- a/config.py +++ b/config.py @@ -325,7 +325,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.379" +SYSTEM_VERSION = "V10.380" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 221ab7a..807b898 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -2,7 +2,7 @@ > **最後更新**: 2026-05-21 (台北時間) > **狀態**: 🟢 四 AI Agent 自動化閉環已落地;LLM 路由紅線升級為 Ollama-first 三主機級聯,Gemini 備援預設關閉 -> **適用版本**: V10.377 +> **適用版本**: V10.380 --- @@ -27,7 +27,7 @@ - Gemini API 出站有第二道 kill switch:`GEMINI_FALLBACK_ENABLED` 預設為 `false`。即使 `GEMINI_API_KEY` 存在,通用 AI fallback、OpenClaw 報告/QA/PPT/圖片、MCP Grounding 與 Code Review L3 都不得呼叫 Gemini;只有操作員明確設為 `true` 時,Gemini 才能作緊急備援。 - Gemini 不可被任何狀態面板或 router 推薦為主提供者:`AIProviderService._get_recommended_provider()` 不得回傳 `gemini`,只能顯示為 fallback 狀態;`llm_model_router` 的 `ea_engine` 若收到 `gemini-*` default 必須改回 `hermes3:latest`,需要深推理時才升本地 `deepseek-r1:14b`。 - ElephantAlpha prompt / agent registry 不得再把 OpenClaw 描述為 Gemini 主模型;OpenClaw 是 `qwen2.5-coder:7b` / `qwen3:14b` Ollama-first 策略師,Gemini 僅能在 guard 顯式解鎖後作 emergency fallback。 -- 111 `192.168.0.111` 只是最後一道 Mac fallback,不承接 7B+、vision、long-context 模型長駐;`OllamaService.generate()` 落到 111 時會將 `qwen3`、`deepseek-r1`、`hermes3`、`qwen2.5*`、`gemma3`、`llava`、`minicpm-v` 與 7B+ 模型依 `OLLAMA_111_MODEL_DOWNGRADE_PATTERNS` 降級到 `OLLAMA_111_MODEL_FALLBACK=llama3.2:latest`,並以 `OLLAMA_111_KEEP_ALIVE=5m`、`OLLAMA_111_MAX_TIMEOUT=45`、`OLLAMA_111_NUM_CTX=4096` 封頂,避免 16GB RAM 主機被大 context runner 與 24h keep-alive 壓到 swap。 +- 111 `192.168.0.111` 只是最後一道 Mac fallback,不承接 7B+、vision、long-context 模型長駐;`OllamaService.generate()` 落到 111 時會將 `qwen3`、`deepseek-r1`、`hermes3`、`qwen2.5*`、`gemma3`、`llava`、`minicpm-v` 與 7B+ 模型依 `OLLAMA_111_MODEL_DOWNGRADE_PATTERNS` 降級到 `OLLAMA_111_MODEL_FALLBACK=llama3.2:latest`,並以 `OLLAMA_111_KEEP_ALIVE=5m`、`OLLAMA_111_MAX_TIMEOUT=20`、`OLLAMA_111_NUM_CTX=4096`、`OLLAMA_111_NUM_PREDICT=512` 封頂,避免 16GB RAM 主機被大 context runner、長輸出與 24h keep-alive 壓到 swap。 ## 一、四 AI Agent 路由架構 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 7463959..51e4e82 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -13,6 +13,7 @@ ## 📅 詳細更新日誌 (考古存檔) ### 2026-05-21:瀏覽器測試守門與 PChome 熱路徑優化 +- **V10.380 111 Ollama final fallback 收斂**: 111 Mac fallback 從救急路徑改成更短的保護路徑,`OLLAMA_111_MAX_TIMEOUT` 預設由 45s 收緊到 20s,並新增 `OLLAMA_111_NUM_PREDICT=512` 輸出上限;落到 111 時仍會降級重模型到 `llama3.2:latest`、縮 `num_ctx=4096`、`keep_alive=5m`,避免 GCP-A/GCP-B 短暫 timeout 後把長篇 Hermes/OpenClaw 工作轉嫁到 111 造成 swap 與 load 飆高。 - **V10.379 MCP runtime promotion gate**: 新增 `mcp_runtime_promotion` read-only builder、GET/POST endpoint、UI promotion package 審核面板與 deployment readiness smoke target,將 MCP activation evidence 與 runtime smoke receipt 合併審核,讓 completion audit 的 runtime 缺口可由人工收據明確補齊。 - **V10.379 只讀安全邊界**: 本階段不保存 payload、不打 health、不開 DB、不抓外站、不掛 scheduler,也不會因 promotion 通過自動打開人工 fetch gate;正式 fetch / DB write / scheduler attach 仍需各自獨立 gate。 - **V10.378 AI 推薦頁首屏 Gemini 防漏**: `/ai_recommend` 首屏狀態快照新增 provider sanitization,即使舊 cache / env 內出現 `default_provider='gemini'` 或 `recommended_provider='gemini'`,也會回到 `ollama`,避免 UI 把 Gemini 顯示成主推薦路徑;`/api/ai/set_provider` 同步正規化 provider 輸入,保留 Gemini 只能作 Ollama 失敗備援的拒絕訊息。 diff --git a/services/ollama_service.py b/services/ollama_service.py index d9f1157..94115a0 100644 --- a/services/ollama_service.py +++ b/services/ollama_service.py @@ -57,8 +57,9 @@ TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '120')) # 秒 - 2 分鐘 COPY_TIMEOUT = int(os.getenv('OLLAMA_COPY_TIMEOUT', '180')) # 文案生成專用超時 - 3 分鐘 EMBED_TIMEOUT = int(os.getenv('OLLAMA_EMBED_TIMEOUT', os.getenv('EMBEDDING_TIMEOUT', '45'))) FALLBACK_111_KEEP_ALIVE = os.getenv('OLLAMA_111_KEEP_ALIVE', '5m') -FALLBACK_111_MAX_TIMEOUT = int(os.getenv('OLLAMA_111_MAX_TIMEOUT', '45')) +FALLBACK_111_MAX_TIMEOUT = int(os.getenv('OLLAMA_111_MAX_TIMEOUT', '20')) FALLBACK_111_NUM_CTX = int(os.getenv('OLLAMA_111_NUM_CTX', '4096')) +FALLBACK_111_NUM_PREDICT = int(os.getenv('OLLAMA_111_NUM_PREDICT', '512')) FALLBACK_111_MODEL = os.getenv('OLLAMA_111_MODEL_FALLBACK', 'llama3.2:latest') FALLBACK_111_MODEL_PATTERNS = tuple( pattern.strip().lower() @@ -142,13 +143,19 @@ def _effective_timeout_for_host(timeout_s: int, host: str) -> int: def _cap_111_options(options: Dict[str, Any]) -> None: - """111 fallback 強制縮 context,避免 3B/7B 仍因 131k context 吃爆記憶體。""" + """111 fallback 強制縮 context / output,避免最後備援被長任務拖成高負載。""" try: requested_num_ctx = int(options.get("num_ctx") or FALLBACK_111_NUM_CTX) except (TypeError, ValueError): requested_num_ctx = FALLBACK_111_NUM_CTX options["num_ctx"] = min(requested_num_ctx, FALLBACK_111_NUM_CTX) + try: + requested_num_predict = int(options.get("num_predict") or FALLBACK_111_NUM_PREDICT) + except (TypeError, ValueError): + requested_num_predict = FALLBACK_111_NUM_PREDICT + options["num_predict"] = min(requested_num_predict, FALLBACK_111_NUM_PREDICT) + def _canonical_host_chain() -> List[str]: """Return the approved static fallback chain without duplicates.""" diff --git a/tests/test_ollama_resolve.py b/tests/test_ollama_resolve.py index a2a0cc7..24e0205 100644 --- a/tests/test_ollama_resolve.py +++ b/tests/test_ollama_resolve.py @@ -243,8 +243,9 @@ def test_111_fallback_downgrades_heavy_model_and_shortens_keep_alive(monkeypatch monkeypatch.setattr(oss, "FALLBACK_111_MODEL", "qwen2.5:7b-instruct") monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m") - monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45) + monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 20) monkeypatch.setattr(oss, "FALLBACK_111_NUM_CTX", 4096) + monkeypatch.setattr(oss, "FALLBACK_111_NUM_PREDICT", 512) monkeypatch.setattr(oss, "FALLBACK_111_MODEL_PATTERNS", ("qwen3:14b",)) fake_resp = MagicMock(status_code=200) @@ -257,13 +258,19 @@ def test_111_fallback_downgrades_heavy_model_and_shortens_keep_alive(monkeypatch svc = oss.OllamaService(host="http://192.168.0.111:11434", model="qwen3:14b") with patch("services.ollama_service.requests.post", return_value=fake_resp) as mock_post: - resp = svc.generate("hi", timeout=120, keep_alive="24h") + resp = svc.generate( + "hi", + timeout=120, + keep_alive="24h", + options={"num_ctx": 131072, "num_predict": 4096}, + ) payload = mock_post.call_args.kwargs["json"] assert payload["model"] == "qwen2.5:7b-instruct" assert payload["keep_alive"] == "5m" assert payload["options"]["num_ctx"] == 4096 - assert mock_post.call_args.kwargs["timeout"] == 45 + assert payload["options"]["num_predict"] == 512 + assert mock_post.call_args.kwargs["timeout"] == 20 assert resp.model == "qwen2.5:7b-instruct" @@ -272,15 +279,16 @@ def test_111_fallback_keeps_light_model_but_caps_timeout(monkeypatch): from services import ollama_service as oss monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m") - monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45) + monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 20) monkeypatch.setattr(oss, "FALLBACK_111_NUM_CTX", 4096) + monkeypatch.setattr(oss, "FALLBACK_111_NUM_PREDICT", 512) svc = oss.OllamaService(host="http://192.168.0.111:11434", model="llama3.2:latest") with patch("services.ollama_service.requests.post", side_effect=Timeout): resp = svc.generate("hi", timeout=120, keep_alive="24h") assert resp.success is False - assert "timeout (45s)" in resp.error + assert "timeout (20s)" in resp.error def test_111_fallback_downgrades_hermes_context_heavy_model(monkeypatch): @@ -288,8 +296,9 @@ def test_111_fallback_downgrades_hermes_context_heavy_model(monkeypatch): monkeypatch.setattr(oss, "FALLBACK_111_MODEL", "llama3.2:latest") monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m") - monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45) + monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 20) monkeypatch.setattr(oss, "FALLBACK_111_NUM_CTX", 4096) + monkeypatch.setattr(oss, "FALLBACK_111_NUM_PREDICT", 512) monkeypatch.setattr(oss, "FALLBACK_111_MODEL_PATTERNS", ("hermes3:*",)) fake_resp = MagicMock(status_code=200) @@ -308,4 +317,5 @@ def test_111_fallback_downgrades_hermes_context_heavy_model(monkeypatch): assert payload["model"] == "llama3.2:latest" assert payload["keep_alive"] == "5m" assert payload["options"]["num_ctx"] == 4096 + assert payload["options"]["num_predict"] == 512 assert resp.model == "llama3.2:latest"