diff --git a/.env.example b/.env.example index 41374cd..08dc15e 100644 --- a/.env.example +++ b/.env.example @@ -365,6 +365,7 @@ OLLAMA_111_MODEL_FALLBACK=llama3.2:latest OLLAMA_111_MODEL_DOWNGRADE_PATTERNS=qwen3:*,deepseek-r1:*,hermes3:*,llama3.1:*,qwen2.5:*,qwen2.5-coder:*,gemma3:*,minicpm-v:*,llava:*,*:7b*,*:8b*,*:14b*,*:32b*,*:70b* OLLAMA_111_KEEP_ALIVE=5m OLLAMA_111_MAX_TIMEOUT=45 +OLLAMA_111_NUM_CTX=4096 # [預設 true] OpenClaw Q&A 先走 Ollama,品質不足或失敗時才 fallback Gemini/NIM # 主機不提供單 caller override;一律走 OLLAMA_HOST_PRIMARY → OLLAMA_HOST_SECONDARY → OLLAMA_HOST_FALLBACK diff --git a/config.py b/config.py index ed44c8e..0a3212f 100644 --- a/config.py +++ b/config.py @@ -323,7 +323,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.363" +SYSTEM_VERSION = "V10.364" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 57d6ab6..eb4413c 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -2,7 +2,7 @@ > **最後更新**: 2026-05-21 (台北時間) > **狀態**: 🟢 四 AI Agent 自動化閉環已落地;LLM 路由紅線升級為 Ollama-first 三主機級聯,Gemini 備援預設關閉 -> **適用版本**: V10.362 +> **適用版本**: V10.364 --- @@ -25,7 +25,7 @@ - OpenClaw 週報、月報、Meta analysis、日報洞察、Telegram PPT 分析與 MCP fallback 也必須 Ollama-first;Gemini caller 只能帶 `_gemini_fallback` 或明確 fallback caller 語意,且不得先於 Ollama/NIM 被呼叫。 - OpenClaw 週報、月報、Meta analysis、日報洞察與每日報告的 Gemini/NIM 備援 caller 必須登錄在 caller registry、AI 觀測台 agent group 與 Telegram 狀態統計,避免 fallback 用量被歸類為未知或漏算。 - Gemini API 出站有第二道 kill switch:`GEMINI_FALLBACK_ENABLED` 預設為 `false`。即使 `GEMINI_API_KEY` 存在,通用 AI fallback、OpenClaw 報告/QA/PPT/圖片、MCP Grounding 與 Code Review L3 都不得呼叫 Gemini;只有操作員明確設為 `true` 時,Gemini 才能作緊急備援。 -- 111 `192.168.0.111` 只是最後一道 Mac fallback,不承接 7B+、vision、long-context 模型長駐;`OllamaService.generate()` 落到 
111 時會將 `qwen3`、`deepseek-r1`、`hermes3`、`qwen2.5*`、`gemma3`、`llava`、`minicpm-v` 與 7B+ 模型依 `OLLAMA_111_MODEL_DOWNGRADE_PATTERNS` 降級到 `OLLAMA_111_MODEL_FALLBACK=llama3.2:latest`,並以 `OLLAMA_111_KEEP_ALIVE=5m`、`OLLAMA_111_MAX_TIMEOUT=45` 封頂,避免 16GB RAM 主機被大 context runner 與 24h keep-alive 壓到 swap。 +- 111 `192.168.0.111` 只是最後一道 Mac fallback,不承接 7B+、vision、long-context 模型長駐;`OllamaService.generate()` 落到 111 時會將 `qwen3`、`deepseek-r1`、`hermes3`、`qwen2.5*`、`gemma3`、`llava`、`minicpm-v` 與 7B+ 模型依 `OLLAMA_111_MODEL_DOWNGRADE_PATTERNS` 降級到 `OLLAMA_111_MODEL_FALLBACK=llama3.2:latest`,並以 `OLLAMA_111_KEEP_ALIVE=5m`、`OLLAMA_111_MAX_TIMEOUT=45`、`OLLAMA_111_NUM_CTX=4096` 封頂,避免 16GB RAM 主機被大 context runner 與 24h keep-alive 壓到 swap。 ## 一、四 AI Agent 路由架構 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 31bd1ed..4e51660 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -13,6 +13,7 @@ ## 📅 詳細更新日誌 (考古存檔) ### 2026-05-21:瀏覽器測試守門與 PChome 熱路徑優化 +- **V10.364 111 context cap**: 111 fallback 即使降到 `llama3.2:latest`,Ollama 仍可能用 131k context 啟動 runner,導致 3B 模型也吃到 10GB+;新增 `OLLAMA_111_NUM_CTX=4096`,落到 111 時強制縮 context,並把 `llama3.2:latest` 加入零成本模型表,避免觀測台 unknown model warning。 - **V10.363 Dashing Diva variant-safe search**: PChome/MOMO matcher 針對 Dashing Diva 美甲片補「商品頁目錄有 30片/盒、MOMO 標題省略片數」的安全豁免,只限同品牌、同美甲片線、同具名款式錨點;搜尋詞也優先帶出 `月影柔霧`、`銀絲柔彩` 等款式名,降低同系列不同款式互撞。 - **V10.362 111 fallback shrink-to-3B**: 111 Mac 實測 `hermes3` / `qwen2.5-coder` 雖是 7B/8B,但 large context runner 仍會佔用 6-10GB RSS 並推高 swap;111 fallback 改為所有 7B+、vision 與 long-context 文字生成都降級到 `llama3.2:latest`,`ai_calls.model` 也會記錄實際降級模型並把原請求模型放入 `meta.requested_model`。 - **V10.361 111 fallback resource guard**: 實測 111 Mac 高 load 主要來自 Codex app / WindowServer 前台負載,且 Ollama 曾因 fallback 載入 `qwen3:14b` 造成 16GB RAM / swap 壓力;已手動 unload 111 上的重模型,並讓 `OllamaService.generate()` 落到 111 時自動把 14B+ 模型降到 `OLLAMA_111_MODEL_FALLBACK`、`keep_alive` 縮至 `OLLAMA_111_KEEP_ALIVE=5m`、timeout 封頂 
def _cap_111_options(options: Dict[str, Any], *, cap: "int | None" = None) -> None:
    """Clamp ``options["num_ctx"]`` in place for the 111 fallback host.

    Even a 3B/7B model can exhaust memory on the 111 fallback when the
    runner starts with a ~131k context, so the context window is forced
    down to a fixed ceiling before the request is sent.

    Args:
        options: Ollama request ``options`` dict; mutated in place. Keys
            other than ``num_ctx`` are left untouched.
        cap: Upper bound for ``num_ctx``. When ``None`` (the default) the
            module-level ``FALLBACK_111_NUM_CTX`` is used; it is looked up
            at call time rather than bound at definition time.

    Returns:
        None. The result is written to ``options["num_ctx"]``.

    A missing, falsy, non-numeric, non-finite or non-positive ``num_ctx``
    is replaced by the cap itself; any other value is lowered to the cap
    when it exceeds it.
    """
    limit = FALLBACK_111_NUM_CTX if cap is None else cap
    try:
        requested = int(options.get("num_ctx") or limit)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")); ValueError covers NaN and
        # non-numeric strings; TypeError covers lists, dicts and the like.
        requested = limit
    if requested <= 0:
        # min() alone would let a negative context size through unchanged.
        requested = limit
    options["num_ctx"] = min(requested, limit)
payload["keep_alive"] = FALLBACK_111_KEEP_ALIVE + _cap_111_options(payload["options"]) elif keep_alive: payload["keep_alive"] = keep_alive diff --git a/tests/test_ollama_resolve.py b/tests/test_ollama_resolve.py index aeba614..a2a0cc7 100644 --- a/tests/test_ollama_resolve.py +++ b/tests/test_ollama_resolve.py @@ -244,6 +244,7 @@ def test_111_fallback_downgrades_heavy_model_and_shortens_keep_alive(monkeypatch monkeypatch.setattr(oss, "FALLBACK_111_MODEL", "qwen2.5:7b-instruct") monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m") monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45) + monkeypatch.setattr(oss, "FALLBACK_111_NUM_CTX", 4096) monkeypatch.setattr(oss, "FALLBACK_111_MODEL_PATTERNS", ("qwen3:14b",)) fake_resp = MagicMock(status_code=200) @@ -261,6 +262,7 @@ def test_111_fallback_downgrades_heavy_model_and_shortens_keep_alive(monkeypatch payload = mock_post.call_args.kwargs["json"] assert payload["model"] == "qwen2.5:7b-instruct" assert payload["keep_alive"] == "5m" + assert payload["options"]["num_ctx"] == 4096 assert mock_post.call_args.kwargs["timeout"] == 45 assert resp.model == "qwen2.5:7b-instruct" @@ -271,6 +273,7 @@ def test_111_fallback_keeps_light_model_but_caps_timeout(monkeypatch): monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m") monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45) + monkeypatch.setattr(oss, "FALLBACK_111_NUM_CTX", 4096) svc = oss.OllamaService(host="http://192.168.0.111:11434", model="llama3.2:latest") with patch("services.ollama_service.requests.post", side_effect=Timeout): @@ -286,6 +289,7 @@ def test_111_fallback_downgrades_hermes_context_heavy_model(monkeypatch): monkeypatch.setattr(oss, "FALLBACK_111_MODEL", "llama3.2:latest") monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m") monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45) + monkeypatch.setattr(oss, "FALLBACK_111_NUM_CTX", 4096) monkeypatch.setattr(oss, "FALLBACK_111_MODEL_PATTERNS", ("hermes3:*",)) fake_resp = 
MagicMock(status_code=200) @@ -303,4 +307,5 @@ def test_111_fallback_downgrades_hermes_context_heavy_model(monkeypatch): payload = mock_post.call_args.kwargs["json"] assert payload["model"] == "llama3.2:latest" assert payload["keep_alive"] == "5m" + assert payload["options"]["num_ctx"] == 4096 assert resp.model == "llama3.2:latest"