From 00a808518ee601a5801a3f6641556fa59a6f7e28 Mon Sep 17 00:00:00 2001
From: OoO <ooo@MacBook-Pro.local>
Date: Thu, 21 May 2026 12:38:08 +0800
Subject: [PATCH] =?UTF-8?q?=E5=B0=87=20111=20Ollama=20fallback=20=E6=94=B6?=
 =?UTF-8?q?=E6=96=82=E5=88=B0=E8=BC=95=E9=87=8F=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .env.example                             |  6 ++---
 config.py                                |  2 +-
 docs/AI_INTELLIGENCE_MODULE_SOT.md       | 14 ++++++------
 docs/memory/history_logs.md              |  1 +
 services/ai_call_logger.py               |  5 +++++
 services/code_review_pipeline_service.py |  7 +++++-
 services/hermes_analyst_service.py       |  6 +++++
 services/ollama_service.py               | 12 ++++++----
 services/openclaw_strategist_service.py  |  6 +++++
 tests/test_ollama_resolve.py             | 28 +++++++++++++++++++++++-
 10 files changed, 70 insertions(+), 17 deletions(-)

diff --git a/.env.example b/.env.example
index 024d556..41374cd 100644
--- a/.env.example
+++ b/.env.example
@@ -360,9 +360,9 @@ OLLAMA_MODEL=gemma3:4b
 OLLAMA_TIMEOUT=120
 OLLAMA_COPY_TIMEOUT=180
 OLLAMA_EMBED_TIMEOUT=45
-# 111 是 Mac final fallback，不承接 14B+ 重模型長駐；落到 111 時自動降級與縮短常駐。
-OLLAMA_111_MODEL_FALLBACK=qwen2.5:7b-instruct
-OLLAMA_111_MODEL_DOWNGRADE_PATTERNS=qwen3:14b,deepseek-r1:14b,*:32b,*:70b
+# 111 是 Mac final fallback，不承接 7B+ / vision / long-context 模型長駐；落到 111 時自動降級與縮短常駐。
+OLLAMA_111_MODEL_FALLBACK=llama3.2:latest
+OLLAMA_111_MODEL_DOWNGRADE_PATTERNS=qwen3:*,deepseek-r1:*,hermes3:*,llama3.1:*,qwen2.5:*,qwen2.5-coder:*,gemma3:*,minicpm-v:*,llava:*,*:7b*,*:8b*,*:14b*,*:32b*,*:70b*
 OLLAMA_111_KEEP_ALIVE=5m
 OLLAMA_111_MAX_TIMEOUT=45
 
diff --git a/config.py b/config.py
index 0c47130..a035a3a 100644
--- a/config.py
+++ b/config.py
@@ -323,7 +323,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
 # ==========================================
 # 系統版本與路徑
 # ==========================================
-SYSTEM_VERSION = "V10.361"
+SYSTEM_VERSION = "V10.362"
 LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
 public_url = PUBLIC_URL  # 用於模板顯示
 
diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md
index 3495190..57d6ab6 100644
--- a/docs/AI_INTELLIGENCE_MODULE_SOT.md
+++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md
@@ -2,7 +2,7 @@
 
 > **最後更新**: 2026-05-21 (台北時間)
 > **狀態**: 🟢 四 AI Agent 自動化閉環已落地；LLM 路由紅線升級為 Ollama-first 三主機級聯，Gemini 備援預設關閉
-> **適用版本**: V10.361
+> **適用版本**: V10.362
 
 ---
 
@@ -18,14 +18,14 @@
 - PPT vision、PPT 文案 final fallback、MCP 離線 final fallback 等特殊 Ollama 路徑也不得只打單一 host；如需 `/api/generate`，一律透過 `OllamaService.generate()`。
 - Code Review pipeline 也必須 Ollama-first：Hermes scan 與 OpenClaw assessment 都走 `OllamaService` 三主機 retry；Gemini telemetry 只能以 `code_review_openclaw_gemini` 出現，表示 Ollama/可選 Claude 備援都失敗後才啟用。
 - Code Review Hermes scan 預設不呼叫 LLM，改用 deterministic fast static scan，避免部署後先卡三段 Ollama timeout；需要 LLM 掃描時才以 `CODE_REVIEW_HERMES_LLM_SCAN_ENABLED=true` 啟用本地矩陣。
-- Code Review Hermes LLM scan 啟用時才使用本地模型矩陣：GCP-A `qwen2.5-coder:7b`、GCP-B `gemma3:4b`、111 `hermes3:latest`；不啟用 Gemini 備援，三段本地掃描失敗時只回空 findings 並交由 OpenClaw 本地矩陣續跑。
-- Code Review OpenClaw assessment 保持主機順序 GCP-A → GCP-B → 111，但可使用主機適配本地模型：GCP-A `qwen2.5-coder:7b`、GCP-B `gemma3:4b`、111 `hermes3:latest`；三段本地 Ollama 全失敗後才允許雲端備援。
+- Code Review Hermes LLM scan 啟用時才使用本地模型矩陣：GCP-A `qwen2.5-coder:7b`、GCP-B `gemma3:4b`；落到 111 時由 `OllamaService` 降級到 `llama3.2:latest`。不啟用 Gemini 備援，三段本地掃描失敗時只回空 findings 並交由 OpenClaw 本地矩陣續跑。
+- Code Review OpenClaw assessment 保持主機順序 GCP-A → GCP-B → 111，但可使用主機適配本地模型：GCP-A `qwen2.5-coder:7b`、GCP-B `gemma3:4b`；落到 111 時由 `OllamaService` 降級到 `llama3.2:latest`。三段本地 Ollama 全失敗後才允許雲端備援。
 - OpenClaw Telegram Q&A 主路徑也不得綁單一 host：`_call_qwen3_qa()` 必須透過 `OllamaService` 跑 GCP-A → GCP-B → 111，並把實際落點寫入 `ai_calls.provider`。
 - OpenClaw Telegram 圖片商品辨識也必須 Ollama-first：`_identify_product_name_with_ollama_vision()` 透過 `OllamaService` 嘗試 GCP-A → GCP-B → 111；Gemini 只允許以 `openclaw_bot_image_gemini` caller 作為失敗後備援。
 - OpenClaw 週報、月報、Meta analysis、日報洞察、Telegram PPT 分析與 MCP fallback 也必須 Ollama-first；Gemini caller 只能帶 `_gemini_fallback` 或明確 fallback caller 語意，且不得先於 Ollama/NIM 被呼叫。
 - OpenClaw 週報、月報、Meta analysis、日報洞察與每日報告的 Gemini/NIM 備援 caller 必須登錄在 caller registry、AI 觀測台 agent group 與 Telegram 狀態統計，避免 fallback 用量被歸類為未知或漏算。
 - Gemini API 出站有第二道 kill switch：`GEMINI_FALLBACK_ENABLED` 預設為 `false`。即使 `GEMINI_API_KEY` 存在，通用 AI fallback、OpenClaw 報告/QA/PPT/圖片、MCP Grounding 與 Code Review L3 都不得呼叫 Gemini；只有操作員明確設為 `true` 時，Gemini 才能作緊急備援。
-- 111 `192.168.0.111` 只是最後一道 Mac fallback，不承接 14B+ 重模型長駐；`OllamaService.generate()` 落到 111 時會將 `qwen3:14b` / `deepseek-r1:14b` / 32B+ / 70B+ 依 `OLLAMA_111_MODEL_DOWNGRADE_PATTERNS` 降級到 `OLLAMA_111_MODEL_FALLBACK`，並以 `OLLAMA_111_KEEP_ALIVE=5m`、`OLLAMA_111_MAX_TIMEOUT=45` 封頂，避免 16GB RAM 主機被 14B 模型與 24h keep-alive 壓到 swap。
+- 111 `192.168.0.111` 只是最後一道 Mac fallback，不承接 7B+、vision、long-context 模型長駐；`OllamaService.generate()` 落到 111 時會將 `qwen3`、`deepseek-r1`、`hermes3`、`llama3.1`、`qwen2.5*`、`gemma3`、`llava`、`minicpm-v` 與 7B+ 模型依 `OLLAMA_111_MODEL_DOWNGRADE_PATTERNS` 降級到 `OLLAMA_111_MODEL_FALLBACK=llama3.2:latest`，並以 `OLLAMA_111_KEEP_ALIVE=5m`、`OLLAMA_111_MAX_TIMEOUT=45` 封頂，避免 16GB RAM 主機被大 context runner 與 24h keep-alive 壓到 swap。
 
 ## 一、四 AI Agent 路由架構
 
@@ -37,7 +37,7 @@ SQL漏斗(~300筆)
   任務: 競價威脅分類 → TOP 20 HIGH/MED/LOW
      ↓
 [NemoTron / qwen3] — 派發器
-  主路徑: qwen3:14b @ GCP-A/GCP-B；落到 111 時自動降級 7B
+  主路徑: qwen3:14b @ GCP-A/GCP-B；落到 111 時自動降級 llama3.2
   備援: NVIDIA NIM meta/llama-3.1-8b-instruct
   任務: Tool Calling → Telegram 告警 / DB 寫入
      ↓
@@ -65,8 +65,8 @@ SQL漏斗(~300筆)
 | 角色 | 模型 | 主機 | 成本 | 每日限額 |
 |------|------|------|------|---------|
 | Hermes 分析師 | hermes3:latest / bge-m3 | GCP-A → GCP-B → 111 Ollama | 零 | 無限 |
-| NemoTron 派發器 | qwen3:14b；111 fallback 降級 7B；NIM fallback | GCP-A → GCP-B → 111；NVIDIA NIM 備援 | Ollama 零；NIM 配額內免費 | NIM 80 |
-| OpenClaw 策略師 | qwen3:14b；111 fallback 降級 7B；Gemini 鎖定場景 | Ollama-first；Gemini 備援 | Ollama 零；Gemini 需控管 | — |
+| NemoTron 派發器 | qwen3:14b；111 fallback 降級 llama3.2；NIM fallback | GCP-A → GCP-B → 111；NVIDIA NIM 備援 | Ollama 零；NIM 配額內免費 | NIM 80 |
+| OpenClaw 策略師 | qwen3:14b；111 fallback 降級 llama3.2；Gemini 鎖定場景 | Ollama-first；Gemini 備援 | Ollama 零；Gemini 需控管 | — |
 | ElephantAlpha 編排者 | ElephantAlpha | 依部署環境 | 受控 | HITL / 任務制 |
 
 ---
diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md
index 564b718..e693f29 100644
--- a/docs/memory/history_logs.md
+++ b/docs/memory/history_logs.md
@@ -13,6 +13,7 @@
 ## 📅 詳細更新日誌 (考古存檔)
 
 ### 2026-05-21：瀏覽器測試守門與 PChome 熱路徑優化
+- **V10.362 111 fallback shrink-to-3B**: 111 Mac 實測 `hermes3` / `qwen2.5-coder` 雖是 7B/8B，但 large context runner 仍會佔用 6-10GB RSS 並推高 swap；111 fallback 改為所有 7B+、vision 與 long-context 文字生成都降級到 `llama3.2:latest`，`ai_calls.model` 也會記錄實際降級模型並把原請求模型放入 `meta.requested_model`。
 - **V10.361 111 fallback resource guard**: 實測 111 Mac 高 load 主要來自 Codex app / WindowServer 前台負載，且 Ollama 曾因 fallback 載入 `qwen3:14b` 造成 16GB RAM / swap 壓力；已手動 unload 111 上的重模型，並讓 `OllamaService.generate()` 落到 111 時自動把 14B+ 模型降到 `OLLAMA_111_MODEL_FALLBACK`、`keep_alive` 縮至 `OLLAMA_111_KEEP_ALIVE=5m`、timeout 封頂 `OLLAMA_111_MAX_TIMEOUT=45`。GCP-A/GCP-B 仍可跑 `qwen3:14b`，111 只做短時最後備援。
 - **V10.360 browser smoke guard**: `tests/test_image_fetch.py` 改為預設 skip，只有 `RUN_MOMO_BROWSER_TESTS=1` 才會打開外部 MOMO 網站；手動執行時預設 headless，並關閉 Chrome password manager/autofill，避免一般 pytest 觸發瀏覽器與密碼允許提示。
 - **Scheduler Selenium 防彈窗**: `managed_scraper_resources()` 補 `credentials_enable_service=false`、`profile.password_manager_enabled=false` 與 Autofill/PasswordManager feature disable，降低背景 Selenium 觸發密碼管理提示的機率。
diff --git a/services/ai_call_logger.py b/services/ai_call_logger.py
index a4326a6..dc05b6e 100644
--- a/services/ai_call_logger.py
+++ b/services/ai_call_logger.py
@@ -175,6 +175,11 @@ class _CallState:
         if provider:
             self.provider = provider[:32]
 
+    def set_model(self, model: str) -> None:
+        """Update the actual model. Intended for callers that only learn the landing model after a host-aware downgrade."""
+        if model:  # an empty/None value leaves the previously recorded model untouched
+            self.model = model[:128]  # keep at most the first 128 characters
+
     def set_cache_hit(self, hit: bool = True) -> None:
         self.cache_hit = bool(hit)
 
diff --git a/services/code_review_pipeline_service.py b/services/code_review_pipeline_service.py
index 5555144..17d1377 100644
--- a/services/code_review_pipeline_service.py
+++ b/services/code_review_pipeline_service.py
@@ -340,12 +340,15 @@ class CodeReviewPipeline:
                     )
                     actual_host = resp.host or host
                     _ctx.set_provider(get_provider_tag(actual_host))
+                    _ctx.set_model(resp.model or model_name)
                     _ctx.set_tokens(
                         input=resp.input_tokens,
                         output=resp.output_tokens,
                     )
                     _ctx.add_meta('host', actual_host)
                     _ctx.add_meta('host_label', get_host_label(actual_host))
+                    if resp.model and resp.model != model_name:
+                        _ctx.add_meta('requested_model', model_name)
                     if not resp.success:
                         last_error = resp.error or 'ollama generate failed'
                         _ctx.set_error(last_error)
@@ -529,10 +532,12 @@ class CodeReviewPipeline:
                 )
                 actual_host = resp.host or host
                 _ctx.set_provider(get_provider_tag(actual_host))
+                _ctx.set_model(resp.model or model_name)
                 _ctx.set_tokens(input=resp.input_tokens, output=resp.output_tokens)
                 _ctx.add_meta('host', actual_host)
                 _ctx.add_meta('host_label', get_host_label(actual_host))
-                _ctx.add_meta('model', model_name)
+                if resp.model and resp.model != model_name:
+                    _ctx.add_meta('requested_model', model_name)
                 if resp.success and (resp.content or '').strip():
                     return resp.content or ""
                 last_ollama_error = resp.error or 'ollama generate failed'
diff --git a/services/hermes_analyst_service.py b/services/hermes_analyst_service.py
index ca5ff76..9e9c8e3 100644
--- a/services/hermes_analyst_service.py
+++ b/services/hermes_analyst_service.py
@@ -237,12 +237,15 @@ class HermesAnalystService:
                     keep_alive=HERMES_KEEP_ALIVE,  # ADR-012：避免冷啟動 timeout
                 )
                 _ctx.set_provider(get_provider_tag(resp.host or ''))
+                _ctx.set_model(resp.model or HERMES_MODEL)
                 _ctx.set_tokens(
                     input=resp.input_tokens,
                     output=resp.output_tokens,
                 )
                 _ctx.add_meta('host', resp.host)
                 _ctx.add_meta('host_label', get_host_label(resp.host or ''))
+                if resp.model and resp.model != HERMES_MODEL:
+                    _ctx.add_meta('requested_model', HERMES_MODEL)
                 if not resp.success:
                     raise RuntimeError(resp.error or "ollama generate failed")
                 raw = (resp.content or "").strip()
@@ -516,9 +519,12 @@ class HermesAnalystService:
                     keep_alive=HERMES_KEEP_ALIVE,
                 )
                 _ctx.set_provider(get_provider_tag(resp.host or ''))
+                _ctx.set_model(resp.model or HERMES_MODEL)
                 _ctx.set_tokens(input=resp.input_tokens, output=resp.output_tokens)
                 _ctx.add_meta('host', resp.host)
                 _ctx.add_meta('host_label', get_host_label(resp.host or ''))
+                if resp.model and resp.model != HERMES_MODEL:
+                    _ctx.add_meta('requested_model', HERMES_MODEL)
                 if not resp.success:
                     raise RuntimeError(resp.error or "ollama generate failed")
             except Exception as e:
diff --git a/services/ollama_service.py b/services/ollama_service.py
index e1a6843..74dc5fe 100644
--- a/services/ollama_service.py
+++ b/services/ollama_service.py
@@ -58,12 +58,16 @@ COPY_TIMEOUT = int(os.getenv('OLLAMA_COPY_TIMEOUT', '180'))  # 文案生成專
 EMBED_TIMEOUT = int(os.getenv('OLLAMA_EMBED_TIMEOUT', os.getenv('EMBEDDING_TIMEOUT', '45')))
 FALLBACK_111_KEEP_ALIVE = os.getenv('OLLAMA_111_KEEP_ALIVE', '5m')
 FALLBACK_111_MAX_TIMEOUT = int(os.getenv('OLLAMA_111_MAX_TIMEOUT', '45'))
-FALLBACK_111_MODEL = os.getenv('OLLAMA_111_MODEL_FALLBACK', 'qwen2.5:7b-instruct')
+FALLBACK_111_MODEL = os.getenv('OLLAMA_111_MODEL_FALLBACK', 'llama3.2:latest')
 FALLBACK_111_MODEL_PATTERNS = tuple(
     pattern.strip().lower()
     for pattern in os.getenv(
         'OLLAMA_111_MODEL_DOWNGRADE_PATTERNS',
-        'qwen3:14b,deepseek-r1:14b,*:32b,*:70b',
+        (
+            'qwen3:*,deepseek-r1:*,hermes3:*,llama3.1:*,'
+            'qwen2.5:*,qwen2.5-coder:*,gemma3:*,minicpm-v:*,llava:*,'
+            '*:7b*,*:8b*,*:14b*,*:32b*,*:70b*'
+        ),
     ).split(',')
     if pattern.strip()
 )
@@ -112,9 +116,9 @@ def _is_111_fallback_host(host: str) -> bool:
 
 def _effective_model_for_host(model: str, host: str) -> str:
     """
-    111 是 Mac/HDD final fallback，不承接 14B+ 等重模型。
+    111 是 Mac/HDD final fallback，不承接 7B+ / vision / long-context 等模型。
     GCP-A/GCP-B 仍照 caller 指定模型；只有落到 111 才降級，避免 16GB RAM
-    被 qwen3:14b / deepseek-r1:14b 長時間壓到 swap。
+    被 hermes3/qwen/gemma 的大 context runner 長時間壓到 swap。
     """
     if not _is_111_fallback_host(host):
         return model
diff --git a/services/openclaw_strategist_service.py b/services/openclaw_strategist_service.py
index 84559d0..83f3878 100644
--- a/services/openclaw_strategist_service.py
+++ b/services/openclaw_strategist_service.py
@@ -305,12 +305,15 @@ def _call_qwen3_qa(
             )
             actual_provider = get_provider_tag(resp.host or '')
             ctx.set_provider(actual_provider)
+            ctx.set_model(resp.model or OPENCLAW_QA_OLLAMA_MODEL)
             ctx.set_tokens(
                 input=resp.input_tokens,
                 output=resp.output_tokens,
             )
             ctx.add_meta('host', resp.host)
             ctx.add_meta('host_label', get_host_label(resp.host or ''))
+            if resp.model and resp.model != OPENCLAW_QA_OLLAMA_MODEL:
+                ctx.add_meta('requested_model', OPENCLAW_QA_OLLAMA_MODEL)
             if not resp.success:
                 ctx.set_error(resp.error or 'ollama generate failed')
                 ctx.fallback_to_caller('openclaw_qa_gemini_fallback')
@@ -1108,9 +1111,12 @@ def _call_ollama_strategy(
                 options={"num_predict": predict},
             )
             ctx.set_provider(get_provider_tag(resp.host or ""))
+            ctx.set_model(resp.model or model)
             ctx.set_tokens(input=resp.input_tokens, output=resp.output_tokens)
             ctx.add_meta("host", resp.host)
             ctx.add_meta("host_label", get_host_label(resp.host or ""))
+            if resp.model and resp.model != model:
+                ctx.add_meta("requested_model", model)
             if not resp.success:
                 ctx.set_error(resp.error or "ollama generate failed")
                 ctx.fallback_to_caller(fallback)
diff --git a/tests/test_ollama_resolve.py b/tests/test_ollama_resolve.py
index a2d626b..aeba614 100644
--- a/tests/test_ollama_resolve.py
+++ b/tests/test_ollama_resolve.py
@@ -271,10 +271,36 @@ def test_111_fallback_keeps_light_model_but_caps_timeout(monkeypatch):
 
     monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m")
     monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45)
-    svc = oss.OllamaService(host="http://192.168.0.111:11434", model="hermes3:latest")
+    svc = oss.OllamaService(host="http://192.168.0.111:11434", model="llama3.2:latest")
 
     with patch("services.ollama_service.requests.post", side_effect=Timeout):
         resp = svc.generate("hi", timeout=120, keep_alive="24h")
 
     assert resp.success is False
     assert "timeout (45s)" in resp.error
+
+
+def test_111_fallback_downgrades_hermes_context_heavy_model(monkeypatch):
+    from services import ollama_service as oss
+
+    monkeypatch.setattr(oss, "FALLBACK_111_MODEL", "llama3.2:latest")  # model the test expects in the outgoing request
+    monkeypatch.setattr(oss, "FALLBACK_111_KEEP_ALIVE", "5m")
+    monkeypatch.setattr(oss, "FALLBACK_111_MAX_TIMEOUT", 45)
+    monkeypatch.setattr(oss, "FALLBACK_111_MODEL_PATTERNS", ("hermes3:*",))  # pin the pattern list to hermes3 tags only
+
+    fake_resp = MagicMock(status_code=200)  # stand-in for the HTTP response returned by requests.post
+    fake_resp.json.return_value = {
+        "response": "ok",
+        "prompt_eval_count": 3,
+        "eval_count": 2,
+        "total_duration": 1_000_000_000,
+    }
+    svc = oss.OllamaService(host="http://192.168.0.111:11434", model="hermes3:latest")  # service is built with hermes3 on the 111 host
+
+    with patch("services.ollama_service.requests.post", return_value=fake_resp) as mock_post:
+        resp = svc.generate("hi", timeout=120, keep_alive="24h")  # caller asks for 24h keep-alive and a 120s timeout
+
+    payload = mock_post.call_args.kwargs["json"]  # JSON body handed to the patched requests.post
+    assert payload["model"] == "llama3.2:latest"  # outgoing request must carry the fallback model, not hermes3
+    assert payload["keep_alive"] == "5m"  # outgoing keep_alive must be the patched 5m, not the requested 24h
+    assert resp.model == "llama3.2:latest"  # the response object must report the fallback model as well