From ed29e66fde3921acf035095668cdea84f5485b72 Mon Sep 17 00:00:00 2001
From: OoO <ooo@MacBook-Pro.local>
Date: Wed, 13 May 2026 21:21:05 +0800
Subject: [PATCH] fix: route hermes through ollama cascade

---
 docs/AI_INTELLIGENCE_MODULE_SOT.md            |   1 +
 .../claude_inventory_validation_20260513.md   |   1 +
 services/hermes_analyst_service.py            |  86 ++++++------
 services/ollama_service.py                    |   7 +-
 tests/test_hermes_ollama_cascade.py           | 132 ++++++++++++++++++
 5 files changed, 180 insertions(+), 47 deletions(-)
 create mode 100644 tests/test_hermes_ollama_cascade.py

diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md
index e9d975b..95df0b4 100644
--- a/docs/AI_INTELLIGENCE_MODULE_SOT.md
+++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md
@@ -13,6 +13,7 @@
 - Gemini 只能作為 Ollama 主路徑失敗後的備援，或 ADR-028 明確鎖定的 MCP Grounding、PPT/vision、週/月報、Code Review、EA HITL、複雜 SKU 升級等低頻場景。
 - 188 `192.168.0.188` 僅是 App / DB / scheduler / Telegram bot 容器宿主與 AutoHeal target，不可作為 Ollama 節點。
 - 通用 AI 文案、關鍵字、商品洞察與 Telegram Q&A 第一響應不得 Gemini-first。
+- Hermes intent / analyst 路徑不得手刻 `/api/generate` 或只 resolve 單次 host；必須走 `OllamaService`，讓同一請求可依序 retry GCP-A → GCP-B → 111。
 - Code Review pipeline 也必須 Ollama-first：Hermes scan 與 OpenClaw assessment 都走 `OllamaService` 三主機 retry；Gemini telemetry 只能以 `code_review_openclaw_gemini` 出現，表示 Ollama/可選 Claude 備援都失敗後才啟用。
 - OpenClaw Telegram Q&A 主路徑也不得綁單一 host：`_call_qwen3_qa()` 必須透過 `OllamaService` 跑 GCP-A → GCP-B → 111，並把實際落點寫入 `ai_calls.provider`。
 
diff --git a/docs/memory/claude_inventory_validation_20260513.md b/docs/memory/claude_inventory_validation_20260513.md
index 579a6ea..9060f45 100644
--- a/docs/memory/claude_inventory_validation_20260513.md
+++ b/docs/memory/claude_inventory_validation_20260513.md
@@ -51,6 +51,7 @@
 - `services/pg_sync_service.py` 是顯式 opt-in legacy CLI，不是生產自動同步路徑；`tests/test_pg_sync_contract.py` 已守住預設 OFF 與 runtime paths 不自動 import。
 - `qwen3:14b` 不是未使用 Ollama 模型：OpenClaw QA、NemoTron dispatch 與 LLM model router 仍有現役路徑；`tests/test_qwen3_runtime_usage.py` 已守住，不能只因體積大就三主機移除。
 - Ollama host env 已加白名單護欄：`OLLAMA_HOST*` / `EMBEDDING_HOST` 只接受 GCP-A、GCP-B、111 或 110 proxy，誤設 188/localhost 會回到核准主機。
+- Hermes intent 與批量 analyst 已從單次 `resolve_ollama_host()` + raw `requests.post('/api/generate')` 改為 `OllamaService.generate()`，同一請求會依序 retry GCP-A → GCP-B → 111，並保留 `HERMES_KEEP_ALIVE` 與實際 provider 回寫測試。
 - OpenClaw QA / daily Hermes template / NemoTron qwen3 的 flag 文件與測試已對齊 Ollama-first 預設 ON；顯式 `false` 才是 Gemini/NIM legacy 緊急退路。OpenClaw QA 已移除單一 `OPENCLAW_QA_OLLAMA_HOST` 主機覆寫，`_call_qwen3_qa()` 改走 `OllamaService` 的 GCP-A → GCP-B → 111 retry 並回寫實際 provider。
 - Code Review pipeline 已對齊 Ollama-first：`_hermes_scan()` 與 `_openclaw_assess()` 都先走 `OllamaService` 的 GCP-A → GCP-B → 111 retry；Gemini 僅在 Ollama（與可選 Claude）失敗後以 `code_review_openclaw_gemini` caller 記錄備援，不再以 `code_review_openclaw` 直接 Gemini-first。
 - `.env.example` 已補齊 Python runtime 實際讀取的環境變數，`tests/test_phase3f_cleanup_contracts.py::test_env_example_documents_runtime_os_env_keys` 會掃 `app.py/config.py/scheduler.py/run_scheduler.py/routes/services/utils` 的 `os.getenv()` / `os.environ.get()`；只允許 `PYTEST_CURRENT_TEST` 與 `MOMO_ALLOW_INSECURE_CONFIG_FOR_TESTS` 兩個測試內部 key 不進範例。
diff --git a/services/hermes_analyst_service.py b/services/hermes_analyst_service.py
index b4be5a7..842aae9 100644
--- a/services/hermes_analyst_service.py
+++ b/services/hermes_analyst_service.py
@@ -4,7 +4,7 @@
 Hermes 3 競價情報分析服務 (Module 2)
 
 角色：分析師 (Analyst)
-模型：hermes3:latest @ HERMES_URL（預設 192.168.0.111:11434）
+模型：hermes3:latest @ OllamaService 三主機級聯（GCP-A → GCP-B → 111）
 輸入：SQL 漏斗篩選後的候選商品（~300筆）
 輸出：Top N 威脅清單（結構化 JSON）→ 交給 NemoTron dispatcher
 
@@ -21,16 +21,15 @@ import uuid
 from dataclasses import dataclass
 from typing import Optional
 
-import requests
 from sqlalchemy import text
 from services.mcp_context_service import build_mcp_context
-from services.ollama_service import resolve_ollama_host, get_host_label
+from services.ollama_service import OllamaService, get_host_label, get_provider_tag
 from services.ai_call_logger import log_ai_call  # Operation Ollama-First v5.0 P1
 from services.rag_service import rag_service, is_rag_enabled  # Phase 11: RAG-first 快取
 
 logger = logging.getLogger(__name__)
 
-from config import HERMES_URL, HERMES_TIMEOUT
+from config import HERMES_TIMEOUT
 
 HERMES_MODEL = "hermes3:latest"
 HERMES_KEEP_ALIVE = "24h"  # ADR-012：保持模型熱駐留，避免被別模型擠下後冷啟動 30+s timeout
@@ -219,35 +218,34 @@ class HermesAnalystService:
             "規則：greeting/help 類 complexity_score<=0.3；涉及數據、報告、日期、"
             "品牌、競品對比者 complexity_score>=0.7 且 requires_data_fetch=true。"
         )
-        payload = {
-            "model": HERMES_MODEL,
-            "system": system,
-            "prompt": f"使用者訊息：{message}\n輸出 JSON：",
-            "stream": False,
-            "keep_alive": HERMES_KEEP_ALIVE,  # ADR-012：避免冷啟動 timeout
-            "options": {"temperature": 0.1},
-        }
-        target_host = resolve_ollama_host()
+        prompt = f"使用者訊息：{message}\n輸出 JSON："
         # Phase 1 v5.0: 包 ai_call_logger 追蹤 Hermes 意圖分類 token / fallback
         with log_ai_call(
             caller='hermes_intent',
             provider='gcp_ollama',
             model=HERMES_MODEL,
-            meta={'host_label': get_host_label(target_host)},
+            meta={'route': 'ollama_first'},
         ) as _ctx:
             try:
-                resp = requests.post(
-                    f"{target_host}/api/generate",
-                    json=payload,
-                    timeout=HERMES_TIMEOUT,  # 統一 config 集中讀取（ADR-008）；keep_alive 確保熱駐留時實測 < 10s
+                ollama = OllamaService(model=HERMES_MODEL)
+                resp = ollama.generate(
+                    prompt=prompt,
+                    model=HERMES_MODEL,
+                    system_prompt=system,
+                    temperature=0.1,
+                    timeout=HERMES_TIMEOUT,
+                    keep_alive=HERMES_KEEP_ALIVE,  # ADR-012：避免冷啟動 timeout
                 )
-                resp.raise_for_status()
-                body = resp.json()
+                _ctx.set_provider(get_provider_tag(resp.host or ''))
                 _ctx.set_tokens(
-                    input=body.get("prompt_eval_count", 0),
-                    output=body.get("eval_count", 0),
+                    input=resp.input_tokens,
+                    output=resp.output_tokens,
                 )
-                raw = (body.get("response", "") or "").strip()
+                _ctx.add_meta('host', resp.host)
+                _ctx.add_meta('host_label', get_host_label(resp.host or ''))
+                if not resp.success:
+                    raise RuntimeError(resp.error or "ollama generate failed")
+                raw = (resp.content or "").strip()
                 if raw.startswith("```"):
                     raw = re.sub(r"^```(?:json)?\s*", "", raw, flags=re.MULTILINE)
                     raw = re.sub(r"\s*```\s*$", "", raw.strip(), flags=re.MULTILINE).strip()
@@ -495,44 +493,40 @@ class HermesAnalystService:
             f'"risk": "HIGH|MED|LOW", "recommended_action": string, "confidence": number}}]'
         )
 
-        payload = {
-            "model": HERMES_MODEL,
-            "system": self.SYSTEM_PROMPT,
-            "prompt": prompt,
-            "stream": False,
-            "keep_alive": HERMES_KEEP_ALIVE,  # ADR-012：避免冷啟動 timeout
-            "options": {"temperature": 0.1},
-        }
-
-        target_host = resolve_ollama_host()
         # Phase 1 v5.0: 包 ai_call_logger 追蹤 Hermes 競價分析 token / fallback
         with log_ai_call(
             caller='hermes_analyst',
             provider='gcp_ollama',
             model=HERMES_MODEL,
             meta={
-                'host_label': get_host_label(target_host),
+                'route': 'ollama_first',
                 'item_count': len(items),
                 'top_n': TOP_N,
             },
         ) as _ctx:
             try:
-                resp = requests.post(
-                    f"{target_host}/api/generate",
-                    json=payload,
+                ollama = OllamaService(model=HERMES_MODEL)
+                resp = ollama.generate(
+                    prompt=prompt,
+                    model=HERMES_MODEL,
+                    system_prompt=self.SYSTEM_PROMPT,
+                    temperature=0.1,
                     timeout=HERMES_TIMEOUT,
+                    keep_alive=HERMES_KEEP_ALIVE,
                 )
-                resp.raise_for_status()
+                _ctx.set_provider(get_provider_tag(resp.host or ''))
+                _ctx.set_tokens(input=resp.input_tokens, output=resp.output_tokens)
+                _ctx.add_meta('host', resp.host)
+                _ctx.add_meta('host_label', get_host_label(resp.host or ''))
+                if not resp.success:
+                    raise RuntimeError(resp.error or "ollama generate failed")
             except Exception as e:
                 _ctx.set_error(f"{type(e).__name__}: {e}")
                 raise
 
-            data = resp.json()
-            raw = data.get("response", "").strip()
-            duration_sec = round(data.get("total_duration", 0) / 1e9, 1)
-            eval_tokens_raw  = data.get("eval_count", 0)   # Ollama 推理 token 數
-            prompt_tokens_raw = data.get("prompt_eval_count", 0)
-            _ctx.set_tokens(input=prompt_tokens_raw, output=eval_tokens_raw)
+            raw = (resp.content or "").strip()
+            duration_sec = round(resp.total_duration or 0, 1)
+            eval_tokens_raw = resp.output_tokens
             logger.info(
                 f"[Hermes] 推理耗時 {duration_sec}s，"
                 f"輸入 {len(items)} 筆，tokens={eval_tokens_raw}，回應長度 {len(raw)}"
@@ -541,8 +535,8 @@ class HermesAnalystService:
             self._last_stats = {
                 "duration_sec": duration_sec,
                 "tokens": eval_tokens_raw,
-                "host": target_host,
-                "host_label": get_host_label(target_host)
+                "host": resp.host,
+                "host_label": get_host_label(resp.host or '')
             }
 
         # P0-1 修復：剝除 Hermes 可能輸出的 markdown code fence
diff --git a/services/ollama_service.py b/services/ollama_service.py
index 6d7aa64..10ed9e2 100644
--- a/services/ollama_service.py
+++ b/services/ollama_service.py
@@ -286,7 +286,8 @@ class OllamaService:
 
     def generate(self, prompt: str, model: str = None,
                  system_prompt: str = None, temperature: float = 0.7,
-                 timeout: int = None) -> OllamaResponse:
+                 timeout: int = None, keep_alive: str = None,
+                 options: Optional[Dict[str, Any]] = None) -> OllamaResponse:
         """
         生成文字 — 含三主機自動 retry（HOTFIX 2026-05-04）
 
@@ -302,8 +303,12 @@ class OllamaService:
             "stream": False,
             "options": {"temperature": temperature},
         }
+        if options:
+            payload["options"].update(options)
         if system_prompt:
             payload["system"] = system_prompt
+        if keep_alive:
+            payload["keep_alive"] = keep_alive
 
         # HOTFIX 三主機 retry 鏈
         attempted_hosts: List[str] = []
diff --git a/tests/test_hermes_ollama_cascade.py b/tests/test_hermes_ollama_cascade.py
new file mode 100644
index 0000000..43bdee6
--- /dev/null
+++ b/tests/test_hermes_ollama_cascade.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Hermes 分析師必須透過 OllamaService 三主機級聯。"""
+
+import time
+from types import SimpleNamespace
+
+import pytest
+
+import services.ai_call_logger as logger_mod
+import services.hermes_analyst_service as hermes_mod
+from services.ai_call_logger import _reset_kill_switch
+
+
+@pytest.fixture(autouse=True)
+def reset_ai_logger(monkeypatch):  # autouse fixture; yields the list that collects logged ai_call records
+    _reset_kill_switch()
+    captured = []
+
+    def fake_write(state):  # stand-in for logger_mod._write_to_db: snapshot the fields the tests assert on
+        captured.append({
+            'caller': state.caller,
+            'provider': state.provider,
+            'model': state.model,
+            'status': state.status,
+            'fallback_to': state.fallback_to,
+            'error': state.error,
+            'meta': dict(state.meta),  # shallow copy so later changes to state.meta do not alter the record
+        })
+
+    monkeypatch.setattr(logger_mod, '_write_to_db', fake_write)  # undone automatically by monkeypatch
+    monkeypatch.setenv('AI_CALL_LOGGING_ENABLED', 'true')  # presumably the logger's on/off switch; confirm in ai_call_logger
+    yield captured
+
+
+def _wait_for(captured, n=1, timeout=2.0):
+    """Poll until ``captured`` has >= ``n`` records; return False on timeout."""
+    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
+    deadline = time.monotonic() + timeout
+    while len(captured) < n and time.monotonic() < deadline:
+        time.sleep(0.01)
+    return len(captured) >= n  # final check also catches records that landed during the last sleep
+
+
+def _stub_ollama(monkeypatch, *, content: str, host: str):  # swap hermes_mod.OllamaService for a recording fake; returns the fake class
+    fake_resp = SimpleNamespace(  # canned successful response returned by every generate() call
+        success=True,
+        content=content,
+        model=hermes_mod.HERMES_MODEL,
+        error=None,
+        total_duration=1.2,
+        host=host,  # the host the caller is expected to report as the actual provider
+        input_tokens=33,
+        output_tokens=22,
+    )
+
+    class FakeOllamaService:
+        instances = []  # class-level registry; the class is re-created on every _stub_ollama call, so it starts empty
+
+        def __init__(self, *args, **kwargs):
+            self.init_args = args
+            self.init_kwargs = kwargs
+            self.generate_calls = []  # keyword arguments of each generate() call, in call order
+            FakeOllamaService.instances.append(self)
+
+        def generate(self, **kwargs):  # keyword-only: a positional call from the code under test raises TypeError
+            self.generate_calls.append(kwargs)
+            return fake_resp
+
+    monkeypatch.setattr(hermes_mod, 'OllamaService', FakeOllamaService)
+    return FakeOllamaService
+
+
+def test_hermes_intent_uses_ollama_service_and_logs_actual_host(monkeypatch, reset_ai_logger):
+    fake_service = _stub_ollama(  # patches hermes_mod.OllamaService with a recording fake
+        monkeypatch,
+        content='{"intent":"query_sales","confidence":0.9,"complexity_score":0.8,'
+                '"requires_data_fetch":true,"preliminary_answer":""}',
+        host='http://192.168.0.111:11434',
+    )
+
+    svc = hermes_mod.HermesAnalystService()
+    result = svc._call_hermes_intent("本週業績如何？")
+
+    assert result['intent'] == 'query_sales'
+    assert result['metadata']['source'] == 'hermes_llm'
+    call_kwargs = fake_service.instances[0].generate_calls[0]  # first generate() call on the first fake instance
+    assert call_kwargs['model'] == hermes_mod.HERMES_MODEL
+    assert call_kwargs['keep_alive'] == hermes_mod.HERMES_KEEP_ALIVE
+
+    assert _wait_for(reset_ai_logger, 1)  # poll for the record; presumably the logger writes asynchronously
+    rec = reset_ai_logger[0]
+    assert rec['caller'] == 'hermes_intent'
+    assert rec['provider'] == 'ollama_111'  # expected provider tag for the stubbed 192.168.0.111 host
+    assert rec['meta']['host_label'] == '111 備援'
+
+
+def test_hermes_batch_analyze_uses_ollama_service_and_logs_secondary(monkeypatch, reset_ai_logger):
+    fake_service = _stub_ollama(  # patches hermes_mod.OllamaService with a recording fake
+        monkeypatch,
+        content='[{"sku":"A1","name":"測試商品","category":"家電","momo_price":120,'
+                '"pchome_price":100,"gap_pct":20,"sales_7d_delta_pct":-30,'
+                '"risk":"HIGH","recommended_action":"建議人工評估","confidence":0.8}]',
+        host='http://34.21.145.224:11434',
+    )
+    monkeypatch.setattr(hermes_mod, 'build_mcp_context', lambda *args, **kwargs: 'MCP context')  # fixed string instead of the real context builder
+
+    candidates = [{  # one candidate row: momo 120 vs pchome 100, 7-day sales 1000 -> 700
+        'sku': 'A1',
+        'name': '測試商品',
+        'category': '家電',
+        'momo_price': 120,
+        'pchome_price': 100,
+        'sales_7d_prev': 1000,
+        'sales_7d_curr': 700,
+        'competitor_tags': [],
+    }]
+
+    svc = hermes_mod.HermesAnalystService()
+    raw_threats, items = svc._batch_analyze(candidates)
+
+    assert raw_threats[0]['sku'] == 'A1'
+    assert items[0]['gap_pct'] == 20.0
+    call_kwargs = fake_service.instances[0].generate_calls[0]  # first generate() call on the first fake instance
+    assert call_kwargs['system_prompt'] == svc.SYSTEM_PROMPT
+    assert call_kwargs['keep_alive'] == hermes_mod.HERMES_KEEP_ALIVE
+
+    assert _wait_for(reset_ai_logger, 1)  # poll for the record; presumably the logger writes asynchronously
+    rec = reset_ai_logger[0]
+    assert rec['caller'] == 'hermes_analyst'
+    assert rec['provider'] == 'ollama_secondary'  # expected provider tag for the stubbed 34.21.145.224 host
+    assert rec['meta']['host_label'] == 'GCP-SSD-2'