fix: route hermes through ollama cascade
All checks were successful
CD Pipeline / deploy (push) Successful in 57s

This commit is contained in:
OoO
2026-05-13 21:21:05 +08:00
parent f656026082
commit ed29e66fde
5 changed files with 180 additions and 47 deletions

View File

@@ -13,6 +13,7 @@
- Gemini 只能作為 Ollama 主路徑失敗後的備援,或 ADR-028 明確鎖定的 MCP Grounding、PPT/vision、週/月報、Code Review、EA HITL、複雜 SKU 升級等低頻場景。
- 188 `192.168.0.188` 僅是 App / DB / scheduler / Telegram bot 容器宿主與 AutoHeal target,不可作為 Ollama 節點。
- 通用 AI 文案、關鍵字、商品洞察與 Telegram Q&A 第一響應不得 Gemini-first。
- Hermes intent / analyst 路徑不得手刻 `/api/generate` 或只 resolve 單次 host,必須走 `OllamaService`,讓同一請求可依序 retry GCP-A → GCP-B → 111。
- Code Review pipeline 也必須 Ollama-first:Hermes scan 與 OpenClaw assessment 都走 `OllamaService` 三主機 retry;Gemini telemetry 只能以 `code_review_openclaw_gemini` 出現,表示 Ollama/可選 Claude 備援都失敗後才啟用。
- OpenClaw Telegram Q&A 主路徑也不得綁單一 host;`_call_qwen3_qa()` 必須透過 `OllamaService` 跑 GCP-A → GCP-B → 111,並把實際落點寫入 `ai_calls.provider`。

View File

@@ -51,6 +51,7 @@
- `services/pg_sync_service.py` 是顯式 opt-in legacy CLI,不是生產自動同步路徑;`tests/test_pg_sync_contract.py` 已守住預設 OFF 與 runtime paths 不自動 import。
- `qwen3:14b` 不是未使用 Ollama 模型,OpenClaw QA、NemoTron dispatch 與 LLM model router 仍有現役路徑;`tests/test_qwen3_runtime_usage.py` 已守住,不能只因體積大就三主機移除。
- Ollama host env 已加白名單護欄:`OLLAMA_HOST*` / `EMBEDDING_HOST` 只接受 GCP-A、GCP-B、111 或 110 proxy,誤設 188/localhost 會回到核准主機。
- Hermes intent 與批量 analyst 已從單次 `resolve_ollama_host()` + raw `requests.post('/api/generate')` 改為 `OllamaService.generate()`,同一請求會依序 retry GCP-A → GCP-B → 111,並保留 `HERMES_KEEP_ALIVE` 與實際 provider 回寫測試。
- OpenClaw QA / daily Hermes template / NemoTron qwen3 的 flag 文件與測試已對齊 Ollama-first 預設 ON,顯式 `false` 才是 Gemini/NIM legacy 緊急退路。OpenClaw QA 已移除單一 `OPENCLAW_QA_OLLAMA_HOST` 主機覆寫,`_call_qwen3_qa()` 改走 `OllamaService` 的 GCP-A → GCP-B → 111 retry 並回寫實際 provider。
- Code Review pipeline 已對齊 Ollama-first:`_hermes_scan()`、`_openclaw_assess()` 都先走 `OllamaService` 的 GCP-A → GCP-B → 111 retry;Gemini 僅在 Ollama(與可選 Claude)失敗後以 `code_review_openclaw_gemini` caller 記錄備援,不再以 `code_review_openclaw` 直接 Gemini-first。
- `.env.example` 已補齊 Python runtime 實際讀取的環境變數,`tests/test_phase3f_cleanup_contracts.py::test_env_example_documents_runtime_os_env_keys` 會掃 `app.py/config.py/scheduler.py/run_scheduler.py/routes/services/utils` 中的 `os.getenv()` / `os.environ.get()`;只允許 `PYTEST_CURRENT_TEST`、`MOMO_ALLOW_INSECURE_CONFIG_FOR_TESTS` 兩個測試內部 key 不進範例。

View File

@@ -4,7 +4,7 @@
Hermes 3 競價情報分析服務 (Module 2)
角色:分析師 (Analyst)
模型:hermes3:latest @ HERMES_URL(預設 192.168.0.111:11434)
模型:hermes3:latest @ OllamaService 三主機級聯(GCP-A → GCP-B → 111)
輸入:SQL 漏斗篩選後的候選商品(~300筆)
輸出:Top N 威脅清單(結構化 JSON)→ 交給 NemoTron dispatcher
@@ -21,16 +21,15 @@ import uuid
from dataclasses import dataclass
from typing import Optional
import requests
from sqlalchemy import text
from services.mcp_context_service import build_mcp_context
from services.ollama_service import resolve_ollama_host, get_host_label
from services.ollama_service import OllamaService, get_host_label, get_provider_tag
from services.ai_call_logger import log_ai_call # Operation Ollama-First v5.0 P1
from services.rag_service import rag_service, is_rag_enabled # Phase 11: RAG-first 快取
logger = logging.getLogger(__name__)
from config import HERMES_URL, HERMES_TIMEOUT
from config import HERMES_TIMEOUT
HERMES_MODEL = "hermes3:latest"
HERMES_KEEP_ALIVE = "24h" # ADR-012保持模型熱駐留避免被別模型擠下後冷啟動 30+s timeout
@@ -219,35 +218,34 @@ class HermesAnalystService:
"規則greeting/help 類 complexity_score<=0.3;涉及數據、報告、日期、"
"品牌、競品對比者 complexity_score>=0.7 且 requires_data_fetch=true。"
)
payload = {
"model": HERMES_MODEL,
"system": system,
"prompt": f"使用者訊息:{message}\n輸出 JSON",
"stream": False,
"keep_alive": HERMES_KEEP_ALIVE, # ADR-012避免冷啟動 timeout
"options": {"temperature": 0.1},
}
target_host = resolve_ollama_host()
prompt = f"使用者訊息:{message}\n輸出 JSON"
# Phase 1 v5.0: 包 ai_call_logger 追蹤 Hermes 意圖分類 token / fallback
with log_ai_call(
caller='hermes_intent',
provider='gcp_ollama',
model=HERMES_MODEL,
meta={'host_label': get_host_label(target_host)},
meta={'route': 'ollama_first'},
) as _ctx:
try:
resp = requests.post(
f"{target_host}/api/generate",
json=payload,
timeout=HERMES_TIMEOUT, # 統一 config 集中讀取ADR-008keep_alive 確保熱駐留時實測 < 10s
ollama = OllamaService(model=HERMES_MODEL)
resp = ollama.generate(
prompt=prompt,
model=HERMES_MODEL,
system_prompt=system,
temperature=0.1,
timeout=HERMES_TIMEOUT,
keep_alive=HERMES_KEEP_ALIVE, # ADR-012避免冷啟動 timeout
)
resp.raise_for_status()
body = resp.json()
_ctx.set_provider(get_provider_tag(resp.host or ''))
_ctx.set_tokens(
input=body.get("prompt_eval_count", 0),
output=body.get("eval_count", 0),
input=resp.input_tokens,
output=resp.output_tokens,
)
raw = (body.get("response", "") or "").strip()
_ctx.add_meta('host', resp.host)
_ctx.add_meta('host_label', get_host_label(resp.host or ''))
if not resp.success:
raise RuntimeError(resp.error or "ollama generate failed")
raw = (resp.content or "").strip()
if raw.startswith("```"):
raw = re.sub(r"^```(?:json)?\s*", "", raw, flags=re.MULTILINE)
raw = re.sub(r"\s*```\s*$", "", raw.strip(), flags=re.MULTILINE).strip()
@@ -495,44 +493,40 @@ class HermesAnalystService:
f'"risk": "HIGH|MED|LOW", "recommended_action": string, "confidence": number}}]'
)
payload = {
"model": HERMES_MODEL,
"system": self.SYSTEM_PROMPT,
"prompt": prompt,
"stream": False,
"keep_alive": HERMES_KEEP_ALIVE, # ADR-012避免冷啟動 timeout
"options": {"temperature": 0.1},
}
target_host = resolve_ollama_host()
# Phase 1 v5.0: 包 ai_call_logger 追蹤 Hermes 競價分析 token / fallback
with log_ai_call(
caller='hermes_analyst',
provider='gcp_ollama',
model=HERMES_MODEL,
meta={
'host_label': get_host_label(target_host),
'route': 'ollama_first',
'item_count': len(items),
'top_n': TOP_N,
},
) as _ctx:
try:
resp = requests.post(
f"{target_host}/api/generate",
json=payload,
ollama = OllamaService(model=HERMES_MODEL)
resp = ollama.generate(
prompt=prompt,
model=HERMES_MODEL,
system_prompt=self.SYSTEM_PROMPT,
temperature=0.1,
timeout=HERMES_TIMEOUT,
keep_alive=HERMES_KEEP_ALIVE,
)
resp.raise_for_status()
_ctx.set_provider(get_provider_tag(resp.host or ''))
_ctx.set_tokens(input=resp.input_tokens, output=resp.output_tokens)
_ctx.add_meta('host', resp.host)
_ctx.add_meta('host_label', get_host_label(resp.host or ''))
if not resp.success:
raise RuntimeError(resp.error or "ollama generate failed")
except Exception as e:
_ctx.set_error(f"{type(e).__name__}: {e}")
raise
data = resp.json()
raw = data.get("response", "").strip()
duration_sec = round(data.get("total_duration", 0) / 1e9, 1)
eval_tokens_raw = data.get("eval_count", 0) # Ollama 推理 token 數
prompt_tokens_raw = data.get("prompt_eval_count", 0)
_ctx.set_tokens(input=prompt_tokens_raw, output=eval_tokens_raw)
raw = (resp.content or "").strip()
duration_sec = round(resp.total_duration or 0, 1)
eval_tokens_raw = resp.output_tokens
logger.info(
f"[Hermes] 推理耗時 {duration_sec}s"
f"輸入 {len(items)}tokens={eval_tokens_raw},回應長度 {len(raw)}"
@@ -541,8 +535,8 @@ class HermesAnalystService:
self._last_stats = {
"duration_sec": duration_sec,
"tokens": eval_tokens_raw,
"host": target_host,
"host_label": get_host_label(target_host)
"host": resp.host,
"host_label": get_host_label(resp.host or '')
}
# P0-1 修復:剝除 Hermes 可能輸出的 markdown code fence

View File

@@ -286,7 +286,8 @@ class OllamaService:
def generate(self, prompt: str, model: str = None,
system_prompt: str = None, temperature: float = 0.7,
timeout: int = None) -> OllamaResponse:
timeout: int = None, keep_alive: str = None,
options: Optional[Dict[str, Any]] = None) -> OllamaResponse:
"""
生成文字 — 含三主機自動 retryHOTFIX 2026-05-04
@@ -302,8 +303,12 @@ class OllamaService:
"stream": False,
"options": {"temperature": temperature},
}
if options:
payload["options"].update(options)
if system_prompt:
payload["system"] = system_prompt
if keep_alive:
payload["keep_alive"] = keep_alive
# HOTFIX 三主機 retry 鏈
attempted_hosts: List[str] = []

View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Hermes 分析師必須透過 OllamaService 三主機級聯。"""
import time
from types import SimpleNamespace
import pytest
import services.ai_call_logger as logger_mod
import services.hermes_analyst_service as hermes_mod
from services.ai_call_logger import _reset_kill_switch
@pytest.fixture(autouse=True)
def reset_ai_logger(monkeypatch):
    """Record every ai_call_logger write in an in-memory list for assertions.

    Runs automatically for each test: calls ``_reset_kill_switch()``, replaces
    ``logger_mod._write_to_db`` with a recorder, and sets
    ``AI_CALL_LOGGING_ENABLED=true`` in the environment.

    Yields:
        The list that the recorder appends to; each entry is a dict holding
        the state's caller / provider / model / status / fallback_to / error
        plus a shallow copy of its ``meta`` mapping.
    """
    recorded_fields = ('caller', 'provider', 'model', 'status', 'fallback_to', 'error')
    captured = []

    def fake_write(state):
        # Snapshot the fields right away; meta is copied so later mutation of
        # the state object cannot change what the test observes.
        row = {field: getattr(state, field) for field in recorded_fields}
        row['meta'] = dict(state.meta)
        captured.append(row)

    _reset_kill_switch()
    monkeypatch.setattr(logger_mod, '_write_to_db', fake_write)
    monkeypatch.setenv('AI_CALL_LOGGING_ENABLED', 'true')
    yield captured
def _wait_for(captured, n=1, timeout=2.0):
deadline = time.time() + timeout
while time.time() < deadline:
if len(captured) >= n:
return True
time.sleep(0.01)
return False
def _stub_ollama(monkeypatch, *, content: str, host: str):
    """Swap ``hermes_mod.OllamaService`` for a recording fake.

    Args:
        monkeypatch: pytest monkeypatch fixture used to install the fake.
        content: Text returned as the fake response's ``content``.
        host: Host URL reported on the fake response's ``host``.

    Returns:
        The fake class. Its ``instances`` list holds every constructed
        instance, and each instance keeps the keyword arguments of every
        ``generate`` call in ``generate_calls``.
    """
    response_fields = {
        'success': True,
        'content': content,
        'model': hermes_mod.HERMES_MODEL,
        'error': None,
        'total_duration': 1.2,
        'host': host,
        'input_tokens': 33,
        'output_tokens': 22,
    }
    fake_resp = SimpleNamespace(**response_fields)

    class FakeOllamaService:
        # Fresh list per _stub_ollama call, because the class is defined here.
        instances = []

        def __init__(self, *args, **kwargs):
            self.init_args = args
            self.init_kwargs = kwargs
            self.generate_calls = []
            FakeOllamaService.instances.append(self)

        def generate(self, **kwargs):
            # Keyword-only on purpose: positional use by the caller fails.
            self.generate_calls.append(kwargs)
            return fake_resp

    monkeypatch.setattr(hermes_mod, 'OllamaService', FakeOllamaService)
    return FakeOllamaService
def test_hermes_intent_uses_ollama_service_and_logs_actual_host(monkeypatch, reset_ai_logger):
    """Intent classification goes through OllamaService and logs the serving host."""
    intent_json = (
        '{"intent":"query_sales","confidence":0.9,"complexity_score":0.8,'
        '"requires_data_fetch":true,"preliminary_answer":""}'
    )
    stub_cls = _stub_ollama(
        monkeypatch,
        content=intent_json,
        host='http://192.168.0.111:11434',
    )

    analyst = hermes_mod.HermesAnalystService()
    outcome = analyst._call_hermes_intent("本週業績如何?")

    # Parsed result comes from the stubbed LLM content.
    assert outcome['intent'] == 'query_sales'
    assert outcome['metadata']['source'] == 'hermes_llm'

    # The service must have been invoked with the Hermes model and keep_alive.
    generate_kwargs = stub_cls.instances[0].generate_calls[0]
    assert generate_kwargs['model'] == hermes_mod.HERMES_MODEL
    assert generate_kwargs['keep_alive'] == hermes_mod.HERMES_KEEP_ALIVE

    # The logged record reflects the host the response reported.
    assert _wait_for(reset_ai_logger, 1)
    logged = reset_ai_logger[0]
    assert logged['caller'] == 'hermes_intent'
    assert logged['provider'] == 'ollama_111'
    assert logged['meta']['host_label'] == '111 備援'
def test_hermes_batch_analyze_uses_ollama_service_and_logs_secondary(monkeypatch, reset_ai_logger):
    """Batch analysis goes through OllamaService and logs the secondary host."""
    threats_json = (
        '[{"sku":"A1","name":"測試商品","category":"家電","momo_price":120,'
        '"pchome_price":100,"gap_pct":20,"sales_7d_delta_pct":-30,'
        '"risk":"HIGH","recommended_action":"建議人工評估","confidence":0.8}]'
    )
    stub_cls = _stub_ollama(
        monkeypatch,
        content=threats_json,
        host='http://34.21.145.224:11434',
    )
    # Replace the MCP context builder with a constant for this test.
    monkeypatch.setattr(hermes_mod, 'build_mcp_context', lambda *args, **kwargs: 'MCP context')

    candidate = {
        'sku': 'A1',
        'name': '測試商品',
        'category': '家電',
        'momo_price': 120,
        'pchome_price': 100,
        'sales_7d_prev': 1000,
        'sales_7d_curr': 700,
        'competitor_tags': [],
    }

    analyst = hermes_mod.HermesAnalystService()
    raw_threats, items = analyst._batch_analyze([candidate])

    assert raw_threats[0]['sku'] == 'A1'
    assert items[0]['gap_pct'] == 20.0

    # The service must receive the analyst system prompt and keep_alive.
    generate_kwargs = stub_cls.instances[0].generate_calls[0]
    assert generate_kwargs['system_prompt'] == analyst.SYSTEM_PROMPT
    assert generate_kwargs['keep_alive'] == hermes_mod.HERMES_KEEP_ALIVE

    # The logged record reflects the host the response reported.
    assert _wait_for(reset_ai_logger, 1)
    logged = reset_ai_logger[0]
    assert logged['caller'] == 'hermes_analyst'
    assert logged['provider'] == 'ollama_secondary'
    assert logged['meta']['host_label'] == 'GCP-SSD-2'