fix: route hermes through ollama cascade
All checks were successful
CD Pipeline / deploy (push) Successful in 57s
All checks were successful
CD Pipeline / deploy (push) Successful in 57s
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
- Gemini 只能作為 Ollama 主路徑失敗後的備援,或 ADR-028 明確鎖定的 MCP Grounding、PPT/vision、週/月報、Code Review、EA HITL、複雜 SKU 升級等低頻場景。
|
||||
- 188 `192.168.0.188` 僅是 App / DB / scheduler / Telegram bot 容器宿主與 AutoHeal target,不可作為 Ollama 節點。
|
||||
- 通用 AI 文案、關鍵字、商品洞察與 Telegram Q&A 第一響應不得 Gemini-first。
|
||||
- Hermes intent / analyst 路徑不得手刻 `/api/generate` 或只 resolve 單次 host;必須走 `OllamaService`,讓同一請求可依序 retry GCP-A → GCP-B → 111。
|
||||
- Code Review pipeline 也必須 Ollama-first:Hermes scan 與 OpenClaw assessment 都走 `OllamaService` 三主機 retry;Gemini telemetry 只能以 `code_review_openclaw_gemini` 出現,表示 Ollama/可選 Claude 備援都失敗後才啟用。
|
||||
- OpenClaw Telegram Q&A 主路徑也不得綁單一 host:`_call_qwen3_qa()` 必須透過 `OllamaService` 跑 GCP-A → GCP-B → 111,並把實際落點寫入 `ai_calls.provider`。
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@
|
||||
- `services/pg_sync_service.py` 是顯式 opt-in legacy CLI,不是生產自動同步路徑;`tests/test_pg_sync_contract.py` 已守住預設 OFF 與 runtime paths 不自動 import。
|
||||
- `qwen3:14b` 不是未使用 Ollama 模型:OpenClaw QA、NemoTron dispatch 與 LLM model router 仍有現役路徑;`tests/test_qwen3_runtime_usage.py` 已守住,不能只因體積大就三主機移除。
|
||||
- Ollama host env 已加白名單護欄:`OLLAMA_HOST*` / `EMBEDDING_HOST` 只接受 GCP-A、GCP-B、111 或 110 proxy,誤設 188/localhost 會回到核准主機。
|
||||
- Hermes intent 與批量 analyst 已從單次 `resolve_ollama_host()` + raw `requests.post('/api/generate')` 改為 `OllamaService.generate()`,同一請求會依序 retry GCP-A → GCP-B → 111,並保留 `HERMES_KEEP_ALIVE` 與實際 provider 回寫測試。
|
||||
- OpenClaw QA / daily Hermes template / NemoTron qwen3 的 flag 文件與測試已對齊 Ollama-first 預設 ON;顯式 `false` 才是 Gemini/NIM legacy 緊急退路。OpenClaw QA 已移除單一 `OPENCLAW_QA_OLLAMA_HOST` 主機覆寫,`_call_qwen3_qa()` 改走 `OllamaService` 的 GCP-A → GCP-B → 111 retry 並回寫實際 provider。
|
||||
- Code Review pipeline 已對齊 Ollama-first:`_hermes_scan()` 與 `_openclaw_assess()` 都先走 `OllamaService` 的 GCP-A → GCP-B → 111 retry;Gemini 僅在 Ollama(與可選 Claude)失敗後以 `code_review_openclaw_gemini` caller 記錄備援,不再以 `code_review_openclaw` 直接 Gemini-first。
|
||||
- `.env.example` 已補齊 Python runtime 實際讀取的環境變數,`tests/test_phase3f_cleanup_contracts.py::test_env_example_documents_runtime_os_env_keys` 會掃 `app.py/config.py/scheduler.py/run_scheduler.py/routes/services/utils` 的 `os.getenv()` / `os.environ.get()`;只允許 `PYTEST_CURRENT_TEST` 與 `MOMO_ALLOW_INSECURE_CONFIG_FOR_TESTS` 兩個測試內部 key 不進範例。
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
Hermes 3 競價情報分析服務 (Module 2)
|
||||
|
||||
角色:分析師 (Analyst)
|
||||
模型:hermes3:latest @ HERMES_URL(預設 192.168.0.111:11434)
|
||||
模型:hermes3:latest @ OllamaService 三主機級聯(GCP-A → GCP-B → 111)
|
||||
輸入:SQL 漏斗篩選後的候選商品(~300筆)
|
||||
輸出:Top N 威脅清單(結構化 JSON)→ 交給 NemoTron dispatcher
|
||||
|
||||
@@ -21,16 +21,15 @@ import uuid
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from sqlalchemy import text
|
||||
from services.mcp_context_service import build_mcp_context
|
||||
from services.ollama_service import resolve_ollama_host, get_host_label
|
||||
from services.ollama_service import OllamaService, get_host_label, get_provider_tag
|
||||
from services.ai_call_logger import log_ai_call # Operation Ollama-First v5.0 P1
|
||||
from services.rag_service import rag_service, is_rag_enabled # Phase 11: RAG-first 快取
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from config import HERMES_URL, HERMES_TIMEOUT
|
||||
from config import HERMES_TIMEOUT
|
||||
|
||||
HERMES_MODEL = "hermes3:latest"
|
||||
HERMES_KEEP_ALIVE = "24h" # ADR-012:保持模型熱駐留,避免被別模型擠下後冷啟動 30+s timeout
|
||||
@@ -219,35 +218,34 @@ class HermesAnalystService:
|
||||
"規則:greeting/help 類 complexity_score<=0.3;涉及數據、報告、日期、"
|
||||
"品牌、競品對比者 complexity_score>=0.7 且 requires_data_fetch=true。"
|
||||
)
|
||||
payload = {
|
||||
"model": HERMES_MODEL,
|
||||
"system": system,
|
||||
"prompt": f"使用者訊息:{message}\n輸出 JSON:",
|
||||
"stream": False,
|
||||
"keep_alive": HERMES_KEEP_ALIVE, # ADR-012:避免冷啟動 timeout
|
||||
"options": {"temperature": 0.1},
|
||||
}
|
||||
target_host = resolve_ollama_host()
|
||||
prompt = f"使用者訊息:{message}\n輸出 JSON:"
|
||||
# Phase 1 v5.0: 包 ai_call_logger 追蹤 Hermes 意圖分類 token / fallback
|
||||
with log_ai_call(
|
||||
caller='hermes_intent',
|
||||
provider='gcp_ollama',
|
||||
model=HERMES_MODEL,
|
||||
meta={'host_label': get_host_label(target_host)},
|
||||
meta={'route': 'ollama_first'},
|
||||
) as _ctx:
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{target_host}/api/generate",
|
||||
json=payload,
|
||||
timeout=HERMES_TIMEOUT, # 統一 config 集中讀取(ADR-008);keep_alive 確保熱駐留時實測 < 10s
|
||||
ollama = OllamaService(model=HERMES_MODEL)
|
||||
resp = ollama.generate(
|
||||
prompt=prompt,
|
||||
model=HERMES_MODEL,
|
||||
system_prompt=system,
|
||||
temperature=0.1,
|
||||
timeout=HERMES_TIMEOUT,
|
||||
keep_alive=HERMES_KEEP_ALIVE, # ADR-012:避免冷啟動 timeout
|
||||
)
|
||||
resp.raise_for_status()
|
||||
body = resp.json()
|
||||
_ctx.set_provider(get_provider_tag(resp.host or ''))
|
||||
_ctx.set_tokens(
|
||||
input=body.get("prompt_eval_count", 0),
|
||||
output=body.get("eval_count", 0),
|
||||
input=resp.input_tokens,
|
||||
output=resp.output_tokens,
|
||||
)
|
||||
raw = (body.get("response", "") or "").strip()
|
||||
_ctx.add_meta('host', resp.host)
|
||||
_ctx.add_meta('host_label', get_host_label(resp.host or ''))
|
||||
if not resp.success:
|
||||
raise RuntimeError(resp.error or "ollama generate failed")
|
||||
raw = (resp.content or "").strip()
|
||||
if raw.startswith("```"):
|
||||
raw = re.sub(r"^```(?:json)?\s*", "", raw, flags=re.MULTILINE)
|
||||
raw = re.sub(r"\s*```\s*$", "", raw.strip(), flags=re.MULTILINE).strip()
|
||||
@@ -495,44 +493,40 @@ class HermesAnalystService:
|
||||
f'"risk": "HIGH|MED|LOW", "recommended_action": string, "confidence": number}}]'
|
||||
)
|
||||
|
||||
payload = {
|
||||
"model": HERMES_MODEL,
|
||||
"system": self.SYSTEM_PROMPT,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"keep_alive": HERMES_KEEP_ALIVE, # ADR-012:避免冷啟動 timeout
|
||||
"options": {"temperature": 0.1},
|
||||
}
|
||||
|
||||
target_host = resolve_ollama_host()
|
||||
# Phase 1 v5.0: 包 ai_call_logger 追蹤 Hermes 競價分析 token / fallback
|
||||
with log_ai_call(
|
||||
caller='hermes_analyst',
|
||||
provider='gcp_ollama',
|
||||
model=HERMES_MODEL,
|
||||
meta={
|
||||
'host_label': get_host_label(target_host),
|
||||
'route': 'ollama_first',
|
||||
'item_count': len(items),
|
||||
'top_n': TOP_N,
|
||||
},
|
||||
) as _ctx:
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{target_host}/api/generate",
|
||||
json=payload,
|
||||
ollama = OllamaService(model=HERMES_MODEL)
|
||||
resp = ollama.generate(
|
||||
prompt=prompt,
|
||||
model=HERMES_MODEL,
|
||||
system_prompt=self.SYSTEM_PROMPT,
|
||||
temperature=0.1,
|
||||
timeout=HERMES_TIMEOUT,
|
||||
keep_alive=HERMES_KEEP_ALIVE,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
_ctx.set_provider(get_provider_tag(resp.host or ''))
|
||||
_ctx.set_tokens(input=resp.input_tokens, output=resp.output_tokens)
|
||||
_ctx.add_meta('host', resp.host)
|
||||
_ctx.add_meta('host_label', get_host_label(resp.host or ''))
|
||||
if not resp.success:
|
||||
raise RuntimeError(resp.error or "ollama generate failed")
|
||||
except Exception as e:
|
||||
_ctx.set_error(f"{type(e).__name__}: {e}")
|
||||
raise
|
||||
|
||||
data = resp.json()
|
||||
raw = data.get("response", "").strip()
|
||||
duration_sec = round(data.get("total_duration", 0) / 1e9, 1)
|
||||
eval_tokens_raw = data.get("eval_count", 0) # Ollama 推理 token 數
|
||||
prompt_tokens_raw = data.get("prompt_eval_count", 0)
|
||||
_ctx.set_tokens(input=prompt_tokens_raw, output=eval_tokens_raw)
|
||||
raw = (resp.content or "").strip()
|
||||
duration_sec = round(resp.total_duration or 0, 1)
|
||||
eval_tokens_raw = resp.output_tokens
|
||||
logger.info(
|
||||
f"[Hermes] 推理耗時 {duration_sec}s,"
|
||||
f"輸入 {len(items)} 筆,tokens={eval_tokens_raw},回應長度 {len(raw)}"
|
||||
@@ -541,8 +535,8 @@ class HermesAnalystService:
|
||||
self._last_stats = {
|
||||
"duration_sec": duration_sec,
|
||||
"tokens": eval_tokens_raw,
|
||||
"host": target_host,
|
||||
"host_label": get_host_label(target_host)
|
||||
"host": resp.host,
|
||||
"host_label": get_host_label(resp.host or '')
|
||||
}
|
||||
|
||||
# P0-1 修復:剝除 Hermes 可能輸出的 markdown code fence
|
||||
|
||||
@@ -286,7 +286,8 @@ class OllamaService:
|
||||
|
||||
def generate(self, prompt: str, model: str = None,
|
||||
system_prompt: str = None, temperature: float = 0.7,
|
||||
timeout: int = None) -> OllamaResponse:
|
||||
timeout: int = None, keep_alive: str = None,
|
||||
options: Optional[Dict[str, Any]] = None) -> OllamaResponse:
|
||||
"""
|
||||
生成文字 — 含三主機自動 retry(HOTFIX 2026-05-04)
|
||||
|
||||
@@ -302,8 +303,12 @@ class OllamaService:
|
||||
"stream": False,
|
||||
"options": {"temperature": temperature},
|
||||
}
|
||||
if options:
|
||||
payload["options"].update(options)
|
||||
if system_prompt:
|
||||
payload["system"] = system_prompt
|
||||
if keep_alive:
|
||||
payload["keep_alive"] = keep_alive
|
||||
|
||||
# HOTFIX 三主機 retry 鏈
|
||||
attempted_hosts: List[str] = []
|
||||
|
||||
132
tests/test_hermes_ollama_cascade.py
Normal file
132
tests/test_hermes_ollama_cascade.py
Normal file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Hermes 分析師必須透過 OllamaService 三主機級聯。"""
|
||||
|
||||
import time
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
import services.ai_call_logger as logger_mod
|
||||
import services.hermes_analyst_service as hermes_mod
|
||||
from services.ai_call_logger import _reset_kill_switch
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def reset_ai_logger(monkeypatch):
    """Collect AI-call log records in memory instead of persisting them.

    Calls ``_reset_kill_switch()``, replaces ``logger_mod._write_to_db`` with
    a recorder, sets ``AI_CALL_LOGGING_ENABLED`` to ``'true'`` and yields the
    list that receives one dict per written record.
    """
    _reset_kill_switch()
    records = []
    copied_fields = (
        'caller', 'provider', 'model', 'status', 'fallback_to', 'error',
    )

    def record_state(state):
        # Snapshot plain values (and a copy of meta) so the assertions see
        # the record as it was at write time.
        snapshot = {field: getattr(state, field) for field in copied_fields}
        snapshot['meta'] = dict(state.meta)
        records.append(snapshot)

    monkeypatch.setenv('AI_CALL_LOGGING_ENABLED', 'true')
    monkeypatch.setattr(logger_mod, '_write_to_db', record_state)
    yield records
|
||||
|
||||
|
||||
def _wait_for(captured, n=1, timeout=2.0):
|
||||
deadline = time.time() + timeout
|
||||
while time.time() < deadline:
|
||||
if len(captured) >= n:
|
||||
return True
|
||||
time.sleep(0.01)
|
||||
return False
|
||||
|
||||
|
||||
def _stub_ollama(monkeypatch, *, content: str, host: str):
    """Replace ``hermes_mod.OllamaService`` with a recording fake.

    Every ``generate()`` call on the fake returns the same successful canned
    response carrying ``content`` and ``host``.

    Returns:
        The fake class; ``instances`` lists every constructed object and each
        object's ``generate_calls`` lists the keyword arguments it received.
    """
    canned_fields = {
        'success': True,
        'content': content,
        'model': hermes_mod.HERMES_MODEL,
        'error': None,
        'total_duration': 1.2,
        'host': host,
        'input_tokens': 33,
        'output_tokens': 22,
    }
    canned_response = SimpleNamespace(**canned_fields)

    class FakeOllamaService:
        # Shared registry so the test can inspect instances it never held.
        instances = []

        def __init__(self, *args, **kwargs):
            self.init_args, self.init_kwargs = args, kwargs
            self.generate_calls = []
            type(self).instances.append(self)

        def generate(self, **kwargs):
            self.generate_calls.append(kwargs)
            return canned_response

    monkeypatch.setattr(hermes_mod, 'OllamaService', FakeOllamaService)
    return FakeOllamaService
|
||||
|
||||
|
||||
def test_hermes_intent_uses_ollama_service_and_logs_actual_host(monkeypatch, reset_ai_logger):
    """Intent classification calls OllamaService and logs the answering host."""
    intent_json = (
        '{"intent":"query_sales","confidence":0.9,"complexity_score":0.8,'
        '"requires_data_fetch":true,"preliminary_answer":""}'
    )
    stub_cls = _stub_ollama(
        monkeypatch,
        content=intent_json,
        host='http://192.168.0.111:11434',
    )

    service = hermes_mod.HermesAnalystService()
    outcome = service._call_hermes_intent("本週業績如何?")

    # The parsed LLM answer is returned and tagged with its source.
    assert outcome['intent'] == 'query_sales'
    assert outcome['metadata']['source'] == 'hermes_llm'

    # The request was made through the stubbed service with the Hermes settings.
    generate_kwargs = stub_cls.instances[0].generate_calls[0]
    assert generate_kwargs['model'] == hermes_mod.HERMES_MODEL
    assert generate_kwargs['keep_alive'] == hermes_mod.HERMES_KEEP_ALIVE

    # The log record carries the host reported by the response.
    assert _wait_for(reset_ai_logger, 1)
    record = reset_ai_logger[0]
    assert record['caller'] == 'hermes_intent'
    assert record['provider'] == 'ollama_111'
    assert record['meta']['host_label'] == '111 備援'
|
||||
|
||||
def test_hermes_batch_analyze_uses_ollama_service_and_logs_secondary(monkeypatch, reset_ai_logger):
    """Batch analysis calls OllamaService and logs the answering host."""
    threats_json = (
        '[{"sku":"A1","name":"測試商品","category":"家電","momo_price":120,'
        '"pchome_price":100,"gap_pct":20,"sales_7d_delta_pct":-30,'
        '"risk":"HIGH","recommended_action":"建議人工評估","confidence":0.8}]'
    )
    stub_cls = _stub_ollama(
        monkeypatch,
        content=threats_json,
        host='http://34.21.145.224:11434',
    )
    # Replace the MCP context builder with a fixed string for this test.
    monkeypatch.setattr(hermes_mod, 'build_mcp_context', lambda *args, **kwargs: 'MCP context')

    candidate = {
        'sku': 'A1',
        'name': '測試商品',
        'category': '家電',
        'momo_price': 120,
        'pchome_price': 100,
        'sales_7d_prev': 1000,
        'sales_7d_curr': 700,
        'competitor_tags': [],
    }

    service = hermes_mod.HermesAnalystService()
    raw_threats, items = service._batch_analyze([candidate])

    assert raw_threats[0]['sku'] == 'A1'
    assert items[0]['gap_pct'] == 20.0

    # The request was made through the stubbed service with the Hermes settings.
    generate_kwargs = stub_cls.instances[0].generate_calls[0]
    assert generate_kwargs['system_prompt'] == service.SYSTEM_PROMPT
    assert generate_kwargs['keep_alive'] == hermes_mod.HERMES_KEEP_ALIVE

    # The log record carries the host reported by the response.
    assert _wait_for(reset_ai_logger, 1)
    record = reset_ai_logger[0]
    assert record['caller'] == 'hermes_analyst'
    assert record['provider'] == 'ollama_secondary'
    assert record['meta']['host_label'] == 'GCP-SSD-2'
|
||||
Reference in New Issue
Block a user