fix(hermes): 改用 Ollama 本地模型（111），零費用，按 agent 類型選模型

模型路由：debugger / vuln-verifier → deepseek-r1:14b（強推理，找根因/安全分析）；critic / db-expert / coder 群 → qwen2.5-coder:7b（程式碼專用）；planner / onboarder / web-researcher → qwen2.5:7b-instruct（通用指令）；default → deepseek-r1:14b。其他變更：_strip_think_tags() 去除 deepseek-r1 <think> 推理塊，只留最終回答；timeout=90s（deepseek-r1 推理較慢）；log 加 model 欄位供 latency 監控。Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-25 03:13:59 +08:00
parent d467cac709
commit 250eca99c6
1 changed file with 58 additions and 14 deletions
--- a/apps/api/src/hermes/nl_gateway.py
+++ b/apps/api/src/hermes/nl_gateway.py
@@ -1,19 +1,19 @@
 """Hermes 自然語言閘道 — ADR-094

-Layer 1 意圖路由（關鍵字正則）→ Anthropic Python SDK 直呼 → Telegram 格式化輸出。
+Layer 1 意圖路由（關鍵字正則）→ Ollama 本地模型（111）→ Telegram 格式化輸出。

 2026-04-24 Claude Sonnet 4.6 (WS4 Hermes NL)
 2026-04-24 Claude Sonnet 4.6 (WS4 Hermes NL T1+T2+T3): hermes_dispatch_log DB 寫入 /
  Redis per-chat_id 速率限制 / Multi-turn session (Redis Hash TTL=300s)
-2026-04-25 Claude Sonnet 4.6: 改用 anthropic Python SDK 直呼，棄用需要 CLI 的
-  claude-agent-sdk（prod pod 無 claude CLI，sdk call 回傳空字串）
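+2026-04-25 Claude Sonnet 4.6: switched to local Ollama models (host 111), chosen per agent type, zero cost
+  debugger/vuln-verifier → deepseek-r1:14b (reasoning); code agents → qwen2.5-coder:7b; planner/onboarder/web-researcher → qwen2.5:7b-instruct; unlisted agents → deepseek-r1:14b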
 """
 from __future__ import annotations
 import asyncio
 import re
 import time

-import anthropic as _anthropic
+import httpx
 import structlog
 from sqlalchemy import text

@@ -50,6 +50,38 @@ _ROUTING_RULES: list[tuple[re.Pattern, str]] = [
 _RATE_LIMIT_MAX = 20
 _RATE_LIMIT_WINDOW_SEC = 60

+# ─────────────────────────────────────────────────────────────────────────────
+# Ollama model routing (pick the best-suited model per agent specialty; host 111)
+# ─────────────────────────────────────────────────────────────────────────────
+_MODEL_BY_AGENT: dict[str, str] = {
+    # Reasoning-heavy (root-cause hunting / security analysis) → deepseek-r1:14b
+    "debugger":           "deepseek-r1:14b",
+    "vuln-verifier":      "deepseek-r1:14b",
+    # Code-focused (review / implementation / refactor / DB / frontend / tooling) → qwen2.5-coder:7b
+    "critic":             "qwen2.5-coder:7b",
+    "db-expert":          "qwen2.5-coder:7b",
+    "fullstack-engineer": "qwen2.5-coder:7b",
+    "refactor-specialist":"qwen2.5-coder:7b",
+    "migration-engineer": "qwen2.5-coder:7b",
+    "frontend-designer":  "qwen2.5-coder:7b",
+    "tool-expert":        "qwen2.5-coder:7b",
+    # General instruction following (planning / onboarding / docs) → qwen2.5:7b-instruct
+    "planner":            "qwen2.5:7b-instruct",
+    "onboarder":          "qwen2.5:7b-instruct",
+    "web-researcher":     "qwen2.5:7b-instruct",
+}
+_DEFAULT_MODEL = "deepseek-r1:14b"  # used by _pick_model for agents not listed above
+_OLLAMA_TIMEOUT = 90.0  # seconds; deepseek-r1:14b reasons slowly, so allow 90s
+
+
+def _pick_model(agent_name: str) -> str:
+    return _MODEL_BY_AGENT.get(agent_name, _DEFAULT_MODEL)  # agents without an entry get _DEFAULT_MODEL
+
+
+def _strip_think_tags(text: str) -> str:
+    """Strip deepseek-r1 <think>...</think> reasoning (also a block left unclosed by truncation); return the trimmed answer."""
+    return re.sub(r"<think>.*?(?:</think>|\Z)", "", text, flags=re.DOTALL).strip()
+

 def _route_intent_layer1(msg: str) -> str:
    """Layer 1: 關鍵字正則路由，回傳 agent 名稱"""
@@ -221,19 +253,29 @@ async def process_nl_message(

    t0 = time.monotonic()

-    # 呼叫 Anthropic Python SDK（直呼 messages.create，不依賴 claude CLI）
+    # 呼叫 Ollama 本地模型（111，零費用，按 agent 選模型）
+    model = _pick_model(agent_name)
    success = False
    error_type: str | None = None
    try:
-        _client = _anthropic.AsyncAnthropic(api_key=settings.CLAUDE_API_KEY or None)
-        _msg = await _client.messages.create(
-            model="claude-haiku-4-5-20251001",
-            max_tokens=1500,
-            system=system_prompt,
-            messages=[{"role": "user", "content": prompt_with_ctx}],
-        )
-        result_text = _msg.content[0].text if _msg.content else ""
+        ollama_base = getattr(settings, "OLLAMA_URL", "http://192.168.0.111:11434")
+        async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc:
+            resp = await _hc.post(
+                f"{ollama_base}/api/chat",
+                json={
+                    "model": model,
+                    "messages": [
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user",   "content": prompt_with_ctx},
+                    ],
+                    "stream": False,
+                    "options": {"num_predict": 1500, "temperature": 0.3},
+                },
+            )
+            resp.raise_for_status()
+            result_text = resp.json().get("message", {}).get("content", "")

+        result_text = _strip_think_tags(result_text)
        if not result_text:
            result_text = "_Agent 回應為空，請稍後再試。_"
        success = True
@@ -241,9 +283,10 @@ async def process_nl_message(
    except Exception as exc:
        error_type = type(exc).__name__
        logger.error(
-            "hermes_nl_sdk_error",
+            "hermes_nl_ollama_error",
            error=str(exc),
            agent=agent_name,
+            model=model,
            exc_type=error_type,
        )
        result_text = f"_Hermes 暫時無法連線（{error_type}），請稍後再試。_"
@@ -252,6 +295,7 @@ async def process_nl_message(
    logger.info(
        "hermes_nl_dispatch",
        agent=agent_name,
+        model=model,
        user_id=user_id,
        chat_id=chat_id,
        username=username,