diff --git a/apps/api/src/services/chat_manager.py b/apps/api/src/services/chat_manager.py index 38584404..e9bdc0f9 100644 --- a/apps/api/src/services/chat_manager.py +++ b/apps/api/src/services/chat_manager.py @@ -5,23 +5,28 @@ Phase 21.5 初版: 2026-03-31 ogt Phase 22.6 重寫: 2026-04-03 ogt (老闆需求: 雙 AI 互動對話) Phase 22.7 更新: 2026-04-03 ogt (老闆指示: OpenClaw→Gemini, NemoClaw→Ollama llama3.2:3b) Phase 22.8 更新: 2026-04-09 ogt (老闆指示: NemoClaw→Ollama 111 deepseek-r1:14b,SRE 推理更強) +Phase 33 更新: 2026-05-05 ogt (ADR-110: OpenClaw chat 改走 GCP-A Ollama interactive lane) 架構: -- OpenClaw (Gemini API): SRE 首席顧問,精準分析 -- NemoClaw (Ollama 192.168.0.111 deepseek-r1:14b): 戰術參謀,深度推理 +- OpenClaw (Ollama GCP-A interactive lane): SRE 首席顧問,精準分析 +- NemoClaw (Ollama interactive lane deepseek-r1:14b): 戰術參謀,深度推理 費用控管: -- Gemini Flash: Input $0.075/1M tokens, Output $0.30/1M tokens -- NemoClaw: 免費 (本地 Ollama) -- 每次回覆顯示 token 用量與費用 -- 月上限 $10 USD (由 ai_rate_limiter 控管) +- OpenClaw/NemoClaw chat 預設免費 Ollama;Gemini 不再作為 ChatManager 預設路徑 +- 每次回覆顯示 token 用量 """ import asyncio +import re + +import httpx import structlog -from src.utils.timezone import now_taipei -from src.repositories.k8s_repository import get_k8s_repository + +from src.core.config import get_settings from src.repositories.incident_repository import get_incident_repository +from src.repositories.k8s_repository import get_k8s_repository +from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint +from src.utils.timezone import now_taipei logger = structlog.get_logger(__name__) @@ -81,73 +86,49 @@ class ChatManager: async def _call_openclaw(self, system_prompt: str, user_message: str) -> str | None: """ - 呼叫 OpenClaw 對話 — Gemini Flash API - - 2026-04-03 ogt: 老闆指示改用 Gemini,費用控管月上限 $10 USD - 每次回覆附帶 token 用量與費用統計 + 呼叫 OpenClaw 對話 — Ollama interactive lane 2026-04-10 Claude Code: 強制合併 OPENCLAW_PERSONA,確保字數限制與格式規範 + 2026-05-05 Codex: 改走 ADR-110 GCP-A/GCP-B/111 Ollama topology,避免個人聊天直打 Gemini """ # 強制在 system_prompt 前置 persona,確保 
LLM 遵守字數與格式 system_prompt = f"{OPENCLAW_PERSONA}\n{system_prompt}" - import httpx - from src.core.config import get_settings settings = get_settings() - api_key = settings.GEMINI_API_KEY - if not api_key: - logger.warning("openclaw_chat_failed", error="GEMINI_API_KEY not configured") - return None - - # 月費用上限檢查 ($10 USD) - MONTHLY_LIMIT_USD = 10.0 - from src.core.redis_client import get_redis - from src.utils.timezone import now_taipei - redis = get_redis() - month_key = f"gemini_cost:{now_taipei().strftime('%Y-%m')}" + model = settings.OPENCLAW_DEFAULT_MODEL + ollama_url = resolve_ollama_endpoint("interactive") try: - current_cost = float(await redis.get(month_key) or 0) - except Exception: - current_cost = 0.0 - - if current_cost >= MONTHLY_LIMIT_USD: - logger.warning("openclaw_gemini_monthly_limit_reached", current_usd=current_cost, limit_usd=MONTHLY_LIMIT_USD) - return f"🔴 OpenClaw 本月 Gemini 用量已達上限 ${MONTHLY_LIMIT_USD} USD(已用 ${current_cost:.4f})" - - # Gemini 2.0 Flash-Lite: 最便宜 (2026-04-03 老闆指示) - model = "gemini-2.0-flash-lite" - try: - async with httpx.AsyncClient(timeout=30.0) as client: + async with httpx.AsyncClient(timeout=40.0) as client: resp = await client.post( - f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent", - headers={"x-goog-api-key": api_key}, + f"{ollama_url}/api/chat", json={ - "system_instruction": {"parts": [{"text": system_prompt}]}, - "contents": [{"parts": [{"text": user_message}]}], - "generationConfig": {"maxOutputTokens": 300, "temperature": 0.7}, + "model": model, + "stream": False, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_message}, + ], + "options": {"num_predict": 900, "temperature": 0.2}, }, ) resp.raise_for_status() data = resp.json() - text = data["candidates"][0]["content"]["parts"][0]["text"].strip() + raw = data.get("message", {}).get("content", "").strip() + text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip() or raw - #
Token/費用統計 + 累計到 Redis - usage = data.get("usageMetadata", {}) - in_tok = usage.get("promptTokenCount", 0) - out_tok = usage.get("candidatesTokenCount", 0) - # Gemini 2.0 Flash-Lite: Input $0.075/1M, Output $0.30/1M - cost = (in_tok * 0.000000075) + (out_tok * 0.0000003) - new_total = current_cost + cost + eval_count = data.get("eval_count", 0) + prompt_eval_count = data.get("prompt_eval_count", 0) + total_tokens = eval_count + prompt_eval_count - try: - await redis.set(month_key, str(round(new_total, 6)), ex=40 * 24 * 3600) # 40天 TTL - except Exception: - pass + logger.info( + "openclaw_ollama_chat_usage", + model=model, + endpoint=ollama_url, + prompt_tokens=prompt_eval_count, + output_tokens=eval_count, + ) - logger.info("openclaw_gemini_usage", in_tokens=in_tok, out_tokens=out_tok, - cost_usd=round(cost, 6), monthly_total_usd=round(new_total, 4)) - - return f"{text}\n\n📊 {in_tok+out_tok} tokens | ${cost:.4f} | 本月累計 ${new_total:.4f}" + return f"{text}\n\n🦙 {model} | {total_tokens} tokens | 免費" except Exception as e: logger.warning("openclaw_chat_failed", error=str(e)) return None @@ -164,12 +145,8 @@ class ChatManager: # 強制在 system_prompt 前置 persona system_prompt = f"{NEMOCLAW_PERSONA}\n{system_prompt}" - import httpx - import re - from src.core.config import get_settings as _get_settings - - # 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111 - OLLAMA_URL = _get_settings().OLLAMA_URL + # 2026-05-05 Codex: ADR-110 interactive lane,由 resolver 管理 GCP-A/GCP-B/111 拓撲 + OLLAMA_URL = resolve_ollama_endpoint("interactive") MODEL = "deepseek-r1:14b" try: @@ -250,14 +227,14 @@ class ChatManager: # 2026-04-03 ogt: 移除 asyncio.shield — shield 會在超時後讓 task 繼續跑但無人等待,造成 silent leak try: openclaw_raw = await asyncio.wait_for(openclaw_task, timeout=40.0) - except asyncio.TimeoutError: + except TimeoutError: openclaw_raw = None openclaw_block = f"🦞 OpenClaw:\n{openclaw_raw or '🔴 無響應'}" try: nemo_raw = await asyncio.wait_for(nemo_task, timeout=60.0) - except 
asyncio.TimeoutError: + except TimeoutError: nemo_raw = None if nemo_raw: diff --git a/apps/api/tests/test_chat_manager_ollama_routing.py b/apps/api/tests/test_chat_manager_ollama_routing.py new file mode 100644 index 00000000..d5dc40f6 --- /dev/null +++ b/apps/api/tests/test_chat_manager_ollama_routing.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from src.services import chat_manager as chat_module +from src.services.chat_manager import ChatManager + + +class _FakeResponse: + def __init__(self, content: str = "老闆,系統目前穩定。") -> None: + self._content = content + + def raise_for_status(self) -> None: + return None + + def json(self) -> dict[str, Any]: + return { + "message": {"content": self._content}, + "prompt_eval_count": 11, + "eval_count": 13, + } + + +class _FakeAsyncClient: + posted: list[tuple[str, dict[str, Any]]] = [] + + def __init__(self, *args: Any, **kwargs: Any) -> None: + self.args = args + self.kwargs = kwargs + + async def __aenter__(self) -> _FakeAsyncClient: + return self + + async def __aexit__(self, *args: Any) -> None: + return None + + async def post(self, url: str, *, json: dict[str, Any]) -> _FakeResponse: + self.posted.append((url, json)) + return _FakeResponse() + + +def _settings() -> SimpleNamespace: + return SimpleNamespace(OPENCLAW_DEFAULT_MODEL="qwen3:14b") + + +@pytest.fixture(autouse=True) +def _reset_fake_client() -> None: + _FakeAsyncClient.posted = [] + + +@pytest.mark.asyncio +async def test_openclaw_chat_uses_ollama_interactive_lane( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(chat_module.httpx, "AsyncClient", _FakeAsyncClient) + monkeypatch.setattr(chat_module, "get_settings", _settings) + monkeypatch.setattr( + chat_module, + "resolve_ollama_endpoint", + lambda workload_type: "http://gcp-a:11435", + ) + + result = await ChatManager()._call_openclaw("system context", "幫我看狀態") + + assert 
result is not None + assert "qwen3:14b" in result + assert "免費" in result + + assert len(_FakeAsyncClient.posted) == 1 + url, payload = _FakeAsyncClient.posted[0] + assert url == "http://gcp-a:11435/api/chat" + assert payload["model"] == "qwen3:14b" + assert payload["messages"][0]["role"] == "system" + assert payload["messages"][1] == {"role": "user", "content": "幫我看狀態"} + + +@pytest.mark.asyncio +async def test_nemoclaw_chat_uses_resolved_interactive_lane( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(chat_module.httpx, "AsyncClient", _FakeAsyncClient) + monkeypatch.setattr( + chat_module, + "resolve_ollama_endpoint", + lambda workload_type: "http://gcp-a:11435", + ) + + result = await ChatManager()._call_nemotron("system context", "補充觀點") + + assert result is not None + url, payload = _FakeAsyncClient.posted[0] + assert url == "http://gcp-a:11435/api/chat" + assert payload["model"] == "deepseek-r1:14b" + + +def test_chat_manager_has_no_direct_gemini_generation_path() -> None: + source_path = Path(chat_module.__file__).resolve() + source = source_path.read_text(encoding="utf-8") + + assert "generativelanguage.googleapis.com" not in source + assert "GEMINI_API_KEY" not in source