diff --git a/apps/api/src/api/v1/rag.py b/apps/api/src/api/v1/rag.py index ad806ff6..c41060ca 100644 --- a/apps/api/src/api/v1/rag.py +++ b/apps/api/src/api/v1/rag.py @@ -64,6 +64,7 @@ async def rag_debug() -> dict: """診斷用:確認容器內 docs 路徑 + Ollama 連線""" import os from pathlib import Path + import httpx paths_check = {} @@ -78,12 +79,23 @@ async def rag_debug() -> dict: try: async with httpx.AsyncClient(timeout=10.0) as c: from src.core.config import get_settings as _gs + from src.services.ollama_endpoint_resolver import resolve_ollama_order + settings = _gs() - r = await c.post( - f"{settings.OLLAMA_URL}/api/embeddings", - json={"model": settings.OLLAMA_EMBEDDING_MODEL, "prompt": "test"}, - ) - ollama_ok = r.status_code == 200 if r.status_code == 200 else f"http_{r.status_code}" + statuses: list[str] = [] + for endpoint in resolve_ollama_order("embedding"): + if not endpoint.url: + continue + r = await c.post( + f"{endpoint.url}/api/embeddings", + json={"model": settings.OLLAMA_EMBEDDING_MODEL, "prompt": "test"}, + ) + if r.status_code == 200: + ollama_ok = True + break + statuses.append(f"{endpoint.provider_name}=http_{r.status_code}") + if ollama_ok is not True: + ollama_ok = ", ".join(statuses) or "no_endpoint" except Exception as e: ollama_ok = f"error: {type(e).__name__}: {e}" diff --git a/apps/api/src/hermes/nl_gateway.py b/apps/api/src/hermes/nl_gateway.py index 0d633557..29f9f4e6 100644 --- a/apps/api/src/hermes/nl_gateway.py +++ b/apps/api/src/hermes/nl_gateway.py @@ -23,6 +23,7 @@ from src.db.base import get_db_context from src.hermes.agent_loader import get_agent_system_prompt from src.hermes.display_names import DEFAULT_AGENT, format_response_header from src.hermes.safety_hooks import is_dangerous_input, is_mutate_intent +from src.services.ollama_endpoint_resolver import resolve_ollama_order logger = structlog.get_logger(__name__) @@ -261,44 +262,46 @@ async def process_nl_message( t0 = time.monotonic() - # 呼叫 Ollama 本地模型(111,零費用,按 agent 選模型) + # 呼叫 Ollama 模型(GCP-A → GCP-B → 111,零費用,按 agent 選模型) model = _pick_model(agent_name) success = False error_type: str | None = None - try: - from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint - - ollama_base = resolve_ollama_endpoint("hermes") - async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc: - resp = await _hc.post( - f"{ollama_base}/api/chat", - json={ - "model": model, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": prompt_with_ctx}, - ], - "stream": False, - "options": {"num_predict": 1500, "temperature": 0.3}, - }, - ) - resp.raise_for_status() - result_text = resp.json().get("message", {}).get("content", "") - - result_text = _strip_think_tags(result_text) - if not result_text: - result_text = "_Agent 回應為空,請稍後再試。_" - success = True - - except Exception as exc: - error_type = type(exc).__name__ - logger.error( - "hermes_nl_ollama_error", - error=str(exc), - agent=agent_name, - model=model, - exc_type=error_type, - ) + result_text = "" + async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc: + for endpoint in resolve_ollama_order("hermes"): + if not endpoint.url: + continue + try: + resp = await _hc.post( + f"{endpoint.url}/api/chat", + json={ + "model": model, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt_with_ctx}, + ], + "stream": False, + "options": {"num_predict": 1500, "temperature": 0.3}, + }, + ) + resp.raise_for_status() + result_text = resp.json().get("message", {}).get("content", "") + result_text = _strip_think_tags(result_text) + if not result_text: + result_text = "_Agent 回應為空,請稍後再試。_" + success = True + break + except Exception as exc: + error_type = type(exc).__name__ + logger.error( + "hermes_nl_ollama_error", + error=str(exc), + agent=agent_name, + model=model, + provider=endpoint.provider_name, + exc_type=error_type, + ) + if not success: result_text = f"_Hermes 暫時無法連線({error_type}),請稍後再試。_" latency_ms = int((time.monotonic() - t0) * 1000) diff --git a/apps/api/src/routes/agent.py b/apps/api/src/routes/agent.py index cf67de47..348480c7 100644 --- a/apps/api/src/routes/agent.py +++ b/apps/api/src/routes/agent.py @@ -19,10 +19,11 @@ router = APIRouter() logger = logging.getLogger(__name__) # ==================== Ollama Config ==================== -# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111 -def _get_ollama_base_url() -> str: - from src.core.config import get_settings - return get_settings().OLLAMA_URL +# 2026-05-19 Codex: agent thinking stream follows GCP-A → GCP-B → 111. +def _get_ollama_endpoints(): + from src.services.ollama_endpoint_resolver import resolve_ollama_order + + return resolve_ollama_order("interactive") OLLAMA_MODEL = "llama3.2:latest" # 可根據實際部署調整 OLLAMA_TIMEOUT = 120.0 # 串流超時 @@ -112,66 +113,82 @@ async def get_agent_thinking( # 1. 開始思考 yield f"data: {json.dumps({'type': 'thinking', 'content': '正在連接 AI 模型...'}, ensure_ascii=False)}\n\n" - try: - async with httpx.AsyncClient(timeout=OLLAMA_TIMEOUT) as client: - # 2. 發送請求到 Ollama - yield f"data: {json.dumps({'type': 'thinking', 'content': f'模型: {model}'}, ensure_ascii=False)}\n\n" + last_error = "" + async with httpx.AsyncClient(timeout=OLLAMA_TIMEOUT) as client: + # 2. 發送請求到 Ollama + yield f"data: {json.dumps({'type': 'thinking', 'content': f'模型: {model}'}, ensure_ascii=False)}\n\n" - async with client.stream( - "POST", - f"{_get_ollama_base_url()}/api/generate", - json={ - "model": model, - "prompt": prompt, - "stream": True, - }, - ) as response: - if response.status_code != 200: - yield f"data: {json.dumps({'type': 'error', 'content': f'Ollama 錯誤: HTTP {response.status_code}'}, ensure_ascii=False)}\n\n" - yield "data: [DONE]\n\n" - return - - yield f"data: {json.dumps({'type': 'thinking', 'content': '開始接收 AI 回應...'}, ensure_ascii=False)}\n\n" - - # 3. 串流讀取 Ollama 回應 - buffer = "" - async for line in response.aiter_lines(): - if not line: + for endpoint in _get_ollama_endpoints(): + if not endpoint.url: + continue + try: + async with client.stream( + "POST", + f"{endpoint.url}/api/generate", + json={ + "model": model, + "prompt": prompt, + "stream": True, + }, + ) as response: + if response.status_code != 200: + last_error = f"HTTP {response.status_code}" + logger.warning( + "agent_thinking_ollama_http_error", + provider=endpoint.provider_name, + status=response.status_code, + ) continue - try: - chunk = json.loads(line) - token = chunk.get("response", "") - done = chunk.get("done", False) + yield f"data: {json.dumps({'type': 'thinking', 'content': '開始接收 AI 回應...'}, ensure_ascii=False)}\n\n" - if token: - # 累積 token,每 10 字符或遇到標點符號時發送 - buffer += token - if len(buffer) >= 10 or any(p in buffer for p in "。!?,、\n"): - yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n" - buffer = "" + # 3. 串流讀取 Ollama 回應 + buffer = "" + async for line in response.aiter_lines(): + if not line: + continue - if done: - # 發送剩餘 buffer - if buffer: - yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n" - # 發送完成訊息 - yield f"data: {json.dumps({'type': 'result', 'content': '分析完成'}, ensure_ascii=False)}\n\n" - break + try: + chunk = json.loads(line) + token = chunk.get("response", "") + done = chunk.get("done", False) - except json.JSONDecodeError as e: - logger.warning(f"JSON 解析失敗: {line[:100]}... - {e}") - continue + if token: + # 累積 token,每 10 字符或遇到標點符號時發送 + buffer += token + if len(buffer) >= 10 or any(p in buffer for p in "。!?,、\n"): + yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n" + buffer = "" - except httpx.ConnectError as e: - logger.error(f"無法連接 Ollama: {e}") - yield f"data: {json.dumps({'type': 'error', 'content': f'無法連接 Ollama ({_get_ollama_base_url()})'}, ensure_ascii=False)}\n\n" - except httpx.TimeoutException as e: - logger.error(f"Ollama 超時: {e}") - yield f"data: {json.dumps({'type': 'error', 'content': '請求超時'}, ensure_ascii=False)}\n\n" - except Exception as e: - logger.error(f"未知錯誤: {e}") - yield f"data: {json.dumps({'type': 'error', 'content': f'未知錯誤: {str(e)}'}, ensure_ascii=False)}\n\n" + if done: + # 發送剩餘 buffer + if buffer: + yield f"data: {json.dumps({'type': 'thinking', 'content': buffer}, ensure_ascii=False)}\n\n" + # 發送完成訊息 + yield f"data: {json.dumps({'type': 'result', 'content': '分析完成'}, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + return + + except json.JSONDecodeError as e: + logger.warning(f"JSON 解析失敗: {line[:100]}... - {e}") + continue + except (httpx.ConnectError, httpx.TimeoutException) as e: + last_error = type(e).__name__ + logger.error( + "agent_thinking_ollama_endpoint_failed", + provider=endpoint.provider_name, + error=str(e), + ) + except Exception as e: + last_error = str(e) + logger.error( + "agent_thinking_unknown_error", + provider=endpoint.provider_name, + error=str(e), + ) + + error_content = f"Ollama 全端點不可用: {last_error or 'unknown'}" + yield f"data: {json.dumps({'type': 'error', 'content': error_content}, ensure_ascii=False)}\n\n" # 4. 結束標記 yield "data: [DONE]\n\n" diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py index 7b96173a..eef892ee 100644 --- a/apps/api/src/services/alert_rule_engine.py +++ b/apps/api/src/services/alert_rule_engine.py @@ -33,6 +33,7 @@ import yaml from src.constants.alert_types import ALERTNAME_TO_TYPE from src.services.action_parser import parse_kubectl_action +from src.services.ollama_endpoint_resolver import resolve_ollama_order logger = structlog.get_logger(__name__) @@ -634,10 +635,12 @@ async def _insert_catalog_ai_generated( ) return - from sqlalchemy import text as _sql - import src.db.base as _db_base import json as _json + from sqlalchemy import text as _sql + + import src.db.base as _db_base + # 從 rule_dict 提取欄位 # 'expr' 在 OpenClaw YAML 規則中不存在(非 PromQL), # 使用 alertname 作為語意佔位(與 yaml_hardcoded 同等策略) @@ -704,17 +707,30 @@ async def _insert_catalog_ai_generated( async def _call_ollama(prompt: str, ollama_url: str, model: str) -> str | None: """呼叫 Ollama 生成規則 YAML""" - try: - async with httpx.AsyncClient(timeout=60) as client: - resp = await client.post( - f"{ollama_url}/api/generate", - json={"model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.1}}, - ) - resp.raise_for_status() - return resp.json().get("response", "") - except Exception as e: - logger.warning("auto_rule_ollama_failed", error=str(e)) - return None + endpoints = list(resolve_ollama_order("deep_rca")) + if ollama_url and all(endpoint.url != ollama_url for endpoint in endpoints): + from types import SimpleNamespace + + endpoints.insert(0, SimpleNamespace(url=ollama_url, provider_name="custom")) + + async with httpx.AsyncClient(timeout=60) as client: + for endpoint in endpoints: + if not endpoint.url: + continue + try: + resp = await client.post( + f"{endpoint.url}/api/generate", + json={"model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.1}}, + ) + resp.raise_for_status() + return resp.json().get("response", "") + except Exception as e: + logger.warning( + "auto_rule_ollama_failed", + provider=endpoint.provider_name, + error=str(e), + ) + return None async def _call_gemini(prompt: str, api_key: str) -> str | None: diff --git a/apps/api/src/services/chat_manager.py b/apps/api/src/services/chat_manager.py index e9bdc0f9..855bb0f8 100644 --- a/apps/api/src/services/chat_manager.py +++ b/apps/api/src/services/chat_manager.py @@ -25,7 +25,7 @@ import structlog from src.core.config import get_settings from src.repositories.incident_repository import get_incident_repository from src.repositories.k8s_repository import get_k8s_repository -from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint +from src.services.ollama_endpoint_resolver import resolve_ollama_order from src.utils.timezone import now_taipei logger = structlog.get_logger(__name__) @@ -96,42 +96,51 @@ class ChatManager: settings = get_settings() model = settings.OPENCLAW_DEFAULT_MODEL - ollama_url = resolve_ollama_endpoint("interactive") - try: - async with httpx.AsyncClient(timeout=40.0) as client: - resp = await client.post( - f"{ollama_url}/api/chat", - json={ - "model": model, - "stream": False, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_message}, - ], - "options": {"num_predict": 900, "temperature": 0.2}, - }, - ) - resp.raise_for_status() - data = resp.json() - raw = data.get("message", {}).get("content", "").strip() - text = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() or raw + async with httpx.AsyncClient(timeout=40.0) as client: + for endpoint in resolve_ollama_order("interactive"): + if not endpoint.url: + continue + try: + resp = await client.post( + f"{endpoint.url}/api/chat", + json={ + "model": model, + "stream": False, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_message}, + ], + "options": {"num_predict": 900, "temperature": 0.2}, + }, + ) + resp.raise_for_status() + data = resp.json() + raw = data.get("message", {}).get("content", "").strip() + text = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() or raw - eval_count = data.get("eval_count", 0) - prompt_eval_count = data.get("prompt_eval_count", 0) - total_tokens = eval_count + prompt_eval_count + eval_count = data.get("eval_count", 0) + prompt_eval_count = data.get("prompt_eval_count", 0) + total_tokens = eval_count + prompt_eval_count - logger.info( - "openclaw_ollama_chat_usage", - model=model, - endpoint=ollama_url, - prompt_tokens=prompt_eval_count, - output_tokens=eval_count, - ) + logger.info( + "openclaw_ollama_chat_usage", + model=model, + endpoint=endpoint.url, + provider=endpoint.provider_name, + prompt_tokens=prompt_eval_count, + output_tokens=eval_count, + ) - return f"{text}\n\n🦙 {model} | {total_tokens} tokens | 免費" - except Exception as e: - logger.warning("openclaw_chat_failed", error=str(e)) - return None + return f"{text}\n\n🦙 {model} | {total_tokens} tokens | 免費" + except Exception as e: + logger.warning( + "openclaw_chat_endpoint_failed", + provider=endpoint.provider_name, + endpoint=endpoint.url, + error=str(e), + ) + logger.warning("openclaw_chat_failed_all_endpoints", model=model) + return None async def _call_nemotron(self, system_prompt: str, user_message: str) -> str | None: """ @@ -146,43 +155,57 @@ class ChatManager: system_prompt = f"{NEMOCLAW_PERSONA}\n{system_prompt}" # 2026-05-05 Codex: ADR-110 interactive lane,由 resolver 管理 GCP-A/GCP-B/111 拓撲 - OLLAMA_URL = resolve_ollama_endpoint("interactive") MODEL = "deepseek-r1:14b" - try: - async with httpx.AsyncClient(timeout=120.0) as client: - resp = await client.post( - f"{OLLAMA_URL}/api/chat", - json={ - "model": MODEL, - "stream": False, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_message}, - ], - "options": {"num_predict": 1200}, - }, - ) - resp.raise_for_status() - data = resp.json() - raw = data.get("message", {}).get("content", "").strip() + async with httpx.AsyncClient(timeout=120.0) as client: + for endpoint in resolve_ollama_order("interactive"): + if not endpoint.url: + continue + try: + resp = await client.post( + f"{endpoint.url}/api/chat", + json={ + "model": MODEL, + "stream": False, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_message}, + ], + "options": {"num_predict": 1200}, + }, + ) + resp.raise_for_status() + data = resp.json() + raw = data.get("message", {}).get("content", "").strip() - # 過濾 deepseek-r1 的 ... 推理區塊 - text = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() - if not text: - text = raw # 萬一全是 think block,直接回傳原文 + # 過濾 deepseek-r1 的 ... 推理區塊 + text = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() + if not text: + text = raw # 萬一全是 think block,直接回傳原文 - eval_count = data.get("eval_count", 0) - prompt_eval_count = data.get("prompt_eval_count", 0) - total_tokens = eval_count + prompt_eval_count + eval_count = data.get("eval_count", 0) + prompt_eval_count = data.get("prompt_eval_count", 0) + total_tokens = eval_count + prompt_eval_count - logger.info("nemotron_ollama_usage", model=MODEL, - prompt_tokens=prompt_eval_count, output_tokens=eval_count) + logger.info( + "nemotron_ollama_usage", + model=MODEL, + provider=endpoint.provider_name, + prompt_tokens=prompt_eval_count, + output_tokens=eval_count, + ) - return f"{text}\n\n🦙 {MODEL} | {total_tokens} tokens | 免費" - except Exception as e: - logger.warning("nemotron_chat_failed", model=MODEL, error=str(e)) - return None + return f"{text}\n\n🦙 {MODEL} | {total_tokens} tokens | 免費" + except Exception as e: + logger.warning( + "nemotron_chat_endpoint_failed", + model=MODEL, + provider=endpoint.provider_name, + endpoint=endpoint.url, + error=str(e), + ) + logger.warning("nemotron_chat_failed_all_endpoints", model=MODEL) + return None async def generate_response( self, diff --git a/apps/api/src/services/decision_fusion.py b/apps/api/src/services/decision_fusion.py index 392a108b..6adb6cc8 100644 --- a/apps/api/src/services/decision_fusion.py +++ b/apps/api/src/services/decision_fusion.py @@ -29,6 +29,7 @@ import httpx import structlog from src.core.config import get_settings +from src.services.ollama_endpoint_resolver import resolve_ollama_order if TYPE_CHECKING: from src.models.incident import Incident @@ -186,9 +187,41 @@ class DecisionFusionEngine: # settings 延遲讀取(避免測試環境初始化問題) self._settings = get_settings() - @property - def _ollama_url(self) -> str: - return getattr(self._settings, "OLLAMA_URL", "http://34.143.170.20:11434") # 2026-05-03 ogt: ADR-110 GCP-A Primary + async def _call_ollama_generate( + self, + *, + prompt: str, + timeout_sec: float, + num_predict: int, + ) -> str: + """Call Ollama in the global order: GCP-A -> GCP-B -> 111.""" + last_error: Exception | None = None + async with httpx.AsyncClient( + timeout=httpx.Timeout(timeout_sec, connect=5.0) + ) as client: + for endpoint in resolve_ollama_order("deep_rca"): + if not endpoint.url: + continue + try: + resp = await client.post( + f"{endpoint.url}{_OLLAMA_GENERATE_PATH}", + json={ + "model": "qwen3:8b", + "prompt": prompt, + "stream": False, + "options": {"num_predict": num_predict, "temperature": 0.1}, + }, + ) + resp.raise_for_status() + return resp.json().get("response", "").strip() + except Exception as exc: + last_error = exc + logger.debug( + "decision_fusion_ollama_endpoint_failed", + provider=endpoint.provider_name, + error=str(exc), + ) + raise RuntimeError(str(last_error) if last_error else "no_ollama_endpoint") # ========================================================================= # Public API @@ -322,21 +355,12 @@ class DecisionFusionEngine: ) try: - async with httpx.AsyncClient( - timeout=httpx.Timeout(_HERMES_TIMEOUT_SEC, connect=5.0) - ) as client: - resp = await client.post( - f"{self._ollama_url}{_OLLAMA_GENERATE_PATH}", - json={ - "model": "qwen3:8b", - "prompt": prompt, - "stream": False, - "options": {"num_predict": 16, "temperature": 0.1}, - }, - ) - if resp.status_code == 200: - text = resp.json().get("response", "").strip() - return self._extract_float(text, default=0.5) + text = await self._call_ollama_generate( + prompt=prompt, + timeout_sec=_HERMES_TIMEOUT_SEC, + num_predict=16, + ) + return self._extract_float(text, default=0.5) except Exception as exc: logger.debug("hermes_score_failed", error=str(exc)) @@ -451,20 +475,11 @@ class DecisionFusionEngine: "只回覆一個 0-1 的小數,不要解釋。" ) - async with httpx.AsyncClient( - timeout=httpx.Timeout(_ELEPHANT_TIMEOUT_SEC, connect=5.0) - ) as client: - resp = await client.post( - f"{self._ollama_url}{_OLLAMA_GENERATE_PATH}", - json={ - "model": "qwen3:8b", - "prompt": prompt, - "stream": False, - "options": {"num_predict": 32, "temperature": 0.1}, - }, - ) - resp.raise_for_status() - raw_text = resp.json().get("response", "").strip() + raw_text = await self._call_ollama_generate( + prompt=prompt, + timeout_sec=_ELEPHANT_TIMEOUT_SEC, + num_predict=32, + ) # 移除 deepseek/qwen3 標籤 clean = re.sub(r".*?", "", raw_text, flags=re.DOTALL).strip() diff --git a/apps/api/src/services/decision_fusion_adapter.py b/apps/api/src/services/decision_fusion_adapter.py index 529f2e90..de3806e7 100644 --- a/apps/api/src/services/decision_fusion_adapter.py +++ b/apps/api/src/services/decision_fusion_adapter.py @@ -33,6 +33,7 @@ import httpx import structlog from src.core.config import get_settings +from src.services.ollama_endpoint_resolver import resolve_ollama_order if TYPE_CHECKING: from src.db.models import AiGovernanceEvent @@ -254,35 +255,47 @@ class DecisionFusionAdapter: "只輸出 CONFIDENCE 和 ACTION 兩行,不要其他解釋。" ) - from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint - - ollama_url = resolve_ollama_endpoint("deep_rca") - - try: - async with httpx.AsyncClient( - timeout=httpx.Timeout(_LLM_TIMEOUT_SEC, connect=5.0) - ) as client: - resp = await client.post( - f"{ollama_url}/api/generate", - json={ - "model": "qwen3:8b", - "prompt": prompt, - "stream": False, - "options": {"num_predict": 128, "temperature": 0.1}, - }, - ) - if resp.status_code != 200: - logger.warning( - "fusion_llm_http_error", - status=resp.status_code, - event_id=event.id, + raw_text = "" + last_error = "" + async with httpx.AsyncClient( + timeout=httpx.Timeout(_LLM_TIMEOUT_SEC, connect=5.0) + ) as client: + for endpoint in resolve_ollama_order("deep_rca"): + if not endpoint.url: + continue + try: + resp = await client.post( + f"{endpoint.url}/api/generate", + json={ + "model": "qwen3:8b", + "prompt": prompt, + "stream": False, + "options": {"num_predict": 128, "temperature": 0.1}, + }, ) - return 0.5, "(LLM 不可用,使用中立值)", {"error": f"http_{resp.status_code}"} + if resp.status_code != 200: + last_error = f"http_{resp.status_code}" + logger.warning( + "fusion_llm_http_error", + provider=endpoint.provider_name, + status=resp.status_code, + event_id=event.id, + ) + continue - raw_text = resp.json().get("response", "").strip() - except Exception as exc: - logger.warning("fusion_llm_request_failed", event_id=event.id, error=str(exc)) - return 0.5, "(LLM 連線失敗,使用中立值)", {"error": str(exc)} + raw_text = resp.json().get("response", "").strip() + break + except Exception as exc: + last_error = str(exc) + logger.warning( + "fusion_llm_request_failed", + provider=endpoint.provider_name, + event_id=event.id, + error=str(exc), + ) + + if not raw_text: + return 0.5, "(LLM 連線失敗,使用中立值)", {"error": last_error or "no_ollama_endpoint"} # 移除 標籤(qwen3 CoT 輸出) clean = re.sub(r".*?", "", raw_text, flags=re.DOTALL).strip() diff --git a/apps/api/src/services/drift_narrator_service.py b/apps/api/src/services/drift_narrator_service.py index 8e29651f..c952e5da 100644 --- a/apps/api/src/services/drift_narrator_service.py +++ b/apps/api/src/services/drift_narrator_service.py @@ -33,11 +33,6 @@ logger = structlog.get_logger(__name__) # ============================================================ # 設定 # ============================================================ -# 2026-05-05 Codex: 重摘要走 111 lane,避免污染 GCP alert-fast lane -def _get_ollama_url() -> str: - from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint - - return resolve_ollama_endpoint("deep_rca") # D1 集中化 2026-04-11: 從 models.json providers.ollama.models.drift_summary 讀取 NARRATOR_MODEL = get_model("ollama", "drift_summary") NARRATOR_TIMEOUT = 90.0 # seconds diff --git a/apps/api/src/services/image_analysis_service.py b/apps/api/src/services/image_analysis_service.py index d40728d1..7e30431f 100644 --- a/apps/api/src/services/image_analysis_service.py +++ b/apps/api/src/services/image_analysis_service.py @@ -29,7 +29,7 @@ import httpx import structlog from src.services.model_registry import get_model -from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint +from src.services.ollama_endpoint_resolver import resolve_ollama_order if TYPE_CHECKING: pass @@ -118,31 +118,51 @@ class ImageAnalysisService: async def _call_llava(self, image_path: Path, question: str) -> str | None: """呼叫 llava:latest via Ollama /api/generate with base64 image""" + timed_out = False try: image_b64 = base64.b64encode(image_path.read_bytes()).decode() http = await self._get_http() - resp = await http.post( - f"{resolve_ollama_endpoint('image_analysis')}/api/generate", - json={ - "model": _MODEL, - "prompt": question, - "images": [image_b64], - "stream": False, - "options": { - "num_predict": 512, - "temperature": 0.3, - }, - }, - ) - if resp.status_code == 200: - text = resp.json().get("response", "").strip() - return text or None - else: - logger.warning("image_analysis_ollama_error", status=resp.status_code) - return None - except httpx.TimeoutException: - logger.warning("image_analysis_timeout", path=str(image_path)) - return "⚠️ 圖片分析超時(llava 處理中),請稍後重試" + for endpoint in resolve_ollama_order("image_analysis"): + if not endpoint.url: + continue + try: + resp = await http.post( + f"{endpoint.url}/api/generate", + json={ + "model": _MODEL, + "prompt": question, + "images": [image_b64], + "stream": False, + "options": { + "num_predict": 512, + "temperature": 0.3, + }, + }, + ) + if resp.status_code == 200: + text = resp.json().get("response", "").strip() + return text or None + logger.warning( + "image_analysis_ollama_error", + provider=endpoint.provider_name, + status=resp.status_code, + ) + except httpx.TimeoutException: + timed_out = True + logger.warning( + "image_analysis_timeout", + provider=endpoint.provider_name, + path=str(image_path), + ) + except Exception as e: + logger.error( + "image_analysis_failed", + provider=endpoint.provider_name, + error=str(e), + ) + if timed_out: + return "⚠️ 圖片分析超時(llava 處理中),請稍後重試" + return None except Exception as e: logger.error("image_analysis_failed", error=str(e)) return None diff --git a/apps/api/src/services/intent_classifier.py b/apps/api/src/services/intent_classifier.py index d5d44938..cda2bfe7 100644 --- a/apps/api/src/services/intent_classifier.py +++ b/apps/api/src/services/intent_classifier.py @@ -32,7 +32,7 @@ import httpx import structlog from src.services.model_registry import get_model_registry -from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint +from src.services.ollama_endpoint_resolver import resolve_ollama_order logger = structlog.get_logger(__name__) @@ -546,27 +546,56 @@ class IntentClassifier: # 取得模型配置 model_name = self.llm_model # qwen2.5:1b 或配置值 - # 呼叫 Ollama + # 呼叫 Ollama(GCP-A → GCP-B → 111) + result_text = "" + last_error: Exception | None = None async with httpx.AsyncClient() as client: - response = await client.post( - f"{resolve_ollama_endpoint('hermes')}/api/generate", - json={ - "model": model_name, - "prompt": prompt, - "stream": False, - "format": "json", - "options": { - "num_predict": 128, # 意圖分類只需短回應 - "temperature": 0.0, # 確定性輸出 - "top_p": 0.9, - }, - }, - timeout=httpx.Timeout(5.0, connect=2.0), # 嚴格超時 - ) + for endpoint in resolve_ollama_order("hermes"): + if not endpoint.url: + continue + try: + response = await client.post( + f"{endpoint.url}/api/generate", + json={ + "model": model_name, + "prompt": prompt, + "stream": False, + "format": "json", + "options": { + "num_predict": 128, # 意圖分類只需短回應 + "temperature": 0.0, # 確定性輸出 + "top_p": 0.9, + }, + }, + timeout=httpx.Timeout(5.0, connect=2.0), # 嚴格超時 + ) - response.raise_for_status() - data = response.json() - result_text = data.get("response", "") + response.raise_for_status() + data = response.json() + result_text = data.get("response", "") + break + except Exception as e: + last_error = e + logger.warning( + "intent_llm_endpoint_failed", + provider=endpoint.provider_name, + error=str(e), + error_type=type(e).__name__, + ) + + if not result_text: + _kw_result = self._keyword_classify(text) + return IntentResult( + intent=_kw_result.intent, + confidence=_kw_result.confidence, + method="llm_unavailable_keyword", + matched_keywords=_kw_result.matched_keywords, + detected_resources=_kw_result.detected_resources, + reasoning=( + f"LLM 全端點不可用({type(last_error).__name__ if last_error else 'no_endpoint'})" + f" → {_kw_result.reasoning}" + ), + ) # 解析 JSON 回應 elapsed_ms = (time.time() - start_time) * 1000 diff --git a/apps/api/src/services/log_summary_service.py b/apps/api/src/services/log_summary_service.py index 5574bfaf..b2dc62ba 100644 --- a/apps/api/src/services/log_summary_service.py +++ b/apps/api/src/services/log_summary_service.py @@ -33,11 +33,11 @@ logger = structlog.get_logger(__name__) # ============================================================ # 設定 # ============================================================ -# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111 -def _get_ollama_url() -> str: - from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint +# 2026-05-19 Codex: 所有 Ollama workload 固定 GCP-A → GCP-B → 111。 +def _get_ollama_endpoints(): + from src.services.ollama_endpoint_resolver import resolve_ollama_order - return resolve_ollama_endpoint("deep_rca") + return resolve_ollama_order("deep_rca") # D1 集中化 2026-04-11: 從 models.json providers.ollama.models.log_anomaly 讀取 SUMMARY_MODEL = get_model("ollama", "log_anomaly") LLM_TIMEOUT = 180.0 # deepseek-r1 硬超時 @@ -209,31 +209,40 @@ class LogSummaryService: log_snippet=log_snippet[:3000], # 避免超出 context ) - try: - async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client: - resp = await client.post( - f"{_get_ollama_url()}/api/generate", - json={ - "model": SUMMARY_MODEL, - "prompt": prompt, - "stream": False, - "options": {"temperature": 0.1, "num_predict": 200}, - }, - ) - resp.raise_for_status() - data = resp.json() - raw = data.get("response", "").strip() + async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client: + for endpoint in _get_ollama_endpoints(): + if not endpoint.url: + continue + try: + resp = await client.post( + f"{endpoint.url}/api/generate", + json={ + "model": SUMMARY_MODEL, + "prompt": prompt, + "stream": False, + "options": {"temperature": 0.1, "num_predict": 200}, + }, + ) + resp.raise_for_status() + data = resp.json() + raw = data.get("response", "").strip() - # 過濾 deepseek-r1 的 ... 推理區塊 - text = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() - return text or raw or None - - except httpx.TimeoutException: - logger.warning("log_summary_llm_timeout", model=SUMMARY_MODEL) - return None - except Exception as e: - logger.warning("log_summary_llm_error", error=str(e)) - return None + # 過濾 deepseek-r1 的 ... 推理區塊 + text = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() + return text or raw or None + except httpx.TimeoutException: + logger.warning( + "log_summary_llm_timeout", + model=SUMMARY_MODEL, + provider=endpoint.provider_name, + ) + except Exception as e: + logger.warning( + "log_summary_llm_error", + provider=endpoint.provider_name, + error=str(e), + ) + return None # ============================================================ diff --git a/apps/api/src/services/nvidia_provider.py b/apps/api/src/services/nvidia_provider.py index e9419ab6..28205f04 100644 --- a/apps/api/src/services/nvidia_provider.py +++ b/apps/api/src/services/nvidia_provider.py @@ -40,7 +40,10 @@ from src.models.nvidia import ( from src.services.langfuse_client import ( # 2026-03-29 ogt: P1-1 Langfuse 整合 LangfuseTraceContext, ) -from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint +from src.services.ollama_endpoint_resolver import ( + resolve_ollama_endpoint, + resolve_ollama_order, +) logger = structlog.get_logger(__name__) settings = get_settings() @@ -874,15 +877,30 @@ class OllamaToolProvider: return [tc for tc in tool_calls if self.is_high_risk_tool(tc.tool_name)] def _base_url(self) -> str: - """Tool-calling/Hermes models stay off the GCP alert lane.""" + """Return the first Hermes/tool endpoint for backward compatibility.""" return resolve_ollama_endpoint("hermes").rstrip("/") + def _base_urls(self) -> list[str]: + """Tool-calling/Hermes follows GCP-A -> GCP-B -> 111.""" + urls = [self._base_url()] + urls.extend(endpoint.url.rstrip("/") for endpoint in resolve_ollama_order("hermes") if endpoint.url) + deduped: list[str] = [] + for url in urls: + if url and url not in deduped: + deduped.append(url) + return deduped + async def health_check(self) -> bool: try: client = await self._get_client() - base_url = self._base_url() - resp = await client.get(f"{base_url}/api/tags", timeout=5.0) - return resp.status_code == 200 + for base_url in self._base_urls(): + try: + resp = await client.get(f"{base_url}/api/tags", timeout=5.0) + if resp.status_code == 200: + return True + except Exception: + continue + return False except Exception: return False @@ -897,8 +915,6 @@ class OllamaToolProvider: """Ollama /v1/chat/completions tool calling""" start_time = time.perf_counter() model = model or settings.OLLAMA_TOOL_MODEL - base_url = self._base_url() - url = f"{base_url}/v1/chat/completions" # 轉換 tools 為 dict 格式(同 NvidiaProvider) tools_data = [] @@ -919,68 +935,78 @@ class OllamaToolProvider: try: client = await self._get_client() - response = await client.post(url, json=request_body) - latency_ms = (time.perf_counter() - start_time) * 1000 - - if response.status_code != 200: - logger.warning( - "ollama_tool_call_http_error", - status=response.status_code, - body=response.text[:200], + last_error = "" + for base_url in self._base_urls(): + response = await client.post( + f"{base_url}/v1/chat/completions", + json=request_body, ) + latency_ms = (time.perf_counter() - start_time) * 1000 + + if response.status_code != 200: + last_error = f"Ollama HTTP {response.status_code}" + logger.warning( + "ollama_tool_call_http_error", + status=response.status_code, + body=response.text[:200], + endpoint=base_url, + ) + continue + + data = response.json() + # 解析 tool_calls(OpenAI 格式) + choices = data.get("choices", []) + if not choices: + last_error = "Ollama 無回應" + continue + + message = choices[0].get("message", {}) + raw_tool_calls = message.get("tool_calls", []) + + tool_call_results: list[ToolCallValidationResult] = [] + for tc in raw_tool_calls: + fn = tc.get("function", {}) + name = fn.get("name", "") + args_raw = fn.get("arguments", "{}") + try: + args = json.loads(args_raw) if isinstance(args_raw, str) else args_raw + except json.JSONDecodeError: + args = {} + tool_call_results.append(ToolCallValidationResult( + tool_name=name, + arguments=args, + valid=bool(name), + )) + + usage_data = data.get("usage", {}) + from src.models.nvidia import NvidiaUsage + usage = NvidiaUsage( + prompt_tokens=usage_data.get("prompt_tokens", 0), + completion_tokens=usage_data.get("completion_tokens", 0), + total_tokens=usage_data.get("total_tokens", 0), + ) + logger.info( + "ollama_tool_call_success", + model=model, + tool_count=len(tool_call_results), + latency_ms=round(latency_ms, 1), + tokens=usage.total_tokens, + endpoint=base_url, + ) + return NvidiaProviderResult( - success=False, - error=f"Ollama HTTP {response.status_code}", + success=True, + tool_calls=tool_call_results, latency_ms=latency_ms, - fallback_triggered=True, + usage=usage, ) - data = response.json() - # 解析 tool_calls(OpenAI 格式) - choices = data.get("choices", []) - if not choices: - return NvidiaProviderResult( - success=False, error="Ollama 無回應", latency_ms=latency_ms, fallback_triggered=True - ) - - message = choices[0].get("message", {}) - raw_tool_calls = message.get("tool_calls", []) - - tool_call_results: list[ToolCallValidationResult] = [] - for tc in raw_tool_calls: - fn = tc.get("function", {}) - name = fn.get("name", "") - args_raw = fn.get("arguments", "{}") - try: - args = json.loads(args_raw) if isinstance(args_raw, str) else args_raw - except json.JSONDecodeError: - args = {} - tool_call_results.append(ToolCallValidationResult( - tool_name=name, - arguments=args, - valid=bool(name), - )) - - usage_data = data.get("usage", {}) - from src.models.nvidia import NvidiaUsage - usage = NvidiaUsage( - prompt_tokens=usage_data.get("prompt_tokens", 0), - completion_tokens=usage_data.get("completion_tokens", 0), - total_tokens=usage_data.get("total_tokens", 0), - ) - logger.info( - "ollama_tool_call_success", - model=model, - tool_count=len(tool_call_results), - latency_ms=round(latency_ms, 1), - tokens=usage.total_tokens, - ) - + latency_ms = (time.perf_counter() - start_time) * 1000 return NvidiaProviderResult( - success=True, - tool_calls=tool_call_results, + success=False, + error=last_error or "Ollama 無回應", latency_ms=latency_ms, - usage=usage, + fallback_triggered=True, ) except Exception as e: @@ -993,16 +1019,25 @@ class OllamaToolProvider: async def chat(self, prompt: str, model: str = "", temperature: float = 0.7, max_tokens: int = 512) -> str: """簡單 chat(非 tool calling 路徑,保持 INvidiaProvider 相容)""" model = model or settings.OLLAMA_TOOL_MODEL - base_url = self._base_url() try: client = await self._get_client() - resp = await client.post( - f"{base_url}/v1/chat/completions", - json={"model": model, "messages": [{"role": "user", "content": prompt}], - "temperature": temperature, "max_tokens": max_tokens}, - ) - data = resp.json() - return data.get("choices", [{}])[0].get("message", {}).get("content", "") + last_error = "" + for base_url in self._base_urls(): + try: + resp = await client.post( + f"{base_url}/v1/chat/completions", + json={"model": model, "messages": [{"role": "user", "content": prompt}], + "temperature": temperature, "max_tokens": max_tokens}, + ) + if resp.status_code != 200: + last_error = f"http_{resp.status_code}" + continue + data = resp.json() + return data.get("choices", [{}])[0].get("message", {}).get("content", "") + except Exception as e: + last_error = str(e) + continue + return f"Ollama chat error: {last_error or 'no endpoint'}" except Exception as e: return f"Ollama chat error: {e}" diff --git a/apps/api/tests/test_chat_manager_ollama_routing.py b/apps/api/tests/test_chat_manager_ollama_routing.py index d5dc40f6..a40648d7 100644 --- a/apps/api/tests/test_chat_manager_ollama_routing.py +++ b/apps/api/tests/test_chat_manager_ollama_routing.py @@ -47,6 +47,14 @@ def _settings() -> SimpleNamespace: return SimpleNamespace(OPENCLAW_DEFAULT_MODEL="qwen3:14b") +def _fake_ollama_order(_workload_type: str) -> tuple[SimpleNamespace, ...]: + return ( + SimpleNamespace(url="http://gcp-a:11435", provider_name="ollama_gcp_a"), + SimpleNamespace(url="http://gcp-b:11436", provider_name="ollama_gcp_b"), + SimpleNamespace(url="http://local-111:11434", provider_name="ollama_local"), + ) + + @pytest.fixture(autouse=True) def _reset_fake_client() -> None: _FakeAsyncClient.posted = [] @@ -60,8 +68,8 @@ async def test_openclaw_chat_uses_ollama_interactive_lane( monkeypatch.setattr(chat_module, "get_settings", _settings) monkeypatch.setattr( chat_module, - "resolve_ollama_endpoint", - lambda workload_type: "http://gcp-a:11435", + "resolve_ollama_order", + _fake_ollama_order, ) result = await ChatManager()._call_openclaw("system context", "幫我看狀態") @@ -85,8 +93,8 @@ async def test_nemoclaw_chat_uses_resolved_interactive_lane( monkeypatch.setattr(chat_module.httpx, "AsyncClient", _FakeAsyncClient) monkeypatch.setattr( chat_module, - "resolve_ollama_endpoint", - lambda workload_type: "http://gcp-a:11435", + "resolve_ollama_order", + _fake_ollama_order, ) result = await ChatManager()._call_nemotron("system context", "補充觀點")