diff --git a/apps/api/src/services/ai_providers/ollama.py b/apps/api/src/services/ai_providers/ollama.py index d6caaf78..1bd45fc7 100644 --- a/apps/api/src/services/ai_providers/ollama.py +++ b/apps/api/src/services/ai_providers/ollama.py @@ -29,6 +29,54 @@ from src.services.model_registry import get_model_registry logger = structlog.get_logger(__name__) settings = get_settings() +_GCP_SAFE_MODELS = { + "gemma3:4b", +} + + +def _normalized_url(value: str | None) -> str: + return (value or "").rstrip("/") + + +def _is_gcp_alert_lane(endpoint_url: str) -> bool: + """Return true for the CPU-only GCP-A/B synchronous alert lane.""" + endpoint = _normalized_url(endpoint_url) + return endpoint in { + _normalized_url(getattr(settings, "OLLAMA_URL", "")), + _normalized_url(getattr(settings, "OLLAMA_SECONDARY_URL", "")), + } + + +def _resolve_model_for_endpoint( + *, + requested_model: str, + endpoint_url: str, + context: dict | None, +) -> str: + """ + Keep GCP-A/B on the fast alert model unless explicitly allowed. + + The GCP hosts currently expose CPU-only Ollama. Loading 7B/14B/32B models on + that lane blocks synchronous alerts long enough to fall through to Gemini. + Heavy/deep workloads must use 111 or the future AwoooP Inference Gateway. 
+ """ + model_name = requested_model.strip() + context = context or {} + allow_gcp_heavy = bool(context.get("allow_gcp_heavy_model")) + + if _is_gcp_alert_lane(endpoint_url) and not allow_gcp_heavy and model_name not in _GCP_SAFE_MODELS: + alert_model = str(getattr(settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")).strip() or "gemma3:4b" + logger.warning( + "ollama_gcp_heavy_model_coerced", + endpoint=endpoint_url, + requested_model=model_name, + safe_model=alert_model, + task_type=context.get("task_type"), + ) + return alert_model + + return model_name + class OllamaProvider: """ @@ -77,7 +125,13 @@ class OllamaProvider: client = await self._get_client() registry = get_model_registry() - model_name = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip() + endpoint_url = self._endpoint_url() + requested_model = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip() + model_name = _resolve_model_for_endpoint( + requested_model=requested_model, + endpoint_url=endpoint_url, + context=context, + ) options = registry.get_provider_options("ollama") # P0 2026-04-04 Claude Code: per-task timeout(Option C 分情境) @@ -89,7 +143,6 @@ class OllamaProvider: else: read_timeout = float(settings.OPENCLAW_TIMEOUT) - endpoint_url = self._endpoint_url() response = await client.post( f"{endpoint_url}/api/generate", json={ diff --git a/apps/api/src/services/nvidia_provider.py b/apps/api/src/services/nvidia_provider.py index 6b5be0a6..e9419ab6 100644 --- a/apps/api/src/services/nvidia_provider.py +++ b/apps/api/src/services/nvidia_provider.py @@ -40,6 +40,7 @@ from src.models.nvidia import ( from src.services.langfuse_client import ( # 2026-03-29 ogt: P1-1 Langfuse 整合 LangfuseTraceContext, ) +from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint logger = structlog.get_logger(__name__) settings = get_settings() @@ -822,8 +823,8 @@ class NvidiaProvider: NVIDIA_REQUESTS_TOTAL.labels(status="error", 
def _base_url(self) -> str:
    """Return the tool-calling base URL without a trailing slash.

    The endpoint is resolved for the ``"hermes"`` workload so that
    tool-calling/Hermes models stay off the GCP alert lane.
    """
    hermes_endpoint = resolve_ollama_endpoint("hermes")
    return hermes_endpoint.rstrip("/")
class TestOllamaToolProviderRouting:
    """Ollama tool-calling must not pollute the GCP alert lane."""

    def test_base_url_uses_hermes_resolver_lane(self, monkeypatch):
        """``_base_url`` asks the resolver for the ``hermes`` workload."""
        from src.services import nvidia_provider as nvidia_provider_module

        captured_workloads = []

        def fake_resolve(workload):
            captured_workloads.append(workload)
            return "http://local-111:11434"

        monkeypatch.setattr(nvidia_provider_module, "resolve_ollama_endpoint", fake_resolve)

        provider = OllamaToolProvider()

        assert provider._base_url() == "http://local-111:11434"
        assert captured_workloads == ["hermes"]

    def test_base_url_strips_trailing_slash(self, monkeypatch):
        """A resolver URL ending in ``/`` must not produce ``//`` in request paths."""
        from src.services import nvidia_provider as nvidia_provider_module

        monkeypatch.setattr(
            nvidia_provider_module,
            "resolve_ollama_endpoint",
            lambda workload: "http://local-111:11434/",
        )

        assert OllamaToolProvider()._base_url() == "http://local-111:11434"

    @pytest.mark.asyncio
    async def test_health_check_uses_local_hermes_lane(self, monkeypatch):
        """``health_check`` probes ``/api/tags`` on the resolved hermes endpoint."""
        from src.services import nvidia_provider as nvidia_provider_module

        class _FakeResponse:
            status_code = 200

        class _FakeClient:
            def __init__(self):
                self.checked_urls = []

            async def get(self, url, **kwargs):
                self.checked_urls.append(url)
                return _FakeResponse()

        requested_workloads = []

        def fake_resolve(workload):
            # Record the workload so the test fails if health_check ever
            # resolves a different lane that happens to share the URL.
            requested_workloads.append(workload)
            return "http://local-111:11434"

        monkeypatch.setattr(nvidia_provider_module, "resolve_ollama_endpoint", fake_resolve)

        provider = OllamaToolProvider()
        client = _FakeClient()

        async def _get_client():
            return client

        monkeypatch.setattr(provider, "_get_client", _get_client)

        assert await provider.health_check() is True
        assert client.checked_urls == ["http://local-111:11434/api/tags"]
        assert requested_workloads == ["hermes"]
def _gcp_a_provider_with_fake_client(
    monkeypatch: pytest.MonkeyPatch,
) -> tuple[OllamaProvider, _FakeClient]:
    """Build an ``OllamaProvider`` on the patched GCP-A lane backed by a fake client.

    The registry is replaced with ``_FakeRegistry`` (which returns the heavy
    ``qwen3:14b`` model) and the alert model is pinned to ``gemma3:4b``.
    """
    monkeypatch.setattr(ollama_module, "get_model_registry", lambda: _FakeRegistry())
    monkeypatch.setattr(ollama_module.settings, "OLLAMA_URL", "http://primary:11435")
    monkeypatch.setattr(ollama_module.settings, "OLLAMA_SECONDARY_URL", "http://secondary:11436")
    monkeypatch.setattr(ollama_module.settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")

    client = _FakeClient()
    provider = OllamaProvider()

    async def _get_client() -> _FakeClient:
        return client

    monkeypatch.setattr(provider, "_get_client", _get_client)
    return provider, client


@pytest.mark.asyncio
async def test_ollama_gcp_a_coerces_heavy_diagnose_model_to_alert_model(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A heavy registry model on GCP-A is replaced by the alert model."""
    provider, client = _gcp_a_provider_with_fake_client(monkeypatch)

    result = await provider.analyze("diagnose", context={"task_type": "diagnose"})

    assert result.success is True
    assert client.posted_urls == ["http://primary:11435/api/generate"]
    assert client.posted_payloads[0]["model"] == "gemma3:4b"


@pytest.mark.asyncio
async def test_ollama_gcp_a_can_explicitly_allow_heavy_model(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``allow_gcp_heavy_model`` keeps the heavy model on the same GCP-A endpoint."""
    provider, client = _gcp_a_provider_with_fake_client(monkeypatch)

    result = await provider.analyze(
        "deep diagnose",
        context={"task_type": "diagnose", "allow_gcp_heavy_model": True},
    )

    assert result.success is True
    # Pin the endpoint too: the opt-in must change only the model, not the lane.
    assert client.posted_urls == ["http://primary:11435/api/generate"]
    assert client.posted_payloads[0]["model"] == "qwen3:14b"