fix(ai): keep GCP Ollama lane on safe models
This commit is contained in:
@@ -29,6 +29,54 @@ from src.services.model_registry import get_model_registry
|
||||
logger = structlog.get_logger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
_GCP_SAFE_MODELS = {
|
||||
"gemma3:4b",
|
||||
}
|
||||
|
||||
|
||||
def _normalized_url(value: str | None) -> str:
|
||||
return (value or "").rstrip("/")
|
||||
|
||||
|
||||
def _is_gcp_alert_lane(endpoint_url: str) -> bool:
    """Return true for the CPU-only GCP-A/B synchronous alert lane.

    The endpoint is on the lane when, after normalization, it equals the
    configured ``OLLAMA_URL`` or ``OLLAMA_SECONDARY_URL`` setting.

    A blank endpoint never matches: an unset setting also normalizes to
    ``""``, and that must not make an empty endpoint look like a GCP host.
    """
    endpoint = _normalized_url(endpoint_url)
    if not endpoint:
        return False
    gcp_endpoints = {
        _normalized_url(getattr(settings, setting_name, ""))
        for setting_name in ("OLLAMA_URL", "OLLAMA_SECONDARY_URL")
    }
    return endpoint in gcp_endpoints
|
||||
|
||||
|
||||
def _resolve_model_for_endpoint(
    *,
    requested_model: str,
    endpoint_url: str,
    context: dict | None,
) -> str:
    """Choose the model name to send to ``endpoint_url``.

    Keeps GCP-A/B on the fast alert model unless explicitly allowed. The GCP
    hosts currently expose CPU-only Ollama; loading 7B/14B/32B models on that
    lane blocks synchronous alerts long enough to fall through to Gemini.
    Heavy/deep workloads must use 111 or the future AwoooP Inference Gateway.

    Args:
        requested_model: Model name the caller asked for; whitespace-trimmed.
        endpoint_url: Ollama endpoint the request is about to be sent to.
        context: Optional request context. ``allow_gcp_heavy_model`` (truthy)
            disables the coercion; ``task_type`` is only used for logging.

    Returns:
        The trimmed requested model, or the configured ``ALERT_OLLAMA_MODEL``
        (falling back to ``"gemma3:4b"``) when the request was coerced.
    """
    ctx = context or {}
    candidate = requested_model.strip()
    heavy_allowed = bool(ctx.get("allow_gcp_heavy_model"))

    # Anything outside the GCP alert lane is passed through untouched.
    if not _is_gcp_alert_lane(endpoint_url):
        return candidate
    # On the lane: explicit opt-in or an already-safe model needs no change.
    if heavy_allowed or candidate in _GCP_SAFE_MODELS:
        return candidate

    configured = str(getattr(settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")).strip()
    fallback = configured or "gemma3:4b"
    logger.warning(
        "ollama_gcp_heavy_model_coerced",
        endpoint=endpoint_url,
        requested_model=candidate,
        safe_model=fallback,
        task_type=ctx.get("task_type"),
    )
    return fallback
|
||||
|
||||
|
||||
class OllamaProvider:
|
||||
"""
|
||||
@@ -77,7 +125,13 @@ class OllamaProvider:
|
||||
client = await self._get_client()
|
||||
|
||||
registry = get_model_registry()
|
||||
model_name = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
|
||||
endpoint_url = self._endpoint_url()
|
||||
requested_model = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
|
||||
model_name = _resolve_model_for_endpoint(
|
||||
requested_model=requested_model,
|
||||
endpoint_url=endpoint_url,
|
||||
context=context,
|
||||
)
|
||||
options = registry.get_provider_options("ollama")
|
||||
|
||||
# P0 2026-04-04 Claude Code: per-task timeout(Option C 分情境)
|
||||
@@ -89,7 +143,6 @@ class OllamaProvider:
|
||||
else:
|
||||
read_timeout = float(settings.OPENCLAW_TIMEOUT)
|
||||
|
||||
endpoint_url = self._endpoint_url()
|
||||
response = await client.post(
|
||||
f"{endpoint_url}/api/generate",
|
||||
json={
|
||||
|
||||
@@ -40,6 +40,7 @@ from src.models.nvidia import (
|
||||
from src.services.langfuse_client import ( # 2026-03-29 ogt: P1-1 Langfuse 整合
|
||||
LangfuseTraceContext,
|
||||
)
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
settings = get_settings()
|
||||
@@ -822,8 +823,8 @@ class NvidiaProvider:
|
||||
NVIDIA_REQUESTS_TOTAL.labels(status="error", tool_name="chat").inc()
|
||||
import traceback
|
||||
logger.warning(
|
||||
"nvidia_chat_failed",
|
||||
error=str(e),
|
||||
"nvidia_chat_failed",
|
||||
error=str(e),
|
||||
error_type=type(e).__name__,
|
||||
stacktrace=traceback.format_exc()
|
||||
)
|
||||
@@ -845,7 +846,7 @@ class OllamaToolProvider:
|
||||
取代 NVIDIA 雲端 NIM。延遲從 44s 降至 ~5s。
|
||||
|
||||
模型: llama3.1:8b (tool calling 最穩定的 8B 模型)
|
||||
Endpoint: OLLAMA_URL/v1/chat/completions (OpenAI 相容格式)
|
||||
Endpoint: local tool lane /v1/chat/completions (OpenAI 相容格式)
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
@@ -872,10 +873,14 @@ class OllamaToolProvider:
|
||||
) -> list[ToolCallValidationResult]:
|
||||
return [tc for tc in tool_calls if self.is_high_risk_tool(tc.tool_name)]
|
||||
|
||||
def _base_url(self) -> str:
    """Return the resolver's "hermes" endpoint without trailing slashes.

    Tool-calling/Hermes models stay off the GCP alert lane, so the base URL
    is obtained from ``resolve_ollama_endpoint`` for the "hermes" workload.
    """
    hermes_endpoint = resolve_ollama_endpoint("hermes")
    return hermes_endpoint.rstrip("/")
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
try:
|
||||
client = await self._get_client()
|
||||
base_url = settings.OLLAMA_URL.rstrip("/")
|
||||
base_url = self._base_url()
|
||||
resp = await client.get(f"{base_url}/api/tags", timeout=5.0)
|
||||
return resp.status_code == 200
|
||||
except Exception:
|
||||
@@ -892,7 +897,7 @@ class OllamaToolProvider:
|
||||
"""Ollama /v1/chat/completions tool calling"""
|
||||
start_time = time.perf_counter()
|
||||
model = model or settings.OLLAMA_TOOL_MODEL
|
||||
base_url = settings.OLLAMA_URL.rstrip("/")
|
||||
base_url = self._base_url()
|
||||
url = f"{base_url}/v1/chat/completions"
|
||||
|
||||
# 轉換 tools 為 dict 格式(同 NvidiaProvider)
|
||||
@@ -988,7 +993,7 @@ class OllamaToolProvider:
|
||||
async def chat(self, prompt: str, model: str = "", temperature: float = 0.7, max_tokens: int = 512) -> str:
|
||||
"""簡單 chat(非 tool calling 路徑,保持 INvidiaProvider 相容)"""
|
||||
model = model or settings.OLLAMA_TOOL_MODEL
|
||||
base_url = settings.OLLAMA_URL.rstrip("/")
|
||||
base_url = self._base_url()
|
||||
try:
|
||||
client = await self._get_client()
|
||||
resp = await client.post(
|
||||
@@ -1010,7 +1015,7 @@ _provider: NvidiaProvider | None = None
|
||||
_ollama_tool_provider: OllamaToolProvider | None = None
|
||||
|
||||
|
||||
def get_nvidia_provider() -> "NvidiaProvider | OllamaToolProvider":
|
||||
def get_nvidia_provider() -> NvidiaProvider | OllamaToolProvider:
|
||||
"""
|
||||
取得 Tool Calling Provider 單例。
|
||||
USE_OLLAMA_TOOL_CALLING=True (預設) → OllamaToolProvider (本機,~5s)
|
||||
|
||||
@@ -534,6 +534,8 @@ class OpenClawService:
|
||||
# 從 ModelRegistry 取得模型配置
|
||||
registry = get_model_registry()
|
||||
model_name = registry.get_model("ollama", "rca")
|
||||
if ollama_only:
|
||||
model_name = getattr(settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")
|
||||
options = registry.get_provider_options("ollama")
|
||||
timeout_seconds = max(
|
||||
float(settings.OPENCLAW_TIMEOUT),
|
||||
|
||||
@@ -23,6 +23,7 @@ from src.models.nvidia import (
|
||||
from src.services.nvidia_provider import (
|
||||
HIGH_RISK_TOOLS,
|
||||
NvidiaProvider,
|
||||
OllamaToolProvider,
|
||||
create_tool_definition,
|
||||
get_nvidia_provider,
|
||||
reset_nvidia_provider,
|
||||
@@ -286,6 +287,58 @@ class TestProtocolCompliance:
|
||||
assert hasattr(INvidiaProvider, "close")
|
||||
|
||||
|
||||
class TestOllamaToolProviderRouting:
    """Ollama tool-calling must not pollute the GCP alert lane."""

    def test_base_url_uses_hermes_resolver_lane(self, monkeypatch):
        """``_base_url`` returns the resolver's answer for the "hermes" workload."""
        from src.services import nvidia_provider as nvidia_provider_module

        seen_workloads = []

        def _recording_resolver(workload):
            # Record what the provider asks for so the lane can be asserted.
            seen_workloads.append(workload)
            return "http://local-111:11434"

        monkeypatch.setattr(
            nvidia_provider_module,
            "resolve_ollama_endpoint",
            _recording_resolver,
        )

        tool_provider = OllamaToolProvider()
        resolved = tool_provider._base_url()

        assert resolved == "http://local-111:11434"
        assert seen_workloads == ["hermes"]

    @pytest.mark.asyncio
    async def test_health_check_uses_local_hermes_lane(self, monkeypatch):
        """``health_check`` probes ``/api/tags`` on the resolved hermes endpoint."""
        from src.services import nvidia_provider as nvidia_provider_module

        class _FakeResponse:
            status_code = 200

        class _FakeClient:
            def __init__(self):
                self.checked_urls = []

            async def get(self, url, **kwargs):
                self.checked_urls.append(url)
                return _FakeResponse()

        def _fixed_resolver(workload):
            return "http://local-111:11434"

        monkeypatch.setattr(
            nvidia_provider_module,
            "resolve_ollama_endpoint",
            _fixed_resolver,
        )

        tool_provider = OllamaToolProvider()
        fake_client = _FakeClient()

        async def _fake_get_client():
            return fake_client

        monkeypatch.setattr(tool_provider, "_get_client", _fake_get_client)

        healthy = await tool_provider.health_check()

        assert healthy is True
        assert fake_client.checked_urls == ["http://local-111:11434/api/tags"]
|
||||
|
||||
|
||||
class TestEdgeCases:
|
||||
"""邊界測試案例 (P2-2)"""
|
||||
|
||||
@@ -406,7 +459,11 @@ class TestAIRouterNvidiaIntegration:
|
||||
|
||||
def test_tool_calling_route(self):
|
||||
"""測試 Tool Calling 路由"""
|
||||
from src.services.ai_router import AIProviderEnum, get_ai_router, reset_ai_router
|
||||
from src.services.ai_router import (
|
||||
AIProviderEnum,
|
||||
get_ai_router,
|
||||
reset_ai_router,
|
||||
)
|
||||
|
||||
reset_ai_router()
|
||||
router = get_ai_router()
|
||||
@@ -424,7 +481,11 @@ class TestAIRouterNvidiaIntegration:
|
||||
|
||||
def test_existing_routing_not_affected(self):
|
||||
"""測試現有路由規則不受影響"""
|
||||
from src.services.ai_router import AIProviderEnum, get_ai_router, reset_ai_router
|
||||
from src.services.ai_router import (
|
||||
AIProviderEnum,
|
||||
get_ai_router,
|
||||
reset_ai_router,
|
||||
)
|
||||
|
||||
reset_ai_router()
|
||||
router = get_ai_router()
|
||||
|
||||
@@ -10,7 +10,7 @@ from src.services.ai_providers.ollama import OllamaGcpBProvider, OllamaProvider
|
||||
|
||||
class _FakeRegistry:
|
||||
def get_model(self, provider: str, use_case: str) -> str:
    """Return the fake registry's model name; both arguments are ignored.

    The value is a 14B model so the GCP-lane tests exercise coercion of a
    non-safe model. A stale earlier ``return`` previously made this value
    unreachable.
    """
    return "qwen3:14b"
|
||||
|
||||
def get_provider_options(self, provider: str) -> dict[str, Any]:
|
||||
return {"num_predict": 32, "temperature": 0.1, "top_p": 0.9}
|
||||
@@ -33,10 +33,12 @@ class _FakeResponse:
|
||||
class _FakeClient:
|
||||
def __init__(self) -> None:
|
||||
self.posted_urls: list[str] = []
|
||||
self.posted_payloads: list[dict[str, Any]] = []
|
||||
self.checked_urls: list[str] = []
|
||||
|
||||
async def post(self, url: str, **kwargs: Any) -> _FakeResponse:
|
||||
self.posted_urls.append(url)
|
||||
self.posted_payloads.append(kwargs.get("json", {}))
|
||||
return _FakeResponse()
|
||||
|
||||
async def get(self, url: str, **kwargs: Any) -> _FakeResponse:
|
||||
@@ -53,6 +55,7 @@ async def test_ollama_gcp_b_analyze_uses_secondary_url(monkeypatch: pytest.Monke
|
||||
"OLLAMA_SECONDARY_URL",
|
||||
"http://secondary:11436",
|
||||
)
|
||||
monkeypatch.setattr(ollama_module.settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")
|
||||
|
||||
client = _FakeClient()
|
||||
provider = OllamaGcpBProvider()
|
||||
@@ -67,6 +70,57 @@ async def test_ollama_gcp_b_analyze_uses_secondary_url(monkeypatch: pytest.Monke
|
||||
assert result.success is True
|
||||
assert result.provider == "ollama_gcp_b"
|
||||
assert client.posted_urls == ["http://secondary:11436/api/generate"]
|
||||
assert client.posted_payloads[0]["model"] == "gemma3:4b"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ollama_gcp_a_coerces_heavy_diagnose_model_to_alert_model(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A non-safe registry model posted to the primary URL becomes the alert model."""
    monkeypatch.setattr(ollama_module, "get_model_registry", lambda: _FakeRegistry())
    patched_settings = (
        ("OLLAMA_URL", "http://primary:11435"),
        ("OLLAMA_SECONDARY_URL", "http://secondary:11436"),
        ("ALERT_OLLAMA_MODEL", "gemma3:4b"),
    )
    for setting_name, setting_value in patched_settings:
        monkeypatch.setattr(ollama_module.settings, setting_name, setting_value)

    fake_client = _FakeClient()
    provider = OllamaProvider()

    async def _fake_get_client() -> _FakeClient:
        return fake_client

    monkeypatch.setattr(provider, "_get_client", _fake_get_client)

    outcome = await provider.analyze("diagnose", context={"task_type": "diagnose"})

    assert outcome.success is True
    assert fake_client.posted_urls == ["http://primary:11435/api/generate"]
    # No opt-in flag in the context, so the posted model must be the alert model.
    assert fake_client.posted_payloads[0]["model"] == "gemma3:4b"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ollama_gcp_a_can_explicitly_allow_heavy_model(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``allow_gcp_heavy_model`` set, the registry model is posted unchanged."""
    monkeypatch.setattr(ollama_module, "get_model_registry", lambda: _FakeRegistry())
    patched_settings = (
        ("OLLAMA_URL", "http://primary:11435"),
        ("OLLAMA_SECONDARY_URL", "http://secondary:11436"),
        ("ALERT_OLLAMA_MODEL", "gemma3:4b"),
    )
    for setting_name, setting_value in patched_settings:
        monkeypatch.setattr(ollama_module.settings, setting_name, setting_value)

    fake_client = _FakeClient()
    provider = OllamaProvider()

    async def _fake_get_client() -> _FakeClient:
        return fake_client

    monkeypatch.setattr(provider, "_get_client", _fake_get_client)

    request_context = {"task_type": "diagnose", "allow_gcp_heavy_model": True}
    outcome = await provider.analyze("deep diagnose", context=request_context)

    assert outcome.success is True
    # The explicit opt-in bypasses coercion, so the registry's model is sent.
    assert fake_client.posted_payloads[0]["model"] == "qwen3:14b"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -6,6 +6,20 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05 | GCP Ollama alert lane model isolation fix
|
||||
|
||||
**背景**:Telegram 告警卡片仍看到 Gemini / GCP-A 逾時;live log 顯示 Phase24 AI Router 的 `diagnose` 路徑已選到 `ollama_gcp_a`,但模型仍使用 `qwen3:14b`,導致 CPU-only GCP-A 載入重模型後 `gemma3:4b` 健康檢查也 timeout。
|
||||
|
||||
**本次修補**:
|
||||
- `OllamaProvider` 在 GCP-A/B endpoint 上攔截非 fast-lane 模型,預設強制改用 `ALERT_OLLAMA_MODEL=gemma3:4b`;只有明確帶 `allow_gcp_heavy_model=true` 才允許重模型跑在 GCP。
|
||||
- legacy OpenClaw `_call_ollama(ollama_only=True)` 同步固定使用 `ALERT_OLLAMA_MODEL`,避免 safety-net 再把 `qwen3:14b` 送到 GCP alert lane。
|
||||
- `OllamaToolProvider` 改用 resolver 的 `hermes` lane,不再以 `settings.OLLAMA_URL` 直接把 `hermes3:latest` 載到 GCP-A。
|
||||
- 補 `test_ollama_provider_endpoints.py`,鎖住 GCP-A/GCP-B 重模型 coercion 與顯式放行行為。
|
||||
|
||||
**現場操作**:
|
||||
- 已手動卸載 GCP-A/B 的 `qwen*`、`deepseek*`、`hermes3`、`bge-m3`、`llava`、`minicpm`,並重新 keep-alive `gemma3:4b`。
|
||||
- 下一步需推 Gitea CD,確認 production image 含本修補後,再觀察告警卡片 Router 是否維持 Ollama 且不再載入 GCP 重模型。
|
||||
|
||||
## 2026-05-05 | drift-scanner CronJob 納入 ArgoCD baseline
|
||||
|
||||
**背景**:重開機恢復後,K8s Deployments 與三個新納入的 CronJob 已跟到最新 image,但 `drift-scanner` 仍是手動套用的舊固定 SHA,會造成「服務健康、排程吃舊版」的冷啟動盲區。
|
||||
|
||||
Reference in New Issue
Block a user