fix(ai): keep GCP Ollama lane on safe models
All checks were successful
CD Pipeline / tests (push) Successful in 54s
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / build-and-deploy (push) Successful in 3m25s
CD Pipeline / post-deploy-checks (push) Successful in 1m50s

This commit is contained in:
Your Name
2026-05-05 23:37:08 +08:00
parent 1ba36697ca
commit e208798531
6 changed files with 201 additions and 12 deletions

View File

@@ -29,6 +29,54 @@ from src.services.model_registry import get_model_registry
logger = structlog.get_logger(__name__)
settings = get_settings()
# Model names that may run on the GCP-A/B alert lane without an explicit
# opt-in; any other model is subject to coercion in
# _resolve_model_for_endpoint.
_GCP_SAFE_MODELS = {
    "gemma3:4b",
}
def _normalized_url(value: str | None) -> str:
return (value or "").rstrip("/")
def _is_gcp_alert_lane(endpoint_url: str) -> bool:
    """Return true for the CPU-only GCP-A/B synchronous alert lane.

    The endpoint is normalized and compared against the configured
    ``OLLAMA_URL`` and ``OLLAMA_SECONDARY_URL`` settings.

    An empty endpoint never matches: without this guard, an unset lane setting
    (which normalizes to ``""``) would make a blank endpoint look like the GCP
    lane.
    """
    endpoint = _normalized_url(endpoint_url)
    if not endpoint:
        return False

    lane_urls = {
        _normalized_url(getattr(settings, setting_name, ""))
        for setting_name in ("OLLAMA_URL", "OLLAMA_SECONDARY_URL")
    }
    return endpoint in lane_urls
def _resolve_model_for_endpoint(
    *,
    requested_model: str,
    endpoint_url: str,
    context: dict | None,
) -> str:
    """
    Keep GCP-A/B on the fast alert model unless explicitly allowed.

    The GCP hosts currently expose CPU-only Ollama. Loading 7B/14B/32B models on
    that lane blocks synchronous alerts long enough to fall through to Gemini.
    Heavy/deep workloads must use 111 or the future AwoooP Inference Gateway.

    Args:
        requested_model: Model name chosen by the caller or the registry.
        endpoint_url: Base URL of the Ollama endpoint the request will hit.
        context: Optional request context. A truthy ``allow_gcp_heavy_model``
            opts out of coercion; ``task_type`` is only included in the log.

    Returns:
        The stripped ``requested_model`` when it may run on the endpoint,
        otherwise the configured ``ALERT_OLLAMA_MODEL`` (``gemma3:4b`` when the
        setting is missing or blank).
    """
    model_name = requested_model.strip()
    context = context or {}

    # Cheap exits first: known-safe model or explicit opt-in.
    if model_name in _GCP_SAFE_MODELS or context.get("allow_gcp_heavy_model"):
        return model_name
    if not _is_gcp_alert_lane(endpoint_url):
        return model_name

    alert_model = str(getattr(settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")).strip() or "gemma3:4b"
    if model_name == alert_model:
        # The caller already asked for the configured alert model; nothing is
        # being coerced, so do not emit a misleading warning.
        return model_name

    logger.warning(
        "ollama_gcp_heavy_model_coerced",
        endpoint=endpoint_url,
        requested_model=model_name,
        safe_model=alert_model,
        task_type=context.get("task_type"),
    )
    return alert_model
class OllamaProvider:
"""
@@ -77,7 +125,13 @@ class OllamaProvider:
client = await self._get_client()
registry = get_model_registry()
model_name = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
endpoint_url = self._endpoint_url()
requested_model = str((context or {}).get("ollama_model") or registry.get_model("ollama", "rca")).strip()
model_name = _resolve_model_for_endpoint(
requested_model=requested_model,
endpoint_url=endpoint_url,
context=context,
)
options = registry.get_provider_options("ollama")
# P0 2026-04-04 Claude Code: per-task timeoutOption C 分情境)
@@ -89,7 +143,6 @@ class OllamaProvider:
else:
read_timeout = float(settings.OPENCLAW_TIMEOUT)
endpoint_url = self._endpoint_url()
response = await client.post(
f"{endpoint_url}/api/generate",
json={

View File

@@ -40,6 +40,7 @@ from src.models.nvidia import (
from src.services.langfuse_client import ( # 2026-03-29 ogt: P1-1 Langfuse 整合
LangfuseTraceContext,
)
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
logger = structlog.get_logger(__name__)
settings = get_settings()
@@ -822,8 +823,8 @@ class NvidiaProvider:
NVIDIA_REQUESTS_TOTAL.labels(status="error", tool_name="chat").inc()
import traceback
logger.warning(
"nvidia_chat_failed",
error=str(e),
"nvidia_chat_failed",
error=str(e),
error_type=type(e).__name__,
stacktrace=traceback.format_exc()
)
@@ -845,7 +846,7 @@ class OllamaToolProvider:
取代 NVIDIA 雲端 NIM。延遲從 44s 降至 ~5s。
模型: llama3.1:8b (tool calling 最穩定的 8B 模型)
Endpoint: OLLAMA_URL/v1/chat/completions (OpenAI 相容格式)
Endpoint: local tool lane /v1/chat/completions (OpenAI 相容格式)
"""
def __init__(self) -> None:
@@ -872,10 +873,14 @@ class OllamaToolProvider:
) -> list[ToolCallValidationResult]:
return [tc for tc in tool_calls if self.is_high_risk_tool(tc.tool_name)]
def _base_url(self) -> str:
    """Return the resolver's ``hermes`` endpoint with trailing slashes removed.

    Tool-calling/Hermes models stay off the GCP alert lane.
    """
    hermes_endpoint = resolve_ollama_endpoint("hermes")
    return hermes_endpoint.rstrip("/")
async def health_check(self) -> bool:
try:
client = await self._get_client()
base_url = settings.OLLAMA_URL.rstrip("/")
base_url = self._base_url()
resp = await client.get(f"{base_url}/api/tags", timeout=5.0)
return resp.status_code == 200
except Exception:
@@ -892,7 +897,7 @@ class OllamaToolProvider:
"""Ollama /v1/chat/completions tool calling"""
start_time = time.perf_counter()
model = model or settings.OLLAMA_TOOL_MODEL
base_url = settings.OLLAMA_URL.rstrip("/")
base_url = self._base_url()
url = f"{base_url}/v1/chat/completions"
# 轉換 tools 為 dict 格式(同 NvidiaProvider
@@ -988,7 +993,7 @@ class OllamaToolProvider:
async def chat(self, prompt: str, model: str = "", temperature: float = 0.7, max_tokens: int = 512) -> str:
"""簡單 chat非 tool calling 路徑,保持 INvidiaProvider 相容)"""
model = model or settings.OLLAMA_TOOL_MODEL
base_url = settings.OLLAMA_URL.rstrip("/")
base_url = self._base_url()
try:
client = await self._get_client()
resp = await client.post(
@@ -1010,7 +1015,7 @@ _provider: NvidiaProvider | None = None
_ollama_tool_provider: OllamaToolProvider | None = None
def get_nvidia_provider() -> "NvidiaProvider | OllamaToolProvider":
def get_nvidia_provider() -> NvidiaProvider | OllamaToolProvider:
"""
取得 Tool Calling Provider 單例。
USE_OLLAMA_TOOL_CALLING=True (預設) → OllamaToolProvider (本機,~5s)

View File

@@ -534,6 +534,8 @@ class OpenClawService:
# 從 ModelRegistry 取得模型配置
registry = get_model_registry()
model_name = registry.get_model("ollama", "rca")
if ollama_only:
model_name = getattr(settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")
options = registry.get_provider_options("ollama")
timeout_seconds = max(
float(settings.OPENCLAW_TIMEOUT),

View File

@@ -23,6 +23,7 @@ from src.models.nvidia import (
from src.services.nvidia_provider import (
HIGH_RISK_TOOLS,
NvidiaProvider,
OllamaToolProvider,
create_tool_definition,
get_nvidia_provider,
reset_nvidia_provider,
@@ -286,6 +287,58 @@ class TestProtocolCompliance:
assert hasattr(INvidiaProvider, "close")
class TestOllamaToolProviderRouting:
    """Ollama tool-calling must resolve its endpoint via the ``hermes`` lane."""

    def test_base_url_uses_hermes_resolver_lane(self, monkeypatch):
        """``_base_url`` asks the resolver for ``hermes`` and returns its URL."""
        from src.services import nvidia_provider as nvidia_provider_module

        seen_workloads = []

        def _recording_resolver(workload):
            seen_workloads.append(workload)
            return "http://local-111:11434"

        monkeypatch.setattr(
            nvidia_provider_module,
            "resolve_ollama_endpoint",
            _recording_resolver,
        )

        tool_provider = OllamaToolProvider()

        assert tool_provider._base_url() == "http://local-111:11434"
        assert seen_workloads == ["hermes"]

    @pytest.mark.asyncio
    async def test_health_check_uses_local_hermes_lane(self, monkeypatch):
        """``health_check`` probes ``/api/tags`` on the resolved endpoint."""
        from src.services import nvidia_provider as nvidia_provider_module

        class _StubResponse:
            status_code = 200

        class _StubClient:
            def __init__(self):
                self.checked_urls = []

            async def get(self, url, **kwargs):
                self.checked_urls.append(url)
                return _StubResponse()

        def _fixed_resolver(workload):
            return "http://local-111:11434"

        monkeypatch.setattr(
            nvidia_provider_module,
            "resolve_ollama_endpoint",
            _fixed_resolver,
        )

        tool_provider = OllamaToolProvider()
        stub_client = _StubClient()

        async def _get_client():
            return stub_client

        monkeypatch.setattr(tool_provider, "_get_client", _get_client)

        healthy = await tool_provider.health_check()

        assert healthy is True
        assert stub_client.checked_urls == ["http://local-111:11434/api/tags"]
class TestEdgeCases:
"""邊界測試案例 (P2-2)"""
@@ -406,7 +459,11 @@ class TestAIRouterNvidiaIntegration:
def test_tool_calling_route(self):
"""測試 Tool Calling 路由"""
from src.services.ai_router import AIProviderEnum, get_ai_router, reset_ai_router
from src.services.ai_router import (
AIProviderEnum,
get_ai_router,
reset_ai_router,
)
reset_ai_router()
router = get_ai_router()
@@ -424,7 +481,11 @@ class TestAIRouterNvidiaIntegration:
def test_existing_routing_not_affected(self):
"""測試現有路由規則不受影響"""
from src.services.ai_router import AIProviderEnum, get_ai_router, reset_ai_router
from src.services.ai_router import (
AIProviderEnum,
get_ai_router,
reset_ai_router,
)
reset_ai_router()
router = get_ai_router()

View File

@@ -10,7 +10,7 @@ from src.services.ai_providers.ollama import OllamaGcpBProvider, OllamaProvider
class _FakeRegistry:
def get_model(self, provider: str, use_case: str) -> str:
    """Return ``qwen3:14b`` for every provider and use case.

    The tests in this file expect this heavy model name so they can check
    whether the provider coerces it or lets it through.
    """
    return "qwen3:14b"
def get_provider_options(self, provider: str) -> dict[str, Any]:
    """Return the same fixed generation options whatever ``provider`` is."""
    fixed_options: dict[str, Any] = dict(num_predict=32, temperature=0.1, top_p=0.9)
    return fixed_options
@@ -33,10 +33,12 @@ class _FakeResponse:
class _FakeClient:
def __init__(self) -> None:
    """Start every recorded-call list empty."""
    # The three lists are independent; each one is a fresh object per instance.
    self.checked_urls: list[str] = []
    self.posted_payloads: list[dict[str, Any]] = []
    self.posted_urls: list[str] = []
async def post(self, url: str, **kwargs: Any) -> _FakeResponse:
    """Record the POST target and its ``json`` payload, then return a fake response."""
    json_payload = kwargs.get("json", {})
    self.posted_urls.append(url)
    self.posted_payloads.append(json_payload)
    return _FakeResponse()
async def get(self, url: str, **kwargs: Any) -> _FakeResponse:
@@ -53,6 +55,7 @@ async def test_ollama_gcp_b_analyze_uses_secondary_url(monkeypatch: pytest.Monke
"OLLAMA_SECONDARY_URL",
"http://secondary:11436",
)
monkeypatch.setattr(ollama_module.settings, "ALERT_OLLAMA_MODEL", "gemma3:4b")
client = _FakeClient()
provider = OllamaGcpBProvider()
@@ -67,6 +70,57 @@ async def test_ollama_gcp_b_analyze_uses_secondary_url(monkeypatch: pytest.Monke
assert result.success is True
assert result.provider == "ollama_gcp_b"
assert client.posted_urls == ["http://secondary:11436/api/generate"]
assert client.posted_payloads[0]["model"] == "gemma3:4b"
@pytest.mark.asyncio
async def test_ollama_gcp_a_coerces_heavy_diagnose_model_to_alert_model(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A heavy registry model sent to the primary URL is replaced by the alert model."""
    patched_settings = {
        "OLLAMA_URL": "http://primary:11435",
        "OLLAMA_SECONDARY_URL": "http://secondary:11436",
        "ALERT_OLLAMA_MODEL": "gemma3:4b",
    }
    monkeypatch.setattr(ollama_module, "get_model_registry", lambda: _FakeRegistry())
    for setting_name, setting_value in patched_settings.items():
        monkeypatch.setattr(ollama_module.settings, setting_name, setting_value)

    fake_client = _FakeClient()
    provider = OllamaProvider()

    async def _get_client() -> _FakeClient:
        return fake_client

    monkeypatch.setattr(provider, "_get_client", _get_client)

    outcome = await provider.analyze("diagnose", context={"task_type": "diagnose"})

    assert outcome.success is True
    assert fake_client.posted_urls == ["http://primary:11435/api/generate"]
    first_payload = fake_client.posted_payloads[0]
    assert first_payload["model"] == "gemma3:4b"
@pytest.mark.asyncio
async def test_ollama_gcp_a_can_explicitly_allow_heavy_model(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``allow_gcp_heavy_model`` set, the registry's heavy model is sent unchanged."""
    patched_settings = {
        "OLLAMA_URL": "http://primary:11435",
        "OLLAMA_SECONDARY_URL": "http://secondary:11436",
        "ALERT_OLLAMA_MODEL": "gemma3:4b",
    }
    monkeypatch.setattr(ollama_module, "get_model_registry", lambda: _FakeRegistry())
    for setting_name, setting_value in patched_settings.items():
        monkeypatch.setattr(ollama_module.settings, setting_name, setting_value)

    fake_client = _FakeClient()
    provider = OllamaProvider()

    async def _get_client() -> _FakeClient:
        return fake_client

    monkeypatch.setattr(provider, "_get_client", _get_client)

    request_context = {"task_type": "diagnose", "allow_gcp_heavy_model": True}
    outcome = await provider.analyze("deep diagnose", context=request_context)

    assert outcome.success is True
    first_payload = fake_client.posted_payloads[0]
    assert first_payload["model"] == "qwen3:14b"
@pytest.mark.asyncio

View File

@@ -6,6 +6,20 @@
---
## 2026-05-05 | GCP Ollama alert lane model isolation fix
**背景**：Telegram 告警卡片仍看到 Gemini / GCP-A 逾時；live log 顯示 Phase24 AI Router 的 `diagnose` 路徑已選到 `ollama_gcp_a`，但模型仍使用 `qwen3:14b`，導致 CPU-only GCP-A 載入重模型後，`gemma3:4b` 健康檢查也 timeout。
**本次修補**：
- `OllamaProvider` 在 GCP-A/B endpoint 上攔截非 fast-lane 模型，預設強制改用 `ALERT_OLLAMA_MODEL=gemma3:4b`；只有明確帶 `allow_gcp_heavy_model=true` 才允許重模型跑在 GCP。
- legacy OpenClaw `_call_ollama(ollama_only=True)` 同步固定使用 `ALERT_OLLAMA_MODEL`，避免 safety-net 再把 `qwen3:14b` 送到 GCP alert lane。
- `OllamaToolProvider` 改用 resolver 的 `hermes` lane，不再以 `settings.OLLAMA_URL` 直接把 `hermes3:latest` 載到 GCP-A。
- 補強 `test_ollama_provider_endpoints.py`，鎖住 GCP-A/GCP-B 重模型 coercion 與顯式放行行為。
**現場操作**：
- 已手動卸載 GCP-A/B 的 `qwen*`、`deepseek*`、`hermes3`、`bge-m3`、`llava`、`minicpm`，並重新 keep-alive `gemma3:4b`。
- 下一步需推 Gitea CD，確認 production image 含本修補後，再觀察告警卡片 Router 是否維持 Ollama 且不再載入 GCP 重模型。
## 2026-05-05 | drift-scanner CronJob 納入 ArgoCD baseline
**背景**：重開機恢復後，K8s Deployments 與三個新納入的 CronJob 已跟到最新 image；`drift-scanner` 仍是手動套用的舊固定 SHA，會造成「服務健康、排程吃舊版」的冷啟動盲區。