fix(ai): route alerts through openclaw before gemini
This commit is contained in:
@@ -217,7 +217,7 @@ class OpenClawService:
|
||||
alert_context: dict | None = None,
|
||||
cloud_provider_order: list[str] | None = None,
|
||||
) -> list[str]:
|
||||
"""Resolve GCP-A/GCP-B/111, then Gemini backup, for alert analysis."""
|
||||
"""Resolve GCP-A/GCP-B/111, then OpenClaw/Nemo before Gemini backup."""
|
||||
provider_order: list[str] = []
|
||||
try:
|
||||
route = await get_ollama_failover_manager().select_provider(task_type=task_type)
|
||||
@@ -257,11 +257,19 @@ class OpenClawService:
|
||||
if not self._cloud_fallback_allowed_for_alert(alert_context):
|
||||
return ordered_ollama
|
||||
|
||||
cloud_candidates = cloud_provider_order or []
|
||||
cloud_backup: list[str] = []
|
||||
for provider_name in [*cloud_candidates, "gemini"]:
|
||||
if provider_name == "gemini" and provider_name not in cloud_backup:
|
||||
cloud_backup.append(provider_name)
|
||||
cloud_aliases = {"nvidia": "openclaw_nemo"}
|
||||
cloud_candidates = {
|
||||
cloud_aliases.get(provider_name, provider_name)
|
||||
for provider_name in (cloud_provider_order or [])
|
||||
}
|
||||
# Gemini remains the final paid backup, but alert traffic should use
|
||||
# OpenClaw/Nemo first whenever the router control plane has not disabled it.
|
||||
cloud_candidates.add("gemini")
|
||||
cloud_backup = [
|
||||
provider_name
|
||||
for provider_name in ("openclaw_nemo", "gemini")
|
||||
if provider_name in cloud_candidates
|
||||
]
|
||||
|
||||
return ordered_ollama + cloud_backup
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ class _FakeRouter:
|
||||
return SimpleNamespace(
|
||||
selected_provider=AIProviderEnum.GEMINI,
|
||||
fallback_chain=[
|
||||
(AIProviderEnum.OPENCLAW_NEMO, "nvidia"),
|
||||
(AIProviderEnum.CLAUDE, "claude"),
|
||||
(AIProviderEnum.OLLAMA, "qwen2.5:7b-instruct"),
|
||||
],
|
||||
@@ -90,7 +91,7 @@ class _FakeExecutor:
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_alert_context_uses_ollama_lane_then_gemini_backup(
|
||||
async def test_alert_context_uses_ollama_lane_then_openclaw_nemo_before_gemini(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
fake_executor = _FakeExecutor()
|
||||
@@ -124,7 +125,13 @@ async def test_alert_context_uses_ollama_lane_then_gemini_backup(
|
||||
42,
|
||||
0.0,
|
||||
)
|
||||
assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
|
||||
assert fake_executor.provider_order == [
|
||||
"ollama_gcp_a",
|
||||
"ollama_gcp_b",
|
||||
"ollama_local",
|
||||
"openclaw_nemo",
|
||||
"gemini",
|
||||
]
|
||||
assert fake_failover.task_types == ["diagnose"]
|
||||
|
||||
|
||||
@@ -174,7 +181,7 @@ async def test_non_alert_context_keeps_router_cloud_order(
|
||||
service = object.__new__(OpenClawService)
|
||||
await service._call_with_fallback("general question", alert_context={"intent_hint": "query"})
|
||||
|
||||
assert fake_executor.provider_order == ["gemini", "claude", "ollama"]
|
||||
assert fake_executor.provider_order == ["gemini", "openclaw_nemo", "claude", "ollama"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -207,7 +214,13 @@ async def test_explicit_ai_governance_context_uses_ollama_first(
|
||||
},
|
||||
)
|
||||
|
||||
assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
|
||||
assert fake_executor.provider_order == [
|
||||
"ollama_gcp_a",
|
||||
"ollama_gcp_b",
|
||||
"ollama_local",
|
||||
"openclaw_nemo",
|
||||
"gemini",
|
||||
]
|
||||
assert fake_failover.task_types == ["diagnose"]
|
||||
|
||||
|
||||
@@ -248,7 +261,7 @@ async def test_alert_context_sorts_ollama_lane_and_drops_cloud_providers(
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
|
||||
async def test_alert_context_sorts_ollama_lane_before_openclaw_nemo_backup(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True)
|
||||
@@ -259,7 +272,7 @@ async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
|
||||
provider_order = await service._resolve_alert_provider_order(
|
||||
task_type="diagnose",
|
||||
alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"},
|
||||
cloud_provider_order=["claude", "gemini", "ollama"],
|
||||
cloud_provider_order=["claude", "openclaw_nemo", "gemini", "ollama"],
|
||||
)
|
||||
|
||||
assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
|
||||
assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "openclaw_nemo", "gemini"]
|
||||
|
||||
@@ -4083,3 +4083,49 @@ py_compile \
|
||||
| Container | `docker compose up -d --build clawbot` 已重建 `openclaw`,健康檢查正常 |
|
||||
| analyze 技術驗證 | `openclaw_nemo` 端點已可返回 success,約 `4.8s`,provider=`openclaw_nvidia_nim` |
|
||||
| 品質判斷 | 暫不重新啟用 Redis `openclaw_nemo`:回應中文仍有亂碼且風險判斷偏高,尚未達到主線品質門檻 |
|
||||
|
||||
## 2026-05-06(台北)— 告警 AI 路由補齊 OpenClaw/Nemo 備援,避免 111 後直接跳 Gemini
|
||||
|
||||
**觸發**:Telegram 告警卡片仍顯示 `Router:Gemini`,即使 GCP-A、GCP-B、111 Ollama lane 已恢復。進一步追查發現 `OpenClawService._resolve_alert_provider_order()` 在強制 Ollama-first 後,只把 `gemini` 放回 cloud backup,導致 `openclaw_nemo` 被告警路徑跳過。
|
||||
|
||||
### 修正
|
||||
|
||||
| 範圍 | 結果 |
|
||||
|------|------|
|
||||
| AWOOOI provider order | 告警/治理路徑改為 `ollama_gcp_a → ollama_gcp_b → ollama_local → openclaw_nemo → gemini`;Gemini 仍保留為最後備援,不再是 111 後第一個 cloud provider |
|
||||
| Control plane 尊重 disable | `openclaw_nemo` 只有在 AI control 未停用時才會出現在 cloud candidates;不會繞過 `/ai disable openclaw_nemo` |
|
||||
| 188 OpenClaw/Nemo 品質 | `/home/ollama/clawbot-v5/app/api/routes.py` 已熱修並提交 `833dfb1`:預設 NIM model 改為 `meta/llama-3.3-70b-instruct`,強制繁中 JSON,ProviderHealthCheck/diagnostic_only 夾成低風險不可執行 |
|
||||
| 不採用 nano | `nvidia/llama-3.1-nemotron-nano-8b-v1` 短 prompt 很快,但正式 incident schema 會產生亂碼/JSON 失敗;不適合作為 OpenClaw 仲裁模型 |
|
||||
|
||||
### 現場驗證
|
||||
|
||||
```text
|
||||
OpenClaw health:
|
||||
openclaw Up healthy
|
||||
GET /health -> {"status":"healthy","version":"6.0"}
|
||||
|
||||
AWOOOI API Pod -> 188 OpenClaw -> NVIDIA NIM:
|
||||
ProviderHealthCheck: success=true, provider=openclaw_nemo, latency=3093ms, risk=low, kubectl_command=null
|
||||
DockerContainerRestartSpike synthetic: success=true, provider=openclaw_nemo, latency=4892ms, suggested_action=investigate, kubectl_command=null
|
||||
```
|
||||
|
||||
### 測試
|
||||
|
||||
```bash
|
||||
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
|
||||
/Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \
|
||||
apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \
|
||||
apps/api/tests/test_drift_interpreter_ollama_first.py -q
|
||||
# 10 passed
|
||||
|
||||
/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check \
|
||||
apps/api/src/services/openclaw.py \
|
||||
apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \
|
||||
apps/api/tests/test_drift_interpreter_ollama_first.py
|
||||
# All checks passed
|
||||
```
|
||||
|
||||
### 判讀
|
||||
|
||||
- 這次的目的不是停用 Gemini,而是恢復正確的順序:GCP-A/GCP-B/111 優先,OpenClaw/Nemo 作為 cloud 仲裁備援,Gemini 只保留為最後備援。
|
||||
- `openclaw_nemo` 在修補前仍被 Redis control 標記為 disabled;需等 AWOOOI 新 image 部署後,再依現場測試結果解除 disabled。舊 image 的告警路徑只會把 `gemini` 放回 cloud backup,即使提前解除 disabled,仍會略過 `openclaw_nemo` 直接跳 Gemini。
|
||||
|
||||
Reference in New Issue
Block a user