fix(ai): route alerts through openclaw before gemini
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m5s
CD Pipeline / build-and-deploy (push) Successful in 3m42s
CD Pipeline / post-deploy-checks (push) Successful in 1m36s

This commit is contained in:
Your Name
2026-05-06 20:11:24 +08:00
parent 572e7640cd
commit 1a1ab0df6e
3 changed files with 80 additions and 13 deletions

View File

@@ -217,7 +217,7 @@ class OpenClawService:
alert_context: dict | None = None,
cloud_provider_order: list[str] | None = None,
) -> list[str]:
"""Resolve GCP-A/GCP-B/111, then Gemini backup, for alert analysis."""
"""Resolve GCP-A/GCP-B/111, then OpenClaw/Nemo before Gemini backup."""
provider_order: list[str] = []
try:
route = await get_ollama_failover_manager().select_provider(task_type=task_type)
@@ -257,11 +257,19 @@ class OpenClawService:
if not self._cloud_fallback_allowed_for_alert(alert_context):
return ordered_ollama
cloud_candidates = cloud_provider_order or []
cloud_backup: list[str] = []
for provider_name in [*cloud_candidates, "gemini"]:
if provider_name == "gemini" and provider_name not in cloud_backup:
cloud_backup.append(provider_name)
cloud_aliases = {"nvidia": "openclaw_nemo"}
cloud_candidates = {
cloud_aliases.get(provider_name, provider_name)
for provider_name in (cloud_provider_order or [])
}
# Gemini remains the final paid backup, but alert traffic should use
# OpenClaw/Nemo first whenever the router control plane has not disabled it.
cloud_candidates.add("gemini")
cloud_backup = [
provider_name
for provider_name in ("openclaw_nemo", "gemini")
if provider_name in cloud_candidates
]
return ordered_ollama + cloud_backup

View File

@@ -57,6 +57,7 @@ class _FakeRouter:
return SimpleNamespace(
selected_provider=AIProviderEnum.GEMINI,
fallback_chain=[
(AIProviderEnum.OPENCLAW_NEMO, "nvidia"),
(AIProviderEnum.CLAUDE, "claude"),
(AIProviderEnum.OLLAMA, "qwen2.5:7b-instruct"),
],
@@ -90,7 +91,7 @@ class _FakeExecutor:
@pytest.mark.asyncio
async def test_alert_context_uses_ollama_lane_then_gemini_backup(
async def test_alert_context_uses_ollama_lane_then_openclaw_nemo_before_gemini(
monkeypatch: pytest.MonkeyPatch,
) -> None:
fake_executor = _FakeExecutor()
@@ -124,7 +125,13 @@ async def test_alert_context_uses_ollama_lane_then_gemini_backup(
42,
0.0,
)
assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
assert fake_executor.provider_order == [
"ollama_gcp_a",
"ollama_gcp_b",
"ollama_local",
"openclaw_nemo",
"gemini",
]
assert fake_failover.task_types == ["diagnose"]
@@ -174,7 +181,7 @@ async def test_non_alert_context_keeps_router_cloud_order(
service = object.__new__(OpenClawService)
await service._call_with_fallback("general question", alert_context={"intent_hint": "query"})
assert fake_executor.provider_order == ["gemini", "claude", "ollama"]
assert fake_executor.provider_order == ["gemini", "openclaw_nemo", "claude", "ollama"]
@pytest.mark.asyncio
@@ -207,7 +214,13 @@ async def test_explicit_ai_governance_context_uses_ollama_first(
},
)
assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
assert fake_executor.provider_order == [
"ollama_gcp_a",
"ollama_gcp_b",
"ollama_local",
"openclaw_nemo",
"gemini",
]
assert fake_failover.task_types == ["diagnose"]
@@ -248,7 +261,7 @@ async def test_alert_context_sorts_ollama_lane_and_drops_cloud_providers(
@pytest.mark.asyncio
async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
async def test_alert_context_sorts_ollama_lane_before_openclaw_nemo_backup(
monkeypatch: pytest.MonkeyPatch,
) -> None:
monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True)
@@ -259,7 +272,7 @@ async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
provider_order = await service._resolve_alert_provider_order(
task_type="diagnose",
alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"},
cloud_provider_order=["claude", "gemini", "ollama"],
cloud_provider_order=["claude", "openclaw_nemo", "gemini", "ollama"],
)
assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "openclaw_nemo", "gemini"]

View File

@@ -4083,3 +4083,49 @@ py_compile \
| Container | `docker compose up -d --build clawbot` 已重建 `openclaw`,健康檢查正常 |
| analyze 技術驗證 | `openclaw_nemo` 端點已可返回 success,`4.8s`,provider=`openclaw_nvidia_nim` |
| 品質判斷 | 暫不重新啟用 Redis `openclaw_nemo`:回應中文仍有亂碼且風險判斷偏高,尚未達到主線品質門檻 |
## 2026-05-06(台北)— 告警 AI 路由補齊 OpenClaw/Nemo 備援,避免 111 後直接跳 Gemini
**觸發**:Telegram 告警卡片仍顯示 `RouterGemini`,即使 GCP-A、GCP-B、111 Ollama lane 已恢復。進一步追查發現 `OpenClawService._resolve_alert_provider_order()` 在強制 Ollama-first 後,只把 `gemini` 放回 cloud backup,導致 `openclaw_nemo` 被告警路徑跳過。
### 修正
| 範圍 | 結果 |
|------|------|
| AWOOOI provider order | 告警/治理路徑改為 `ollama_gcp_a → ollama_gcp_b → ollama_local → openclaw_nemo → gemini`;Gemini 仍保留為最後備援,不再是 111 後第一個 cloud provider |
| Control plane 尊重 disable | `openclaw_nemo` 只有在 AI control 未停用時才會出現在 cloud candidates,不會繞過 `/ai disable openclaw_nemo` |
| 188 OpenClaw/Nemo 品質 | `/home/ollama/clawbot-v5/app/api/routes.py` 已熱修並提交 `833dfb1`:預設 NIM model 改為 `meta/llama-3.3-70b-instruct`,強制繁中 JSON,ProviderHealthCheck/diagnostic_only 夾成低風險不可執行 |
| 不採用 nano | `nvidia/llama-3.1-nemotron-nano-8b-v1` 短 prompt 很快,但正式 incident schema 會產生亂碼/JSON 失敗;不適合作為 OpenClaw 仲裁模型 |
### 現場驗證
```text
OpenClaw health:
openclaw Up healthy
GET /health -> {"status":"healthy","version":"6.0"}
AWOOOI API Pod -> 188 OpenClaw -> NVIDIA NIM:
ProviderHealthCheck: success=true, provider=openclaw_nemo, latency=3093ms, risk=low, kubectl_command=null
DockerContainerRestartSpike synthetic: success=true, provider=openclaw_nemo, latency=4892ms, suggested_action=investigate, kubectl_command=null
```
### 測試
```bash
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
/Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \
apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \
apps/api/tests/test_drift_interpreter_ollama_first.py -q
# 10 passed
/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check \
apps/api/src/services/openclaw.py \
apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \
apps/api/tests/test_drift_interpreter_ollama_first.py
# All checks passed
```
### 判讀
- 這次不是要禁 Gemini,而是恢復正確順序:GCP-A/GCP-B/111 優先,OpenClaw/Nemo 作為 cloud 仲裁備援,Gemini 只保留最後備援。
- `openclaw_nemo` 在修補前仍被 Redis control disabled,需等 AWOOOI 新 image 部署後,再依現場測試結果解除 disabled,避免舊 image 仍直接跳 Gemini。