fix(ai): route alerts through openclaw before gemini

2026-05-06 20:11:24 +08:00
parent 572e7640cd
commit 1a1ab0df6e
3 changed files with 80 additions and 13 deletions
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -217,7 +217,7 @@ class OpenClawService:
        alert_context: dict | None = None,
        cloud_provider_order: list[str] | None = None,
    ) -> list[str]:
-        """Resolve GCP-A/GCP-B/111, then Gemini backup, for alert analysis."""
+        """Resolve GCP-A/GCP-B/111, then OpenClaw/Nemo before Gemini backup."""
        provider_order: list[str] = []
        try:
            route = await get_ollama_failover_manager().select_provider(task_type=task_type)
@@ -257,11 +257,19 @@ class OpenClawService:
        if not self._cloud_fallback_allowed_for_alert(alert_context):
            return ordered_ollama

-        cloud_candidates = cloud_provider_order or []
-        cloud_backup: list[str] = []
-        for provider_name in [*cloud_candidates, "gemini"]:
-            if provider_name == "gemini" and provider_name not in cloud_backup:
-                cloud_backup.append(provider_name)
+        cloud_aliases = {"nvidia": "openclaw_nemo"}
+        cloud_candidates = {
+            cloud_aliases.get(provider_name, provider_name)
+            for provider_name in (cloud_provider_order or [])
+        }
+        # Gemini remains the final paid backup, but alert traffic should use
+        # OpenClaw/Nemo first whenever the router control plane has not disabled it.
+        cloud_candidates.add("gemini")
+        cloud_backup = [
+            provider_name
+            for provider_name in ("openclaw_nemo", "gemini")
+            if provider_name in cloud_candidates
+        ]

        return ordered_ollama + cloud_backup

--- a/apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py
+++ b/apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py
@@ -57,6 +57,7 @@ class _FakeRouter:
        return SimpleNamespace(
            selected_provider=AIProviderEnum.GEMINI,
            fallback_chain=[
+                (AIProviderEnum.OPENCLAW_NEMO, "nvidia"),
                (AIProviderEnum.CLAUDE, "claude"),
                (AIProviderEnum.OLLAMA, "qwen2.5:7b-instruct"),
            ],
@@ -90,7 +91,7 @@ class _FakeExecutor:


@pytest.mark.asyncio
-async def test_alert_context_uses_ollama_lane_then_gemini_backup(
+async def test_alert_context_uses_ollama_lane_then_openclaw_nemo_before_gemini(
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    fake_executor = _FakeExecutor()
@@ -124,7 +125,13 @@ async def test_alert_context_uses_ollama_lane_then_gemini_backup(
        42,
        0.0,
    )
-    assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
+    assert fake_executor.provider_order == [
+        "ollama_gcp_a",
+        "ollama_gcp_b",
+        "ollama_local",
+        "openclaw_nemo",
+        "gemini",
+    ]
    assert fake_failover.task_types == ["diagnose"]


@@ -174,7 +181,7 @@ async def test_non_alert_context_keeps_router_cloud_order(
    service = object.__new__(OpenClawService)
    await service._call_with_fallback("general question", alert_context={"intent_hint": "query"})

-    assert fake_executor.provider_order == ["gemini", "claude", "ollama"]
+    assert fake_executor.provider_order == ["gemini", "openclaw_nemo", "claude", "ollama"]


@pytest.mark.asyncio
@@ -207,7 +214,13 @@ async def test_explicit_ai_governance_context_uses_ollama_first(
        },
    )

-    assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
+    assert fake_executor.provider_order == [
+        "ollama_gcp_a",
+        "ollama_gcp_b",
+        "ollama_local",
+        "openclaw_nemo",
+        "gemini",
+    ]
    assert fake_failover.task_types == ["diagnose"]


@@ -248,7 +261,7 @@ async def test_alert_context_sorts_ollama_lane_and_drops_cloud_providers(


@pytest.mark.asyncio
-async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
+async def test_alert_context_sorts_ollama_lane_before_openclaw_nemo_backup(
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True)
@@ -259,7 +272,7 @@ async def test_alert_context_sorts_ollama_lane_before_gemini_backup(
    provider_order = await service._resolve_alert_provider_order(
        task_type="diagnose",
        alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"},
-        cloud_provider_order=["claude", "gemini", "ollama"],
+        cloud_provider_order=["claude", "openclaw_nemo", "gemini", "ollama"],
    )

-    assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
+    assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "openclaw_nemo", "gemini"]
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -4083,3 +4083,49 @@ py_compile \
 | Container | `docker compose up -d --build clawbot` 已重建 `openclaw`，健康檢查正常 |
 | analyze 技術驗證 | `openclaw_nemo` 端點已可返回 success，約 `4.8s`，provider=`openclaw_nvidia_nim` |
 | 品質判斷 | 暫不重新啟用 Redis `openclaw_nemo`：回應中文仍有亂碼且風險判斷偏高，尚未達到主線品質門檻 |
+
+## 2026-05-06（台北）— 告警 AI 路由補齊 OpenClaw/Nemo 備援，避免 111 後直接跳 Gemini
+
+**觸發**：Telegram 告警卡片仍顯示 `Router：Gemini`，即使 GCP-A、GCP-B、111 Ollama lane 已恢復。進一步追查發現 `OpenClawService._resolve_alert_provider_order()` 在強制 Ollama-first 後，只把 `gemini` 放回 cloud backup，導致 `openclaw_nemo` 被告警路徑跳過。
+
+### 修正
+
+| 範圍 | 結果 |
+|------|------|
+| AWOOOI provider order | 告警/治理路徑改為 `ollama_gcp_a → ollama_gcp_b → ollama_local → openclaw_nemo → gemini`；Gemini 仍保留為最後備援，不再是 111 後第一個 cloud provider |
+| Control plane 尊重 disable | `openclaw_nemo` 只有在 AI control 未停用時才會出現在 cloud candidates；不會繞過 `/ai disable openclaw_nemo` |
+| 188 OpenClaw/Nemo 品質 | `/home/ollama/clawbot-v5/app/api/routes.py` 已熱修並提交 `833dfb1`：預設 NIM model 改為 `meta/llama-3.3-70b-instruct`，強制輸出繁體中文 JSON，ProviderHealthCheck/diagnostic_only 的結果一律限制為低風險且不可執行 |
+| 不採用 nano | `nvidia/llama-3.1-nemotron-nano-8b-v1` 短 prompt 很快，但正式 incident schema 會產生亂碼/JSON 失敗；不適合作為 OpenClaw 仲裁模型 |
+
+### 現場驗證
+
+```text
+OpenClaw health:
+openclaw Up healthy
+GET /health -> {"status":"healthy","version":"6.0"}
+
+AWOOOI API Pod -> 188 OpenClaw -> NVIDIA NIM:
+ProviderHealthCheck: success=true, provider=openclaw_nemo, latency=3093ms, risk=low, kubectl_command=null
+DockerContainerRestartSpike synthetic: success=true, provider=openclaw_nemo, latency=4892ms, suggested_action=investigate, kubectl_command=null
+```
+
+### 測試
+
+```bash
+DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
+  /Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \
+  apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \
+  apps/api/tests/test_drift_interpreter_ollama_first.py -q
+# 10 passed
+
+/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check \
+  apps/api/src/services/openclaw.py \
+  apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \
+  apps/api/tests/test_drift_interpreter_ollama_first.py
+# All checks passed
+```
+
+### 判讀
+
+- 這次不是要停用 Gemini，而是恢復正確順序：GCP-A/GCP-B/111 優先，OpenClaw/Nemo 作為 cloud 仲裁備援，Gemini 只保留為最後備援。
+- `openclaw_nemo` 在修補前仍被 Redis control 設為 disabled；需等 AWOOOI 新 image 部署後，再依現場測試結果解除 disabled，否則舊 image 的告警路徑仍會略過 `openclaw_nemo`、直接跳 Gemini。