diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 35d9eba4..6f8a7f56 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -217,7 +217,7 @@ class OpenClawService: alert_context: dict | None = None, cloud_provider_order: list[str] | None = None, ) -> list[str]: - """Resolve GCP-A/GCP-B/111, then Gemini backup, for alert analysis.""" + """Resolve GCP-A/GCP-B/111, then OpenClaw/Nemo before Gemini backup.""" provider_order: list[str] = [] try: route = await get_ollama_failover_manager().select_provider(task_type=task_type) @@ -257,11 +257,19 @@ class OpenClawService: if not self._cloud_fallback_allowed_for_alert(alert_context): return ordered_ollama - cloud_candidates = cloud_provider_order or [] - cloud_backup: list[str] = [] - for provider_name in [*cloud_candidates, "gemini"]: - if provider_name == "gemini" and provider_name not in cloud_backup: - cloud_backup.append(provider_name) + cloud_aliases = {"nvidia": "openclaw_nemo"} + cloud_candidates = { + cloud_aliases.get(provider_name, provider_name) + for provider_name in (cloud_provider_order or []) + } + # Gemini remains the final paid backup, but alert traffic should use + # OpenClaw/Nemo first whenever the router control plane has not disabled it. + cloud_candidates.add("gemini") + cloud_backup = [ + provider_name + for provider_name in ("openclaw_nemo", "gemini") + if provider_name in cloud_candidates + ] return ordered_ollama + cloud_backup diff --git a/apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py b/apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py index 977da1f4..f565457f 100644 --- a/apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py +++ b/apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py @@ -57,6 +57,7 @@ class _FakeRouter: return SimpleNamespace( selected_provider=AIProviderEnum.GEMINI, fallback_chain=[ + (AIProviderEnum.OPENCLAW_NEMO, "nvidia"), (AIProviderEnum.CLAUDE, "claude"), (AIProviderEnum.OLLAMA, "qwen2.5:7b-instruct"), ], @@ -90,7 +91,7 @@ class _FakeExecutor: @pytest.mark.asyncio -async def test_alert_context_uses_ollama_lane_then_gemini_backup( +async def test_alert_context_uses_ollama_lane_then_openclaw_nemo_before_gemini( monkeypatch: pytest.MonkeyPatch, ) -> None: fake_executor = _FakeExecutor() @@ -124,7 +125,13 @@ async def test_alert_context_uses_ollama_lane_then_gemini_backup( 42, 0.0, ) - assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"] + assert fake_executor.provider_order == [ + "ollama_gcp_a", + "ollama_gcp_b", + "ollama_local", + "openclaw_nemo", + "gemini", + ] assert fake_failover.task_types == ["diagnose"] @@ -174,7 +181,7 @@ async def test_non_alert_context_keeps_router_cloud_order( service = object.__new__(OpenClawService) await service._call_with_fallback("general question", alert_context={"intent_hint": "query"}) - assert fake_executor.provider_order == ["gemini", "claude", "ollama"] + assert fake_executor.provider_order == ["gemini", "openclaw_nemo", "claude", "ollama"] @pytest.mark.asyncio @@ -207,7 +214,13 @@ async def test_explicit_ai_governance_context_uses_ollama_first( }, ) - assert fake_executor.provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"] + assert fake_executor.provider_order == [ + "ollama_gcp_a", + "ollama_gcp_b", + "ollama_local", + "openclaw_nemo", + "gemini", + ] assert fake_failover.task_types == ["diagnose"] @@ -248,7 +261,7 @@ async def test_alert_context_sorts_ollama_lane_and_drops_cloud_providers( @pytest.mark.asyncio -async def test_alert_context_sorts_ollama_lane_before_gemini_backup( +async def test_alert_context_sorts_ollama_lane_before_openclaw_nemo_backup( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr(openclaw_module.settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True) @@ -259,7 +272,7 @@ async def test_alert_context_sorts_ollama_lane_before_gemini_backup( provider_order = await service._resolve_alert_provider_order( task_type="diagnose", alert_context={"incident_id": "INC-1", "alertname": "HostHighCpuLoad"}, - cloud_provider_order=["claude", "gemini", "ollama"], + cloud_provider_order=["claude", "openclaw_nemo", "gemini", "ollama"], ) - assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"] + assert provider_order == ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "openclaw_nemo", "gemini"] diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 457c0302..d70ff3e3 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -4083,3 +4083,49 @@ py_compile \ | Container | `docker compose up -d --build clawbot` 已重建 `openclaw`,健康檢查正常 | | analyze 技術驗證 | `openclaw_nemo` 端點已可返回 success,約 `4.8s`,provider=`openclaw_nvidia_nim` | | 品質判斷 | 暫不重新啟用 Redis `openclaw_nemo`:回應中文仍有亂碼且風險判斷偏高,尚未達到主線品質門檻 | + +## 2026-05-06(台北)— 告警 AI 路由補齊 OpenClaw/Nemo 備援,避免 111 後直接跳 Gemini + +**觸發**:Telegram 告警卡片仍顯示 `Router:Gemini`,即使 GCP-A、GCP-B、111 Ollama lane 已恢復。進一步追查發現 `OpenClawService._resolve_alert_provider_order()` 在強制 Ollama-first 後,只把 `gemini` 放回 cloud backup,導致 `openclaw_nemo` 被告警路徑跳過。 + +### 修正 + +| 範圍 | 結果 | +|------|------| +| AWOOOI provider order | 告警/治理路徑改為 `ollama_gcp_a → ollama_gcp_b → ollama_local → openclaw_nemo → gemini`;Gemini 仍保留為最後備援,不再是 111 後第一個 cloud provider | +| Control plane 尊重 disable | `openclaw_nemo` 只有在 AI control 未停用時才會出現在 cloud candidates;不會繞過 `/ai disable openclaw_nemo` | +| 188 OpenClaw/Nemo 品質 | `/home/ollama/clawbot-v5/app/api/routes.py` 已熱修並提交 `833dfb1`:預設 NIM model 改為 `meta/llama-3.3-70b-instruct`,強制繁中 JSON,ProviderHealthCheck/diagnostic_only 夾成低風險不可執行 | +| 不採用 nano | `nvidia/llama-3.1-nemotron-nano-8b-v1` 短 prompt 很快,但正式 incident schema 會產生亂碼/JSON 失敗;不適合作為 OpenClaw 仲裁模型 | + +### 現場驗證 + +```text +OpenClaw health: +openclaw Up healthy +GET /health -> {"status":"healthy","version":"6.0"} + +AWOOOI API Pod -> 188 OpenClaw -> NVIDIA NIM: +ProviderHealthCheck: success=true, provider=openclaw_nemo, latency=3093ms, risk=low, kubectl_command=null +DockerContainerRestartSpike synthetic: success=true, provider=openclaw_nemo, latency=4892ms, suggested_action=investigate, kubectl_command=null +``` + +### 測試 + +```bash +DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \ + /Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \ + apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \ + apps/api/tests/test_drift_interpreter_ollama_first.py -q +# 10 passed + +/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check \ + apps/api/src/services/openclaw.py \ + apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py \ + apps/api/tests/test_drift_interpreter_ollama_first.py +# All checks passed +``` + +### 判讀 + +- 這次不是要禁 Gemini;而是恢復正確順序:GCP-A/GCP-B/111 優先,OpenClaw/Nemo 作為 cloud 仲裁備援,Gemini 只保留最後備援。 +- `openclaw_nemo` 在修補前仍被 Redis control disabled;需等 AWOOOI 新 image 部署後,再依現場測試結果解除 disabled,避免舊 image 仍直接跳 Gemini。