# Proposals whose confidence is below this value are treated as degraded.
_MIN_ACCEPTED_CONFIDENCE = 0.3

# Provider tag that marks a proposal as a degraded stub.
_DEGRADED_PROVIDER = "openclaw_degraded"


def _confidence_value(data: dict[str, Any]) -> float:
    """Return the response's ``confidence`` as a finite float.

    Args:
        data: Decoded JSON body of the OpenClaw response.

    Returns:
        The confidence as a float, or ``0.0`` when the field is missing,
        ``None``, not numeric, too large to convert, or not finite
        (NaN / +inf / -inf).
    """
    # Function-scope import so this helper does not depend on the module's
    # top-level import list.
    import math

    try:
        value = float(data.get("confidence", 0.0))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() raises it for integers beyond float range.
        return 0.0

    # NaN compares False against every threshold and +inf always passes one,
    # so a garbage confidence would otherwise be accepted as healthy.
    return value if math.isfinite(value) else 0.0


def _is_degraded_response(
    data: dict[str, Any],
    *,
    min_confidence: float = _MIN_ACCEPTED_CONFIDENCE,
) -> bool:
    """Tell whether an OpenClaw response should be handed to the fallback chain.

    OpenClaw may return a syntactically valid degraded proposal when its
    downstream NIM/Ollama call timed out. Treat that as fallbackable so Gemini
    and Claude remain available instead of accepting a low-confidence stub.

    Args:
        data: Decoded JSON body of the OpenClaw response.
        min_confidence: Confidence below which the response is rejected.

    Returns:
        ``True`` when the ``degraded`` flag is truthy, the ``provider`` field
        equals the degraded tag, or the confidence is below ``min_confidence``;
        ``False`` otherwise.
    """
    if data.get("degraded"):
        return True
    if str(data.get("provider", "")) == _DEGRADED_PROVIDER:
        return True
    return _confidence_value(data) < min_confidence
import pytest

from src.services.ai_providers.openclaw_nemo import OpenClawNemoProvider


def _transport(payload: dict) -> httpx.MockTransport:
    """Build a mock transport that answers every request with ``payload``.

    Args:
        payload: JSON-serialisable body returned with HTTP 200.

    Returns:
        An ``httpx.MockTransport`` that ignores the incoming request.
    """

    def handler(_: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=payload)

    return httpx.MockTransport(handler)


async def _analyze_with_payload(payload: dict, incident_id: str):
    """Run ``OpenClawNemoProvider.analyze`` against a canned OpenClaw reply.

    Args:
        payload: JSON body the mocked OpenClaw endpoint returns.
        incident_id: Value passed as ``context["incident_id"]``.

    Returns:
        Whatever ``provider.analyze`` returns for the canned reply.
    """
    provider = OpenClawNemoProvider()
    # Inject the mocked client so no real network call is made.
    provider._http_client = httpx.AsyncClient(transport=_transport(payload))
    try:
        return await provider.analyze("diagnose", context={"incident_id": incident_id})
    finally:
        # Close even when analyze raises, so the AsyncClient is never leaked.
        await provider.close()


@pytest.mark.asyncio
async def test_openclaw_degraded_response_is_fallbackable():
    """An explicitly degraded proposal is reported as a failed result."""
    result = await _analyze_with_payload(
        {
            "action_title": "OpenClaw 降級調查",
            "description": "OpenClaw 下游 LLM timeout",
            "suggested_action": "investigate",
            "kubectl_command": None,
            "target_resource": "awoooi-api",
            "namespace": "awoooi-prod",
            "risk_level": "low",
            "confidence": 0.2,
            "provider": "openclaw_degraded",
            "degraded": True,
            "reasoning": "NVIDIA NIM timeout",
        },
        "inc-1",
    )

    assert result.success is False
    assert result.provider == "openclaw_nemo"
    assert "OpenClaw degraded" in (result.error or "")


@pytest.mark.asyncio
async def test_openclaw_low_confidence_response_is_fallbackable():
    """A low-confidence proposal is rejected even without a degraded flag."""
    result = await _analyze_with_payload(
        {
            "action_title": "調查服務異常",
            "risk_level": "low",
            "confidence": 0.1,
            "provider": "openclaw_nvidia_nim",
            "reasoning": "low confidence",
        },
        "inc-2",
    )

    assert result.success is False
    assert "OpenClaw degraded" in (result.error or "")


@pytest.mark.asyncio
async def test_openclaw_normal_response_stays_successful():
    """A confident, non-degraded proposal is still reported as a success."""
    result = await _analyze_with_payload(
        {
            "action_title": "重啟服務",
            "risk_level": "medium",
            "confidence": 0.8,
            "provider": "openclaw_nvidia_nim",
            "reasoning": "valid proposal",
        },
        "inc-3",
    )

    assert result.success is True
    assert result.provider == "openclaw_nemo"