fix(flywheel): fallback on OpenClaw degraded responses
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m56s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m56s
This commit is contained in:
@@ -21,7 +21,7 @@ import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.services.ai_providers.interfaces import AIProvider, AIResult, is_provider_enabled_by_env
|
||||
from src.services.ai_providers.interfaces import AIResult, is_provider_enabled_by_env
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
settings = get_settings()
|
||||
@@ -45,6 +45,23 @@ def _to_serializable(obj: Any) -> Any:
|
||||
return str(obj)
|
||||
|
||||
|
||||
def _confidence_value(data: dict[str, Any]) -> float:
|
||||
try:
|
||||
return float(data.get("confidence", 0.0))
|
||||
except (TypeError, ValueError):
|
||||
return 0.0
|
||||
|
||||
|
||||
def _is_degraded_response(data: dict[str, Any]) -> bool:
    """
    Tell whether an OpenClaw payload should be rejected as degraded.

    OpenClaw can answer with a well-formed but degraded proposal after its
    downstream NIM/Ollama call timed out. Reporting such a payload as
    degraded lets the caller fall back to Gemini or Claude rather than
    accept a low-confidence stub.

    A payload counts as degraded when any of these hold:
    - its ``degraded`` field is truthy;
    - its ``provider`` field, as a string, equals ``"openclaw_degraded"``;
    - its confidence, as read by ``_confidence_value``, is below 0.3.
    """
    provider_name = str(data.get("provider", ""))

    if data.get("degraded"):
        return True
    if provider_name == "openclaw_degraded":
        return True
    return _confidence_value(data) < 0.3
|
||||
|
||||
|
||||
class OpenClawNemoProvider:
|
||||
"""
|
||||
OpenClaw 委派 Provider (188 → NVIDIA NIM)
|
||||
@@ -137,6 +154,22 @@ class OpenClawNemoProvider:
|
||||
result_json = _json.dumps(data, ensure_ascii=False)
|
||||
latency = (time.perf_counter() - start) * 1000
|
||||
|
||||
if _is_degraded_response(data):
|
||||
reason = str(data.get("reasoning") or data.get("description") or "OpenClaw degraded response")
|
||||
logger.warning(
|
||||
"openclaw_nemo_degraded_response",
|
||||
provider=data.get("provider"),
|
||||
confidence=data.get("confidence"),
|
||||
latency_ms=round(latency, 1),
|
||||
)
|
||||
return AIResult(
|
||||
raw_response=result_json,
|
||||
success=False,
|
||||
provider=self.name,
|
||||
latency_ms=latency,
|
||||
error=f"OpenClaw degraded: {reason[:200]}",
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"openclaw_nemo_provider_success",
|
||||
confidence=data.get("confidence", 0),
|
||||
|
||||
@@ -364,6 +364,19 @@ class OpenClawService:
|
||||
logger.warning("openclaw_analyze_invalid_response", incident_id=incident_id)
|
||||
return None
|
||||
|
||||
try:
|
||||
confidence_value = float(data.get("confidence", 0.0))
|
||||
except (TypeError, ValueError):
|
||||
confidence_value = 0.0
|
||||
if data.get("degraded") or data.get("provider") == "openclaw_degraded" or confidence_value < 0.3:
|
||||
logger.warning(
|
||||
"openclaw_analyze_degraded_response",
|
||||
incident_id=incident_id,
|
||||
provider=data.get("provider"),
|
||||
confidence=data.get("confidence"),
|
||||
)
|
||||
return None
|
||||
|
||||
logger.info(
|
||||
"openclaw_analyze_success",
|
||||
incident_id=incident_id,
|
||||
|
||||
74
apps/api/tests/test_openclaw_nemo_provider.py
Normal file
74
apps/api/tests/test_openclaw_nemo_provider.py
Normal file
@@ -0,0 +1,74 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from src.services.ai_providers.openclaw_nemo import OpenClawNemoProvider
|
||||
|
||||
|
||||
def _transport(payload: dict) -> httpx.MockTransport:
    """Build a mock transport that answers every request with HTTP 200 and ``payload`` as the JSON body."""

    def respond(_request: httpx.Request) -> httpx.Response:
        # The incoming request is ignored; the canned payload is always returned.
        return httpx.Response(200, json=payload)

    mock = httpx.MockTransport(respond)
    return mock
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_openclaw_degraded_response_is_fallbackable():
    """An explicitly degraded payload must yield a failed result so fallback can run."""
    provider = OpenClawNemoProvider()
    provider._http_client = httpx.AsyncClient(transport=_transport({
        "action_title": "OpenClaw 降級調查",
        "description": "OpenClaw 下游 LLM timeout",
        "suggested_action": "investigate",
        "kubectl_command": None,
        "target_resource": "awoooi-api",
        "namespace": "awoooi-prod",
        "risk_level": "low",
        "confidence": 0.2,
        "provider": "openclaw_degraded",
        "degraded": True,
        "reasoning": "NVIDIA NIM timeout",
    }))

    try:
        result = await provider.analyze("diagnose", context={"incident_id": "inc-1"})
    finally:
        # Close in a finally block so cleanup still runs if analyze() raises.
        await provider.close()

    assert result.success is False
    assert result.provider == "openclaw_nemo"
    assert "OpenClaw degraded" in (result.error or "")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_openclaw_low_confidence_response_is_fallbackable():
    """A payload with no degraded flag but confidence 0.1 must still yield a failed result."""
    provider = OpenClawNemoProvider()
    provider._http_client = httpx.AsyncClient(transport=_transport({
        "action_title": "調查服務異常",
        "risk_level": "low",
        "confidence": 0.1,
        "provider": "openclaw_nvidia_nim",
        "reasoning": "low confidence",
    }))

    try:
        result = await provider.analyze("diagnose", context={"incident_id": "inc-2"})
    finally:
        # Close in a finally block so cleanup still runs if analyze() raises.
        await provider.close()

    assert result.success is False
    assert "OpenClaw degraded" in (result.error or "")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_openclaw_normal_response_stays_successful():
    """A non-degraded payload with confidence 0.8 must still be reported as a success."""
    provider = OpenClawNemoProvider()
    provider._http_client = httpx.AsyncClient(transport=_transport({
        "action_title": "重啟服務",
        "risk_level": "medium",
        "confidence": 0.8,
        "provider": "openclaw_nvidia_nim",
        "reasoning": "valid proposal",
    }))

    try:
        result = await provider.analyze("diagnose", context={"incident_id": "inc-3"})
    finally:
        # Close in a finally block so cleanup still runs if analyze() raises.
        await provider.close()

    assert result.success is True
    assert result.provider == "openclaw_nemo"
|
||||
Reference in New Issue
Block a user