NOTE(review): this patch was recovered from a copy whose newlines and indentation had been collapsed.
Line breaks follow the hunk headers; indentation of context lines and the position of blank context
lines are inferred — TODO confirm against the target files, or apply with `git apply --ignore-whitespace`.

diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py
index 24ea676e..9e2596bf 100644
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -399,6 +399,7 @@ class GovernanceAgent:
         2026-04-27 P3.4 by Claude — AI SLO(ADR-100)
         """
         import httpx
+        import math
 
         from src.core.config import settings
 
@@ -457,6 +458,21 @@ class GovernanceAgent:
                     )
                     continue
                 value = float(result_list[0]["value"][1])
+                if not math.isfinite(value):
+                    results[name] = {
+                        "name": name,
+                        "status": "skipped",
+                        "error": "non_finite_value",
+                        "reason": "prometheus_nan_or_inf",
+                        "hint": "SLO 分母目前沒有足夠事件,等待下一個有效樣本再評估",
+                    }
+                    logger.warning(
+                        "governance_slo_non_finite",
+                        slo=name,
+                        query=query,
+                        value=str(result_list[0]["value"][1]),
+                    )
+                    continue
                 threshold = hard_red_lines[name]
                 target = slo_targets[name]
                 violated = value < threshold
diff --git a/apps/api/tests/test_governance_agent.py b/apps/api/tests/test_governance_agent.py
index e84c0fdb..d345a402 100644
--- a/apps/api/tests/test_governance_agent.py
+++ b/apps/api/tests/test_governance_agent.py
@@ -23,13 +23,6 @@ import pytest
 
 from src.services.governance_agent import (
     GovernanceAgent,
-    get_governance_agent,
-    reset_governance_agent,
-    run_governance_loop,
-    EXECUTION_FAIL_RATE_THRESHOLD,
-    HALLUCINATION_RATE_THRESHOLD,
-    KM_STALE_RATIO,
-    TRUST_DRIFT_THRESHOLD,
 )
 
 
@@ -645,3 +638,48 @@ class TestRunSelfCheckGlobalFailureAlert:
 
         calls = [c[0][0] for c in alerter.alert_governance.call_args_list]
         assert "governance_self_failure" not in calls
+
+
+class _FakePrometheusResponse:
+    def __init__(self, value: str) -> None:
+        self._value = value
+
+    def json(self) -> dict[str, Any]:
+        return {
+            "status": "success",
+            "data": {"result": [{"value": [1778756604, self._value]}]},
+        }
+
+
+class _FakePrometheusClient:
+    def __init__(self, value: str) -> None:
+        self._value = value
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return False
+
+    async def get(self, *args, **kwargs):  # noqa: ANN002, ANN003
+        return _FakePrometheusResponse(self._value)
+
+
+class TestCheckSloCompliance:
+    @pytest.mark.asyncio
+    async def test_non_finite_prometheus_value_is_skipped_not_ok(self):
+        """Prometheus NaN 代表分母暫無有效事件,不可被治理層誤判為 ok."""
+        agent = _make_agent()
+
+        with patch("httpx.AsyncClient", return_value=_FakePrometheusClient("NaN")):
+            result = await agent.check_slo_compliance()
+
+        for name in (
+            "autonomy_rate",
+            "decision_accuracy",
+            "confidence_calibration",
+            "km_growth_rate",
+        ):
+            assert result[name]["status"] == "skipped"
+            assert result[name]["reason"] == "prometheus_nan_or_inf"
+        assert result["_meta"]["status"] == "no_data"