fix(governance): skip non-finite slo values
This commit is contained in:
@@ -399,6 +399,7 @@ class GovernanceAgent:
|
||||
2026-04-27 P3.4 by Claude — AI SLO(ADR-100)
|
||||
"""
|
||||
import httpx
|
||||
import math
|
||||
|
||||
from src.core.config import settings
|
||||
|
||||
@@ -457,6 +458,21 @@ class GovernanceAgent:
|
||||
)
|
||||
continue
|
||||
value = float(result_list[0]["value"][1])
|
||||
if not math.isfinite(value):
|
||||
results[name] = {
|
||||
"name": name,
|
||||
"status": "skipped",
|
||||
"error": "non_finite_value",
|
||||
"reason": "prometheus_nan_or_inf",
|
||||
"hint": "SLO 分母目前沒有足夠事件,等待下一個有效樣本再評估",
|
||||
}
|
||||
logger.warning(
|
||||
"governance_slo_non_finite",
|
||||
slo=name,
|
||||
query=query,
|
||||
value=str(result_list[0]["value"][1]),
|
||||
)
|
||||
continue
|
||||
threshold = hard_red_lines[name]
|
||||
target = slo_targets[name]
|
||||
violated = value < threshold
|
||||
|
||||
@@ -23,13 +23,6 @@ import pytest
|
||||
|
||||
from src.services.governance_agent import (
|
||||
GovernanceAgent,
|
||||
get_governance_agent,
|
||||
reset_governance_agent,
|
||||
run_governance_loop,
|
||||
EXECUTION_FAIL_RATE_THRESHOLD,
|
||||
HALLUCINATION_RATE_THRESHOLD,
|
||||
KM_STALE_RATIO,
|
||||
TRUST_DRIFT_THRESHOLD,
|
||||
)
|
||||
|
||||
|
||||
@@ -645,3 +638,48 @@ class TestRunSelfCheckGlobalFailureAlert:
|
||||
|
||||
calls = [c[0][0] for c in alerter.alert_governance.call_args_list]
|
||||
assert "governance_self_failure" not in calls
|
||||
|
||||
|
||||
class _FakePrometheusResponse:
    """Minimal fake HTTP response that carries a single Prometheus sample.

    Only ``json()`` is implemented; the payload always reports
    ``"status": "success"`` with exactly one result entry.
    """

    def __init__(self, value: str) -> None:
        """Remember the raw sample value (a string, e.g. ``"NaN"``)."""
        self._value = value

    def json(self) -> dict[str, Any]:
        """Return the canned payload.

        The single result's ``"value"`` is a ``[timestamp, value]`` pair,
        with a fixed timestamp and the string passed to the constructor.
        """
        return {
            "status": "success",
            "data": {"result": [{"value": [1778756604, self._value]}]},
        }
|
||||
|
||||
|
||||
class _FakePrometheusClient:
    """Fake async HTTP client whose every ``get`` returns one fixed sample.

    Usable with ``async with``: entering yields the client itself and
    exiting never suppresses exceptions.
    """

    def __init__(self, value: str) -> None:
        """Remember the raw sample value handed to every fake response."""
        self._value = value

    async def __aenter__(self) -> "_FakePrometheusClient":
        """Enter the async context and return this client."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> bool:
        """Leave the async context; ``False`` lets any exception propagate."""
        return False

    async def get(self, *args, **kwargs):  # noqa: ANN002, ANN003
        """Ignore all arguments and return a response wrapping the stored value."""
        return _FakePrometheusResponse(self._value)
|
||||
|
||||
|
||||
class TestCheckSloCompliance:
    """Tests for ``check_slo_compliance`` on the agent built by ``_make_agent()``."""

    @pytest.mark.asyncio
    async def test_non_finite_prometheus_value_is_skipped_not_ok(self):
        """A Prometheus NaN means the denominator has no valid events yet.

        The governance layer must not misreport such a sample as ok.
        """
        agent = _make_agent()

        # Every query made through httpx.AsyncClient answers with "NaN".
        with patch("httpx.AsyncClient", return_value=_FakePrometheusClient("NaN")):
            result = await agent.check_slo_compliance()

        for name in (
            "autonomy_rate",
            "decision_accuracy",
            "confidence_calibration",
            "km_growth_rate",
        ):
            assert result[name]["status"] == "skipped"
            assert result[name]["reason"] == "prometheus_nan_or_inf"
        assert result["_meta"]["status"] == "no_data"
|
||||
|
||||
Reference in New Issue
Block a user