fix(governance): skip non-finite slo values
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m4s
CD Pipeline / build-and-deploy (push) Successful in 3m18s
CD Pipeline / post-deploy-checks (push) Successful in 1m17s

This commit is contained in:
Your Name
2026-05-14 19:05:16 +08:00
parent d1b0ee7e96
commit 368386abc0
2 changed files with 61 additions and 7 deletions

View File

@@ -399,6 +399,7 @@ class GovernanceAgent:
2026-04-27 P3.4 by Claude — AI SLOADR-100
"""
import httpx
import math
from src.core.config import settings
@@ -457,6 +458,21 @@ class GovernanceAgent:
)
continue
value = float(result_list[0]["value"][1])
if not math.isfinite(value):
results[name] = {
"name": name,
"status": "skipped",
"error": "non_finite_value",
"reason": "prometheus_nan_or_inf",
"hint": "SLO 分母目前沒有足夠事件,等待下一個有效樣本再評估",
}
logger.warning(
"governance_slo_non_finite",
slo=name,
query=query,
value=str(result_list[0]["value"][1]),
)
continue
threshold = hard_red_lines[name]
target = slo_targets[name]
violated = value < threshold

View File

@@ -23,13 +23,6 @@ import pytest
from src.services.governance_agent import (
GovernanceAgent,
get_governance_agent,
reset_governance_agent,
run_governance_loop,
EXECUTION_FAIL_RATE_THRESHOLD,
HALLUCINATION_RATE_THRESHOLD,
KM_STALE_RATIO,
TRUST_DRIFT_THRESHOLD,
)
@@ -645,3 +638,48 @@ class TestRunSelfCheckGlobalFailureAlert:
calls = [c[0][0] for c in alerter.alert_governance.call_args_list]
assert "governance_self_failure" not in calls
class _FakePrometheusResponse:
    """Minimal stand-in for an HTTP response carrying one Prometheus sample.

    Only ``json()`` is implemented; it returns a successful query payload
    whose single result holds the value supplied at construction time.
    """

    def __init__(self, value: str) -> None:
        """Remember the raw sample value to embed in the payload.

        Args:
            value: Sample value as a string (for example ``"NaN"``); it is
                returned verbatim, without parsing.
        """
        self._value = value

    def json(self) -> dict[str, Any]:
        """Return a ``status: success`` payload with exactly one result.

        The result's ``value`` entry is a ``[timestamp, value]`` pair: the
        timestamp is a fixed constant and the value is the stored string.
        """
        return {
            "status": "success",
            "data": {"result": [{"value": [1778756604, self._value]}]},
        }
class _FakePrometheusClient:
    """Async-context-manager test double used in place of ``httpx.AsyncClient``.

    Every ``get`` call, regardless of its arguments, answers with a
    ``_FakePrometheusResponse`` wrapping the value given at construction.
    """

    def __init__(self, value: str) -> None:
        """Remember the raw sample value every fake response will carry.

        Args:
            value: Sample value as a string (for example ``"NaN"``).
        """
        self._value = value

    async def __aenter__(self) -> "_FakePrometheusClient":
        """Enter the ``async with`` block, yielding this client itself."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> bool:
        """Leave the ``async with`` block.

        Returns ``False`` so an exception raised inside the block is not
        suppressed and still propagates to the caller.
        """
        return False

    async def get(self, *args, **kwargs) -> "_FakePrometheusResponse":  # noqa: ANN002, ANN003
        """Ignore all request arguments and return the canned response."""
        return _FakePrometheusResponse(self._value)
class TestCheckSloCompliance:
    """Tests for ``check_slo_compliance`` against a stubbed Prometheus client."""

    @pytest.mark.asyncio
    @pytest.mark.parametrize("raw_value", ["NaN", "+Inf", "-Inf"])
    async def test_non_finite_prometheus_value_is_skipped_not_ok(self, raw_value):
        """A non-finite Prometheus sample must be reported as skipped, not ok.

        A Prometheus NaN means the denominator has no valid events yet, so
        the governance layer must not misjudge the SLO as ok. The same guard
        rejects every non-finite float, so positive and negative infinity
        are exercised alongside NaN.
        """
        agent = _make_agent()

        # The stub answers every query with the same non-finite sample, so
        # each SLO below is expected to hit the skip branch.
        with patch(
            "httpx.AsyncClient",
            return_value=_FakePrometheusClient(raw_value),
        ):
            result = await agent.check_slo_compliance()

        for name in (
            "autonomy_rate",
            "decision_accuracy",
            "confidence_calibration",
            "km_growth_rate",
        ):
            assert result[name]["status"] == "skipped"
            assert result[name]["reason"] == "prometheus_nan_or_inf"

        # With every SLO skipped, the summary must report missing data.
        assert result["_meta"]["status"] == "no_data"