feat(governance): surface adr100 slo states
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m0s
CD Pipeline / build-and-deploy (push) Successful in 4m0s
CD Pipeline / post-deploy-checks (push) Successful in 1m55s

This commit is contained in:
Your Name
2026-05-14 19:57:32 +08:00
parent 6c16a7b162
commit 809bc9670b
7 changed files with 559 additions and 40 deletions

View File

@@ -20,6 +20,7 @@ from __future__ import annotations
import structlog
from fastapi import APIRouter, Query
from src.services.adr100_slo_status_service import get_adr100_slo_status_service
from src.services.ai_slo_calculator import AiSloCalculator
logger = structlog.get_logger(__name__)
@@ -50,9 +51,11 @@ async def get_ai_slo(
if cached:
data = cached.to_dict()
data["cache_hit"] = True
data["adr100"] = await get_adr100_slo_status_service().fetch_report()
return data
report = await calc.run()
data = report.to_dict()
data["cache_hit"] = False
data["adr100"] = await get_adr100_slo_status_service().fetch_report()
return data

View File

@@ -0,0 +1,278 @@
"""
Read-only ADR-100 SLO status snapshot.
GovernanceAgent.check_slo_compliance() can emit governance alerts when an SLO is
violated. This service is intentionally read-only so dashboards can show the
same Prometheus-backed state without producing Telegram/DB side effects.
"""
from __future__ import annotations

import asyncio
import math
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import Any

import httpx
import structlog

from src.core.config import settings
logger = structlog.get_logger(__name__)
@dataclass(frozen=True)
class Adr100SloDefinition:
    """Immutable description of one ADR-100 SLO and how to evaluate it.

    The first seven fields are required; the denominator-related fields are
    optional and only meaningful for ratio SLOs that need a volume check.
    """

    # Identifier reported in the metric payload.
    name: str
    # PromQL expression whose first sample is the SLO value.
    query: str
    # Threshold separating "ok" from "warning".
    target: float
    # Threshold separating "warning" from "violated".
    hard_red_line: float
    # "above" means higher values are better; anything else is treated as
    # lower-is-better by the classifier.
    direction: str
    # Display unit passed through to the payload (e.g. "percent", "count").
    unit: str
    # Human-readable evaluation window passed through to the payload.
    window: str
    # Optional PromQL expression used to estimate the event volume.
    denominator_query: str | None = None
    # Multiplier applied to the denominator value to obtain a sample count.
    denominator_window_seconds: int = 0
    # Minimum estimated sample count required before the SLO is evaluated.
    minimum_events: float = 1.0
# The SLOs evaluated by Adr100SloStatusService.fetch_report(), in the order
# they appear in the report's "metrics" list.
#
# For the three ratio SLOs the denominator query is a rate; the service
# multiplies it by denominator_window_seconds to estimate how many events the
# window contained and skips the SLO when that estimate is below
# minimum_events (default 1.0).
ADR100_SLO_DEFINITIONS: tuple[Adr100SloDefinition, ...] = (
    # Ratio SLO over a 5-minute window; volume gated on all operation-log events.
    Adr100SloDefinition(
        name="autonomy_rate",
        query="sli:autonomy_rate:5m",
        target=0.80,
        hard_red_line=0.70,
        direction="above",
        unit="percent",
        window="5m",
        denominator_query="sum(rate(automation_operation_log_total[5m]))",
        denominator_window_seconds=300,
    ),
    # Ratio SLO over a 5-minute window; volume gated on auto-executed events only.
    Adr100SloDefinition(
        name="decision_accuracy",
        query="sli:decision_accuracy:5m",
        target=0.90,
        hard_red_line=0.85,
        direction="above",
        unit="percent",
        window="5m",
        denominator_query='sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))',
        denominator_window_seconds=300,
    ),
    # Ratio SLO over a 1-hour window; volume gated on high-confidence approvals.
    Adr100SloDefinition(
        name="confidence_calibration",
        query="sli:confidence_calibration:1h",
        target=0.80,
        hard_red_line=0.70,
        direction="above",
        unit="percent",
        window="1h",
        denominator_query="sum(rate(approval_records_high_confidence_total[1h]))",
        denominator_window_seconds=3600,
    ),
    # Absolute count SLO: no denominator, so it is never skipped for low
    # volume. The query prefers the first series and falls back to the second
    # via PromQL `or`.
    Adr100SloDefinition(
        name="km_growth_rate",
        query="max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
        target=20.0,
        hard_red_line=5.0,
        direction="above",
        unit="count",
        window="24h",
    ),
)
class Adr100SloStatusService:
    """Fetch ADR-100 SLO status from Prometheus without writing governance events."""

    async def fetch_report(self) -> dict[str, Any]:
        """Evaluate every ADR-100 SLO definition and return a report dict.

        Returns:
            A dict with the schema version, source, evaluation timestamp
            (UTC, ISO-8601), the aggregated ``overall_status``, the fraction
            of evaluable metrics that are "ok" (``None`` when nothing is
            evaluable), counts, and the per-metric payloads in the same order
            as ``ADR100_SLO_DEFINITIONS``.
        """
        prom_url = getattr(
            settings,
            "PROMETHEUS_URL",
            "http://prometheus.observability.svc:9090",
        ).rstrip("/")

        async with httpx.AsyncClient(timeout=5.0) as client:
            # The definitions are independent of each other, so query them
            # concurrently instead of stacking one 5 s timeout per request.
            # gather() returns results in argument order, so the report keeps
            # the definition order.
            metrics: list[dict[str, Any]] = list(
                await asyncio.gather(
                    *(
                        self._fetch_metric(client, prom_url, definition)
                        for definition in ADR100_SLO_DEFINITIONS
                    )
                )
            )

        evaluable = [metric for metric in metrics if metric.get("evaluable")]
        ok_count = sum(1 for metric in evaluable if metric.get("status") == "ok")
        overall_compliance = (ok_count / len(evaluable)) if evaluable else None
        overall_status = _overall_status(metrics, evaluable)
        return {
            "schema_version": "adr100_slo_status_v1",
            "source": "prometheus",
            "evaluated_at": datetime.now(UTC).isoformat(),
            "overall_status": overall_status,
            "overall_compliance": overall_compliance,
            "evaluable_count": len(evaluable),
            "metric_count": len(metrics),
            "metrics": metrics,
        }

    async def _fetch_metric(
        self,
        client: httpx.AsyncClient,
        prom_url: str,
        definition: Adr100SloDefinition,
    ) -> dict[str, Any]:
        """Query and classify a single SLO.

        Args:
            client: Open HTTP client used for the Prometheus requests.
            prom_url: Prometheus base URL without a trailing slash.
            definition: The SLO to evaluate.

        Returns:
            The metric payload built by ``_metric_payload``. Its status is
            "no_data" when a query fails or returns nothing,
            "skipped_low_volume" when the estimated sample count is below
            ``definition.minimum_events`` or the value is NaN/Inf, and
            otherwise the result of ``_classify_status``.
        """
        denominator_value: float | None = None
        sample_count: float | None = None

        if definition.denominator_query:
            denominator_result = await _query_prometheus_value(
                client,
                prom_url,
                definition.denominator_query,
            )
            if denominator_result["status"] != "ok":
                return _metric_payload(
                    definition,
                    value=None,
                    status="no_data",
                    reason=denominator_result["reason"],
                    denominator_value=None,
                    sample_count=None,
                )
            denominator_value = float(denominator_result["value"])
            # The denominator is multiplied by the window length to estimate
            # the number of events in the window.
            sample_count = denominator_value * definition.denominator_window_seconds
            if sample_count < definition.minimum_events:
                # Too few events: skip the value query entirely.
                return _metric_payload(
                    definition,
                    value=None,
                    status="skipped_low_volume",
                    reason="denominator_below_minimum_events",
                    denominator_value=denominator_value,
                    sample_count=sample_count,
                )

        value_result = await _query_prometheus_value(client, prom_url, definition.query)
        if value_result["status"] != "ok":
            # A NaN/Inf value is reported as low volume rather than missing data.
            status = (
                "skipped_low_volume"
                if value_result["reason"] == "prometheus_nan_or_inf"
                else "no_data"
            )
            return _metric_payload(
                definition,
                value=None,
                status=status,
                reason=value_result["reason"],
                denominator_value=denominator_value,
                sample_count=sample_count,
            )

        value = float(value_result["value"])
        status = _classify_status(value, definition)
        return _metric_payload(
            definition,
            value=value,
            status=status,
            reason=None,
            denominator_value=denominator_value,
            # Without a denominator the value itself is reported as the
            # sample count.
            sample_count=sample_count if sample_count is not None else value,
        )
async def _query_prometheus_value(
    client: httpx.AsyncClient,
    prom_url: str,
    query: str,
) -> dict[str, Any]:
    """Run one instant query and reduce it to a small status dict.

    Args:
        client: Open HTTP client used to issue the GET request.
        prom_url: Prometheus base URL without a trailing slash.
        query: PromQL expression to evaluate.

    Returns:
        ``{"status": "ok", "value": float}`` for a finite first sample.
        Otherwise a dict with ``status`` of "error", "no_data" or "skipped"
        and a ``reason`` code; the "skipped" case (NaN/Inf) also carries the
        ``raw_value`` string. Never raises: any exception is logged and
        reported as ``prometheus_query_error``.
    """
    endpoint = f"{prom_url}/api/v1/query"
    try:
        response = await client.get(endpoint, params={"query": query})
        payload = response.json()
        if payload.get("status") != "success":
            return {"status": "error", "reason": "prometheus_query_failed"}

        series = payload.get("data", {}).get("result", [])
        if not series:
            return {
                "status": "no_data",
                "reason": "prometheus_empty_result_metric_not_emitted",
            }

        # Only the first series is considered; its sample is [timestamp, value].
        raw_value = series[0]["value"][1]
        value = float(raw_value)
        if math.isfinite(value):
            return {"status": "ok", "value": value}
        return {
            "status": "skipped",
            "reason": "prometheus_nan_or_inf",
            "raw_value": raw_value,
        }
    except Exception as exc:
        logger.warning("adr100_slo_prometheus_query_error", query=query, error=str(exc))
        return {"status": "error", "reason": "prometheus_query_error"}
def _metric_payload(
    definition: Adr100SloDefinition,
    *,
    value: float | None,
    status: str,
    reason: str | None,
    denominator_value: float | None,
    sample_count: float | None,
) -> dict[str, Any]:
    """Build the per-metric dict exposed in the report.

    Static fields are copied from ``definition``; the keyword arguments carry
    the outcome of this evaluation. ``evaluable`` is true only for the three
    statuses that result from an actual classification.
    """
    is_evaluable = status in {"ok", "warning", "violated"}
    payload: dict[str, Any] = {
        "name": definition.name,
        "query": definition.query,
        "value": value,
        "target": definition.target,
        "hard_red_line": definition.hard_red_line,
        "direction": definition.direction,
        "unit": definition.unit,
        "window": definition.window,
        "status": status,
        "evaluable": is_evaluable,
        "reason": reason,
        "denominator_query": definition.denominator_query,
        "denominator_value": denominator_value,
        "sample_count": sample_count,
    }
    return payload
def _classify_status(value: float, definition: Adr100SloDefinition) -> str:
    """Map a finite SLO value to "ok", "warning" or "violated".

    With direction "above" a value below the hard red line is a violation and
    a value below the target is a warning. Any other direction is treated as
    lower-is-better, so the comparisons are mirrored. Values exactly on a
    threshold count as meeting it.
    """
    if definition.direction == "above":
        breaches_red_line = value < definition.hard_red_line
        misses_target = value < definition.target
    else:
        breaches_red_line = value > definition.hard_red_line
        misses_target = value > definition.target

    if breaches_red_line:
        return "violated"
    return "warning" if misses_target else "ok"
def _overall_status(metrics: list[dict[str, Any]], evaluable: list[dict[str, Any]]) -> str:
    """Collapse the per-metric statuses into one report-level status.

    Precedence: any "violated", then any "warning". If at least one metric is
    evaluable the result is "partial" when some metric was skipped for low
    volume and "ok" otherwise. With nothing evaluable it is "no_data" when
    any metric reported that, else "skipped_low_volume".
    """
    statuses = {metric.get("status") for metric in metrics}

    for severity in ("violated", "warning"):
        if severity in statuses:
            return severity

    if evaluable:
        return "partial" if "skipped_low_volume" in statuses else "ok"
    return "no_data" if "no_data" in statuses else "skipped_low_volume"
# Lazily created module-wide instance; populated on first accessor call.
_adr100_slo_status_service: Adr100SloStatusService | None = None


def get_adr100_slo_status_service() -> Adr100SloStatusService:
    """Return the shared ``Adr100SloStatusService``, creating it on first use."""
    global _adr100_slo_status_service
    instance = _adr100_slo_status_service
    if instance is None:
        instance = Adr100SloStatusService()
        _adr100_slo_status_service = instance
    return instance

View File

@@ -0,0 +1,89 @@
from __future__ import annotations
from typing import Any
import pytest
from src.services.adr100_slo_status_service import Adr100SloStatusService
class _FakePrometheusResponse:
    """Minimal stand-in for an HTTP response that only supports ``json()``."""

    def __init__(self, payload: dict[str, Any]) -> None:
        # Stored as-is and handed back unchanged by json().
        self._payload = payload

    def json(self) -> dict[str, Any]:
        """Return the payload given at construction."""
        return self._payload
class _FakePrometheusClient:
    """Async-context-manager fake that answers Prometheus queries from a dict.

    ``values`` maps a PromQL query string to the sample value to return.
    Queries missing from the mapping produce a successful but empty result.
    """

    def __init__(self, values: dict[str, str]) -> None:
        self.values = values

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # Returning False lets any exception raised in the block propagate.
        return False

    async def get(self, *args, **kwargs):  # noqa: ANN002, ANN003
        """Answer a GET by looking up the ``query`` request parameter."""
        params = kwargs.get("params", {})
        query = str(params.get("query", ""))
        sample = self.values.get(query)
        result = [] if sample is None else [{"value": [1778756604, sample]}]
        return _FakePrometheusResponse({
            "status": "success",
            "data": {"result": result},
        })
@pytest.mark.asyncio
async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
    """Zero-rate denominators skip the ratio SLOs; the count SLO still evaluates."""
    prometheus_values = {
        "sum(rate(automation_operation_log_total[5m]))": "0",
        'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
        "sum(rate(approval_records_high_confidence_total[1h]))": "0",
        "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "24",
    }

    def _client_factory(*args, **kwargs):
        return _FakePrometheusClient(prometheus_values)

    monkeypatch.setattr("httpx.AsyncClient", _client_factory)

    report = await Adr100SloStatusService().fetch_report()
    metrics_by_name = {entry["name"]: entry for entry in report["metrics"]}

    decision_accuracy = metrics_by_name["decision_accuracy"]
    assert decision_accuracy["status"] == "skipped_low_volume"
    assert decision_accuracy["evaluable"] is False
    assert metrics_by_name["confidence_calibration"]["status"] == "skipped_low_volume"

    km_growth_rate = metrics_by_name["km_growth_rate"]
    assert km_growth_rate["status"] == "ok"
    assert km_growth_rate["value"] == 24

    # Only km_growth_rate is evaluable and it is "ok", hence full compliance
    # with a "partial" overall status.
    assert report["overall_status"] == "partial"
    assert report["overall_compliance"] == 1.0
@pytest.mark.asyncio
async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
    """Values below the hard red line are reported as violated."""
    prometheus_values = {
        "sum(rate(automation_operation_log_total[5m]))": "0.02",
        "sli:autonomy_rate:5m": "0.5",
        'sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))': "0",
        "sum(rate(approval_records_high_confidence_total[1h]))": "0",
        "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)": "3",
    }

    def _client_factory(*args, **kwargs):
        return _FakePrometheusClient(prometheus_values)

    monkeypatch.setattr("httpx.AsyncClient", _client_factory)

    report = await Adr100SloStatusService().fetch_report()
    metrics_by_name = {entry["name"]: entry for entry in report["metrics"]}

    autonomy_rate = metrics_by_name["autonomy_rate"]
    assert autonomy_rate["status"] == "violated"
    # 0.02 events/s over the 300 s window.
    assert autonomy_rate["sample_count"] == 6
    assert metrics_by_name["km_growth_rate"]["status"] == "violated"
    assert report["overall_status"] == "violated"

View File

@@ -1354,15 +1354,38 @@
"comingSoon": "This tab is coming soon",
"slo": {
"kpi": {
"autonomy_rate": "Autonomy Rate",
"decision_accuracy": "Decision Accuracy",
"confidence_calibration": "Confidence Calibration",
"km_growth_rate": "KM Growth Rate",
"mcp_call_diversity": "MCP Call Diversity",
"auto_execute_success_rate": "Auto Execute Success",
"human_override_rate": "Human Override Rate",
"verifier_false_neg_rate": "Verifier False Negative",
"current": "Current",
"target": "Target",
"sparkline": "7-day trend",
"loading": "Loading...",
"error": "Failed to load",
"noData": "No data"
"noData": "No data",
"sampleCount": "Samples {count}",
"window": "Window {window}",
"state": {
"ok": "OK",
"warning": "Below target",
"violated": "Hard red line",
"skipped_low_volume": "Low sample wait",
"no_data": "No data",
"error": "Query failed",
"partial": "Partially evaluable"
},
"reason": {
"none": "None",
"denominator_below_minimum_events": "Denominator events too low",
"prometheus_nan_or_inf": "Prometheus has no valid denominator yet",
"prometheus_empty_result_metric_not_emitted": "Prometheus has not returned the metric yet",
"unknown": "Reason pending"
}
},
"chart": {
"title": "30-day Violation Timeline",

View File

@@ -1355,15 +1355,38 @@
"comingSoon": "本 Tab 即將上線",
"slo": {
"kpi": {
"autonomy_rate": "自主化率",
"decision_accuracy": "決策準確率",
"confidence_calibration": "信心校準",
"km_growth_rate": "KM 成長率",
"mcp_call_diversity": "MCP 呼叫多樣性",
"auto_execute_success_rate": "自動執行成功率",
"human_override_rate": "人工推翻率",
"verifier_false_neg_rate": "驗證漏判率",
"current": "當前",
"target": "目標",
"sparkline": "7 日趨勢",
"loading": "載入中...",
"error": "無法載入",
"noData": "暫無資料"
"noData": "暫無資料",
"sampleCount": "樣本 {count}",
"window": "視窗 {window}",
"state": {
"ok": "正常",
"warning": "低於目標",
"violated": "硬紅線",
"skipped_low_volume": "低樣本等待",
"no_data": "沒有資料",
"error": "查詢失敗",
"partial": "部分可評估"
},
"reason": {
"none": "無",
"denominator_below_minimum_events": "分母事件不足",
"prometheus_nan_or_inf": "Prometheus 暫無有效分母",
"prometheus_empty_result_metric_not_emitted": "Prometheus 尚未回傳指標",
"unknown": "原因待查"
}
},
"chart": {
"title": "30 日違反事件時序",

View File

@@ -31,11 +31,32 @@ const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? ''
// =============================================================================
interface SloApiResponse {
metrics?: {
metrics?: Array<{
name: SloMetric['name']
value: number | null
threshold: number
direction: 'above' | 'below'
sample_count: number
violated: boolean
}> | {
decision_accuracy?: { current: number; target: number; status: string; sparkline?: number[] }
km_growth_rate?: { current: number; target: number; status: string; sparkline?: number[] }
mcp_call_diversity?: { current: number; target: number; status: string; sparkline?: number[] }
}
adr100?: {
overall_status?: string
overall_compliance?: number | null
metrics?: Array<{
name: SloMetric['name']
value: number | null
target: number
status: 'ok' | 'warning' | 'violated' | 'skipped_low_volume' | 'no_data' | 'error'
unit: 'percent' | 'count'
sample_count?: number | null
window?: string
reason?: string | null
}>
}
overall_compliance?: number
computed_at?: string
}
@@ -51,15 +72,55 @@ interface SummaryApiResponse {
// =============================================================================
/**
 * Translate a backend status string into the card's display status.
 * Both 'healthy' and 'ok' are healthy; any unrecognised value is critical.
 */
function mapStatus(s: string): SloMetric['status'] {
  switch (s) {
    case 'healthy':
    case 'ok':
      return 'healthy'
    case 'warning':
      return 'warning'
    case 'skipped_low_volume':
      return 'syncing'
    case 'no_data':
      return 'idle'
    default:
      return 'critical'
  }
}
function buildMetrics(api: SloApiResponse): SloMetric[] {
const adr100Metrics = api.adr100?.metrics
if (adr100Metrics?.length) {
const order: SloMetric['name'][] = ['autonomy_rate', 'decision_accuracy', 'confidence_calibration', 'km_growth_rate']
const byName = new Map(adr100Metrics.map(metric => [metric.name, metric]))
const built: SloMetric[] = []
order.forEach(name => {
const entry = byName.get(name)
if (!entry) return
built.push({
name,
current: entry.value ?? null,
target: entry.target,
status: mapStatus(entry.status),
state: entry.status,
unit: entry.unit === 'count' ? 'count' : '%',
sparkline: [],
sampleCount: entry.sample_count ?? null,
window: entry.window,
reason: entry.reason ?? null,
})
})
return built
}
if (Array.isArray(api.metrics)) {
return api.metrics.map(entry => ({
name: entry.name,
current: entry.value,
target: entry.threshold,
status: entry.value == null ? 'syncing' : entry.violated ? 'critical' : 'healthy',
state: entry.value == null ? 'skipped_low_volume' : entry.violated ? 'violated' : 'ok',
unit: '%',
sparkline: [],
sampleCount: entry.sample_count,
}))
}
const m = api.metrics ?? {}
const names: Array<SloMetric['name']> = ['decision_accuracy', 'km_growth_rate', 'mcp_call_diversity']
return names.map(name => {
if (Array.isArray(m)) return []
const names: Array<'decision_accuracy' | 'km_growth_rate' | 'mcp_call_diversity'> = ['decision_accuracy', 'km_growth_rate', 'mcp_call_diversity']
return names.map((name): SloMetric => {
const entry = m[name]
return {
name,
@@ -111,7 +172,7 @@ export function SloTab() {
}, [])
const metrics = sloData ? buildMetrics(sloData) : []
const compliance = sloData?.overall_compliance ?? null
const compliance = sloData?.adr100?.overall_compliance ?? sloData?.overall_compliance ?? null
const chartData: ViolationDataPoint[] = summaryData?.data ?? []
const eventTypes: string[] = summaryData?.event_types ?? []
@@ -169,7 +230,7 @@ export function SloTab() {
className="slo-kpi-grid"
>
{sloLoading
? [0, 1, 2].map(i => <SloKpiCard key={i} metric={{ name: 'decision_accuracy', current: null, target: 0.9, status: 'warning' }} loading />)
? [0, 1, 2, 3].map(i => <SloKpiCard key={i} metric={{ name: 'decision_accuracy', current: null, target: 0.9, status: 'warning' }} loading />)
: metrics.map(m => <SloKpiCard key={m.name} metric={m} />)
}
</div>

View File

@@ -24,12 +24,24 @@ import { useTranslations } from 'next-intl'
// =============================================================================
export interface SloMetric {
name: 'decision_accuracy' | 'km_growth_rate' | 'mcp_call_diversity'
name:
| 'autonomy_rate'
| 'decision_accuracy'
| 'confidence_calibration'
| 'km_growth_rate'
| 'mcp_call_diversity'
| 'auto_execute_success_rate'
| 'human_override_rate'
| 'verifier_false_neg_rate'
current: number | null
target: number
status: 'healthy' | 'warning' | 'critical'
unit?: string
status: 'healthy' | 'warning' | 'critical' | 'idle' | 'syncing'
state?: 'ok' | 'warning' | 'violated' | 'skipped_low_volume' | 'no_data' | 'error' | 'partial'
unit?: '%' | 'count'
sparkline?: number[] // 7 points, most recent last
sampleCount?: number | null
window?: string
reason?: string | null
}
interface SloKpiCardProps {
@@ -45,6 +57,22 @@ const statusColor: Record<SloMetric['status'], string> = {
healthy: '#22C55E',
warning: '#F59E0B',
critical: '#FF3300',
idle: '#87867f',
syncing: '#3B82F6',
}
/**
 * Format a number with fewer decimals as it grows:
 * no decimals from 100 up, one from 10 up, two otherwise.
 */
function formatCompactNumber(value: number): string {
  const fractionDigits = value >= 100 ? 0 : value >= 10 ? 1 : 2
  return value.toFixed(fractionDigits)
}
/**
 * Pick the translation key for a backend reason code.
 * A missing or empty reason yields 'none'; a reason without a dedicated
 * translation yields 'unknown'.
 */
function reasonKey(reason?: string | null): string {
  if (!reason) return 'none'
  const translatedReasons: string[] = [
    'denominator_below_minimum_events',
    'prometheus_nan_or_inf',
    'prometheus_empty_result_metric_not_emitted',
  ]
  return translatedReasons.includes(reason) ? reason : 'unknown'
}
// =============================================================================
@@ -73,21 +101,24 @@ export function SloKpiCard({ metric, loading = false }: SloKpiCardProps) {
if (loading) return <KpiSkeleton />
const color = statusColor[metric.status]
const orbStatus: StatusType = metric.status === 'healthy' ? 'healthy'
: metric.status === 'warning' ? 'warning'
: 'critical'
const orbStatus: StatusType = metric.status
const formattedValue = metric.current == null
? '--'
: metric.unit === '%'
? `${(metric.current * 100).toFixed(1)}%`
: metric.current.toFixed(2)
: metric.current.toFixed(0)
const formattedTarget = metric.unit === '%'
? `${(metric.target * 100).toFixed(0)}%`
: metric.target.toFixed(2)
: metric.target.toFixed(0)
const sparkData = (metric.sparkline ?? Array(7).fill(0)).map((v, i) => ({ i, v }))
const stateLabel = metric.state ? t(`state.${metric.state}`) : ''
const reasonLabel = metric.reason ? t(`reason.${reasonKey(metric.reason)}`) : null
const sampleLabel = metric.sampleCount == null
? null
: t('sampleCount', { count: formatCompactNumber(metric.sampleCount) })
return (
<GlassCard variant="elevated" padding="md" className="min-w-0 flex-1">
@@ -114,35 +145,46 @@ export function SloKpiCard({ metric, loading = false }: SloKpiCardProps) {
color,
lineHeight: 1,
marginBottom: 4,
letterSpacing: '-0.5px',
letterSpacing: 0,
}}>
{formattedValue}
</div>
{/* Target + sparkline row */}
<div style={{ display: 'flex', alignItems: 'flex-end', justifyContent: 'space-between' }}>
<span style={{
fontFamily: "'DM Mono', monospace",
fontSize: 10,
color: '#87867f',
}}>
{t('target')} {formattedTarget}
</span>
<div style={{ display: 'flex', flexDirection: 'column', gap: 8 }}>
{/* Target + sparkline row */}
<div style={{ display: 'flex', alignItems: 'flex-end', justifyContent: 'space-between', gap: 8 }}>
<span style={{
fontFamily: "'DM Mono', monospace",
fontSize: 10,
color: '#87867f',
}}>
{t('target')} {formattedTarget}
</span>
{/* Sparkline 80×24px */}
<div style={{ width: 80, height: 24 }} aria-label={t('sparkline')}>
<ResponsiveContainer width="100%" height="100%">
<LineChart data={sparkData} margin={{ top: 2, right: 0, bottom: 2, left: 0 }}>
<Line
type="monotone"
dataKey="v"
stroke={color}
strokeWidth={1.5}
dot={false}
isAnimationActive={false}
/>
</LineChart>
</ResponsiveContainer>
{/* Sparkline 80×24px */}
<div style={{ width: 80, height: 24, flexShrink: 0 }} aria-label={t('sparkline')}>
<ResponsiveContainer width="100%" height="100%">
<LineChart data={sparkData} margin={{ top: 2, right: 0, bottom: 2, left: 0 }}>
<Line
type="monotone"
dataKey="v"
stroke={color}
strokeWidth={1.5}
dot={false}
isAnimationActive={false}
/>
</LineChart>
</ResponsiveContainer>
</div>
</div>
<div style={{ display: 'flex', flexDirection: 'column', gap: 3, minHeight: 28 }}>
<span style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color }}>
{stateLabel}
</span>
<span style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', lineHeight: 1.35 }}>
{reasonLabel ?? sampleLabel ?? (metric.window ? t('window', { window: metric.window }) : '')}
</span>
</div>
</div>
</GlassCard>