feat(governance): surface verification coverage
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m9s
CD Pipeline / build-and-deploy (push) Successful in 3m49s
CD Pipeline / post-deploy-checks (push) Successful in 1m17s

This commit is contained in:
Your Name
2026-05-14 20:28:53 +08:00
parent bc1a11e373
commit 485c58d085
5 changed files with 447 additions and 6 deletions

View File

@@ -10,13 +10,15 @@ from __future__ import annotations
import math
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import Any
import httpx
import structlog
from sqlalchemy import text
from src.core.config import settings
from src.db.base import get_db_context
from src.utils.timezone import now_taipei_iso
logger = structlog.get_logger(__name__)
@@ -99,17 +101,19 @@ class Adr100SloStatusService:
evaluable = [metric for metric in metrics if metric.get("evaluable")]
ok_count = sum(1 for metric in evaluable if metric.get("status") == "ok")
overall_compliance = (ok_count / len(evaluable)) if evaluable else None
overall_status = _overall_status(metrics, evaluable)
verification_coverage = await self._fetch_verification_coverage()
overall_status = _overall_status(metrics, evaluable, verification_coverage)
return {
"schema_version": "adr100_slo_status_v1",
"source": "prometheus",
"evaluated_at": datetime.now(UTC).isoformat(),
"source": "prometheus+postgresql",
"evaluated_at": now_taipei_iso(),
"overall_status": overall_status,
"overall_compliance": overall_compliance,
"evaluable_count": len(evaluable),
"metric_count": len(metrics),
"metrics": metrics,
"verification_coverage": verification_coverage,
}
async def _fetch_metric(
@@ -176,6 +180,116 @@ class Adr100SloStatusService:
sample_count=sample_count if sample_count is not None else value,
)
async def _fetch_verification_coverage(self) -> dict[str, Any]:
    """Load the 24h verification-coverage summary from PostgreSQL.

    Runs the summary query and the "recent unverified" query inside one
    database context and hands both results to
    ``_build_verification_coverage_payload``.

    Returns:
        The payload built from the query rows, or, when opening the database
        context or running either query raises, a payload with ``status``
        ``"error"`` whose counters are zero and whose rates/timestamps are
        ``None``.
    """
    try:
        async with get_db_context() as session:
            summary_result = await session.execute(text(_VERIFICATION_COVERAGE_SQL))
            summary = summary_result.mappings().one()
            backlog_result = await session.execute(
                text(_VERIFICATION_COVERAGE_RECENT_SQL)
            )
            backlog = backlog_result.mappings().all()
    except Exception as exc:
        # Best effort: a database failure is logged and reported in the payload
        # rather than propagated to the caller.
        logger.warning("adr100_verification_coverage_query_error", error=str(exc))
        failure: dict[str, Any] = {
            "schema_version": "adr100_verification_coverage_v1",
            "source": "postgresql",
            "window": "24h",
            "status": "error",
            "reason": "postgresql_query_error",
            "evaluable": False,
        }
        failure.update(
            dict.fromkeys(
                (
                    "total_auto",
                    "successful_auto",
                    "verified_auto",
                    "verified_success",
                    "verified_non_success",
                    "unverified_auto",
                ),
                0,
            )
        )
        failure.update(
            dict.fromkeys(
                (
                    "coverage_rate",
                    "verification_success_rate",
                    "last_auto_at",
                    "last_verified_auto_at",
                    "last_verification_evidence_at",
                    "latest_auto_age_seconds",
                    "last_verified_auto_age_seconds",
                ),
                None,
            )
        )
        failure["recent_unverified"] = []
        return failure
    else:
        # Built outside the try body so payload-construction errors are not
        # reported as database errors.
        return _build_verification_coverage_payload(summary, backlog)
# Summary query: a single row of counters over auto_repair_executions created
# in the last 24 hours.
#   * recent_auto - the executions inside the 24h window.
#   * per_auto    - each execution joined (LEFT JOIN LATERAL) to the most
#                   recently collected incident_evidence row of the same
#                   incident whose verification_result is not NULL.
# The final SELECT counts total / successful / verified / unverified
# executions and reports the newest timestamps plus their age in seconds.
# per_auto also carries self_healing_score, which the final SELECT does not use.
# NOTE(review): evidence is matched by incident_id only; ev.collected_at is not
# compared with the execution's created_at, so evidence collected before an
# execution would still mark it as verified - confirm this is intended.
_VERIFICATION_COVERAGE_SQL = """
WITH recent_auto AS (
SELECT id, incident_id, success, created_at
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
),
per_auto AS (
SELECT
are.id,
are.incident_id,
are.success,
are.created_at,
latest.verification_result,
latest.collected_at AS verification_collected_at,
latest.self_healing_score
FROM recent_auto are
LEFT JOIN LATERAL (
SELECT ev.verification_result, ev.collected_at, ev.self_healing_score
FROM incident_evidence ev
WHERE ev.incident_id = are.incident_id
AND ev.verification_result IS NOT NULL
ORDER BY ev.collected_at DESC
LIMIT 1
) latest ON TRUE
)
SELECT
count(*)::int AS total_auto,
count(*) FILTER (WHERE success)::int AS successful_auto,
count(*) FILTER (WHERE verification_result IS NOT NULL)::int AS verified_auto,
count(*) FILTER (WHERE verification_result = 'success')::int AS verified_success,
count(*) FILTER (WHERE verification_result IN ('degraded','failed','timeout'))::int AS verified_non_success,
count(*) FILTER (WHERE verification_result IS NULL)::int AS unverified_auto,
max(created_at) AS last_auto_at,
max(created_at) FILTER (WHERE verification_result IS NOT NULL) AS last_verified_auto_at,
max(verification_collected_at) AS last_verification_evidence_at,
EXTRACT(EPOCH FROM (NOW() - max(created_at)))::int AS latest_auto_age_seconds,
EXTRACT(EPOCH FROM (NOW() - (max(created_at) FILTER (WHERE verification_result IS NOT NULL))))::int
AS last_verified_auto_age_seconds
FROM per_auto
"""
# Detail query: the five most recently created auto_repair_executions from the
# last 24 hours for which the lateral lookup finds no incident_evidence row with
# a non-NULL verification_result (i.e. the "unverified" backlog sample).
# The recent_auto / per_auto CTEs repeat the window and join used by
# _VERIFICATION_COVERAGE_SQL; keep the two in sync when changing either.
_VERIFICATION_COVERAGE_RECENT_SQL = """
WITH recent_auto AS (
SELECT id, incident_id, success, created_at
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
),
per_auto AS (
SELECT
are.id,
are.incident_id,
are.success,
are.created_at,
latest.verification_result
FROM recent_auto are
LEFT JOIN LATERAL (
SELECT ev.verification_result
FROM incident_evidence ev
WHERE ev.incident_id = are.incident_id
AND ev.verification_result IS NOT NULL
ORDER BY ev.collected_at DESC
LIMIT 1
) latest ON TRUE
)
SELECT id, incident_id, success, created_at
FROM per_auto
WHERE verification_result IS NULL
ORDER BY created_at DESC
LIMIT 5
"""
async def _query_prometheus_value(
client: httpx.AsyncClient,
@@ -254,9 +368,86 @@ def _classify_status(value: float, definition: Adr100SloDefinition) -> str:
return "ok"
def _overall_status(metrics: list[dict[str, Any]], evaluable: list[dict[str, Any]]) -> str:
def _build_verification_coverage_payload(
    summary_row: Any,
    recent_unverified_rows: Any,
) -> dict[str, Any]:
    """Build the ``adr100_verification_coverage_v1`` payload from query rows.

    Args:
        summary_row: Mapping holding the aggregate counters and timestamps
            (``total_auto``, ``verified_auto``, ``last_auto_at``, ...).
            Counters that are missing or ``None`` are treated as ``0``.
        recent_unverified_rows: Iterable of mappings, each with ``id``,
            ``incident_id``, ``success`` and ``created_at``.

    Returns:
        A dict with the status classification, the raw counters, the derived
        ``coverage_rate`` / ``verification_success_rate`` (``None`` when the
        denominator is zero) and the list of recent unverified executions.
    """
    row = dict(summary_row)
    total_auto = int(row.get("total_auto") or 0)
    verified_auto = int(row.get("verified_auto") or 0)
    verified_success = int(row.get("verified_success") or 0)
    verified_non_success = int(row.get("verified_non_success") or 0)
    unverified_auto = int(row.get("unverified_auto") or 0)

    # Classification priority: no samples -> backlog -> non-success results -> ok.
    if total_auto == 0:
        status = "skipped_low_volume"
        reason = "no_auto_repair_executions_24h"
        evaluable = False
    elif unverified_auto > 0:
        status = "warning"
        reason = "verification_backlog_present"
        evaluable = True
    elif verified_non_success > 0:
        status = "warning"
        reason = "non_success_verification_present"
        evaluable = True
    else:
        status = "ok"
        reason = None
        evaluable = True

    coverage_rate = (verified_auto / total_auto) if total_auto else None
    verification_success_rate = (verified_success / verified_auto) if verified_auto else None

    recent_unverified = []
    for raw in recent_unverified_rows:
        item = dict(raw)
        execution_id = item.get("id")
        incident_id = item.get("incident_id")
        recent_unverified.append(
            {
                # A missing/NULL identifier stays None (JSON null); str(None)
                # would leak the literal text "None" into the API response.
                "id": str(execution_id) if execution_id is not None else None,
                "incident_id": str(incident_id) if incident_id is not None else None,
                "success": bool(item.get("success")),
                "created_at": _iso(item.get("created_at")),
            }
        )

    return {
        "schema_version": "adr100_verification_coverage_v1",
        "source": "postgresql",
        "window": "24h",
        "status": status,
        "reason": reason,
        "evaluable": evaluable,
        "total_auto": total_auto,
        "successful_auto": int(row.get("successful_auto") or 0),
        "verified_auto": verified_auto,
        "verified_success": verified_success,
        "verified_non_success": verified_non_success,
        "unverified_auto": unverified_auto,
        "coverage_rate": coverage_rate,
        "verification_success_rate": verification_success_rate,
        "last_auto_at": _iso(row.get("last_auto_at")),
        "last_verified_auto_at": _iso(row.get("last_verified_auto_at")),
        "last_verification_evidence_at": _iso(row.get("last_verification_evidence_at")),
        "latest_auto_age_seconds": _int_or_none(row.get("latest_auto_age_seconds")),
        "last_verified_auto_age_seconds": _int_or_none(row.get("last_verified_auto_age_seconds")),
        "recent_unverified": recent_unverified,
    }
def _iso(value: Any) -> str | None:
return value.isoformat() if hasattr(value, "isoformat") else None
def _int_or_none(value: Any) -> int | None:
return int(value) if value is not None else None
def _overall_status(
metrics: list[dict[str, Any]],
evaluable: list[dict[str, Any]],
verification_coverage: dict[str, Any] | None = None,
) -> str:
if any(metric.get("status") == "violated" for metric in metrics):
return "violated"
if verification_coverage and verification_coverage.get("status") in {"violated", "warning"}:
return str(verification_coverage["status"])
if any(metric.get("status") == "warning" for metric in metrics):
return "warning"
if evaluable and any(metric.get("status") == "skipped_low_volume" for metric in metrics):

View File

@@ -4,7 +4,10 @@ from typing import Any
import pytest
from src.services.adr100_slo_status_service import Adr100SloStatusService
from src.services.adr100_slo_status_service import (
Adr100SloStatusService,
_build_verification_coverage_payload,
)
class _FakePrometheusResponse:
@@ -39,6 +42,31 @@ class _FakePrometheusClient:
})
async def _low_volume_coverage(self): # noqa: ANN001
return {
"schema_version": "adr100_verification_coverage_v1",
"source": "postgresql",
"window": "24h",
"status": "skipped_low_volume",
"reason": "no_auto_repair_executions_24h",
"evaluable": False,
"total_auto": 0,
"successful_auto": 0,
"verified_auto": 0,
"verified_success": 0,
"verified_non_success": 0,
"unverified_auto": 0,
"coverage_rate": None,
"verification_success_rate": None,
"last_auto_at": None,
"last_verified_auto_at": None,
"last_verification_evidence_at": None,
"latest_auto_age_seconds": None,
"last_verified_auto_age_seconds": None,
"recent_unverified": [],
}
@pytest.mark.asyncio
async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
values = {
@@ -52,6 +80,11 @@ async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
"httpx.AsyncClient",
lambda *args, **kwargs: _FakePrometheusClient(values),
)
monkeypatch.setattr(
Adr100SloStatusService,
"_fetch_verification_coverage",
_low_volume_coverage,
)
report = await Adr100SloStatusService().fetch_report()
@@ -79,6 +112,11 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
"httpx.AsyncClient",
lambda *args, **kwargs: _FakePrometheusClient(values),
)
monkeypatch.setattr(
Adr100SloStatusService,
"_fetch_verification_coverage",
_low_volume_coverage,
)
report = await Adr100SloStatusService().fetch_report()
@@ -87,3 +125,53 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
assert by_name["autonomy_rate"]["sample_count"] == 6
assert by_name["km_growth_rate"]["status"] == "violated"
assert report["overall_status"] == "violated"
def test_verification_coverage_payload_flags_backlog():
    """Unverified executions yield a backlog warning plus the derived rates."""
    summary = {
        "total_auto": 7,
        "successful_auto": 6,
        "verified_auto": 5,
        "verified_success": 4,
        "verified_non_success": 1,
        "unverified_auto": 2,
        "last_auto_at": None,
        "last_verified_auto_at": None,
        "last_verification_evidence_at": None,
        "latest_auto_age_seconds": 90,
        "last_verified_auto_age_seconds": 120,
    }
    backlog = [
        {
            "id": "are-1",
            "incident_id": "INC-1",
            "success": True,
            "created_at": None,
        },
    ]

    payload = _build_verification_coverage_payload(summary, backlog)

    assert payload["status"] == "warning"
    assert payload["reason"] == "verification_backlog_present"
    assert payload["coverage_rate"] == pytest.approx(5 / 7)
    assert payload["verification_success_rate"] == pytest.approx(4 / 5)
    assert payload["recent_unverified"][0]["incident_id"] == "INC-1"
def test_verification_coverage_payload_skips_when_no_auto_repair():
    """An empty 24h window is reported as a non-evaluable low-volume skip."""
    counter_names = (
        "total_auto",
        "successful_auto",
        "verified_auto",
        "verified_success",
        "verified_non_success",
        "unverified_auto",
    )
    empty_summary = dict.fromkeys(counter_names, 0)

    payload = _build_verification_coverage_payload(empty_summary, [])

    assert payload["status"] == "skipped_low_volume"
    assert payload["reason"] == "no_auto_repair_executions_24h"
    assert payload["evaluable"] is False

View File

@@ -1399,6 +1399,32 @@
"compliance": {
"title": "Overall Compliance",
"target": "Target ≥ 95%"
},
"coverage": {
"title": "Verification Coverage",
"subtitle": "Auto-repair executions and verifier writeback in the last {window}",
"totalAuto": "Auto repairs",
"verifiedAuto": "Verified",
"unverifiedAuto": "Unverified",
"coverageRate": "Coverage",
"successRate": "Verification success rate",
"lastVerified": "Last verified execution",
"reasonLabel": "Reason",
"state": {
"ok": "OK",
"warning": "Needs tracking",
"violated": "Hard red line",
"skipped_low_volume": "Waiting for samples",
"no_data": "No data",
"error": "Query failed"
},
"reason": {
"none": "None",
"no_auto_repair_executions_24h": "No auto-repair executions in the last 24h",
"verification_backlog_present": "Some auto repairs are missing verification results",
"non_success_verification_present": "Some verifications ended as degraded / failed / timeout",
"postgresql_query_error": "PostgreSQL query failed"
}
}
},
"events": {

View File

@@ -1400,6 +1400,32 @@
"compliance": {
"title": "整體合規率",
"target": "目標 ≥ 95%"
},
"coverage": {
"title": "驗證覆蓋率",
"subtitle": "近 {window} 自動修復執行與 verifier 寫回狀態",
"totalAuto": "自動修復",
"verifiedAuto": "已驗證",
"unverifiedAuto": "待驗證",
"coverageRate": "覆蓋率",
"successRate": "成功驗證",
"lastVerified": "最後已驗證執行",
"reasonLabel": "原因",
"state": {
"ok": "正常",
"warning": "需追蹤",
"violated": "硬紅線",
"skipped_low_volume": "等待樣本",
"no_data": "沒有資料",
"error": "查詢失敗"
},
"reason": {
"none": "無",
"no_auto_repair_executions_24h": "近 24h 無自動修復執行",
"verification_backlog_present": "有自動修復尚未寫入驗證結果",
"non_success_verification_present": "存在 degraded / failed / timeout 驗證結果",
"postgresql_query_error": "PostgreSQL 查詢失敗"
}
}
},
"events": {

View File

@@ -46,6 +46,7 @@ interface SloApiResponse {
adr100?: {
overall_status?: string
overall_compliance?: number | null
verification_coverage?: Adr100VerificationCoverage
metrics?: Array<{
name: SloMetric['name']
value: number | null
@@ -61,6 +62,30 @@ interface SloApiResponse {
computed_at?: string
}
/**
 * Shape of `adr100.verification_coverage` in the SLO API response: a summary
 * of auto-repair executions and their verification results over `window`.
 */
interface Adr100VerificationCoverage {
  /** Payload schema identifier sent by the backend, e.g. `adr100_verification_coverage_v1`. */
  schema_version?: string
  /** Data source label sent by the backend. */
  source?: string
  status: 'ok' | 'warning' | 'violated' | 'skipped_low_volume' | 'no_data' | 'error'
  /** Machine-readable reason code for `status`; null when there is none. */
  reason?: string | null
  window?: string
  /** Backend flag telling whether the window produced an evaluable result. */
  evaluable?: boolean
  total_auto: number
  successful_auto: number
  verified_auto: number
  verified_success: number
  verified_non_success: number
  unverified_auto: number
  /** verified_auto / total_auto as a 0..1 ratio; null when there are no executions. */
  coverage_rate?: number | null
  /** verified_success / verified_auto as a 0..1 ratio; null when nothing is verified. */
  verification_success_rate?: number | null
  last_auto_at?: string | null
  last_verified_auto_at?: string | null
  last_verification_evidence_at?: string | null
  latest_auto_age_seconds?: number | null
  last_verified_auto_age_seconds?: number | null
  recent_unverified?: Array<{
    id: string
    incident_id: string
    success: boolean
    created_at?: string | null
  }>
}
interface SummaryApiResponse {
data?: ViolationDataPoint[]
event_types?: string[]
@@ -79,6 +104,25 @@ function mapStatus(s: string): SloMetric['status'] {
return 'critical'
}
/**
 * Pick the accent colour for a verification-coverage status:
 * green for `ok`, amber for `warning`, grey for a missing status,
 * `skipped_low_volume` or `no_data`, and red for anything else.
 */
function coverageTone(status?: Adr100VerificationCoverage['status']): string {
  const neutral = '#87867f'
  if (!status) {
    return neutral
  }
  switch (status) {
    case 'ok':
      return '#22C55E'
    case 'warning':
      return '#F59E0B'
    case 'skipped_low_volume':
    case 'no_data':
      return neutral
    default:
      return '#FF3300'
  }
}
/**
 * Map a backend reason code to a `reason.*` translation key.
 * The four known codes are returned unchanged; anything else
 * (including null/undefined) becomes `none`.
 */
function coverageReasonKey(reason?: string | null): string {
  const knownReasons: string[] = [
    'no_auto_repair_executions_24h',
    'verification_backlog_present',
    'non_success_verification_present',
    'postgresql_query_error',
  ]
  if (reason != null && knownReasons.includes(reason)) {
    return reason
  }
  return 'none'
}
/**
 * Format a 0..1 ratio as a percentage with one decimal place,
 * or `--` when the value is null or undefined.
 */
function formatPercent(value?: number | null): string {
  if (value === null || value === undefined) {
    return '--'
  }
  const percent = value * 100
  return percent.toFixed(1) + '%'
}
function buildMetrics(api: SloApiResponse): SloMetric[] {
const adr100Metrics = api.adr100?.metrics
if (adr100Metrics?.length) {
@@ -133,6 +177,68 @@ function buildMetrics(api: SloApiResponse): SloMetric[] {
})
}
/**
 * Card summarising ADR-100 verification coverage.
 *
 * Renders a title and subtitle (window defaults to '24h'), a status badge
 * coloured via coverageTone, a four-cell grid (total / verified / unverified /
 * coverage rate) and a footer line with the reason, the verification success
 * rate and the last verified execution timestamp.
 *
 * When `coverage` is undefined the counters render as '--' and the badge uses
 * the `state.no_data` translation.
 */
function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) {
const t = useTranslations('governance.slo.coverage')
// One accent colour drives the badge text, border and background below; the
// border/background append two hex digits to it (presumably #RRGGBBAA alpha).
const color = coverageTone(coverage?.status)
// Cells of the statistics grid; counters fall back to '--' when absent.
const rows = [
{ label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') },
{ label: t('verifiedAuto'), value: String(coverage?.verified_auto ?? '--') },
{ label: t('unverifiedAuto'), value: String(coverage?.unverified_auto ?? '--') },
{ label: t('coverageRate'), value: formatPercent(coverage?.coverage_rate) },
]
// NOTE(review): the badge looks up `state.<status>` directly, so a status value
// without a matching translation key has no label - confirm how the i18n layer
// handles that. last_verified_auto_at is shown as the raw string from the API.
return (
<GlassCard variant="subtle" padding="md">
<div style={{ display: 'flex', flexDirection: 'column', gap: 12 }}>
<div style={{ display: 'flex', alignItems: 'flex-start', justifyContent: 'space-between', gap: 12 }}>
<div style={{ minWidth: 0 }}>
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
{t('title')}
</div>
<div style={{ marginTop: 4, fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45 }}>
{t('subtitle', { window: coverage?.window ?? '24h' })}
</div>
</div>
<div style={{
flexShrink: 0,
display: 'inline-flex',
alignItems: 'center',
minHeight: 26,
padding: '4px 8px',
borderRadius: 6,
border: `0.5px solid ${color}40`,
background: `${color}12`,
fontFamily: "'DM Mono', monospace",
fontSize: 10,
color,
}}>
{t(`state.${coverage?.status ?? 'no_data'}`)}
</div>
</div>
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(4, minmax(120px, 1fr))', gap: 10 }} className="slo-coverage-grid">
{rows.map(row => (
<div key={row.label} style={{ minWidth: 0 }}>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginBottom: 3 }}>
{row.label}
</div>
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 18, fontWeight: 700, color: '#141413', letterSpacing: 0 }}>
{row.value}
</div>
</div>
))}
</div>
<div style={{ display: 'flex', flexWrap: 'wrap', gap: '8px 14px', fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45 }}>
<span>{t('reasonLabel')} {t(`reason.${coverageReasonKey(coverage?.reason)}`)}</span>
<span>{t('successRate')} {formatPercent(coverage?.verification_success_rate)}</span>
<span>{t('lastVerified')} {coverage?.last_verified_auto_at ?? '--'}</span>
</div>
</div>
</GlassCard>
)
}
// =============================================================================
// Component
// =============================================================================
@@ -173,6 +279,7 @@ export function SloTab() {
const metrics = sloData ? buildMetrics(sloData) : []
const compliance = sloData?.adr100?.overall_compliance ?? sloData?.overall_compliance ?? null
const verificationCoverage = sloData?.adr100?.verification_coverage
const chartData: ViolationDataPoint[] = summaryData?.data ?? []
const eventTypes: string[] = summaryData?.event_types ?? []
@@ -235,6 +342,8 @@ export function SloTab() {
}
</div>
{!sloLoading && <VerificationCoveragePanel coverage={verificationCoverage} />}
{/* Violation timeline chart */}
<SloViolationChart
data={chartData}
@@ -248,6 +357,7 @@ export function SloTab() {
.slo-kpi-grid > * { flex: 1; min-width: 200px; }
@media (max-width: 640px) {
.slo-kpi-grid > * { flex: 0 0 100%; min-width: 0; }
.slo-coverage-grid { grid-template-columns: repeat(2, minmax(0, 1fr)) !important; }
}
`}</style>
</div>