diff --git a/apps/api/src/services/adr100_slo_status_service.py b/apps/api/src/services/adr100_slo_status_service.py index 37bd56ee..1b915120 100644 --- a/apps/api/src/services/adr100_slo_status_service.py +++ b/apps/api/src/services/adr100_slo_status_service.py @@ -10,13 +10,15 @@ from __future__ import annotations import math from dataclasses import dataclass -from datetime import UTC, datetime from typing import Any import httpx import structlog +from sqlalchemy import text from src.core.config import settings +from src.db.base import get_db_context +from src.utils.timezone import now_taipei_iso logger = structlog.get_logger(__name__) @@ -99,17 +101,19 @@ class Adr100SloStatusService: evaluable = [metric for metric in metrics if metric.get("evaluable")] ok_count = sum(1 for metric in evaluable if metric.get("status") == "ok") overall_compliance = (ok_count / len(evaluable)) if evaluable else None - overall_status = _overall_status(metrics, evaluable) + verification_coverage = await self._fetch_verification_coverage() + overall_status = _overall_status(metrics, evaluable, verification_coverage) return { "schema_version": "adr100_slo_status_v1", - "source": "prometheus", - "evaluated_at": datetime.now(UTC).isoformat(), + "source": "prometheus+postgresql", + "evaluated_at": now_taipei_iso(), "overall_status": overall_status, "overall_compliance": overall_compliance, "evaluable_count": len(evaluable), "metric_count": len(metrics), "metrics": metrics, + "verification_coverage": verification_coverage, } async def _fetch_metric( @@ -176,6 +180,116 @@ class Adr100SloStatusService: sample_count=sample_count if sample_count is not None else value, ) + async def _fetch_verification_coverage(self) -> dict[str, Any]: + """Summarize whether recent auto-repair executions have verifier evidence.""" + try: + async with get_db_context() as db: + summary_row = ( + await db.execute(text(_VERIFICATION_COVERAGE_SQL)) + ).mappings().one() + recent_rows = ( + await db.execute(text(_VERIFICATION_COVERAGE_RECENT_SQL)) + ).mappings().all() + except Exception as exc: + logger.warning("adr100_verification_coverage_query_error", error=str(exc)) + return { + "schema_version": "adr100_verification_coverage_v1", + "source": "postgresql", + "window": "24h", + "status": "error", + "reason": "postgresql_query_error", + "evaluable": False, + "total_auto": 0, + "successful_auto": 0, + "verified_auto": 0, + "verified_success": 0, + "verified_non_success": 0, + "unverified_auto": 0, + "coverage_rate": None, + "verification_success_rate": None, + "last_auto_at": None, + "last_verified_auto_at": None, + "last_verification_evidence_at": None, + "latest_auto_age_seconds": None, + "last_verified_auto_age_seconds": None, + "recent_unverified": [], + } + + return _build_verification_coverage_payload(summary_row, recent_rows) + + +_VERIFICATION_COVERAGE_SQL = """ + WITH recent_auto AS ( + SELECT id, incident_id, success, created_at + FROM auto_repair_executions + WHERE created_at >= NOW() - INTERVAL '24 hours' + ), + per_auto AS ( + SELECT + are.id, + are.incident_id, + are.success, + are.created_at, + latest.verification_result, + latest.collected_at AS verification_collected_at, + latest.self_healing_score + FROM recent_auto are + LEFT JOIN LATERAL ( + SELECT ev.verification_result, ev.collected_at, ev.self_healing_score + FROM incident_evidence ev + WHERE ev.incident_id = are.incident_id + AND ev.verification_result IS NOT NULL + ORDER BY ev.collected_at DESC + LIMIT 1 + ) latest ON TRUE + ) + SELECT + count(*)::int AS total_auto, + count(*) FILTER (WHERE success)::int AS successful_auto, + count(*) FILTER (WHERE verification_result IS NOT NULL)::int AS verified_auto, + count(*) FILTER (WHERE verification_result = 'success')::int AS verified_success, + count(*) FILTER (WHERE verification_result IN ('degraded','failed','timeout'))::int AS verified_non_success, + count(*) FILTER (WHERE verification_result IS NULL)::int AS unverified_auto, + max(created_at) AS last_auto_at, + max(created_at) FILTER (WHERE verification_result IS NOT NULL) AS last_verified_auto_at, + max(verification_collected_at) AS last_verification_evidence_at, + EXTRACT(EPOCH FROM (NOW() - max(created_at)))::int AS latest_auto_age_seconds, + EXTRACT(EPOCH FROM (NOW() - (max(created_at) FILTER (WHERE verification_result IS NOT NULL))))::int + AS last_verified_auto_age_seconds + FROM per_auto +""" + + +_VERIFICATION_COVERAGE_RECENT_SQL = """ + WITH recent_auto AS ( + SELECT id, incident_id, success, created_at + FROM auto_repair_executions + WHERE created_at >= NOW() - INTERVAL '24 hours' + ), + per_auto AS ( + SELECT + are.id, + are.incident_id, + are.success, + are.created_at, + latest.verification_result + FROM recent_auto are + LEFT JOIN LATERAL ( + SELECT ev.verification_result + FROM incident_evidence ev + WHERE ev.incident_id = are.incident_id + AND ev.verification_result IS NOT NULL + ORDER BY ev.collected_at DESC + LIMIT 1 + ) latest ON TRUE + ) + SELECT id, incident_id, success, created_at + FROM per_auto + WHERE verification_result IS NULL + ORDER BY created_at DESC + LIMIT 5 +""" + async def _query_prometheus_value( client: httpx.AsyncClient, @@ -254,9 +368,86 @@ def _classify_status(value: float, definition: Adr100SloDefinition) -> str: return "ok" -def _overall_status(metrics: list[dict[str, Any]], evaluable: list[dict[str, Any]]) -> str: +def _build_verification_coverage_payload( + summary_row: Any, + recent_unverified_rows: Any, +) -> dict[str, Any]: + row = dict(summary_row) + total_auto = int(row.get("total_auto") or 0) + verified_auto = int(row.get("verified_auto") or 0) + verified_success = int(row.get("verified_success") or 0) + verified_non_success = int(row.get("verified_non_success") or 0) + unverified_auto = int(row.get("unverified_auto") or 0) + + if total_auto == 0: + status = "skipped_low_volume" + reason = "no_auto_repair_executions_24h" + evaluable = False + elif unverified_auto > 0: + status = "warning" + reason = "verification_backlog_present" + evaluable = True + elif verified_non_success > 0: + status = "warning" + reason = "non_success_verification_present" + evaluable = True + else: + status = "ok" + reason = None + evaluable = True + + coverage_rate = (verified_auto / total_auto) if total_auto else None + verification_success_rate = (verified_success / verified_auto) if verified_auto else None + + return { + "schema_version": "adr100_verification_coverage_v1", + "source": "postgresql", + "window": "24h", + "status": status, + "reason": reason, + "evaluable": evaluable, + "total_auto": total_auto, + "successful_auto": int(row.get("successful_auto") or 0), + "verified_auto": verified_auto, + "verified_success": verified_success, + "verified_non_success": verified_non_success, + "unverified_auto": unverified_auto, + "coverage_rate": coverage_rate, + "verification_success_rate": verification_success_rate, + "last_auto_at": _iso(row.get("last_auto_at")), + "last_verified_auto_at": _iso(row.get("last_verified_auto_at")), + "last_verification_evidence_at": _iso(row.get("last_verification_evidence_at")), + "latest_auto_age_seconds": _int_or_none(row.get("latest_auto_age_seconds")), + "last_verified_auto_age_seconds": _int_or_none(row.get("last_verified_auto_age_seconds")), + "recent_unverified": [ + { + "id": str(item.get("id")), + "incident_id": str(item.get("incident_id")), + "success": bool(item.get("success")), + "created_at": _iso(item.get("created_at")), + } + for item in (dict(raw) for raw in recent_unverified_rows) + ], + } + + +def _iso(value: Any) -> str | None: + return value.isoformat() if hasattr(value, "isoformat") else None + + +def _int_or_none(value: Any) -> int | None: + return int(value) if value is not None else None + + +def _overall_status( + metrics: list[dict[str, Any]], + evaluable: list[dict[str, Any]], + verification_coverage: dict[str, Any] | None = None, +) -> str: if any(metric.get("status") == "violated" for metric in metrics): return "violated" + if verification_coverage and verification_coverage.get("status") in {"violated", "warning"}: + return str(verification_coverage["status"]) if any(metric.get("status") == "warning" for metric in metrics): return "warning" if evaluable and any(metric.get("status") == "skipped_low_volume" for metric in metrics): diff --git a/apps/api/tests/test_adr100_slo_status_service.py b/apps/api/tests/test_adr100_slo_status_service.py index 3fbd0f3e..10aaa95c 100644 --- a/apps/api/tests/test_adr100_slo_status_service.py +++ b/apps/api/tests/test_adr100_slo_status_service.py @@ -4,7 +4,10 @@ from typing import Any import pytest -from src.services.adr100_slo_status_service import Adr100SloStatusService +from src.services.adr100_slo_status_service import ( + Adr100SloStatusService, + _build_verification_coverage_payload, +) class _FakePrometheusResponse: @@ -39,6 +42,31 @@ class _FakePrometheusClient: }) +async def _low_volume_coverage(self): # noqa: ANN001 + return { + "schema_version": "adr100_verification_coverage_v1", + "source": "postgresql", + "window": "24h", + "status": "skipped_low_volume", + "reason": "no_auto_repair_executions_24h", + "evaluable": False, + "total_auto": 0, + "successful_auto": 0, + "verified_auto": 0, + "verified_success": 0, + "verified_non_success": 0, + "unverified_auto": 0, + "coverage_rate": None, + "verification_success_rate": None, + "last_auto_at": None, + "last_verified_auto_at": None, + "last_verification_evidence_at": None, + "latest_auto_age_seconds": None, + "last_verified_auto_age_seconds": None, + "recent_unverified": [], + } + + @pytest.mark.asyncio async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch): values = { @@ -52,6 +80,11 @@ async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch): "httpx.AsyncClient", lambda *args, **kwargs: _FakePrometheusClient(values), ) + monkeypatch.setattr( + Adr100SloStatusService, + "_fetch_verification_coverage", + _low_volume_coverage, + ) report = await Adr100SloStatusService().fetch_report() @@ -79,6 +112,11 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch): "httpx.AsyncClient", lambda *args, **kwargs: _FakePrometheusClient(values), ) + monkeypatch.setattr( + Adr100SloStatusService, + "_fetch_verification_coverage", + _low_volume_coverage, + ) report = await Adr100SloStatusService().fetch_report() @@ -87,3 +125,53 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch): assert by_name["autonomy_rate"]["sample_count"] == 6 assert by_name["km_growth_rate"]["status"] == "violated" assert report["overall_status"] == "violated" + + +def test_verification_coverage_payload_flags_backlog(): + payload = _build_verification_coverage_payload( + { + "total_auto": 7, + "successful_auto": 6, + "verified_auto": 5, + "verified_success": 4, + "verified_non_success": 1, + "unverified_auto": 2, + "last_auto_at": None, + "last_verified_auto_at": None, + "last_verification_evidence_at": None, + "latest_auto_age_seconds": 90, + "last_verified_auto_age_seconds": 120, + }, + [ + { + "id": "are-1", + "incident_id": "INC-1", + "success": True, + "created_at": None, + }, + ], + ) + + assert payload["status"] == "warning" + assert payload["reason"] == "verification_backlog_present" + assert payload["coverage_rate"] == pytest.approx(5 / 7) + assert payload["verification_success_rate"] == pytest.approx(4 / 5) + assert payload["recent_unverified"][0]["incident_id"] == "INC-1" + + +def test_verification_coverage_payload_skips_when_no_auto_repair(): + payload = _build_verification_coverage_payload( + { + "total_auto": 0, + "successful_auto": 0, + "verified_auto": 0, + "verified_success": 0, + "verified_non_success": 0, + "unverified_auto": 0, + }, + [], + ) + + assert payload["status"] == "skipped_low_volume" + assert payload["reason"] == "no_auto_repair_executions_24h" + assert payload["evaluable"] is False diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 06e0bdb5..36d2a79f 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -1399,6 +1399,32 @@ "compliance": { "title": "Overall Compliance", "target": "Target ≥ 95%" + }, + "coverage": { + "title": "Verification Coverage", + "subtitle": "Auto-repair executions and verifier writeback in the last {window}", + "totalAuto": "Auto repairs", + "verifiedAuto": "Verified", + "unverifiedAuto": "Unverified", + "coverageRate": "Coverage", + "successRate": "Success verification", + "lastVerified": "Last verified execution", + "reasonLabel": "Reason", + "state": { + "ok": "OK", + "warning": "Needs tracking", + "violated": "Hard red line", + "skipped_low_volume": "Waiting for samples", + "no_data": "No data", + "error": "Query failed" + }, + "reason": { + "none": "None", + "no_auto_repair_executions_24h": "No auto-repair executions in the last 24h", + "verification_backlog_present": "Some auto repairs are missing verification results", + "non_success_verification_present": "degraded / failed / timeout verification exists", + "postgresql_query_error": "PostgreSQL query failed" + } } }, "events": { diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 42c0663f..843604c8 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1400,6 +1400,32 @@ "compliance": { "title": "整體合規率", "target": "目標 ≥ 95%" + }, + "coverage": { + "title": "驗證覆蓋率", + "subtitle": "近 {window} 自動修復執行與 verifier 寫回狀態", + "totalAuto": "自動修復", + "verifiedAuto": "已驗證", + "unverifiedAuto": "待驗證", + "coverageRate": "覆蓋率", + "successRate": "成功驗證", + "lastVerified": "最後已驗證執行", + "reasonLabel": "原因", + "state": { + "ok": "正常", + "warning": "需追蹤", + "violated": "硬紅線", + "skipped_low_volume": "等待樣本", + "no_data": "沒有資料", + "error": "查詢失敗" + }, + "reason": { + "none": "無", + "no_auto_repair_executions_24h": "近 24h 無自動修復執行", + "verification_backlog_present": "有自動修復尚未寫入驗證結果", + "non_success_verification_present": "存在 degraded / failed / timeout 驗證結果", + "postgresql_query_error": "PostgreSQL 查詢失敗" + } } }, "events": { diff --git a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx index ed6cd089..63e35f4a 100644 --- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx +++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx @@ -46,6 +46,7 @@ interface SloApiResponse { adr100?: { overall_status?: string overall_compliance?: number | null + verification_coverage?: Adr100VerificationCoverage metrics?: Array<{ name: SloMetric['name'] value: number | null @@ -61,6 +62,30 @@ interface SloApiResponse { computed_at?: string } +interface Adr100VerificationCoverage { + status: 'ok' | 'warning' | 'violated' | 'skipped_low_volume' | 'no_data' | 'error' + reason?: string | null + window?: string + total_auto: number + successful_auto: number + verified_auto: number + verified_success: number + verified_non_success: number + unverified_auto: number + coverage_rate?: number | null + verification_success_rate?: number | null + last_auto_at?: string | null + last_verified_auto_at?: string | null + latest_auto_age_seconds?: number | null + last_verified_auto_age_seconds?: number | null + recent_unverified?: Array<{ + id: string + incident_id: string + success: boolean + created_at?: string | null + }> +} + interface SummaryApiResponse { data?: ViolationDataPoint[] event_types?: string[] @@ -79,6 +104,25 @@ function mapStatus(s: string): SloMetric['status'] { return 'critical' } +function coverageTone(status?: Adr100VerificationCoverage['status']): string { + if (status === 'ok') return '#22C55E' + if (status === 'warning') return '#F59E0B' + if (!status || status === 'skipped_low_volume' || status === 'no_data') return '#87867f' + return '#FF3300' +} + +function coverageReasonKey(reason?: string | null): string { + if (reason === 'no_auto_repair_executions_24h') return reason + if (reason === 'verification_backlog_present') return reason + if (reason === 'non_success_verification_present') return reason + if (reason === 'postgresql_query_error') return reason + return 'none' +} + +function formatPercent(value?: number | null): string { + return value == null ? '--' : `${(value * 100).toFixed(1)}%` +} + function buildMetrics(api: SloApiResponse): SloMetric[] { const adr100Metrics = api.adr100?.metrics if (adr100Metrics?.length) { @@ -133,6 +177,68 @@ function buildMetrics(api: SloApiResponse): SloMetric[] { }) } +function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) { + const t = useTranslations('governance.slo.coverage') + const color = coverageTone(coverage?.status) + const rows = [ + { label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') }, + { label: t('verifiedAuto'), value: String(coverage?.verified_auto ?? '--') }, + { label: t('unverifiedAuto'), value: String(coverage?.unverified_auto ?? '--') }, + { label: t('coverageRate'), value: formatPercent(coverage?.coverage_rate) }, + ] + + return ( + +
+
+
+
+ {t('title')} +
+
+ {t('subtitle', { window: coverage?.window ?? '24h' })} +
+
+
+ {t(`state.${coverage?.status ?? 'no_data'}`)} +
+
+ +
+ {rows.map(row => ( +
+
+ {row.label} +
+
+ {row.value} +
+
+ ))} +
+ +
+ {t('reasonLabel')} {t(`reason.${coverageReasonKey(coverage?.reason)}`)} + {t('successRate')} {formatPercent(coverage?.verification_success_rate)} + {t('lastVerified')} {coverage?.last_verified_auto_at ?? '--'} +
+
+
+ ) +} + // ============================================================================= // Component // ============================================================================= @@ -173,6 +279,7 @@ export function SloTab() { const metrics = sloData ? buildMetrics(sloData) : [] const compliance = sloData?.adr100?.overall_compliance ?? sloData?.overall_compliance ?? null + const verificationCoverage = sloData?.adr100?.verification_coverage const chartData: ViolationDataPoint[] = summaryData?.data ?? [] const eventTypes: string[] = summaryData?.event_types ?? [] @@ -235,6 +342,8 @@ export function SloTab() { } + {!sloLoading && } + {/* Violation timeline chart */} * { flex: 1; min-width: 200px; } @media (max-width: 640px) { .slo-kpi-grid > * { flex: 0 0 100%; min-width: 0; } + .slo-coverage-grid { grid-template-columns: repeat(2, minmax(0, 1fr)) !important; } } `}