feat(governance): surface verification coverage

2026-05-14 20:28:53 +08:00
parent bc1a11e373
commit 485c58d085
5 changed files with 447 additions and 6 deletions
--- a/apps/api/src/services/adr100_slo_status_service.py
+++ b/apps/api/src/services/adr100_slo_status_service.py
@@ -10,13 +10,15 @@ from __future__ import annotations

 import math
 from dataclasses import dataclass
-from datetime import UTC, datetime
 from typing import Any

 import httpx
 import structlog
+from sqlalchemy import text

 from src.core.config import settings
+from src.db.base import get_db_context
+from src.utils.timezone import now_taipei_iso

 logger = structlog.get_logger(__name__)

@@ -99,17 +101,19 @@ class Adr100SloStatusService:
        evaluable = [metric for metric in metrics if metric.get("evaluable")]
        ok_count = sum(1 for metric in evaluable if metric.get("status") == "ok")
        overall_compliance = (ok_count / len(evaluable)) if evaluable else None
-        overall_status = _overall_status(metrics, evaluable)
+        verification_coverage = await self._fetch_verification_coverage()
+        overall_status = _overall_status(metrics, evaluable, verification_coverage)

        return {
            "schema_version": "adr100_slo_status_v1",
-            "source": "prometheus",
-            "evaluated_at": datetime.now(UTC).isoformat(),
+            "source": "prometheus+postgresql",
+            "evaluated_at": now_taipei_iso(),
            "overall_status": overall_status,
            "overall_compliance": overall_compliance,
            "evaluable_count": len(evaluable),
            "metric_count": len(metrics),
            "metrics": metrics,
+            "verification_coverage": verification_coverage,
        }

    async def _fetch_metric(
@@ -176,6 +180,116 @@ class Adr100SloStatusService:
            sample_count=sample_count if sample_count is not None else value,
        )

+    async def _fetch_verification_coverage(self) -> dict[str, Any]:
+        """Summarize whether recent auto-repair executions have verifier evidence."""
+        try:
+            async with get_db_context() as db:
+                summary_row = (
+                    await db.execute(text(_VERIFICATION_COVERAGE_SQL))
+                ).mappings().one()  # aggregate SELECT without GROUP BY: exactly one row expected
+                recent_rows = (
+                    await db.execute(text(_VERIFICATION_COVERAGE_RECENT_SQL))
+                ).mappings().all()  # the query itself caps this at 5 rows
+        except Exception as exc:  # best-effort: any failure degrades to the "error" payload below
+            logger.warning("adr100_verification_coverage_query_error", error=str(exc))
+            return {  # same keys as _build_verification_coverage_payload, zeroed / None
+                "schema_version": "adr100_verification_coverage_v1",
+                "source": "postgresql",
+                "window": "24h",
+                "status": "error",
+                "reason": "postgresql_query_error",
+                "evaluable": False,
+                "total_auto": 0,
+                "successful_auto": 0,
+                "verified_auto": 0,
+                "verified_success": 0,
+                "verified_non_success": 0,
+                "unverified_auto": 0,
+                "coverage_rate": None,
+                "verification_success_rate": None,
+                "last_auto_at": None,
+                "last_verified_auto_at": None,
+                "last_verification_evidence_at": None,
+                "latest_auto_age_seconds": None,
+                "last_verified_auto_age_seconds": None,
+                "recent_unverified": [],
+            }
+
+        return _build_verification_coverage_payload(summary_row, recent_rows)
+
+
+_VERIFICATION_COVERAGE_SQL = """
+    WITH recent_auto AS (
+        SELECT id, incident_id, success, created_at
+        FROM auto_repair_executions
+        WHERE created_at >= NOW() - INTERVAL '24 hours'
+    ),
+    per_auto AS (
+        SELECT
+            are.id,
+            are.incident_id,
+            are.success,
+            are.created_at,
+            latest.verification_result,
+            latest.collected_at AS verification_collected_at,
+            latest.self_healing_score
+        FROM recent_auto are
+        LEFT JOIN LATERAL (
+            SELECT ev.verification_result, ev.collected_at, ev.self_healing_score
+            FROM incident_evidence ev
+            WHERE ev.incident_id = are.incident_id
+              AND ev.verification_result IS NOT NULL
+            ORDER BY ev.collected_at DESC
+            LIMIT 1
+        ) latest ON TRUE
+    )
+    SELECT
+        count(*)::int AS total_auto,
+        count(*) FILTER (WHERE success)::int AS successful_auto,
+        count(*) FILTER (WHERE verification_result IS NOT NULL)::int AS verified_auto,
+        count(*) FILTER (WHERE verification_result = 'success')::int AS verified_success,
+        count(*) FILTER (WHERE verification_result IN ('degraded','failed','timeout'))::int AS verified_non_success,
+        count(*) FILTER (WHERE verification_result IS NULL)::int AS unverified_auto,
+        max(created_at) AS last_auto_at,
+        max(created_at) FILTER (WHERE verification_result IS NOT NULL) AS last_verified_auto_at,
+        max(verification_collected_at) AS last_verification_evidence_at,
+        EXTRACT(EPOCH FROM (NOW() - max(created_at)))::int AS latest_auto_age_seconds,
+        EXTRACT(EPOCH FROM (NOW() - (max(created_at) FILTER (WHERE verification_result IS NOT NULL))))::int
+            AS last_verified_auto_age_seconds
+    FROM per_auto
+"""  # NOTE(review): evidence is matched by incident_id only (no execution id / time bound) - confirm older evidence should count
+
+
+_VERIFICATION_COVERAGE_RECENT_SQL = """
+    WITH recent_auto AS (
+        SELECT id, incident_id, success, created_at
+        FROM auto_repair_executions
+        WHERE created_at >= NOW() - INTERVAL '24 hours'
+    ),
+    per_auto AS (
+        SELECT
+            are.id,
+            are.incident_id,
+            are.success,
+            are.created_at,
+            latest.verification_result
+        FROM recent_auto are
+        LEFT JOIN LATERAL (
+            SELECT ev.verification_result
+            FROM incident_evidence ev
+            WHERE ev.incident_id = are.incident_id
+              AND ev.verification_result IS NOT NULL
+            ORDER BY ev.collected_at DESC
+            LIMIT 1
+        ) latest ON TRUE
+    )
+    SELECT id, incident_id, success, created_at
+    FROM per_auto
+    WHERE verification_result IS NULL
+    ORDER BY created_at DESC
+    LIMIT 5
+"""  # repeats the recent_auto / per_auto CTEs of _VERIFICATION_COVERAGE_SQL; keep the two in sync
+

 async def _query_prometheus_value(
    client: httpx.AsyncClient,
@@ -254,9 +368,86 @@ def _classify_status(value: float, definition: Adr100SloDefinition) -> str:
    return "ok"


-def _overall_status(metrics: list[dict[str, Any]], evaluable: list[dict[str, Any]]) -> str:
+def _build_verification_coverage_payload(
+    summary_row: Any,
+    recent_unverified_rows: Any,
+) -> dict[str, Any]:
+    row = dict(summary_row)  # accepts a plain dict or any mapping-like row
+    total_auto = int(row.get("total_auto") or 0)  # "or 0" covers both a missing key and NULL
+    verified_auto = int(row.get("verified_auto") or 0)
+    verified_success = int(row.get("verified_success") or 0)
+    verified_non_success = int(row.get("verified_non_success") or 0)
+    unverified_auto = int(row.get("unverified_auto") or 0)
+
+    if total_auto == 0:  # nothing ran in the window: not evaluable rather than "ok"
+        status = "skipped_low_volume"
+        reason = "no_auto_repair_executions_24h"
+        evaluable = False
+    elif unverified_auto > 0:  # a backlog is reported in preference to non-success results
+        status = "warning"
+        reason = "verification_backlog_present"
+        evaluable = True
+    elif verified_non_success > 0:
+        status = "warning"
+        reason = "non_success_verification_present"
+        evaluable = True
+    else:
+        status = "ok"
+        reason = None
+        evaluable = True
+
+    coverage_rate = (verified_auto / total_auto) if total_auto else None  # None, not 0.0, when undefined
+    verification_success_rate = (verified_success / verified_auto) if verified_auto else None
+
+    return {
+        "schema_version": "adr100_verification_coverage_v1",
+        "source": "postgresql",
+        "window": "24h",
+        "status": status,
+        "reason": reason,
+        "evaluable": evaluable,
+        "total_auto": total_auto,
+        "successful_auto": int(row.get("successful_auto") or 0),
+        "verified_auto": verified_auto,
+        "verified_success": verified_success,
+        "verified_non_success": verified_non_success,
+        "unverified_auto": unverified_auto,
+        "coverage_rate": coverage_rate,
+        "verification_success_rate": verification_success_rate,
+        "last_auto_at": _iso(row.get("last_auto_at")),
+        "last_verified_auto_at": _iso(row.get("last_verified_auto_at")),
+        "last_verification_evidence_at": _iso(row.get("last_verification_evidence_at")),
+        "latest_auto_age_seconds": _int_or_none(row.get("latest_auto_age_seconds")),
+        "last_verified_auto_age_seconds": _int_or_none(row.get("last_verified_auto_age_seconds")),
+        "recent_unverified": [
+            {
+                "id": str(item.get("id")),  # str() of a missing/None id yields the text "None"
+                "incident_id": str(item.get("incident_id")),
+                "success": bool(item.get("success")),
+                "created_at": _iso(item.get("created_at")),
+            }
+            for item in (dict(raw) for raw in recent_unverified_rows)
+        ],
+    }
+
+
+def _iso(value: Any) -> str | None:
+    return value.isoformat() if hasattr(value, "isoformat") else None  # None for None and any value lacking .isoformat()
+
+
+def _int_or_none(value: Any) -> int | None:
+    return int(value) if value is not None else None  # explicit None check so that 0 stays 0
+
+
+def _overall_status(
+    metrics: list[dict[str, Any]],
+    evaluable: list[dict[str, Any]],
+    verification_coverage: dict[str, Any] | None = None,
+) -> str:
    if any(metric.get("status") == "violated" for metric in metrics):
        return "violated"
+    if verification_coverage and verification_coverage.get("status") in {"violated", "warning"}:
+        return str(verification_coverage["status"])
    if any(metric.get("status") == "warning" for metric in metrics):
        return "warning"
    if evaluable and any(metric.get("status") == "skipped_low_volume" for metric in metrics):
--- a/apps/api/tests/test_adr100_slo_status_service.py
+++ b/apps/api/tests/test_adr100_slo_status_service.py
@@ -4,7 +4,10 @@ from typing import Any

 import pytest

-from src.services.adr100_slo_status_service import Adr100SloStatusService
+from src.services.adr100_slo_status_service import (
+    Adr100SloStatusService,
+    _build_verification_coverage_payload,
+)


 class _FakePrometheusResponse:
@@ -39,6 +42,31 @@ class _FakePrometheusClient:
        })


+async def _low_volume_coverage(self):  # noqa: ANN001
+    return {  # stand-in for _fetch_verification_coverage: the "no auto-repair executions" payload, no DB access
+        "schema_version": "adr100_verification_coverage_v1",
+        "source": "postgresql",
+        "window": "24h",
+        "status": "skipped_low_volume",
+        "reason": "no_auto_repair_executions_24h",
+        "evaluable": False,
+        "total_auto": 0,
+        "successful_auto": 0,
+        "verified_auto": 0,
+        "verified_success": 0,
+        "verified_non_success": 0,
+        "unverified_auto": 0,
+        "coverage_rate": None,
+        "verification_success_rate": None,
+        "last_auto_at": None,
+        "last_verified_auto_at": None,
+        "last_verification_evidence_at": None,
+        "latest_auto_age_seconds": None,
+        "last_verified_auto_age_seconds": None,
+        "recent_unverified": [],
+    }
+
+
 @pytest.mark.asyncio
 async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
    values = {
@@ -52,6 +80,11 @@ async def test_fetch_report_marks_ratio_slos_low_volume(monkeypatch):
        "httpx.AsyncClient",
        lambda *args, **kwargs: _FakePrometheusClient(values),
    )
+    monkeypatch.setattr(
+        Adr100SloStatusService,
+        "_fetch_verification_coverage",
+        _low_volume_coverage,
+    )

    report = await Adr100SloStatusService().fetch_report()

@@ -79,6 +112,11 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
        "httpx.AsyncClient",
        lambda *args, **kwargs: _FakePrometheusClient(values),
    )
+    monkeypatch.setattr(
+        Adr100SloStatusService,
+        "_fetch_verification_coverage",
+        _low_volume_coverage,
+    )

    report = await Adr100SloStatusService().fetch_report()

@@ -87,3 +125,53 @@ async def test_fetch_report_classifies_hard_red_line_violation(monkeypatch):
    assert by_name["autonomy_rate"]["sample_count"] == 6
    assert by_name["km_growth_rate"]["status"] == "violated"
    assert report["overall_status"] == "violated"
+
+
+def test_verification_coverage_payload_flags_backlog():
+    payload = _build_verification_coverage_payload(
+        {
+            "total_auto": 7,
+            "successful_auto": 6,
+            "verified_auto": 5,
+            "verified_success": 4,
+            "verified_non_success": 1,
+            "unverified_auto": 2,
+            "last_auto_at": None,
+            "last_verified_auto_at": None,
+            "last_verification_evidence_at": None,
+            "latest_auto_age_seconds": 90,
+            "last_verified_auto_age_seconds": 120,
+        },
+        [
+            {
+                "id": "are-1",
+                "incident_id": "INC-1",
+                "success": True,
+                "created_at": None,
+            },
+        ],
+    )
+
+    assert payload["status"] == "warning"
+    assert payload["reason"] == "verification_backlog_present"  # backlog (2) wins over non-success (1)
+    assert payload["coverage_rate"] == pytest.approx(5 / 7)
+    assert payload["verification_success_rate"] == pytest.approx(4 / 5)
+    assert payload["recent_unverified"][0]["incident_id"] == "INC-1"
+
+
+def test_verification_coverage_payload_skips_when_no_auto_repair():
+    payload = _build_verification_coverage_payload(
+        {  # timestamp / age keys deliberately omitted: the builder must tolerate missing keys
+            "total_auto": 0,
+            "successful_auto": 0,
+            "verified_auto": 0,
+            "verified_success": 0,
+            "verified_non_success": 0,
+            "unverified_auto": 0,
+        },
+        [],
+    )
+
+    assert payload["status"] == "skipped_low_volume"  # total_auto == 0 is checked before anything else
+    assert payload["reason"] == "no_auto_repair_executions_24h"
+    assert payload["evaluable"] is False
--- a/apps/web/messages/en.json
+++ b/apps/web/messages/en.json
@@ -1399,6 +1399,32 @@
      "compliance": {
        "title": "Overall Compliance",
        "target": "Target ≥ 95%"
+      },
+      "coverage": {
+        "title": "Verification Coverage",
+        "subtitle": "Auto-repair executions and verifier writeback in the last {window}",
+        "totalAuto": "Auto repairs",
+        "verifiedAuto": "Verified",
+        "unverifiedAuto": "Unverified",
+        "coverageRate": "Coverage",
+        "successRate": "Verification success rate",
+        "lastVerified": "Last verified execution",
+        "reasonLabel": "Reason",
+        "state": {
+          "ok": "OK",
+          "warning": "Needs tracking",
+          "violated": "Hard red line",
+          "skipped_low_volume": "Waiting for samples",
+          "no_data": "No data",
+          "error": "Query failed"
+        },
+        "reason": {
+          "none": "None",
+          "no_auto_repair_executions_24h": "No auto-repair executions in the last 24h",
+          "verification_backlog_present": "Some auto repairs are missing verification results",
+          "non_success_verification_present": "Some verifications ended as degraded / failed / timeout",
+          "postgresql_query_error": "PostgreSQL query failed"
+        }
      }
    },
    "events": {
--- a/apps/web/messages/zh-TW.json
+++ b/apps/web/messages/zh-TW.json
@@ -1400,6 +1400,32 @@
      "compliance": {
        "title": "整體合規率",
        "target": "目標 ≥ 95%"
+      },
+      "coverage": {
+        "title": "驗證覆蓋率",
+        "subtitle": "近 {window} 自動修復執行與 verifier 寫回狀態",
+        "totalAuto": "自動修復",
+        "verifiedAuto": "已驗證",
+        "unverifiedAuto": "待驗證",
+        "coverageRate": "覆蓋率",
+        "successRate": "成功驗證",
+        "lastVerified": "最後已驗證執行",
+        "reasonLabel": "原因",
+        "state": {
+          "ok": "正常",
+          "warning": "需追蹤",
+          "violated": "硬紅線",
+          "skipped_low_volume": "等待樣本",
+          "no_data": "沒有資料",
+          "error": "查詢失敗"
+        },
+        "reason": {
+          "none": "無",
+          "no_auto_repair_executions_24h": "近 24h 無自動修復執行",
+          "verification_backlog_present": "有自動修復尚未寫入驗證結果",
+          "non_success_verification_present": "存在 degraded / failed / timeout 驗證結果",
+          "postgresql_query_error": "PostgreSQL 查詢失敗"
+        }
      }
    },
    "events": {
--- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx
+++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx
@@ -46,6 +46,7 @@ interface SloApiResponse {
  adr100?: {
    overall_status?: string
    overall_compliance?: number | null
+    verification_coverage?: Adr100VerificationCoverage
    metrics?: Array<{
      name: SloMetric['name']
      value: number | null
@@ -61,6 +62,30 @@ interface SloApiResponse {
  computed_at?: string
 }

+interface Adr100VerificationCoverage {
+  status: 'ok' | 'warning' | 'violated' | 'skipped_low_volume' | 'no_data' | 'error'
+  reason?: string | null  // machine-readable code; mapped to a message key by coverageReasonKey
+  window?: string
+  total_auto: number
+  successful_auto: number
+  verified_auto: number
+  verified_success: number
+  verified_non_success: number
+  unverified_auto: number
+  coverage_rate?: number | null  // fraction; multiplied by 100 for display in formatPercent
+  verification_success_rate?: number | null  // fraction; multiplied by 100 for display in formatPercent
+  last_auto_at?: string | null
+  last_verified_auto_at?: string | null  // rendered as-is by VerificationCoveragePanel
+  latest_auto_age_seconds?: number | null
+  last_verified_auto_age_seconds?: number | null
+  recent_unverified?: Array<{
+    id: string
+    incident_id: string
+    success: boolean
+    created_at?: string | null
+  }>
+}
+
 interface SummaryApiResponse {
  data?: ViolationDataPoint[]
  event_types?: string[]
@@ -79,6 +104,25 @@ function mapStatus(s: string): SloMetric['status'] {
  return 'critical'
 }

+function coverageTone(status?: Adr100VerificationCoverage['status']): string {
+  if (status === 'ok') return '#22C55E'
+  if (status === 'warning') return '#F59E0B'
+  if (!status || status === 'skipped_low_volume' || status === 'no_data') return '#87867f'  // nothing to evaluate: neutral grey
+  return '#FF3300'  // remaining statuses ('violated', 'error') fall through to red
+}
+
+function coverageReasonKey(reason?: string | null): string {
+  if (reason === 'no_auto_repair_executions_24h') return reason  // allow-list: only codes with a message entry pass through
+  if (reason === 'verification_backlog_present') return reason
+  if (reason === 'non_success_verification_present') return reason
+  if (reason === 'postgresql_query_error') return reason
+  return 'none'  // null, undefined and unknown codes all fall back to the 'none' message key
+}
+
+function formatPercent(value?: number | null): string {
+  return value == null ? '--' : `${(value * 100).toFixed(1)}%`  // loose == null also catches undefined; input is a fraction, not a percentage
+}
+
 function buildMetrics(api: SloApiResponse): SloMetric[] {
  const adr100Metrics = api.adr100?.metrics
  if (adr100Metrics?.length) {
@@ -133,6 +177,68 @@ function buildMetrics(api: SloApiResponse): SloMetric[] {
  })
 }

+function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) {  // renders even when coverage is undefined
+  const t = useTranslations('governance.slo.coverage')
+  const color = coverageTone(coverage?.status)  // also reused below with hex-alpha suffixes for the badge border/background
+  const rows = [  // missing values fall back to '--'
+    { label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') },
+    { label: t('verifiedAuto'), value: String(coverage?.verified_auto ?? '--') },
+    { label: t('unverifiedAuto'), value: String(coverage?.unverified_auto ?? '--') },
+    { label: t('coverageRate'), value: formatPercent(coverage?.coverage_rate) },
+  ]
+
+  return (
+    <GlassCard variant="subtle" padding="md">
+      <div style={{ display: 'flex', flexDirection: 'column', gap: 12 }}>
+        <div style={{ display: 'flex', alignItems: 'flex-start', justifyContent: 'space-between', gap: 12 }}>
+          <div style={{ minWidth: 0 }}>
+            <div style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
+              {t('title')}
+            </div>
+            <div style={{ marginTop: 4, fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45 }}>
+              {t('subtitle', { window: coverage?.window ?? '24h' })}
+            </div>
+          </div>
+          <div style={{
+            flexShrink: 0,
+            display: 'inline-flex',
+            alignItems: 'center',
+            minHeight: 26,
+            padding: '4px 8px',
+            borderRadius: 6,
+            border: `0.5px solid ${color}40`,
+            background: `${color}12`,
+            fontFamily: "'DM Mono', monospace",
+            fontSize: 10,
+            color,
+          }}>
+            {t(`state.${coverage?.status ?? 'no_data'}`)}
+          </div>
+        </div>
+
+        <div style={{ display: 'grid', gridTemplateColumns: 'repeat(4, minmax(120px, 1fr))', gap: 10 }} className="slo-coverage-grid">
+          {rows.map(row => (
+            <div key={row.label} style={{ minWidth: 0 }}>
+              <div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginBottom: 3 }}>
+                {row.label}
+              </div>
+              <div style={{ fontFamily: 'Syne, sans-serif', fontSize: 18, fontWeight: 700, color: '#141413', letterSpacing: 0 }}>
+                {row.value}
+              </div>
+            </div>
+          ))}
+        </div>
+
+        <div style={{ display: 'flex', flexWrap: 'wrap', gap: '8px 14px', fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45 }}>
+          <span>{t('reasonLabel')} {t(`reason.${coverageReasonKey(coverage?.reason)}`)}</span>
+          <span>{t('successRate')} {formatPercent(coverage?.verification_success_rate)}</span>
+          <span>{t('lastVerified')} {coverage?.last_verified_auto_at ?? '--'}</span>
+        </div>
+      </div>
+    </GlassCard>
+  )
+}
+
 // =============================================================================
 // Component
 // =============================================================================
@@ -173,6 +279,7 @@ export function SloTab() {

  const metrics = sloData ? buildMetrics(sloData) : []
  const compliance = sloData?.adr100?.overall_compliance ?? sloData?.overall_compliance ?? null
+  const verificationCoverage = sloData?.adr100?.verification_coverage

  const chartData: ViolationDataPoint[] = summaryData?.data ?? []
  const eventTypes: string[] = summaryData?.event_types ?? []
@@ -235,6 +342,8 @@ export function SloTab() {
        }
      </div>

+      {!sloLoading && <VerificationCoveragePanel coverage={verificationCoverage} />}
+
      {/* Violation timeline chart */}
      <SloViolationChart
        data={chartData}
@@ -248,6 +357,7 @@ export function SloTab() {
        .slo-kpi-grid > * { flex: 1; min-width: 200px; }
        @media (max-width: 640px) {
          .slo-kpi-grid > * { flex: 0 0 100%; min-width: 0; }
+          .slo-coverage-grid { grid-template-columns: repeat(2, minmax(0, 1fr)) !important; }
        }
      `}</style>
    </div>