feat(governance): explain verifier failures
All checks were successful
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / tests (push) Successful in 1m21s
CD Pipeline / build-and-deploy (push) Successful in 3m23s
CD Pipeline / post-deploy-checks (push) Successful in 1m18s

This commit is contained in:
Your Name
2026-05-14 20:49:20 +08:00
parent dd269b195c
commit bad48dee04
5 changed files with 364 additions and 1 deletions

View File

@@ -190,6 +190,9 @@ class Adr100SloStatusService:
recent_rows = (
await db.execute(text(_VERIFICATION_COVERAGE_RECENT_SQL))
).mappings().all()
recent_non_success_rows = (
await db.execute(text(_VERIFICATION_COVERAGE_NON_SUCCESS_SQL))
).mappings().all()
except Exception as exc:
logger.warning("adr100_verification_coverage_query_error", error=str(exc))
return {
@@ -213,9 +216,18 @@ class Adr100SloStatusService:
"latest_auto_age_seconds": None,
"last_verified_auto_age_seconds": None,
"recent_unverified": [],
"recent_non_success": [],
"non_success_breakdown": {
"by_verification_result": [],
"by_failure_class": [],
},
}
return _build_verification_coverage_payload(summary_row, recent_rows)
return _build_verification_coverage_payload(
summary_row,
recent_rows,
recent_non_success_rows,
)
_VERIFICATION_COVERAGE_SQL = """
@@ -291,6 +303,65 @@ _VERIFICATION_COVERAGE_RECENT_SQL = """
"""
# Query for the most recent auto-repair executions whose latest verification
# result is anything other than 'success'.
#
# Steps, as written in the SQL below:
#   1. recent_auto: auto_repair_executions rows created in the last 24 hours.
#   2. per_auto: each execution is paired (LEFT JOIN LATERAL ... LIMIT 1) with
#      the most recently collected incident_evidence row for the same
#      incident_id that has a non-NULL verification_result. Free-text columns
#      are truncated server-side (240 / 700 / 300 characters).
#   3. Final SELECT: adds incident status/severity/category/alertname, keeps
#      rows whose verification_result is non-NULL and <> 'success', newest
#      execution first, capped at 8 rows.
#
# NOTE(review): the lateral lookup matches on incident_id only, so several
# executions for one incident all receive the same latest evidence row, and
# that row may predate the execution -- confirm this is intended.
_VERIFICATION_COVERAGE_NON_SUCCESS_SQL = """
WITH recent_auto AS (
SELECT
id,
incident_id,
success,
playbook_id,
playbook_name,
triggered_by,
risk_level,
error_message,
created_at
FROM auto_repair_executions
WHERE created_at >= NOW() - INTERVAL '24 hours'
),
per_auto AS (
SELECT
are.id AS auto_repair_id,
are.incident_id,
are.success AS auto_success,
are.playbook_id,
are.playbook_name,
are.triggered_by,
are.risk_level,
left(coalesce(are.error_message, ''), 240) AS auto_error,
are.created_at AS auto_created_at,
latest.verification_result,
latest.collected_at AS verification_collected_at,
left(coalesce(latest.post_execution_state::text, ''), 700) AS post_state_text,
left(coalesce(latest.evidence_summary, ''), 300) AS evidence_summary
FROM recent_auto are
LEFT JOIN LATERAL (
SELECT
ev.verification_result,
ev.collected_at,
ev.post_execution_state,
ev.evidence_summary
FROM incident_evidence ev
WHERE ev.incident_id = are.incident_id
AND ev.verification_result IS NOT NULL
ORDER BY ev.collected_at DESC
LIMIT 1
) latest ON TRUE
)
SELECT
p.*,
i.status::text AS incident_status,
i.severity::text AS incident_severity,
i.alert_category,
i.alertname
FROM per_auto p
LEFT JOIN incidents i ON i.incident_id = p.incident_id
WHERE p.verification_result IS NOT NULL
AND p.verification_result <> 'success'
ORDER BY p.auto_created_at DESC
LIMIT 8
"""
async def _query_prometheus_value(
client: httpx.AsyncClient,
prom_url: str,
@@ -371,6 +442,7 @@ def _classify_status(value: float, definition: Adr100SloDefinition) -> str:
def _build_verification_coverage_payload(
summary_row: Any,
recent_unverified_rows: Any,
recent_non_success_rows: Any = (),
) -> dict[str, Any]:
row = dict(summary_row)
total_auto = int(row.get("total_auto") or 0)
@@ -398,6 +470,10 @@ def _build_verification_coverage_payload(
coverage_rate = (verified_auto / total_auto) if total_auto else None
verification_success_rate = (verified_success / verified_auto) if verified_auto else None
recent_non_success = [
_non_success_finding_payload(dict(raw))
for raw in recent_non_success_rows
]
return {
"schema_version": "adr100_verification_coverage_v1",
@@ -428,9 +504,96 @@ def _build_verification_coverage_payload(
}
for item in (dict(raw) for raw in recent_unverified_rows)
],
"recent_non_success": recent_non_success,
"non_success_breakdown": {
"by_verification_result": _count_breakdown(
item["verification_result"] for item in recent_non_success
),
"by_failure_class": _count_breakdown(
item["failure_class"] for item in recent_non_success
),
},
}
def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]:
    """Shape one non-success verification row into a finding entry.

    Args:
        row: Mapping of column name to value for a single result row.

    Returns:
        A dict with identifiers, incident metadata, playbook metadata, the
        derived ``failure_class`` / ``next_step`` pair, trimmed excerpts and
        ISO timestamps. Key order is fixed.
    """
    klass = _classify_non_success_failure(row)

    # Identifiers are always passed through str(), so an absent id is
    # rendered as the text "None" rather than a null.
    finding: dict[str, Any] = {
        "auto_repair_id": str(row.get("auto_repair_id")),
        "incident_id": str(row.get("incident_id")),
    }
    for field in ("incident_status", "incident_severity"):
        finding[field] = str(row.get(field) or "unknown")
    for field in ("alert_category", "alertname"):
        finding[field] = row.get(field)
    finding["auto_success"] = bool(row.get("auto_success"))
    for field in ("playbook_id", "playbook_name", "triggered_by", "risk_level"):
        finding[field] = row.get(field)
    finding["verification_result"] = str(row.get("verification_result") or "unknown")
    finding["failure_class"] = klass
    finding["next_step"] = _next_step_for_failure_class(klass)
    # Excerpts are capped at 180 characters after whitespace collapsing.
    finding["auto_error_excerpt"] = _short_text(row.get("auto_error"), 180)
    finding["evidence_excerpt"] = _short_text(row.get("evidence_summary"), 180)
    finding["auto_created_at"] = _iso(row.get("auto_created_at"))
    finding["verification_collected_at"] = _iso(row.get("verification_collected_at"))
    return finding
def _classify_non_success_failure(row: dict[str, Any]) -> str:
combined = " ".join(
str(row.get(key) or "")
for key in ("auto_error", "post_state_text", "evidence_summary")
).lower()
if "unsupported scheme" in combined:
return "unsupported_action_scheme"
if "missing_query_parameter" in combined:
return "verifier_missing_promql"
if "empty_pod_name" in combined:
return "verifier_target_missing_pod"
if not bool(row.get("auto_success")):
return "auto_repair_execution_failed"
result = str(row.get("verification_result") or "").lower()
if result in {"failed", "timeout"}:
return f"verification_{result}"
return "verification_degraded"
def _next_step_for_failure_class(failure_class: str) -> str:
if failure_class == "unsupported_action_scheme":
return "normalize_playbook_executor"
if failure_class == "verifier_missing_promql":
return "add_verifier_query_template"
if failure_class == "verifier_target_missing_pod":
return "map_verifier_target"
if failure_class == "auto_repair_execution_failed":
return "review_auto_repair_execution"
if failure_class in {"verification_failed", "verification_timeout"}:
return "escalate_verification_failure"
return "review_degraded_verification"
def _count_breakdown(values: Any) -> list[dict[str, Any]]:
counts: dict[str, int] = {}
for value in values:
key = str(value or "unknown")
counts[key] = counts.get(key, 0) + 1
return [
{"name": name, "count": count}
for name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))
]
def _short_text(value: Any, limit: int) -> str | None:
if value is None:
return None
text = " ".join(str(value).split())
if not text:
return None
return text[:limit]
def _iso(value: Any) -> str | None:
return value.isoformat() if hasattr(value, "isoformat") else None

View File

@@ -64,6 +64,11 @@ async def _low_volume_coverage(self): # noqa: ANN001
"latest_auto_age_seconds": None,
"last_verified_auto_age_seconds": None,
"recent_unverified": [],
"recent_non_success": [],
"non_success_breakdown": {
"by_verification_result": [],
"by_failure_class": [],
},
}
@@ -150,6 +155,27 @@ def test_verification_coverage_payload_flags_backlog():
"created_at": None,
},
],
[
{
"auto_repair_id": "are-2",
"incident_id": "INC-2",
"incident_status": "INVESTIGATING",
"incident_severity": "P2",
"alert_category": "infrastructure",
"alertname": "DockerContainerMemoryLimitPressure",
"auto_success": False,
"playbook_id": "PB-1",
"playbook_name": "Docker pressure playbook",
"triggered_by": "auto_repair",
"risk_level": "LOW",
"verification_result": "degraded",
"auto_error": "FAILED: Unsupported scheme: 'ssh {host}'",
"post_state_text": '{"k8s_get_pod_logs": {"error": "empty_pod_name"}}',
"evidence_summary": "result=degraded",
"auto_created_at": None,
"verification_collected_at": None,
},
],
)
assert payload["status"] == "warning"
@@ -157,6 +183,14 @@ def test_verification_coverage_payload_flags_backlog():
assert payload["coverage_rate"] == pytest.approx(5 / 7)
assert payload["verification_success_rate"] == pytest.approx(4 / 5)
assert payload["recent_unverified"][0]["incident_id"] == "INC-1"
assert payload["recent_non_success"][0]["failure_class"] == "unsupported_action_scheme"
assert payload["recent_non_success"][0]["next_step"] == "normalize_playbook_executor"
assert payload["non_success_breakdown"]["by_failure_class"] == [
{"name": "unsupported_action_scheme", "count": 1},
]
assert payload["non_success_breakdown"]["by_verification_result"] == [
{"name": "degraded", "count": 1},
]
def test_verification_coverage_payload_skips_when_no_auto_repair():
@@ -170,6 +204,7 @@ def test_verification_coverage_payload_skips_when_no_auto_repair():
"unverified_auto": 0,
},
[],
[],
)
assert payload["status"] == "skipped_low_volume"

View File

@@ -1410,6 +1410,8 @@
"successRate": "Success verification",
"lastVerified": "Last verified execution",
"reasonLabel": "Reason",
"failureBreakdown": "Non-success Verification Classes",
"recentFindings": "Recent Non-success Verification",
"state": {
"ok": "OK",
"warning": "Needs tracking",
@@ -1424,6 +1426,24 @@
"verification_backlog_present": "Some auto repairs are missing verification results",
"non_success_verification_present": "degraded / failed / timeout verification exists",
"postgresql_query_error": "PostgreSQL query failed"
},
"failureClass": {
"unsupported_action_scheme": "PlayBook action misses supported executor",
"verifier_missing_promql": "Verifier missing PromQL query",
"verifier_target_missing_pod": "Verifier missing pod target",
"auto_repair_execution_failed": "Auto repair execution failed",
"verification_failed": "Verification failed",
"verification_timeout": "Verification timed out",
"verification_degraded": "Verification degraded",
"unknown": "Pending classification"
},
"nextStep": {
"normalize_playbook_executor": "Fix PlayBook executor",
"add_verifier_query_template": "Add verifier query template",
"map_verifier_target": "Map verifier target",
"review_auto_repair_execution": "Inspect auto repair record",
"escalate_verification_failure": "Escalate verification failure",
"review_degraded_verification": "Review degraded evidence"
}
}
},

View File

@@ -1411,6 +1411,8 @@
"successRate": "成功驗證",
"lastVerified": "最後已驗證執行",
"reasonLabel": "原因",
"failureBreakdown": "非成功驗證分類",
"recentFindings": "近期非成功驗證",
"state": {
"ok": "正常",
"warning": "需追蹤",
@@ -1425,6 +1427,24 @@
"verification_backlog_present": "有自動修復尚未寫入驗證結果",
"non_success_verification_present": "存在 degraded / failed / timeout 驗證結果",
"postgresql_query_error": "PostgreSQL 查詢失敗"
},
"failureClass": {
"unsupported_action_scheme": "PlayBook 動作未走支援執行器",
"verifier_missing_promql": "Verifier 缺 PromQL 查詢",
"verifier_target_missing_pod": "Verifier 缺 Pod 目標",
"auto_repair_execution_failed": "自動修復執行失敗",
"verification_failed": "驗證失敗",
"verification_timeout": "驗證逾時",
"verification_degraded": "驗證降級",
"unknown": "待分類"
},
"nextStep": {
"normalize_playbook_executor": "修正 PlayBook 執行器",
"add_verifier_query_template": "補 verifier 查詢模板",
"map_verifier_target": "補 verifier 目標映射",
"review_auto_repair_execution": "檢查自動修復紀錄",
"escalate_verification_failure": "升級驗證失敗",
"review_degraded_verification": "檢查降級證據"
}
}
},

View File

@@ -84,6 +84,30 @@ interface Adr100VerificationCoverage {
success: boolean
created_at?: string | null
}>
recent_non_success?: Array<{
auto_repair_id: string
incident_id: string
incident_status: string
incident_severity: string
alert_category?: string | null
alertname?: string | null
auto_success: boolean
playbook_id?: string | null
playbook_name?: string | null
triggered_by?: string | null
risk_level?: string | null
verification_result: string
failure_class: string
next_step: string
auto_error_excerpt?: string | null
evidence_excerpt?: string | null
auto_created_at?: string | null
verification_collected_at?: string | null
}>
non_success_breakdown?: {
by_verification_result?: Array<{ name: string; count: number }>
by_failure_class?: Array<{ name: string; count: number }>
}
}
interface SummaryApiResponse {
@@ -123,6 +147,32 @@ function formatPercent(value?: number | null): string {
return value == null ? '--' : `${(value * 100).toFixed(1)}%`
}
/**
 * Resolve a backend failure class to the suffix used for `failureClass.*`
 * translation lookups. Recognised classes pass through unchanged; any other
 * value, including null and undefined, resolves to 'unknown'.
 */
function failureClassKey(value?: string | null): string {
  switch (value) {
    case 'unsupported_action_scheme':
    case 'verifier_missing_promql':
    case 'verifier_target_missing_pod':
    case 'auto_repair_execution_failed':
    case 'verification_failed':
    case 'verification_timeout':
    case 'verification_degraded':
      return value
    default:
      return 'unknown'
  }
}
/**
 * Resolve a backend next-step value to the suffix used for `nextStep.*`
 * translation lookups. Recognised steps pass through unchanged; any other
 * value, including null and undefined, resolves to
 * 'review_degraded_verification'.
 */
function nextStepKey(value?: string | null): string {
  switch (value) {
    case 'normalize_playbook_executor':
    case 'add_verifier_query_template':
    case 'map_verifier_target':
    case 'review_auto_repair_execution':
    case 'escalate_verification_failure':
    case 'review_degraded_verification':
      return value
    default:
      return 'review_degraded_verification'
  }
}
/**
 * Produce a display label of bounded length.
 *
 * Returns `fallback` for null, undefined or empty input. Text longer than
 * 54 characters is cut to its first 54 characters followed by '...';
 * shorter text is returned unchanged.
 */
function compactLabel(value?: string | null, fallback = '--'): string {
  if (value == null || value === '') return fallback
  if (value.length <= 54) return value
  return `${value.slice(0, 54)}...`
}
function buildMetrics(api: SloApiResponse): SloMetric[] {
const adr100Metrics = api.adr100?.metrics
if (adr100Metrics?.length) {
@@ -186,6 +236,8 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
{ label: t('unverifiedAuto'), value: String(coverage?.unverified_auto ?? '--') },
{ label: t('coverageRate'), value: formatPercent(coverage?.coverage_rate) },
]
const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? []
const recentFindings = coverage?.recent_non_success ?? []
return (
<GlassCard variant="subtle" padding="md">
@@ -234,6 +286,78 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
<span>{t('successRate')} {formatPercent(coverage?.verification_success_rate)}</span>
<span>{t('lastVerified')} {coverage?.last_verified_auto_at ?? '--'}</span>
</div>
{failureBreakdown.length > 0 && (
<div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 11, fontWeight: 700, color: '#141413' }}>
{t('failureBreakdown')}
</div>
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6 }}>
{failureBreakdown.map(item => (
<span key={item.name} style={{
display: 'inline-flex',
alignItems: 'center',
minHeight: 24,
padding: '3px 7px',
borderRadius: 6,
border: '0.5px solid #F59E0B40',
background: 'rgba(245,158,11,0.08)',
fontFamily: "'DM Mono', monospace",
fontSize: 10,
color: '#7c5a10',
}}>
{t(`failureClass.${failureClassKey(item.name)}`)} · {item.count}
</span>
))}
</div>
</div>
)}
{recentFindings.length > 0 && (
<div style={{ display: 'flex', flexDirection: 'column', gap: 8 }}>
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 11, fontWeight: 700, color: '#141413' }}>
{t('recentFindings')}
</div>
<div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
{recentFindings.slice(0, 5).map(item => (
<div key={`${item.auto_repair_id}-${item.incident_id}`} style={{
display: 'grid',
gridTemplateColumns: 'minmax(120px, 0.8fr) minmax(180px, 1.4fr) minmax(160px, 1fr)',
gap: 10,
alignItems: 'center',
minWidth: 0,
paddingTop: 7,
borderTop: '0.5px solid rgba(20,20,19,0.08)',
}} className="slo-finding-row">
<div style={{ minWidth: 0 }}>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413' }}>
{item.incident_id}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2 }}>
{item.incident_status} · {item.verification_result}
</div>
</div>
<div style={{ minWidth: 0 }}>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#7c5a10' }}>
{t(`failureClass.${failureClassKey(item.failure_class)}`)}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2, overflowWrap: 'anywhere' }}>
{compactLabel(item.alertname)} · {compactLabel(item.playbook_name)}
</div>
</div>
<div style={{ minWidth: 0 }}>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413' }}>
{t(`nextStep.${nextStepKey(item.next_step)}`)}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2, overflowWrap: 'anywhere' }}>
{compactLabel(item.auto_error_excerpt ?? item.evidence_excerpt)}
</div>
</div>
</div>
))}
</div>
</div>
)}
</div>
</GlassCard>
)
@@ -358,6 +482,7 @@ export function SloTab() {
@media (max-width: 640px) {
.slo-kpi-grid > * { flex: 0 0 100%; min-width: 0; }
.slo-coverage-grid { grid-template-columns: repeat(2, minmax(0, 1fr)) !important; }
.slo-finding-row { grid-template-columns: 1fr !important; }
}
`}</style>
</div>