feat(governance): explain verifier failures
This commit is contained in:
@@ -190,6 +190,9 @@ class Adr100SloStatusService:
|
||||
recent_rows = (
|
||||
await db.execute(text(_VERIFICATION_COVERAGE_RECENT_SQL))
|
||||
).mappings().all()
|
||||
recent_non_success_rows = (
|
||||
await db.execute(text(_VERIFICATION_COVERAGE_NON_SUCCESS_SQL))
|
||||
).mappings().all()
|
||||
except Exception as exc:
|
||||
logger.warning("adr100_verification_coverage_query_error", error=str(exc))
|
||||
return {
|
||||
@@ -213,9 +216,18 @@ class Adr100SloStatusService:
|
||||
"latest_auto_age_seconds": None,
|
||||
"last_verified_auto_age_seconds": None,
|
||||
"recent_unverified": [],
|
||||
"recent_non_success": [],
|
||||
"non_success_breakdown": {
|
||||
"by_verification_result": [],
|
||||
"by_failure_class": [],
|
||||
},
|
||||
}
|
||||
|
||||
return _build_verification_coverage_payload(summary_row, recent_rows)
|
||||
return _build_verification_coverage_payload(
|
||||
summary_row,
|
||||
recent_rows,
|
||||
recent_non_success_rows,
|
||||
)
|
||||
|
||||
|
||||
_VERIFICATION_COVERAGE_SQL = """
|
||||
@@ -291,6 +303,65 @@ _VERIFICATION_COVERAGE_RECENT_SQL = """
|
||||
"""
|
||||
|
||||
|
||||
# Most recent auto-repair executions whose verification did not succeed.
#
# recent_auto: auto_repair_executions rows created in the last 24 hours.
# per_auto:    each of those rows joined (LATERAL) to the most recently
#              collected incident_evidence row for the same incident_id that
#              has a non-NULL verification_result. Free-text columns are
#              truncated in SQL (240 / 700 / 300 characters).
# final:       keeps rows whose verification_result is present and is not
#              'success', adds incident status/severity/category/alertname,
#              and returns the 8 newest by execution creation time.
#
# NOTE(review): evidence is matched by incident_id only, so several executions
# on one incident would all see the same latest verification row -- confirm
# this is intended.
_VERIFICATION_COVERAGE_NON_SUCCESS_SQL = """
WITH recent_auto AS (
    SELECT
        id,
        incident_id,
        success,
        playbook_id,
        playbook_name,
        triggered_by,
        risk_level,
        error_message,
        created_at
    FROM auto_repair_executions
    WHERE created_at >= NOW() - INTERVAL '24 hours'
),
per_auto AS (
    SELECT
        are.id AS auto_repair_id,
        are.incident_id,
        are.success AS auto_success,
        are.playbook_id,
        are.playbook_name,
        are.triggered_by,
        are.risk_level,
        left(coalesce(are.error_message, ''), 240) AS auto_error,
        are.created_at AS auto_created_at,
        latest.verification_result,
        latest.collected_at AS verification_collected_at,
        left(coalesce(latest.post_execution_state::text, ''), 700) AS post_state_text,
        left(coalesce(latest.evidence_summary, ''), 300) AS evidence_summary
    FROM recent_auto are
    LEFT JOIN LATERAL (
        SELECT
            ev.verification_result,
            ev.collected_at,
            ev.post_execution_state,
            ev.evidence_summary
        FROM incident_evidence ev
        WHERE ev.incident_id = are.incident_id
          AND ev.verification_result IS NOT NULL
        ORDER BY ev.collected_at DESC
        LIMIT 1
    ) latest ON TRUE
)
SELECT
    p.*,
    i.status::text AS incident_status,
    i.severity::text AS incident_severity,
    i.alert_category,
    i.alertname
FROM per_auto p
LEFT JOIN incidents i ON i.incident_id = p.incident_id
WHERE p.verification_result IS NOT NULL
  AND p.verification_result <> 'success'
ORDER BY p.auto_created_at DESC
LIMIT 8
"""
|
||||
|
||||
|
||||
async def _query_prometheus_value(
|
||||
client: httpx.AsyncClient,
|
||||
prom_url: str,
|
||||
@@ -371,6 +442,7 @@ def _classify_status(value: float, definition: Adr100SloDefinition) -> str:
|
||||
def _build_verification_coverage_payload(
|
||||
summary_row: Any,
|
||||
recent_unverified_rows: Any,
|
||||
recent_non_success_rows: Any = (),
|
||||
) -> dict[str, Any]:
|
||||
row = dict(summary_row)
|
||||
total_auto = int(row.get("total_auto") or 0)
|
||||
@@ -398,6 +470,10 @@ def _build_verification_coverage_payload(
|
||||
|
||||
coverage_rate = (verified_auto / total_auto) if total_auto else None
|
||||
verification_success_rate = (verified_success / verified_auto) if verified_auto else None
|
||||
recent_non_success = [
|
||||
_non_success_finding_payload(dict(raw))
|
||||
for raw in recent_non_success_rows
|
||||
]
|
||||
|
||||
return {
|
||||
"schema_version": "adr100_verification_coverage_v1",
|
||||
@@ -428,9 +504,96 @@ def _build_verification_coverage_payload(
|
||||
}
|
||||
for item in (dict(raw) for raw in recent_unverified_rows)
|
||||
],
|
||||
"recent_non_success": recent_non_success,
|
||||
"non_success_breakdown": {
|
||||
"by_verification_result": _count_breakdown(
|
||||
item["verification_result"] for item in recent_non_success
|
||||
),
|
||||
"by_failure_class": _count_breakdown(
|
||||
item["failure_class"] for item in recent_non_success
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]:
    """Convert one non-success verification row into its API payload entry.

    Args:
        row: Mapping with the column names produced by the non-success
            verification query (``auto_repair_id``, ``incident_id``,
            ``auto_success``, ``verification_result``, ``auto_error``,
            ``evidence_summary``, ...). Missing keys are tolerated.

    Returns:
        A dict carrying the identifying fields, the derived ``failure_class``
        and ``next_step``, shortened error/evidence excerpts (at most 180
        characters each) and ISO-formatted timestamps.
    """
    failure_class = _classify_non_success_failure(row)
    return {
        # str() is applied unconditionally, so a missing id is emitted as the
        # literal string "None" rather than a null.
        "auto_repair_id": str(row.get("auto_repair_id")),
        "incident_id": str(row.get("incident_id")),
        "incident_status": str(row.get("incident_status") or "unknown"),
        "incident_severity": str(row.get("incident_severity") or "unknown"),
        "alert_category": row.get("alert_category"),
        "alertname": row.get("alertname"),
        "auto_success": bool(row.get("auto_success")),
        "playbook_id": row.get("playbook_id"),
        "playbook_name": row.get("playbook_name"),
        "triggered_by": row.get("triggered_by"),
        "risk_level": row.get("risk_level"),
        "verification_result": str(row.get("verification_result") or "unknown"),
        "failure_class": failure_class,
        "next_step": _next_step_for_failure_class(failure_class),
        "auto_error_excerpt": _short_text(row.get("auto_error"), 180),
        "evidence_excerpt": _short_text(row.get("evidence_summary"), 180),
        "auto_created_at": _iso(row.get("auto_created_at")),
        "verification_collected_at": _iso(row.get("verification_collected_at")),
    }
|
||||
|
||||
|
||||
def _classify_non_success_failure(row: dict[str, Any]) -> str:
    """Derive a failure class for a non-success verification row.

    The ``auto_error``, ``post_state_text`` and ``evidence_summary`` fields
    are concatenated and searched case-insensitively for known markers.
    Checks run in priority order and the first match wins:

    1. ``"unsupported scheme"``       -> ``unsupported_action_scheme``
    2. ``"missing_query_parameter"``  -> ``verifier_missing_promql``
    3. ``"empty_pod_name"``           -> ``verifier_target_missing_pod``
    4. falsy/missing ``auto_success`` -> ``auto_repair_execution_failed``
    5. ``verification_result`` of ``failed``/``timeout``
       -> ``verification_failed`` / ``verification_timeout``
    6. anything else                  -> ``verification_degraded``

    Args:
        row: Mapping describing one non-success row; missing keys are
            treated as empty.

    Returns:
        The failure class identifier.
    """
    combined = " ".join(
        str(row.get(key) or "")
        for key in ("auto_error", "post_state_text", "evidence_summary")
    ).lower()

    # Ordered: an earlier marker takes precedence when several are present.
    marker_classes = (
        ("unsupported scheme", "unsupported_action_scheme"),
        ("missing_query_parameter", "verifier_missing_promql"),
        ("empty_pod_name", "verifier_target_missing_pod"),
    )
    for marker, failure_class in marker_classes:
        if marker in combined:
            return failure_class

    if not row.get("auto_success"):
        return "auto_repair_execution_failed"

    result = str(row.get("verification_result") or "").lower()
    if result in {"failed", "timeout"}:
        return f"verification_{result}"
    return "verification_degraded"
|
||||
|
||||
|
||||
def _next_step_for_failure_class(failure_class: str) -> str:
    """Return the recommended next-step identifier for a failure class.

    Args:
        failure_class: A failure class identifier such as
            ``"unsupported_action_scheme"``.

    Returns:
        The matching next-step identifier. Any class without an explicit
        mapping falls back to ``"review_degraded_verification"``.
    """
    next_steps = {
        "unsupported_action_scheme": "normalize_playbook_executor",
        "verifier_missing_promql": "add_verifier_query_template",
        "verifier_target_missing_pod": "map_verifier_target",
        "auto_repair_execution_failed": "review_auto_repair_execution",
        "verification_failed": "escalate_verification_failure",
        "verification_timeout": "escalate_verification_failure",
    }
    return next_steps.get(failure_class, "review_degraded_verification")
|
||||
|
||||
|
||||
def _count_breakdown(values: Any) -> list[dict[str, Any]]:
    """Count occurrences of each value and return them as a ranked list.

    Args:
        values: Iterable of values. Each is converted with ``str``; falsy
            values (``None``, ``""``, ``0``) are counted under ``"unknown"``.

    Returns:
        A list of ``{"name": ..., "count": ...}`` dicts, ordered by
        descending count and then by name so the output is deterministic.
    """
    counts: dict[str, int] = {}
    for value in values:
        key = str(value or "unknown")
        counts[key] = counts.get(key, 0) + 1

    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return [{"name": name, "count": count} for name, count in ranked]
|
||||
|
||||
|
||||
def _short_text(value: Any, limit: int) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
text = " ".join(str(value).split())
|
||||
if not text:
|
||||
return None
|
||||
return text[:limit]
|
||||
|
||||
|
||||
def _iso(value: Any) -> str | None:
|
||||
return value.isoformat() if hasattr(value, "isoformat") else None
|
||||
|
||||
|
||||
@@ -64,6 +64,11 @@ async def _low_volume_coverage(self): # noqa: ANN001
|
||||
"latest_auto_age_seconds": None,
|
||||
"last_verified_auto_age_seconds": None,
|
||||
"recent_unverified": [],
|
||||
"recent_non_success": [],
|
||||
"non_success_breakdown": {
|
||||
"by_verification_result": [],
|
||||
"by_failure_class": [],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -150,6 +155,27 @@ def test_verification_coverage_payload_flags_backlog():
|
||||
"created_at": None,
|
||||
},
|
||||
],
|
||||
[
|
||||
{
|
||||
"auto_repair_id": "are-2",
|
||||
"incident_id": "INC-2",
|
||||
"incident_status": "INVESTIGATING",
|
||||
"incident_severity": "P2",
|
||||
"alert_category": "infrastructure",
|
||||
"alertname": "DockerContainerMemoryLimitPressure",
|
||||
"auto_success": False,
|
||||
"playbook_id": "PB-1",
|
||||
"playbook_name": "Docker pressure playbook",
|
||||
"triggered_by": "auto_repair",
|
||||
"risk_level": "LOW",
|
||||
"verification_result": "degraded",
|
||||
"auto_error": "FAILED: Unsupported scheme: 'ssh {host}'",
|
||||
"post_state_text": '{"k8s_get_pod_logs": {"error": "empty_pod_name"}}',
|
||||
"evidence_summary": "result=degraded",
|
||||
"auto_created_at": None,
|
||||
"verification_collected_at": None,
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
assert payload["status"] == "warning"
|
||||
@@ -157,6 +183,14 @@ def test_verification_coverage_payload_flags_backlog():
|
||||
assert payload["coverage_rate"] == pytest.approx(5 / 7)
|
||||
assert payload["verification_success_rate"] == pytest.approx(4 / 5)
|
||||
assert payload["recent_unverified"][0]["incident_id"] == "INC-1"
|
||||
assert payload["recent_non_success"][0]["failure_class"] == "unsupported_action_scheme"
|
||||
assert payload["recent_non_success"][0]["next_step"] == "normalize_playbook_executor"
|
||||
assert payload["non_success_breakdown"]["by_failure_class"] == [
|
||||
{"name": "unsupported_action_scheme", "count": 1},
|
||||
]
|
||||
assert payload["non_success_breakdown"]["by_verification_result"] == [
|
||||
{"name": "degraded", "count": 1},
|
||||
]
|
||||
|
||||
|
||||
def test_verification_coverage_payload_skips_when_no_auto_repair():
|
||||
@@ -170,6 +204,7 @@ def test_verification_coverage_payload_skips_when_no_auto_repair():
|
||||
"unverified_auto": 0,
|
||||
},
|
||||
[],
|
||||
[],
|
||||
)
|
||||
|
||||
assert payload["status"] == "skipped_low_volume"
|
||||
|
||||
@@ -1410,6 +1410,8 @@
|
||||
"successRate": "Success verification",
|
||||
"lastVerified": "Last verified execution",
|
||||
"reasonLabel": "Reason",
|
||||
"failureBreakdown": "Non-success Verification Classes",
|
||||
"recentFindings": "Recent Non-success Verification",
|
||||
"state": {
|
||||
"ok": "OK",
|
||||
"warning": "Needs tracking",
|
||||
@@ -1424,6 +1426,24 @@
|
||||
"verification_backlog_present": "Some auto repairs are missing verification results",
|
||||
"non_success_verification_present": "degraded / failed / timeout verification exists",
|
||||
"postgresql_query_error": "PostgreSQL query failed"
|
||||
},
|
||||
"failureClass": {
|
||||
"unsupported_action_scheme": "PlayBook action does not use a supported executor",
|
||||
"verifier_missing_promql": "Verifier missing PromQL query",
|
||||
"verifier_target_missing_pod": "Verifier missing pod target",
|
||||
"auto_repair_execution_failed": "Auto repair execution failed",
|
||||
"verification_failed": "Verification failed",
|
||||
"verification_timeout": "Verification timed out",
|
||||
"verification_degraded": "Verification degraded",
|
||||
"unknown": "Pending classification"
|
||||
},
|
||||
"nextStep": {
|
||||
"normalize_playbook_executor": "Fix PlayBook executor",
|
||||
"add_verifier_query_template": "Add verifier query template",
|
||||
"map_verifier_target": "Map verifier target",
|
||||
"review_auto_repair_execution": "Inspect auto repair record",
|
||||
"escalate_verification_failure": "Escalate verification failure",
|
||||
"review_degraded_verification": "Review degraded evidence"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -1411,6 +1411,8 @@
|
||||
"successRate": "成功驗證",
|
||||
"lastVerified": "最後已驗證執行",
|
||||
"reasonLabel": "原因",
|
||||
"failureBreakdown": "非成功驗證分類",
|
||||
"recentFindings": "近期非成功驗證",
|
||||
"state": {
|
||||
"ok": "正常",
|
||||
"warning": "需追蹤",
|
||||
@@ -1425,6 +1427,24 @@
|
||||
"verification_backlog_present": "有自動修復尚未寫入驗證結果",
|
||||
"non_success_verification_present": "存在 degraded / failed / timeout 驗證結果",
|
||||
"postgresql_query_error": "PostgreSQL 查詢失敗"
|
||||
},
|
||||
"failureClass": {
|
||||
"unsupported_action_scheme": "PlayBook 動作未走支援執行器",
|
||||
"verifier_missing_promql": "Verifier 缺 PromQL 查詢",
|
||||
"verifier_target_missing_pod": "Verifier 缺 Pod 目標",
|
||||
"auto_repair_execution_failed": "自動修復執行失敗",
|
||||
"verification_failed": "驗證失敗",
|
||||
"verification_timeout": "驗證逾時",
|
||||
"verification_degraded": "驗證降級",
|
||||
"unknown": "待分類"
|
||||
},
|
||||
"nextStep": {
|
||||
"normalize_playbook_executor": "修正 PlayBook 執行器",
|
||||
"add_verifier_query_template": "補 verifier 查詢模板",
|
||||
"map_verifier_target": "補 verifier 目標映射",
|
||||
"review_auto_repair_execution": "檢查自動修復紀錄",
|
||||
"escalate_verification_failure": "升級驗證失敗",
|
||||
"review_degraded_verification": "檢查降級證據"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -84,6 +84,30 @@ interface Adr100VerificationCoverage {
|
||||
success: boolean
|
||||
created_at?: string | null
|
||||
}>
|
||||
recent_non_success?: Array<{
|
||||
auto_repair_id: string
|
||||
incident_id: string
|
||||
incident_status: string
|
||||
incident_severity: string
|
||||
alert_category?: string | null
|
||||
alertname?: string | null
|
||||
auto_success: boolean
|
||||
playbook_id?: string | null
|
||||
playbook_name?: string | null
|
||||
triggered_by?: string | null
|
||||
risk_level?: string | null
|
||||
verification_result: string
|
||||
failure_class: string
|
||||
next_step: string
|
||||
auto_error_excerpt?: string | null
|
||||
evidence_excerpt?: string | null
|
||||
auto_created_at?: string | null
|
||||
verification_collected_at?: string | null
|
||||
}>
|
||||
non_success_breakdown?: {
|
||||
by_verification_result?: Array<{ name: string; count: number }>
|
||||
by_failure_class?: Array<{ name: string; count: number }>
|
||||
}
|
||||
}
|
||||
|
||||
interface SummaryApiResponse {
|
||||
@@ -123,6 +147,32 @@ function formatPercent(value?: number | null): string {
|
||||
return value == null ? '--' : `${(value * 100).toFixed(1)}%`
|
||||
}
|
||||
|
||||
/**
 * Normalises a failure class into a key that has a `failureClass.*`
 * translation. Recognised classes are returned unchanged; anything else
 * (including null/undefined) maps to 'unknown'.
 */
function failureClassKey(value?: string | null): string {
  switch (value) {
    case 'unsupported_action_scheme':
    case 'verifier_missing_promql':
    case 'verifier_target_missing_pod':
    case 'auto_repair_execution_failed':
    case 'verification_failed':
    case 'verification_timeout':
    case 'verification_degraded':
      return value
    default:
      return 'unknown'
  }
}
|
||||
|
||||
/**
 * Normalises a next-step identifier into a key that has a `nextStep.*`
 * translation. Recognised steps are returned unchanged; anything else
 * (including null/undefined) falls back to 'review_degraded_verification'.
 */
function nextStepKey(value?: string | null): string {
  switch (value) {
    case 'normalize_playbook_executor':
    case 'add_verifier_query_template':
    case 'map_verifier_target':
    case 'review_auto_repair_execution':
    case 'escalate_verification_failure':
      return value
    default:
      // Also covers 'review_degraded_verification' itself.
      return 'review_degraded_verification'
  }
}
|
||||
|
||||
/**
 * Returns `value` shortened for display: strings longer than 54 characters
 * are cut to 54 and suffixed with '...'. Empty, null or undefined input
 * yields `fallback`.
 */
function compactLabel(value?: string | null, fallback = '--'): string {
  const maxLength = 54
  if (!value) return fallback
  if (value.length <= maxLength) return value
  return `${value.slice(0, maxLength)}...`
}
|
||||
|
||||
function buildMetrics(api: SloApiResponse): SloMetric[] {
|
||||
const adr100Metrics = api.adr100?.metrics
|
||||
if (adr100Metrics?.length) {
|
||||
@@ -186,6 +236,8 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
{ label: t('unverifiedAuto'), value: String(coverage?.unverified_auto ?? '--') },
|
||||
{ label: t('coverageRate'), value: formatPercent(coverage?.coverage_rate) },
|
||||
]
|
||||
const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? []
|
||||
const recentFindings = coverage?.recent_non_success ?? []
|
||||
|
||||
return (
|
||||
<GlassCard variant="subtle" padding="md">
|
||||
@@ -234,6 +286,78 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
<span>{t('successRate')} {formatPercent(coverage?.verification_success_rate)}</span>
|
||||
<span>{t('lastVerified')} {coverage?.last_verified_auto_at ?? '--'}</span>
|
||||
</div>
|
||||
|
||||
{failureBreakdown.length > 0 && (
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
|
||||
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 11, fontWeight: 700, color: '#141413' }}>
|
||||
{t('failureBreakdown')}
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6 }}>
|
||||
{failureBreakdown.map(item => (
|
||||
<span key={item.name} style={{
|
||||
display: 'inline-flex',
|
||||
alignItems: 'center',
|
||||
minHeight: 24,
|
||||
padding: '3px 7px',
|
||||
borderRadius: 6,
|
||||
border: '0.5px solid #F59E0B40',
|
||||
background: 'rgba(245,158,11,0.08)',
|
||||
fontFamily: "'DM Mono', monospace",
|
||||
fontSize: 10,
|
||||
color: '#7c5a10',
|
||||
}}>
|
||||
{t(`failureClass.${failureClassKey(item.name)}`)} · {item.count}
|
||||
</span>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{recentFindings.length > 0 && (
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 8 }}>
|
||||
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 11, fontWeight: 700, color: '#141413' }}>
|
||||
{t('recentFindings')}
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
|
||||
{recentFindings.slice(0, 5).map(item => (
|
||||
<div key={`${item.auto_repair_id}-${item.incident_id}`} style={{
|
||||
display: 'grid',
|
||||
gridTemplateColumns: 'minmax(120px, 0.8fr) minmax(180px, 1.4fr) minmax(160px, 1fr)',
|
||||
gap: 10,
|
||||
alignItems: 'center',
|
||||
minWidth: 0,
|
||||
paddingTop: 7,
|
||||
borderTop: '0.5px solid rgba(20,20,19,0.08)',
|
||||
}} className="slo-finding-row">
|
||||
<div style={{ minWidth: 0 }}>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413' }}>
|
||||
{item.incident_id}
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2 }}>
|
||||
{item.incident_status} · {item.verification_result}
|
||||
</div>
|
||||
</div>
|
||||
<div style={{ minWidth: 0 }}>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#7c5a10' }}>
|
||||
{t(`failureClass.${failureClassKey(item.failure_class)}`)}
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2, overflowWrap: 'anywhere' }}>
|
||||
{compactLabel(item.alertname)} · {compactLabel(item.playbook_name)}
|
||||
</div>
|
||||
</div>
|
||||
<div style={{ minWidth: 0 }}>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413' }}>
|
||||
{t(`nextStep.${nextStepKey(item.next_step)}`)}
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2, overflowWrap: 'anywhere' }}>
|
||||
{compactLabel(item.auto_error_excerpt ?? item.evidence_excerpt)}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</GlassCard>
|
||||
)
|
||||
@@ -358,6 +482,7 @@ export function SloTab() {
|
||||
@media (max-width: 640px) {
|
||||
.slo-kpi-grid > * { flex: 0 0 100%; min-width: 0; }
|
||||
.slo-coverage-grid { grid-template-columns: repeat(2, minmax(0, 1fr)) !important; }
|
||||
.slo-finding-row { grid-template-columns: 1fr !important; }
|
||||
}
|
||||
`}</style>
|
||||
</div>
|
||||
|
||||
Reference in New Issue
Block a user