diff --git a/apps/api/src/services/adr100_slo_status_service.py b/apps/api/src/services/adr100_slo_status_service.py index 1b915120..582217d4 100644 --- a/apps/api/src/services/adr100_slo_status_service.py +++ b/apps/api/src/services/adr100_slo_status_service.py @@ -190,6 +190,9 @@ class Adr100SloStatusService: recent_rows = ( await db.execute(text(_VERIFICATION_COVERAGE_RECENT_SQL)) ).mappings().all() + recent_non_success_rows = ( + await db.execute(text(_VERIFICATION_COVERAGE_NON_SUCCESS_SQL)) + ).mappings().all() except Exception as exc: logger.warning("adr100_verification_coverage_query_error", error=str(exc)) return { @@ -213,9 +216,18 @@ class Adr100SloStatusService: "latest_auto_age_seconds": None, "last_verified_auto_age_seconds": None, "recent_unverified": [], + "recent_non_success": [], + "non_success_breakdown": { + "by_verification_result": [], + "by_failure_class": [], + }, } - return _build_verification_coverage_payload(summary_row, recent_rows) + return _build_verification_coverage_payload( + summary_row, + recent_rows, + recent_non_success_rows, + ) _VERIFICATION_COVERAGE_SQL = """ @@ -291,6 +303,65 @@ _VERIFICATION_COVERAGE_RECENT_SQL = """ """ +_VERIFICATION_COVERAGE_NON_SUCCESS_SQL = """ + WITH recent_auto AS ( + SELECT + id, + incident_id, + success, + playbook_id, + playbook_name, + triggered_by, + risk_level, + error_message, + created_at + FROM auto_repair_executions + WHERE created_at >= NOW() - INTERVAL '24 hours' + ), + per_auto AS ( + SELECT + are.id AS auto_repair_id, + are.incident_id, + are.success AS auto_success, + are.playbook_id, + are.playbook_name, + are.triggered_by, + are.risk_level, + left(coalesce(are.error_message, ''), 240) AS auto_error, + are.created_at AS auto_created_at, + latest.verification_result, + latest.collected_at AS verification_collected_at, + left(coalesce(latest.post_execution_state::text, ''), 700) AS post_state_text, + left(coalesce(latest.evidence_summary, ''), 300) AS evidence_summary + FROM recent_auto are + LEFT JOIN LATERAL ( + SELECT + ev.verification_result, + ev.collected_at, + ev.post_execution_state, + ev.evidence_summary + FROM incident_evidence ev + WHERE ev.incident_id = are.incident_id + AND ev.verification_result IS NOT NULL + ORDER BY ev.collected_at DESC + LIMIT 1 + ) latest ON TRUE + ) + SELECT + p.*, + i.status::text AS incident_status, + i.severity::text AS incident_severity, + i.alert_category, + i.alertname + FROM per_auto p + LEFT JOIN incidents i ON i.incident_id = p.incident_id + WHERE p.verification_result IS NOT NULL + AND p.verification_result <> 'success' + ORDER BY p.auto_created_at DESC + LIMIT 8 +""" + + async def _query_prometheus_value( client: httpx.AsyncClient, prom_url: str, @@ -371,6 +442,7 @@ def _classify_status(value: float, definition: Adr100SloDefinition) -> str: def _build_verification_coverage_payload( summary_row: Any, recent_unverified_rows: Any, + recent_non_success_rows: Any = (), ) -> dict[str, Any]: row = dict(summary_row) total_auto = int(row.get("total_auto") or 0) @@ -398,6 +470,10 @@ def _build_verification_coverage_payload( coverage_rate = (verified_auto / total_auto) if total_auto else None verification_success_rate = (verified_success / verified_auto) if verified_auto else None + recent_non_success = [ + _non_success_finding_payload(dict(raw)) + for raw in recent_non_success_rows + ] return { "schema_version": "adr100_verification_coverage_v1", @@ -428,9 +504,96 @@ def _build_verification_coverage_payload( } for item in (dict(raw) for raw in recent_unverified_rows) ], + "recent_non_success": recent_non_success, + "non_success_breakdown": { + "by_verification_result": _count_breakdown( + item["verification_result"] for item in recent_non_success + ), + "by_failure_class": _count_breakdown( + item["failure_class"] for item in recent_non_success + ), + }, } +def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]: + failure_class = _classify_non_success_failure(row) + return { + "auto_repair_id": str(row.get("auto_repair_id")), + "incident_id": str(row.get("incident_id")), + "incident_status": str(row.get("incident_status") or "unknown"), + "incident_severity": str(row.get("incident_severity") or "unknown"), + "alert_category": row.get("alert_category"), + "alertname": row.get("alertname"), + "auto_success": bool(row.get("auto_success")), + "playbook_id": row.get("playbook_id"), + "playbook_name": row.get("playbook_name"), + "triggered_by": row.get("triggered_by"), + "risk_level": row.get("risk_level"), + "verification_result": str(row.get("verification_result") or "unknown"), + "failure_class": failure_class, + "next_step": _next_step_for_failure_class(failure_class), + "auto_error_excerpt": _short_text(row.get("auto_error"), 180), + "evidence_excerpt": _short_text(row.get("evidence_summary"), 180), + "auto_created_at": _iso(row.get("auto_created_at")), + "verification_collected_at": _iso(row.get("verification_collected_at")), + } + + +def _classify_non_success_failure(row: dict[str, Any]) -> str: + combined = " ".join( + str(row.get(key) or "") + for key in ("auto_error", "post_state_text", "evidence_summary") + ).lower() + if "unsupported scheme" in combined: + return "unsupported_action_scheme" + if "missing_query_parameter" in combined: + return "verifier_missing_promql" + if "empty_pod_name" in combined: + return "verifier_target_missing_pod" + if not bool(row.get("auto_success")): + return "auto_repair_execution_failed" + + result = str(row.get("verification_result") or "").lower() + if result in {"failed", "timeout"}: + return f"verification_{result}" + return "verification_degraded" + + +def _next_step_for_failure_class(failure_class: str) -> str: + if failure_class == "unsupported_action_scheme": + return "normalize_playbook_executor" + if failure_class == "verifier_missing_promql": + return "add_verifier_query_template" + if failure_class == "verifier_target_missing_pod": + return "map_verifier_target" + if failure_class == "auto_repair_execution_failed": + return "review_auto_repair_execution" + if failure_class in {"verification_failed", "verification_timeout"}: + return "escalate_verification_failure" + return "review_degraded_verification" + + +def _count_breakdown(values: Any) -> list[dict[str, Any]]: + counts: dict[str, int] = {} + for value in values: + key = str(value or "unknown") + counts[key] = counts.get(key, 0) + 1 + return [ + {"name": name, "count": count} + for name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0])) + ] + + +def _short_text(value: Any, limit: int) -> str | None: + if value is None: + return None + text = " ".join(str(value).split()) + if not text: + return None + return text[:limit] + + def _iso(value: Any) -> str | None: return value.isoformat() if hasattr(value, "isoformat") else None diff --git a/apps/api/tests/test_adr100_slo_status_service.py b/apps/api/tests/test_adr100_slo_status_service.py index 10aaa95c..0117a55e 100644 --- a/apps/api/tests/test_adr100_slo_status_service.py +++ b/apps/api/tests/test_adr100_slo_status_service.py @@ -64,6 +64,11 @@ async def _low_volume_coverage(self): # noqa: ANN001 "latest_auto_age_seconds": None, "last_verified_auto_age_seconds": None, "recent_unverified": [], + "recent_non_success": [], + "non_success_breakdown": { + "by_verification_result": [], + "by_failure_class": [], + }, } @@ -150,6 +155,27 @@ def test_verification_coverage_payload_flags_backlog(): "created_at": None, }, ], + [ + { + "auto_repair_id": "are-2", + "incident_id": "INC-2", + "incident_status": "INVESTIGATING", + "incident_severity": "P2", + "alert_category": "infrastructure", + "alertname": "DockerContainerMemoryLimitPressure", + "auto_success": False, + "playbook_id": "PB-1", + "playbook_name": "Docker pressure playbook", + "triggered_by": "auto_repair", + "risk_level": "LOW", + "verification_result": "degraded", + "auto_error": "FAILED: Unsupported scheme: 'ssh {host}'", + "post_state_text": '{"k8s_get_pod_logs": {"error": "empty_pod_name"}}', + "evidence_summary": "result=degraded", + "auto_created_at": None, + "verification_collected_at": None, + }, + ], ) assert payload["status"] == "warning" @@ -157,6 +183,14 @@ def test_verification_coverage_payload_flags_backlog(): assert payload["coverage_rate"] == pytest.approx(5 / 7) assert payload["verification_success_rate"] == pytest.approx(4 / 5) assert payload["recent_unverified"][0]["incident_id"] == "INC-1" + assert payload["recent_non_success"][0]["failure_class"] == "unsupported_action_scheme" + assert payload["recent_non_success"][0]["next_step"] == "normalize_playbook_executor" + assert payload["non_success_breakdown"]["by_failure_class"] == [ + {"name": "unsupported_action_scheme", "count": 1}, + ] + assert payload["non_success_breakdown"]["by_verification_result"] == [ + {"name": "degraded", "count": 1}, + ] def test_verification_coverage_payload_skips_when_no_auto_repair(): @@ -170,6 +204,7 @@ def test_verification_coverage_payload_skips_when_no_auto_repair(): "unverified_auto": 0, }, [], + [], ) assert payload["status"] == "skipped_low_volume" diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 36d2a79f..101c2adc 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -1410,6 +1410,8 @@ "successRate": "Success verification", "lastVerified": "Last verified execution", "reasonLabel": "Reason", + "failureBreakdown": "Non-success Verification Classes", + "recentFindings": "Recent Non-success Verification", "state": { "ok": "OK", "warning": "Needs tracking", @@ -1424,6 +1426,24 @@ "verification_backlog_present": "Some auto repairs are missing verification results", "non_success_verification_present": "degraded / failed / timeout verification exists", "postgresql_query_error": "PostgreSQL query failed" + }, + "failureClass": { + "unsupported_action_scheme": "PlayBook action misses supported executor", + "verifier_missing_promql": "Verifier missing PromQL query", + "verifier_target_missing_pod": "Verifier missing pod target", + "auto_repair_execution_failed": "Auto repair execution failed", + "verification_failed": "Verification failed", + "verification_timeout": "Verification timed out", + "verification_degraded": "Verification degraded", + "unknown": "Pending classification" + }, + "nextStep": { + "normalize_playbook_executor": "Fix PlayBook executor", + "add_verifier_query_template": "Add verifier query template", + "map_verifier_target": "Map verifier target", + "review_auto_repair_execution": "Inspect auto repair record", + "escalate_verification_failure": "Escalate verification failure", + "review_degraded_verification": "Review degraded evidence" } } }, diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 843604c8..af888818 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1411,6 +1411,8 @@ "successRate": "成功驗證", "lastVerified": "最後已驗證執行", "reasonLabel": "原因", + "failureBreakdown": "非成功驗證分類", + "recentFindings": "近期非成功驗證", "state": { "ok": "正常", "warning": "需追蹤", @@ -1425,6 +1427,24 @@ "verification_backlog_present": "有自動修復尚未寫入驗證結果", "non_success_verification_present": "存在 degraded / failed / timeout 驗證結果", "postgresql_query_error": "PostgreSQL 查詢失敗" + }, + "failureClass": { + "unsupported_action_scheme": "PlayBook 動作未走支援執行器", + "verifier_missing_promql": "Verifier 缺 PromQL 查詢", + "verifier_target_missing_pod": "Verifier 缺 Pod 目標", + "auto_repair_execution_failed": "自動修復執行失敗", + "verification_failed": "驗證失敗", + "verification_timeout": "驗證逾時", + "verification_degraded": "驗證降級", + "unknown": "待分類" + }, + "nextStep": { + "normalize_playbook_executor": "修正 PlayBook 執行器", + "add_verifier_query_template": "補 verifier 查詢模板", + "map_verifier_target": "補 verifier 目標映射", + "review_auto_repair_execution": "檢查自動修復紀錄", + "escalate_verification_failure": "升級驗證失敗", + "review_degraded_verification": "檢查降級證據" } } }, diff --git a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx index 63e35f4a..8685cbe4 100644 --- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx +++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx @@ -84,6 +84,30 @@ interface Adr100VerificationCoverage { success: boolean created_at?: string | null }> + recent_non_success?: Array<{ + auto_repair_id: string + incident_id: string + incident_status: string + incident_severity: string + alert_category?: string | null + alertname?: string | null + auto_success: boolean + playbook_id?: string | null + playbook_name?: string | null + triggered_by?: string | null + risk_level?: string | null + verification_result: string + failure_class: string + next_step: string + auto_error_excerpt?: string | null + evidence_excerpt?: string | null + auto_created_at?: string | null + verification_collected_at?: string | null + }> + non_success_breakdown?: { + by_verification_result?: Array<{ name: string; count: number }> + by_failure_class?: Array<{ name: string; count: number }> + } } interface SummaryApiResponse { @@ -123,6 +147,32 @@ function formatPercent(value?: number | null): string { return value == null ? '--' : `${(value * 100).toFixed(1)}%` } +function failureClassKey(value?: string | null): string { + if (value === 'unsupported_action_scheme') return value + if (value === 'verifier_missing_promql') return value + if (value === 'verifier_target_missing_pod') return value + if (value === 'auto_repair_execution_failed') return value + if (value === 'verification_failed') return value + if (value === 'verification_timeout') return value + if (value === 'verification_degraded') return value + return 'unknown' +} + +function nextStepKey(value?: string | null): string { + if (value === 'normalize_playbook_executor') return value + if (value === 'add_verifier_query_template') return value + if (value === 'map_verifier_target') return value + if (value === 'review_auto_repair_execution') return value + if (value === 'escalate_verification_failure') return value + if (value === 'review_degraded_verification') return value + return 'review_degraded_verification' +} + +function compactLabel(value?: string | null, fallback = '--'): string { + if (!value) return fallback + return value.length > 54 ? `${value.slice(0, 54)}...` : value +} + function buildMetrics(api: SloApiResponse): SloMetric[] { const adr100Metrics = api.adr100?.metrics if (adr100Metrics?.length) { @@ -186,6 +236,8 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification { label: t('unverifiedAuto'), value: String(coverage?.unverified_auto ?? '--') }, { label: t('coverageRate'), value: formatPercent(coverage?.coverage_rate) }, ] + const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? [] + const recentFindings = coverage?.recent_non_success ?? [] return ( @@ -234,6 +286,78 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification {t('successRate')} {formatPercent(coverage?.verification_success_rate)} {t('lastVerified')} {coverage?.last_verified_auto_at ?? '--'} + + {failureBreakdown.length > 0 && ( +
+
+ {t('failureBreakdown')} +
+
+ {failureBreakdown.map(item => ( + + {t(`failureClass.${failureClassKey(item.name)}`)} · {item.count} + + ))} +
+
+ )} + + {recentFindings.length > 0 && ( +
+
+ {t('recentFindings')} +
+
+ {recentFindings.slice(0, 5).map(item => ( +
+
+
+ {item.incident_id} +
+
+ {item.incident_status} · {item.verification_result} +
+
+
+
+ {t(`failureClass.${failureClassKey(item.failure_class)}`)} +
+
+ {compactLabel(item.alertname)} · {compactLabel(item.playbook_name)} +
+
+
+
+ {t(`nextStep.${nextStepKey(item.next_step)}`)} +
+
+ {compactLabel(item.auto_error_excerpt ?? item.evidence_excerpt)} +
+
+
+ ))} +
+
+ )}
) @@ -358,6 +482,7 @@ export function SloTab() { @media (max-width: 640px) { .slo-kpi-grid > * { flex: 0 0 100%; min-width: 0; } .slo-coverage-grid { grid-template-columns: repeat(2, minmax(0, 1fr)) !important; } + .slo-finding-row { grid-template-columns: 1fr !important; } } `}