diff --git a/apps/api/src/services/adr100_slo_status_service.py b/apps/api/src/services/adr100_slo_status_service.py index 582217d4..e5fce3c8 100644 --- a/apps/api/src/services/adr100_slo_status_service.py +++ b/apps/api/src/services/adr100_slo_status_service.py @@ -221,6 +221,7 @@ class Adr100SloStatusService: "by_verification_result": [], "by_failure_class": [], }, + "remediation_queue": _remediation_queue_payload([]), } return _build_verification_coverage_payload( @@ -474,6 +475,7 @@ def _build_verification_coverage_payload( _non_success_finding_payload(dict(raw)) for raw in recent_non_success_rows ] + remediation_queue = _remediation_queue_payload(recent_non_success) return { "schema_version": "adr100_verification_coverage_v1", @@ -512,12 +514,17 @@ def _build_verification_coverage_payload( "by_failure_class": _count_breakdown( item["failure_class"] for item in recent_non_success ), + "by_remediation_status": _count_breakdown( + item["remediation_status"] for item in remediation_queue["items"] + ), }, + "remediation_queue": remediation_queue, } def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]: failure_class = _classify_non_success_failure(row) + remediation = _remediation_for_failure_class(failure_class) return { "auto_repair_id": str(row.get("auto_repair_id")), "incident_id": str(row.get("incident_id")), @@ -533,6 +540,10 @@ def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]: "verification_result": str(row.get("verification_result") or "unknown"), "failure_class": failure_class, "next_step": _next_step_for_failure_class(failure_class), + "remediation_status": remediation["status"], + "remediation_action": remediation["action"], + "remediation_owner": remediation["owner"], + "remediation_reason": remediation["reason"], "auto_error_excerpt": _short_text(row.get("auto_error"), 180), "evidence_excerpt": _short_text(row.get("evidence_summary"), 180), "auto_created_at": _iso(row.get("auto_created_at")), @@ -560,6 +571,55 
def _remediation_for_failure_class(failure_class: str) -> dict[str, str]:
    """Describe the read-only triage work item for a non-success class.

    The result is dashboard metadata only. Nothing here closes incidents,
    replays repairs, or approves write actions.

    Args:
        failure_class: Label assigned to a non-success verification finding.

    Returns:
        A freshly built dict with ``status``, ``action``, ``owner`` and
        ``reason`` keys (in that order). Labels without a dedicated entry
        receive the degraded-evidence manual-review item.
    """
    # Hard verifier failures and timeouts share one escalation item.
    hard_failure = (
        "manual_review",
        "escalate_verification_failure",
        "sre_operator",
        "verifier_returned_hard_failure",
    )
    # Each plan is (status, action, owner, reason).
    plans = {
        "unsupported_action_scheme": (
            "ready_for_replay",
            "replay_with_supported_executor",
            "auto_repair_executor",
            "executor_gateway_available_after_t23",
        ),
        "verifier_missing_promql": (
            "ready_for_reverify",
            "reverify_with_promql_template",
            "post_execution_verifier",
            "promql_template_available_after_t23",
        ),
        "verifier_target_missing_pod": (
            "needs_target_mapping",
            "map_target_and_reverify",
            "post_execution_verifier",
            "verifier_target_missing",
        ),
        "auto_repair_execution_failed": (
            "needs_playbook_ticket",
            "create_playbook_ticket",
            "solver_or_operator",
            "execution_failed_after_route_normalization",
        ),
        "verification_failed": hard_failure,
        "verification_timeout": hard_failure,
    }
    fallback = (
        "manual_review",
        "inspect_degraded_evidence",
        "sre_operator",
        "degraded_evidence_requires_human_context",
    )
    status, action, owner, reason = plans.get(failure_class, fallback)
    # Build a new dict on every call so callers never share mutable state.
    return {
        "status": status,
        "action": action,
        "owner": owner,
        "reason": reason,
    }
def _remediation_queue_payload(recent_non_success: list[dict[str, Any]]) -> dict[str, Any]:
    """Project recent non-success findings into the remediation queue read model.

    Args:
        recent_non_success: Finding payload dicts. Fields are read with
            ``.get``, so a missing key becomes ``None`` in the work item.

    Returns:
        A dict with the queue schema version and source, the work items,
        ``total``, the ``ready_for_ai`` / ``needs_human`` counters, and
        ``by_status`` / ``by_action`` breakdowns built by ``_count_breakdown``.
        An item whose status is in neither counter set is included in
        ``total`` only.
    """
    # Fields copied verbatim from each finding, in output order.
    passthrough = (
        "incident_id",
        "auto_repair_id",
        "alertname",
        "playbook_id",
        "failure_class",
        "verification_result",
        "remediation_status",
        "remediation_action",
        "remediation_owner",
        "remediation_reason",
    )
    timestamps = ("auto_created_at", "verification_collected_at")

    queue: list[dict[str, Any]] = [
        {
            "work_item_id": (
                f"verification:{finding.get('incident_id')}:{finding.get('auto_repair_id')}"
            ),
            **{field: finding.get(field) for field in passthrough},
            "source": "adr100_verification_coverage",
            **{field: finding.get(field) for field in timestamps},
        }
        for finding in recent_non_success
    ]

    ai_statuses = {"ready_for_replay", "ready_for_reverify"}
    human_statuses = {
        "needs_target_mapping",
        "needs_playbook_ticket",
        "manual_review",
    }
    statuses = [entry.get("remediation_status") for entry in queue]
    ai_ready_count = sum(1 for status in statuses if status in ai_statuses)
    human_count = sum(1 for status in statuses if status in human_statuses)

    return {
        "schema_version": "adr100_remediation_queue_v1",
        "source": "recent_non_success_read_model",
        "total": len(queue),
        "ready_for_ai": ai_ready_count,
        "needs_human": human_count,
        "items": queue,
        "by_status": _count_breakdown(
            entry.get("remediation_status") for entry in queue
        ),
        "by_action": _count_breakdown(
            entry.get("remediation_action") for entry in queue
        ),
    }
-69,6 +69,16 @@ async def _low_volume_coverage(self): # noqa: ANN001 "by_verification_result": [], "by_failure_class": [], }, + "remediation_queue": { + "schema_version": "adr100_remediation_queue_v1", + "source": "recent_non_success_read_model", + "total": 0, + "ready_for_ai": 0, + "needs_human": 0, + "items": [], + "by_status": [], + "by_action": [], + }, } @@ -185,12 +195,25 @@ def test_verification_coverage_payload_flags_backlog(): assert payload["recent_unverified"][0]["incident_id"] == "INC-1" assert payload["recent_non_success"][0]["failure_class"] == "unsupported_action_scheme" assert payload["recent_non_success"][0]["next_step"] == "normalize_playbook_executor" + assert payload["recent_non_success"][0]["remediation_status"] == "ready_for_replay" + assert payload["recent_non_success"][0]["remediation_action"] == ( + "replay_with_supported_executor" + ) assert payload["non_success_breakdown"]["by_failure_class"] == [ {"name": "unsupported_action_scheme", "count": 1}, ] assert payload["non_success_breakdown"]["by_verification_result"] == [ {"name": "degraded", "count": 1}, ] + assert payload["non_success_breakdown"]["by_remediation_status"] == [ + {"name": "ready_for_replay", "count": 1}, + ] + assert payload["remediation_queue"]["total"] == 1 + assert payload["remediation_queue"]["ready_for_ai"] == 1 + assert payload["remediation_queue"]["needs_human"] == 0 + assert payload["remediation_queue"]["items"][0]["work_item_id"] == ( + "verification:INC-2:are-2" + ) def test_verification_coverage_payload_skips_when_no_auto_repair(): @@ -210,3 +233,4 @@ def test_verification_coverage_payload_skips_when_no_auto_repair(): assert payload["status"] == "skipped_low_volume" assert payload["reason"] == "no_auto_repair_executions_24h" assert payload["evaluable"] is False + assert payload["remediation_queue"]["total"] == 0 diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 101c2adc..55d0714e 100644 --- a/apps/web/messages/en.json +++ 
b/apps/web/messages/en.json @@ -1412,6 +1412,8 @@ "reasonLabel": "Reason", "failureBreakdown": "Non-success Verification Classes", "recentFindings": "Recent Non-success Verification", + "remediationQueue": "Remediation Work Queue", + "queueSummary": "Total {total}; AI-ready {ready}; human {human}", "state": { "ok": "OK", "warning": "Needs tracking", @@ -1444,6 +1446,22 @@ "review_auto_repair_execution": "Inspect auto repair record", "escalate_verification_failure": "Escalate verification failure", "review_degraded_verification": "Review degraded evidence" + }, + "remediationStatus": { + "ready_for_replay": "Ready for replay", + "ready_for_reverify": "Ready to reverify", + "needs_target_mapping": "Needs target mapping", + "needs_playbook_ticket": "Needs ticket", + "manual_review": "Manual review", + "unknown": "Pending classification" + }, + "remediationAction": { + "replay_with_supported_executor": "Replay with supported executor", + "reverify_with_promql_template": "Reverify with PromQL template", + "map_target_and_reverify": "Map target and reverify", + "create_playbook_ticket": "Create PlayBook ticket", + "escalate_verification_failure": "Escalate verification failure", + "inspect_degraded_evidence": "Inspect degraded evidence" } } }, @@ -1717,6 +1735,9 @@ "autoRepair": { "title": "Low-risk Alertmanager auto-repair loop" }, + "remediationQueue": { + "title": "Non-success verification remediation queue" + }, "telegramCallbacks": { "title": "Telegram detail / history as DB truth-first" }, @@ -1736,6 +1757,7 @@ "gates": { "sourceDossier": "Inbound alerts must show received / incident_linked / source refs", "autoRepair": "Requires auto_repair, verification_result=success, and KM writeback", + "remediationQueue": "Every degraded / failed / timeout row must map to replay, reverify, ticket, or manual review", "telegramCallbacks": "Detail and history buttons cannot depend only on Redis TTL or stale snapshots", "governanceDispatch": "Governance alerts must enter dispatch 
and expose skipped / pending / repaired", "frontendConsole": "Completed and in-progress work must be trackable from the frontend", @@ -1745,6 +1767,7 @@ "evidence": { "channelEvents": "Recent Alertmanager channel events: {count}", "autoRepair": "Verified auto-repairs: {verified}/{evaluated}", + "remediationQueue": "Remediation work: {total}; AI-ready: {ready}; human: {human}", "telegramCallbacks": "Telegram callback lookup and history summary are being repaired", "governance": "Unresolved governance alerts: {unresolved}; pending dispatch: {queued}", "governanceUnavailable": "Governance events API is not responding; pending dispatch: {queued}", diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index af888818..f01bbc91 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1413,6 +1413,8 @@ "reasonLabel": "原因", "failureBreakdown": "非成功驗證分類", "recentFindings": "近期非成功驗證", + "remediationQueue": "補救工作佇列", + "queueSummary": "總數 {total};AI 可接手 {ready};人工 {human}", "state": { "ok": "正常", "warning": "需追蹤", @@ -1445,6 +1447,22 @@ "review_auto_repair_execution": "檢查自動修復紀錄", "escalate_verification_failure": "升級驗證失敗", "review_degraded_verification": "檢查降級證據" + }, + "remediationStatus": { + "ready_for_replay": "可重跑", + "ready_for_reverify": "可重驗", + "needs_target_mapping": "待補目標", + "needs_playbook_ticket": "待建 Ticket", + "manual_review": "人工檢查", + "unknown": "待分類" + }, + "remediationAction": { + "replay_with_supported_executor": "用支援 executor 重跑", + "reverify_with_promql_template": "用 PromQL 模板重驗", + "map_target_and_reverify": "補目標後重驗", + "create_playbook_ticket": "建立 PlayBook Ticket", + "escalate_verification_failure": "升級驗證失敗", + "inspect_degraded_evidence": "檢查降級證據" } } }, @@ -1718,6 +1736,9 @@ "autoRepair": { "title": "低風險 Alertmanager 自動修復閉環" }, + "remediationQueue": { + "title": "非成功驗證補救工作佇列" + }, "telegramCallbacks": { "title": "Telegram 詳情 / 歷史改為 DB truth-first" }, @@ -1737,6 +1758,7 @@ "gates": { "sourceDossier": 
"入站告警必須能查到 received / incident_linked / source refs", "autoRepair": "必須同時有 auto_repair、verification_result=success 與 KM 回寫", + "remediationQueue": "每筆 degraded / failed / timeout 都必須映射到重跑、重驗、Ticket 或人工檢查", "telegramCallbacks": "按下詳情與歷史不能再只依賴 Redis TTL 或舊快照", "governanceDispatch": "治理告警必須進 dispatch,並標示 skipped / pending / repaired", "frontendConsole": "已完成與推進中的工作必須能從前端直接追蹤", @@ -1746,6 +1768,7 @@ "evidence": { "channelEvents": "最近 Alertmanager channel events:{count}", "autoRepair": "已驗證自動修復:{verified}/{evaluated}", + "remediationQueue": "補救工作:{total};AI 可接手:{ready};人工:{human}", "telegramCallbacks": "目前修補 Telegram callback 查詢鏈與歷史摘要", "governance": "未解治理告警:{unresolved};pending dispatch:{queued}", "governanceUnavailable": "治理事件 API 目前無法回應;pending dispatch:{queued}", diff --git a/apps/web/src/app/[locale]/awooop/work-items/page.tsx b/apps/web/src/app/[locale]/awooop/work-items/page.tsx index 36ef5d21..f9fbffef 100644 --- a/apps/web/src/app/[locale]/awooop/work-items/page.tsx +++ b/apps/web/src/app/[locale]/awooop/work-items/page.tsx @@ -46,11 +46,24 @@ type RecentEventsResponse = { events?: Array<{ provider_event_id: string; is_duplicate: boolean }>; }; +type SloResponse = { + adr100?: { + verification_coverage?: { + remediation_queue?: { + total: number; + ready_for_ai: number; + needs_human: number; + }; + }; + }; +}; + type Telemetry = { quality: AutomationQualitySummary | null; governanceEvents: GovernanceEventsResponse | null; governanceQueue: GovernanceQueueResponse | null; channelEvents: RecentEventsResponse | null; + slo: SloResponse | null; }; type WorkItem = { @@ -108,6 +121,10 @@ function buildWorkItems( const recentChannelEvents = telemetry.channelEvents?.total ?? 0; const governanceUnresolved = telemetry.governanceEvents?.total ?? 0; const governanceQueuePending = telemetry.governanceQueue?.total ?? 0; + const remediationQueue = telemetry.slo?.adr100?.verification_coverage?.remediation_queue; + const remediationTotal = remediationQueue?.total ?? 
0; + const remediationReadyForAi = remediationQueue?.ready_for_ai ?? 0; + const remediationNeedsHuman = remediationQueue?.needs_human ?? 0; const governanceEventsUnavailable = telemetry.governanceEvents === null; const governanceQueueMissing = telemetry.governanceQueue?.table_pending === true; const governanceDispatchBlocked = @@ -141,6 +158,24 @@ function buildWorkItems( }), href: "/awooop/runs", }, + { + id: "remediationQueue", + phase: "T24", + status: remediationTotal === 0 + ? "watching" + : remediationReadyForAi > 0 + ? "in_progress" + : "blocked", + surfaceKey: "governance", + source: "/api/v1/ai/slo remediation_queue", + gateKey: "remediationQueue", + evidence: t("evidence.remediationQueue", { + total: remediationTotal, + ready: remediationReadyForAi, + human: remediationNeedsHuman, + }), + href: "/governance", + }, { id: "telegramCallbacks", phase: "T17", @@ -252,6 +287,7 @@ export default function AwoooPWorkItemsPage() { governanceEvents: null, governanceQueue: null, channelEvents: null, + slo: null, }); const [loading, setLoading] = useState(true); const [lastUpdated, setLastUpdated] = useState(null); @@ -262,15 +298,17 @@ export default function AwoooPWorkItemsPage() { const governanceEventsUrl = `${API_BASE}/api/v1/ai/governance/events?event_type=knowledge_degradation&event_type=governance_slo_data_gap&status=unresolved&size=10`; const governanceQueueUrl = `${API_BASE}/api/v1/ai/governance/queue?dispatch_status=pending&size=10`; const channelEventsUrl = `${API_BASE}/api/v1/platform/events/recent?project_id=awoooi&provider_prefix=alertmanager&limit=20`; + const sloUrl = `${API_BASE}/api/v1/ai/slo`; - const [quality, governanceEvents, governanceQueue, channelEvents] = await Promise.all([ + const [quality, governanceEvents, governanceQueue, channelEvents, slo] = await Promise.all([ fetchJson(qualityUrl), fetchJson(governanceEventsUrl), fetchJson(governanceQueueUrl), fetchJson(channelEventsUrl), + fetchJson(sloUrl), ]); - setTelemetry({ quality, 
governanceEvents, governanceQueue, channelEvents }); + setTelemetry({ quality, governanceEvents, governanceQueue, channelEvents, slo }); setLastUpdated(new Date()); setLoading(false); }, []); diff --git a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx index 8685cbe4..1bebac75 100644 --- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx +++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx @@ -99,6 +99,10 @@ interface Adr100VerificationCoverage { verification_result: string failure_class: string next_step: string + remediation_status?: string | null + remediation_action?: string | null + remediation_owner?: string | null + remediation_reason?: string | null auto_error_excerpt?: string | null evidence_excerpt?: string | null auto_created_at?: string | null @@ -107,6 +111,30 @@ interface Adr100VerificationCoverage { non_success_breakdown?: { by_verification_result?: Array<{ name: string; count: number }> by_failure_class?: Array<{ name: string; count: number }> + by_remediation_status?: Array<{ name: string; count: number }> + } + remediation_queue?: { + total: number + ready_for_ai: number + needs_human: number + items?: Array<{ + work_item_id: string + incident_id?: string | null + auto_repair_id?: string | null + alertname?: string | null + playbook_id?: string | null + failure_class?: string | null + verification_result?: string | null + remediation_status?: string | null + remediation_action?: string | null + remediation_owner?: string | null + remediation_reason?: string | null + source?: string | null + auto_created_at?: string | null + verification_collected_at?: string | null + }> + by_status?: Array<{ name: string; count: number }> + by_action?: Array<{ name: string; count: number }> } } @@ -168,6 +196,25 @@ function nextStepKey(value?: string | null): string { return 'review_degraded_verification' } +function remediationStatusKey(value?: string | null): string { + if (value === 
/**
 * Narrow a backend remediation status to a value that has a
 * `remediationStatus.*` translation; anything else (including null or
 * undefined) resolves to 'unknown'.
 */
function remediationStatusKey(value?: string | null): string {
  switch (value) {
    case 'ready_for_replay':
    case 'ready_for_reverify':
    case 'needs_target_mapping':
    case 'needs_playbook_ticket':
    case 'manual_review':
      return value
    default:
      return 'unknown'
  }
}

/**
 * Narrow a backend remediation action to a value that has a
 * `remediationAction.*` translation; anything else (including null or
 * undefined) resolves to 'inspect_degraded_evidence'.
 */
function remediationActionKey(value?: string | null): string {
  switch (value) {
    case 'replay_with_supported_executor':
    case 'reverify_with_promql_template':
    case 'map_target_and_reverify':
    case 'create_playbook_ticket':
    case 'escalate_verification_failure':
    case 'inspect_degraded_evidence':
      return value
    default:
      return 'inspect_degraded_evidence'
  }
}
+
+
+ {t('remediationQueue')} +
+
+ {t('queueSummary', { + total: remediationQueue.total, + ready: remediationQueue.ready_for_ai, + human: remediationQueue.needs_human, + })} +
+
+
+ {(remediationQueue.items ?? []).slice(0, 4).map(item => ( +
+
+
+ {item.incident_id ?? '--'} +
+
+ {t(`remediationStatus.${remediationStatusKey(item.remediation_status)}`)} +
+
+
+
+ {t(`remediationAction.${remediationActionKey(item.remediation_action)}`)} +
+
+ {compactLabel(item.alertname)} · {compactLabel(item.playbook_id)} +
+
+
+
+ {item.remediation_owner ?? '--'} +
+
+ {compactLabel(item.remediation_reason)} +
+
+
+ ))} +
+
+ )} + {recentFindings.length > 0 && (
@@ -478,11 +581,12 @@ export function SloTab() { {/* Responsive helpers */}