feat(governance): surface verification remediation queue
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m0s
CD Pipeline / build-and-deploy (push) Successful in 3m25s
CD Pipeline / post-deploy-checks (push) Successful in 1m16s

This commit is contained in:
Your Name
2026-05-14 21:45:33 +08:00
parent f97127f704
commit aa63ae5eca
6 changed files with 326 additions and 3 deletions

View File

@@ -221,6 +221,7 @@ class Adr100SloStatusService:
"by_verification_result": [],
"by_failure_class": [],
},
"remediation_queue": _remediation_queue_payload([]),
}
return _build_verification_coverage_payload(
@@ -474,6 +475,7 @@ def _build_verification_coverage_payload(
_non_success_finding_payload(dict(raw))
for raw in recent_non_success_rows
]
remediation_queue = _remediation_queue_payload(recent_non_success)
return {
"schema_version": "adr100_verification_coverage_v1",
@@ -512,12 +514,17 @@ def _build_verification_coverage_payload(
"by_failure_class": _count_breakdown(
item["failure_class"] for item in recent_non_success
),
"by_remediation_status": _count_breakdown(
item["remediation_status"] for item in remediation_queue["items"]
),
},
"remediation_queue": remediation_queue,
}
def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]:
failure_class = _classify_non_success_failure(row)
remediation = _remediation_for_failure_class(failure_class)
return {
"auto_repair_id": str(row.get("auto_repair_id")),
"incident_id": str(row.get("incident_id")),
@@ -533,6 +540,10 @@ def _non_success_finding_payload(row: dict[str, Any]) -> dict[str, Any]:
"verification_result": str(row.get("verification_result") or "unknown"),
"failure_class": failure_class,
"next_step": _next_step_for_failure_class(failure_class),
"remediation_status": remediation["status"],
"remediation_action": remediation["action"],
"remediation_owner": remediation["owner"],
"remediation_reason": remediation["reason"],
"auto_error_excerpt": _short_text(row.get("auto_error"), 180),
"evidence_excerpt": _short_text(row.get("evidence_summary"), 180),
"auto_created_at": _iso(row.get("auto_created_at")),
@@ -560,6 +571,55 @@ def _classify_non_success_failure(row: dict[str, Any]) -> str:
return "verification_degraded"
def _remediation_for_failure_class(failure_class: str) -> dict[str, str]:
"""Map a non-success verification class to a read-only remediation work item.
This is dashboard triage metadata only. It does not auto-close incidents,
replay repairs, or approve write actions.
"""
if failure_class == "unsupported_action_scheme":
return {
"status": "ready_for_replay",
"action": "replay_with_supported_executor",
"owner": "auto_repair_executor",
"reason": "executor_gateway_available_after_t23",
}
if failure_class == "verifier_missing_promql":
return {
"status": "ready_for_reverify",
"action": "reverify_with_promql_template",
"owner": "post_execution_verifier",
"reason": "promql_template_available_after_t23",
}
if failure_class == "verifier_target_missing_pod":
return {
"status": "needs_target_mapping",
"action": "map_target_and_reverify",
"owner": "post_execution_verifier",
"reason": "verifier_target_missing",
}
if failure_class == "auto_repair_execution_failed":
return {
"status": "needs_playbook_ticket",
"action": "create_playbook_ticket",
"owner": "solver_or_operator",
"reason": "execution_failed_after_route_normalization",
}
if failure_class in {"verification_failed", "verification_timeout"}:
return {
"status": "manual_review",
"action": "escalate_verification_failure",
"owner": "sre_operator",
"reason": "verifier_returned_hard_failure",
}
return {
"status": "manual_review",
"action": "inspect_degraded_evidence",
"owner": "sre_operator",
"reason": "degraded_evidence_requires_human_context",
}
def _next_step_for_failure_class(failure_class: str) -> str:
if failure_class == "unsupported_action_scheme":
return "normalize_playbook_executor"
@@ -574,6 +634,57 @@ def _next_step_for_failure_class(failure_class: str) -> str:
return "review_degraded_verification"
def _remediation_queue_payload(recent_non_success: list[dict[str, Any]]) -> dict[str, Any]:
    """Project recent non-success findings into the remediation queue payload.

    Every finding becomes one work item whose id is
    ``verification:<incident_id>:<auto_repair_id>``. Fields missing from a
    finding are carried over as ``None``.

    The returned dict holds the work items, their total, and two counters:
    ``ready_for_ai`` (replay / reverify statuses) and ``needs_human`` (target
    mapping, playbook ticket, manual review). A status outside both groups is
    counted only in ``total``. ``by_status`` and ``by_action`` are produced by
    ``_count_breakdown``.
    """
    ai_ready_statuses = {"ready_for_replay", "ready_for_reverify"}
    human_statuses = {
        "needs_target_mapping",
        "needs_playbook_ticket",
        "manual_review",
    }
    # Copied verbatim from the finding, in output order; "source" sits between
    # the two groups in the emitted work item.
    leading_fields = (
        "incident_id",
        "auto_repair_id",
        "alertname",
        "playbook_id",
        "failure_class",
        "verification_result",
        "remediation_status",
        "remediation_action",
        "remediation_owner",
        "remediation_reason",
    )
    trailing_fields = ("auto_created_at", "verification_collected_at")

    def to_work_item(finding: dict[str, Any]) -> dict[str, Any]:
        incident_id = finding.get("incident_id")
        auto_repair_id = finding.get("auto_repair_id")
        return {
            "work_item_id": f"verification:{incident_id}:{auto_repair_id}",
            **{name: finding.get(name) for name in leading_fields},
            "source": "adr100_verification_coverage",
            **{name: finding.get(name) for name in trailing_fields},
        }

    queue = [to_work_item(finding) for finding in recent_non_success]
    statuses = [entry.get("remediation_status") for entry in queue]
    ai_ready_count = len([s for s in statuses if s in ai_ready_statuses])
    human_count = len([s for s in statuses if s in human_statuses])

    return {
        "schema_version": "adr100_remediation_queue_v1",
        "source": "recent_non_success_read_model",
        "total": len(queue),
        "ready_for_ai": ai_ready_count,
        "needs_human": human_count,
        "items": queue,
        "by_status": _count_breakdown(
            entry.get("remediation_status") for entry in queue
        ),
        "by_action": _count_breakdown(
            entry.get("remediation_action") for entry in queue
        ),
    }
def _count_breakdown(values: Any) -> list[dict[str, Any]]:
counts: dict[str, int] = {}
for value in values:

View File

@@ -69,6 +69,16 @@ async def _low_volume_coverage(self): # noqa: ANN001
"by_verification_result": [],
"by_failure_class": [],
},
"remediation_queue": {
"schema_version": "adr100_remediation_queue_v1",
"source": "recent_non_success_read_model",
"total": 0,
"ready_for_ai": 0,
"needs_human": 0,
"items": [],
"by_status": [],
"by_action": [],
},
}
@@ -185,12 +195,25 @@ def test_verification_coverage_payload_flags_backlog():
assert payload["recent_unverified"][0]["incident_id"] == "INC-1"
assert payload["recent_non_success"][0]["failure_class"] == "unsupported_action_scheme"
assert payload["recent_non_success"][0]["next_step"] == "normalize_playbook_executor"
assert payload["recent_non_success"][0]["remediation_status"] == "ready_for_replay"
assert payload["recent_non_success"][0]["remediation_action"] == (
"replay_with_supported_executor"
)
assert payload["non_success_breakdown"]["by_failure_class"] == [
{"name": "unsupported_action_scheme", "count": 1},
]
assert payload["non_success_breakdown"]["by_verification_result"] == [
{"name": "degraded", "count": 1},
]
assert payload["non_success_breakdown"]["by_remediation_status"] == [
{"name": "ready_for_replay", "count": 1},
]
assert payload["remediation_queue"]["total"] == 1
assert payload["remediation_queue"]["ready_for_ai"] == 1
assert payload["remediation_queue"]["needs_human"] == 0
assert payload["remediation_queue"]["items"][0]["work_item_id"] == (
"verification:INC-2:are-2"
)
def test_verification_coverage_payload_skips_when_no_auto_repair():
@@ -210,3 +233,4 @@ def test_verification_coverage_payload_skips_when_no_auto_repair():
assert payload["status"] == "skipped_low_volume"
assert payload["reason"] == "no_auto_repair_executions_24h"
assert payload["evaluable"] is False
assert payload["remediation_queue"]["total"] == 0

View File

@@ -1412,6 +1412,8 @@
"reasonLabel": "Reason",
"failureBreakdown": "Non-success Verification Classes",
"recentFindings": "Recent Non-success Verification",
"remediationQueue": "Remediation Work Queue",
"queueSummary": "Total {total}; AI-ready {ready}; human {human}",
"state": {
"ok": "OK",
"warning": "Needs tracking",
@@ -1444,6 +1446,22 @@
"review_auto_repair_execution": "Inspect auto repair record",
"escalate_verification_failure": "Escalate verification failure",
"review_degraded_verification": "Review degraded evidence"
},
"remediationStatus": {
"ready_for_replay": "Ready for replay",
"ready_for_reverify": "Ready to reverify",
"needs_target_mapping": "Needs target mapping",
"needs_playbook_ticket": "Needs ticket",
"manual_review": "Manual review",
"unknown": "Pending classification"
},
"remediationAction": {
"replay_with_supported_executor": "Replay with supported executor",
"reverify_with_promql_template": "Reverify with PromQL template",
"map_target_and_reverify": "Map target and reverify",
"create_playbook_ticket": "Create PlayBook ticket",
"escalate_verification_failure": "Escalate verification failure",
"inspect_degraded_evidence": "Inspect degraded evidence"
}
}
},
@@ -1717,6 +1735,9 @@
"autoRepair": {
"title": "Low-risk Alertmanager auto-repair loop"
},
"remediationQueue": {
"title": "Non-success verification remediation queue"
},
"telegramCallbacks": {
"title": "Telegram detail / history as DB truth-first"
},
@@ -1736,6 +1757,7 @@
"gates": {
"sourceDossier": "Inbound alerts must show received / incident_linked / source refs",
"autoRepair": "Requires auto_repair, verification_result=success, and KM writeback",
"remediationQueue": "Every degraded / failed / timeout row must map to replay, reverify, ticket, or manual review",
"telegramCallbacks": "Detail and history buttons cannot depend only on Redis TTL or stale snapshots",
"governanceDispatch": "Governance alerts must enter dispatch and expose skipped / pending / repaired",
"frontendConsole": "Completed and in-progress work must be trackable from the frontend",
@@ -1745,6 +1767,7 @@
"evidence": {
"channelEvents": "Recent Alertmanager channel events: {count}",
"autoRepair": "Verified auto-repairs: {verified}/{evaluated}",
"remediationQueue": "Remediation work: {total}; AI-ready: {ready}; human: {human}",
"telegramCallbacks": "Telegram callback lookup and history summary are being repaired",
"governance": "Unresolved governance alerts: {unresolved}; pending dispatch: {queued}",
"governanceUnavailable": "Governance events API is not responding; pending dispatch: {queued}",

View File

@@ -1413,6 +1413,8 @@
"reasonLabel": "原因",
"failureBreakdown": "非成功驗證分類",
"recentFindings": "近期非成功驗證",
"remediationQueue": "補救工作佇列",
"queueSummary": "總數 {total};AI 可接手 {ready};人工 {human}",
"state": {
"ok": "正常",
"warning": "需追蹤",
@@ -1445,6 +1447,22 @@
"review_auto_repair_execution": "檢查自動修復紀錄",
"escalate_verification_failure": "升級驗證失敗",
"review_degraded_verification": "檢查降級證據"
},
"remediationStatus": {
"ready_for_replay": "可重跑",
"ready_for_reverify": "可重驗",
"needs_target_mapping": "待補目標",
"needs_playbook_ticket": "待建 Ticket",
"manual_review": "人工檢查",
"unknown": "待分類"
},
"remediationAction": {
"replay_with_supported_executor": "用支援 executor 重跑",
"reverify_with_promql_template": "用 PromQL 模板重驗",
"map_target_and_reverify": "補目標後重驗",
"create_playbook_ticket": "建立 PlayBook Ticket",
"escalate_verification_failure": "升級驗證失敗",
"inspect_degraded_evidence": "檢查降級證據"
}
}
},
@@ -1718,6 +1736,9 @@
"autoRepair": {
"title": "低風險 Alertmanager 自動修復閉環"
},
"remediationQueue": {
"title": "非成功驗證補救工作佇列"
},
"telegramCallbacks": {
"title": "Telegram 詳情 / 歷史改為 DB truth-first"
},
@@ -1737,6 +1758,7 @@
"gates": {
"sourceDossier": "入站告警必須能查到 received / incident_linked / source refs",
"autoRepair": "必須同時有 auto_repair、verification_result=success 與 KM 回寫",
"remediationQueue": "每筆 degraded / failed / timeout 都必須映射到重跑、重驗、Ticket 或人工檢查",
"telegramCallbacks": "按下詳情與歷史不能再只依賴 Redis TTL 或舊快照",
"governanceDispatch": "治理告警必須進 dispatch,並標示 skipped / pending / repaired",
"frontendConsole": "已完成與推進中的工作必須能從前端直接追蹤",
@@ -1746,6 +1768,7 @@
"evidence": {
"channelEvents": "最近 Alertmanager channel events:{count}",
"autoRepair": "已驗證自動修復:{verified}/{evaluated}",
"remediationQueue": "補救工作:{total};AI 可接手:{ready};人工:{human}",
"telegramCallbacks": "目前修補 Telegram callback 查詢鏈與歷史摘要",
"governance": "未解治理告警:{unresolved};pending dispatch:{queued}",
"governanceUnavailable": "治理事件 API 目前無法回應;pending dispatch:{queued}",

View File

@@ -46,11 +46,24 @@ type RecentEventsResponse = {
events?: Array<{ provider_event_id: string; is_duplicate: boolean }>;
};
// Partial shape of the SLO response: only the remediation queue counters that
// buildWorkItems reads. Every nesting level is optional, and buildWorkItems
// substitutes 0 for each counter when the queue is absent.
// NOTE(review): assumes /api/v1/ai/slo nests the queue under
// adr100.verification_coverage — confirm against the backend payload.
type SloResponse = {
adr100?: {
verification_coverage?: {
remediation_queue?: {
// Number of work items in the queue.
total: number;
// Work items counted as AI-ready (shown as `ready` in the evidence text).
ready_for_ai: number;
// Work items counted as needing a human (shown as `human` in the evidence text).
needs_human: number;
};
};
};
};
// Responses the work-items page loads in one Promise.all batch and passes to
// buildWorkItems. The page's initial state sets the visible fields to null, and
// buildWorkItems treats `governanceEvents === null` as "events API unavailable".
// NOTE(review): presumably fetchJson resolves to null on a failed request — confirm.
type Telemetry = {
// Automation quality summary (fetched from qualityUrl).
quality: AutomationQualitySummary | null;
// Unresolved governance events (/api/v1/ai/governance/events).
governanceEvents: GovernanceEventsResponse | null;
// Governance queue entries with dispatch_status=pending (/api/v1/ai/governance/queue).
governanceQueue: GovernanceQueueResponse | null;
// Recent platform events filtered by the alertmanager provider prefix.
channelEvents: RecentEventsResponse | null;
// SLO response (/api/v1/ai/slo); source of the remediation queue counters.
slo: SloResponse | null;
};
type WorkItem = {
@@ -108,6 +121,10 @@ function buildWorkItems(
const recentChannelEvents = telemetry.channelEvents?.total ?? 0;
const governanceUnresolved = telemetry.governanceEvents?.total ?? 0;
const governanceQueuePending = telemetry.governanceQueue?.total ?? 0;
const remediationQueue = telemetry.slo?.adr100?.verification_coverage?.remediation_queue;
const remediationTotal = remediationQueue?.total ?? 0;
const remediationReadyForAi = remediationQueue?.ready_for_ai ?? 0;
const remediationNeedsHuman = remediationQueue?.needs_human ?? 0;
const governanceEventsUnavailable = telemetry.governanceEvents === null;
const governanceQueueMissing = telemetry.governanceQueue?.table_pending === true;
const governanceDispatchBlocked =
@@ -141,6 +158,24 @@ function buildWorkItems(
}),
href: "/awooop/runs",
},
{
id: "remediationQueue",
phase: "T24",
status: remediationTotal === 0
? "watching"
: remediationReadyForAi > 0
? "in_progress"
: "blocked",
surfaceKey: "governance",
source: "/api/v1/ai/slo remediation_queue",
gateKey: "remediationQueue",
evidence: t("evidence.remediationQueue", {
total: remediationTotal,
ready: remediationReadyForAi,
human: remediationNeedsHuman,
}),
href: "/governance",
},
{
id: "telegramCallbacks",
phase: "T17",
@@ -252,6 +287,7 @@ export default function AwoooPWorkItemsPage() {
governanceEvents: null,
governanceQueue: null,
channelEvents: null,
slo: null,
});
const [loading, setLoading] = useState(true);
const [lastUpdated, setLastUpdated] = useState<Date | null>(null);
@@ -262,15 +298,17 @@ export default function AwoooPWorkItemsPage() {
const governanceEventsUrl = `${API_BASE}/api/v1/ai/governance/events?event_type=knowledge_degradation&event_type=governance_slo_data_gap&status=unresolved&size=10`;
const governanceQueueUrl = `${API_BASE}/api/v1/ai/governance/queue?dispatch_status=pending&size=10`;
const channelEventsUrl = `${API_BASE}/api/v1/platform/events/recent?project_id=awoooi&provider_prefix=alertmanager&limit=20`;
const sloUrl = `${API_BASE}/api/v1/ai/slo`;
const [quality, governanceEvents, governanceQueue, channelEvents] = await Promise.all([
const [quality, governanceEvents, governanceQueue, channelEvents, slo] = await Promise.all([
fetchJson<AutomationQualitySummary>(qualityUrl),
fetchJson<GovernanceEventsResponse>(governanceEventsUrl),
fetchJson<GovernanceQueueResponse>(governanceQueueUrl),
fetchJson<RecentEventsResponse>(channelEventsUrl),
fetchJson<SloResponse>(sloUrl),
]);
setTelemetry({ quality, governanceEvents, governanceQueue, channelEvents });
setTelemetry({ quality, governanceEvents, governanceQueue, channelEvents, slo });
setLastUpdated(new Date());
setLoading(false);
}, []);

View File

@@ -99,6 +99,10 @@ interface Adr100VerificationCoverage {
verification_result: string
failure_class: string
next_step: string
remediation_status?: string | null
remediation_action?: string | null
remediation_owner?: string | null
remediation_reason?: string | null
auto_error_excerpt?: string | null
evidence_excerpt?: string | null
auto_created_at?: string | null
@@ -107,6 +111,30 @@ interface Adr100VerificationCoverage {
non_success_breakdown?: {
by_verification_result?: Array<{ name: string; count: number }>
by_failure_class?: Array<{ name: string; count: number }>
by_remediation_status?: Array<{ name: string; count: number }>
}
remediation_queue?: {
total: number
ready_for_ai: number
needs_human: number
items?: Array<{
work_item_id: string
incident_id?: string | null
auto_repair_id?: string | null
alertname?: string | null
playbook_id?: string | null
failure_class?: string | null
verification_result?: string | null
remediation_status?: string | null
remediation_action?: string | null
remediation_owner?: string | null
remediation_reason?: string | null
source?: string | null
auto_created_at?: string | null
verification_collected_at?: string | null
}>
by_status?: Array<{ name: string; count: number }>
by_action?: Array<{ name: string; count: number }>
}
}
@@ -168,6 +196,25 @@ function nextStepKey(value?: string | null): string {
return 'review_degraded_verification'
}
/**
 * Maps a remediation status from the API onto a `remediationStatus.*`
 * translation key.
 *
 * The five known statuses are returned unchanged; anything else, including
 * `null` and `undefined`, resolves to `'unknown'`.
 */
function remediationStatusKey(value?: string | null): string {
  switch (value) {
    case 'ready_for_replay':
    case 'ready_for_reverify':
    case 'needs_target_mapping':
    case 'needs_playbook_ticket':
    case 'manual_review':
      return value
    default:
      return 'unknown'
  }
}
/**
 * Maps a remediation action from the API onto a `remediationAction.*`
 * translation key.
 *
 * The six known actions are returned unchanged; anything else, including
 * `null` and `undefined`, resolves to `'inspect_degraded_evidence'`.
 */
function remediationActionKey(value?: string | null): string {
  switch (value) {
    case 'replay_with_supported_executor':
    case 'reverify_with_promql_template':
    case 'map_target_and_reverify':
    case 'create_playbook_ticket':
    case 'escalate_verification_failure':
    case 'inspect_degraded_evidence':
      return value
    default:
      return 'inspect_degraded_evidence'
  }
}
function compactLabel(value?: string | null, fallback = '--'): string {
if (!value) return fallback
return value.length > 54 ? `${value.slice(0, 54)}...` : value
@@ -238,6 +285,7 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
]
const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? []
const recentFindings = coverage?.recent_non_success ?? []
const remediationQueue = coverage?.remediation_queue
return (
<GlassCard variant="subtle" padding="md">
@@ -313,6 +361,61 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
</div>
)}
{remediationQueue && remediationQueue.total > 0 && (
<div style={{ display: 'flex', flexDirection: 'column', gap: 8 }}>
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 10 }}>
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 11, fontWeight: 700, color: '#141413' }}>
{t('remediationQueue')}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f' }}>
{t('queueSummary', {
total: remediationQueue.total,
ready: remediationQueue.ready_for_ai,
human: remediationQueue.needs_human,
})}
</div>
</div>
<div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
{(remediationQueue.items ?? []).slice(0, 4).map(item => (
<div key={item.work_item_id} style={{
display: 'grid',
gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(160px, 1fr)',
gap: 10,
alignItems: 'center',
minWidth: 0,
padding: '7px 0',
borderTop: '0.5px solid rgba(20,20,19,0.08)',
}} className="slo-remediation-row">
<div style={{ minWidth: 0 }}>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413' }}>
{item.incident_id ?? '--'}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2 }}>
{t(`remediationStatus.${remediationStatusKey(item.remediation_status)}`)}
</div>
</div>
<div style={{ minWidth: 0 }}>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#7c5a10' }}>
{t(`remediationAction.${remediationActionKey(item.remediation_action)}`)}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2, overflowWrap: 'anywhere' }}>
{compactLabel(item.alertname)} · {compactLabel(item.playbook_id)}
</div>
</div>
<div style={{ minWidth: 0 }}>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413' }}>
{item.remediation_owner ?? '--'}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#87867f', marginTop: 2, overflowWrap: 'anywhere' }}>
{compactLabel(item.remediation_reason)}
</div>
</div>
</div>
))}
</div>
</div>
)}
{recentFindings.length > 0 && (
<div style={{ display: 'flex', flexDirection: 'column', gap: 8 }}>
<div style={{ fontFamily: 'Syne, sans-serif', fontSize: 11, fontWeight: 700, color: '#141413' }}>
@@ -478,11 +581,12 @@ export function SloTab() {
{/* Responsive helpers */}
<style>{`
.slo-kpi-grid > * { flex: 1; min-width: 200px; }
.slo-kpi-grid > * { flex: 1; min-width: 200px; }
@media (max-width: 640px) {
.slo-kpi-grid > * { flex: 0 0 100%; min-width: 0; }
.slo-coverage-grid { grid-template-columns: repeat(2, minmax(0, 1fr)) !important; }
.slo-finding-row { grid-template-columns: 1fr !important; }
.slo-remediation-row { grid-template-columns: 1fr !important; }
}
`}</style>
</div>