feat(adr100): surface playbook ticket remediation
Some checks failed
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / build-and-deploy (push) Failing after 9m11s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-06-01 20:01:10 +08:00
parent 40e65730c1
commit a7b807dbfa
5 changed files with 437 additions and 4 deletions

View File

@@ -31,9 +31,11 @@ from src.services.post_execution_verifier import (
logger = structlog.get_logger(__name__)
RemediationMode = Literal["auto", "reverify", "replay"]
RemediationMode = Literal["auto", "reverify", "replay", "ticket"]
_READY_STATUSES = {"ready_for_replay", "ready_for_reverify"}
_TICKET_STATUSES = {"needs_playbook_ticket"}
_TICKET_ACTIONS = {"create_playbook_ticket", "promote_diagnostic_to_repair_playbook"}
class RemediationNotFoundError(LookupError):
@@ -108,6 +110,8 @@ class Adr100RemediationService:
payload["history"] = await self._record_dry_run_history(item, payload)
return payload
if selected_mode == "ticket":
return await self._dry_run_ticket_proposal(item, incident, checks)
if selected_mode == "replay":
return await self._dry_run_replay(item, incident, checks)
return await self._dry_run_reverify(item, incident, checks)
@@ -255,6 +259,35 @@ class Adr100RemediationService:
payload["history"] = await self._record_dry_run_history(item, payload)
return payload
async def _dry_run_ticket_proposal(
    self,
    item: dict[str, Any],
    incident: Incident,
    checks: list[dict[str, Any]],
) -> dict[str, Any]:
    """Build and record the dry-run payload for a PlayBook ticket proposal.

    A ticket preview is generated for ``item``, a passing
    ``external_ticket_not_created`` check is appended to ``checks`` (the
    caller's list is mutated in place), and the assembled payload is passed
    to ``_record_dry_run_history``. ``writes_ticket`` and
    ``creates_external_ticket`` are both handed over as ``False``.

    Returns:
        The dry-run payload with the history record stored under
        ``"history"``.
    """
    preview = _ticket_preview_for_item(item, incident)

    no_external_ticket_check = {
        "name": "external_ticket_not_created",
        "passed": True,
        "detail": "dry_run_records_internal_history_only",
    }
    checks.append(no_external_ticket_check)

    ticket_extra = {
        "ticket_preview": preview,
        "writes_ticket": False,
        "creates_external_ticket": False,
        "plan": _plan_for_item(item, "ticket"),
    }
    result = _dry_run_result_payload(
        item=item,
        mode="ticket",
        checks=checks,
        post_state={},
        verification_result_preview="ticket_proposal",
        extra=ticket_extra,
    )
    result["history"] = await self._record_dry_run_history(item, result)
    return result
async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
try:
return await asyncio.wait_for(
@@ -351,9 +384,15 @@ class Adr100RemediationService:
return history
def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]:
def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay", "ticket"]:
if requested in ("reverify", "replay"):
return requested
if requested == "ticket":
return "ticket"
if item.get("remediation_status") in _TICKET_STATUSES:
return "ticket"
if item.get("remediation_action") in _TICKET_ACTIONS:
return "ticket"
if item.get("remediation_status") == "ready_for_reverify":
return "reverify"
if item.get("remediation_action") == "reverify_with_promql_template":
@@ -367,14 +406,15 @@ def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]:
return [
{
"name": "queue_item_ready",
"passed": status in _READY_STATUSES,
"passed": status in _READY_STATUSES or status in _TICKET_STATUSES,
"detail": status,
},
{
"name": "read_only_guardrail",
"name": "read_or_record_only_guardrail",
"passed": action in {
"replay_with_supported_executor",
"reverify_with_promql_template",
*_TICKET_ACTIONS,
},
"detail": action,
},
@@ -394,6 +434,14 @@ def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]:
"required_scope": "read",
"writes": [],
}
if mode == "ticket":
return {
"step": "create_playbook_authoring_ticket_proposal",
"agent_id": "openclaw_playbook_planner",
"required_scope": "record_only",
"writes": ["alert_operation_log", "timeline"],
"target_action": item.get("remediation_action"),
}
return {
"step": "validate_supported_executor_route_then_collect_current_state",
"agent_id": "auto_repair_executor",
@@ -419,6 +467,8 @@ def _dry_run_blocked_payload(
"safety_level": "read_only",
"writes_incident_state": False,
"writes_auto_repair_result": False,
"writes_ticket": False,
"creates_external_ticket": False,
"checks": checks,
"verification_result_preview": "blocked",
"post_state_summary": {},
@@ -445,6 +495,8 @@ def _dry_run_result_payload(
"safety_level": "read_only",
"writes_incident_state": False,
"writes_auto_repair_result": False,
"writes_ticket": extra.get("writes_ticket", False),
"creates_external_ticket": extra.get("creates_external_ticket", False),
"checks": checks,
"verification_result_preview": verification_result_preview,
"post_state_summary": _summarize_post_state(post_state),
@@ -474,6 +526,10 @@ def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str,
"safety_level": payload.get("safety_level"),
"writes_incident_state": payload.get("writes_incident_state"),
"writes_auto_repair_result": payload.get("writes_auto_repair_result"),
"writes_ticket": payload.get("writes_ticket"),
"creates_external_ticket": payload.get("creates_external_ticket"),
"ticket_preview": payload.get("ticket_preview"),
"plan": payload.get("plan"),
"verification_result_preview": payload.get("verification_result_preview"),
"post_state_summary": payload.get("post_state_summary"),
"mcp_route": payload.get("mcp_route"),
@@ -537,6 +593,10 @@ def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
"required_scope": route.get("required_scope"),
"writes_incident_state": context.get("writes_incident_state"),
"writes_auto_repair_result": context.get("writes_auto_repair_result"),
"writes_ticket": context.get("writes_ticket"),
"creates_external_ticket": context.get("creates_external_ticket"),
"ticket_preview": context.get("ticket_preview"),
"plan": context.get("plan"),
"checks": context.get("checks") or [],
}
@@ -572,6 +632,44 @@ def _diagnostic_command_for_incident(incident: Incident) -> str:
return f"ssh {host} 'uptime; docker stats --no-stream'"
def _ticket_preview_for_item(item: dict[str, Any], incident: Incident) -> dict[str, Any]:
    """Describe the PlayBook authoring ticket that would be proposed for ``item``.

    Values are taken from the queue item first and fall back to the
    incident's labels (or to fixed placeholders) when missing. The returned
    dict is a preview only: ``would_create`` is ``True`` while
    ``external_ticket_created`` is ``False``, and the body is capped at
    1000 characters.
    """
    labels = _labels_for_incident(incident)

    alertname = str(item.get("alertname") or labels.get("alertname") or "unknown_alert")
    incident_id = str(item.get("incident_id") or incident.incident_id)
    playbook_id = str(item.get("playbook_id") or "unknown_playbook")
    host = str(labels.get("host") or labels.get("instance") or "unknown_host")
    container = str(labels.get("container_name") or labels.get("container") or "")

    # The container suffix is only present when a container label exists.
    target = f"host={host}"
    if container:
        target += f" container={container}"

    summary = (
        f"Incident: {incident_id}\n"
        f"Auto repair: {item.get('auto_repair_id') or 'unknown'}\n"
        f"PlayBook: {playbook_id}\n"
        f"Target: {target}\n"
        f"Failure class: {item.get('failure_class') or 'observe_only_playbook'}\n"
    )
    required_change = (
        "Required change: add a gated mutating repair step such as docker restart, "
        "Ansible check-mode/apply, or another approved executor action, then keep "
        "post-execution verification tied to the same target.\n"
    )
    guardrail = "Guardrail: do not mark the old diagnostic-only run as verified_success."
    body = summary + required_change + guardrail

    ticket_labels = [
        "adr100",
        "playbook-authoring",
        "observe-only-playbook",
        "needs-owner-review",
    ]
    return {
        "would_create": True,
        "external_ticket_created": False,
        "title": f"[ADR-100] Promote diagnostic PlayBook to repair: {alertname}",
        "labels": ticket_labels,
        "body_preview": body[:1000],
        "owner": item.get("remediation_owner") or "solver_or_operator",
        "next_step": "author_mutating_repair_step",
        "playbook_id": playbook_id,
        "target": target,
    }
def _promql_for_incident(incident: Incident) -> str:
labels = _labels_for_incident(incident)
alertname = ""

View File

@@ -193,6 +193,65 @@ async def test_preview_marks_replay_work_item_read_only():
assert result["plan"]["writes"] == []
@pytest.mark.asyncio
async def test_preview_marks_observe_only_work_item_as_ticket_proposal():
    """Previewing a needs_playbook_ticket item yields the record-only ticket plan."""
    observe_only_item = _queue_item(
        remediation_status="needs_playbook_ticket",
        remediation_action="promote_diagnostic_to_repair_playbook",
        remediation_owner="solver_or_operator",
        failure_class="observe_only_playbook",
    )
    service = _service(item=observe_only_item)

    preview = await service.preview("verification:INC-20260514-TEST01:are-1")

    assert preview["allowed"] is True
    assert preview["mode"] == "ticket"
    assert preview["writes_incident_state"] is False
    assert preview["writes_auto_repair_result"] is False

    plan = preview["plan"]
    assert plan["agent_id"] == "openclaw_playbook_planner"
    assert plan["required_scope"] == "record_only"
    assert plan["target_action"] == "promote_diagnostic_to_repair_playbook"
@pytest.mark.asyncio
async def test_dry_run_ticket_proposal_records_internal_history_only():
    """A ticket dry run records internal history and creates no external ticket."""
    operation_log = _FakeAlertOperationLogRepository()
    timeline_service = _FakeTimelineService()
    service = _service(
        item=_queue_item(
            remediation_status="needs_playbook_ticket",
            remediation_action="promote_diagnostic_to_repair_playbook",
            remediation_owner="solver_or_operator",
            failure_class="observe_only_playbook",
        ),
        timeline_service=timeline_service,
        alert_operation_log_repository=operation_log,
        record_history=True,
    )

    outcome = await service.dry_run("verification:INC-20260514-TEST01:are-1")

    # Top-level payload flags.
    assert outcome["allowed"] is True
    assert outcome["executed"] is True
    assert outcome["mode"] == "ticket"
    assert outcome["verification_result_preview"] == "ticket_proposal"
    assert outcome["writes_ticket"] is False
    assert outcome["creates_external_ticket"] is False

    # Ticket preview content.
    ticket = outcome["ticket_preview"]
    assert ticket["would_create"] is True
    assert ticket["external_ticket_created"] is False
    assert ticket["playbook_id"] == "PB-1"
    assert "momo-scheduler" in ticket["target"]

    # Recorded history: operation log entry and timeline entry.
    assert outcome["history"]["recorded"] is True
    first_log_call = operation_log.calls[0]
    assert first_log_call["event_type"] == "PRE_FLIGHT_PASSED"
    assert first_log_call["context"]["ticket_preview"]["next_step"] == (
        "author_mutating_repair_step"
    )
    assert first_log_call["context"]["creates_external_ticket"] is False
    assert timeline_service.calls[0]["actor_role"] == "ticket"
@pytest.mark.asyncio
async def test_dry_run_reverify_collects_state_without_writes():
item = _queue_item(

View File

@@ -2966,6 +2966,31 @@
"yes": "是",
"no": "否"
},
"adr100Remediation": {
"title": "ADR-100 補救工作佇列",
"subtitle": "補救 {total} 筆;AI 可接手 {ready};需人工 / PlayBook 改造 {human}",
"openGovernance": "開啟治理",
"empty": "目前沒有非成功驗證補救工作;若 SLO 再出現 degraded / failed,會在這裡形成可操作項。",
"unknownAlert": "未知告警",
"ticketFallback": "PlayBook 改造草稿",
"fields": {
"failure": "失敗類型:{value}",
"action": "處置:{value}",
"owner": "Owner:{value}",
"playbook": "PlayBook:{value}"
},
"actions": {
"preview": "預覽",
"dryRun": "預檢 / 草稿",
"loading": "處理中",
"failed": "補救工作操作失敗"
},
"result": {
"mode": "模式={value}",
"allowed": "允許={value}",
"writes": "寫入 incident={incident} / autoRepair={autoRepair}"
}
},
"callbackTraceRecoveryActions": {
"unavailable": "summary 未回傳,先確認 callback-replies API",
"closed": "已符合關閉條件,保留歷史證據即可",

View File

@@ -2966,6 +2966,31 @@
"yes": "是",
"no": "否"
},
"adr100Remediation": {
"title": "ADR-100 補救工作佇列",
"subtitle": "補救 {total} 筆;AI 可接手 {ready};需人工 / PlayBook 改造 {human}",
"openGovernance": "開啟治理",
"empty": "目前沒有非成功驗證補救工作;若 SLO 再出現 degraded / failed,會在這裡形成可操作項。",
"unknownAlert": "未知告警",
"ticketFallback": "PlayBook 改造草稿",
"fields": {
"failure": "失敗類型:{value}",
"action": "處置:{value}",
"owner": "Owner:{value}",
"playbook": "PlayBook:{value}"
},
"actions": {
"preview": "預覽",
"dryRun": "預檢 / 草稿",
"loading": "處理中",
"failed": "補救工作操作失敗"
},
"result": {
"mode": "模式={value}",
"allowed": "允許={value}",
"writes": "寫入 incident={incident} / autoRepair={autoRepair}"
}
},
"callbackTraceRecoveryActions": {
"unavailable": "summary 未回傳,先確認 callback-replies API",
"closed": "已符合關閉條件,保留歷史證據即可",

View File

@@ -263,11 +263,35 @@ type SloResponse = {
total: number;
ready_for_ai: number;
needs_human: number;
items?: RemediationQueueItem[];
by_status?: Array<{ name?: string | null; count?: number | null }>;
by_action?: Array<{ name?: string | null; count?: number | null }>;
};
};
};
};
// One entry of the ADR-100 remediation queue (`remediation_queue.items`).
// Every field is optional and may be null, so consumers must supply their own
// fallbacks when rendering or building requests from an item.
type RemediationQueueItem = {
// Identifier sent as `work_item_id` to the remediation preview / dry-run endpoints.
work_item_id?: string | null;
incident_id?: string | null;
auto_repair_id?: string | null;
alertname?: string | null;
playbook_id?: string | null;
failure_class?: string | null;
verification_result?: string | null;
// Compared against "needs_playbook_ticket" and the "ready" prefix by the queue panel.
remediation_status?: string | null;
remediation_action?: string | null;
remediation_owner?: string | null;
remediation_reason?: string | null;
source?: string | null;
// NOTE(review): presumably timestamp strings — format not visible here, confirm against the API.
auto_created_at?: string | null;
verification_collected_at?: string | null;
};
// Non-nullable shape of `SloResponse.adr100.verification_coverage.remediation_queue`,
// unwrapped level by level because each parent along the path may be absent.
type RemediationQueue = NonNullable<
NonNullable<NonNullable<SloResponse["adr100"]>["verification_coverage"]>["remediation_queue"]
>;
type RemediationHistoryItem = {
work_item_id?: string | null;
incident_id?: string | null;
@@ -2616,6 +2640,202 @@ function WorkItemIncidentAuditPanel({
);
}
/**
 * Panel listing ADR-100 remediation queue items with per-item "preview" and
 * "dry run" actions.
 *
 * Props:
 * - queue: remediation queue from the SLO response; null/undefined renders the
 *   empty state with zeroed counters.
 * - focusedWorkItemId: when it matches an item, that item is listed first.
 * - onRecorded: called after an action whose response reports
 *   `history.recorded` as truthy.
 *
 * Action state (loading / result / error) is kept per work item id, so items
 * without a `work_item_id` cannot run actions and have their buttons disabled.
 */
function Adr100RemediationQueuePanel({
queue,
focusedWorkItemId,
onRecorded,
}: {
queue: RemediationQueue | null | undefined;
focusedWorkItemId: string | null;
onRecorded: () => void;
}) {
const t = useTranslations("awooop.workItems.adr100Remediation");
// Keyed by work_item_id.
const [actionState, setActionState] = useState<Record<string, RecurrenceWorkItemActionState>>({});
const items = queue?.items ?? [];
const focusedItem = focusedWorkItemId
? items.find((item) => item.work_item_id === focusedWorkItemId)
: null;
// At most six cards are shown: the focused item (when found) followed by the
// first five others, otherwise simply the first six items.
const visibleItems = focusedItem
? [focusedItem, ...items.filter((item) => item !== focusedItem).slice(0, 5)]
: items.slice(0, 6);
const runAction = useCallback(async (
item: RemediationQueueItem,
action: "preview" | "dryRun"
) => {
const workItemId = item.work_item_id ?? "";
// Without an id there is nothing to address on the API and no state slot to update.
if (!workItemId) return;
// Mark the action as loading and clear any previous error; the previous
// result is kept via the spread until the new response arrives.
setActionState((current) => ({
...current,
[workItemId]: { ...current[workItemId], loading: action, error: null },
}));
// Items waiting on a PlayBook ticket request "ticket" mode explicitly; all
// other items send "auto".
const mode = item.remediation_status === "needs_playbook_ticket" ? "ticket" : "auto";
try {
// Preview is a GET with query parameters; dry run is a POST with a JSON body.
const result = action === "preview"
? await fetchJson<RecurrenceWorkItemActionResult>(
`${API_BASE}/api/v1/ai/slo/remediation/preview?work_item_id=${encodeURIComponent(workItemId)}&mode=${encodeURIComponent(mode)}`,
12000
)
: await postJson<RecurrenceWorkItemActionResult>(
`${API_BASE}/api/v1/ai/slo/remediation/dry-run`,
{ work_item_id: workItemId, mode },
15000
);
// A falsy result is surfaced as the generic failure message.
setActionState((current) => ({
...current,
[workItemId]: {
loading: null,
result,
error: result ? null : t("actions.failed"),
},
}));
// Let the parent react once the backend reports that history was recorded.
if (result?.history?.recorded) {
onRecorded();
}
} catch (error) {
// Prefer the thrown Error's message; fall back to the generic failure text.
setActionState((current) => ({
...current,
[workItemId]: {
loading: null,
result: null,
error: error instanceof Error ? error.message : t("actions.failed"),
},
}));
}
}, [onRecorded, t]);
return (
<section className="border border-[#e0ddd4] bg-white">
<div className="flex flex-wrap items-center justify-between gap-3 border-b border-[#e0ddd4] bg-[#faf9f3] px-4 py-3">
<div className="flex min-w-0 items-center gap-2">
<ListChecks className="h-4 w-4 text-brand-accent" aria-hidden="true" />
<div className="min-w-0">
<h3 className="text-sm font-semibold text-[#141413]">{t("title")}</h3>
<p className="mt-1 text-xs leading-5 text-[#77736a]">
{t("subtitle", {
total: queue?.total ?? 0,
ready: queue?.ready_for_ai ?? 0,
human: queue?.needs_human ?? 0,
})}
</p>
</div>
</div>
<Link
href="/governance"
className="inline-flex items-center gap-1.5 border border-[#d8d3c7] bg-white px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757]"
>
{t("openGovernance")}
<ArrowRight className="h-3.5 w-3.5" aria-hidden="true" />
</Link>
</div>
{visibleItems.length === 0 ? (
<div className="px-4 py-5 text-sm leading-6 text-[#77736a]">
{t("empty")}
</div>
) : (
<div className="grid gap-px bg-[#e0ddd4] lg:grid-cols-2">
{visibleItems.map((item) => {
const workItemId = item.work_item_id ?? "";
const state = workItemId ? actionState[workItemId] : undefined;
const result = state?.result ?? null;
const ticketPreview = result?.ticket_preview ?? null;
return (
<article key={workItemId || item.incident_id || item.auto_repair_id} className="min-w-0 bg-white p-4">
<div className="flex flex-wrap items-start justify-between gap-3">
<div className="min-w-0">
<p className="font-mono text-[11px] font-semibold text-[#77736a]">
{item.incident_id ?? "--"}
</p>
<h4 className="mt-1 truncate text-sm font-semibold text-[#141413]" title={item.alertname ?? undefined}>
{item.alertname ?? t("unknownAlert")}
</h4>
</div>
<span className={cn(
"inline-flex shrink-0 border px-2 py-0.5 text-[11px] font-semibold",
item.remediation_status === "needs_playbook_ticket"
? "border-[#f4c7a1] bg-[#fff7ed] text-[#9a4c18]"
: item.remediation_status?.startsWith("ready")
? "border-[#bbdfc5] bg-[#f1fbf3] text-[#24733d]"
: "border-[#ead5d5] bg-[#fff7f7] text-[#8b2f2f]"
)}>
{item.remediation_status ?? "--"}
</span>
</div>
<div className="mt-3 grid gap-2 text-xs leading-5 text-[#5f5b52] md:grid-cols-2">
<span className="min-w-0 truncate">
{t("fields.failure", { value: item.failure_class ?? "--" })}
</span>
<span className="min-w-0 truncate">
{t("fields.action", { value: item.remediation_action ?? "--" })}
</span>
<span className="min-w-0 truncate">
{t("fields.owner", { value: item.remediation_owner ?? "--" })}
</span>
<span className="min-w-0 truncate">
{t("fields.playbook", { value: item.playbook_id ?? "--" })}
</span>
</div>
<div className="mt-3 flex flex-wrap gap-2">
<button
type="button"
onClick={() => runAction(item, "preview")}
disabled={!workItemId || state?.loading === "preview"}
className="inline-flex items-center gap-1.5 border border-[#d8d3c7] bg-[#faf9f3] px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757] disabled:opacity-50"
>
<SearchCheck className="h-3.5 w-3.5" aria-hidden="true" />
{state?.loading === "preview" ? t("actions.loading") : t("actions.preview")}
</button>
<button
type="button"
onClick={() => runAction(item, "dryRun")}
disabled={!workItemId || state?.loading === "dryRun"}
className="inline-flex items-center gap-1.5 border border-[#d8d3c7] bg-white px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757] disabled:opacity-50"
>
<FileText className="h-3.5 w-3.5" aria-hidden="true" />
{state?.loading === "dryRun" ? t("actions.loading") : t("actions.dryRun")}
</button>
</div>
{state?.error ? (
<p className="mt-3 border border-[#ead5d5] bg-[#fff7f7] px-3 py-2 text-xs leading-5 text-[#8b2f2f]">
{state.error}
</p>
) : null}
{result ? (
<div className="mt-3 border border-[#eee9dd] bg-[#faf9f3] px-3 py-2 text-xs leading-5 text-[#5f5b52]">
<div className="flex flex-wrap gap-2">
<span className="font-mono">{t("result.mode", { value: result.mode ?? "--" })}</span>
<span className="font-mono">{t("result.allowed", { value: String(result.allowed ?? false) })}</span>
<span className="font-mono">{t("result.writes", {
incident: String(result.writes_incident_state ?? false),
autoRepair: String(result.writes_auto_repair_result ?? false),
})}</span>
</div>
{ticketPreview ? (
<div className="mt-2 border-t border-[#e0ddd4] pt-2">
<p className="font-semibold text-[#141413]">{ticketPreview.title ?? t("ticketFallback")}</p>
<p className="mt-1 line-clamp-3 whitespace-pre-line text-[#5f5b52]">
{ticketPreview.body_preview ?? "--"}
</p>
</div>
) : null}
</div>
) : null}
</article>
);
})}
</div>
)}
</section>
);
}
function RecurrenceWorkQueuePanel({
recurrence,
focusedWorkItemId,
@@ -5351,6 +5571,12 @@ export default function AwoooPWorkItemsPage() {
writesAutoRepairResult={latestRemediationHistory?.writes_auto_repair_result}
/>
<Adr100RemediationQueuePanel
queue={telemetry.slo?.adr100?.verification_coverage?.remediation_queue}
focusedWorkItemId={focusedWorkItemId}
onRecorded={fetchTelemetry}
/>
<AwoooPStatusChainPanel chain={telemetry.statusChain} />
<WorkItemIncidentAuditPanel