diff --git a/apps/api/src/services/adr100_remediation_service.py b/apps/api/src/services/adr100_remediation_service.py index d67c5d76..f9ff5149 100644 --- a/apps/api/src/services/adr100_remediation_service.py +++ b/apps/api/src/services/adr100_remediation_service.py @@ -31,9 +31,11 @@ from src.services.post_execution_verifier import ( logger = structlog.get_logger(__name__) -RemediationMode = Literal["auto", "reverify", "replay"] +RemediationMode = Literal["auto", "reverify", "replay", "ticket"] _READY_STATUSES = {"ready_for_replay", "ready_for_reverify"} +_TICKET_STATUSES = {"needs_playbook_ticket"} +_TICKET_ACTIONS = {"create_playbook_ticket", "promote_diagnostic_to_repair_playbook"} class RemediationNotFoundError(LookupError): @@ -108,6 +110,8 @@ class Adr100RemediationService: payload["history"] = await self._record_dry_run_history(item, payload) return payload + if selected_mode == "ticket": + return await self._dry_run_ticket_proposal(item, incident, checks) if selected_mode == "replay": return await self._dry_run_replay(item, incident, checks) return await self._dry_run_reverify(item, incident, checks) @@ -255,6 +259,35 @@ class Adr100RemediationService: payload["history"] = await self._record_dry_run_history(item, payload) return payload + async def _dry_run_ticket_proposal( + self, + item: dict[str, Any], + incident: Incident, + checks: list[dict[str, Any]], + ) -> dict[str, Any]: + ticket_preview = _ticket_preview_for_item(item, incident) + checks.append({ + "name": "external_ticket_not_created", + "passed": True, + "detail": "dry_run_records_internal_history_only", + }) + + payload = _dry_run_result_payload( + item=item, + mode="ticket", + checks=checks, + post_state={}, + verification_result_preview="ticket_proposal", + extra={ + "ticket_preview": ticket_preview, + "writes_ticket": False, + "creates_external_ticket": False, + "plan": _plan_for_item(item, "ticket"), + }, + ) + payload["history"] = await self._record_dry_run_history(item, payload) + return payload + async def _collect_current_state(self, incident: Incident) -> dict[str, Any]: try: return await asyncio.wait_for( @@ -351,9 +384,15 @@ class Adr100RemediationService: return history -def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]: +def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay", "ticket"]: if requested in ("reverify", "replay"): return requested + if requested == "ticket": + return "ticket" + if item.get("remediation_status") in _TICKET_STATUSES: + return "ticket" + if item.get("remediation_action") in _TICKET_ACTIONS: + return "ticket" if item.get("remediation_status") == "ready_for_reverify": return "reverify" if item.get("remediation_action") == "reverify_with_promql_template": @@ -367,14 +406,15 @@ def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]: return [ { "name": "queue_item_ready", - "passed": status in _READY_STATUSES, + "passed": status in _READY_STATUSES or status in _TICKET_STATUSES, "detail": status, }, { - "name": "read_only_guardrail", + "name": "read_or_record_only_guardrail", "passed": action in { "replay_with_supported_executor", "reverify_with_promql_template", + *_TICKET_ACTIONS, }, "detail": action, }, @@ -394,6 +434,14 @@ def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]: "required_scope": "read", "writes": [], } + if mode == "ticket": + return { + "step": "create_playbook_authoring_ticket_proposal", + "agent_id": "openclaw_playbook_planner", + "required_scope": "record_only", + "writes": ["alert_operation_log", "timeline"], + "target_action": item.get("remediation_action"), + } return { "step": "validate_supported_executor_route_then_collect_current_state", "agent_id": "auto_repair_executor", @@ -419,6 +467,8 @@ def _dry_run_blocked_payload( "safety_level": "read_only", "writes_incident_state": False, "writes_auto_repair_result": False, + "writes_ticket": False, + "creates_external_ticket": False, "checks": checks, "verification_result_preview": "blocked", "post_state_summary": {}, @@ -445,6 +495,8 @@ def _dry_run_result_payload( "safety_level": "read_only", "writes_incident_state": False, "writes_auto_repair_result": False, + "writes_ticket": extra.get("writes_ticket", False), + "creates_external_ticket": extra.get("creates_external_ticket", False), "checks": checks, "verification_result_preview": verification_result_preview, "post_state_summary": _summarize_post_state(post_state), @@ -474,6 +526,10 @@ def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str, "safety_level": payload.get("safety_level"), "writes_incident_state": payload.get("writes_incident_state"), "writes_auto_repair_result": payload.get("writes_auto_repair_result"), + "writes_ticket": payload.get("writes_ticket"), + "creates_external_ticket": payload.get("creates_external_ticket"), + "ticket_preview": payload.get("ticket_preview"), + "plan": payload.get("plan"), "verification_result_preview": payload.get("verification_result_preview"), "post_state_summary": payload.get("post_state_summary"), "mcp_route": payload.get("mcp_route"), @@ -537,6 +593,10 @@ def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]: "required_scope": route.get("required_scope"), "writes_incident_state": context.get("writes_incident_state"), "writes_auto_repair_result": context.get("writes_auto_repair_result"), + "writes_ticket": context.get("writes_ticket"), + "creates_external_ticket": context.get("creates_external_ticket"), + "ticket_preview": context.get("ticket_preview"), + "plan": context.get("plan"), "checks": context.get("checks") or [], } @@ -572,6 +632,44 @@ def _diagnostic_command_for_incident(incident: Incident) -> str: return f"ssh {host} 'uptime; docker stats --no-stream'" +def _ticket_preview_for_item(item: dict[str, Any], incident: Incident) -> dict[str, Any]: + labels = _labels_for_incident(incident) + alertname = str(item.get("alertname") or labels.get("alertname") or "unknown_alert") + incident_id = str(item.get("incident_id") or incident.incident_id) + playbook_id = str(item.get("playbook_id") or "unknown_playbook") + host = str(labels.get("host") or labels.get("instance") or "unknown_host") + container = str(labels.get("container_name") or labels.get("container") or "") + target = f"host={host}" + (f" container={container}" if container else "") + title = f"[ADR-100] Promote diagnostic PlayBook to repair: {alertname}" + body = ( + f"Incident: {incident_id}\n" + f"Auto repair: {item.get('auto_repair_id') or 'unknown'}\n" + f"PlayBook: {playbook_id}\n" + f"Target: {target}\n" + f"Failure class: {item.get('failure_class') or 'observe_only_playbook'}\n" + "Required change: add a gated mutating repair step such as docker restart, " + "Ansible check-mode/apply, or another approved executor action, then keep " + "post-execution verification tied to the same target.\n" + "Guardrail: do not mark the old diagnostic-only run as verified_success." + ) + return { + "would_create": True, + "external_ticket_created": False, + "title": title, + "labels": [ + "adr100", + "playbook-authoring", + "observe-only-playbook", + "needs-owner-review", + ], + "body_preview": body[:1000], + "owner": item.get("remediation_owner") or "solver_or_operator", + "next_step": "author_mutating_repair_step", + "playbook_id": playbook_id, + "target": target, + } + + def _promql_for_incident(incident: Incident) -> str: labels = _labels_for_incident(incident) alertname = "" diff --git a/apps/api/tests/test_adr100_remediation_service.py b/apps/api/tests/test_adr100_remediation_service.py index 08f94f2a..35f30637 100644 --- a/apps/api/tests/test_adr100_remediation_service.py +++ b/apps/api/tests/test_adr100_remediation_service.py @@ -193,6 +193,65 @@ async def test_preview_marks_replay_work_item_read_only(): assert result["plan"]["writes"] == [] +@pytest.mark.asyncio +async def test_preview_marks_observe_only_work_item_as_ticket_proposal(): + item = _queue_item( + remediation_status="needs_playbook_ticket", + remediation_action="promote_diagnostic_to_repair_playbook", + remediation_owner="solver_or_operator", + failure_class="observe_only_playbook", + ) + svc = _service(item=item) + + result = await svc.preview("verification:INC-20260514-TEST01:are-1") + + assert result["allowed"] is True + assert result["mode"] == "ticket" + assert result["writes_incident_state"] is False + assert result["writes_auto_repair_result"] is False + assert result["plan"]["agent_id"] == "openclaw_playbook_planner" + assert result["plan"]["required_scope"] == "record_only" + assert result["plan"]["target_action"] == "promote_diagnostic_to_repair_playbook" + + +@pytest.mark.asyncio +async def test_dry_run_ticket_proposal_records_internal_history_only(): + alert_repo = _FakeAlertOperationLogRepository() + timeline = _FakeTimelineService() + item = _queue_item( + remediation_status="needs_playbook_ticket", + remediation_action="promote_diagnostic_to_repair_playbook", + remediation_owner="solver_or_operator", + failure_class="observe_only_playbook", + ) + svc = _service( + item=item, + timeline_service=timeline, + alert_operation_log_repository=alert_repo, + record_history=True, + ) + + result = await svc.dry_run("verification:INC-20260514-TEST01:are-1") + + assert result["allowed"] is True + assert result["executed"] is True + assert result["mode"] == "ticket" + assert result["verification_result_preview"] == "ticket_proposal" + assert result["writes_ticket"] is False + assert result["creates_external_ticket"] is False + assert result["ticket_preview"]["would_create"] is True + assert result["ticket_preview"]["external_ticket_created"] is False + assert result["ticket_preview"]["playbook_id"] == "PB-1" + assert "momo-scheduler" in result["ticket_preview"]["target"] + assert result["history"]["recorded"] is True + assert alert_repo.calls[0]["event_type"] == "PRE_FLIGHT_PASSED" + assert alert_repo.calls[0]["context"]["ticket_preview"]["next_step"] == ( + "author_mutating_repair_step" + ) + assert alert_repo.calls[0]["context"]["creates_external_ticket"] is False + assert timeline.calls[0]["actor_role"] == "ticket" + + @pytest.mark.asyncio async def test_dry_run_reverify_collects_state_without_writes(): item = _queue_item( diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 468351a9..b91c9bfc 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -2966,6 +2966,31 @@ "yes": "是", "no": "否" }, + "adr100Remediation": { + "title": "ADR-100 補救工作佇列", + "subtitle": "補救 {total} 筆;AI 可接手 {ready};需人工 / PlayBook 改造 {human}", + "openGovernance": "開啟治理", + "empty": "目前沒有非成功驗證補救工作;若 SLO 再出現 degraded / failed,會在這裡形成可操作項。", + "unknownAlert": "未知告警", + "ticketFallback": "PlayBook 改造草稿", + "fields": { + "failure": "失敗類型:{value}", + "action": "處置:{value}", + "owner": "Owner:{value}", + "playbook": "PlayBook:{value}" + }, + "actions": { + "preview": "預覽", + "dryRun": "預檢 / 草稿", + "loading": "處理中", + "failed": "補救工作操作失敗" + }, + "result": { + "mode": "模式={value}", + "allowed": "允許={value}", + "writes": "寫入 incident={incident} / autoRepair={autoRepair}" + } + }, "callbackTraceRecoveryActions": { "unavailable": "summary 未回傳,先確認 callback-replies API", "closed": "已符合關閉條件,保留歷史證據即可", diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 468351a9..b91c9bfc 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -2966,6 +2966,31 @@ "yes": "是", "no": "否" }, + "adr100Remediation": { + "title": "ADR-100 補救工作佇列", + "subtitle": "補救 {total} 筆;AI 可接手 {ready};需人工 / PlayBook 改造 {human}", + "openGovernance": "開啟治理", + "empty": "目前沒有非成功驗證補救工作;若 SLO 再出現 degraded / failed,會在這裡形成可操作項。", + "unknownAlert": "未知告警", + "ticketFallback": "PlayBook 改造草稿", + "fields": { + "failure": "失敗類型:{value}", + "action": "處置:{value}", + "owner": "Owner:{value}", + "playbook": "PlayBook:{value}" + }, + "actions": { + "preview": "預覽", + "dryRun": "預檢 / 草稿", + "loading": "處理中", + "failed": "補救工作操作失敗" + }, + "result": { + "mode": "模式={value}", + "allowed": "允許={value}", + "writes": "寫入 incident={incident} / autoRepair={autoRepair}" + } + }, "callbackTraceRecoveryActions": { "unavailable": "summary 未回傳,先確認 callback-replies API", "closed": "已符合關閉條件,保留歷史證據即可", diff --git a/apps/web/src/app/[locale]/awooop/work-items/page.tsx b/apps/web/src/app/[locale]/awooop/work-items/page.tsx index 97916e91..ec34d006 100644 --- a/apps/web/src/app/[locale]/awooop/work-items/page.tsx +++ b/apps/web/src/app/[locale]/awooop/work-items/page.tsx @@ -263,11 +263,35 @@ type SloResponse = { total: number; ready_for_ai: number; needs_human: number; + items?: RemediationQueueItem[]; + by_status?: Array<{ name?: string | null; count?: number | null }>; + by_action?: Array<{ name?: string | null; count?: number | null }>; }; }; }; }; +type RemediationQueueItem = { + work_item_id?: string | null; + incident_id?: string | null; + auto_repair_id?: string | null; + alertname?: string | null; + playbook_id?: string | null; + failure_class?: string | null; + verification_result?: string | null; + remediation_status?: string | null; + remediation_action?: string | null; + remediation_owner?: string | null; + remediation_reason?: string | null; + source?: string | null; + auto_created_at?: string | null; + verification_collected_at?: string | null; +}; + +type RemediationQueue = NonNullable< + NonNullable["verification_coverage"]>["remediation_queue"] +>; + type RemediationHistoryItem = { work_item_id?: string | null; incident_id?: string | null; @@ -2616,6 +2640,202 @@ function WorkItemIncidentAuditPanel({ ); } +function Adr100RemediationQueuePanel({ + queue, + focusedWorkItemId, + onRecorded, +}: { + queue: RemediationQueue | null | undefined; + focusedWorkItemId: string | null; + onRecorded: () => void; +}) { + const t = useTranslations("awooop.workItems.adr100Remediation"); + const [actionState, setActionState] = useState>({}); + const items = queue?.items ?? []; + const focusedItem = focusedWorkItemId + ? items.find((item) => item.work_item_id === focusedWorkItemId) + : null; + const visibleItems = focusedItem + ? [focusedItem, ...items.filter((item) => item !== focusedItem).slice(0, 5)] + : items.slice(0, 6); + + const runAction = useCallback(async ( + item: RemediationQueueItem, + action: "preview" | "dryRun" + ) => { + const workItemId = item.work_item_id ?? ""; + if (!workItemId) return; + setActionState((current) => ({ + ...current, + [workItemId]: { ...current[workItemId], loading: action, error: null }, + })); + + const mode = item.remediation_status === "needs_playbook_ticket" ? "ticket" : "auto"; + try { + const result = action === "preview" + ? await fetchJson( + `${API_BASE}/api/v1/ai/slo/remediation/preview?work_item_id=${encodeURIComponent(workItemId)}&mode=${encodeURIComponent(mode)}`, + 12000 + ) + : await postJson( + `${API_BASE}/api/v1/ai/slo/remediation/dry-run`, + { work_item_id: workItemId, mode }, + 15000 + ); + + setActionState((current) => ({ + ...current, + [workItemId]: { + loading: null, + result, + error: result ? null : t("actions.failed"), + }, + })); + if (result?.history?.recorded) { + onRecorded(); + } + } catch (error) { + setActionState((current) => ({ + ...current, + [workItemId]: { + loading: null, + result: null, + error: error instanceof Error ? error.message : t("actions.failed"), + }, + })); + } + }, [onRecorded, t]); + + return ( +
+
+
+
+ + {t("openGovernance")} +
+ + {visibleItems.length === 0 ? ( +
+ {t("empty")} +
+ ) : ( +
+ {visibleItems.map((item) => { + const workItemId = item.work_item_id ?? ""; + const state = workItemId ? actionState[workItemId] : undefined; + const result = state?.result ?? null; + const ticketPreview = result?.ticket_preview ?? null; + return ( +
+
+
+

+ {item.incident_id ?? "--"} +

+

+ {item.alertname ?? t("unknownAlert")} +

+
+ + {item.remediation_status ?? "--"} + +
+ +
+ + {t("fields.failure", { value: item.failure_class ?? "--" })} + + + {t("fields.action", { value: item.remediation_action ?? "--" })} + + + {t("fields.owner", { value: item.remediation_owner ?? "--" })} + + + {t("fields.playbook", { value: item.playbook_id ?? "--" })} + +
+ +
+ + +
+ + {state?.error ? ( +

+ {state.error} +

+ ) : null} + + {result ? ( +
+
+ {t("result.mode", { value: result.mode ?? "--" })} + {t("result.allowed", { value: String(result.allowed ?? false) })} + {t("result.writes", { + incident: String(result.writes_incident_state ?? false), + autoRepair: String(result.writes_auto_repair_result ?? false), + })} +
+ {ticketPreview ? ( +
+

{ticketPreview.title ?? t("ticketFallback")}

+

+ {ticketPreview.body_preview ?? "--"} +

+
+ ) : null} +
+ ) : null} +
+ ); + })} +
+ )} +
+ ); +} + function RecurrenceWorkQueuePanel({ recurrence, focusedWorkItemId, @@ -5351,6 +5571,12 @@ export default function AwoooPWorkItemsPage() { writesAutoRepairResult={latestRemediationHistory?.writes_auto_repair_result} /> + +