feat(adr100): surface playbook ticket remediation

2026-06-01 20:01:10 +08:00
parent 40e65730c1
commit a7b807dbfa
5 changed files with 437 additions and 4 deletions
--- a/apps/api/src/services/adr100_remediation_service.py
+++ b/apps/api/src/services/adr100_remediation_service.py
@@ -31,9 +31,11 @@ from src.services.post_execution_verifier import (

 logger = structlog.get_logger(__name__)

-RemediationMode = Literal["auto", "reverify", "replay"]
+RemediationMode = Literal["auto", "reverify", "replay", "ticket"]

 _READY_STATUSES = {"ready_for_replay", "ready_for_reverify"}
+_TICKET_STATUSES = {"needs_playbook_ticket"}
+_TICKET_ACTIONS = {"create_playbook_ticket", "promote_diagnostic_to_repair_playbook"}


 class RemediationNotFoundError(LookupError):
@@ -108,6 +110,8 @@ class Adr100RemediationService:
            payload["history"] = await self._record_dry_run_history(item, payload)
            return payload

+        if selected_mode == "ticket":
+            return await self._dry_run_ticket_proposal(item, incident, checks)
        if selected_mode == "replay":
            return await self._dry_run_replay(item, incident, checks)
        return await self._dry_run_reverify(item, incident, checks)
@@ -255,6 +259,35 @@ class Adr100RemediationService:
        payload["history"] = await self._record_dry_run_history(item, payload)
        return payload

+    async def _dry_run_ticket_proposal(
+        self,
+        item: dict[str, Any],
+        incident: Incident,
+        checks: list[dict[str, Any]],
+    ) -> dict[str, Any]:
+        ticket_preview = _ticket_preview_for_item(item, incident)
+        checks.append({
+            "name": "external_ticket_not_created",
+            "passed": True,
+            "detail": "dry_run_records_internal_history_only",
+        })
+
+        payload = _dry_run_result_payload(
+            item=item,
+            mode="ticket",
+            checks=checks,
+            post_state={},
+            verification_result_preview="ticket_proposal",
+            extra={
+                "ticket_preview": ticket_preview,
+                "writes_ticket": False,
+                "creates_external_ticket": False,
+                "plan": _plan_for_item(item, "ticket"),
+            },
+        )
+        payload["history"] = await self._record_dry_run_history(item, payload)
+        return payload
+
    async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
        try:
            return await asyncio.wait_for(
@@ -351,9 +384,15 @@ class Adr100RemediationService:
        return history


-def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]:
+def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay", "ticket"]:
    if requested in ("reverify", "replay"):
        return requested
+    if requested == "ticket":
+        return "ticket"
+    if item.get("remediation_status") in _TICKET_STATUSES:
+        return "ticket"
+    if item.get("remediation_action") in _TICKET_ACTIONS:
+        return "ticket"
    if item.get("remediation_status") == "ready_for_reverify":
        return "reverify"
    if item.get("remediation_action") == "reverify_with_promql_template":
@@ -367,14 +406,15 @@ def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]:
    return [
        {
            "name": "queue_item_ready",
-            "passed": status in _READY_STATUSES,
+            "passed": status in _READY_STATUSES or status in _TICKET_STATUSES,
            "detail": status,
        },
        {
-            "name": "read_only_guardrail",
+            "name": "read_or_record_only_guardrail",
            "passed": action in {
                "replay_with_supported_executor",
                "reverify_with_promql_template",
+                *_TICKET_ACTIONS,
            },
            "detail": action,
        },
@@ -394,6 +434,14 @@ def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]:
            "required_scope": "read",
            "writes": [],
        }
+    if mode == "ticket":
+        return {
+            "step": "create_playbook_authoring_ticket_proposal",
+            "agent_id": "openclaw_playbook_planner",
+            "required_scope": "record_only",
+            "writes": ["alert_operation_log", "timeline"],
+            "target_action": item.get("remediation_action"),
+        }
    return {
        "step": "validate_supported_executor_route_then_collect_current_state",
        "agent_id": "auto_repair_executor",
@@ -419,6 +467,8 @@ def _dry_run_blocked_payload(
        "safety_level": "read_only",
        "writes_incident_state": False,
        "writes_auto_repair_result": False,
+        "writes_ticket": False,
+        "creates_external_ticket": False,
        "checks": checks,
        "verification_result_preview": "blocked",
        "post_state_summary": {},
@@ -445,6 +495,8 @@ def _dry_run_result_payload(
        "safety_level": "read_only",
        "writes_incident_state": False,
        "writes_auto_repair_result": False,
+        "writes_ticket": extra.get("writes_ticket", False),
+        "creates_external_ticket": extra.get("creates_external_ticket", False),
        "checks": checks,
        "verification_result_preview": verification_result_preview,
        "post_state_summary": _summarize_post_state(post_state),
@@ -474,6 +526,10 @@ def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str,
        "safety_level": payload.get("safety_level"),
        "writes_incident_state": payload.get("writes_incident_state"),
        "writes_auto_repair_result": payload.get("writes_auto_repair_result"),
+        "writes_ticket": payload.get("writes_ticket"),
+        "creates_external_ticket": payload.get("creates_external_ticket"),
+        "ticket_preview": payload.get("ticket_preview"),
+        "plan": payload.get("plan"),
        "verification_result_preview": payload.get("verification_result_preview"),
        "post_state_summary": payload.get("post_state_summary"),
        "mcp_route": payload.get("mcp_route"),
@@ -537,6 +593,10 @@ def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
        "required_scope": route.get("required_scope"),
        "writes_incident_state": context.get("writes_incident_state"),
        "writes_auto_repair_result": context.get("writes_auto_repair_result"),
+        "writes_ticket": context.get("writes_ticket"),
+        "creates_external_ticket": context.get("creates_external_ticket"),
+        "ticket_preview": context.get("ticket_preview"),
+        "plan": context.get("plan"),
        "checks": context.get("checks") or [],
    }

@@ -572,6 +632,44 @@ def _diagnostic_command_for_incident(incident: Incident) -> str:
    return f"ssh {host} 'uptime; docker stats --no-stream'"


+def _ticket_preview_for_item(item: dict[str, Any], incident: Incident) -> dict[str, Any]:
+    labels = _labels_for_incident(incident)
+    alertname = str(item.get("alertname") or labels.get("alertname") or "unknown_alert")
+    incident_id = str(item.get("incident_id") or incident.incident_id)
+    playbook_id = str(item.get("playbook_id") or "unknown_playbook")
+    host = str(labels.get("host") or labels.get("instance") or "unknown_host")
+    container = str(labels.get("container_name") or labels.get("container") or "")
+    target = f"host={host}" + (f" container={container}" if container else "")
+    title = f"[ADR-100] Promote diagnostic PlayBook to repair: {alertname}"
+    body = (
+        f"Incident: {incident_id}\n"
+        f"Auto repair: {item.get('auto_repair_id') or 'unknown'}\n"
+        f"PlayBook: {playbook_id}\n"
+        f"Target: {target}\n"
+        f"Failure class: {item.get('failure_class') or 'observe_only_playbook'}\n"
+        "Required change: add a gated mutating repair step such as docker restart, "
+        "Ansible check-mode/apply, or another approved executor action, then keep "
+        "post-execution verification tied to the same target.\n"
+        "Guardrail: do not mark the old diagnostic-only run as verified_success."
+    )
+    return {
+        "would_create": True,
+        "external_ticket_created": False,
+        "title": title,
+        "labels": [
+            "adr100",
+            "playbook-authoring",
+            "observe-only-playbook",
+            "needs-owner-review",
+        ],
+        "body_preview": body[:1000],
+        "owner": item.get("remediation_owner") or "solver_or_operator",
+        "next_step": "author_mutating_repair_step",
+        "playbook_id": playbook_id,
+        "target": target,
+    }
+
+
 def _promql_for_incident(incident: Incident) -> str:
    labels = _labels_for_incident(incident)
    alertname = ""
--- a/apps/api/tests/test_adr100_remediation_service.py
+++ b/apps/api/tests/test_adr100_remediation_service.py
@@ -193,6 +193,65 @@ async def test_preview_marks_replay_work_item_read_only():
    assert result["plan"]["writes"] == []


+@pytest.mark.asyncio
+async def test_preview_marks_observe_only_work_item_as_ticket_proposal():
+    item = _queue_item(
+        remediation_status="needs_playbook_ticket",
+        remediation_action="promote_diagnostic_to_repair_playbook",
+        remediation_owner="solver_or_operator",
+        failure_class="observe_only_playbook",
+    )
+    svc = _service(item=item)
+
+    result = await svc.preview("verification:INC-20260514-TEST01:are-1")
+
+    assert result["allowed"] is True
+    assert result["mode"] == "ticket"
+    assert result["writes_incident_state"] is False
+    assert result["writes_auto_repair_result"] is False
+    assert result["plan"]["agent_id"] == "openclaw_playbook_planner"
+    assert result["plan"]["required_scope"] == "record_only"
+    assert result["plan"]["target_action"] == "promote_diagnostic_to_repair_playbook"
+
+
+@pytest.mark.asyncio
+async def test_dry_run_ticket_proposal_records_internal_history_only():
+    alert_repo = _FakeAlertOperationLogRepository()
+    timeline = _FakeTimelineService()
+    item = _queue_item(
+        remediation_status="needs_playbook_ticket",
+        remediation_action="promote_diagnostic_to_repair_playbook",
+        remediation_owner="solver_or_operator",
+        failure_class="observe_only_playbook",
+    )
+    svc = _service(
+        item=item,
+        timeline_service=timeline,
+        alert_operation_log_repository=alert_repo,
+        record_history=True,
+    )
+
+    result = await svc.dry_run("verification:INC-20260514-TEST01:are-1")
+
+    assert result["allowed"] is True
+    assert result["executed"] is True
+    assert result["mode"] == "ticket"
+    assert result["verification_result_preview"] == "ticket_proposal"
+    assert result["writes_ticket"] is False
+    assert result["creates_external_ticket"] is False
+    assert result["ticket_preview"]["would_create"] is True
+    assert result["ticket_preview"]["external_ticket_created"] is False
+    assert result["ticket_preview"]["playbook_id"] == "PB-1"
+    assert "momo-scheduler" in result["ticket_preview"]["target"]
+    assert result["history"]["recorded"] is True
+    assert alert_repo.calls[0]["event_type"] == "PRE_FLIGHT_PASSED"
+    assert alert_repo.calls[0]["context"]["ticket_preview"]["next_step"] == (
+        "author_mutating_repair_step"
+    )
+    assert alert_repo.calls[0]["context"]["creates_external_ticket"] is False
+    assert timeline.calls[0]["actor_role"] == "ticket"
+
+
 @pytest.mark.asyncio
 async def test_dry_run_reverify_collects_state_without_writes():
    item = _queue_item(
--- a/apps/web/messages/en.json
+++ b/apps/web/messages/en.json
@@ -2966,6 +2966,31 @@
        "yes": "是",
        "no": "否"
      },
+      "adr100Remediation": {
+        "title": "ADR-100 補救工作佇列",
+        "subtitle": "補救 {total} 筆；AI 可接手 {ready}；需人工 / PlayBook 改造 {human}",
+        "openGovernance": "開啟治理",
+        "empty": "目前沒有非成功驗證補救工作；若 SLO 再出現 degraded / failed，會在這裡形成可操作項。",
+        "unknownAlert": "未知告警",
+        "ticketFallback": "PlayBook 改造草稿",
+        "fields": {
+          "failure": "失敗類型：{value}",
+          "action": "處置：{value}",
+          "owner": "Owner：{value}",
+          "playbook": "PlayBook：{value}"
+        },
+        "actions": {
+          "preview": "預覽",
+          "dryRun": "預檢 / 草稿",
+          "loading": "處理中",
+          "failed": "補救工作操作失敗"
+        },
+        "result": {
+          "mode": "模式={value}",
+          "allowed": "允許={value}",
+          "writes": "寫入 incident={incident} / autoRepair={autoRepair}"
+        }
+      },
      "callbackTraceRecoveryActions": {
        "unavailable": "summary 未回傳，先確認 callback-replies API",
        "closed": "已符合關閉條件，保留歷史證據即可",
--- a/apps/web/messages/zh-TW.json
+++ b/apps/web/messages/zh-TW.json
@@ -2966,6 +2966,31 @@
        "yes": "是",
        "no": "否"
      },
+      "adr100Remediation": {
+        "title": "ADR-100 補救工作佇列",
+        "subtitle": "補救 {total} 筆；AI 可接手 {ready}；需人工 / PlayBook 改造 {human}",
+        "openGovernance": "開啟治理",
+        "empty": "目前沒有非成功驗證補救工作；若 SLO 再出現 degraded / failed，會在這裡形成可操作項。",
+        "unknownAlert": "未知告警",
+        "ticketFallback": "PlayBook 改造草稿",
+        "fields": {
+          "failure": "失敗類型：{value}",
+          "action": "處置：{value}",
+          "owner": "Owner：{value}",
+          "playbook": "PlayBook：{value}"
+        },
+        "actions": {
+          "preview": "預覽",
+          "dryRun": "預檢 / 草稿",
+          "loading": "處理中",
+          "failed": "補救工作操作失敗"
+        },
+        "result": {
+          "mode": "模式={value}",
+          "allowed": "允許={value}",
+          "writes": "寫入 incident={incident} / autoRepair={autoRepair}"
+        }
+      },
      "callbackTraceRecoveryActions": {
        "unavailable": "summary 未回傳，先確認 callback-replies API",
        "closed": "已符合關閉條件，保留歷史證據即可",
--- a/apps/web/src/app/[locale]/awooop/work-items/page.tsx
+++ b/apps/web/src/app/[locale]/awooop/work-items/page.tsx
@@ -263,11 +263,35 @@ type SloResponse = {
        total: number;
        ready_for_ai: number;
        needs_human: number;
+        items?: RemediationQueueItem[];
+        by_status?: Array<{ name?: string | null; count?: number | null }>;
+        by_action?: Array<{ name?: string | null; count?: number | null }>;
      };
    };
  };
 };

+/** One ADR-100 remediation queue entry; every field is optional and may be null. */
+type RemediationQueueItem = {
+  work_item_id?: string | null;
+  incident_id?: string | null;
+  auto_repair_id?: string | null;
+  alertname?: string | null;
+  playbook_id?: string | null;
+  failure_class?: string | null;
+  verification_result?: string | null;
+  remediation_status?: string | null;
+  remediation_action?: string | null;
+  remediation_owner?: string | null;
+  remediation_reason?: string | null;
+  source?: string | null;
+  auto_created_at?: string | null;
+  verification_collected_at?: string | null;
+};
+
+type RemediationQueue =
+  NonNullable<NonNullable<NonNullable<SloResponse["adr100"]>["verification_coverage"]>["remediation_queue"]>;
+
 type RemediationHistoryItem = {
  work_item_id?: string | null;
  incident_id?: string | null;
@@ -2616,6 +2640,202 @@ function WorkItemIncidentAuditPanel({
  );
 }

+/**
+ * ADR-100 remediation queue panel: shows up to six queue items (the focused one first)
+ * and runs a preview (`fetchJson`) or dry run (`postJson`) against the remediation API.
+ */
+function Adr100RemediationQueuePanel({
+  queue,
+  focusedWorkItemId,
+  onRecorded,
+}: {
+  queue: RemediationQueue | null | undefined;
+  focusedWorkItemId: string | null;
+  onRecorded: () => void;
+}) {
+  const t = useTranslations("awooop.workItems.adr100Remediation");
+  const [actionState, setActionState] = useState<Record<string, RecurrenceWorkItemActionState>>({});
+  const items = queue?.items ?? [];
+  const focusedItem = focusedWorkItemId
+    ? items.find((item) => item.work_item_id === focusedWorkItemId)
+    : null;
+  const visibleItems = focusedItem
+    ? [focusedItem, ...items.filter((item) => item !== focusedItem).slice(0, 5)]
+    : items.slice(0, 6);
+
+  const runAction = useCallback(async (item: RemediationQueueItem, action: "preview" | "dryRun") => {
+    const workItemId = item.work_item_id ?? "";
+    if (!workItemId) return;
+    setActionState((current) => ({
+      ...current,
+      [workItemId]: { ...current[workItemId], loading: action, error: null },
+    }));
+
+    const mode = item.remediation_status === "needs_playbook_ticket" ? "ticket" : "auto";
+    try {
+      const result = action === "preview"
+        ? await fetchJson<RecurrenceWorkItemActionResult>(
+            `${API_BASE}/api/v1/ai/slo/remediation/preview?work_item_id=${encodeURIComponent(workItemId)}&mode=${encodeURIComponent(mode)}`,
+            12000
+          )
+        : await postJson<RecurrenceWorkItemActionResult>(
+            `${API_BASE}/api/v1/ai/slo/remediation/dry-run`,
+            { work_item_id: workItemId, mode },
+            15000
+          );
+
+      setActionState((current) => ({
+        ...current,
+        [workItemId]: { loading: null, result, error: result ? null : t("actions.failed") },
+      }));
+      if (result?.history?.recorded) {
+        onRecorded();
+      }
+    } catch (error) {
+      setActionState((current) => ({
+        ...current,
+        [workItemId]: {
+          loading: null,
+          result: null,
+          error: error instanceof Error ? error.message : t("actions.failed"),
+        },
+      }));
+    }
+  }, [onRecorded, t]);
+
+  return (
+    <section className="border border-[#e0ddd4] bg-white">
+      <div className="flex flex-wrap items-center justify-between gap-3 border-b border-[#e0ddd4] bg-[#faf9f3] px-4 py-3">
+        <div className="flex min-w-0 items-center gap-2">
+          <ListChecks className="h-4 w-4 text-brand-accent" aria-hidden="true" />
+          <div className="min-w-0">
+            <h3 className="text-sm font-semibold text-[#141413]">{t("title")}</h3>
+            <p className="mt-1 text-xs leading-5 text-[#77736a]">
+              {t("subtitle", {
+                total: queue?.total ?? 0,
+                ready: queue?.ready_for_ai ?? 0,
+                human: queue?.needs_human ?? 0,
+              })}
+            </p>
+          </div>
+        </div>
+        <Link
+          href="/governance"
+          className="inline-flex items-center gap-1.5 border border-[#d8d3c7] bg-white px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757]"
+        >
+          {t("openGovernance")}
+          <ArrowRight className="h-3.5 w-3.5" aria-hidden="true" />
+        </Link>
+      </div>
+
+      {visibleItems.length === 0 ? (
+        <div className="px-4 py-5 text-sm leading-6 text-[#77736a]">
+          {t("empty")}
+        </div>
+      ) : (
+        <div className="grid gap-px bg-[#e0ddd4] lg:grid-cols-2">
+          {visibleItems.map((item, index) => {
+            const workItemId = item.work_item_id ?? "";
+            const state = workItemId ? actionState[workItemId] : undefined;
+            const result = state?.result ?? null;
+            const ticketPreview = result?.ticket_preview ?? null;
+            // Disable both buttons while either request is pending so overlapping
+            // preview / dry-run responses cannot overwrite each other's state.
+            const busy = Boolean(state?.loading);
+            return (
+              <article key={workItemId || item.incident_id || item.auto_repair_id || `queue-item-${index}`} className="min-w-0 bg-white p-4">
+                <div className="flex flex-wrap items-start justify-between gap-3">
+                  <div className="min-w-0">
+                    <p className="font-mono text-[11px] font-semibold text-[#77736a]">
+                      {item.incident_id ?? "--"}
+                    </p>
+                    <h4 className="mt-1 truncate text-sm font-semibold text-[#141413]" title={item.alertname ?? undefined}>
+                      {item.alertname ?? t("unknownAlert")}
+                    </h4>
+                  </div>
+                  <span className={cn(
+                    "inline-flex shrink-0 border px-2 py-0.5 text-[11px] font-semibold",
+                    item.remediation_status === "needs_playbook_ticket"
+                      ? "border-[#f4c7a1] bg-[#fff7ed] text-[#9a4c18]"
+                      : item.remediation_status?.startsWith("ready")
+                        ? "border-[#bbdfc5] bg-[#f1fbf3] text-[#24733d]"
+                        : "border-[#ead5d5] bg-[#fff7f7] text-[#8b2f2f]"
+                  )}>
+                    {item.remediation_status ?? "--"}
+                  </span>
+                </div>
+
+                <div className="mt-3 grid gap-2 text-xs leading-5 text-[#5f5b52] md:grid-cols-2">
+                  <span className="min-w-0 truncate">
+                    {t("fields.failure", { value: item.failure_class ?? "--" })}
+                  </span>
+                  <span className="min-w-0 truncate">
+                    {t("fields.action", { value: item.remediation_action ?? "--" })}
+                  </span>
+                  <span className="min-w-0 truncate">
+                    {t("fields.owner", { value: item.remediation_owner ?? "--" })}
+                  </span>
+                  <span className="min-w-0 truncate">
+                    {t("fields.playbook", { value: item.playbook_id ?? "--" })}
+                  </span>
+                </div>
+
+                <div className="mt-3 flex flex-wrap gap-2">
+                  <button
+                    type="button"
+                    onClick={() => runAction(item, "preview")}
+                    disabled={!workItemId || busy}
+                    className="inline-flex items-center gap-1.5 border border-[#d8d3c7] bg-[#faf9f3] px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757] disabled:opacity-50"
+                  >
+                    <SearchCheck className="h-3.5 w-3.5" aria-hidden="true" />
+                    {state?.loading === "preview" ? t("actions.loading") : t("actions.preview")}
+                  </button>
+                  <button
+                    type="button"
+                    onClick={() => runAction(item, "dryRun")}
+                    disabled={!workItemId || busy}
+                    className="inline-flex items-center gap-1.5 border border-[#d8d3c7] bg-white px-2.5 py-1 text-xs font-semibold text-[#141413] hover:border-[#d97757] disabled:opacity-50"
+                  >
+                    <FileText className="h-3.5 w-3.5" aria-hidden="true" />
+                    {state?.loading === "dryRun" ? t("actions.loading") : t("actions.dryRun")}
+                  </button>
+                </div>
+
+                {state?.error ? (
+                  <p className="mt-3 border border-[#ead5d5] bg-[#fff7f7] px-3 py-2 text-xs leading-5 text-[#8b2f2f]">
+                    {state.error}
+                  </p>
+                ) : null}
+
+                {result ? (
+                  <div className="mt-3 border border-[#eee9dd] bg-[#faf9f3] px-3 py-2 text-xs leading-5 text-[#5f5b52]">
+                    <div className="flex flex-wrap gap-2">
+                      <span className="font-mono">{t("result.mode", { value: result.mode ?? "--" })}</span>
+                      <span className="font-mono">{t("result.allowed", { value: String(result.allowed ?? false) })}</span>
+                      <span className="font-mono">{t("result.writes", {
+                        incident: String(result.writes_incident_state ?? false),
+                        autoRepair: String(result.writes_auto_repair_result ?? false),
+                      })}</span>
+                    </div>
+                    {ticketPreview ? (
+                      <div className="mt-2 border-t border-[#e0ddd4] pt-2">
+                        <p className="font-semibold text-[#141413]">{ticketPreview.title ?? t("ticketFallback")}</p>
+                        <p className="mt-1 line-clamp-3 whitespace-pre-line text-[#5f5b52]">
+                          {ticketPreview.body_preview ?? "--"}
+                        </p>
+                      </div>
+                    ) : null}
+                  </div>
+                ) : null}
+              </article>
+            );
+          })}
+        </div>
+      )}
+    </section>
+  );
+}
+
 function RecurrenceWorkQueuePanel({
  recurrence,
  focusedWorkItemId,
@@ -5351,6 +5571,12 @@ export default function AwoooPWorkItemsPage() {
        writesAutoRepairResult={latestRemediationHistory?.writes_auto_repair_result}
      />

+      <Adr100RemediationQueuePanel
+        queue={telemetry.slo?.adr100?.verification_coverage?.remediation_queue}
+        focusedWorkItemId={focusedWorkItemId}
+        onRecorded={fetchTelemetry}
+      />
+
      <AwoooPStatusChainPanel chain={telemetry.statusChain} />

      <WorkItemIncidentAuditPanel