feat(governance): add remediation dry run entrypoint

2026-05-14 22:20:34 +08:00
parent 102f92dfc3
commit 04fdaee83a
8 changed files with 820 additions and 3 deletions
--- a/apps/api/src/api/v1/ai_slo.py
+++ b/apps/api/src/api/v1/ai_slo.py
@@ -18,8 +18,14 @@ Endpoints:
 from __future__ import annotations

 import structlog
-from fastapi import APIRouter, Query
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel, Field

+from src.services.adr100_remediation_service import (
+    RemediationMode,
+    RemediationNotFoundError,
+    get_adr100_remediation_service,
+)
 from src.services.adr100_slo_status_service import get_adr100_slo_status_service
 from src.services.ai_slo_calculator import AiSloCalculator

@@ -28,6 +34,20 @@ logger = structlog.get_logger(__name__)
 router = APIRouter()


+class RemediationPreviewRequest(BaseModel):
+    """JSON body accepted by the POST remediation preview endpoint."""
+
+    # Remediation queue item identifier; an empty string fails validation.
+    work_item_id: str = Field(..., min_length=1)
+    # "auto" leaves the reverify/replay choice to the remediation service.
+    mode: RemediationMode = Field(default="auto")
+
+
+class RemediationDryRunRequest(BaseModel):
+    """JSON body accepted by the remediation dry-run endpoint."""
+
+    # Remediation queue item identifier; an empty string fails validation.
+    work_item_id: str = Field(..., min_length=1)
+    # "auto" leaves the reverify/replay choice to the remediation service.
+    mode: RemediationMode = Field(default="auto")
+
+
@router.get("/ai/slo")
 async def get_ai_slo(
    force_refresh: bool = Query(False, description="忽略快取，強制重算"),
@@ -59,3 +79,42 @@ async def get_ai_slo(
    data["cache_hit"] = False
    data["adr100"] = await get_adr100_slo_status_service().fetch_report()
    return data
+
+
+@router.get("/ai/slo/remediation/preview")
+async def preview_ai_slo_remediation(
+    work_item_id: str = Query(..., min_length=1),
+    mode: RemediationMode = Query("auto"),
+) -> dict:
+    """Return the read-only remediation plan for one queue item (GET variant).
+
+    Responds with HTTP 404 and detail ``remediation_work_item_not_found`` when
+    the service raises ``RemediationNotFoundError``.
+    """
+
+    try:
+        service = get_adr100_remediation_service()
+        plan = await service.preview(work_item_id, mode)
+    except RemediationNotFoundError as exc:
+        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
+    return plan
+
+
+@router.post("/ai/slo/remediation/preview")
+async def preview_ai_slo_remediation_post(request: RemediationPreviewRequest) -> dict:
+    """Return the read-only remediation plan from a JSON request body.
+
+    Responds with HTTP 404 and detail ``remediation_work_item_not_found`` when
+    the service raises ``RemediationNotFoundError``.
+    """
+
+    try:
+        service = get_adr100_remediation_service()
+        plan = await service.preview(request.work_item_id, request.mode)
+    except RemediationNotFoundError as exc:
+        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
+    return plan
+
+
+@router.post("/ai/slo/remediation/dry-run")
+async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
+    """Execute the service's read-only dry-run for one remediation queue item.
+
+    Responds with HTTP 404 and detail ``remediation_work_item_not_found`` when
+    the service raises ``RemediationNotFoundError``.
+    """
+
+    try:
+        service = get_adr100_remediation_service()
+        outcome = await service.dry_run(request.work_item_id, request.mode)
+    except RemediationNotFoundError as exc:
+        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
+    return outcome
--- a/apps/api/src/services/adr100_remediation_service.py
+++ b/apps/api/src/services/adr100_remediation_service.py
@@ -0,0 +1,356 @@
+"""
+ADR-100 Remediation Service
+===========================
+Safe operator entrypoints for verification remediation work items.
+
+T25: remediation queue items are now actionable without mutating incident state:
+- preview: show the selected guardrail path
+- dry-run: collect read-only current state and validate supported executor routing
+"""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any, Literal, Protocol
+
+import structlog
+
+from src.models.incident import Incident
+from src.repositories.incident_repository import IncidentDBRepository
+from src.services.adr100_slo_status_service import (
+    Adr100SloStatusService,
+    get_adr100_slo_status_service,
+)
+from src.services.auto_repair_service import AutoRepairService
+from src.services.post_execution_verifier import (
+    PostExecutionVerifier,
+    _assess_recovery,
+    _build_prometheus_query,
+    get_post_execution_verifier,
+)
+
+logger = structlog.get_logger(__name__)
+
+RemediationMode = Literal["auto", "reverify", "replay"]
+
+_READY_STATUSES = {"ready_for_replay", "ready_for_reverify"}
+
+
+class RemediationNotFoundError(LookupError):
+    """Raised when a work item id is absent from the current remediation queue."""
+
+
+class _IncidentRepository(Protocol):
+    """Structural type for the single repository call this module makes."""
+
+    async def get_by_id(self, incident_id: str) -> Incident | None:
+        """Return the incident with ``incident_id``, or ``None`` if unknown."""
+
+
+class Adr100RemediationService:
+    """Build preview and dry-run payloads for ADR-100 remediation work items.
+
+    Work items come from the ``verification_coverage.remediation_queue``
+    section of the ADR-100 SLO report. Every payload produced here is labelled
+    ``safety_level="read_only"`` with both ``writes_*`` flags set to ``False``.
+    """
+
+    def __init__(
+        self,
+        *,
+        slo_service: Adr100SloStatusService | None = None,
+        incident_repository: _IncidentRepository | None = None,
+        auto_repair_service: AutoRepairService | None = None,
+        verifier: PostExecutionVerifier | None = None,
+    ) -> None:
+        """Store the collaborators, building a default for any falsy argument."""
+
+        self._slo_service = slo_service or get_adr100_slo_status_service()
+        self._incident_repository = incident_repository or IncidentDBRepository()
+        self._auto_repair_service = auto_repair_service or AutoRepairService()
+        self._verifier = verifier or get_post_execution_verifier()
+
+    async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
+        """Describe the plan for one queue item without collecting any state.
+
+        Raises:
+            RemediationNotFoundError: the work item is not in the queue.
+        """
+
+        work_item = await self._find_work_item(work_item_id)
+        resolved_mode = _select_mode(work_item, mode)
+        guardrail_checks = _base_checks(work_item)
+
+        payload: dict[str, Any] = {
+            "schema_version": "adr100_remediation_preview_v1",
+            "work_item_id": work_item.get("work_item_id"),
+            "incident_id": work_item.get("incident_id"),
+            "auto_repair_id": work_item.get("auto_repair_id"),
+            "mode": resolved_mode,
+            "allowed": all(check["passed"] for check in guardrail_checks),
+            "safety_level": "read_only",
+            "writes_incident_state": False,
+            "writes_auto_repair_result": False,
+            "checks": guardrail_checks,
+            "plan": _plan_for_item(work_item, resolved_mode),
+            "source": "adr100.verification_coverage.remediation_queue",
+        }
+        return payload
+
+    async def dry_run(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
+        """Run the guardrail checks and, if they pass, the mode-specific dry-run.
+
+        A blocked payload is returned when the incident cannot be loaded or
+        any check fails.
+
+        Raises:
+            RemediationNotFoundError: the work item is not in the queue.
+        """
+
+        work_item = await self._find_work_item(work_item_id)
+        resolved_mode = _select_mode(work_item, mode)
+        checks = _base_checks(work_item)
+
+        incident = await self._load_incident(work_item)
+        checks.append({
+            "name": "incident_loaded",
+            "passed": incident is not None,
+            "detail": work_item.get("incident_id") or "missing incident_id",
+        })
+
+        if incident is None or any(not check["passed"] for check in checks):
+            return _dry_run_blocked_payload(work_item, resolved_mode, checks)
+
+        runner = self._dry_run_replay if resolved_mode == "replay" else self._dry_run_reverify
+        return await runner(work_item, incident, checks)
+
+    async def _find_work_item(self, work_item_id: str) -> dict[str, Any]:
+        """Return a shallow copy of the matching queue item or raise."""
+
+        report = await self._slo_service.fetch_report()
+        queue = (report.get("verification_coverage") or {}).get("remediation_queue") or {}
+        match = next(
+            (
+                entry
+                for entry in queue.get("items") or []
+                if entry.get("work_item_id") == work_item_id
+            ),
+            None,
+        )
+        if match is None:
+            raise RemediationNotFoundError(work_item_id)
+        return dict(match)
+
+    async def _load_incident(self, item: dict[str, Any]) -> Incident | None:
+        """Fetch the item's incident; ``None`` when the id is missing or unknown."""
+
+        incident_id = str(item.get("incident_id") or "")
+        if incident_id:
+            return await self._incident_repository.get_by_id(incident_id)
+        return None
+
+    async def _dry_run_reverify(
+        self,
+        item: dict[str, Any],
+        incident: Incident,
+        checks: list[dict[str, Any]],
+    ) -> dict[str, Any]:
+        """Collect current state and assess it as a reverify preview."""
+
+        current_state = await self._collect_current_state(incident)
+        playbook = item.get("playbook_id") or "unknown"
+        assessment = _assess_recovery(None, current_state, f"dry_run_reverify:{playbook}")
+
+        verifier_route = {
+            "agent_id": "post_execution_verifier",
+            "required_scope": "read",
+            "is_shadow": True,
+            "flywheel_node": "verify",
+        }
+        return _dry_run_result_payload(
+            item=item,
+            mode="reverify",
+            checks=checks,
+            post_state=current_state,
+            verification_result_preview=assessment,
+            extra={
+                "promql": _promql_for_incident(incident),
+                "mcp_route": verifier_route,
+            },
+        )
+
+    async def _dry_run_replay(
+        self,
+        item: dict[str, Any],
+        incident: Incident,
+        checks: list[dict[str, Any]],
+    ) -> dict[str, Any]:
+        """Check the executor route, then collect and assess current state."""
+
+        command_preview = _diagnostic_command_for_incident(incident)
+        executor_route = self._auto_repair_service.preview_read_only_ssh_mcp_route(
+            incident,
+            command_preview,
+        )
+        route_check = {
+            "name": "supported_executor_route",
+            "passed": executor_route is not None,
+            "detail": "mcp:ssh_diagnose" if executor_route else "missing host/container route",
+        }
+        checks.append(route_check)
+
+        # NOTE(review): a missing route fails the check, so ``allowed`` becomes
+        # False, yet state is still collected and the payload reports
+        # ``executed`` True - confirm this is intended.
+        current_state = await self._collect_current_state(incident)
+        playbook = item.get("playbook_id") or "unknown"
+        assessment = _assess_recovery(None, current_state, f"dry_run_replay:{playbook}")
+
+        return _dry_run_result_payload(
+            item=item,
+            mode="replay",
+            checks=checks,
+            post_state=current_state,
+            verification_result_preview=assessment,
+            extra={
+                "diagnostic_command_preview": command_preview,
+                "mcp_route": executor_route,
+                "promql": _promql_for_incident(incident),
+            },
+        )
+
+    async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
+        """Ask the verifier for current state; best-effort with a 12 s limit.
+
+        A timeout or any other exception is logged as a warning and yields an
+        empty dict instead of propagating.
+        """
+
+        collected: dict[str, Any] = {}
+        try:
+            collected = await asyncio.wait_for(
+                self._verifier._collect_post_state(incident),
+                timeout=12.0,
+            )
+        except asyncio.TimeoutError:
+            logger.warning(
+                "adr100_remediation_dry_run_timeout",
+                incident_id=incident.incident_id,
+            )
+        except Exception as exc:
+            logger.warning(
+                "adr100_remediation_dry_run_collect_failed",
+                incident_id=incident.incident_id,
+                error=str(exc),
+            )
+        return collected
+
+
+def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]:
+    """Resolve the effective mode: an explicit request wins, else infer from the item.
+
+    For any other requested value the item selects ``reverify`` when either
+    its status or its action asks for it; everything else falls back to
+    ``replay``.
+    """
+
+    if requested == "reverify" or requested == "replay":
+        return requested
+
+    wants_reverify = (
+        item.get("remediation_status") == "ready_for_reverify"
+        or item.get("remediation_action") == "reverify_with_promql_template"
+    )
+    return "reverify" if wants_reverify else "replay"
+
+
+def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]:
+    """Return the three guardrail checks every preview and dry-run starts with.
+
+    Missing status/action values are reported as ``"unknown"`` and therefore
+    fail their check; the ``no_state_mutation`` check always passes.
+    """
+
+    status = str(item.get("remediation_status") or "unknown")
+    action = str(item.get("remediation_action") or "unknown")
+    read_only_actions = {
+        "replay_with_supported_executor",
+        "reverify_with_promql_template",
+    }
+
+    def _check(name: str, passed: bool, detail: str) -> dict[str, Any]:
+        return {"name": name, "passed": passed, "detail": detail}
+
+    return [
+        _check("queue_item_ready", status in _READY_STATUSES, status),
+        _check("read_only_guardrail", action in read_only_actions, action),
+        _check(
+            "no_state_mutation",
+            True,
+            "dry_run_does_not_update_incident_or_auto_repair_rows",
+        ),
+    ]
+
+
+def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]:
+    """Describe the single read-scoped step a dry-run in ``mode`` would take.
+
+    Any mode other than ``"reverify"`` yields the replay plan, which also
+    carries the item's ``remediation_action`` as ``target_action``.
+    """
+
+    if mode != "reverify":
+        return {
+            "step": "validate_supported_executor_route_then_collect_current_state",
+            "agent_id": "auto_repair_executor",
+            "required_scope": "read",
+            "writes": [],
+            "target_action": item.get("remediation_action"),
+        }
+
+    return {
+        "step": "collect_current_state_and_assess",
+        "agent_id": "post_execution_verifier",
+        "required_scope": "read",
+        "writes": [],
+    }
+
+
+def _dry_run_blocked_payload(
+    item: dict[str, Any],
+    mode: str,
+    checks: list[dict[str, Any]],
+) -> dict[str, Any]:
+    """Build the dry-run response for a request stopped by a failed check.
+
+    The payload always reports ``allowed`` and ``executed`` as ``False`` and
+    carries an empty ``post_state_summary``.
+    """
+
+    identity = {
+        key: item.get(key)
+        for key in ("work_item_id", "incident_id", "auto_repair_id")
+    }
+    return {
+        "schema_version": "adr100_remediation_dry_run_v1",
+        **identity,
+        "mode": mode,
+        "allowed": False,
+        "executed": False,
+        "safety_level": "read_only",
+        "writes_incident_state": False,
+        "writes_auto_repair_result": False,
+        "checks": checks,
+        "verification_result_preview": "blocked",
+        "post_state_summary": {},
+    }
+
+
+def _dry_run_result_payload(
+    *,
+    item: dict[str, Any],
+    mode: str,
+    checks: list[dict[str, Any]],
+    post_state: dict[str, Any],
+    verification_result_preview: str,
+    extra: dict[str, Any],
+) -> dict[str, Any]:
+    """Build the dry-run response for a run that reached state collection.
+
+    Args:
+        item: Remediation queue item the run was for.
+        mode: Mode the run used (``"reverify"`` or ``"replay"``).
+        checks: All guardrail checks; ``allowed`` is true only if every one passed.
+        post_state: Collected state, reported only as a summary.
+        verification_result_preview: Assessment computed from ``post_state``.
+        extra: Mode-specific additions appended after the core fields.
+
+    Returns:
+        The response dict: core fields first, then the ``extra`` keys.
+    """
+
+    payload: dict[str, Any] = {
+        "schema_version": "adr100_remediation_dry_run_v1",
+        "work_item_id": item.get("work_item_id"),
+        "incident_id": item.get("incident_id"),
+        "auto_repair_id": item.get("auto_repair_id"),
+        "mode": mode,
+        "allowed": all(check["passed"] for check in checks),
+        "executed": True,
+        "safety_level": "read_only",
+        "writes_incident_state": False,
+        "writes_auto_repair_result": False,
+        "checks": checks,
+        "verification_result_preview": verification_result_preview,
+        "post_state_summary": _summarize_post_state(post_state),
+    }
+    # Core fields are authoritative: an ``extra`` key that collides with one
+    # (e.g. "allowed" or "writes_incident_state") is ignored rather than being
+    # allowed to overwrite the safety flags.
+    for key, value in extra.items():
+        payload.setdefault(key, value)
+    return payload
+
+
+def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]:
+    """Reduce collected state to its key count and the first eight sorted keys."""
+
+    tool_names = sorted(post_state)
+    summary: dict[str, Any] = {"tool_count": len(tool_names)}
+    summary["tools"] = tool_names[:8]
+    summary["has_state"] = bool(post_state)
+    return summary
+
+
+def _diagnostic_command_for_incident(incident: Incident) -> str:
+    """Compose the SSH diagnostic command string shown in replay dry-runs.
+
+    The host comes from the ``host`` label, then ``instance``; when neither is
+    set the literal ``{host}`` placeholder is emitted (presumably resolved
+    later by the auto-repair host resolution - confirm). A container label,
+    if present, narrows ``docker stats`` to that container.
+    """
+
+    labels = _labels_for_incident(incident)
+    host = str(labels.get("host") or labels.get("instance") or "{host}")
+    container = str(labels.get("container_name") or labels.get("container") or "")
+
+    remote_command = "uptime; docker stats --no-stream"
+    if container:
+        remote_command = f"{remote_command} {container}"
+    return f"ssh {host} '{remote_command}'"
+
+
+def _promql_for_incident(incident: Incident) -> str:
+    """Build the PromQL preview from the incident's first signal.
+
+    The alert name is the ``alertname`` label, falling back to the signal's
+    ``alert_name`` attribute; an incident without signals uses an empty name.
+    """
+
+    labels = _labels_for_incident(incident)
+    if not incident.signals:
+        return _build_prometheus_query("", labels)
+
+    first_signal = incident.signals[0]
+    alertname = labels.get("alertname") or getattr(first_signal, "alert_name", "")
+    return _build_prometheus_query(alertname, labels)
+
+
+def _labels_for_incident(incident: Incident) -> dict[str, Any]:
+    """Return the first signal's labels, or an empty dict when there are none."""
+
+    if not incident.signals:
+        return {}
+    first_signal = incident.signals[0]
+    return first_signal.labels or {}
+
+
+_service: Adr100RemediationService | None = None
+
+
+def get_adr100_remediation_service() -> Adr100RemediationService:
+    """Return the module-wide service, creating it on first use."""
+
+    global _service
+    if _service is not None:
+        return _service
+    _service = Adr100RemediationService()
+    return _service
+
+
+def set_adr100_remediation_service(service: Adr100RemediationService | None) -> None:
+    """Replace the module-wide service (test hook).
+
+    Passing ``None`` clears it, so the next getter call builds a fresh one.
+    """
+
+    global _service
+    _service = service
--- a/apps/api/src/services/auto_repair_service.py
+++ b/apps/api/src/services/auto_repair_service.py
@@ -1003,6 +1003,29 @@ class AutoRepairService:

        return _SshMcpRoute(tool_name="ssh_diagnose", params=params)

+    def preview_read_only_ssh_mcp_route(
+        self,
+        incident: Incident,
+        command: str,
+    ) -> dict[str, Any] | None:
+        """Describe the MCP route a legacy SSH diagnostic would be sent through.
+
+        Remediation dry-runs call this to show that the supported executor
+        path exists; it only resolves the route and returns its description.
+
+        Returns:
+            The route's tool name and params plus the fixed executor metadata,
+            or ``None`` when the command cannot be routed.
+        """
+
+        mcp_route = self._route_legacy_ssh_command_to_mcp(incident, command)
+        if mcp_route is not None:
+            return {
+                "tool_name": mcp_route.tool_name,
+                "params": mcp_route.params,
+                "agent_id": "auto_repair_executor",
+                "required_scope": "read",
+                "flywheel_node": "execute",
+            }
+        return None
+
    def _resolve_ssh_host_for_incident(self, incident: Incident, command: str) -> str:
        """Resolve ``{host}``, short host labels, and exporter instance ports."""

--- a/apps/api/tests/test_adr100_remediation_service.py
+++ b/apps/api/tests/test_adr100_remediation_service.py
@@ -0,0 +1,227 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Any
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from src.api.v1.ai_slo import router
+from src.models.incident import Incident, IncidentStatus, Severity, Signal
+from src.models.playbook import Playbook
+from src.services.adr100_remediation_service import (
+    Adr100RemediationService,
+    RemediationNotFoundError,
+)
+from src.services.auto_repair_service import AutoRepairService
+
+
+class _FakeSloService:
+    """SLO service stub whose report holds nothing but a remediation queue."""
+
+    def __init__(self, items: list[dict[str, Any]]) -> None:
+        self.items = items
+
+    async def fetch_report(self) -> dict[str, Any]:
+        """Wrap the configured items in the nested report structure."""
+
+        queue = {"items": self.items}
+        coverage = {"remediation_queue": queue}
+        return {"verification_coverage": coverage}
+
+
+class _FakeIncidentRepository:
+    """Repository stub that knows at most one incident."""
+
+    def __init__(self, incident: Incident | None) -> None:
+        self.incident = incident
+
+    async def get_by_id(self, incident_id: str) -> Incident | None:
+        """Return the stored incident only when its id matches."""
+
+        stored = self.incident
+        if not stored or stored.incident_id != incident_id:
+            return None
+        return stored
+
+
+class _FakeVerifier:
+    """Verifier stub that returns a canned state and counts its calls."""
+
+    def __init__(self, state: dict[str, Any]) -> None:
+        self.state = state
+        self.calls = 0
+
+    async def _collect_post_state(self, incident: Incident) -> dict[str, Any]:
+        """Record the call and hand back the canned state unchanged."""
+
+        self.calls = self.calls + 1
+        return self.state
+
+
+class _NoopPlaybookService:
+    """Playbook service stub: no recommendations, no playbooks, writes succeed."""
+
+    async def get_recommendations(self, *_args, **_kwargs):  # noqa: ANN002, ANN003
+        """Always report that nothing is recommended."""
+
+        return []
+
+    async def get_by_id(self, _playbook_id: str) -> Playbook | None:
+        """Always report the playbook as unknown."""
+
+        return None
+
+    async def record_execution(self, _playbook_id: str, _success: bool) -> bool:
+        """Pretend the execution record was stored."""
+
+        return True
+
+
+async def _no_cooldown(*_args, **_kwargs) -> tuple[bool, str]:  # noqa: ANN002, ANN003
+    """Cooldown checker stub that accepts any arguments and returns a fixed pair."""
+
+    verdict = (True, "test")
+    return verdict
+
+
+def _incident() -> Incident:
+    """Build the P2 infrastructure incident shared by the tests in this module."""
+
+    alert = "DockerContainerMemoryLimitPressure"
+    signal = Signal(
+        alert_name=alert,
+        severity=Severity.P2,
+        source="prometheus",
+        fired_at=datetime.now(timezone.utc),
+        labels={
+            "alertname": alert,
+            "host": "110",
+            "container_name": "momo-scheduler",
+        },
+    )
+    return Incident(
+        incident_id="INC-20260514-TEST01",
+        status=IncidentStatus.INVESTIGATING,
+        severity=Severity.P2,
+        affected_services=["momo-scheduler"],
+        alert_category="infrastructure",
+        signals=[signal],
+    )
+
+
+def _queue_item(**overrides: Any) -> dict[str, Any]:
+    """Return a replay-ready queue item with ``overrides`` applied on top."""
+
+    defaults: dict[str, Any] = {
+        "work_item_id": "verification:INC-20260514-TEST01:are-1",
+        "incident_id": "INC-20260514-TEST01",
+        "auto_repair_id": "are-1",
+        "alertname": "DockerContainerMemoryLimitPressure",
+        "playbook_id": "PB-1",
+        "verification_result": "degraded",
+        "remediation_status": "ready_for_replay",
+        "remediation_action": "replay_with_supported_executor",
+        "remediation_owner": "auto_repair_executor",
+    }
+    return {**defaults, **overrides}
+
+
+def _service(
+    *,
+    item: dict[str, Any],
+    incident: Incident | None = None,
+    state: dict[str, Any] | None = None,
+) -> Adr100RemediationService:
+    """Assemble the service under test around fakes.
+
+    A falsy ``incident`` or ``state`` is replaced by the module's default
+    fixture, so passing ``None`` does not produce an empty repository.
+    """
+
+    auto_repair = AutoRepairService(
+        playbook_service=_NoopPlaybookService(),
+        cooldown_checker=_no_cooldown,
+    )
+    verifier = _FakeVerifier(state or {"k8s_get_pod_status": {"phase": "Running"}})
+    repository = _FakeIncidentRepository(incident or _incident())
+    return Adr100RemediationService(
+        slo_service=_FakeSloService([item]),
+        incident_repository=repository,
+        auto_repair_service=auto_repair,
+        verifier=verifier,
+    )
+
+
+@pytest.mark.asyncio
+async def test_preview_marks_replay_work_item_read_only():
+    """Previewing a replay-ready item yields an allowed, read-only replay plan."""
+
+    service = _service(item=_queue_item())
+
+    payload = await service.preview("verification:INC-20260514-TEST01:are-1")
+    plan = payload["plan"]
+
+    assert payload["allowed"] is True
+    assert payload["mode"] == "replay"
+    assert payload["safety_level"] == "read_only"
+    assert payload["writes_incident_state"] is False
+    assert plan["agent_id"] == "auto_repair_executor"
+    assert plan["writes"] == []
+
+
+@pytest.mark.asyncio
+async def test_dry_run_reverify_collects_state_without_writes():
+    """A reverify-ready item is dry-run through the verifier route, read-only."""
+
+    reverify_item = _queue_item(
+        remediation_status="ready_for_reverify",
+        remediation_action="reverify_with_promql_template",
+        remediation_owner="post_execution_verifier",
+    )
+    service = _service(
+        item=reverify_item,
+        state={"k8s_get_pod_status": {"phase": "Running"}},
+    )
+
+    payload = await service.dry_run("verification:INC-20260514-TEST01:are-1")
+    route = payload["mcp_route"]
+
+    assert payload["allowed"] is True
+    assert payload["executed"] is True
+    assert payload["mode"] == "reverify"
+    assert payload["verification_result_preview"] == "success"
+    assert payload["writes_auto_repair_result"] is False
+    assert payload["post_state_summary"]["tool_count"] == 1
+    assert route["agent_id"] == "post_execution_verifier"
+    assert route["required_scope"] == "read"
+
+
+@pytest.mark.asyncio
+async def test_dry_run_replay_validates_supported_executor_route():
+    """A replay dry-run reports the read-scoped ``ssh_diagnose`` executor route."""
+
+    service = _service(item=_queue_item())
+
+    payload = await service.dry_run("verification:INC-20260514-TEST01:are-1")
+    route = payload["mcp_route"]
+
+    assert payload["allowed"] is True
+    assert payload["mode"] == "replay"
+    assert route["agent_id"] == "auto_repair_executor"
+    assert route["tool_name"] == "ssh_diagnose"
+    assert route["required_scope"] == "read"
+    assert route["params"]["host"] == "192.168.0.110"
+    assert route["params"]["container_name"] == "momo-scheduler"
+    assert payload["diagnostic_command_preview"].startswith("ssh 110")
+
+
+@pytest.mark.asyncio
+async def test_dry_run_blocks_when_incident_missing():
+    """A dry-run is blocked when the repository cannot find the incident."""
+
+    service = _service(item=_queue_item(), incident=None)
+    # _service() swaps a None incident for the default fixture, so the empty
+    # repository has to be installed after construction.
+    service._incident_repository = _FakeIncidentRepository(None)
+
+    payload = await service.dry_run("verification:INC-20260514-TEST01:are-1")
+
+    assert payload["allowed"] is False
+    assert payload["executed"] is False
+    assert payload["verification_result_preview"] == "blocked"
+    failed_load_checks = [
+        check
+        for check in payload["checks"]
+        if check["name"] == "incident_loaded" and not check["passed"]
+    ]
+    assert failed_load_checks
+
+
+@pytest.mark.asyncio
+async def test_missing_work_item_raises_not_found():
+    """An id that is not in the queue raises ``RemediationNotFoundError``."""
+
+    service = _service(item=_queue_item())
+    unknown_id = "verification:missing"
+
+    with pytest.raises(RemediationNotFoundError):
+        await service.preview(unknown_id)
+
+
+def test_ai_slo_remediation_endpoints(monkeypatch):
+    """Exercise all three remediation endpoints against a stub service.
+
+    Covers the GET preview, the POST preview, the POST dry-run, and the
+    request validation that rejects an empty ``work_item_id``.
+    """
+
+    app = FastAPI()
+    app.include_router(router, prefix="/api/v1")
+
+    class _FakeService:
+        async def preview(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]:
+            return {"work_item_id": work_item_id, "mode": mode, "allowed": True}
+
+        async def dry_run(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]:
+            return {"work_item_id": work_item_id, "mode": mode, "executed": True}
+
+    monkeypatch.setattr(
+        "src.api.v1.ai_slo.get_adr100_remediation_service",
+        lambda: _FakeService(),
+    )
+
+    client = TestClient(app)
+    preview = client.get(
+        "/api/v1/ai/slo/remediation/preview",
+        params={"work_item_id": "verification:INC:are-1", "mode": "reverify"},
+    )
+    # The JSON-body variant of the preview endpoint, with the default mode.
+    preview_post = client.post(
+        "/api/v1/ai/slo/remediation/preview",
+        json={"work_item_id": "verification:INC:are-1"},
+    )
+    dry_run = client.post(
+        "/api/v1/ai/slo/remediation/dry-run",
+        json={"work_item_id": "verification:INC:are-1", "mode": "replay"},
+    )
+    # min_length=1 on the request model must reject an empty identifier.
+    empty_id = client.post(
+        "/api/v1/ai/slo/remediation/dry-run",
+        json={"work_item_id": "", "mode": "replay"},
+    )
+
+    assert preview.status_code == 200
+    assert preview.json()["mode"] == "reverify"
+    assert preview_post.status_code == 200
+    assert preview_post.json()["mode"] == "auto"
+    assert preview_post.json()["work_item_id"] == "verification:INC:are-1"
+    assert dry_run.status_code == 200
+    assert dry_run.json()["executed"] is True
+    assert empty_id.status_code == 422
--- a/apps/web/messages/en.json
+++ b/apps/web/messages/en.json
@@ -1414,6 +1414,11 @@
        "recentFindings": "Recent Non-success Verification",
        "remediationQueue": "Remediation Work Queue",
        "queueSummary": "Total {total}; AI-ready {ready}; human {human}",
+        "dryRunButton": "Dry run",
+        "dryRunLoading": "Running",
+        "dryRunResult": "{mode}; preview {result}; tools {tools}",
+        "dryRunBlocked": "Dry run blocked",
+        "dryRunError": "Dry run failed",
        "state": {
          "ok": "OK",
          "warning": "Needs tracking",
--- a/apps/web/messages/zh-TW.json
+++ b/apps/web/messages/zh-TW.json
@@ -1415,6 +1415,11 @@
        "recentFindings": "近期非成功驗證",
        "remediationQueue": "補救工作佇列",
        "queueSummary": "總數 {total}；AI 可接手 {ready}；人工 {human}",
+        "dryRunButton": "試跑",
+        "dryRunLoading": "試跑中",
+        "dryRunResult": "{mode}；預覽 {result}；工具 {tools}",
+        "dryRunBlocked": "試跑未放行",
+        "dryRunError": "試跑失敗",
        "state": {
          "ok": "正常",
          "warning": "需追蹤",
--- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx
+++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx
@@ -15,7 +15,7 @@

 import { useEffect, useState } from 'react'
 import { useTranslations } from 'next-intl'
-import { ShieldCheck, AlertTriangle } from 'lucide-react'
+import { ShieldCheck, AlertTriangle, PlayCircle, SearchCheck } from 'lucide-react'
 import { SloKpiCard, type SloMetric } from '@/components/governance/slo-kpi-card'
 import { SloViolationChart, type ViolationDataPoint } from '@/components/governance/slo-violation-chart'
 import { GlassCard } from '@/components/ui/glass-card'
@@ -144,6 +144,28 @@ interface SummaryApiResponse {
  days?: number
 }

+// Fields of the dry-run API response that this tab reads; all are optional.
+interface RemediationDryRunResponse {
+  mode?: string
+  // `false` makes the queue row show the "blocked" label instead of a result.
+  allowed?: boolean
+  executed?: boolean
+  verification_result_preview?: string
+  post_state_summary?: {
+    // Displayed as the "tools" figure in the result line.
+    tool_count?: number
+    tools?: string[]
+    has_state?: boolean
+  }
+  mcp_route?: {
+    agent_id?: string
+    tool_name?: string
+    required_scope?: string
+  } | null
+}
+
+// Per-work-item UI state of the dry-run button.
+interface RemediationActionState {
+  status: 'loading' | 'done' | 'error'
+  // Set together with status 'done'; absent while loading or after an error.
+  data?: RemediationDryRunResponse
+}
+
 // =============================================================================
 // Helpers
 // =============================================================================
@@ -220,6 +242,16 @@ function compactLabel(value?: string | null, fallback = '--'): string {
  return value.length > 54 ? `${value.slice(0, 54)}...` : value
 }

+// POSTs a dry-run request in "auto" mode for one work item and returns the
+// parsed JSON body; any non-2xx status is turned into a thrown Error.
+async function requestRemediationDryRun(workItemId: string): Promise<RemediationDryRunResponse> {
+  const endpoint = `${API_BASE}/api/v1/ai/slo/remediation/dry-run`
+  const payload = { work_item_id: workItemId, mode: 'auto' }
+  const response = await fetch(endpoint, {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(payload),
+  })
+  if (response.ok) {
+    return response.json()
+  }
+  throw new Error(`dry_run_failed:${response.status}`)
+}
+
 function buildMetrics(api: SloApiResponse): SloMetric[] {
  const adr100Metrics = api.adr100?.metrics
  if (adr100Metrics?.length) {
@@ -276,6 +308,7 @@ function buildMetrics(api: SloApiResponse): SloMetric[] {

 function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) {
  const t = useTranslations('governance.slo.coverage')
+  const [actionState, setActionState] = useState<Record<string, RemediationActionState>>({})
  const color = coverageTone(coverage?.status)
  const rows = [
    { label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') },
@@ -287,6 +320,25 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
  const recentFindings = coverage?.recent_non_success ?? []
  const remediationQueue = coverage?.remediation_queue

+  // Runs the dry-run for one queue row and tracks its loading/done/error state
+  // under the row's work item id.
+  const handleDryRun = async (workItemId: string) => {
+    const record = (next: RemediationActionState) => {
+      setActionState(prev => ({ ...prev, [workItemId]: next }))
+    }
+
+    record({ status: 'loading' })
+    try {
+      const data = await requestRemediationDryRun(workItemId)
+      record({ status: 'done', data })
+    } catch {
+      record({ status: 'error' })
+    }
+  }
+
  return (
    <GlassCard variant="subtle" padding="md">
      <div style={{ display: 'flex', flexDirection: 'column', gap: 12 }}>
@@ -379,7 +431,7 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
              {(remediationQueue.items ?? []).slice(0, 4).map(item => (
                <div key={item.work_item_id} style={{
                  display: 'grid',
-                  gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(160px, 1fr)',
+                  gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(150px, 0.9fr) minmax(150px, 0.8fr)',
                  gap: 10,
                  alignItems: 'center',
                  minWidth: 0,
@@ -410,6 +462,62 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
                      {compactLabel(item.remediation_reason)}
                    </div>
                  </div>
+                  <div style={{ minWidth: 0, display: 'flex', flexDirection: 'column', gap: 5 }}>
+                    <button
+                      type="button"
+                      onClick={() => { void handleDryRun(item.work_item_id) }}
+                      disabled={actionState[item.work_item_id]?.status === 'loading'}
+                      title={t('dryRunButton')}
+                      style={{
+                        width: 'fit-content',
+                        minHeight: 28,
+                        display: 'inline-flex',
+                        alignItems: 'center',
+                        gap: 6,
+                        padding: '5px 9px',
+                        borderRadius: 6,
+                        border: '0.5px solid rgba(20,20,19,0.14)',
+                        background: actionState[item.work_item_id]?.status === 'loading'
+                          ? 'rgba(135,134,127,0.08)'
+                          : 'rgba(34,197,94,0.08)',
+                        color: '#141413',
+                        fontFamily: "'DM Mono', monospace",
+                        fontSize: 10,
+                        cursor: actionState[item.work_item_id]?.status === 'loading' ? 'wait' : 'pointer',
+                      }}
+                    >
+                      <PlayCircle size={13} style={{ color: '#22C55E', flexShrink: 0 }} />
+                      <span>{actionState[item.work_item_id]?.status === 'loading' ? t('dryRunLoading') : t('dryRunButton')}</span>
+                    </button>
+                    {actionState[item.work_item_id]?.status === 'done' && (
+                      <div style={{
+                        display: 'flex',
+                        alignItems: 'flex-start',
+                        gap: 5,
+                        fontFamily: "'DM Mono', monospace",
+                        fontSize: 9,
+                        color: actionState[item.work_item_id].data?.allowed === false ? '#7c5a10' : '#166534',
+                        lineHeight: 1.4,
+                        overflowWrap: 'anywhere',
+                      }}>
+                        <SearchCheck size={12} style={{ flexShrink: 0, marginTop: 1 }} />
+                        <span>
+                          {actionState[item.work_item_id].data?.allowed === false
+                            ? t('dryRunBlocked')
+                            : t('dryRunResult', {
+                                mode: actionState[item.work_item_id].data?.mode ?? '--',
+                                result: actionState[item.work_item_id].data?.verification_result_preview ?? '--',
+                                tools: actionState[item.work_item_id].data?.post_state_summary?.tool_count ?? 0,
+                              })}
+                        </span>
+                      </div>
+                    )}
+                    {actionState[item.work_item_id]?.status === 'error' && (
+                      <div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#FF3300', lineHeight: 1.4 }}>
+                        {t('dryRunError')}
+                      </div>
+                    )}
+                  </div>
                </div>
              ))}
            </div>
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,37 @@
+## 2026-05-14 | T25 補救佇列新增安全試跑入口，replay/reverify 可先讀證據不改狀態
+
+**背景**：T24 已把 non-success verifier rows 轉成 `remediation_queue`，但 Operator 仍只能看見「應該 replay / reverify」，無法從前端或 API 直接觸發一個安全、可觀測、低風險的試跑步驟。這會讓「AI 可接手」停在文字標籤，還沒有形成可操作入口。
+
+**修正**：
+- 新增 `Adr100RemediationService`，從 ADR-100 `verification_coverage.remediation_queue` 找 work item，提供 read-only `preview` 與 `dry_run`。
+- 新增 API：
+  - `GET /api/v1/ai/slo/remediation/preview?work_item_id=...`
+  - `POST /api/v1/ai/slo/remediation/preview`
+  - `POST /api/v1/ai/slo/remediation/dry-run`
+- `dry_run` 不會更新 incident 狀態、不會新增 auto-repair result、不會做真正修復；它只做 queue readiness / read-only guardrail / incident loaded / supported executor route 等檢查，並用 verifier 收集當前狀態產生 `verification_result_preview`。
+- `ready_for_reverify` 走 `post_execution_verifier` read-only current-state collection，回傳 PromQL 與 MCP route metadata。
+- `ready_for_replay` 先驗證 legacy SSH diagnostic 是否可轉成 `auto_repair_executor -> mcp:ssh_diagnose -> required_scope=read`，再收集 current-state preview。
+- `AutoRepairService` 新增 `preview_read_only_ssh_mcp_route()`，讓 remediation dry-run 能驗證 supported executor path，而不碰私有修復執行流程。
+- `/governance` SLO tab 的補救工作佇列每筆新增「試跑」按鈕，呼叫 dry-run API 後回顯 mode、preview result、工具數；文案補齊 `zh-TW` / `en` i18n，使用 lucide icon，不用 emoji。
+
+**本地驗證**：
+- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/auto_repair_service.py`：pass。
+- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_auto_repair_service.py -q`：33 passed。
+- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q`：59 passed。
+- `ruff check --select F,E9 apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/auto_repair_service.py apps/api/tests/test_adr100_remediation_service.py`：pass。
+- i18n JSON parse / `git diff --check`：pass。
+- `pnpm --filter @awoooi/web typecheck`：pass。
+- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`：pass。
+- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`：pass。
+
+**推版與 production 驗證**：
+- 待 T25 commit 推送至 Gitea main 後進行驗證。
+
+**目前整體進度**：
+- Alertmanager 低風險自動修復主線：約 98%。
+- 完整 AI 自動化管理產品化：約 90%。
+- T25 把「補救工作」從可視化清單推到安全試跑入口。下一段應把 dry-run 結果寫回可稽核 timeline / work item history，並把真正可 auto-closure 的條件與需要建 Ticket / 人工介入的條件分開。
+
 ## 2026-05-14 | T24 非成功驗證補救工作佇列，讓舊 degraded 變成可追蹤工作項

 **背景**：T22/T23 已找出近 24h non-success verifier 的根因並修掉 executor / PromQL template 斷點，但 `/api/v1/ai/slo` 仍只把 historical degraded rows 顯示為 warning。Operator 仍無法直接判斷每筆舊 degraded 要 replay、reverify、建 Ticket，還是人工檢查。