feat(governance): add remediation dry run entrypoint
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m5s
CD Pipeline / build-and-deploy (push) Successful in 3m43s
CD Pipeline / post-deploy-checks (push) Successful in 1m33s

This commit is contained in:
Your Name
2026-05-14 22:20:34 +08:00
parent 102f92dfc3
commit 04fdaee83a
8 changed files with 820 additions and 3 deletions

View File

@@ -18,8 +18,14 @@ Endpoints:
from __future__ import annotations
import structlog
from fastapi import APIRouter, Query
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel, Field
from src.services.adr100_remediation_service import (
RemediationMode,
RemediationNotFoundError,
get_adr100_remediation_service,
)
from src.services.adr100_slo_status_service import get_adr100_slo_status_service
from src.services.ai_slo_calculator import AiSloCalculator
@@ -28,6 +34,20 @@ logger = structlog.get_logger(__name__)
router = APIRouter()
class RemediationPreviewRequest(BaseModel):
"""ADR-100 remediation preview request."""
work_item_id: str = Field(min_length=1)
mode: RemediationMode = "auto"
class RemediationDryRunRequest(BaseModel):
"""ADR-100 remediation dry-run request."""
work_item_id: str = Field(min_length=1)
mode: RemediationMode = "auto"
@router.get("/ai/slo")
async def get_ai_slo(
force_refresh: bool = Query(False, description="忽略快取,強制重算"),
@@ -59,3 +79,42 @@ async def get_ai_slo(
data["cache_hit"] = False
data["adr100"] = await get_adr100_slo_status_service().fetch_report()
return data
@router.get("/ai/slo/remediation/preview")
async def preview_ai_slo_remediation(
work_item_id: str = Query(..., min_length=1),
mode: RemediationMode = Query("auto"),
) -> dict:
"""Preview the safe remediation plan for one ADR-100 queue item."""
try:
return await get_adr100_remediation_service().preview(work_item_id, mode)
except RemediationNotFoundError as exc:
raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
@router.post("/ai/slo/remediation/preview")
async def preview_ai_slo_remediation_post(request: RemediationPreviewRequest) -> dict:
"""POST variant for clients that prefer JSON bodies."""
try:
return await get_adr100_remediation_service().preview(
request.work_item_id,
request.mode,
)
except RemediationNotFoundError as exc:
raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
@router.post("/ai/slo/remediation/dry-run")
async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
"""Run a read-only ADR-100 remediation dry-run."""
try:
return await get_adr100_remediation_service().dry_run(
request.work_item_id,
request.mode,
)
except RemediationNotFoundError as exc:
raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc

View File

@@ -0,0 +1,356 @@
"""
ADR-100 Remediation Service
===========================
Safe operator entrypoints for verification remediation work items.
T25: remediation queue items are now actionable without mutating incident state:
- preview: show the selected guardrail path
- dry-run: collect read-only current state and validate supported executor routing
"""
from __future__ import annotations
import asyncio
from typing import Any, Literal, Protocol
import structlog
from src.models.incident import Incident
from src.repositories.incident_repository import IncidentDBRepository
from src.services.adr100_slo_status_service import (
Adr100SloStatusService,
get_adr100_slo_status_service,
)
from src.services.auto_repair_service import AutoRepairService
from src.services.post_execution_verifier import (
PostExecutionVerifier,
_assess_recovery,
_build_prometheus_query,
get_post_execution_verifier,
)
logger = structlog.get_logger(__name__)
RemediationMode = Literal["auto", "reverify", "replay"]
_READY_STATUSES = {"ready_for_replay", "ready_for_reverify"}
class RemediationNotFoundError(LookupError):
"""Requested ADR-100 remediation work item is not in the current read model."""
class _IncidentRepository(Protocol):
async def get_by_id(self, incident_id: str) -> Incident | None:
...
class Adr100RemediationService:
"""Read-only remediation preview and dry-run service."""
def __init__(
self,
*,
slo_service: Adr100SloStatusService | None = None,
incident_repository: _IncidentRepository | None = None,
auto_repair_service: AutoRepairService | None = None,
verifier: PostExecutionVerifier | None = None,
) -> None:
self._slo_service = slo_service or get_adr100_slo_status_service()
self._incident_repository = incident_repository or IncidentDBRepository()
self._auto_repair_service = auto_repair_service or AutoRepairService()
self._verifier = verifier or get_post_execution_verifier()
async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
"""Return the safe execution plan for a remediation queue item."""
item = await self._find_work_item(work_item_id)
selected_mode = _select_mode(item, mode)
checks = _base_checks(item)
allowed = all(check["passed"] for check in checks)
return {
"schema_version": "adr100_remediation_preview_v1",
"work_item_id": item.get("work_item_id"),
"incident_id": item.get("incident_id"),
"auto_repair_id": item.get("auto_repair_id"),
"mode": selected_mode,
"allowed": allowed,
"safety_level": "read_only",
"writes_incident_state": False,
"writes_auto_repair_result": False,
"checks": checks,
"plan": _plan_for_item(item, selected_mode),
"source": "adr100.verification_coverage.remediation_queue",
}
async def dry_run(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
"""Run a safe, read-only remediation dry-run for one queue item."""
item = await self._find_work_item(work_item_id)
selected_mode = _select_mode(item, mode)
checks = _base_checks(item)
incident = await self._load_incident(item)
checks.append({
"name": "incident_loaded",
"passed": incident is not None,
"detail": item.get("incident_id") or "missing incident_id",
})
if incident is None or not all(check["passed"] for check in checks):
return _dry_run_blocked_payload(item, selected_mode, checks)
if selected_mode == "replay":
return await self._dry_run_replay(item, incident, checks)
return await self._dry_run_reverify(item, incident, checks)
async def _find_work_item(self, work_item_id: str) -> dict[str, Any]:
report = await self._slo_service.fetch_report()
coverage = report.get("verification_coverage") or {}
queue = coverage.get("remediation_queue") or {}
for item in queue.get("items") or []:
if item.get("work_item_id") == work_item_id:
return dict(item)
raise RemediationNotFoundError(work_item_id)
async def _load_incident(self, item: dict[str, Any]) -> Incident | None:
incident_id = str(item.get("incident_id") or "")
if not incident_id:
return None
return await self._incident_repository.get_by_id(incident_id)
async def _dry_run_reverify(
self,
item: dict[str, Any],
incident: Incident,
checks: list[dict[str, Any]],
) -> dict[str, Any]:
post_state = await self._collect_current_state(incident)
action_taken = f"dry_run_reverify:{item.get('playbook_id') or 'unknown'}"
result = _assess_recovery(None, post_state, action_taken)
return _dry_run_result_payload(
item=item,
mode="reverify",
checks=checks,
post_state=post_state,
verification_result_preview=result,
extra={
"promql": _promql_for_incident(incident),
"mcp_route": {
"agent_id": "post_execution_verifier",
"required_scope": "read",
"is_shadow": True,
"flywheel_node": "verify",
},
},
)
async def _dry_run_replay(
self,
item: dict[str, Any],
incident: Incident,
checks: list[dict[str, Any]],
) -> dict[str, Any]:
diagnostic_command = _diagnostic_command_for_incident(incident)
route = self._auto_repair_service.preview_read_only_ssh_mcp_route(
incident,
diagnostic_command,
)
checks.append({
"name": "supported_executor_route",
"passed": route is not None,
"detail": "mcp:ssh_diagnose" if route else "missing host/container route",
})
post_state = await self._collect_current_state(incident)
action_taken = f"dry_run_replay:{item.get('playbook_id') or 'unknown'}"
result = _assess_recovery(None, post_state, action_taken)
return _dry_run_result_payload(
item=item,
mode="replay",
checks=checks,
post_state=post_state,
verification_result_preview=result,
extra={
"diagnostic_command_preview": diagnostic_command,
"mcp_route": route,
"promql": _promql_for_incident(incident),
},
)
async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
try:
return await asyncio.wait_for(
self._verifier._collect_post_state(incident),
timeout=12.0,
)
except asyncio.TimeoutError:
logger.warning(
"adr100_remediation_dry_run_timeout",
incident_id=incident.incident_id,
)
return {}
except Exception as exc:
logger.warning(
"adr100_remediation_dry_run_collect_failed",
incident_id=incident.incident_id,
error=str(exc),
)
return {}
def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]:
if requested in ("reverify", "replay"):
return requested
if item.get("remediation_status") == "ready_for_reverify":
return "reverify"
if item.get("remediation_action") == "reverify_with_promql_template":
return "reverify"
return "replay"
def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]:
status = str(item.get("remediation_status") or "unknown")
action = str(item.get("remediation_action") or "unknown")
return [
{
"name": "queue_item_ready",
"passed": status in _READY_STATUSES,
"detail": status,
},
{
"name": "read_only_guardrail",
"passed": action in {
"replay_with_supported_executor",
"reverify_with_promql_template",
},
"detail": action,
},
{
"name": "no_state_mutation",
"passed": True,
"detail": "dry_run_does_not_update_incident_or_auto_repair_rows",
},
]
def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]:
if mode == "reverify":
return {
"step": "collect_current_state_and_assess",
"agent_id": "post_execution_verifier",
"required_scope": "read",
"writes": [],
}
return {
"step": "validate_supported_executor_route_then_collect_current_state",
"agent_id": "auto_repair_executor",
"required_scope": "read",
"writes": [],
"target_action": item.get("remediation_action"),
}
def _dry_run_blocked_payload(
item: dict[str, Any],
mode: str,
checks: list[dict[str, Any]],
) -> dict[str, Any]:
return {
"schema_version": "adr100_remediation_dry_run_v1",
"work_item_id": item.get("work_item_id"),
"incident_id": item.get("incident_id"),
"auto_repair_id": item.get("auto_repair_id"),
"mode": mode,
"allowed": False,
"executed": False,
"safety_level": "read_only",
"writes_incident_state": False,
"writes_auto_repair_result": False,
"checks": checks,
"verification_result_preview": "blocked",
"post_state_summary": {},
}
def _dry_run_result_payload(
*,
item: dict[str, Any],
mode: str,
checks: list[dict[str, Any]],
post_state: dict[str, Any],
verification_result_preview: str,
extra: dict[str, Any],
) -> dict[str, Any]:
return {
"schema_version": "adr100_remediation_dry_run_v1",
"work_item_id": item.get("work_item_id"),
"incident_id": item.get("incident_id"),
"auto_repair_id": item.get("auto_repair_id"),
"mode": mode,
"allowed": all(check["passed"] for check in checks),
"executed": True,
"safety_level": "read_only",
"writes_incident_state": False,
"writes_auto_repair_result": False,
"checks": checks,
"verification_result_preview": verification_result_preview,
"post_state_summary": _summarize_post_state(post_state),
**extra,
}
def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]:
keys = sorted(post_state.keys())
return {
"tool_count": len(keys),
"tools": keys[:8],
"has_state": bool(post_state),
}
def _diagnostic_command_for_incident(incident: Incident) -> str:
labels = _labels_for_incident(incident)
host = str(labels.get("host") or labels.get("instance") or "{host}")
container = str(labels.get("container_name") or labels.get("container") or "")
if container:
return f"ssh {host} 'uptime; docker stats --no-stream {container}'"
return f"ssh {host} 'uptime; docker stats --no-stream'"
def _promql_for_incident(incident: Incident) -> str:
labels = _labels_for_incident(incident)
alertname = ""
if incident.signals:
signal = incident.signals[0]
alertname = labels.get("alertname") or getattr(signal, "alert_name", "")
return _build_prometheus_query(alertname, labels)
def _labels_for_incident(incident: Incident) -> dict[str, Any]:
if incident.signals:
return incident.signals[0].labels or {}
return {}
_service: Adr100RemediationService | None = None
def get_adr100_remediation_service() -> Adr100RemediationService:
"""Return singleton ADR-100 remediation service."""
global _service
if _service is None:
_service = Adr100RemediationService()
return _service
def set_adr100_remediation_service(service: Adr100RemediationService | None) -> None:
"""Inject ADR-100 remediation service for tests."""
global _service
_service = service

View File

@@ -1003,6 +1003,29 @@ class AutoRepairService:
return _SshMcpRoute(tool_name="ssh_diagnose", params=params)
def preview_read_only_ssh_mcp_route(
self,
incident: Incident,
command: str,
) -> dict[str, Any] | None:
"""Preview whether a legacy SSH diagnostic can use the MCP Gateway.
This is used by remediation dry-runs to prove the supported executor
path without running the original PlayBook step or writing an execution
result.
"""
route = self._route_legacy_ssh_command_to_mcp(incident, command)
if route is None:
return None
return {
"tool_name": route.tool_name,
"params": route.params,
"agent_id": "auto_repair_executor",
"required_scope": "read",
"flywheel_node": "execute",
}
def _resolve_ssh_host_for_incident(self, incident: Incident, command: str) -> str:
"""Resolve ``{host}``, short host labels, and exporter instance ports."""