feat(governance): add remediation dry run entrypoint
This commit is contained in:
@@ -18,8 +18,14 @@ Endpoints:
|
||||
from __future__ import annotations
|
||||
|
||||
import structlog
|
||||
from fastapi import APIRouter, Query
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.services.adr100_remediation_service import (
|
||||
RemediationMode,
|
||||
RemediationNotFoundError,
|
||||
get_adr100_remediation_service,
|
||||
)
|
||||
from src.services.adr100_slo_status_service import get_adr100_slo_status_service
|
||||
from src.services.ai_slo_calculator import AiSloCalculator
|
||||
|
||||
@@ -28,6 +34,20 @@ logger = structlog.get_logger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class RemediationPreviewRequest(BaseModel):
    """ADR-100 remediation preview request."""

    # Identifier of the remediation queue item; required, at least one character.
    work_item_id: str = Field(..., min_length=1)
    # Requested remediation mode; "auto" is used when the client omits it.
    mode: RemediationMode = "auto"
|
||||
|
||||
|
||||
class RemediationDryRunRequest(BaseModel):
    """ADR-100 remediation dry-run request."""

    # Identifier of the remediation queue item; required, at least one character.
    work_item_id: str = Field(..., min_length=1)
    # Requested remediation mode; "auto" is used when the client omits it.
    mode: RemediationMode = "auto"
|
||||
|
||||
|
||||
@router.get("/ai/slo")
|
||||
async def get_ai_slo(
|
||||
force_refresh: bool = Query(False, description="忽略快取,強制重算"),
|
||||
@@ -59,3 +79,42 @@ async def get_ai_slo(
|
||||
data["cache_hit"] = False
|
||||
data["adr100"] = await get_adr100_slo_status_service().fetch_report()
|
||||
return data
|
||||
|
||||
|
||||
@router.get("/ai/slo/remediation/preview")
async def preview_ai_slo_remediation(
    work_item_id: str = Query(..., min_length=1),
    mode: RemediationMode = Query("auto"),
) -> dict:
    """Preview the safe remediation plan for one ADR-100 queue item."""

    try:
        service = get_adr100_remediation_service()
        plan = await service.preview(work_item_id, mode)
    except RemediationNotFoundError as exc:
        # An unknown work item id is reported to the client as HTTP 404.
        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
    return plan
|
||||
|
||||
|
||||
@router.post("/ai/slo/remediation/preview")
async def preview_ai_slo_remediation_post(request: RemediationPreviewRequest) -> dict:
    """POST variant for clients that prefer JSON bodies."""

    try:
        service = get_adr100_remediation_service()
        plan = await service.preview(request.work_item_id, request.mode)
    except RemediationNotFoundError as exc:
        # An unknown work item id is reported to the client as HTTP 404.
        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
    return plan
|
||||
|
||||
|
||||
@router.post("/ai/slo/remediation/dry-run")
async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
    """Run a read-only ADR-100 remediation dry-run."""

    try:
        service = get_adr100_remediation_service()
        outcome = await service.dry_run(request.work_item_id, request.mode)
    except RemediationNotFoundError as exc:
        # An unknown work item id is reported to the client as HTTP 404.
        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
    return outcome
|
||||
|
||||
356
apps/api/src/services/adr100_remediation_service.py
Normal file
356
apps/api/src/services/adr100_remediation_service.py
Normal file
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
ADR-100 Remediation Service
|
||||
===========================
|
||||
Safe operator entrypoints for verification remediation work items.
|
||||
|
||||
T25: remediation queue items are now actionable without mutating incident state:
|
||||
- preview: show the selected guardrail path
|
||||
- dry-run: collect read-only current state and validate supported executor routing
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Literal, Protocol
|
||||
|
||||
import structlog
|
||||
|
||||
from src.models.incident import Incident
|
||||
from src.repositories.incident_repository import IncidentDBRepository
|
||||
from src.services.adr100_slo_status_service import (
|
||||
Adr100SloStatusService,
|
||||
get_adr100_slo_status_service,
|
||||
)
|
||||
from src.services.auto_repair_service import AutoRepairService
|
||||
from src.services.post_execution_verifier import (
|
||||
PostExecutionVerifier,
|
||||
_assess_recovery,
|
||||
_build_prometheus_query,
|
||||
get_post_execution_verifier,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
RemediationMode = Literal["auto", "reverify", "replay"]
|
||||
|
||||
_READY_STATUSES = {"ready_for_replay", "ready_for_reverify"}
|
||||
|
||||
|
||||
class RemediationNotFoundError(LookupError):
    """Raised when the requested work item id is absent from the remediation queue read model."""
|
||||
|
||||
|
||||
class _IncidentRepository(Protocol):
|
||||
async def get_by_id(self, incident_id: str) -> Incident | None:
|
||||
...
|
||||
|
||||
|
||||
class Adr100RemediationService:
    """Preview and dry-run entrypoints for ADR-100 remediation queue items.

    Both public coroutines return plain dict payloads that report
    ``writes_incident_state`` and ``writes_auto_repair_result`` as ``False``.
    """

    def __init__(
        self,
        *,
        slo_service: Adr100SloStatusService | None = None,
        incident_repository: _IncidentRepository | None = None,
        auto_repair_service: AutoRepairService | None = None,
        verifier: PostExecutionVerifier | None = None,
    ) -> None:
        """Store the collaborators, building a default for every falsy argument."""
        self._slo_service = slo_service if slo_service else get_adr100_slo_status_service()
        self._incident_repository = (
            incident_repository if incident_repository else IncidentDBRepository()
        )
        self._auto_repair_service = (
            auto_repair_service if auto_repair_service else AutoRepairService()
        )
        self._verifier = verifier if verifier else get_post_execution_verifier()

    async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
        """Return the safe execution plan for a remediation queue item.

        Raises:
            RemediationNotFoundError: no queue item carries ``work_item_id``.
        """
        work_item = await self._find_work_item(work_item_id)
        resolved_mode = _select_mode(work_item, mode)
        guardrails = _base_checks(work_item)
        every_check_passed = all(check["passed"] for check in guardrails)
        plan = _plan_for_item(work_item, resolved_mode)

        return {
            "schema_version": "adr100_remediation_preview_v1",
            "work_item_id": work_item.get("work_item_id"),
            "incident_id": work_item.get("incident_id"),
            "auto_repair_id": work_item.get("auto_repair_id"),
            "mode": resolved_mode,
            "allowed": every_check_passed,
            "safety_level": "read_only",
            "writes_incident_state": False,
            "writes_auto_repair_result": False,
            "checks": guardrails,
            "plan": plan,
            "source": "adr100.verification_coverage.remediation_queue",
        }

    async def dry_run(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
        """Run a safe, read-only remediation dry-run for one queue item.

        Returns the blocked payload when the incident cannot be loaded or any
        check fails; otherwise delegates to the replay or reverify path.

        Raises:
            RemediationNotFoundError: no queue item carries ``work_item_id``.
        """
        work_item = await self._find_work_item(work_item_id)
        resolved_mode = _select_mode(work_item, mode)
        guardrails = _base_checks(work_item)

        incident = await self._load_incident(work_item)
        incident_check = {
            "name": "incident_loaded",
            "passed": incident is not None,
            "detail": work_item.get("incident_id") or "missing incident_id",
        }
        guardrails.append(incident_check)

        every_check_passed = all(check["passed"] for check in guardrails)
        if incident is None or not every_check_passed:
            return _dry_run_blocked_payload(work_item, resolved_mode, guardrails)

        runner = self._dry_run_replay if resolved_mode == "replay" else self._dry_run_reverify
        return await runner(work_item, incident, guardrails)

    async def _find_work_item(self, work_item_id: str) -> dict[str, Any]:
        """Return a shallow copy of the matching queue item from the SLO report."""
        report = await self._slo_service.fetch_report()
        coverage = report.get("verification_coverage") or {}
        queue = coverage.get("remediation_queue") or {}
        candidates = queue.get("items") or []

        match = next(
            (entry for entry in candidates if entry.get("work_item_id") == work_item_id),
            None,
        )
        if match is None:
            raise RemediationNotFoundError(work_item_id)
        return dict(match)

    async def _load_incident(self, item: dict[str, Any]) -> Incident | None:
        """Look up the item's incident; ``None`` when the item has no incident id."""
        incident_id = str(item.get("incident_id") or "")
        if incident_id:
            return await self._incident_repository.get_by_id(incident_id)
        return None

    async def _dry_run_reverify(
        self,
        item: dict[str, Any],
        incident: Incident,
        checks: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Collect current state and assess it for the reverify path."""
        post_state = await self._collect_current_state(incident)
        playbook_label = item.get("playbook_id") or "unknown"
        result = _assess_recovery(None, post_state, f"dry_run_reverify:{playbook_label}")

        verifier_route = {
            "agent_id": "post_execution_verifier",
            "required_scope": "read",
            "is_shadow": True,
            "flywheel_node": "verify",
        }
        return _dry_run_result_payload(
            item=item,
            mode="reverify",
            checks=checks,
            post_state=post_state,
            verification_result_preview=result,
            extra={
                "promql": _promql_for_incident(incident),
                "mcp_route": verifier_route,
            },
        )

    async def _dry_run_replay(
        self,
        item: dict[str, Any],
        incident: Incident,
        checks: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Validate the executor route, then collect and assess current state.

        Appends a ``supported_executor_route`` entry to ``checks`` in place.
        """
        diagnostic_command = _diagnostic_command_for_incident(incident)
        route = self._auto_repair_service.preview_read_only_ssh_mcp_route(
            incident,
            diagnostic_command,
        )
        route_detail = "mcp:ssh_diagnose" if route else "missing host/container route"
        checks.append(
            {
                "name": "supported_executor_route",
                "passed": route is not None,
                "detail": route_detail,
            }
        )

        post_state = await self._collect_current_state(incident)
        playbook_label = item.get("playbook_id") or "unknown"
        result = _assess_recovery(None, post_state, f"dry_run_replay:{playbook_label}")

        return _dry_run_result_payload(
            item=item,
            mode="replay",
            checks=checks,
            post_state=post_state,
            verification_result_preview=result,
            extra={
                "diagnostic_command_preview": diagnostic_command,
                "mcp_route": route,
                "promql": _promql_for_incident(incident),
            },
        )

    async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
        """Collect post-state through the verifier with a 12 second cap.

        Best effort: a timeout or any other exception is logged as a warning
        and an empty dict is returned instead.
        """
        try:
            state = await asyncio.wait_for(
                self._verifier._collect_post_state(incident),
                timeout=12.0,
            )
        except asyncio.TimeoutError:
            logger.warning(
                "adr100_remediation_dry_run_timeout",
                incident_id=incident.incident_id,
            )
        except Exception as exc:
            logger.warning(
                "adr100_remediation_dry_run_collect_failed",
                incident_id=incident.incident_id,
                error=str(exc),
            )
        else:
            return state
        return {}
|
||||
|
||||
|
||||
def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]:
|
||||
if requested in ("reverify", "replay"):
|
||||
return requested
|
||||
if item.get("remediation_status") == "ready_for_reverify":
|
||||
return "reverify"
|
||||
if item.get("remediation_action") == "reverify_with_promql_template":
|
||||
return "reverify"
|
||||
return "replay"
|
||||
|
||||
|
||||
def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]:
    """Build the three baseline checks evaluated for every queue item.

    Each check is a dict with ``name``, ``passed`` and ``detail`` keys. A
    missing status or action is reported as ``"unknown"``.
    """
    status = str(item.get("remediation_status") or "unknown")
    action = str(item.get("remediation_action") or "unknown")
    supported_actions = {
        "replay_with_supported_executor",
        "reverify_with_promql_template",
    }

    def _check(name: str, passed: bool, detail: str) -> dict[str, Any]:
        return {"name": name, "passed": passed, "detail": detail}

    return [
        _check("queue_item_ready", status in _READY_STATUSES, status),
        _check("read_only_guardrail", action in supported_actions, action),
        # Always passes; records that the dry-run path performs no row updates.
        _check(
            "no_state_mutation",
            True,
            "dry_run_does_not_update_incident_or_auto_repair_rows",
        ),
    ]
|
||||
|
||||
|
||||
def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]:
|
||||
if mode == "reverify":
|
||||
return {
|
||||
"step": "collect_current_state_and_assess",
|
||||
"agent_id": "post_execution_verifier",
|
||||
"required_scope": "read",
|
||||
"writes": [],
|
||||
}
|
||||
return {
|
||||
"step": "validate_supported_executor_route_then_collect_current_state",
|
||||
"agent_id": "auto_repair_executor",
|
||||
"required_scope": "read",
|
||||
"writes": [],
|
||||
"target_action": item.get("remediation_action"),
|
||||
}
|
||||
|
||||
|
||||
def _dry_run_blocked_payload(
|
||||
item: dict[str, Any],
|
||||
mode: str,
|
||||
checks: list[dict[str, Any]],
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"schema_version": "adr100_remediation_dry_run_v1",
|
||||
"work_item_id": item.get("work_item_id"),
|
||||
"incident_id": item.get("incident_id"),
|
||||
"auto_repair_id": item.get("auto_repair_id"),
|
||||
"mode": mode,
|
||||
"allowed": False,
|
||||
"executed": False,
|
||||
"safety_level": "read_only",
|
||||
"writes_incident_state": False,
|
||||
"writes_auto_repair_result": False,
|
||||
"checks": checks,
|
||||
"verification_result_preview": "blocked",
|
||||
"post_state_summary": {},
|
||||
}
|
||||
|
||||
|
||||
def _dry_run_result_payload(
    *,
    item: dict[str, Any],
    mode: str,
    checks: list[dict[str, Any]],
    post_state: dict[str, Any],
    verification_result_preview: str,
    extra: dict[str, Any],
) -> dict[str, Any]:
    """Build the dry-run response for a run that was executed.

    ``allowed`` is true only when every check passed. ``post_state`` is
    reduced to a summary, and the entries of ``extra`` are merged in last, so
    they override any core key of the same name.
    """
    every_check_passed = all(check["passed"] for check in checks)
    payload: dict[str, Any] = {
        "schema_version": "adr100_remediation_dry_run_v1",
        "work_item_id": item.get("work_item_id"),
        "incident_id": item.get("incident_id"),
        "auto_repair_id": item.get("auto_repair_id"),
        "mode": mode,
        "allowed": every_check_passed,
        "executed": True,
        "safety_level": "read_only",
        "writes_incident_state": False,
        "writes_auto_repair_result": False,
        "checks": checks,
        "verification_result_preview": verification_result_preview,
        "post_state_summary": _summarize_post_state(post_state),
    }
    payload.update(extra)
    return payload
|
||||
|
||||
|
||||
def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]:
|
||||
keys = sorted(post_state.keys())
|
||||
return {
|
||||
"tool_count": len(keys),
|
||||
"tools": keys[:8],
|
||||
"has_state": bool(post_state),
|
||||
}
|
||||
|
||||
|
||||
def _diagnostic_command_for_incident(incident: Incident) -> str:
    """Compose the SSH diagnostic command string previewed for a replay dry-run.

    The host comes from the ``host`` label, then ``instance``, falling back to
    the literal ``{host}`` placeholder. The container comes from
    ``container_name``, then ``container``; when present it is appended to the
    ``docker stats`` call.
    """
    labels = _labels_for_incident(incident)
    host = str(labels.get("host") or labels.get("instance") or "{host}")
    container = str(labels.get("container_name") or labels.get("container") or "")

    if not container:
        return f"ssh {host} 'uptime; docker stats --no-stream'"
    return f"ssh {host} 'uptime; docker stats --no-stream {container}'"
|
||||
|
||||
|
||||
def _promql_for_incident(incident: Incident) -> str:
    """Build the PromQL string for an incident from its first signal.

    The alert name is the ``alertname`` label when set, otherwise the first
    signal's ``alert_name`` attribute; it is empty when there are no signals.
    """
    labels = _labels_for_incident(incident)
    if not incident.signals:
        return _build_prometheus_query("", labels)

    first_signal = incident.signals[0]
    alertname = labels.get("alertname") or getattr(first_signal, "alert_name", "")
    return _build_prometheus_query(alertname, labels)
|
||||
|
||||
|
||||
def _labels_for_incident(incident: Incident) -> dict[str, Any]:
|
||||
if incident.signals:
|
||||
return incident.signals[0].labels or {}
|
||||
return {}
|
||||
|
||||
|
||||
_service: Adr100RemediationService | None = None
|
||||
|
||||
|
||||
def get_adr100_remediation_service() -> Adr100RemediationService:
    """Return the module-level remediation service, creating it on first use."""
    global _service
    if _service is not None:
        return _service
    _service = Adr100RemediationService()
    return _service
|
||||
|
||||
|
||||
def set_adr100_remediation_service(service: Adr100RemediationService | None) -> None:
    """Replace the module-level remediation service (test injection hook).

    Passing ``None`` clears the stored instance.
    """
    global _service
    _service = service
|
||||
@@ -1003,6 +1003,29 @@ class AutoRepairService:
|
||||
|
||||
return _SshMcpRoute(tool_name="ssh_diagnose", params=params)
|
||||
|
||||
def preview_read_only_ssh_mcp_route(
    self,
    incident: Incident,
    command: str,
) -> dict[str, Any] | None:
    """Preview whether a legacy SSH diagnostic can use the MCP Gateway.

    Used by remediation dry-runs to prove the supported executor path
    without running the original PlayBook step or writing an execution
    result.

    Returns:
        A dict describing the route (tool name, params and the fixed
        ``auto_repair_executor`` / ``read`` / ``execute`` metadata), or
        ``None`` when the command cannot be routed.
    """
    mcp_route = self._route_legacy_ssh_command_to_mcp(incident, command)
    if mcp_route is not None:
        return {
            "tool_name": mcp_route.tool_name,
            "params": mcp_route.params,
            "agent_id": "auto_repair_executor",
            "required_scope": "read",
            "flywheel_node": "execute",
        }
    return None
|
||||
|
||||
def _resolve_ssh_host_for_incident(self, incident: Incident, command: str) -> str:
|
||||
"""Resolve ``{host}``, short host labels, and exporter instance ports."""
|
||||
|
||||
|
||||
227
apps/api/tests/test_adr100_remediation_service.py
Normal file
227
apps/api/tests/test_adr100_remediation_service.py
Normal file
@@ -0,0 +1,227 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from fastapi import FastAPI
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from src.api.v1.ai_slo import router
|
||||
from src.models.incident import Incident, IncidentStatus, Severity, Signal
|
||||
from src.models.playbook import Playbook
|
||||
from src.services.adr100_remediation_service import (
|
||||
Adr100RemediationService,
|
||||
RemediationNotFoundError,
|
||||
)
|
||||
from src.services.auto_repair_service import AutoRepairService
|
||||
|
||||
|
||||
class _FakeSloService:
|
||||
def __init__(self, items: list[dict[str, Any]]) -> None:
|
||||
self.items = items
|
||||
|
||||
async def fetch_report(self) -> dict[str, Any]:
|
||||
return {
|
||||
"verification_coverage": {
|
||||
"remediation_queue": {
|
||||
"items": self.items,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class _FakeIncidentRepository:
|
||||
def __init__(self, incident: Incident | None) -> None:
|
||||
self.incident = incident
|
||||
|
||||
async def get_by_id(self, incident_id: str) -> Incident | None:
|
||||
if self.incident and self.incident.incident_id == incident_id:
|
||||
return self.incident
|
||||
return None
|
||||
|
||||
|
||||
class _FakeVerifier:
|
||||
def __init__(self, state: dict[str, Any]) -> None:
|
||||
self.state = state
|
||||
self.calls = 0
|
||||
|
||||
async def _collect_post_state(self, incident: Incident) -> dict[str, Any]:
|
||||
self.calls += 1
|
||||
return self.state
|
||||
|
||||
|
||||
class _NoopPlaybookService:
|
||||
async def get_recommendations(self, *_args, **_kwargs): # noqa: ANN002, ANN003
|
||||
return []
|
||||
|
||||
async def get_by_id(self, _playbook_id: str) -> Playbook | None:
|
||||
return None
|
||||
|
||||
async def record_execution(self, _playbook_id: str, _success: bool) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
async def _no_cooldown(*_args, **_kwargs) -> tuple[bool, str]: # noqa: ANN002, ANN003
|
||||
return True, "test"
|
||||
|
||||
|
||||
def _incident() -> Incident:
    """Build the P2 infrastructure incident shared by the tests in this module.

    It carries one Prometheus signal for ``DockerContainerMemoryLimitPressure``
    labelled with host ``110`` and container ``momo-scheduler``.
    """
    fired_at = datetime.now(timezone.utc)
    signal_labels = {
        "alertname": "DockerContainerMemoryLimitPressure",
        "host": "110",
        "container_name": "momo-scheduler",
    }
    memory_pressure = Signal(
        alert_name="DockerContainerMemoryLimitPressure",
        severity=Severity.P2,
        source="prometheus",
        fired_at=fired_at,
        labels=signal_labels,
    )
    return Incident(
        incident_id="INC-20260514-TEST01",
        status=IncidentStatus.INVESTIGATING,
        severity=Severity.P2,
        affected_services=["momo-scheduler"],
        alert_category="infrastructure",
        signals=[memory_pressure],
    )
|
||||
|
||||
|
||||
def _queue_item(**overrides: Any) -> dict[str, Any]:
|
||||
item = {
|
||||
"work_item_id": "verification:INC-20260514-TEST01:are-1",
|
||||
"incident_id": "INC-20260514-TEST01",
|
||||
"auto_repair_id": "are-1",
|
||||
"alertname": "DockerContainerMemoryLimitPressure",
|
||||
"playbook_id": "PB-1",
|
||||
"verification_result": "degraded",
|
||||
"remediation_status": "ready_for_replay",
|
||||
"remediation_action": "replay_with_supported_executor",
|
||||
"remediation_owner": "auto_repair_executor",
|
||||
}
|
||||
item.update(overrides)
|
||||
return item
|
||||
|
||||
|
||||
# Sentinel default so that an explicit ``incident=None`` can mean "no incident".
_DEFAULT_INCIDENT: Any = object()


def _service(
    *,
    item: dict[str, Any],
    incident: Incident | None = _DEFAULT_INCIDENT,
    state: dict[str, Any] | None = None,
) -> Adr100RemediationService:
    """Build a remediation service wired entirely to in-memory fakes.

    Args:
        item: The single queue item served by the fake SLO service.
        incident: Incident held by the fake repository. When omitted, the
            shared ``_incident()`` fixture is used; an explicit ``None``
            yields a repository that finds nothing.
        state: Post-state returned by the fake verifier. When omitted, a
            single running-pod entry is used; an explicit empty dict is
            passed through unchanged.
    """
    # Identity checks (not truthiness) so None / {} are honoured when given.
    if incident is _DEFAULT_INCIDENT:
        incident = _incident()
    if state is None:
        state = {"k8s_get_pod_status": {"phase": "Running"}}

    return Adr100RemediationService(
        slo_service=_FakeSloService([item]),
        incident_repository=_FakeIncidentRepository(incident),
        auto_repair_service=AutoRepairService(
            playbook_service=_NoopPlaybookService(),
            cooldown_checker=_no_cooldown,
        ),
        verifier=_FakeVerifier(state),
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_preview_marks_replay_work_item_read_only():
    """Previewing a ready-for-replay item yields an allowed, read-only replay plan."""
    preview = await _service(item=_queue_item()).preview(
        "verification:INC-20260514-TEST01:are-1"
    )

    assert preview["allowed"] is True
    assert preview["mode"] == "replay"
    assert preview["safety_level"] == "read_only"
    assert preview["writes_incident_state"] is False

    plan = preview["plan"]
    assert plan["agent_id"] == "auto_repair_executor"
    assert plan["writes"] == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dry_run_reverify_collects_state_without_writes():
    """A ready-for-reverify item runs the verifier path and reports no writes."""
    reverify_item = _queue_item(
        remediation_status="ready_for_reverify",
        remediation_action="reverify_with_promql_template",
        remediation_owner="post_execution_verifier",
    )
    running_pod = {"k8s_get_pod_status": {"phase": "Running"}}
    svc = _service(item=reverify_item, state=running_pod)

    outcome = await svc.dry_run("verification:INC-20260514-TEST01:are-1")

    assert outcome["allowed"] is True
    assert outcome["executed"] is True
    assert outcome["mode"] == "reverify"
    assert outcome["verification_result_preview"] == "success"
    assert outcome["writes_auto_repair_result"] is False
    assert outcome["post_state_summary"]["tool_count"] == 1

    route = outcome["mcp_route"]
    assert route["agent_id"] == "post_execution_verifier"
    assert route["required_scope"] == "read"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dry_run_replay_validates_supported_executor_route():
    """A ready-for-replay item resolves to the read-scoped ssh_diagnose route."""
    outcome = await _service(item=_queue_item()).dry_run(
        "verification:INC-20260514-TEST01:are-1"
    )

    assert outcome["allowed"] is True
    assert outcome["mode"] == "replay"

    route = outcome["mcp_route"]
    assert route["agent_id"] == "auto_repair_executor"
    assert route["tool_name"] == "ssh_diagnose"
    assert route["required_scope"] == "read"
    assert route["params"]["host"] == "192.168.0.110"
    assert route["params"]["container_name"] == "momo-scheduler"

    assert outcome["diagnostic_command_preview"].startswith("ssh 110")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dry_run_blocks_when_incident_missing():
    """The dry-run is blocked when the repository cannot find the incident."""
    svc = _service(item=_queue_item(), incident=None)
    # Force an empty repository regardless of how the helper treats ``None``.
    svc._incident_repository = _FakeIncidentRepository(None)

    outcome = await svc.dry_run("verification:INC-20260514-TEST01:are-1")

    assert outcome["allowed"] is False
    assert outcome["executed"] is False
    assert outcome["verification_result_preview"] == "blocked"

    failed_incident_checks = [
        check
        for check in outcome["checks"]
        if check["name"] == "incident_loaded" and not check["passed"]
    ]
    assert any(failed_incident_checks)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_missing_work_item_raises_not_found():
    """Previewing an id that is not in the queue raises RemediationNotFoundError."""
    svc = _service(item=_queue_item())
    unknown_id = "verification:missing"

    with pytest.raises(RemediationNotFoundError):
        await svc.preview(unknown_id)
|
||||
|
||||
|
||||
def test_ai_slo_remediation_endpoints(monkeypatch):
    """GET preview and POST dry-run forward their arguments to the service."""

    class _FakeService:
        """Echo the received arguments so the routing can be asserted."""

        async def preview(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]:
            return {"work_item_id": work_item_id, "mode": mode, "allowed": True}

        async def dry_run(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]:
            return {"work_item_id": work_item_id, "mode": mode, "executed": True}

    # Calling the class yields a fresh fake each time the router asks for the service.
    monkeypatch.setattr(
        "src.api.v1.ai_slo.get_adr100_remediation_service",
        _FakeService,
    )

    app = FastAPI()
    app.include_router(router, prefix="/api/v1")
    client = TestClient(app)
    work_item_id = "verification:INC:are-1"

    preview_response = client.get(
        "/api/v1/ai/slo/remediation/preview",
        params={"work_item_id": work_item_id, "mode": "reverify"},
    )
    dry_run_response = client.post(
        "/api/v1/ai/slo/remediation/dry-run",
        json={"work_item_id": work_item_id, "mode": "replay"},
    )

    assert preview_response.status_code == 200
    assert preview_response.json()["mode"] == "reverify"
    assert dry_run_response.status_code == 200
    assert dry_run_response.json()["executed"] is True
|
||||
@@ -1414,6 +1414,11 @@
|
||||
"recentFindings": "Recent Non-success Verification",
|
||||
"remediationQueue": "Remediation Work Queue",
|
||||
"queueSummary": "Total {total}; AI-ready {ready}; human {human}",
|
||||
"dryRunButton": "Dry run",
|
||||
"dryRunLoading": "Running",
|
||||
"dryRunResult": "{mode}; preview {result}; tools {tools}",
|
||||
"dryRunBlocked": "Dry run blocked",
|
||||
"dryRunError": "Dry run failed",
|
||||
"state": {
|
||||
"ok": "OK",
|
||||
"warning": "Needs tracking",
|
||||
|
||||
@@ -1415,6 +1415,11 @@
|
||||
"recentFindings": "近期非成功驗證",
|
||||
"remediationQueue": "補救工作佇列",
|
||||
"queueSummary": "總數 {total};AI 可接手 {ready};人工 {human}",
|
||||
"dryRunButton": "試跑",
|
||||
"dryRunLoading": "試跑中",
|
||||
"dryRunResult": "{mode};預覽 {result};工具 {tools}",
|
||||
"dryRunBlocked": "試跑未放行",
|
||||
"dryRunError": "試跑失敗",
|
||||
"state": {
|
||||
"ok": "正常",
|
||||
"warning": "需追蹤",
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
import { useEffect, useState } from 'react'
|
||||
import { useTranslations } from 'next-intl'
|
||||
import { ShieldCheck, AlertTriangle } from 'lucide-react'
|
||||
import { ShieldCheck, AlertTriangle, PlayCircle, SearchCheck } from 'lucide-react'
|
||||
import { SloKpiCard, type SloMetric } from '@/components/governance/slo-kpi-card'
|
||||
import { SloViolationChart, type ViolationDataPoint } from '@/components/governance/slo-violation-chart'
|
||||
import { GlassCard } from '@/components/ui/glass-card'
|
||||
@@ -144,6 +144,28 @@ interface SummaryApiResponse {
|
||||
days?: number
|
||||
}
|
||||
|
||||
/** Subset of the remediation dry-run API payload that this tab reads. */
interface RemediationDryRunResponse {
  /** Mode reported back by the API. */
  mode?: string
  /** False when the dry-run was blocked by a failed check. */
  allowed?: boolean
  executed?: boolean
  verification_result_preview?: string
  /** Compact summary of the state collected during the dry-run. */
  post_state_summary?: {
    tool_count?: number
    tools?: string[]
    has_state?: boolean
  }
  /** Route metadata; may be null. */
  mcp_route?: {
    agent_id?: string
    tool_name?: string
    required_scope?: string
  } | null
}
|
||||
|
||||
/** Per-work-item UI state of the dry-run action. */
interface RemediationActionState {
  status: 'loading' | 'done' | 'error'
  /** Dry-run response; set by the caller together with status 'done'. */
  data?: RemediationDryRunResponse
}
|
||||
|
||||
// =============================================================================
|
||||
// Helpers
|
||||
// =============================================================================
|
||||
@@ -220,6 +242,16 @@ function compactLabel(value?: string | null, fallback = '--'): string {
|
||||
return value.length > 54 ? `${value.slice(0, 54)}...` : value
|
||||
}
|
||||
|
||||
/**
 * POST a dry-run request in "auto" mode for one remediation work item.
 *
 * Resolves with the parsed JSON body; throws `dry_run_failed:<status>` when
 * the response status is not OK.
 */
async function requestRemediationDryRun(workItemId: string): Promise<RemediationDryRunResponse> {
  const endpoint = `${API_BASE}/api/v1/ai/slo/remediation/dry-run`
  const payload = { work_item_id: workItemId, mode: 'auto' }

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify(payload),
  })

  if (response.ok) return response.json()
  throw new Error(`dry_run_failed:${response.status}`)
}
|
||||
|
||||
function buildMetrics(api: SloApiResponse): SloMetric[] {
|
||||
const adr100Metrics = api.adr100?.metrics
|
||||
if (adr100Metrics?.length) {
|
||||
@@ -276,6 +308,7 @@ function buildMetrics(api: SloApiResponse): SloMetric[] {
|
||||
|
||||
function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) {
|
||||
const t = useTranslations('governance.slo.coverage')
|
||||
const [actionState, setActionState] = useState<Record<string, RemediationActionState>>({})
|
||||
const color = coverageTone(coverage?.status)
|
||||
const rows = [
|
||||
{ label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') },
|
||||
@@ -287,6 +320,25 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
const recentFindings = coverage?.recent_non_success ?? []
|
||||
const remediationQueue = coverage?.remediation_queue
|
||||
|
||||
// Run the dry-run for one work item and track its loading / done / error state.
const handleDryRun = async (workItemId: string) => {
  // Update only this work item's entry, leaving the other rows as they are.
  const setItemState = (next: RemediationActionState) => {
    setActionState(prev => ({ ...prev, [workItemId]: next }))
  }

  setItemState({ status: 'loading' })
  try {
    const data = await requestRemediationDryRun(workItemId)
    setItemState({ status: 'done', data })
  } catch {
    setItemState({ status: 'error' })
  }
}
|
||||
|
||||
return (
|
||||
<GlassCard variant="subtle" padding="md">
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 12 }}>
|
||||
@@ -379,7 +431,7 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
{(remediationQueue.items ?? []).slice(0, 4).map(item => (
|
||||
<div key={item.work_item_id} style={{
|
||||
display: 'grid',
|
||||
gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(160px, 1fr)',
|
||||
gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(150px, 0.9fr) minmax(150px, 0.8fr)',
|
||||
gap: 10,
|
||||
alignItems: 'center',
|
||||
minWidth: 0,
|
||||
@@ -410,6 +462,62 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
{compactLabel(item.remediation_reason)}
|
||||
</div>
|
||||
</div>
|
||||
<div style={{ minWidth: 0, display: 'flex', flexDirection: 'column', gap: 5 }}>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => { void handleDryRun(item.work_item_id) }}
|
||||
disabled={actionState[item.work_item_id]?.status === 'loading'}
|
||||
title={t('dryRunButton')}
|
||||
style={{
|
||||
width: 'fit-content',
|
||||
minHeight: 28,
|
||||
display: 'inline-flex',
|
||||
alignItems: 'center',
|
||||
gap: 6,
|
||||
padding: '5px 9px',
|
||||
borderRadius: 6,
|
||||
border: '0.5px solid rgba(20,20,19,0.14)',
|
||||
background: actionState[item.work_item_id]?.status === 'loading'
|
||||
? 'rgba(135,134,127,0.08)'
|
||||
: 'rgba(34,197,94,0.08)',
|
||||
color: '#141413',
|
||||
fontFamily: "'DM Mono', monospace",
|
||||
fontSize: 10,
|
||||
cursor: actionState[item.work_item_id]?.status === 'loading' ? 'wait' : 'pointer',
|
||||
}}
|
||||
>
|
||||
<PlayCircle size={13} style={{ color: '#22C55E', flexShrink: 0 }} />
|
||||
<span>{actionState[item.work_item_id]?.status === 'loading' ? t('dryRunLoading') : t('dryRunButton')}</span>
|
||||
</button>
|
||||
{actionState[item.work_item_id]?.status === 'done' && (
|
||||
<div style={{
|
||||
display: 'flex',
|
||||
alignItems: 'flex-start',
|
||||
gap: 5,
|
||||
fontFamily: "'DM Mono', monospace",
|
||||
fontSize: 9,
|
||||
color: actionState[item.work_item_id].data?.allowed === false ? '#7c5a10' : '#166534',
|
||||
lineHeight: 1.4,
|
||||
overflowWrap: 'anywhere',
|
||||
}}>
|
||||
<SearchCheck size={12} style={{ flexShrink: 0, marginTop: 1 }} />
|
||||
<span>
|
||||
{actionState[item.work_item_id].data?.allowed === false
|
||||
? t('dryRunBlocked')
|
||||
: t('dryRunResult', {
|
||||
mode: actionState[item.work_item_id].data?.mode ?? '--',
|
||||
result: actionState[item.work_item_id].data?.verification_result_preview ?? '--',
|
||||
tools: actionState[item.work_item_id].data?.post_state_summary?.tool_count ?? 0,
|
||||
})}
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
{actionState[item.work_item_id]?.status === 'error' && (
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#FF3300', lineHeight: 1.4 }}>
|
||||
{t('dryRunError')}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
||||
@@ -1,3 +1,37 @@
|
||||
## 2026-05-14 | T25 補救佇列新增安全試跑入口,replay/reverify 可先讀證據不改狀態
|
||||
|
||||
**背景**:T24 已把 non-success verifier rows 轉成 `remediation_queue`,但 Operator 仍只能看見「應該 replay / reverify」,無法從前端或 API 直接觸發一個安全、可觀測、低風險的試跑步驟。這會讓「AI 可接手」停在文字標籤,還沒有形成可操作入口。
|
||||
|
||||
**修正**:
|
||||
- 新增 `Adr100RemediationService`,從 ADR-100 `verification_coverage.remediation_queue` 找 work item,提供 read-only `preview` 與 `dry_run`。
|
||||
- 新增 API:
|
||||
- `GET /api/v1/ai/slo/remediation/preview?work_item_id=...`
|
||||
- `POST /api/v1/ai/slo/remediation/preview`
|
||||
- `POST /api/v1/ai/slo/remediation/dry-run`
|
||||
- `dry_run` 不會更新 incident 狀態、不會新增 auto-repair result、不會做真正修復;它只做 queue readiness / read-only guardrail / incident loaded / supported executor route 等檢查,並用 verifier 收集當前狀態產生 `verification_result_preview`。
|
||||
- `ready_for_reverify` 走 `post_execution_verifier` read-only current-state collection,回傳 PromQL 與 MCP route metadata。
|
||||
- `ready_for_replay` 先驗證 legacy SSH diagnostic 是否可轉成 `auto_repair_executor -> mcp:ssh_diagnose -> required_scope=read`,再收集 current-state preview。
|
||||
- `AutoRepairService` 新增 `preview_read_only_ssh_mcp_route()`,讓 remediation dry-run 能驗證 supported executor path,而不碰私有修復執行流程。
|
||||
- `/governance` SLO tab 的補救工作佇列每筆新增「試跑」按鈕,呼叫 dry-run API 後回顯 mode、preview result、工具數;文案補齊 `zh-TW` / `en` i18n,使用 lucide icon,不用 emoji。
|
||||
|
||||
**本地驗證**:
|
||||
- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/auto_repair_service.py`:pass。
|
||||
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_auto_repair_service.py -q`:33 passed。
|
||||
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q`:59 passed。
|
||||
- `ruff check --select F,E9 apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/auto_repair_service.py apps/api/tests/test_adr100_remediation_service.py`:pass。
|
||||
- i18n JSON parse / `git diff --check`:pass。
|
||||
- `pnpm --filter @awoooi/web typecheck`:pass。
|
||||
- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`:pass。
|
||||
- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`:pass。
|
||||
|
||||
**推版與 production 驗證**:
|
||||
- 待 T25 commit 推 Gitea main 後驗證。
|
||||
|
||||
**目前整體進度**:
|
||||
- Alertmanager 低風險自動修復主線:約 98%。
|
||||
- 完整 AI 自動化管理產品化:約 90%。
|
||||
- T25 把「補救工作」從可視化清單推到安全試跑入口。下一段應把 dry-run 結果寫回可稽核 timeline / work item history,並把真正可 auto-closure 的條件與需要建 Ticket / 人工介入的條件分開。
|
||||
|
||||
## 2026-05-14 | T24 非成功驗證補救工作佇列,讓舊 degraded 變成可追蹤工作項
|
||||
|
||||
**背景**:T22/T23 已找出近 24h non-success verifier 的根因並修掉 executor / PromQL template 斷點,但 `/api/v1/ai/slo` 仍只把 historical degraded rows 顯示為 warning。Operator 仍無法直接判斷每筆舊 degraded 要 replay、reverify、建 Ticket,還是人工檢查。
|
||||
|
||||
Reference in New Issue
Block a user