diff --git a/apps/api/src/api/v1/ai_slo.py b/apps/api/src/api/v1/ai_slo.py index 5ddf0e1b..b8154fda 100644 --- a/apps/api/src/api/v1/ai_slo.py +++ b/apps/api/src/api/v1/ai_slo.py @@ -18,8 +18,14 @@ Endpoints: from __future__ import annotations import structlog -from fastapi import APIRouter, Query +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel, Field +from src.services.adr100_remediation_service import ( + RemediationMode, + RemediationNotFoundError, + get_adr100_remediation_service, +) from src.services.adr100_slo_status_service import get_adr100_slo_status_service from src.services.ai_slo_calculator import AiSloCalculator @@ -28,6 +34,20 @@ logger = structlog.get_logger(__name__) router = APIRouter() +class RemediationPreviewRequest(BaseModel): + """ADR-100 remediation preview request.""" + + work_item_id: str = Field(min_length=1) + mode: RemediationMode = "auto" + + +class RemediationDryRunRequest(BaseModel): + """ADR-100 remediation dry-run request.""" + + work_item_id: str = Field(min_length=1) + mode: RemediationMode = "auto" + + @router.get("/ai/slo") async def get_ai_slo( force_refresh: bool = Query(False, description="忽略快取,強制重算"), @@ -59,3 +79,42 @@ async def get_ai_slo( data["cache_hit"] = False data["adr100"] = await get_adr100_slo_status_service().fetch_report() return data + + +@router.get("/ai/slo/remediation/preview") +async def preview_ai_slo_remediation( + work_item_id: str = Query(..., min_length=1), + mode: RemediationMode = Query("auto"), +) -> dict: + """Preview the safe remediation plan for one ADR-100 queue item.""" + + try: + return await get_adr100_remediation_service().preview(work_item_id, mode) + except RemediationNotFoundError as exc: + raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc + + +@router.post("/ai/slo/remediation/preview") +async def preview_ai_slo_remediation_post(request: RemediationPreviewRequest) -> dict: + """POST variant for clients that prefer JSON bodies.""" + + try: + return await get_adr100_remediation_service().preview( + request.work_item_id, + request.mode, + ) + except RemediationNotFoundError as exc: + raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc + + +@router.post("/ai/slo/remediation/dry-run") +async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict: + """Run a read-only ADR-100 remediation dry-run.""" + + try: + return await get_adr100_remediation_service().dry_run( + request.work_item_id, + request.mode, + ) + except RemediationNotFoundError as exc: + raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc diff --git a/apps/api/src/services/adr100_remediation_service.py b/apps/api/src/services/adr100_remediation_service.py new file mode 100644 index 00000000..1dc0e97e --- /dev/null +++ b/apps/api/src/services/adr100_remediation_service.py @@ -0,0 +1,356 @@ +""" +ADR-100 Remediation Service +=========================== +Safe operator entrypoints for verification remediation work items. + +T25: remediation queue items are now actionable without mutating incident state: +- preview: show the selected guardrail path +- dry-run: collect read-only current state and validate supported executor routing +""" + +from __future__ import annotations + +import asyncio +from typing import Any, Literal, Protocol + +import structlog + +from src.models.incident import Incident +from src.repositories.incident_repository import IncidentDBRepository +from src.services.adr100_slo_status_service import ( + Adr100SloStatusService, + get_adr100_slo_status_service, +) +from src.services.auto_repair_service import AutoRepairService +from src.services.post_execution_verifier import ( + PostExecutionVerifier, + _assess_recovery, + _build_prometheus_query, + get_post_execution_verifier, +) + +logger = structlog.get_logger(__name__) + +RemediationMode = Literal["auto", "reverify", "replay"] + +_READY_STATUSES = {"ready_for_replay", "ready_for_reverify"} + + +class RemediationNotFoundError(LookupError): + """Requested ADR-100 remediation work item is not in the current read model.""" + + +class _IncidentRepository(Protocol): + async def get_by_id(self, incident_id: str) -> Incident | None: + ... + + +class Adr100RemediationService: + """Read-only remediation preview and dry-run service.""" + + def __init__( + self, + *, + slo_service: Adr100SloStatusService | None = None, + incident_repository: _IncidentRepository | None = None, + auto_repair_service: AutoRepairService | None = None, + verifier: PostExecutionVerifier | None = None, + ) -> None: + self._slo_service = slo_service or get_adr100_slo_status_service() + self._incident_repository = incident_repository or IncidentDBRepository() + self._auto_repair_service = auto_repair_service or AutoRepairService() + self._verifier = verifier or get_post_execution_verifier() + + async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]: + """Return the safe execution plan for a remediation queue item.""" + + item = await self._find_work_item(work_item_id) + selected_mode = _select_mode(item, mode) + checks = _base_checks(item) + allowed = all(check["passed"] for check in checks) + + return { + "schema_version": "adr100_remediation_preview_v1", + "work_item_id": item.get("work_item_id"), + "incident_id": item.get("incident_id"), + "auto_repair_id": item.get("auto_repair_id"), + "mode": selected_mode, + "allowed": allowed, + "safety_level": "read_only", + "writes_incident_state": False, + "writes_auto_repair_result": False, + "checks": checks, + "plan": _plan_for_item(item, selected_mode), + "source": "adr100.verification_coverage.remediation_queue", + } + + async def dry_run(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]: + """Run a safe, read-only remediation dry-run for one queue item.""" + + item = await self._find_work_item(work_item_id) + selected_mode = _select_mode(item, mode) + checks = _base_checks(item) + incident = await self._load_incident(item) + checks.append({ + "name": "incident_loaded", + "passed": incident is not None, + "detail": item.get("incident_id") or "missing incident_id", + }) + + if incident is None or not all(check["passed"] for check in checks): + return _dry_run_blocked_payload(item, selected_mode, checks) + + if selected_mode == "replay": + return await self._dry_run_replay(item, incident, checks) + return await self._dry_run_reverify(item, incident, checks) + + async def _find_work_item(self, work_item_id: str) -> dict[str, Any]: + report = await self._slo_service.fetch_report() + coverage = report.get("verification_coverage") or {} + queue = coverage.get("remediation_queue") or {} + + for item in queue.get("items") or []: + if item.get("work_item_id") == work_item_id: + return dict(item) + + raise RemediationNotFoundError(work_item_id) + + async def _load_incident(self, item: dict[str, Any]) -> Incident | None: + incident_id = str(item.get("incident_id") or "") + if not incident_id: + return None + return await self._incident_repository.get_by_id(incident_id) + + async def _dry_run_reverify( + self, + item: dict[str, Any], + incident: Incident, + checks: list[dict[str, Any]], + ) -> dict[str, Any]: + post_state = await self._collect_current_state(incident) + action_taken = f"dry_run_reverify:{item.get('playbook_id') or 'unknown'}" + result = _assess_recovery(None, post_state, action_taken) + + return _dry_run_result_payload( + item=item, + mode="reverify", + checks=checks, + post_state=post_state, + verification_result_preview=result, + extra={ + "promql": _promql_for_incident(incident), + "mcp_route": { + "agent_id": "post_execution_verifier", + "required_scope": "read", + "is_shadow": True, + "flywheel_node": "verify", + }, + }, + ) + + async def _dry_run_replay( + self, + item: dict[str, Any], + incident: Incident, + checks: list[dict[str, Any]], + ) -> dict[str, Any]: + diagnostic_command = _diagnostic_command_for_incident(incident) + route = self._auto_repair_service.preview_read_only_ssh_mcp_route( + incident, + diagnostic_command, + ) + checks.append({ + "name": "supported_executor_route", + "passed": route is not None, + "detail": "mcp:ssh_diagnose" if route else "missing host/container route", + }) + + post_state = await self._collect_current_state(incident) + action_taken = f"dry_run_replay:{item.get('playbook_id') or 'unknown'}" + result = _assess_recovery(None, post_state, action_taken) + + return _dry_run_result_payload( + item=item, + mode="replay", + checks=checks, + post_state=post_state, + verification_result_preview=result, + extra={ + "diagnostic_command_preview": diagnostic_command, + "mcp_route": route, + "promql": _promql_for_incident(incident), + }, + ) + + async def _collect_current_state(self, incident: Incident) -> dict[str, Any]: + try: + return await asyncio.wait_for( + self._verifier._collect_post_state(incident), + timeout=12.0, + ) + except asyncio.TimeoutError: + logger.warning( + "adr100_remediation_dry_run_timeout", + incident_id=incident.incident_id, + ) + return {} + except Exception as exc: + logger.warning( + "adr100_remediation_dry_run_collect_failed", + incident_id=incident.incident_id, + error=str(exc), + ) + return {} + + +def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]: + if requested in ("reverify", "replay"): + return requested + if item.get("remediation_status") == "ready_for_reverify": + return "reverify" + if item.get("remediation_action") == "reverify_with_promql_template": + return "reverify" + return "replay" + + +def _base_checks(item: dict[str, Any]) -> list[dict[str, Any]]: + status = str(item.get("remediation_status") or "unknown") + action = str(item.get("remediation_action") or "unknown") + return [ + { + "name": "queue_item_ready", + "passed": status in _READY_STATUSES, + "detail": status, + }, + { + "name": "read_only_guardrail", + "passed": action in { + "replay_with_supported_executor", + "reverify_with_promql_template", + }, + "detail": action, + }, + { + "name": "no_state_mutation", + "passed": True, + "detail": "dry_run_does_not_update_incident_or_auto_repair_rows", + }, + ] + + +def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]: + if mode == "reverify": + return { + "step": "collect_current_state_and_assess", + "agent_id": "post_execution_verifier", + "required_scope": "read", + "writes": [], + } + return { + "step": "validate_supported_executor_route_then_collect_current_state", + "agent_id": "auto_repair_executor", + "required_scope": "read", + "writes": [], + "target_action": item.get("remediation_action"), + } + + +def _dry_run_blocked_payload( + item: dict[str, Any], + mode: str, + checks: list[dict[str, Any]], +) -> dict[str, Any]: + return { + "schema_version": "adr100_remediation_dry_run_v1", + "work_item_id": item.get("work_item_id"), + "incident_id": item.get("incident_id"), + "auto_repair_id": item.get("auto_repair_id"), + "mode": mode, + "allowed": False, + "executed": False, + "safety_level": "read_only", + "writes_incident_state": False, + "writes_auto_repair_result": False, + "checks": checks, + "verification_result_preview": "blocked", + "post_state_summary": {}, + } + + +def _dry_run_result_payload( + *, + item: dict[str, Any], + mode: str, + checks: list[dict[str, Any]], + post_state: dict[str, Any], + verification_result_preview: str, + extra: dict[str, Any], +) -> dict[str, Any]: + return { + "schema_version": "adr100_remediation_dry_run_v1", + "work_item_id": item.get("work_item_id"), + "incident_id": item.get("incident_id"), + "auto_repair_id": item.get("auto_repair_id"), + "mode": mode, + "allowed": all(check["passed"] for check in checks), + "executed": True, + "safety_level": "read_only", + "writes_incident_state": False, + "writes_auto_repair_result": False, + "checks": checks, + "verification_result_preview": verification_result_preview, + "post_state_summary": _summarize_post_state(post_state), + **extra, + } + + +def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]: + keys = sorted(post_state.keys()) + return { + "tool_count": len(keys), + "tools": keys[:8], + "has_state": bool(post_state), + } + + +def _diagnostic_command_for_incident(incident: Incident) -> str: + labels = _labels_for_incident(incident) + host = str(labels.get("host") or labels.get("instance") or "{host}") + container = str(labels.get("container_name") or labels.get("container") or "") + if container: + return f"ssh {host} 'uptime; docker stats --no-stream {container}'" + return f"ssh {host} 'uptime; docker stats --no-stream'" + + +def _promql_for_incident(incident: Incident) -> str: + labels = _labels_for_incident(incident) + alertname = "" + if incident.signals: + signal = incident.signals[0] + alertname = labels.get("alertname") or getattr(signal, "alert_name", "") + return _build_prometheus_query(alertname, labels) + + +def _labels_for_incident(incident: Incident) -> dict[str, Any]: + if incident.signals: + return incident.signals[0].labels or {} + return {} + + +_service: Adr100RemediationService | None = None + + +def get_adr100_remediation_service() -> Adr100RemediationService: + """Return singleton ADR-100 remediation service.""" + + global _service + if _service is None: + _service = Adr100RemediationService() + return _service + + +def set_adr100_remediation_service(service: Adr100RemediationService | None) -> None: + """Inject ADR-100 remediation service for tests.""" + + global _service + _service = service diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index 2b3bac4a..c5dfcb5f 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -1003,6 +1003,29 @@ class AutoRepairService: return _SshMcpRoute(tool_name="ssh_diagnose", params=params) + def preview_read_only_ssh_mcp_route( + self, + incident: Incident, + command: str, + ) -> dict[str, Any] | None: + """Preview whether a legacy SSH diagnostic can use the MCP Gateway. + + This is used by remediation dry-runs to prove the supported executor + path without running the original PlayBook step or writing an execution + result. + """ + + route = self._route_legacy_ssh_command_to_mcp(incident, command) + if route is None: + return None + return { + "tool_name": route.tool_name, + "params": route.params, + "agent_id": "auto_repair_executor", + "required_scope": "read", + "flywheel_node": "execute", + } + def _resolve_ssh_host_for_incident(self, incident: Incident, command: str) -> str: """Resolve ``{host}``, short host labels, and exporter instance ports.""" diff --git a/apps/api/tests/test_adr100_remediation_service.py b/apps/api/tests/test_adr100_remediation_service.py new file mode 100644 index 00000000..da365b43 --- /dev/null +++ b/apps/api/tests/test_adr100_remediation_service.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.api.v1.ai_slo import router +from src.models.incident import Incident, IncidentStatus, Severity, Signal +from src.models.playbook import Playbook +from src.services.adr100_remediation_service import ( + Adr100RemediationService, + RemediationNotFoundError, +) +from src.services.auto_repair_service import AutoRepairService + + +class _FakeSloService: + def __init__(self, items: list[dict[str, Any]]) -> None: + self.items = items + + async def fetch_report(self) -> dict[str, Any]: + return { + "verification_coverage": { + "remediation_queue": { + "items": self.items, + }, + }, + } + + +class _FakeIncidentRepository: + def __init__(self, incident: Incident | None) -> None: + self.incident = incident + + async def get_by_id(self, incident_id: str) -> Incident | None: + if self.incident and self.incident.incident_id == incident_id: + return self.incident + return None + + +class _FakeVerifier: + def __init__(self, state: dict[str, Any]) -> None: + self.state = state + self.calls = 0 + + async def _collect_post_state(self, incident: Incident) -> dict[str, Any]: + self.calls += 1 + return self.state + + +class _NoopPlaybookService: + async def get_recommendations(self, *_args, **_kwargs): # noqa: ANN002, ANN003 + return [] + + async def get_by_id(self, _playbook_id: str) -> Playbook | None: + return None + + async def record_execution(self, _playbook_id: str, _success: bool) -> bool: + return True + + +async def _no_cooldown(*_args, **_kwargs) -> tuple[bool, str]: # noqa: ANN002, ANN003 + return True, "test" + + +def _incident() -> Incident: + now = datetime.now(timezone.utc) + return Incident( + incident_id="INC-20260514-TEST01", + status=IncidentStatus.INVESTIGATING, + severity=Severity.P2, + affected_services=["momo-scheduler"], + alert_category="infrastructure", + signals=[ + Signal( + alert_name="DockerContainerMemoryLimitPressure", + severity=Severity.P2, + source="prometheus", + fired_at=now, + labels={ + "alertname": "DockerContainerMemoryLimitPressure", + "host": "110", + "container_name": "momo-scheduler", + }, + ), + ], + ) + + +def _queue_item(**overrides: Any) -> dict[str, Any]: + item = { + "work_item_id": "verification:INC-20260514-TEST01:are-1", + "incident_id": "INC-20260514-TEST01", + "auto_repair_id": "are-1", + "alertname": "DockerContainerMemoryLimitPressure", + "playbook_id": "PB-1", + "verification_result": "degraded", + "remediation_status": "ready_for_replay", + "remediation_action": "replay_with_supported_executor", + "remediation_owner": "auto_repair_executor", + } + item.update(overrides) + return item + + +def _service( + *, + item: dict[str, Any], + incident: Incident | None = None, + state: dict[str, Any] | None = None, +) -> Adr100RemediationService: + return Adr100RemediationService( + slo_service=_FakeSloService([item]), + incident_repository=_FakeIncidentRepository(incident or _incident()), + auto_repair_service=AutoRepairService( + playbook_service=_NoopPlaybookService(), + cooldown_checker=_no_cooldown, + ), + verifier=_FakeVerifier(state or {"k8s_get_pod_status": {"phase": "Running"}}), + ) + + +@pytest.mark.asyncio +async def test_preview_marks_replay_work_item_read_only(): + svc = _service(item=_queue_item()) + + result = await svc.preview("verification:INC-20260514-TEST01:are-1") + + assert result["allowed"] is True + assert result["mode"] == "replay" + assert result["safety_level"] == "read_only" + assert result["writes_incident_state"] is False + assert result["plan"]["agent_id"] == "auto_repair_executor" + assert result["plan"]["writes"] == [] + + +@pytest.mark.asyncio +async def test_dry_run_reverify_collects_state_without_writes(): + item = _queue_item( + remediation_status="ready_for_reverify", + remediation_action="reverify_with_promql_template", + remediation_owner="post_execution_verifier", + ) + svc = _service(item=item, state={"k8s_get_pod_status": {"phase": "Running"}}) + + result = await svc.dry_run("verification:INC-20260514-TEST01:are-1") + + assert result["allowed"] is True + assert result["executed"] is True + assert result["mode"] == "reverify" + assert result["verification_result_preview"] == "success" + assert result["writes_auto_repair_result"] is False + assert result["post_state_summary"]["tool_count"] == 1 + assert result["mcp_route"]["agent_id"] == "post_execution_verifier" + assert result["mcp_route"]["required_scope"] == "read" + + +@pytest.mark.asyncio +async def test_dry_run_replay_validates_supported_executor_route(): + svc = _service(item=_queue_item()) + + result = await svc.dry_run("verification:INC-20260514-TEST01:are-1") + + assert result["allowed"] is True + assert result["mode"] == "replay" + assert result["mcp_route"]["agent_id"] == "auto_repair_executor" + assert result["mcp_route"]["tool_name"] == "ssh_diagnose" + assert result["mcp_route"]["required_scope"] == "read" + assert result["mcp_route"]["params"]["host"] == "192.168.0.110" + assert result["mcp_route"]["params"]["container_name"] == "momo-scheduler" + assert result["diagnostic_command_preview"].startswith("ssh 110") + + +@pytest.mark.asyncio +async def test_dry_run_blocks_when_incident_missing(): + svc = _service(item=_queue_item(), incident=None) + svc._incident_repository = _FakeIncidentRepository(None) + + result = await svc.dry_run("verification:INC-20260514-TEST01:are-1") + + assert result["allowed"] is False + assert result["executed"] is False + assert result["verification_result_preview"] == "blocked" + assert any(check["name"] == "incident_loaded" and not check["passed"] for check in result["checks"]) + + +@pytest.mark.asyncio +async def test_missing_work_item_raises_not_found(): + svc = _service(item=_queue_item()) + + with pytest.raises(RemediationNotFoundError): + await svc.preview("verification:missing") + + +def test_ai_slo_remediation_endpoints(monkeypatch): + app = FastAPI() + app.include_router(router, prefix="/api/v1") + + class _FakeService: + async def preview(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]: + return {"work_item_id": work_item_id, "mode": mode, "allowed": True} + + async def dry_run(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]: + return {"work_item_id": work_item_id, "mode": mode, "executed": True} + + monkeypatch.setattr( + "src.api.v1.ai_slo.get_adr100_remediation_service", + lambda: _FakeService(), + ) + + client = TestClient(app) + preview = client.get( + "/api/v1/ai/slo/remediation/preview", + params={"work_item_id": "verification:INC:are-1", "mode": "reverify"}, + ) + dry_run = client.post( + "/api/v1/ai/slo/remediation/dry-run", + json={"work_item_id": "verification:INC:are-1", "mode": "replay"}, + ) + + assert preview.status_code == 200 + assert preview.json()["mode"] == "reverify" + assert dry_run.status_code == 200 + assert dry_run.json()["executed"] is True diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 55d0714e..eaf4c8e9 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -1414,6 +1414,11 @@ "recentFindings": "Recent Non-success Verification", "remediationQueue": "Remediation Work Queue", "queueSummary": "Total {total}; AI-ready {ready}; human {human}", + "dryRunButton": "Dry run", + "dryRunLoading": "Running", + "dryRunResult": "{mode}; preview {result}; tools {tools}", + "dryRunBlocked": "Dry run blocked", + "dryRunError": "Dry run failed", "state": { "ok": "OK", "warning": "Needs tracking", diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index f01bbc91..94bcbd58 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1415,6 +1415,11 @@ "recentFindings": "近期非成功驗證", "remediationQueue": "補救工作佇列", "queueSummary": "總數 {total};AI 可接手 {ready};人工 {human}", + "dryRunButton": "試跑", + "dryRunLoading": "試跑中", + "dryRunResult": "{mode};預覽 {result};工具 {tools}", + "dryRunBlocked": "試跑未放行", + "dryRunError": "試跑失敗", "state": { "ok": "正常", "warning": "需追蹤", diff --git a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx index 1bebac75..0b4981d3 100644 --- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx +++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx @@ -15,7 +15,7 @@ import { useEffect, useState } from 'react' import { useTranslations } from 'next-intl' -import { ShieldCheck, AlertTriangle } from 'lucide-react' +import { ShieldCheck, AlertTriangle, PlayCircle, SearchCheck } from 'lucide-react' import { SloKpiCard, type SloMetric } from '@/components/governance/slo-kpi-card' import { SloViolationChart, type ViolationDataPoint } from '@/components/governance/slo-violation-chart' import { GlassCard } from '@/components/ui/glass-card' @@ -144,6 +144,28 @@ interface SummaryApiResponse { days?: number } +interface RemediationDryRunResponse { + mode?: string + allowed?: boolean + executed?: boolean + verification_result_preview?: string + post_state_summary?: { + tool_count?: number + tools?: string[] + has_state?: boolean + } + mcp_route?: { + agent_id?: string + tool_name?: string + required_scope?: string + } | null +} + +interface RemediationActionState { + status: 'loading' | 'done' | 'error' + data?: RemediationDryRunResponse +} + // ============================================================================= // Helpers // ============================================================================= @@ -220,6 +242,16 @@ function compactLabel(value?: string | null, fallback = '--'): string { return value.length > 54 ? `${value.slice(0, 54)}...` : value } +async function requestRemediationDryRun(workItemId: string): Promise { + const response = await fetch(`${API_BASE}/api/v1/ai/slo/remediation/dry-run`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ work_item_id: workItemId, mode: 'auto' }), + }) + if (!response.ok) throw new Error(`dry_run_failed:${response.status}`) + return response.json() +} + function buildMetrics(api: SloApiResponse): SloMetric[] { const adr100Metrics = api.adr100?.metrics if (adr100Metrics?.length) { @@ -276,6 +308,7 @@ function buildMetrics(api: SloApiResponse): SloMetric[] { function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) { const t = useTranslations('governance.slo.coverage') + const [actionState, setActionState] = useState>({}) const color = coverageTone(coverage?.status) const rows = [ { label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') }, @@ -287,6 +320,25 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification const recentFindings = coverage?.recent_non_success ?? [] const remediationQueue = coverage?.remediation_queue + const handleDryRun = async (workItemId: string) => { + setActionState(prev => ({ + ...prev, + [workItemId]: { status: 'loading' }, + })) + try { + const data = await requestRemediationDryRun(workItemId) + setActionState(prev => ({ + ...prev, + [workItemId]: { status: 'done', data }, + })) + } catch { + setActionState(prev => ({ + ...prev, + [workItemId]: { status: 'error' }, + })) + } + } + return (
@@ -379,7 +431,7 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification {(remediationQueue.items ?? []).slice(0, 4).map(item => (
+
+ + {actionState[item.work_item_id]?.status === 'done' && ( +
+ + + {actionState[item.work_item_id].data?.allowed === false + ? t('dryRunBlocked') + : t('dryRunResult', { + mode: actionState[item.work_item_id].data?.mode ?? '--', + result: actionState[item.work_item_id].data?.verification_result_preview ?? '--', + tools: actionState[item.work_item_id].data?.post_state_summary?.tool_count ?? 0, + })} + +
+ )} + {actionState[item.work_item_id]?.status === 'error' && ( +
+ {t('dryRunError')} +
+ )} +
))} diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index be96cae7..f71ffe05 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,37 @@ +## 2026-05-14 | T25 補救佇列新增安全試跑入口,replay/reverify 可先讀證據不改狀態 + +**背景**:T24 已把 non-success verifier rows 轉成 `remediation_queue`,但 Operator 仍只能看見「應該 replay / reverify」,無法從前端或 API 直接觸發一個安全、可觀測、低風險的試跑步驟。這會讓「AI 可接手」停在文字標籤,還沒有形成可操作入口。 + +**修正**: +- 新增 `Adr100RemediationService`,從 ADR-100 `verification_coverage.remediation_queue` 找 work item,提供 read-only `preview` 與 `dry_run`。 +- 新增 API: + - `GET /api/v1/ai/slo/remediation/preview?work_item_id=...` + - `POST /api/v1/ai/slo/remediation/preview` + - `POST /api/v1/ai/slo/remediation/dry-run` +- `dry_run` 不會更新 incident 狀態、不會新增 auto-repair result、不會做真正修復;它只做 queue readiness / read-only guardrail / incident loaded / supported executor route 等檢查,並用 verifier 收集當前狀態產生 `verification_result_preview`。 +- `ready_for_reverify` 走 `post_execution_verifier` read-only current-state collection,回傳 PromQL 與 MCP route metadata。 +- `ready_for_replay` 先驗證 legacy SSH diagnostic 是否可轉成 `auto_repair_executor -> mcp:ssh_diagnose -> required_scope=read`,再收集 current-state preview。 +- `AutoRepairService` 新增 `preview_read_only_ssh_mcp_route()`,讓 remediation dry-run 能驗證 supported executor path,而不碰私有修復執行流程。 +- `/governance` SLO tab 的補救工作佇列每筆新增「試跑」按鈕,呼叫 dry-run API 後回顯 mode、preview result、工具數;文案補齊 `zh-TW` / `en` i18n,使用 lucide icon,不用 emoji。 + +**本地驗證**: +- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/auto_repair_service.py`:pass。 +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_auto_repair_service.py -q`:33 passed。 +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_adr100_slo_metrics_service.py tests/test_governance_agent.py tests/test_ai_governance_endpoints.py -q`:59 passed。 +- `ruff check --select F,E9 apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/auto_repair_service.py apps/api/tests/test_adr100_remediation_service.py`:pass。 +- i18n JSON parse / `git diff --check`:pass。 +- `pnpm --filter @awoooi/web typecheck`:pass。 +- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`:pass。 +- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`:pass。 + +**推版與 production 驗證**: +- 待 T25 commit 推 Gitea main 後驗證。 + +**目前整體進度**: +- Alertmanager 低風險自動修復主線:約 98%。 +- 完整 AI 自動化管理產品化:約 90%。 +- T25 把「補救工作」從可視化清單推到安全試跑入口。下一段應把 dry-run 結果寫回可稽核 timeline / work item history,並把真正可 auto-closure 的條件與需要建 Ticket / 人工介入的條件分開。 + ## 2026-05-14 | T24 非成功驗證補救工作佇列,讓舊 degraded 變成可追蹤工作項 **背景**:T22/T23 已找出近 24h non-success verifier 的根因並修掉 executor / PromQL template 斷點,但 `/api/v1/ai/slo` 仍只把 historical degraded rows 顯示為 warning。Operator 仍無法直接判斷每筆舊 degraded 要 replay、reverify、建 Ticket,還是人工檢查。