feat(governance): persist remediation dry run history
This commit is contained in:
@@ -55,11 +55,17 @@ class Adr100RemediationService:
|
||||
incident_repository: _IncidentRepository | None = None,
|
||||
auto_repair_service: AutoRepairService | None = None,
|
||||
verifier: PostExecutionVerifier | None = None,
|
||||
timeline_service: Any | None = None,
|
||||
alert_operation_log_repository: Any | None = None,
|
||||
record_history: bool = True,
|
||||
) -> None:
|
||||
self._slo_service = slo_service or get_adr100_slo_status_service()
|
||||
self._incident_repository = incident_repository or IncidentDBRepository()
|
||||
self._auto_repair_service = auto_repair_service or AutoRepairService()
|
||||
self._verifier = verifier or get_post_execution_verifier()
|
||||
self._timeline_service = timeline_service
|
||||
self._alert_operation_log_repository = alert_operation_log_repository
|
||||
self._record_history_enabled = record_history
|
||||
|
||||
async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
|
||||
"""Return the safe execution plan for a remediation queue item."""
|
||||
@@ -98,7 +104,9 @@ class Adr100RemediationService:
|
||||
})
|
||||
|
||||
if incident is None or not all(check["passed"] for check in checks):
|
||||
return _dry_run_blocked_payload(item, selected_mode, checks)
|
||||
payload = _dry_run_blocked_payload(item, selected_mode, checks)
|
||||
payload["history"] = await self._record_dry_run_history(item, payload)
|
||||
return payload
|
||||
|
||||
if selected_mode == "replay":
|
||||
return await self._dry_run_replay(item, incident, checks)
|
||||
@@ -131,7 +139,7 @@ class Adr100RemediationService:
|
||||
action_taken = f"dry_run_reverify:{item.get('playbook_id') or 'unknown'}"
|
||||
result = _assess_recovery(None, post_state, action_taken)
|
||||
|
||||
return _dry_run_result_payload(
|
||||
payload = _dry_run_result_payload(
|
||||
item=item,
|
||||
mode="reverify",
|
||||
checks=checks,
|
||||
@@ -147,6 +155,8 @@ class Adr100RemediationService:
|
||||
},
|
||||
},
|
||||
)
|
||||
payload["history"] = await self._record_dry_run_history(item, payload)
|
||||
return payload
|
||||
|
||||
async def _dry_run_replay(
|
||||
self,
|
||||
@@ -169,7 +179,7 @@ class Adr100RemediationService:
|
||||
action_taken = f"dry_run_replay:{item.get('playbook_id') or 'unknown'}"
|
||||
result = _assess_recovery(None, post_state, action_taken)
|
||||
|
||||
return _dry_run_result_payload(
|
||||
payload = _dry_run_result_payload(
|
||||
item=item,
|
||||
mode="replay",
|
||||
checks=checks,
|
||||
@@ -181,6 +191,8 @@ class Adr100RemediationService:
|
||||
"promql": _promql_for_incident(incident),
|
||||
},
|
||||
)
|
||||
payload["history"] = await self._record_dry_run_history(item, payload)
|
||||
return payload
|
||||
|
||||
async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
|
||||
try:
|
||||
@@ -202,6 +214,81 @@ class Adr100RemediationService:
|
||||
)
|
||||
return {}
|
||||
|
||||
async def _record_dry_run_history(
    self,
    item: dict[str, Any],
    payload: dict[str, Any],
) -> dict[str, Any]:
    """Write a dry-run outcome to the alert-operation log and the timeline.

    Both writes are best-effort: any exception raised by either one is logged
    as a warning and swallowed, so a history failure never propagates to the
    caller of the dry run.

    Args:
        item: Remediation queue item. Its ``incident_id`` is required; its
            ``auto_repair_id`` is forwarded to the alert-operation log.
        payload: Dry-run payload being returned to the caller. ``allowed``
            and ``mode`` drive the recorded event type and labels.

    Returns:
        ``{"recorded": False, "reason": "disabled"}`` when history recording
        is switched off, ``{"recorded": False, "reason": "missing_incident_id"}``
        when the item carries no incident id, otherwise a dict with
        ``recorded``, ``alert_operation_id`` and ``timeline_event_id``.
        ``recorded`` is true when at least one of the two ids is truthy.
    """
    if not self._record_history_enabled:
        return {"recorded": False, "reason": "disabled"}

    incident_id = str(item.get("incident_id") or "")
    if not incident_id:
        return {"recorded": False, "reason": "missing_incident_id"}

    history: dict[str, Any] = {
        "recorded": False,
        "alert_operation_id": None,
        "timeline_event_id": None,
    }
    context = _history_context(item, payload)
    allowed = bool(payload.get("allowed"))

    try:
        repo = self._alert_operation_log_repository
        if repo is None:
            # Only resolve the default repository when none was injected.
            from src.repositories.alert_operation_log_repository import (
                get_alert_operation_log_repository,
            )

            repo = get_alert_operation_log_repository()
        record = await repo.append(
            "PRE_FLIGHT_PASSED" if allowed else "PRE_FLIGHT_FAILED",
            incident_id=incident_id,
            # Empty or missing auto-repair ids are normalised to None.
            auto_repair_id=str(item.get("auto_repair_id") or "") or None,
            actor="adr100_remediation_service",
            action_detail=f"adr100_remediation_dry_run:{payload.get('mode')}"[:200],
            success=allowed,
            context=context,
        )
        if record is not None:
            history["alert_operation_id"] = getattr(record, "id", None)
    except Exception as exc:
        logger.warning(
            "adr100_remediation_alert_operation_history_failed",
            incident_id=incident_id,
            error=str(exc),
        )

    try:
        timeline = self._timeline_service
        if timeline is None:
            # Only resolve the default timeline service when none was injected.
            from src.services.approval_db import get_timeline_service

            timeline = get_timeline_service()
        event = await timeline.add_event(
            event_type="verifier",
            status=_timeline_status(payload),
            title="ADR-100 remediation dry-run",
            description=_history_description(context),
            actor="adr100_remediation_service",
            actor_role=str(payload.get("mode") or "dry_run"),
            incident_id=incident_id,
        )
        if event:
            # Read the id from mapping-style or attribute-style events alike,
            # so a successful write returning an object is not misreported as
            # a timeline failure by the handler below.
            if hasattr(event, "get"):
                history["timeline_event_id"] = event.get("id")
            else:
                history["timeline_event_id"] = getattr(event, "id", None)
    except Exception as exc:
        logger.warning(
            "adr100_remediation_timeline_history_failed",
            incident_id=incident_id,
            error=str(exc),
        )

    history["recorded"] = bool(
        history.get("alert_operation_id") or history.get("timeline_event_id")
    )
    return history
|
||||
|
||||
|
||||
def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]:
|
||||
if requested in ("reverify", "replay"):
|
||||
@@ -313,6 +400,48 @@ def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str, Any]:
    """Assemble the structured context stored with a dry-run history record.

    Identifying fields are copied from the queue ``item`` and outcome fields
    from the dry-run ``payload``. A key missing from its source maps to
    ``None``. The result always carries the fixed ``schema_version`` tag.
    """
    item_fields = ("work_item_id", "auto_repair_id", "playbook_id", "alertname")
    payload_fields = (
        "mode",
        "allowed",
        "executed",
        "safety_level",
        "writes_incident_state",
        "writes_auto_repair_result",
        "verification_result_preview",
        "post_state_summary",
        "mcp_route",
        "checks",
    )

    context: dict[str, Any] = {
        "schema_version": "adr100_remediation_dry_run_history_v1",
    }
    context.update((name, item.get(name)) for name in item_fields)
    context.update((name, payload.get(name)) for name in payload_fields)
    return context
|
||||
|
||||
|
||||
def _timeline_status(payload: dict[str, Any]) -> str:
    """Return the timeline status for a dry-run payload.

    The status is ``"success"`` only when the payload is allowed and its
    verification preview equals ``"success"``; every other case is
    ``"warning"``.
    """
    is_allowed = bool(payload.get("allowed"))
    previewed_ok = payload.get("verification_result_preview") == "success"
    return "success" if is_allowed and previewed_ok else "warning"
|
||||
|
||||
|
||||
def _history_description(context: dict[str, Any]) -> str:
    """Render a one-line summary of a dry-run history context.

    The summary lists mode, verification preview, tool count, MCP route and
    the two ``writes_*`` flags, and is capped at 500 characters. A missing
    summary or route falls back to ``0`` tools and
    ``unknown_agent/current_state``.
    """
    summary = context.get("post_state_summary") or {}
    mcp_route = context.get("mcp_route") or {}
    agent_label = mcp_route.get("agent_id") or "unknown_agent"
    tool_label = mcp_route.get("tool_name") or "current_state"

    segments = [
        f"mode={context.get('mode')}",
        f"preview={context.get('verification_result_preview')}",
        f"tools={summary.get('tool_count', 0)}",
        f"route={agent_label}/{tool_label}",
        f"writes_incident={context.get('writes_incident_state')}",
        f"writes_auto_repair={context.get('writes_auto_repair_result')}",
    ]
    return " ".join(segments)[:500]
|
||||
|
||||
|
||||
def _diagnostic_command_for_incident(incident: Incident) -> str:
|
||||
labels = _labels_for_incident(incident)
|
||||
host = str(labels.get("host") or labels.get("instance") or "{host}")
|
||||
|
||||
@@ -51,6 +51,24 @@ class _FakeVerifier:
|
||||
return self.state
|
||||
|
||||
|
||||
class _FakeAlertOperationLogRepository:
    """Test double for the alert-operation log that remembers each append."""

    def __init__(self) -> None:
        # One dict per append() call, in call order.
        self.calls: list[dict[str, Any]] = []

    async def append(self, event_type: str, **kwargs: Any):
        """Record the call and return a stub record whose ``id`` is ``"aol-1"``."""
        captured: dict[str, Any] = {"event_type": event_type}
        captured.update(kwargs)
        self.calls.append(captured)

        record_cls = type("AlertOperationRecord", (), {"id": "aol-1"})
        return record_cls()
|
||||
|
||||
|
||||
class _FakeTimelineService:
    """Test double for the timeline service that remembers each added event."""

    def __init__(self) -> None:
        # Keyword arguments of every add_event() call, in call order.
        self.calls: list[dict[str, Any]] = []

    async def add_event(self, **kwargs: Any) -> dict[str, Any]:
        """Record the keyword arguments and return a stub event with id ``"timeline-1"``."""
        self.calls += [kwargs]
        stub_event: dict[str, Any] = {"id": "timeline-1"}
        return stub_event
|
||||
|
||||
|
||||
class _NoopPlaybookService:
|
||||
async def get_recommendations(self, *_args, **_kwargs): # noqa: ANN002, ANN003
|
||||
return []
|
||||
@@ -111,6 +129,9 @@ def _service(
|
||||
item: dict[str, Any],
|
||||
incident: Incident | None = None,
|
||||
state: dict[str, Any] | None = None,
|
||||
timeline_service: Any | None = None,
|
||||
alert_operation_log_repository: Any | None = None,
|
||||
record_history: bool = False,
|
||||
) -> Adr100RemediationService:
|
||||
return Adr100RemediationService(
|
||||
slo_service=_FakeSloService([item]),
|
||||
@@ -120,6 +141,9 @@ def _service(
|
||||
cooldown_checker=_no_cooldown,
|
||||
),
|
||||
verifier=_FakeVerifier(state or {"k8s_get_pod_status": {"phase": "Running"}}),
|
||||
timeline_service=timeline_service,
|
||||
alert_operation_log_repository=alert_operation_log_repository,
|
||||
record_history=record_history,
|
||||
)
|
||||
|
||||
|
||||
@@ -156,6 +180,7 @@ async def test_dry_run_reverify_collects_state_without_writes():
|
||||
assert result["post_state_summary"]["tool_count"] == 1
|
||||
assert result["mcp_route"]["agent_id"] == "post_execution_verifier"
|
||||
assert result["mcp_route"]["required_scope"] == "read"
|
||||
assert result["history"]["recorded"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -187,6 +212,36 @@ async def test_dry_run_blocks_when_incident_missing():
|
||||
assert any(check["name"] == "incident_loaded" and not check["passed"] for check in result["checks"])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dry_run_records_alert_operation_and_timeline_history():
    """With history enabled, a dry run writes one alert-operation record and one timeline event."""
    operation_log = _FakeAlertOperationLogRepository()
    timeline_double = _FakeTimelineService()
    svc = _service(
        item=_queue_item(),
        timeline_service=timeline_double,
        alert_operation_log_repository=operation_log,
        record_history=True,
    )

    result = await svc.dry_run("verification:INC-20260514-TEST01:are-1")

    # The response exposes the ids returned by both fakes.
    expected_history = {
        "recorded": True,
        "alert_operation_id": "aol-1",
        "timeline_event_id": "timeline-1",
    }
    assert result["history"] == expected_history

    # Alert-operation log entry.
    logged_operation = operation_log.calls[0]
    assert logged_operation["event_type"] == "PRE_FLIGHT_PASSED"
    assert logged_operation["incident_id"] == "INC-20260514-TEST01"
    assert logged_operation["success"] is True
    logged_context = logged_operation["context"]
    assert logged_context["schema_version"] == (
        "adr100_remediation_dry_run_history_v1"
    )
    assert logged_context["writes_incident_state"] is False

    # Timeline event.
    logged_event = timeline_double.calls[0]
    assert logged_event["event_type"] == "verifier"
    assert logged_event["status"] == "success"
    assert logged_event["actor_role"] == "replay"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_missing_work_item_raises_not_found():
|
||||
svc = _service(item=_queue_item())
|
||||
|
||||
@@ -1417,6 +1417,7 @@
|
||||
"dryRunButton": "Dry run",
|
||||
"dryRunLoading": "Running",
|
||||
"dryRunResult": "{mode}; preview {result}; tools {tools}",
|
||||
"dryRunHistoryRecorded": "History recorded",
|
||||
"dryRunBlocked": "Dry run blocked",
|
||||
"dryRunError": "Dry run failed",
|
||||
"state": {
|
||||
|
||||
@@ -1418,6 +1418,7 @@
|
||||
"dryRunButton": "試跑",
|
||||
"dryRunLoading": "試跑中",
|
||||
"dryRunResult": "{mode};預覽 {result};工具 {tools}",
|
||||
"dryRunHistoryRecorded": "已寫入歷史",
|
||||
"dryRunBlocked": "試跑未放行",
|
||||
"dryRunError": "試跑失敗",
|
||||
"state": {
|
||||
|
||||
@@ -159,6 +159,9 @@ interface RemediationDryRunResponse {
|
||||
tool_name?: string
|
||||
required_scope?: string
|
||||
} | null
|
||||
history?: {
|
||||
recorded?: boolean
|
||||
}
|
||||
}
|
||||
|
||||
interface RemediationActionState {
|
||||
@@ -490,27 +493,40 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
<span>{actionState[item.work_item_id]?.status === 'loading' ? t('dryRunLoading') : t('dryRunButton')}</span>
|
||||
</button>
|
||||
{actionState[item.work_item_id]?.status === 'done' && (
|
||||
<div style={{
|
||||
display: 'flex',
|
||||
alignItems: 'flex-start',
|
||||
gap: 5,
|
||||
fontFamily: "'DM Mono', monospace",
|
||||
fontSize: 9,
|
||||
color: actionState[item.work_item_id].data?.allowed === false ? '#7c5a10' : '#166534',
|
||||
lineHeight: 1.4,
|
||||
overflowWrap: 'anywhere',
|
||||
}}>
|
||||
<SearchCheck size={12} style={{ flexShrink: 0, marginTop: 1 }} />
|
||||
<span>
|
||||
{actionState[item.work_item_id].data?.allowed === false
|
||||
? t('dryRunBlocked')
|
||||
: t('dryRunResult', {
|
||||
mode: actionState[item.work_item_id].data?.mode ?? '--',
|
||||
result: actionState[item.work_item_id].data?.verification_result_preview ?? '--',
|
||||
tools: actionState[item.work_item_id].data?.post_state_summary?.tool_count ?? 0,
|
||||
})}
|
||||
</span>
|
||||
</div>
|
||||
<>
|
||||
<div style={{
|
||||
display: 'flex',
|
||||
alignItems: 'flex-start',
|
||||
gap: 5,
|
||||
fontFamily: "'DM Mono', monospace",
|
||||
fontSize: 9,
|
||||
color: actionState[item.work_item_id].data?.allowed === false ? '#7c5a10' : '#166534',
|
||||
lineHeight: 1.4,
|
||||
overflowWrap: 'anywhere',
|
||||
}}>
|
||||
<SearchCheck size={12} style={{ flexShrink: 0, marginTop: 1 }} />
|
||||
<span>
|
||||
{actionState[item.work_item_id].data?.allowed === false
|
||||
? t('dryRunBlocked')
|
||||
: t('dryRunResult', {
|
||||
mode: actionState[item.work_item_id].data?.mode ?? '--',
|
||||
result: actionState[item.work_item_id].data?.verification_result_preview ?? '--',
|
||||
tools: actionState[item.work_item_id].data?.post_state_summary?.tool_count ?? 0,
|
||||
})}
|
||||
</span>
|
||||
</div>
|
||||
{actionState[item.work_item_id].data?.history?.recorded && (
|
||||
<div style={{
|
||||
fontFamily: "'DM Mono', monospace",
|
||||
fontSize: 9,
|
||||
color: '#166534',
|
||||
lineHeight: 1.4,
|
||||
overflowWrap: 'anywhere',
|
||||
}}>
|
||||
{t('dryRunHistoryRecorded')}
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
{actionState[item.work_item_id]?.status === 'error' && (
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#FF3300', lineHeight: 1.4 }}>
|
||||
|
||||
@@ -1,3 +1,30 @@
|
||||
## 2026-05-14 | T26 Remediation dry-run 寫入 history,試跑結果不再只停在前端暫存
|
||||
|
||||
**背景**:T25 已讓 Operator 能在 `/governance` 補救佇列點「試跑」,但結果只存在當次 UI state。這仍無法完全回答「這次 dry-run 是否真的發生、跑到哪個流程、MCP 有沒有用到、後續是否能從 Incident history 回看」。
|
||||
|
||||
**修正**:
|
||||
- `Adr100RemediationService.dry_run()` 完成後會寫入兩條既有稽核軌道,不新增資料表:
|
||||
- `alert_operation_log`:使用 `PRE_FLIGHT_PASSED` / `PRE_FLIGHT_FAILED`,context schema `adr100_remediation_dry_run_history_v1`,保留 `work_item_id / auto_repair_id / playbook_id / mode / checks / post_state_summary / mcp_route / writes_*`。
|
||||
- `timeline_events`:寫 `event_type=verifier`、title `ADR-100 remediation dry-run`,讓 Incident Timeline 能看到 verifier 階段真的有 dry-run。
|
||||
- dry-run API response 新增 `history` 物件,包含 `history.recorded`、`history.alert_operation_id`、`history.timeline_event_id`。
|
||||
- `/governance` 補救佇列試跑完成後,如果 history 寫入成功會顯示「已寫入歷史」。
|
||||
|
||||
**本地驗證**:
|
||||
- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/tests/test_adr100_remediation_service.py`:pass。
|
||||
- `ruff check --select F,E9 apps/api/src/services/adr100_remediation_service.py apps/api/tests/test_adr100_remediation_service.py`:pass。
|
||||
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_ai_governance_endpoints.py -q`:35 passed。
|
||||
- `pnpm --filter @awoooi/web typecheck`:pass。
|
||||
- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`:pass。
|
||||
- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`:pass。
|
||||
|
||||
**推版與 production 驗證**:
|
||||
- 待 T26 commit 推 Gitea main 後驗證。
|
||||
|
||||
**目前整體進度**:
|
||||
- Alertmanager 低風險自動修復主線:約 98%。
|
||||
- 完整 AI 自動化管理產品化:約 91%。
|
||||
- T26 補上 dry-run 的 durable history。下一段應把這些 history 聚合回「工作鏈路 / Incident 詳情 / Telegram 詳情」同一視角,讓使用者不必猜流程走到哪一格。
|
||||
|
||||
## 2026-05-14 | T25 補救佇列新增安全試跑入口,replay/reverify 可先讀證據不改狀態
|
||||
|
||||
**背景**:T24 已把 non-success verifier rows 轉成 `remediation_queue`,但 Operator 仍只能看見「應該 replay / reverify」,無法從前端或 API 直接觸發一個安全、可觀測、低風險的試跑步驟。這會讓「AI 可接手」停在文字標籤,還沒有形成可操作入口。
|
||||
|
||||
Reference in New Issue
Block a user