feat(governance): persist remediation dry run history
All checks were successful
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / tests (push) Successful in 1m4s
CD Pipeline / build-and-deploy (push) Successful in 3m44s
CD Pipeline / post-deploy-checks (push) Successful in 1m24s

This commit is contained in:
Your Name
2026-05-14 22:38:42 +08:00
parent 36cb9d6aeb
commit 6aaaf87ade
6 changed files with 253 additions and 24 deletions

View File

@@ -55,11 +55,17 @@ class Adr100RemediationService:
incident_repository: _IncidentRepository | None = None,
auto_repair_service: AutoRepairService | None = None,
verifier: PostExecutionVerifier | None = None,
timeline_service: Any | None = None,
alert_operation_log_repository: Any | None = None,
record_history: bool = True,
) -> None:
self._slo_service = slo_service or get_adr100_slo_status_service()
self._incident_repository = incident_repository or IncidentDBRepository()
self._auto_repair_service = auto_repair_service or AutoRepairService()
self._verifier = verifier or get_post_execution_verifier()
self._timeline_service = timeline_service
self._alert_operation_log_repository = alert_operation_log_repository
self._record_history_enabled = record_history
async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]:
"""Return the safe execution plan for a remediation queue item."""
@@ -98,7 +104,9 @@ class Adr100RemediationService:
})
if incident is None or not all(check["passed"] for check in checks):
return _dry_run_blocked_payload(item, selected_mode, checks)
payload = _dry_run_blocked_payload(item, selected_mode, checks)
payload["history"] = await self._record_dry_run_history(item, payload)
return payload
if selected_mode == "replay":
return await self._dry_run_replay(item, incident, checks)
@@ -131,7 +139,7 @@ class Adr100RemediationService:
action_taken = f"dry_run_reverify:{item.get('playbook_id') or 'unknown'}"
result = _assess_recovery(None, post_state, action_taken)
return _dry_run_result_payload(
payload = _dry_run_result_payload(
item=item,
mode="reverify",
checks=checks,
@@ -147,6 +155,8 @@ class Adr100RemediationService:
},
},
)
payload["history"] = await self._record_dry_run_history(item, payload)
return payload
async def _dry_run_replay(
self,
@@ -169,7 +179,7 @@ class Adr100RemediationService:
action_taken = f"dry_run_replay:{item.get('playbook_id') or 'unknown'}"
result = _assess_recovery(None, post_state, action_taken)
return _dry_run_result_payload(
payload = _dry_run_result_payload(
item=item,
mode="replay",
checks=checks,
@@ -181,6 +191,8 @@ class Adr100RemediationService:
"promql": _promql_for_incident(incident),
},
)
payload["history"] = await self._record_dry_run_history(item, payload)
return payload
async def _collect_current_state(self, incident: Incident) -> dict[str, Any]:
try:
@@ -202,6 +214,81 @@ class Adr100RemediationService:
)
return {}
async def _record_dry_run_history(
self,
item: dict[str, Any],
payload: dict[str, Any],
) -> dict[str, Any]:
if not self._record_history_enabled:
return {"recorded": False, "reason": "disabled"}
incident_id = str(item.get("incident_id") or "")
if not incident_id:
return {"recorded": False, "reason": "missing_incident_id"}
history: dict[str, Any] = {
"recorded": False,
"alert_operation_id": None,
"timeline_event_id": None,
}
context = _history_context(item, payload)
allowed = bool(payload.get("allowed"))
try:
repo = self._alert_operation_log_repository
if repo is None:
from src.repositories.alert_operation_log_repository import (
get_alert_operation_log_repository,
)
repo = get_alert_operation_log_repository()
record = await repo.append(
"PRE_FLIGHT_PASSED" if allowed else "PRE_FLIGHT_FAILED",
incident_id=incident_id,
auto_repair_id=str(item.get("auto_repair_id") or "") or None,
actor="adr100_remediation_service",
action_detail=f"adr100_remediation_dry_run:{payload.get('mode')}"[:200],
success=allowed,
context=context,
)
if record is not None:
history["alert_operation_id"] = getattr(record, "id", None)
except Exception as exc:
logger.warning(
"adr100_remediation_alert_operation_history_failed",
incident_id=incident_id,
error=str(exc),
)
try:
timeline = self._timeline_service
if timeline is None:
from src.services.approval_db import get_timeline_service
timeline = get_timeline_service()
event = await timeline.add_event(
event_type="verifier",
status=_timeline_status(payload),
title="ADR-100 remediation dry-run",
description=_history_description(context),
actor="adr100_remediation_service",
actor_role=str(payload.get("mode") or "dry_run"),
incident_id=incident_id,
)
if event:
history["timeline_event_id"] = event.get("id")
except Exception as exc:
logger.warning(
"adr100_remediation_timeline_history_failed",
incident_id=incident_id,
error=str(exc),
)
history["recorded"] = bool(
history.get("alert_operation_id") or history.get("timeline_event_id")
)
return history
def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]:
if requested in ("reverify", "replay"):
@@ -313,6 +400,48 @@ def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]:
}
def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str, Any]:
return {
"schema_version": "adr100_remediation_dry_run_history_v1",
"work_item_id": item.get("work_item_id"),
"auto_repair_id": item.get("auto_repair_id"),
"playbook_id": item.get("playbook_id"),
"alertname": item.get("alertname"),
"mode": payload.get("mode"),
"allowed": payload.get("allowed"),
"executed": payload.get("executed"),
"safety_level": payload.get("safety_level"),
"writes_incident_state": payload.get("writes_incident_state"),
"writes_auto_repair_result": payload.get("writes_auto_repair_result"),
"verification_result_preview": payload.get("verification_result_preview"),
"post_state_summary": payload.get("post_state_summary"),
"mcp_route": payload.get("mcp_route"),
"checks": payload.get("checks"),
}
def _timeline_status(payload: dict[str, Any]) -> str:
if not payload.get("allowed"):
return "warning"
if payload.get("verification_result_preview") == "success":
return "success"
return "warning"
def _history_description(context: dict[str, Any]) -> str:
tool_count = (context.get("post_state_summary") or {}).get("tool_count", 0)
route = context.get("mcp_route") or {}
agent = route.get("agent_id") or "unknown_agent"
tool = route.get("tool_name") or "current_state"
return (
f"mode={context.get('mode')} "
f"preview={context.get('verification_result_preview')} "
f"tools={tool_count} route={agent}/{tool} "
f"writes_incident={context.get('writes_incident_state')} "
f"writes_auto_repair={context.get('writes_auto_repair_result')}"
)[:500]
def _diagnostic_command_for_incident(incident: Incident) -> str:
labels = _labels_for_incident(incident)
host = str(labels.get("host") or labels.get("instance") or "{host}")

View File

@@ -51,6 +51,24 @@ class _FakeVerifier:
return self.state
class _FakeAlertOperationLogRepository:
def __init__(self) -> None:
self.calls: list[dict[str, Any]] = []
async def append(self, event_type: str, **kwargs: Any):
self.calls.append({"event_type": event_type, **kwargs})
return type("AlertOperationRecord", (), {"id": "aol-1"})()
class _FakeTimelineService:
def __init__(self) -> None:
self.calls: list[dict[str, Any]] = []
async def add_event(self, **kwargs: Any) -> dict[str, Any]:
self.calls.append(kwargs)
return {"id": "timeline-1"}
class _NoopPlaybookService:
async def get_recommendations(self, *_args, **_kwargs): # noqa: ANN002, ANN003
return []
@@ -111,6 +129,9 @@ def _service(
item: dict[str, Any],
incident: Incident | None = None,
state: dict[str, Any] | None = None,
timeline_service: Any | None = None,
alert_operation_log_repository: Any | None = None,
record_history: bool = False,
) -> Adr100RemediationService:
return Adr100RemediationService(
slo_service=_FakeSloService([item]),
@@ -120,6 +141,9 @@ def _service(
cooldown_checker=_no_cooldown,
),
verifier=_FakeVerifier(state or {"k8s_get_pod_status": {"phase": "Running"}}),
timeline_service=timeline_service,
alert_operation_log_repository=alert_operation_log_repository,
record_history=record_history,
)
@@ -156,6 +180,7 @@ async def test_dry_run_reverify_collects_state_without_writes():
assert result["post_state_summary"]["tool_count"] == 1
assert result["mcp_route"]["agent_id"] == "post_execution_verifier"
assert result["mcp_route"]["required_scope"] == "read"
assert result["history"]["recorded"] is False
@pytest.mark.asyncio
@@ -187,6 +212,36 @@ async def test_dry_run_blocks_when_incident_missing():
assert any(check["name"] == "incident_loaded" and not check["passed"] for check in result["checks"])
@pytest.mark.asyncio
async def test_dry_run_records_alert_operation_and_timeline_history():
    """With history enabled, a dry run reports both history ids and passes the expected fields to each sink."""
    operation_log = _FakeAlertOperationLogRepository()
    timeline_service = _FakeTimelineService()
    service = _service(
        item=_queue_item(),
        timeline_service=timeline_service,
        alert_operation_log_repository=operation_log,
        record_history=True,
    )

    outcome = await service.dry_run("verification:INC-20260514-TEST01:are-1")

    # The response echoes the ids returned by both fakes.
    assert outcome["history"] == {
        "recorded": True,
        "alert_operation_id": "aol-1",
        "timeline_event_id": "timeline-1",
    }

    # Alert operation log entry.
    logged = operation_log.calls[0]
    assert logged["event_type"] == "PRE_FLIGHT_PASSED"
    assert logged["incident_id"] == "INC-20260514-TEST01"
    assert logged["success"] is True
    assert logged["context"]["schema_version"] == (
        "adr100_remediation_dry_run_history_v1"
    )
    assert logged["context"]["writes_incident_state"] is False

    # Timeline event.
    timeline_event = timeline_service.calls[0]
    assert timeline_event["event_type"] == "verifier"
    assert timeline_event["status"] == "success"
    assert timeline_event["actor_role"] == "replay"
@pytest.mark.asyncio
async def test_missing_work_item_raises_not_found():
svc = _service(item=_queue_item())

View File

@@ -1417,6 +1417,7 @@
"dryRunButton": "Dry run",
"dryRunLoading": "Running",
"dryRunResult": "{mode}; preview {result}; tools {tools}",
"dryRunHistoryRecorded": "History recorded",
"dryRunBlocked": "Dry run blocked",
"dryRunError": "Dry run failed",
"state": {

View File

@@ -1418,6 +1418,7 @@
"dryRunButton": "試跑",
"dryRunLoading": "試跑中",
"dryRunResult": "{mode};預覽 {result};工具 {tools}",
"dryRunHistoryRecorded": "已寫入歷史",
"dryRunBlocked": "試跑未放行",
"dryRunError": "試跑失敗",
"state": {

View File

@@ -159,6 +159,9 @@ interface RemediationDryRunResponse {
tool_name?: string
required_scope?: string
} | null
history?: {
recorded?: boolean
}
}
interface RemediationActionState {
@@ -490,27 +493,40 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
<span>{actionState[item.work_item_id]?.status === 'loading' ? t('dryRunLoading') : t('dryRunButton')}</span>
</button>
{actionState[item.work_item_id]?.status === 'done' && (
<div style={{
display: 'flex',
alignItems: 'flex-start',
gap: 5,
fontFamily: "'DM Mono', monospace",
fontSize: 9,
color: actionState[item.work_item_id].data?.allowed === false ? '#7c5a10' : '#166534',
lineHeight: 1.4,
overflowWrap: 'anywhere',
}}>
<SearchCheck size={12} style={{ flexShrink: 0, marginTop: 1 }} />
<span>
{actionState[item.work_item_id].data?.allowed === false
? t('dryRunBlocked')
: t('dryRunResult', {
mode: actionState[item.work_item_id].data?.mode ?? '--',
result: actionState[item.work_item_id].data?.verification_result_preview ?? '--',
tools: actionState[item.work_item_id].data?.post_state_summary?.tool_count ?? 0,
})}
</span>
</div>
<>
<div style={{
display: 'flex',
alignItems: 'flex-start',
gap: 5,
fontFamily: "'DM Mono', monospace",
fontSize: 9,
color: actionState[item.work_item_id].data?.allowed === false ? '#7c5a10' : '#166534',
lineHeight: 1.4,
overflowWrap: 'anywhere',
}}>
<SearchCheck size={12} style={{ flexShrink: 0, marginTop: 1 }} />
<span>
{actionState[item.work_item_id].data?.allowed === false
? t('dryRunBlocked')
: t('dryRunResult', {
mode: actionState[item.work_item_id].data?.mode ?? '--',
result: actionState[item.work_item_id].data?.verification_result_preview ?? '--',
tools: actionState[item.work_item_id].data?.post_state_summary?.tool_count ?? 0,
})}
</span>
</div>
{actionState[item.work_item_id].data?.history?.recorded && (
<div style={{
fontFamily: "'DM Mono', monospace",
fontSize: 9,
color: '#166534',
lineHeight: 1.4,
overflowWrap: 'anywhere',
}}>
{t('dryRunHistoryRecorded')}
</div>
)}
</>
)}
{actionState[item.work_item_id]?.status === 'error' && (
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#FF3300', lineHeight: 1.4 }}>

View File

@@ -1,3 +1,30 @@
## 2026-05-14 | T26 Remediation dry-run 寫入 history:試跑結果不再只停在前端暫存
**背景**:T25 已讓 Operator 能在 `/governance` 補救佇列點「試跑」,但結果只存在當次 UI state。這仍無法完全回答「這次 dry-run 是否真的發生、跑到哪個流程、MCP 有沒有用到、後續是否能從 Incident history 回看」。
**修正**
- `Adr100RemediationService.dry_run()` 完成後會寫入兩條既有稽核軌道,不新增資料表:
- `alert_operation_log`:使用 `PRE_FLIGHT_PASSED` / `PRE_FLIGHT_FAILED`,context schema 為 `adr100_remediation_dry_run_history_v1`,保留 `work_item_id / auto_repair_id / playbook_id / mode / checks / post_state_summary / mcp_route / writes_*`。
- `timeline_events`:寫 `event_type=verifier`、title `ADR-100 remediation dry-run`,讓 Incident Timeline 能看到 verifier 階段真的有 dry-run。
- dry-run API response 新增 `history.recorded / alert_operation_id / timeline_event_id`
- `/governance` 補救佇列試跑完成後,如果 history 寫入成功會顯示「已寫入歷史」。
**本地驗證**
- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/tests/test_adr100_remediation_service.py`:pass。
- `ruff check --select F,E9 apps/api/src/services/adr100_remediation_service.py apps/api/tests/test_adr100_remediation_service.py`:pass。
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_ai_governance_endpoints.py -q`:35 passed。
- `pnpm --filter @awoooi/web typecheck`:pass。
- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`:pass。
- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`:pass。
**推版與 production 驗證**
- 待 T26 commit 推 Gitea main 後驗證。
**目前整體進度**
- Alertmanager 低風險自動修復主線:約 98%。
- 完整 AI 自動化管理產品化:約 91%。
- T26 補上 dry-run 的 durable history。下一段應把這些 history 聚合回「工作鏈路 / Incident 詳情 / Telegram 詳情」同一視角,讓使用者不必猜流程走到哪一格。
## 2026-05-14 | T25 補救佇列新增安全試跑入口:replay/reverify 可先讀證據不改狀態
**背景**:T24 已把 non-success verifier rows 轉成 `remediation_queue`,但 Operator 仍只能看見「應該 replay / reverify」,無法從前端或 API 直接觸發一個安全、可觀測、低風險的試跑步驟。這會讓「AI 可接手」停在文字標籤,還沒有形成可操作入口。