diff --git a/apps/api/src/services/adr100_remediation_service.py b/apps/api/src/services/adr100_remediation_service.py index 1dc0e97e..4aaf988b 100644 --- a/apps/api/src/services/adr100_remediation_service.py +++ b/apps/api/src/services/adr100_remediation_service.py @@ -55,11 +55,17 @@ class Adr100RemediationService: incident_repository: _IncidentRepository | None = None, auto_repair_service: AutoRepairService | None = None, verifier: PostExecutionVerifier | None = None, + timeline_service: Any | None = None, + alert_operation_log_repository: Any | None = None, + record_history: bool = True, ) -> None: self._slo_service = slo_service or get_adr100_slo_status_service() self._incident_repository = incident_repository or IncidentDBRepository() self._auto_repair_service = auto_repair_service or AutoRepairService() self._verifier = verifier or get_post_execution_verifier() + self._timeline_service = timeline_service + self._alert_operation_log_repository = alert_operation_log_repository + self._record_history_enabled = record_history async def preview(self, work_item_id: str, mode: RemediationMode = "auto") -> dict[str, Any]: """Return the safe execution plan for a remediation queue item.""" @@ -98,7 +104,9 @@ class Adr100RemediationService: }) if incident is None or not all(check["passed"] for check in checks): - return _dry_run_blocked_payload(item, selected_mode, checks) + payload = _dry_run_blocked_payload(item, selected_mode, checks) + payload["history"] = await self._record_dry_run_history(item, payload) + return payload if selected_mode == "replay": return await self._dry_run_replay(item, incident, checks) @@ -131,7 +139,7 @@ class Adr100RemediationService: action_taken = f"dry_run_reverify:{item.get('playbook_id') or 'unknown'}" result = _assess_recovery(None, post_state, action_taken) - return _dry_run_result_payload( + payload = _dry_run_result_payload( item=item, mode="reverify", checks=checks, @@ -147,6 +155,8 @@ class Adr100RemediationService: }, }, ) + payload["history"] = await self._record_dry_run_history(item, payload) + return payload async def _dry_run_replay( self, @@ -169,7 +179,7 @@ class Adr100RemediationService: action_taken = f"dry_run_replay:{item.get('playbook_id') or 'unknown'}" result = _assess_recovery(None, post_state, action_taken) - return _dry_run_result_payload( + payload = _dry_run_result_payload( item=item, mode="replay", checks=checks, @@ -181,6 +191,8 @@ class Adr100RemediationService: "promql": _promql_for_incident(incident), }, ) + payload["history"] = await self._record_dry_run_history(item, payload) + return payload async def _collect_current_state(self, incident: Incident) -> dict[str, Any]: try: @@ -202,6 +214,81 @@ class Adr100RemediationService: ) return {} + async def _record_dry_run_history( + self, + item: dict[str, Any], + payload: dict[str, Any], + ) -> dict[str, Any]: + if not self._record_history_enabled: + return {"recorded": False, "reason": "disabled"} + + incident_id = str(item.get("incident_id") or "") + if not incident_id: + return {"recorded": False, "reason": "missing_incident_id"} + + history: dict[str, Any] = { + "recorded": False, + "alert_operation_id": None, + "timeline_event_id": None, + } + context = _history_context(item, payload) + allowed = bool(payload.get("allowed")) + + try: + repo = self._alert_operation_log_repository + if repo is None: + from src.repositories.alert_operation_log_repository import ( + get_alert_operation_log_repository, + ) + + repo = get_alert_operation_log_repository() + record = await repo.append( + "PRE_FLIGHT_PASSED" if allowed else "PRE_FLIGHT_FAILED", + incident_id=incident_id, + auto_repair_id=str(item.get("auto_repair_id") or "") or None, + actor="adr100_remediation_service", + action_detail=f"adr100_remediation_dry_run:{payload.get('mode')}"[:200], + success=allowed, + context=context, + ) + if record is not None: + history["alert_operation_id"] = getattr(record, "id", None) + except Exception as exc: + logger.warning( + "adr100_remediation_alert_operation_history_failed", + incident_id=incident_id, + error=str(exc), + ) + + try: + timeline = self._timeline_service + if timeline is None: + from src.services.approval_db import get_timeline_service + + timeline = get_timeline_service() + event = await timeline.add_event( + event_type="verifier", + status=_timeline_status(payload), + title="ADR-100 remediation dry-run", + description=_history_description(context), + actor="adr100_remediation_service", + actor_role=str(payload.get("mode") or "dry_run"), + incident_id=incident_id, + ) + if event: + history["timeline_event_id"] = event.get("id") + except Exception as exc: + logger.warning( + "adr100_remediation_timeline_history_failed", + incident_id=incident_id, + error=str(exc), + ) + + history["recorded"] = bool( + history.get("alert_operation_id") or history.get("timeline_event_id") + ) + return history + def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay"]: if requested in ("reverify", "replay"): @@ -313,6 +400,48 @@ def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]: } +def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str, Any]: + return { + "schema_version": "adr100_remediation_dry_run_history_v1", + "work_item_id": item.get("work_item_id"), + "auto_repair_id": item.get("auto_repair_id"), + "playbook_id": item.get("playbook_id"), + "alertname": item.get("alertname"), + "mode": payload.get("mode"), + "allowed": payload.get("allowed"), + "executed": payload.get("executed"), + "safety_level": payload.get("safety_level"), + "writes_incident_state": payload.get("writes_incident_state"), + "writes_auto_repair_result": payload.get("writes_auto_repair_result"), + "verification_result_preview": payload.get("verification_result_preview"), + "post_state_summary": payload.get("post_state_summary"), + "mcp_route": payload.get("mcp_route"), + "checks": payload.get("checks"), + } + + +def _timeline_status(payload: dict[str, Any]) -> str: + if not payload.get("allowed"): + return "warning" + if payload.get("verification_result_preview") == "success": + return "success" + return "warning" + + +def _history_description(context: dict[str, Any]) -> str: + tool_count = (context.get("post_state_summary") or {}).get("tool_count", 0) + route = context.get("mcp_route") or {} + agent = route.get("agent_id") or "unknown_agent" + tool = route.get("tool_name") or "current_state" + return ( + f"mode={context.get('mode')} " + f"preview={context.get('verification_result_preview')} " + f"tools={tool_count} route={agent}/{tool} " + f"writes_incident={context.get('writes_incident_state')} " + f"writes_auto_repair={context.get('writes_auto_repair_result')}" + )[:500] + + def _diagnostic_command_for_incident(incident: Incident) -> str: labels = _labels_for_incident(incident) host = str(labels.get("host") or labels.get("instance") or "{host}") diff --git a/apps/api/tests/test_adr100_remediation_service.py b/apps/api/tests/test_adr100_remediation_service.py index da365b43..f1bbbe6b 100644 --- a/apps/api/tests/test_adr100_remediation_service.py +++ b/apps/api/tests/test_adr100_remediation_service.py @@ -51,6 +51,24 @@ class _FakeVerifier: return self.state +class _FakeAlertOperationLogRepository: + def __init__(self) -> None: + self.calls: list[dict[str, Any]] = [] + + async def append(self, event_type: str, **kwargs: Any): + self.calls.append({"event_type": event_type, **kwargs}) + return type("AlertOperationRecord", (), {"id": "aol-1"})() + + +class _FakeTimelineService: + def __init__(self) -> None: + self.calls: list[dict[str, Any]] = [] + + async def add_event(self, **kwargs: Any) -> dict[str, Any]: + self.calls.append(kwargs) + return {"id": "timeline-1"} + + class _NoopPlaybookService: async def get_recommendations(self, *_args, **_kwargs): # noqa: ANN002, ANN003 return [] @@ -111,6 +129,9 @@ def _service( item: dict[str, Any], incident: Incident | None = None, state: dict[str, Any] | None = None, + timeline_service: Any | None = None, + alert_operation_log_repository: Any | None = None, + record_history: bool = False, ) -> Adr100RemediationService: return Adr100RemediationService( slo_service=_FakeSloService([item]), @@ -120,6 +141,9 @@ def _service( cooldown_checker=_no_cooldown, ), verifier=_FakeVerifier(state or {"k8s_get_pod_status": {"phase": "Running"}}), + timeline_service=timeline_service, + alert_operation_log_repository=alert_operation_log_repository, + record_history=record_history, ) @@ -156,6 +180,7 @@ async def test_dry_run_reverify_collects_state_without_writes(): assert result["post_state_summary"]["tool_count"] == 1 assert result["mcp_route"]["agent_id"] == "post_execution_verifier" assert result["mcp_route"]["required_scope"] == "read" + assert result["history"]["recorded"] is False @pytest.mark.asyncio @@ -187,6 +212,36 @@ async def test_dry_run_blocks_when_incident_missing(): assert any(check["name"] == "incident_loaded" and not check["passed"] for check in result["checks"]) +@pytest.mark.asyncio +async def test_dry_run_records_alert_operation_and_timeline_history(): + alert_repo = _FakeAlertOperationLogRepository() + timeline = _FakeTimelineService() + svc = _service( + item=_queue_item(), + timeline_service=timeline, + alert_operation_log_repository=alert_repo, + record_history=True, + ) + + result = await svc.dry_run("verification:INC-20260514-TEST01:are-1") + + assert result["history"] == { + "recorded": True, + "alert_operation_id": "aol-1", + "timeline_event_id": "timeline-1", + } + assert alert_repo.calls[0]["event_type"] == "PRE_FLIGHT_PASSED" + assert alert_repo.calls[0]["incident_id"] == "INC-20260514-TEST01" + assert alert_repo.calls[0]["success"] is True + assert alert_repo.calls[0]["context"]["schema_version"] == ( + "adr100_remediation_dry_run_history_v1" + ) + assert alert_repo.calls[0]["context"]["writes_incident_state"] is False + assert timeline.calls[0]["event_type"] == "verifier" + assert timeline.calls[0]["status"] == "success" + assert timeline.calls[0]["actor_role"] == "replay" + + @pytest.mark.asyncio async def test_missing_work_item_raises_not_found(): svc = _service(item=_queue_item()) diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index eaf4c8e9..7e33410e 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -1417,6 +1417,7 @@ "dryRunButton": "Dry run", "dryRunLoading": "Running", "dryRunResult": "{mode}; preview {result}; tools {tools}", + "dryRunHistoryRecorded": "History recorded", "dryRunBlocked": "Dry run blocked", "dryRunError": "Dry run failed", "state": { diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 94bcbd58..13d31898 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1418,6 +1418,7 @@ "dryRunButton": "試跑", "dryRunLoading": "試跑中", "dryRunResult": "{mode};預覽 {result};工具 {tools}", + "dryRunHistoryRecorded": "已寫入歷史", "dryRunBlocked": "試跑未放行", "dryRunError": "試跑失敗", "state": { diff --git a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx index 0b4981d3..58efdd37 100644 --- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx +++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx @@ -159,6 +159,9 @@ interface RemediationDryRunResponse { tool_name?: string required_scope?: string } | null + history?: { + recorded?: boolean + } } interface RemediationActionState { @@ -490,27 +493,40 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification {actionState[item.work_item_id]?.status === 'loading' ? t('dryRunLoading') : t('dryRunButton')} {actionState[item.work_item_id]?.status === 'done' && ( -