diff --git a/apps/api/src/api/v1/ai_slo.py b/apps/api/src/api/v1/ai_slo.py index b8154fda..7e8bc59b 100644 --- a/apps/api/src/api/v1/ai_slo.py +++ b/apps/api/src/api/v1/ai_slo.py @@ -118,3 +118,18 @@ async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict: ) except RemediationNotFoundError as exc: raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc + + +@router.get("/ai/slo/remediation/history") +async def list_ai_slo_remediation_history( + limit: int = Query(50, ge=1, le=200), + incident_id: str | None = Query(default=None, min_length=1), + work_item_id: str | None = Query(default=None, min_length=1), +) -> dict: + """List durable ADR-100 remediation dry-run history from alert_operation_log.""" + + return await get_adr100_remediation_service().history( + limit=limit, + incident_id=incident_id, + work_item_id=work_item_id, + ) diff --git a/apps/api/src/services/adr100_remediation_service.py b/apps/api/src/services/adr100_remediation_service.py index 4aaf988b..d67c5d76 100644 --- a/apps/api/src/services/adr100_remediation_service.py +++ b/apps/api/src/services/adr100_remediation_service.py @@ -112,6 +112,67 @@ class Adr100RemediationService: return await self._dry_run_replay(item, incident, checks) return await self._dry_run_reverify(item, incident, checks) + async def history( + self, + *, + limit: int = 50, + incident_id: str | None = None, + work_item_id: str | None = None, + ) -> dict[str, Any]: + """Return durable dry-run history written by this remediation service.""" + + safe_limit = max(1, min(limit, 200)) + fetch_limit = min(max(safe_limit * 4, 50), 200) + rows: list[Any] = [] + repo = self._alert_operation_log_repository + if repo is None: + from src.repositories.alert_operation_log_repository import ( + get_alert_operation_log_repository, + ) + + repo = get_alert_operation_log_repository() + + for event_type in ("PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED"): + try: + batch, _total = await repo.list_recent( + limit=fetch_limit, + event_type=event_type, + incident_id=incident_id, + ) + rows.extend(batch) + except Exception as exc: + logger.warning( + "adr100_remediation_history_fetch_failed", + event_type=event_type, + incident_id=incident_id, + error=str(exc), + ) + + rows.sort(key=_record_created_at, reverse=True) + + items: list[dict[str, Any]] = [] + for row in rows: + context = getattr(row, "context", None) or {} + if context.get("schema_version") != "adr100_remediation_dry_run_history_v1": + continue + if work_item_id and context.get("work_item_id") != work_item_id: + continue + items.append(_history_item(row, context)) + if len(items) >= safe_limit: + break + + return { + "schema_version": "adr100_remediation_history_v1", + "total": len(items), + "limit": safe_limit, + "filters": { + "incident_id": incident_id, + "work_item_id": work_item_id, + }, + "items": items, + "by_work_item": _summarize_history_by_work_item(items), + } + async def _find_work_item(self, work_item_id: str) -> dict[str, Any]: report = await self._slo_service.fetch_report() coverage = report.get("verification_coverage") or {} @@ -442,6 +503,66 @@ def _history_description(context: dict[str, Any]) -> str: )[:500] +def _record_created_at(record: Any) -> str: + value = getattr(record, "created_at", None) + if hasattr(value, "isoformat"): + return value.isoformat() + return str(value or "") + + +def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]: + route = context.get("mcp_route") or {} + post_state = context.get("post_state_summary") or {} + return { + "id": str(getattr(record, "id", "")), + "incident_id": getattr(record, "incident_id", None), + "auto_repair_id": getattr(record, "auto_repair_id", None) + or context.get("auto_repair_id"), + "event_type": str(getattr(record, "event_type", "")), + "actor": getattr(record, "actor", None), + "success": getattr(record, "success", None), + "created_at": _record_created_at(record), + "work_item_id": context.get("work_item_id"), + "playbook_id": context.get("playbook_id"), + "alertname": context.get("alertname"), + "mode": context.get("mode"), + "allowed": context.get("allowed"), + "executed": context.get("executed"), + "safety_level": context.get("safety_level"), + "verification_result_preview": context.get("verification_result_preview"), + "tool_count": post_state.get("tool_count", 0), + "tools": post_state.get("tools") or [], + "agent_id": route.get("agent_id"), + "tool_name": route.get("tool_name") or "current_state", + "required_scope": route.get("required_scope"), + "writes_incident_state": context.get("writes_incident_state"), + "writes_auto_repair_result": context.get("writes_auto_repair_result"), + "checks": context.get("checks") or [], + } + + +def _summarize_history_by_work_item(items: list[dict[str, Any]]) -> list[dict[str, Any]]: + summary: dict[str, dict[str, Any]] = {} + for item in items: + key = str(item.get("work_item_id") or item.get("incident_id") or item.get("id")) + if key not in summary: + summary[key] = { + "work_item_id": item.get("work_item_id"), + "incident_id": item.get("incident_id"), + "count": 0, + "latest_at": item.get("created_at"), + "latest_event_type": item.get("event_type"), + "latest_success": item.get("success"), + "latest_preview": item.get("verification_result_preview"), + "latest_mode": item.get("mode"), + "latest_agent_id": item.get("agent_id"), + "latest_tool_name": item.get("tool_name"), + "required_scope": item.get("required_scope"), + } + summary[key]["count"] += 1 + return list(summary.values()) + + def _diagnostic_command_for_incident(incident: Incident) -> str: labels = _labels_for_incident(incident) host = str(labels.get("host") or labels.get("instance") or "{host}") diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 8c82fbd8..54d36658 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -161,6 +161,40 @@ def _format_automation_quality_lines(quality: dict[str, object] | None) -> list[ return lines +def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]: + if not history or int(history.get("total") or 0) <= 0: + return [] + + items = history.get("items") if isinstance(history.get("items"), list) else [] + latest = items[0] if items and isinstance(items[0], dict) else {} + agent = latest.get("agent_id") or "unknown_agent" + tool = latest.get("tool_name") or "current_state" + scope = latest.get("required_scope") or "unknown" + writes_incident = latest.get("writes_incident_state") + writes_auto_repair = latest.get("writes_auto_repair_result") + + return [ + "", + "🧪 ADR-100 補救試跑", + f"歷史: {int(history.get('total') or 0)} 次", + ( + "上次: " + f"{html.escape(str(latest.get('mode') or 'unknown'))} / " + f"{html.escape(str(latest.get('verification_result_preview') or 'unknown'))}" + ), + ( + "MCP: " + f"{html.escape(str(agent))}/{html.escape(str(tool))} / " + f"{html.escape(str(scope))}" + ), + ( + "寫入: " + f"incident {html.escape(str(writes_incident))} / " + f"auto-repair {html.escape(str(writes_auto_repair))}" + ), + ] + + def _sanitize_telegram_error(text: str) -> str: """遮蔽 Telegram Bot URL 中的 token,避免例外字串污染 log / trace。""" return _TELEGRAM_BOT_URL_RE.sub(r"\1", text) @@ -5276,6 +5310,23 @@ class TelegramGateway: + html.escape(", ".join(mismatch_codes[:4])) ) + try: + from src.services.adr100_remediation_service import ( + get_adr100_remediation_service, + ) + + remediation_history = await get_adr100_remediation_service().history( + limit=5, + incident_id=incident_id, + ) + lines += _format_remediation_history_lines(remediation_history) + except Exception as remediation_exc: + logger.warning( + "incident_detail_remediation_history_summary_failed", + incident_id=incident_id, + error=str(remediation_exc), + ) + try: from src.services.awooop_truth_chain_service import fetch_truth_chain @@ -5298,6 +5349,23 @@ class TelegramGateway: error=str(truth_exc), ) + try: + from src.services.adr100_remediation_service import ( + get_adr100_remediation_service, + ) + + remediation_history = await get_adr100_remediation_service().history( + limit=5, + incident_id=incident_id, + ) + lines += _format_remediation_history_lines(remediation_history) + except Exception as remediation_exc: + logger.warning( + "incident_history_remediation_summary_failed", + incident_id=incident_id, + error=str(remediation_exc), + ) + await self.send_notification("\n".join(lines)) except Exception as e: diff --git a/apps/api/tests/test_adr100_remediation_service.py b/apps/api/tests/test_adr100_remediation_service.py index f1bbbe6b..08f94f2a 100644 --- a/apps/api/tests/test_adr100_remediation_service.py +++ b/apps/api/tests/test_adr100_remediation_service.py @@ -54,10 +54,42 @@ class _FakeVerifier: class _FakeAlertOperationLogRepository: def __init__(self) -> None: self.calls: list[dict[str, Any]] = [] + self.records: list[Any] = [] async def append(self, event_type: str, **kwargs: Any): self.calls.append({"event_type": event_type, **kwargs}) - return type("AlertOperationRecord", (), {"id": "aol-1"})() + record = type( + "AlertOperationRecord", + (), + { + "id": f"aol-{len(self.records) + 1}", + "incident_id": kwargs.get("incident_id"), + "auto_repair_id": kwargs.get("auto_repair_id"), + "event_type": event_type, + "actor": kwargs.get("actor"), + "success": kwargs.get("success"), + "context": kwargs.get("context") or {}, + "created_at": datetime(2026, 5, 14, 14, 45, len(self.records), tzinfo=timezone.utc), + }, + )() + self.records.append(record) + return record + + async def list_recent( + self, + limit: int = 50, + offset: int = 0, + event_type: str | None = None, + incident_id: str | None = None, + ): + rows = [ + record + for record in self.records + if (event_type is None or record.event_type == event_type) + and (incident_id is None or record.incident_id == incident_id) + ] + rows = sorted(rows, key=lambda record: record.created_at, reverse=True) + return rows[offset:offset + limit], len(rows) class _FakeTimelineService: @@ -242,6 +274,29 @@ async def test_dry_run_records_alert_operation_and_timeline_history(): assert timeline.calls[0]["actor_role"] == "replay" +@pytest.mark.asyncio +async def test_history_lists_dry_run_records_grouped_by_work_item(): + alert_repo = _FakeAlertOperationLogRepository() + svc = _service( + item=_queue_item(), + alert_operation_log_repository=alert_repo, + record_history=True, + ) + + await svc.dry_run("verification:INC-20260514-TEST01:are-1") + history = await svc.history(limit=10) + + assert history["schema_version"] == "adr100_remediation_history_v1" + assert history["total"] == 1 + assert history["items"][0]["work_item_id"] == "verification:INC-20260514-TEST01:are-1" + assert history["items"][0]["agent_id"] == "auto_repair_executor" + assert history["items"][0]["tool_name"] == "ssh_diagnose" + assert history["items"][0]["required_scope"] == "read" + assert history["items"][0]["writes_incident_state"] is False + assert history["by_work_item"][0]["count"] == 1 + assert history["by_work_item"][0]["latest_tool_name"] == "ssh_diagnose" + + @pytest.mark.asyncio async def test_missing_work_item_raises_not_found(): svc = _service(item=_queue_item()) @@ -261,6 +316,24 @@ def test_ai_slo_remediation_endpoints(monkeypatch): async def dry_run(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]: return {"work_item_id": work_item_id, "mode": mode, "executed": True} + async def history( + self, + *, + limit: int = 50, + incident_id: str | None = None, + work_item_id: str | None = None, + ) -> dict[str, Any]: + return { + "schema_version": "adr100_remediation_history_v1", + "limit": limit, + "filters": { + "incident_id": incident_id, + "work_item_id": work_item_id, + }, + "items": [], + "by_work_item": [], + } + monkeypatch.setattr( "src.api.v1.ai_slo.get_adr100_remediation_service", lambda: _FakeService(), @@ -275,8 +348,15 @@ def test_ai_slo_remediation_endpoints(monkeypatch): "/api/v1/ai/slo/remediation/dry-run", json={"work_item_id": "verification:INC:are-1", "mode": "replay"}, ) + history = client.get( + "/api/v1/ai/slo/remediation/history", + params={"limit": 10, "work_item_id": "verification:INC:are-1"}, + ) assert preview.status_code == 200 assert preview.json()["mode"] == "reverify" assert dry_run.status_code == 200 assert dry_run.json()["executed"] is True + assert history.status_code == 200 + assert history.json()["schema_version"] == "adr100_remediation_history_v1" + assert history.json()["filters"]["work_item_id"] == "verification:INC:are-1" diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 7e33410e..04b45a6f 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -1418,6 +1418,7 @@ "dryRunLoading": "Running", "dryRunResult": "{mode}; preview {result}; tools {tools}", "dryRunHistoryRecorded": "History recorded", + "dryRunHistorySummary": "History {count}x; last {time}; {route}", "dryRunBlocked": "Dry run blocked", "dryRunError": "Dry run failed", "state": { diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 13d31898..aad30e3d 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -1419,6 +1419,7 @@ "dryRunLoading": "試跑中", "dryRunResult": "{mode};預覽 {result};工具 {tools}", "dryRunHistoryRecorded": "已寫入歷史", + "dryRunHistorySummary": "歷史 {count} 次;上次 {time};{route}", "dryRunBlocked": "試跑未放行", "dryRunError": "試跑失敗", "state": { diff --git a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx index 58efdd37..bc477866 100644 --- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx +++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx @@ -164,6 +164,24 @@ interface RemediationDryRunResponse { } } +interface RemediationHistoryWorkItemSummary { + work_item_id?: string | null + incident_id?: string | null + count: number + latest_at?: string | null + latest_preview?: string | null + latest_mode?: string | null + latest_agent_id?: string | null + latest_tool_name?: string | null + required_scope?: string | null +} + +interface RemediationHistoryResponse { + schema_version?: string + total?: number + by_work_item?: RemediationHistoryWorkItemSummary[] +} + interface RemediationActionState { status: 'loading' | 'done' | 'error' data?: RemediationDryRunResponse @@ -255,6 +273,34 @@ async function requestRemediationDryRun(workItemId: string): Promise { + const params = new URLSearchParams({ limit: '80' }) + const response = await fetch(`${API_BASE}/api/v1/ai/slo/remediation/history?${params}`) + if (!response.ok) throw new Error(`history_failed:${response.status}`) + return response.json() +} + +function formatShortDateTime(value?: string | null): string { + if (!value) return '--' + const date = new Date(value) + if (Number.isNaN(date.getTime())) return value + return date.toLocaleString(undefined, { + month: '2-digit', + day: '2-digit', + hour: '2-digit', + minute: '2-digit', + }) +} + +function historyRouteLabel(summary: RemediationHistoryWorkItemSummary): string { + return compactLabel( + [summary.latest_agent_id, summary.latest_tool_name] + .filter(Boolean) + .join('/') || summary.required_scope || '--', + '--', + ) +} + function buildMetrics(api: SloApiResponse): SloMetric[] { const adr100Metrics = api.adr100?.metrics if (adr100Metrics?.length) { @@ -312,6 +358,7 @@ function buildMetrics(api: SloApiResponse): SloMetric[] { function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) { const t = useTranslations('governance.slo.coverage') const [actionState, setActionState] = useState>({}) + const [remediationHistory, setRemediationHistory] = useState(null) const color = coverageTone(coverage?.status) const rows = [ { label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') }, @@ -322,6 +369,28 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? [] const recentFindings = coverage?.recent_non_success ?? [] const remediationQueue = coverage?.remediation_queue + const remediationQueueTotal = remediationQueue?.total ?? 0 + const historyByWorkItem = new Map( + (remediationHistory?.by_work_item ?? []) + .filter(item => item.work_item_id) + .map(item => [item.work_item_id as string, item]), + ) + + useEffect(() => { + let cancelled = false + if (remediationQueueTotal === 0) { + setRemediationHistory(null) + return () => { cancelled = true } + } + requestRemediationHistory() + .then(data => { + if (!cancelled) setRemediationHistory(data) + }) + .catch(() => { + if (!cancelled) setRemediationHistory(null) + }) + return () => { cancelled = true } + }, [remediationQueueTotal]) const handleDryRun = async (workItemId: string) => { setActionState(prev => ({ @@ -334,6 +403,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification ...prev, [workItemId]: { status: 'done', data }, })) + requestRemediationHistory() + .then(setRemediationHistory) + .catch(() => undefined) } catch { setActionState(prev => ({ ...prev, @@ -431,7 +503,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
- {(remediationQueue.items ?? []).slice(0, 4).map(item => ( + {(remediationQueue.items ?? []).slice(0, 4).map(item => { + const historySummary = historyByWorkItem.get(item.work_item_id) + return (
)} + {historySummary && ( +
+ {t('dryRunHistorySummary', { + count: historySummary.count, + time: formatShortDateTime(historySummary.latest_at), + route: historyRouteLabel(historySummary), + })} +
+ )} {actionState[item.work_item_id]?.status === 'error' && (
{t('dryRunError')} @@ -535,7 +624,8 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification )}
- ))} + ) + })}
)} diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 0c52c63a..241e1d5c 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,30 @@ +## 2026-05-14 | T27 Remediation history read model,前端與 Telegram 可看見試跑重複次數與 MCP 路徑 + +**背景**:T26 已讓 ADR-100 remediation dry-run 寫入 `alert_operation_log` 與 `timeline_events`,但 governance 頁重新整理後仍看不到某筆補救工作過去試跑幾次、上次跑到哪個 preview、是否有用 MCP、是否仍只讀。Telegram 詳情 / 歷史也還沒有把這段 dry-run history 明確帶出。 + +**修正**: +- `Adr100RemediationService.history()` 新增 `adr100_remediation_history_v1` read model,從既有 `alert_operation_log` 讀 `adr100_remediation_dry_run_history_v1` context,不新增資料表。 +- 新增 `GET /api/v1/ai/slo/remediation/history`,支援 `limit / incident_id / work_item_id`,回傳 `items` 與 `by_work_item` 聚合,包含 count、latest preview、agent、tool、scope、writes flags。 +- `/governance` 補救工作佇列會讀 history endpoint;每筆工作顯示「歷史 N 次;上次時間;agent/tool」,點試跑成功後會重新整理 history,不只依賴當次 UI state。 +- Telegram `detail:{incident_id}` 與 `history:{incident_id}` 會補上 `ADR-100 補救試跑` 摘要,包含歷史次數、上次 mode/preview、MCP agent/tool/scope、是否寫 incident/auto-repair 狀態。 + +**本地驗證**: +- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/telegram_gateway.py apps/api/tests/test_adr100_remediation_service.py`:pass。 +- `ruff check --select F,E9 src/services/adr100_remediation_service.py src/api/v1/ai_slo.py src/services/telegram_gateway.py tests/test_adr100_remediation_service.py`:pass。 +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_ai_governance_endpoints.py -q`:36 passed。 +- i18n JSON parse / `git diff --check`:pass。 +- `pnpm --filter @awoooi/web typecheck`:pass。 +- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`:pass。 +- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`:pass。 + +**推版與 production 驗證**: +- 待 T27 commit 推 Gitea main 後驗證。 + +**目前整體進度**: +- Alertmanager 低風險自動修復主線:約 98%。 +- 完整 AI 自動化管理產品化:約 92%。 +- T27 補上「試跑歷史」read model 與 UI/Telegram 可讀摘要。下一段應把 Incident 詳情頁的 stage event 展開成更完整的工作鏈路,而不是只顯示壓縮 ascii timeline。 + ## 2026-05-14 | T26 Remediation dry-run 寫入 history,試跑結果不再只停在前端暫存 **背景**:T25 已讓 Operator 能在 `/governance` 補救佇列點「試跑」,但結果只存在當次 UI state。這仍無法完全回答「這次 dry-run 是否真的發生、跑到哪個流程、MCP 有沒有用到、後續是否能從 Incident history 回看」。