feat(governance): surface remediation dry run history

2026-05-14 22:56:51 +08:00
parent 53cd7f9d66
commit 392cfb9025
8 changed files with 406 additions and 3 deletions
--- a/apps/api/src/api/v1/ai_slo.py
+++ b/apps/api/src/api/v1/ai_slo.py
@@ -118,3 +118,18 @@ async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
        )
    except RemediationNotFoundError as exc:
        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
+
+
+@router.get("/ai/slo/remediation/history")
+async def list_ai_slo_remediation_history(
+    limit: int = Query(50, ge=1, le=200),
+    incident_id: str | None = Query(default=None, min_length=1),
+    work_item_id: str | None = Query(default=None, min_length=1),
+) -> dict:
+    """List durable ADR-100 remediation dry-run history from alert_operation_log.
+
+    Args:
+        limit: Maximum number of history items to return; validated to the
+            range 1-200 and defaulting to 50.
+        incident_id: Optional non-empty incident filter, forwarded unchanged.
+        work_item_id: Optional non-empty work-item filter, forwarded unchanged.
+
+    Returns:
+        The payload produced by the remediation service's ``history()`` call.
+    """
+
+    # Thin pass-through: the endpoint only validates query parameters and
+    # forwards them to the service.
+    return await get_adr100_remediation_service().history(
+        limit=limit,
+        incident_id=incident_id,
+        work_item_id=work_item_id,
+    )
--- a/apps/api/src/services/adr100_remediation_service.py
+++ b/apps/api/src/services/adr100_remediation_service.py
@@ -112,6 +112,67 @@ class Adr100RemediationService:
            return await self._dry_run_replay(item, incident, checks)
        return await self._dry_run_reverify(item, incident, checks)

+    async def history(
+        self,
+        *,
+        limit: int = 50,
+        incident_id: str | None = None,
+        work_item_id: str | None = None,
+    ) -> dict[str, Any]:
+        """Return durable dry-run history written by this remediation service.
+
+        Reads recent ``PRE_FLIGHT_PASSED`` / ``PRE_FLIGHT_FAILED`` rows from the
+        alert operation log, keeps the rows whose context carries the
+        ``adr100_remediation_dry_run_history_v1`` schema marker, and returns
+        them newest first.
+
+        Args:
+            limit: Maximum number of entries in ``items``; clamped to 1-200.
+            incident_id: Optional incident filter, forwarded to the repository
+                query.
+            work_item_id: Optional work-item filter, applied in memory against
+                each row's context.
+
+        Returns:
+            A dict with ``schema_version``, ``total``, ``limit``, ``filters``,
+            ``items`` and ``by_work_item``. ``items`` holds at most ``limit``
+            entries. ``total`` and the ``by_work_item`` counts cover every
+            matching row inside the fetched window (at most 200 rows per event
+            type), so a small ``limit`` does not cap the reported run count.
+        """
+
+        safe_limit = max(1, min(limit, 200))
+        # Over-fetch because schema and work-item filtering happen after the
+        # repository query and may discard rows.
+        fetch_limit = min(max(safe_limit * 4, 50), 200)
+        rows: list[Any] = []
+        repo = self._alert_operation_log_repository
+        if repo is None:
+            from src.repositories.alert_operation_log_repository import (
+                get_alert_operation_log_repository,
+            )
+
+            repo = get_alert_operation_log_repository()
+
+        for event_type in ("PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED"):
+            try:
+                batch, _total = await repo.list_recent(
+                    limit=fetch_limit,
+                    event_type=event_type,
+                    incident_id=incident_id,
+                )
+                rows.extend(batch)
+            except Exception as exc:
+                # Best effort: a failed fetch for one event type is logged and
+                # the other event type is still attempted.
+                logger.warning(
+                    "adr100_remediation_history_fetch_failed",
+                    event_type=event_type,
+                    incident_id=incident_id,
+                    error=str(exc),
+                )
+
+        rows.sort(key=_record_created_at, reverse=True)
+
+        matched: list[dict[str, Any]] = []
+        for row in rows:
+            context = getattr(row, "context", None) or {}
+            if not isinstance(context, dict):
+                # A non-mapping context cannot carry the schema marker; skip
+                # it instead of failing the whole listing on ``.get``.
+                continue
+            if context.get("schema_version") != "adr100_remediation_dry_run_history_v1":
+                continue
+            if work_item_id and context.get("work_item_id") != work_item_id:
+                continue
+            matched.append(_history_item(row, context))
+
+        # Only the returned page is truncated; counts are taken from every
+        # match so callers asking for a short page still see real run counts.
+        items = matched[:safe_limit]
+
+        return {
+            "schema_version": "adr100_remediation_history_v1",
+            "total": len(matched),
+            "limit": safe_limit,
+            "filters": {
+                "incident_id": incident_id,
+                "work_item_id": work_item_id,
+            },
+            "items": items,
+            "by_work_item": _summarize_history_by_work_item(matched),
+        }
+
+
    async def _find_work_item(self, work_item_id: str) -> dict[str, Any]:
        report = await self._slo_service.fetch_report()
        coverage = report.get("verification_coverage") or {}
@@ -442,6 +503,66 @@ def _history_description(context: dict[str, Any]) -> str:
    )[:500]


+def _record_created_at(record: Any) -> str:
+    """Return the record's ``created_at`` as a string usable as a sort key.
+
+    Values exposing ``isoformat`` are rendered through it; any other truthy
+    value is passed through ``str``; a missing or falsy value becomes ``""``.
+    """
+    stamp = getattr(record, "created_at", None)
+    if not hasattr(stamp, "isoformat"):
+        return str(stamp) if stamp else ""
+    return stamp.isoformat()
+
+
+def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
+    """Flatten one operation-log record and its context into a history entry.
+
+    Record attributes are read with ``getattr`` so any object shape works;
+    everything else comes from ``context``, including the nested
+    ``mcp_route`` and ``post_state_summary`` mappings.
+    """
+    mcp_route = context.get("mcp_route") or {}
+    post_state_summary = context.get("post_state_summary") or {}
+
+    # Fields taken from the log record itself.
+    entry: dict[str, Any] = {
+        "id": str(getattr(record, "id", "")),
+        "incident_id": getattr(record, "incident_id", None),
+    }
+    # The record's own auto_repair_id wins; the context value is the fallback.
+    entry["auto_repair_id"] = getattr(record, "auto_repair_id", None) or context.get(
+        "auto_repair_id"
+    )
+    entry["event_type"] = str(getattr(record, "event_type", ""))
+    entry["actor"] = getattr(record, "actor", None)
+    entry["success"] = getattr(record, "success", None)
+    entry["created_at"] = _record_created_at(record)
+
+    # Fields copied verbatim from the dry-run context.
+    for name in (
+        "work_item_id",
+        "playbook_id",
+        "alertname",
+        "mode",
+        "allowed",
+        "executed",
+        "safety_level",
+        "verification_result_preview",
+    ):
+        entry[name] = context.get(name)
+
+    # Tooling summary and MCP route, with defaults for absent values.
+    entry["tool_count"] = post_state_summary.get("tool_count", 0)
+    entry["tools"] = post_state_summary.get("tools") or []
+    entry["agent_id"] = mcp_route.get("agent_id")
+    entry["tool_name"] = mcp_route.get("tool_name") or "current_state"
+    entry["required_scope"] = mcp_route.get("required_scope")
+
+    entry["writes_incident_state"] = context.get("writes_incident_state")
+    entry["writes_auto_repair_result"] = context.get("writes_auto_repair_result")
+    entry["checks"] = context.get("checks") or []
+    return entry
+
+
+def _summarize_history_by_work_item(items: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Group history entries and report a count plus the first entry's details.
+
+    Entries are grouped by work item id, falling back to incident id and then
+    to the entry id. The ``latest_*`` fields are taken from the first entry
+    seen for each group, so the input is expected to be ordered newest first.
+    Groups are returned in first-seen order.
+    """
+    grouped: dict[str, dict[str, Any]] = {}
+    for entry in items:
+        group_key = str(
+            entry.get("work_item_id") or entry.get("incident_id") or entry.get("id")
+        )
+        bucket = grouped.get(group_key)
+        if bucket is None:
+            # First entry of the group seeds every descriptive field.
+            bucket = grouped[group_key] = {
+                "work_item_id": entry.get("work_item_id"),
+                "incident_id": entry.get("incident_id"),
+                "count": 0,
+                "latest_at": entry.get("created_at"),
+                "latest_event_type": entry.get("event_type"),
+                "latest_success": entry.get("success"),
+                "latest_preview": entry.get("verification_result_preview"),
+                "latest_mode": entry.get("mode"),
+                "latest_agent_id": entry.get("agent_id"),
+                "latest_tool_name": entry.get("tool_name"),
+                "required_scope": entry.get("required_scope"),
+            }
+        bucket["count"] += 1
+    return list(grouped.values())
+
+
 def _diagnostic_command_for_incident(incident: Incident) -> str:
    labels = _labels_for_incident(incident)
    host = str(labels.get("host") or labels.get("instance") or "{host}")
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -161,6 +161,40 @@ def _format_automation_quality_lines(quality: dict[str, object] | None) -> list[
    return lines


+def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]:
+    """Render the ADR-100 dry-run history summary as Telegram HTML lines.
+
+    Returns an empty list when there is no history payload or its ``total``
+    is not positive. Otherwise returns a blank separator, a heading, the run
+    count, and three lines describing the first entry of ``items``: mode and
+    preview, MCP agent/tool/scope, and the write flags. Every interpolated
+    value is HTML-escaped.
+    """
+    run_count = int(history.get("total") or 0) if history else 0
+    if run_count <= 0:
+        return []
+
+    raw_items = history.get("items")
+    entries = raw_items if isinstance(raw_items, list) else []
+    newest = entries[0] if entries and isinstance(entries[0], dict) else {}
+
+    def _code(value: object) -> str:
+        # Wrap an escaped value in a <code> tag.
+        return f"<code>{html.escape(str(value))}</code>"
+
+    mode = newest.get("mode") or "unknown"
+    preview = newest.get("verification_result_preview") or "unknown"
+    agent_id = html.escape(str(newest.get("agent_id") or "unknown_agent"))
+    tool_name = html.escape(str(newest.get("tool_name") or "current_state"))
+    required_scope = newest.get("required_scope") or "unknown"
+
+    lines = ["", "🧪 <b>ADR-100 補救試跑</b>"]
+    lines.append(f"歷史: <code>{run_count}</code> 次")
+    lines.append("上次: " + _code(mode) + " / " + _code(preview))
+    lines.append(
+        "MCP: " + f"<code>{agent_id}/{tool_name}</code>" + " / " + _code(required_scope)
+    )
+    lines.append(
+        "寫入: "
+        + "incident "
+        + _code(newest.get("writes_incident_state"))
+        + " / "
+        + "auto-repair "
+        + _code(newest.get("writes_auto_repair_result"))
+    )
+    return lines
+
+
 def _sanitize_telegram_error(text: str) -> str:
    """遮蔽 Telegram Bot URL 中的 token，避免例外字串污染 log / trace。"""
    return _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
@@ -5276,6 +5310,23 @@ class TelegramGateway:
                            + html.escape(", ".join(mismatch_codes[:4]))
                        )

+            try:
+                from src.services.adr100_remediation_service import (
+                    get_adr100_remediation_service,
+                )
+
+                remediation_history = await get_adr100_remediation_service().history(
+                    limit=5,
+                    incident_id=incident_id,
+                )
+                lines += _format_remediation_history_lines(remediation_history)
+            except Exception as remediation_exc:
+                logger.warning(
+                    "incident_detail_remediation_history_summary_failed",
+                    incident_id=incident_id,
+                    error=str(remediation_exc),
+                )
+
            try:
                from src.services.awooop_truth_chain_service import fetch_truth_chain

@@ -5298,6 +5349,23 @@ class TelegramGateway:
                    error=str(truth_exc),
                )

+            try:
+                from src.services.adr100_remediation_service import (
+                    get_adr100_remediation_service,
+                )
+
+                remediation_history = await get_adr100_remediation_service().history(
+                    limit=5,
+                    incident_id=incident_id,
+                )
+                lines += _format_remediation_history_lines(remediation_history)
+            except Exception as remediation_exc:
+                logger.warning(
+                    "incident_history_remediation_summary_failed",
+                    incident_id=incident_id,
+                    error=str(remediation_exc),
+                )
+
            await self.send_notification("\n".join(lines))

        except Exception as e:
--- a/apps/api/tests/test_adr100_remediation_service.py
+++ b/apps/api/tests/test_adr100_remediation_service.py
@@ -54,10 +54,42 @@ class _FakeVerifier:
 class _FakeAlertOperationLogRepository:
    def __init__(self) -> None:
        self.calls: list[dict[str, Any]] = []
+        self.records: list[Any] = []

    async def append(self, event_type: str, **kwargs: Any):
        self.calls.append({"event_type": event_type, **kwargs})
-        return type("AlertOperationRecord", (), {"id": "aol-1"})()
+        record = type(
+            "AlertOperationRecord",
+            (),
+            {
+                "id": f"aol-{len(self.records) + 1}",
+                "incident_id": kwargs.get("incident_id"),
+                "auto_repair_id": kwargs.get("auto_repair_id"),
+                "event_type": event_type,
+                "actor": kwargs.get("actor"),
+                "success": kwargs.get("success"),
+                "context": kwargs.get("context") or {},
+                "created_at": datetime(2026, 5, 14, 14, 45, len(self.records), tzinfo=timezone.utc),
+            },
+        )()
+        self.records.append(record)
+        return record
+
+    async def list_recent(
+        self,
+        limit: int = 50,
+        offset: int = 0,
+        event_type: str | None = None,
+        incident_id: str | None = None,
+    ):
+        """Return ``(page, total)`` for stored records, newest first.
+
+        ``event_type`` and ``incident_id`` are optional equality filters;
+        ``total`` counts every match, while ``page`` is the
+        ``offset``/``limit`` slice of them.
+        """
+
+        def _matches(rec: Any) -> bool:
+            if event_type is not None and rec.event_type != event_type:
+                return False
+            return incident_id is None or rec.incident_id == incident_id
+
+        matching = [rec for rec in self.records if _matches(rec)]
+        matching.sort(key=lambda rec: rec.created_at, reverse=True)
+        page = matching[offset:offset + limit]
+        return page, len(matching)


 class _FakeTimelineService:
@@ -242,6 +274,29 @@ async def test_dry_run_records_alert_operation_and_timeline_history():
    assert timeline.calls[0]["actor_role"] == "replay"


+@pytest.mark.asyncio
+async def test_history_lists_dry_run_records_grouped_by_work_item():
+    """A recorded dry run is readable through history(), per entry and per work item."""
+    work_item_id = "verification:INC-20260514-TEST01:are-1"
+    alert_repo = _FakeAlertOperationLogRepository()
+    svc = _service(
+        item=_queue_item(),
+        alert_operation_log_repository=alert_repo,
+        record_history=True,
+    )
+
+    await svc.dry_run(work_item_id)
+    history = await svc.history(limit=10)
+
+    assert history["schema_version"] == "adr100_remediation_history_v1"
+    assert history["total"] == 1
+
+    # The single entry exposes the work item and its MCP route.
+    entry = history["items"][0]
+    assert entry["work_item_id"] == work_item_id
+    assert entry["agent_id"] == "auto_repair_executor"
+    assert entry["tool_name"] == "ssh_diagnose"
+    assert entry["required_scope"] == "read"
+    assert entry["writes_incident_state"] is False
+
+    # The per-work-item aggregate mirrors that entry.
+    summary = history["by_work_item"][0]
+    assert summary["count"] == 1
+    assert summary["latest_tool_name"] == "ssh_diagnose"
+
+
@pytest.mark.asyncio
 async def test_missing_work_item_raises_not_found():
    svc = _service(item=_queue_item())
@@ -261,6 +316,24 @@ def test_ai_slo_remediation_endpoints(monkeypatch):
        async def dry_run(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]:
            return {"work_item_id": work_item_id, "mode": mode, "executed": True}

+        async def history(
+            self,
+            *,
+            limit: int = 50,
+            incident_id: str | None = None,
+            work_item_id: str | None = None,
+        ) -> dict[str, Any]:
+            """Echo the requested limit and filters in an empty history payload."""
+            applied_filters = {
+                "incident_id": incident_id,
+                "work_item_id": work_item_id,
+            }
+            payload: dict[str, Any] = {
+                "schema_version": "adr100_remediation_history_v1",
+            }
+            payload["limit"] = limit
+            payload["filters"] = applied_filters
+            payload["items"] = []
+            payload["by_work_item"] = []
+            return payload
+
    monkeypatch.setattr(
        "src.api.v1.ai_slo.get_adr100_remediation_service",
        lambda: _FakeService(),
@@ -275,8 +348,15 @@ def test_ai_slo_remediation_endpoints(monkeypatch):
        "/api/v1/ai/slo/remediation/dry-run",
        json={"work_item_id": "verification:INC:are-1", "mode": "replay"},
    )
+    history = client.get(
+        "/api/v1/ai/slo/remediation/history",
+        params={"limit": 10, "work_item_id": "verification:INC:are-1"},
+    )

    assert preview.status_code == 200
    assert preview.json()["mode"] == "reverify"
    assert dry_run.status_code == 200
    assert dry_run.json()["executed"] is True
+    assert history.status_code == 200
+    assert history.json()["schema_version"] == "adr100_remediation_history_v1"
+    assert history.json()["filters"]["work_item_id"] == "verification:INC:are-1"
--- a/apps/web/messages/en.json
+++ b/apps/web/messages/en.json
@@ -1418,6 +1418,7 @@
        "dryRunLoading": "Running",
        "dryRunResult": "{mode}; preview {result}; tools {tools}",
        "dryRunHistoryRecorded": "History recorded",
+        "dryRunHistorySummary": "History {count}x; last {time}; {route}",
        "dryRunBlocked": "Dry run blocked",
        "dryRunError": "Dry run failed",
        "state": {
--- a/apps/web/messages/zh-TW.json
+++ b/apps/web/messages/zh-TW.json
@@ -1419,6 +1419,7 @@
        "dryRunLoading": "試跑中",
        "dryRunResult": "{mode}；預覽 {result}；工具 {tools}",
        "dryRunHistoryRecorded": "已寫入歷史",
+        "dryRunHistorySummary": "歷史 {count} 次；上次 {time}；{route}",
        "dryRunBlocked": "試跑未放行",
        "dryRunError": "試跑失敗",
        "state": {
--- a/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx
+++ b/apps/web/src/app/[locale]/governance/tabs/slo-tab.tsx
@@ -164,6 +164,24 @@ interface RemediationDryRunResponse {
  }
 }

+// One per-work-item aggregate from the remediation history response's
+// `by_work_item` list.
+interface RemediationHistoryWorkItemSummary {
+  work_item_id?: string | null
+  incident_id?: string | null
+  // Number of history entries grouped under this work item.
+  count: number
+  // The `latest_*` fields describe the group's most recent entry.
+  latest_at?: string | null
+  latest_preview?: string | null
+  latest_mode?: string | null
+  latest_agent_id?: string | null
+  latest_tool_name?: string | null
+  required_scope?: string | null
+}
+
+// Subset of the remediation history endpoint payload that this tab reads.
+// Every field is optional, so a partial payload still type-checks.
+interface RemediationHistoryResponse {
+  schema_version?: string
+  total?: number
+  by_work_item?: RemediationHistoryWorkItemSummary[]
+}
+
 interface RemediationActionState {
  status: 'loading' | 'done' | 'error'
  data?: RemediationDryRunResponse
@@ -255,6 +273,34 @@ async function requestRemediationDryRun(workItemId: string): Promise<Remediation
  return response.json()
 }

+// Fetches up to 80 remediation history entries; rejects with
+// `history_failed:<status>` on a non-OK response.
+async function requestRemediationHistory(): Promise<RemediationHistoryResponse> {
+  const query = new URLSearchParams({ limit: '80' }).toString()
+  const url = `${API_BASE}/api/v1/ai/slo/remediation/history?${query}`
+  const res = await fetch(url)
+  if (res.ok) return res.json()
+  throw new Error(`history_failed:${res.status}`)
+}
+
+// Formats a timestamp as month/day hour:minute in the runtime's locale.
+// Empty input yields '--'; an unparseable value is returned unchanged.
+function formatShortDateTime(value?: string | null): string {
+  if (!value) return '--'
+  const parsed = new Date(value)
+  const isValid = !Number.isNaN(parsed.getTime())
+  if (!isValid) return value
+  const shortFormat: Intl.DateTimeFormatOptions = {
+    month: '2-digit',
+    day: '2-digit',
+    hour: '2-digit',
+    minute: '2-digit',
+  }
+  return parsed.toLocaleString(undefined, shortFormat)
+}
+
+// Builds the "agent/tool" label for a history summary, falling back to the
+// required scope and then to '--' when neither part is present.
+function historyRouteLabel(summary: RemediationHistoryWorkItemSummary): string {
+  const routeParts = [summary.latest_agent_id, summary.latest_tool_name].filter(Boolean)
+  const label = routeParts.join('/') || summary.required_scope || '--'
+  return compactLabel(label, '--')
+}
+
 function buildMetrics(api: SloApiResponse): SloMetric[] {
  const adr100Metrics = api.adr100?.metrics
  if (adr100Metrics?.length) {
@@ -312,6 +358,7 @@ function buildMetrics(api: SloApiResponse): SloMetric[] {
 function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) {
  const t = useTranslations('governance.slo.coverage')
  const [actionState, setActionState] = useState<Record<string, RemediationActionState>>({})
+  const [remediationHistory, setRemediationHistory] = useState<RemediationHistoryResponse | null>(null)
  const color = coverageTone(coverage?.status)
  const rows = [
    { label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') },
@@ -322,6 +369,28 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
  const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? []
  const recentFindings = coverage?.recent_non_success ?? []
  const remediationQueue = coverage?.remediation_queue
+  const remediationQueueTotal = remediationQueue?.total ?? 0
+  const historyByWorkItem = new Map(
+    (remediationHistory?.by_work_item ?? [])
+      .filter(item => item.work_item_id)
+      .map(item => [item.work_item_id as string, item]),
+  )
+
+  useEffect(() => {
+    let cancelled = false
+    if (remediationQueueTotal === 0) {
+      setRemediationHistory(null)
+      return () => { cancelled = true }
+    }
+    requestRemediationHistory()
+      .then(data => {
+        if (!cancelled) setRemediationHistory(data)
+      })
+      .catch(() => {
+        if (!cancelled) setRemediationHistory(null)
+      })
+    return () => { cancelled = true }
+  }, [remediationQueueTotal])

  const handleDryRun = async (workItemId: string) => {
    setActionState(prev => ({
@@ -334,6 +403,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
        ...prev,
        [workItemId]: { status: 'done', data },
      }))
+      requestRemediationHistory()
+        .then(setRemediationHistory)
+        .catch(() => undefined)
    } catch {
      setActionState(prev => ({
        ...prev,
@@ -431,7 +503,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
              </div>
            </div>
            <div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
-              {(remediationQueue.items ?? []).slice(0, 4).map(item => (
+              {(remediationQueue.items ?? []).slice(0, 4).map(item => {
+                const historySummary = historyByWorkItem.get(item.work_item_id)
+                return (
                <div key={item.work_item_id} style={{
                  display: 'grid',
                  gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(150px, 0.9fr) minmax(150px, 0.8fr)',
@@ -528,6 +602,21 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
                        )}
                      </>
                    )}
+                    {historySummary && (
+                      <div style={{
+                        fontFamily: "'DM Mono', monospace",
+                        fontSize: 9,
+                        color: '#4b5563',
+                        lineHeight: 1.4,
+                        overflowWrap: 'anywhere',
+                      }}>
+                        {t('dryRunHistorySummary', {
+                          count: historySummary.count,
+                          time: formatShortDateTime(historySummary.latest_at),
+                          route: historyRouteLabel(historySummary),
+                        })}
+                      </div>
+                    )}
                    {actionState[item.work_item_id]?.status === 'error' && (
                      <div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#FF3300', lineHeight: 1.4 }}>
                        {t('dryRunError')}
@@ -535,7 +624,8 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
                    )}
                  </div>
                </div>
-              ))}
+                )
+              })}
            </div>
          </div>
        )}
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,30 @@
+## 2026-05-14 | T27 Remediation history read model，前端與 Telegram 可看見試跑重複次數與 MCP 路徑
+
+**背景**：T26 已讓 ADR-100 remediation dry-run 寫入 `alert_operation_log` 與 `timeline_events`，但 governance 頁重新整理後仍看不到某筆補救工作過去試跑幾次、上次跑到哪個 preview、是否有用 MCP、是否仍只讀。Telegram 詳情 / 歷史也還沒有把這段 dry-run history 明確帶出。
+
+**修正**：
+- `Adr100RemediationService.history()` 新增 `adr100_remediation_history_v1` read model，從既有 `alert_operation_log` 讀 `adr100_remediation_dry_run_history_v1` context，不新增資料表。
+- 新增 `GET /api/v1/ai/slo/remediation/history`，支援 `limit / incident_id / work_item_id`，回傳 `items` 與 `by_work_item` 聚合，包含 count、latest preview、agent、tool、scope、writes flags。
+- `/governance` 補救工作佇列會讀 history endpoint；每筆工作顯示「歷史 N 次；上次時間；agent/tool」，點試跑成功後會重新整理 history，不只依賴當次 UI state。
+- Telegram `detail:{incident_id}` 與 `history:{incident_id}` 會補上 `ADR-100 補救試跑` 摘要，包含歷史次數、上次 mode/preview、MCP agent/tool/scope、是否寫 incident/auto-repair 狀態。
+
+**本地驗證**：
+- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/telegram_gateway.py apps/api/tests/test_adr100_remediation_service.py`：pass。
+- `ruff check --select F,E9 src/services/adr100_remediation_service.py src/api/v1/ai_slo.py src/services/telegram_gateway.py tests/test_adr100_remediation_service.py`：pass。
+- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_ai_governance_endpoints.py -q`：36 passed。
+- i18n JSON parse / `git diff --check`：pass。
+- `pnpm --filter @awoooi/web typecheck`：pass。
+- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`：pass。
+- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`：pass。
+
+**推版與 production 驗證**：
+- 待 T27 commit 推 Gitea main 後驗證。
+
+**目前整體進度**：
+- Alertmanager 低風險自動修復主線：約 98%。
+- 完整 AI 自動化管理產品化：約 92%。
+- T27 補上「試跑歷史」read model 與 UI/Telegram 可讀摘要。下一段應把 Incident 詳情頁的 stage event 展開成更完整的工作鏈路，而不是只顯示壓縮過的 ASCII timeline。
+
 ## 2026-05-14 | T26 Remediation dry-run 寫入 history，試跑結果不再只停在前端暫存

 **背景**：T25 已讓 Operator 能在 `/governance` 補救佇列點「試跑」，但結果只存在當次 UI state。這仍無法完全回答「這次 dry-run 是否真的發生、跑到哪個流程、MCP 有沒有用到、後續是否能從 Incident history 回看」。