feat(governance): surface remediation dry run history
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m1s
CD Pipeline / build-and-deploy (push) Successful in 3m45s
CD Pipeline / post-deploy-checks (push) Successful in 1m39s

This commit is contained in:
Your Name
2026-05-14 22:56:51 +08:00
parent 53cd7f9d66
commit 392cfb9025
8 changed files with 406 additions and 3 deletions

View File

@@ -118,3 +118,18 @@ async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
)
except RemediationNotFoundError as exc:
raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
@router.get("/ai/slo/remediation/history")
async def list_ai_slo_remediation_history(
    limit: int = Query(50, ge=1, le=200),
    incident_id: str | None = Query(default=None, min_length=1),
    work_item_id: str | None = Query(default=None, min_length=1),
) -> dict:
    """Return the ADR-100 remediation dry-run history read model.

    The query parameters (``limit`` constrained to 1..200, optional
    non-empty ``incident_id`` / ``work_item_id``) are passed through
    unchanged to the remediation service's ``history`` method, and its
    result is returned as-is.
    """
    service = get_adr100_remediation_service()
    history = await service.history(
        limit=limit,
        incident_id=incident_id,
        work_item_id=work_item_id,
    )
    return history

View File

@@ -112,6 +112,67 @@ class Adr100RemediationService:
return await self._dry_run_replay(item, incident, checks)
return await self._dry_run_reverify(item, incident, checks)
async def history(
    self,
    *,
    limit: int = 50,
    incident_id: str | None = None,
    work_item_id: str | None = None,
) -> dict[str, Any]:
    """Build the dry-run history read model from the alert operation log.

    Args:
        limit: Maximum number of history items to return; clamped to 1..200.
        incident_id: Forwarded to the repository as a fetch filter.
        work_item_id: When truthy, only rows whose context carries this
            work item id are kept (applied after fetching).

    Returns:
        A dict with ``schema_version``, ``total`` (the number of returned
        items), the effective ``limit``, the ``filters`` that were applied,
        the ``items`` newest-first, and the ``by_work_item`` summary.
    """
    page_size = min(max(limit, 1), 200)
    # Over-fetch per event type (4x the page, never below 50 or above 200)
    # because schema and work-item filtering only happens after loading.
    per_type_limit = max(50, min(page_size * 4, 200))

    log_repo = self._alert_operation_log_repository
    if log_repo is None:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )

        log_repo = get_alert_operation_log_repository()

    fetched: list[Any] = []
    for event_type in ("PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED"):
        try:
            batch, _total = await log_repo.list_recent(
                limit=per_type_limit,
                event_type=event_type,
                incident_id=incident_id,
            )
            fetched.extend(batch)
        except Exception as exc:
            # Best effort: a failed fetch for one event type is logged and
            # the other event type is still attempted.
            logger.warning(
                "adr100_remediation_history_fetch_failed",
                event_type=event_type,
                incident_id=incident_id,
                error=str(exc),
            )

    matched: list[dict[str, Any]] = []
    for record in sorted(fetched, key=_record_created_at, reverse=True):
        payload = getattr(record, "context", None) or {}
        if payload.get("schema_version") == "adr100_remediation_dry_run_history_v1" and (
            not work_item_id or payload.get("work_item_id") == work_item_id
        ):
            matched.append(_history_item(record, payload))
            if len(matched) >= page_size:
                break

    return {
        "schema_version": "adr100_remediation_history_v1",
        "total": len(matched),
        "limit": page_size,
        "filters": {
            "incident_id": incident_id,
            "work_item_id": work_item_id,
        },
        "items": matched,
        "by_work_item": _summarize_history_by_work_item(matched),
    }
async def _find_work_item(self, work_item_id: str) -> dict[str, Any]:
report = await self._slo_service.fetch_report()
coverage = report.get("verification_coverage") or {}
@@ -442,6 +503,66 @@ def _history_description(context: dict[str, Any]) -> str:
)[:500]
def _record_created_at(record: Any) -> str:
value = getattr(record, "created_at", None)
if hasattr(value, "isoformat"):
return value.isoformat()
return str(value or "")
def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
    """Flatten one operation-log record and its context into a history item.

    Record attributes supply the identity fields (id, incident, event type,
    actor, success, timestamp); the remaining fields come from ``context``,
    including its nested ``mcp_route`` and ``post_state_summary`` dicts.
    """
    mcp_route = context.get("mcp_route") or {}
    post_state_summary = context.get("post_state_summary") or {}

    item: dict[str, Any] = {
        "id": str(getattr(record, "id", "")),
        "incident_id": getattr(record, "incident_id", None),
        # Fall back to the context copy when the record has no auto_repair_id.
        "auto_repair_id": getattr(record, "auto_repair_id", None)
        or context.get("auto_repair_id"),
        "event_type": str(getattr(record, "event_type", "")),
        "actor": getattr(record, "actor", None),
        "success": getattr(record, "success", None),
        "created_at": _record_created_at(record),
    }

    # Context values that are copied through verbatim.
    for key in (
        "work_item_id",
        "playbook_id",
        "alertname",
        "mode",
        "allowed",
        "executed",
        "safety_level",
        "verification_result_preview",
    ):
        item[key] = context.get(key)

    item["tool_count"] = post_state_summary.get("tool_count", 0)
    item["tools"] = post_state_summary.get("tools") or []
    item["agent_id"] = mcp_route.get("agent_id")
    item["tool_name"] = mcp_route.get("tool_name") or "current_state"
    item["required_scope"] = mcp_route.get("required_scope")
    item["writes_incident_state"] = context.get("writes_incident_state")
    item["writes_auto_repair_result"] = context.get("writes_auto_repair_result")
    item["checks"] = context.get("checks") or []
    return item
def _summarize_history_by_work_item(items: list[dict[str, Any]]) -> list[dict[str, Any]]:
summary: dict[str, dict[str, Any]] = {}
for item in items:
key = str(item.get("work_item_id") or item.get("incident_id") or item.get("id"))
if key not in summary:
summary[key] = {
"work_item_id": item.get("work_item_id"),
"incident_id": item.get("incident_id"),
"count": 0,
"latest_at": item.get("created_at"),
"latest_event_type": item.get("event_type"),
"latest_success": item.get("success"),
"latest_preview": item.get("verification_result_preview"),
"latest_mode": item.get("mode"),
"latest_agent_id": item.get("agent_id"),
"latest_tool_name": item.get("tool_name"),
"required_scope": item.get("required_scope"),
}
summary[key]["count"] += 1
return list(summary.values())
def _diagnostic_command_for_incident(incident: Incident) -> str:
labels = _labels_for_incident(incident)
host = str(labels.get("host") or labels.get("instance") or "{host}")

View File

@@ -161,6 +161,40 @@ def _format_automation_quality_lines(quality: dict[str, object] | None) -> list[
return lines
def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]:
if not history or int(history.get("total") or 0) <= 0:
return []
items = history.get("items") if isinstance(history.get("items"), list) else []
latest = items[0] if items and isinstance(items[0], dict) else {}
agent = latest.get("agent_id") or "unknown_agent"
tool = latest.get("tool_name") or "current_state"
scope = latest.get("required_scope") or "unknown"
writes_incident = latest.get("writes_incident_state")
writes_auto_repair = latest.get("writes_auto_repair_result")
return [
"",
"🧪 <b>ADR-100 補救試跑</b>",
f"歷史: <code>{int(history.get('total') or 0)}</code> 次",
(
"上次: "
f"<code>{html.escape(str(latest.get('mode') or 'unknown'))}</code> / "
f"<code>{html.escape(str(latest.get('verification_result_preview') or 'unknown'))}</code>"
),
(
"MCP: "
f"<code>{html.escape(str(agent))}/{html.escape(str(tool))}</code> / "
f"<code>{html.escape(str(scope))}</code>"
),
(
"寫入: "
f"incident <code>{html.escape(str(writes_incident))}</code> / "
f"auto-repair <code>{html.escape(str(writes_auto_repair))}</code>"
),
]
def _sanitize_telegram_error(text: str) -> str:
    """Mask the token in Telegram Bot URLs so exception strings do not leak it into logs or traces."""
    redacted = _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
    return redacted
@@ -5276,6 +5310,23 @@ class TelegramGateway:
+ html.escape(", ".join(mismatch_codes[:4]))
)
try:
from src.services.adr100_remediation_service import (
get_adr100_remediation_service,
)
remediation_history = await get_adr100_remediation_service().history(
limit=5,
incident_id=incident_id,
)
lines += _format_remediation_history_lines(remediation_history)
except Exception as remediation_exc:
logger.warning(
"incident_detail_remediation_history_summary_failed",
incident_id=incident_id,
error=str(remediation_exc),
)
try:
from src.services.awooop_truth_chain_service import fetch_truth_chain
@@ -5298,6 +5349,23 @@ class TelegramGateway:
error=str(truth_exc),
)
try:
from src.services.adr100_remediation_service import (
get_adr100_remediation_service,
)
remediation_history = await get_adr100_remediation_service().history(
limit=5,
incident_id=incident_id,
)
lines += _format_remediation_history_lines(remediation_history)
except Exception as remediation_exc:
logger.warning(
"incident_history_remediation_summary_failed",
incident_id=incident_id,
error=str(remediation_exc),
)
await self.send_notification("\n".join(lines))
except Exception as e:

View File

@@ -54,10 +54,42 @@ class _FakeVerifier:
class _FakeAlertOperationLogRepository:
def __init__(self) -> None:
self.calls: list[dict[str, Any]] = []
self.records: list[Any] = []
async def append(self, event_type: str, **kwargs: Any):
self.calls.append({"event_type": event_type, **kwargs})
return type("AlertOperationRecord", (), {"id": "aol-1"})()
record = type(
"AlertOperationRecord",
(),
{
"id": f"aol-{len(self.records) + 1}",
"incident_id": kwargs.get("incident_id"),
"auto_repair_id": kwargs.get("auto_repair_id"),
"event_type": event_type,
"actor": kwargs.get("actor"),
"success": kwargs.get("success"),
"context": kwargs.get("context") or {},
"created_at": datetime(2026, 5, 14, 14, 45, len(self.records), tzinfo=timezone.utc),
},
)()
self.records.append(record)
return record
async def list_recent(
self,
limit: int = 50,
offset: int = 0,
event_type: str | None = None,
incident_id: str | None = None,
):
rows = [
record
for record in self.records
if (event_type is None or record.event_type == event_type)
and (incident_id is None or record.incident_id == incident_id)
]
rows = sorted(rows, key=lambda record: record.created_at, reverse=True)
return rows[offset:offset + limit], len(rows)
class _FakeTimelineService:
@@ -242,6 +274,29 @@ async def test_dry_run_records_alert_operation_and_timeline_history():
assert timeline.calls[0]["actor_role"] == "replay"
@pytest.mark.asyncio
async def test_history_lists_dry_run_records_grouped_by_work_item():
    """A recorded dry run appears in history() as an item and in the per-work-item summary."""
    log_repo = _FakeAlertOperationLogRepository()
    service = _service(
        item=_queue_item(),
        alert_operation_log_repository=log_repo,
        record_history=True,
    )

    await service.dry_run("verification:INC-20260514-TEST01:are-1")
    result = await service.history(limit=10)

    assert result["schema_version"] == "adr100_remediation_history_v1"
    assert result["total"] == 1

    latest_item = result["items"][0]
    assert latest_item["work_item_id"] == "verification:INC-20260514-TEST01:are-1"
    assert latest_item["agent_id"] == "auto_repair_executor"
    assert latest_item["tool_name"] == "ssh_diagnose"
    assert latest_item["required_scope"] == "read"
    assert latest_item["writes_incident_state"] is False

    work_item_summary = result["by_work_item"][0]
    assert work_item_summary["count"] == 1
    assert work_item_summary["latest_tool_name"] == "ssh_diagnose"
@pytest.mark.asyncio
async def test_missing_work_item_raises_not_found():
svc = _service(item=_queue_item())
@@ -261,6 +316,24 @@ def test_ai_slo_remediation_endpoints(monkeypatch):
async def dry_run(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]:
    """Echo the requested work item and mode back, always flagged as executed."""
    response: dict[str, Any] = {"work_item_id": work_item_id, "mode": mode}
    response["executed"] = True
    return response
async def history(
self,
*,
limit: int = 50,
incident_id: str | None = None,
work_item_id: str | None = None,
) -> dict[str, Any]:
return {
"schema_version": "adr100_remediation_history_v1",
"limit": limit,
"filters": {
"incident_id": incident_id,
"work_item_id": work_item_id,
},
"items": [],
"by_work_item": [],
}
monkeypatch.setattr(
"src.api.v1.ai_slo.get_adr100_remediation_service",
lambda: _FakeService(),
@@ -275,8 +348,15 @@ def test_ai_slo_remediation_endpoints(monkeypatch):
"/api/v1/ai/slo/remediation/dry-run",
json={"work_item_id": "verification:INC:are-1", "mode": "replay"},
)
history = client.get(
"/api/v1/ai/slo/remediation/history",
params={"limit": 10, "work_item_id": "verification:INC:are-1"},
)
assert preview.status_code == 200
assert preview.json()["mode"] == "reverify"
assert dry_run.status_code == 200
assert dry_run.json()["executed"] is True
assert history.status_code == 200
assert history.json()["schema_version"] == "adr100_remediation_history_v1"
assert history.json()["filters"]["work_item_id"] == "verification:INC:are-1"

View File

@@ -1418,6 +1418,7 @@
"dryRunLoading": "Running",
"dryRunResult": "{mode}; preview {result}; tools {tools}",
"dryRunHistoryRecorded": "History recorded",
"dryRunHistorySummary": "History {count}x; last {time}; {route}",
"dryRunBlocked": "Dry run blocked",
"dryRunError": "Dry run failed",
"state": {

View File

@@ -1419,6 +1419,7 @@
"dryRunLoading": "試跑中",
"dryRunResult": "{mode};預覽 {result};工具 {tools}",
"dryRunHistoryRecorded": "已寫入歷史",
"dryRunHistorySummary": "歷史 {count} 次;上次 {time};{route}",
"dryRunBlocked": "試跑未放行",
"dryRunError": "試跑失敗",
"state": {

View File

@@ -164,6 +164,24 @@ interface RemediationDryRunResponse {
}
}
/**
 * Per-work-item roll-up from the remediation history endpoint's `by_work_item` list.
 * The fields mirror the summary dicts assembled by the backend's
 * `_summarize_history_by_work_item`; that summary also carries
 * `latest_event_type` and `latest_success`, which are not declared here.
 */
interface RemediationHistoryWorkItemSummary {
work_item_id?: string | null
incident_id?: string | null
// How many history items the backend grouped under this entry.
count: number
// `created_at` of the first history item the backend saw for this group.
latest_at?: string | null
// `verification_result_preview` of that same item.
latest_preview?: string | null
latest_mode?: string | null
// Together these two form the "agent/tool" route label.
latest_agent_id?: string | null
latest_tool_name?: string | null
required_scope?: string | null
}
/**
 * Subset of the `adr100_remediation_history_v1` payload declared for this file.
 * The backend response also includes `limit`, `filters` and `items`.
 */
interface RemediationHistoryResponse {
schema_version?: string
// Number of items in the response (the backend sets it to the length of `items`).
total?: number
by_work_item?: RemediationHistoryWorkItemSummary[]
}
interface RemediationActionState {
status: 'loading' | 'done' | 'error'
data?: RemediationDryRunResponse
@@ -255,6 +273,34 @@ async function requestRemediationDryRun(workItemId: string): Promise<Remediation
return response.json()
}
/**
 * Fetch the remediation dry-run history (up to 80 entries) from the API.
 * Rejects with `history_failed:<status>` when the response is not OK.
 */
async function requestRemediationHistory(): Promise<RemediationHistoryResponse> {
  const query = new URLSearchParams()
  query.set('limit', '80')
  const url = `${API_BASE}/api/v1/ai/slo/remediation/history?${query}`
  const res = await fetch(url)
  if (res.ok) return res.json()
  throw new Error(`history_failed:${res.status}`)
}
/**
 * Format a timestamp string as a short month/day hour:minute label in the
 * runtime's default locale. Empty input yields `--`; a string that does not
 * parse as a date is returned unchanged.
 */
function formatShortDateTime(value?: string | null): string {
  if (!value) return '--'
  const parsed = new Date(value)
  const isValid = !Number.isNaN(parsed.getTime())
  return isValid
    ? parsed.toLocaleString(undefined, {
        month: '2-digit',
        day: '2-digit',
        hour: '2-digit',
        minute: '2-digit',
      })
    : value
}
/**
 * Build the route label for a history summary: "agent/tool" from whichever
 * of the two is present, else the required scope, else `--`. The result is
 * passed through `compactLabel`.
 */
function historyRouteLabel(summary: RemediationHistoryWorkItemSummary): string {
  const routeParts = [summary.latest_agent_id, summary.latest_tool_name].filter(Boolean)
  const route = routeParts.join('/')
  const label = route || summary.required_scope || '--'
  return compactLabel(label, '--')
}
function buildMetrics(api: SloApiResponse): SloMetric[] {
const adr100Metrics = api.adr100?.metrics
if (adr100Metrics?.length) {
@@ -312,6 +358,7 @@ function buildMetrics(api: SloApiResponse): SloMetric[] {
function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) {
const t = useTranslations('governance.slo.coverage')
const [actionState, setActionState] = useState<Record<string, RemediationActionState>>({})
const [remediationHistory, setRemediationHistory] = useState<RemediationHistoryResponse | null>(null)
const color = coverageTone(coverage?.status)
const rows = [
{ label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') },
@@ -322,6 +369,28 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? []
const recentFindings = coverage?.recent_non_success ?? []
const remediationQueue = coverage?.remediation_queue
const remediationQueueTotal = remediationQueue?.total ?? 0
const historyByWorkItem = new Map(
(remediationHistory?.by_work_item ?? [])
.filter(item => item.work_item_id)
.map(item => [item.work_item_id as string, item]),
)
useEffect(() => {
let cancelled = false
if (remediationQueueTotal === 0) {
setRemediationHistory(null)
return () => { cancelled = true }
}
requestRemediationHistory()
.then(data => {
if (!cancelled) setRemediationHistory(data)
})
.catch(() => {
if (!cancelled) setRemediationHistory(null)
})
return () => { cancelled = true }
}, [remediationQueueTotal])
const handleDryRun = async (workItemId: string) => {
setActionState(prev => ({
@@ -334,6 +403,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
...prev,
[workItemId]: { status: 'done', data },
}))
requestRemediationHistory()
.then(setRemediationHistory)
.catch(() => undefined)
} catch {
setActionState(prev => ({
...prev,
@@ -431,7 +503,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
</div>
</div>
<div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
{(remediationQueue.items ?? []).slice(0, 4).map(item => (
{(remediationQueue.items ?? []).slice(0, 4).map(item => {
const historySummary = historyByWorkItem.get(item.work_item_id)
return (
<div key={item.work_item_id} style={{
display: 'grid',
gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(150px, 0.9fr) minmax(150px, 0.8fr)',
@@ -528,6 +602,21 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
)}
</>
)}
{historySummary && (
<div style={{
fontFamily: "'DM Mono', monospace",
fontSize: 9,
color: '#4b5563',
lineHeight: 1.4,
overflowWrap: 'anywhere',
}}>
{t('dryRunHistorySummary', {
count: historySummary.count,
time: formatShortDateTime(historySummary.latest_at),
route: historyRouteLabel(historySummary),
})}
</div>
)}
{actionState[item.work_item_id]?.status === 'error' && (
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#FF3300', lineHeight: 1.4 }}>
{t('dryRunError')}
@@ -535,7 +624,8 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
)}
</div>
</div>
))}
)
})}
</div>
</div>
)}

View File

@@ -1,3 +1,30 @@
## 2026-05-14 | T27 Remediation history read model：前端與 Telegram 可看見試跑重複次數與 MCP 路徑
**背景**：T26 已讓 ADR-100 remediation dry-run 寫入 `alert_operation_log` / `timeline_events`，但 governance 頁重新整理後仍看不到某筆補救工作過去試跑幾次、上次跑到哪個 preview、是否有用 MCP、是否仍只讀。Telegram 詳情 / 歷史也還沒有把這段 dry-run history 明確帶出。
**修正**
- `Adr100RemediationService.history()` 新增 `adr100_remediation_history_v1` read model：從既有 `alert_operation_log` 讀取 `adr100_remediation_dry_run_history_v1` context，不新增資料表。
- 新增 `GET /api/v1/ai/slo/remediation/history`，支援 `limit / incident_id / work_item_id`，回傳 `items` 與 `by_work_item` 聚合，包含 count、latest preview、agent、tool、scope、writes flags。
- `/governance` 補救工作佇列會讀 history endpoint，每筆工作顯示「歷史 N 次；上次時間；agent/tool」；點試跑成功後會重新整理 history，不只依賴當次 UI state。
- Telegram `detail:{incident_id}` / `history:{incident_id}` 會補上 `ADR-100 補救試跑` 摘要，包含歷史次數、上次 mode/preview、MCP agent/tool/scope、是否寫 incident/auto-repair 狀態。
**本地驗證**
- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/telegram_gateway.py apps/api/tests/test_adr100_remediation_service.py`：pass。
- `ruff check --select F,E9 src/services/adr100_remediation_service.py src/api/v1/ai_slo.py src/services/telegram_gateway.py tests/test_adr100_remediation_service.py`：pass。
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_ai_governance_endpoints.py -q`：36 passed。
- i18n JSON parse / `git diff --check`：pass。
- `pnpm --filter @awoooi/web typecheck`：pass。
- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`：pass。
- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`：pass。
**推版與 production 驗證**
- 待 T27 commit 推 Gitea main 後驗證。
**目前整體進度**
- Alertmanager 低風險自動修復主線:約 98%。
- 完整 AI 自動化管理產品化:約 92%。
- T27 補上「試跑歷史」read model 與 UI/Telegram 可讀摘要。下一段應把 Incident 詳情頁的 stage event 展開成更完整的工作鏈路,而不是只顯示壓縮 ascii timeline。
## 2026-05-14 | T26 Remediation dry-run 寫入 history：試跑結果不再只停在前端暫存
**背景**：T25 已讓 Operator 能在 `/governance` 補救佇列點「試跑」，但結果只存在當次 UI state。這仍無法完全回答「這次 dry-run 是否真的發生、跑到哪個流程、MCP 有沒有用到、後續是否能從 Incident history 回看」。