feat(governance): surface remediation dry run history
This commit is contained in:
@@ -118,3 +118,18 @@ async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
|
||||
)
|
||||
except RemediationNotFoundError as exc:
|
||||
raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
|
||||
|
||||
|
||||
@router.get("/ai/slo/remediation/history")
async def list_ai_slo_remediation_history(
    limit: int = Query(50, ge=1, le=200),
    incident_id: str | None = Query(default=None, min_length=1),
    work_item_id: str | None = Query(default=None, min_length=1),
) -> dict:
    """List durable ADR-100 remediation dry-run history from alert_operation_log.

    Args:
        limit: Maximum number of history items to return (1-200, default 50).
        incident_id: Optional incident filter; must be non-empty when given.
        work_item_id: Optional work-item filter; must be non-empty when given.

    Returns:
        The payload produced by the remediation service's ``history()`` call,
        passed through unchanged.
    """
    service = get_adr100_remediation_service()
    payload = await service.history(
        limit=limit,
        incident_id=incident_id,
        work_item_id=work_item_id,
    )
    return payload
|
||||
|
||||
@@ -112,6 +112,67 @@ class Adr100RemediationService:
|
||||
return await self._dry_run_replay(item, incident, checks)
|
||||
return await self._dry_run_reverify(item, incident, checks)
|
||||
|
||||
async def history(
    self,
    *,
    limit: int = 50,
    incident_id: str | None = None,
    work_item_id: str | None = None,
) -> dict[str, Any]:
    """Return durable dry-run history written by this remediation service.

    Rows are read from the alert operation log for the two pre-flight event
    types, merged, ordered newest first, and filtered down to records whose
    context carries the ``adr100_remediation_dry_run_history_v1`` schema.

    Args:
        limit: Maximum number of items to return; clamped to 1..200.
        incident_id: Optional incident filter, forwarded to the repository.
        work_item_id: Optional work-item filter, applied to each row's context
            after fetching.

    Returns:
        An ``adr100_remediation_history_v1`` payload. ``total`` is the number
        of items actually returned (so it never exceeds ``limit``), not the
        number of matching rows stored in the log.
    """
    safe_limit = max(1, min(limit, 200))
    # Over-fetch: schema/work-item filtering happens after the query, so ask
    # for more rows than requested (at least 50, never more than 200).
    fetch_limit = min(max(safe_limit * 4, 50), 200)

    repo = self._alert_operation_log_repository
    if repo is None:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )

        repo = get_alert_operation_log_repository()

    rows: list[Any] = []
    for event_type in ("PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED"):
        try:
            batch, _total = await repo.list_recent(
                limit=fetch_limit,
                event_type=event_type,
                incident_id=incident_id,
            )
            rows.extend(batch)
        except Exception as exc:
            # Best effort: a failed fetch for one event type must not hide
            # the rows that could be read for the other one.
            logger.warning(
                "adr100_remediation_history_fetch_failed",
                event_type=event_type,
                incident_id=incident_id,
                error=str(exc),
            )

    rows.sort(key=_record_created_at, reverse=True)

    items: list[dict[str, Any]] = []
    for row in rows:
        context = getattr(row, "context", None) or {}
        # A context that is not mapping-like (e.g. an undecoded JSON string)
        # cannot be a dry-run history record; skip it instead of letting
        # ``.get`` raise and fail the whole listing.
        if not callable(getattr(context, "get", None)):
            continue
        if context.get("schema_version") != "adr100_remediation_dry_run_history_v1":
            continue
        if work_item_id and context.get("work_item_id") != work_item_id:
            continue
        items.append(_history_item(row, context))
        if len(items) >= safe_limit:
            break

    return {
        "schema_version": "adr100_remediation_history_v1",
        "total": len(items),
        "limit": safe_limit,
        "filters": {
            "incident_id": incident_id,
            "work_item_id": work_item_id,
        },
        "items": items,
        "by_work_item": _summarize_history_by_work_item(items),
    }
|
||||
|
||||
async def _find_work_item(self, work_item_id: str) -> dict[str, Any]:
|
||||
report = await self._slo_service.fetch_report()
|
||||
coverage = report.get("verification_coverage") or {}
|
||||
@@ -442,6 +503,66 @@ def _history_description(context: dict[str, Any]) -> str:
|
||||
)[:500]
|
||||
|
||||
|
||||
def _record_created_at(record: Any) -> str:
    """Return ``record.created_at`` as a string.

    Values exposing ``isoformat()`` are rendered through it; anything else is
    passed to ``str``. A missing or falsy value yields an empty string.
    """
    stamp = getattr(record, "created_at", None)
    return stamp.isoformat() if hasattr(stamp, "isoformat") else str(stamp or "")
|
||||
|
||||
|
||||
def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
    """Flatten one operation-log record and its context into a history item.

    Record attributes supply the identity fields, the context supplies the
    dry-run details, and the nested ``mcp_route`` / ``post_state_summary``
    mappings supply routing and tool information.
    """
    mcp_route = context.get("mcp_route") or {}
    post_state_summary = context.get("post_state_summary") or {}

    # Identity fields read from the stored record itself.
    item: dict[str, Any] = {
        "id": str(getattr(record, "id", "")),
        "incident_id": getattr(record, "incident_id", None),
        "auto_repair_id": getattr(record, "auto_repair_id", None)
        or context.get("auto_repair_id"),
        "event_type": str(getattr(record, "event_type", "")),
        "actor": getattr(record, "actor", None),
        "success": getattr(record, "success", None),
        "created_at": _record_created_at(record),
    }

    # Fields copied verbatim from the dry-run context.
    for field in (
        "work_item_id",
        "playbook_id",
        "alertname",
        "mode",
        "allowed",
        "executed",
        "safety_level",
        "verification_result_preview",
    ):
        item[field] = context.get(field)

    # Tool and MCP routing details, with the same fallbacks for absent values.
    item["tool_count"] = post_state_summary.get("tool_count", 0)
    item["tools"] = post_state_summary.get("tools") or []
    item["agent_id"] = mcp_route.get("agent_id")
    item["tool_name"] = mcp_route.get("tool_name") or "current_state"
    item["required_scope"] = mcp_route.get("required_scope")

    for field in ("writes_incident_state", "writes_auto_repair_result"):
        item[field] = context.get(field)
    item["checks"] = context.get("checks") or []
    return item
|
||||
|
||||
|
||||
def _summarize_history_by_work_item(items: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Aggregate history items into one summary per work item.

    Items are grouped by ``work_item_id``, falling back to ``incident_id`` and
    then ``id``. The ``latest_*`` fields come from the first item seen for a
    group, so callers should pass items ordered newest first. Groups are
    returned in first-seen order.
    """
    grouped: dict[str, dict[str, Any]] = {}
    for entry in items:
        group_key = str(
            entry.get("work_item_id") or entry.get("incident_id") or entry.get("id")
        )
        bucket = grouped.get(group_key)
        if bucket is None:
            bucket = {
                "work_item_id": entry.get("work_item_id"),
                "incident_id": entry.get("incident_id"),
                "count": 0,
                "latest_at": entry.get("created_at"),
                "latest_event_type": entry.get("event_type"),
                "latest_success": entry.get("success"),
                "latest_preview": entry.get("verification_result_preview"),
                "latest_mode": entry.get("mode"),
                "latest_agent_id": entry.get("agent_id"),
                "latest_tool_name": entry.get("tool_name"),
                "required_scope": entry.get("required_scope"),
            }
            grouped[group_key] = bucket
        bucket["count"] += 1
    return [*grouped.values()]
|
||||
|
||||
|
||||
def _diagnostic_command_for_incident(incident: Incident) -> str:
|
||||
labels = _labels_for_incident(incident)
|
||||
host = str(labels.get("host") or labels.get("instance") or "{host}")
|
||||
|
||||
@@ -161,6 +161,40 @@ def _format_automation_quality_lines(quality: dict[str, object] | None) -> list[
|
||||
return lines
|
||||
|
||||
|
||||
def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]:
|
||||
if not history or int(history.get("total") or 0) <= 0:
|
||||
return []
|
||||
|
||||
items = history.get("items") if isinstance(history.get("items"), list) else []
|
||||
latest = items[0] if items and isinstance(items[0], dict) else {}
|
||||
agent = latest.get("agent_id") or "unknown_agent"
|
||||
tool = latest.get("tool_name") or "current_state"
|
||||
scope = latest.get("required_scope") or "unknown"
|
||||
writes_incident = latest.get("writes_incident_state")
|
||||
writes_auto_repair = latest.get("writes_auto_repair_result")
|
||||
|
||||
return [
|
||||
"",
|
||||
"🧪 <b>ADR-100 補救試跑</b>",
|
||||
f"歷史: <code>{int(history.get('total') or 0)}</code> 次",
|
||||
(
|
||||
"上次: "
|
||||
f"<code>{html.escape(str(latest.get('mode') or 'unknown'))}</code> / "
|
||||
f"<code>{html.escape(str(latest.get('verification_result_preview') or 'unknown'))}</code>"
|
||||
),
|
||||
(
|
||||
"MCP: "
|
||||
f"<code>{html.escape(str(agent))}/{html.escape(str(tool))}</code> / "
|
||||
f"<code>{html.escape(str(scope))}</code>"
|
||||
),
|
||||
(
|
||||
"寫入: "
|
||||
f"incident <code>{html.escape(str(writes_incident))}</code> / "
|
||||
f"auto-repair <code>{html.escape(str(writes_auto_repair))}</code>"
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _sanitize_telegram_error(text: str) -> str:
    """Mask the token in Telegram Bot URLs so exception strings do not leak it into logs / traces."""
    redacted = _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
    return redacted
|
||||
@@ -5276,6 +5310,23 @@ class TelegramGateway:
|
||||
+ html.escape(", ".join(mismatch_codes[:4]))
|
||||
)
|
||||
|
||||
try:
|
||||
from src.services.adr100_remediation_service import (
|
||||
get_adr100_remediation_service,
|
||||
)
|
||||
|
||||
remediation_history = await get_adr100_remediation_service().history(
|
||||
limit=5,
|
||||
incident_id=incident_id,
|
||||
)
|
||||
lines += _format_remediation_history_lines(remediation_history)
|
||||
except Exception as remediation_exc:
|
||||
logger.warning(
|
||||
"incident_detail_remediation_history_summary_failed",
|
||||
incident_id=incident_id,
|
||||
error=str(remediation_exc),
|
||||
)
|
||||
|
||||
try:
|
||||
from src.services.awooop_truth_chain_service import fetch_truth_chain
|
||||
|
||||
@@ -5298,6 +5349,23 @@ class TelegramGateway:
|
||||
error=str(truth_exc),
|
||||
)
|
||||
|
||||
try:
|
||||
from src.services.adr100_remediation_service import (
|
||||
get_adr100_remediation_service,
|
||||
)
|
||||
|
||||
remediation_history = await get_adr100_remediation_service().history(
|
||||
limit=5,
|
||||
incident_id=incident_id,
|
||||
)
|
||||
lines += _format_remediation_history_lines(remediation_history)
|
||||
except Exception as remediation_exc:
|
||||
logger.warning(
|
||||
"incident_history_remediation_summary_failed",
|
||||
incident_id=incident_id,
|
||||
error=str(remediation_exc),
|
||||
)
|
||||
|
||||
await self.send_notification("\n".join(lines))
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -54,10 +54,42 @@ class _FakeVerifier:
|
||||
class _FakeAlertOperationLogRepository:
|
||||
def __init__(self) -> None:
|
||||
self.calls: list[dict[str, Any]] = []
|
||||
self.records: list[Any] = []
|
||||
|
||||
async def append(self, event_type: str, **kwargs: Any):
|
||||
self.calls.append({"event_type": event_type, **kwargs})
|
||||
return type("AlertOperationRecord", (), {"id": "aol-1"})()
|
||||
record = type(
|
||||
"AlertOperationRecord",
|
||||
(),
|
||||
{
|
||||
"id": f"aol-{len(self.records) + 1}",
|
||||
"incident_id": kwargs.get("incident_id"),
|
||||
"auto_repair_id": kwargs.get("auto_repair_id"),
|
||||
"event_type": event_type,
|
||||
"actor": kwargs.get("actor"),
|
||||
"success": kwargs.get("success"),
|
||||
"context": kwargs.get("context") or {},
|
||||
"created_at": datetime(2026, 5, 14, 14, 45, len(self.records), tzinfo=timezone.utc),
|
||||
},
|
||||
)()
|
||||
self.records.append(record)
|
||||
return record
|
||||
|
||||
async def list_recent(
|
||||
self,
|
||||
limit: int = 50,
|
||||
offset: int = 0,
|
||||
event_type: str | None = None,
|
||||
incident_id: str | None = None,
|
||||
):
|
||||
rows = [
|
||||
record
|
||||
for record in self.records
|
||||
if (event_type is None or record.event_type == event_type)
|
||||
and (incident_id is None or record.incident_id == incident_id)
|
||||
]
|
||||
rows = sorted(rows, key=lambda record: record.created_at, reverse=True)
|
||||
return rows[offset:offset + limit], len(rows)
|
||||
|
||||
|
||||
class _FakeTimelineService:
|
||||
@@ -242,6 +274,29 @@ async def test_dry_run_records_alert_operation_and_timeline_history():
|
||||
assert timeline.calls[0]["actor_role"] == "replay"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_history_lists_dry_run_records_grouped_by_work_item():
    """A recorded dry run is listed by history() and grouped by work item."""
    work_item_id = "verification:INC-20260514-TEST01:are-1"
    alert_repo = _FakeAlertOperationLogRepository()
    svc = _service(
        item=_queue_item(),
        alert_operation_log_repository=alert_repo,
        record_history=True,
    )

    await svc.dry_run(work_item_id)
    history = await svc.history(limit=10)

    assert history["schema_version"] == "adr100_remediation_history_v1"
    assert history["total"] == 1

    # The single item carries the MCP route of the dry run.
    latest_item = history["items"][0]
    assert latest_item["work_item_id"] == work_item_id
    assert latest_item["agent_id"] == "auto_repair_executor"
    assert latest_item["tool_name"] == "ssh_diagnose"
    assert latest_item["required_scope"] == "read"
    assert latest_item["writes_incident_state"] is False

    # The per-work-item summary reflects the same run.
    summary = history["by_work_item"][0]
    assert summary["count"] == 1
    assert summary["latest_tool_name"] == "ssh_diagnose"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_missing_work_item_raises_not_found():
|
||||
svc = _service(item=_queue_item())
|
||||
@@ -261,6 +316,24 @@ def test_ai_slo_remediation_endpoints(monkeypatch):
|
||||
async def dry_run(self, work_item_id: str, mode: str = "auto") -> dict[str, Any]:
|
||||
return {"work_item_id": work_item_id, "mode": mode, "executed": True}
|
||||
|
||||
async def history(
|
||||
self,
|
||||
*,
|
||||
limit: int = 50,
|
||||
incident_id: str | None = None,
|
||||
work_item_id: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"schema_version": "adr100_remediation_history_v1",
|
||||
"limit": limit,
|
||||
"filters": {
|
||||
"incident_id": incident_id,
|
||||
"work_item_id": work_item_id,
|
||||
},
|
||||
"items": [],
|
||||
"by_work_item": [],
|
||||
}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"src.api.v1.ai_slo.get_adr100_remediation_service",
|
||||
lambda: _FakeService(),
|
||||
@@ -275,8 +348,15 @@ def test_ai_slo_remediation_endpoints(monkeypatch):
|
||||
"/api/v1/ai/slo/remediation/dry-run",
|
||||
json={"work_item_id": "verification:INC:are-1", "mode": "replay"},
|
||||
)
|
||||
history = client.get(
|
||||
"/api/v1/ai/slo/remediation/history",
|
||||
params={"limit": 10, "work_item_id": "verification:INC:are-1"},
|
||||
)
|
||||
|
||||
assert preview.status_code == 200
|
||||
assert preview.json()["mode"] == "reverify"
|
||||
assert dry_run.status_code == 200
|
||||
assert dry_run.json()["executed"] is True
|
||||
assert history.status_code == 200
|
||||
assert history.json()["schema_version"] == "adr100_remediation_history_v1"
|
||||
assert history.json()["filters"]["work_item_id"] == "verification:INC:are-1"
|
||||
|
||||
@@ -1418,6 +1418,7 @@
|
||||
"dryRunLoading": "Running",
|
||||
"dryRunResult": "{mode}; preview {result}; tools {tools}",
|
||||
"dryRunHistoryRecorded": "History recorded",
|
||||
"dryRunHistorySummary": "History {count}x; last {time}; {route}",
|
||||
"dryRunBlocked": "Dry run blocked",
|
||||
"dryRunError": "Dry run failed",
|
||||
"state": {
|
||||
|
||||
@@ -1419,6 +1419,7 @@
|
||||
"dryRunLoading": "試跑中",
|
||||
"dryRunResult": "{mode};預覽 {result};工具 {tools}",
|
||||
"dryRunHistoryRecorded": "已寫入歷史",
|
||||
"dryRunHistorySummary": "歷史 {count} 次;上次 {time};{route}",
|
||||
"dryRunBlocked": "試跑未放行",
|
||||
"dryRunError": "試跑失敗",
|
||||
"state": {
|
||||
|
||||
@@ -164,6 +164,24 @@ interface RemediationDryRunResponse {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Per-work-item aggregate returned in `by_work_item` by the remediation
 * history endpoint. The `latest_*` fields describe the most recent dry run
 * recorded for the work item.
 */
interface RemediationHistoryWorkItemSummary {
  work_item_id?: string | null
  incident_id?: string | null
  /** Number of dry-run records counted for this work item. */
  count: number
  latest_at?: string | null
  /** Event type of the latest record, as emitted by the backend summary. */
  latest_event_type?: string | null
  /** Success flag of the latest record, as emitted by the backend summary. */
  latest_success?: boolean | null
  latest_preview?: string | null
  latest_mode?: string | null
  latest_agent_id?: string | null
  latest_tool_name?: string | null
  required_scope?: string | null
}
|
||||
|
||||
/**
 * Response of `GET /api/v1/ai/slo/remediation/history`.
 * Every field is optional so a partial payload still type-checks.
 */
interface RemediationHistoryResponse {
  schema_version?: string
  /** Number of items in this response (bounded by `limit`). */
  total?: number
  /** Effective limit applied by the backend. */
  limit?: number
  /** Filters the backend applied, echoed back. */
  filters?: {
    incident_id?: string | null
    work_item_id?: string | null
  }
  by_work_item?: RemediationHistoryWorkItemSummary[]
}
|
||||
|
||||
interface RemediationActionState {
|
||||
status: 'loading' | 'done' | 'error'
|
||||
data?: RemediationDryRunResponse
|
||||
@@ -255,6 +273,34 @@ async function requestRemediationDryRun(workItemId: string): Promise<Remediation
|
||||
return response.json()
|
||||
}
|
||||
|
||||
/**
 * Fetch the remediation dry-run history (up to 80 items) from the API.
 * Rejects with `history_failed:<status>` when the response is not OK.
 */
async function requestRemediationHistory(): Promise<RemediationHistoryResponse> {
  const query = new URLSearchParams({ limit: '80' })
  const url = `${API_BASE}/api/v1/ai/slo/remediation/history?${query}`
  const response = await fetch(url)
  if (response.ok) {
    return response.json()
  }
  throw new Error(`history_failed:${response.status}`)
}
|
||||
|
||||
/**
 * Format a timestamp as a short month/day hour:minute string in the
 * runtime's default locale. Empty input yields `--`; input that cannot be
 * parsed as a date is returned unchanged.
 */
function formatShortDateTime(value?: string | null): string {
  if (!value) {
    return '--'
  }
  const parsed = new Date(value)
  // Show unparseable input verbatim instead of "Invalid Date".
  if (Number.isNaN(parsed.getTime())) {
    return value
  }
  const shortFormat: Intl.DateTimeFormatOptions = {
    month: '2-digit',
    day: '2-digit',
    hour: '2-digit',
    minute: '2-digit',
  }
  return parsed.toLocaleString(undefined, shortFormat)
}
|
||||
|
||||
/**
 * Build the route label for a history summary: `agent/tool` when either part
 * is present, otherwise the required scope, otherwise `--`.
 */
function historyRouteLabel(summary: RemediationHistoryWorkItemSummary): string {
  const routeParts = [summary.latest_agent_id, summary.latest_tool_name].filter(Boolean)
  const route = routeParts.join('/')
  const label = route || summary.required_scope || '--'
  return compactLabel(label, '--')
}
|
||||
|
||||
function buildMetrics(api: SloApiResponse): SloMetric[] {
|
||||
const adr100Metrics = api.adr100?.metrics
|
||||
if (adr100Metrics?.length) {
|
||||
@@ -312,6 +358,7 @@ function buildMetrics(api: SloApiResponse): SloMetric[] {
|
||||
function VerificationCoveragePanel({ coverage }: { coverage?: Adr100VerificationCoverage }) {
|
||||
const t = useTranslations('governance.slo.coverage')
|
||||
const [actionState, setActionState] = useState<Record<string, RemediationActionState>>({})
|
||||
const [remediationHistory, setRemediationHistory] = useState<RemediationHistoryResponse | null>(null)
|
||||
const color = coverageTone(coverage?.status)
|
||||
const rows = [
|
||||
{ label: t('totalAuto'), value: String(coverage?.total_auto ?? '--') },
|
||||
@@ -322,6 +369,28 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
const failureBreakdown = coverage?.non_success_breakdown?.by_failure_class ?? []
|
||||
const recentFindings = coverage?.recent_non_success ?? []
|
||||
const remediationQueue = coverage?.remediation_queue
|
||||
const remediationQueueTotal = remediationQueue?.total ?? 0
|
||||
const historyByWorkItem = new Map(
|
||||
(remediationHistory?.by_work_item ?? [])
|
||||
.filter(item => item.work_item_id)
|
||||
.map(item => [item.work_item_id as string, item]),
|
||||
)
|
||||
|
||||
useEffect(() => {
|
||||
let cancelled = false
|
||||
if (remediationQueueTotal === 0) {
|
||||
setRemediationHistory(null)
|
||||
return () => { cancelled = true }
|
||||
}
|
||||
requestRemediationHistory()
|
||||
.then(data => {
|
||||
if (!cancelled) setRemediationHistory(data)
|
||||
})
|
||||
.catch(() => {
|
||||
if (!cancelled) setRemediationHistory(null)
|
||||
})
|
||||
return () => { cancelled = true }
|
||||
}, [remediationQueueTotal])
|
||||
|
||||
const handleDryRun = async (workItemId: string) => {
|
||||
setActionState(prev => ({
|
||||
@@ -334,6 +403,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
...prev,
|
||||
[workItemId]: { status: 'done', data },
|
||||
}))
|
||||
requestRemediationHistory()
|
||||
.then(setRemediationHistory)
|
||||
.catch(() => undefined)
|
||||
} catch {
|
||||
setActionState(prev => ({
|
||||
...prev,
|
||||
@@ -431,7 +503,9 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
</div>
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 7 }}>
|
||||
{(remediationQueue.items ?? []).slice(0, 4).map(item => (
|
||||
{(remediationQueue.items ?? []).slice(0, 4).map(item => {
|
||||
const historySummary = historyByWorkItem.get(item.work_item_id)
|
||||
return (
|
||||
<div key={item.work_item_id} style={{
|
||||
display: 'grid',
|
||||
gridTemplateColumns: 'minmax(130px, 0.8fr) minmax(180px, 1fr) minmax(150px, 0.9fr) minmax(150px, 0.8fr)',
|
||||
@@ -528,6 +602,21 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
{historySummary && (
|
||||
<div style={{
|
||||
fontFamily: "'DM Mono', monospace",
|
||||
fontSize: 9,
|
||||
color: '#4b5563',
|
||||
lineHeight: 1.4,
|
||||
overflowWrap: 'anywhere',
|
||||
}}>
|
||||
{t('dryRunHistorySummary', {
|
||||
count: historySummary.count,
|
||||
time: formatShortDateTime(historySummary.latest_at),
|
||||
route: historyRouteLabel(historySummary),
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
{actionState[item.work_item_id]?.status === 'error' && (
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 9, color: '#FF3300', lineHeight: 1.4 }}>
|
||||
{t('dryRunError')}
|
||||
@@ -535,7 +624,8 @@ function VerificationCoveragePanel({ coverage }: { coverage?: Adr100Verification
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
@@ -1,3 +1,30 @@
|
||||
## 2026-05-14 | T27 Remediation history read model,前端與 Telegram 可看見試跑重複次數與 MCP 路徑
|
||||
|
||||
**背景**:T26 已讓 ADR-100 remediation dry-run 寫入 `alert_operation_log` 與 `timeline_events`,但 governance 頁重新整理後仍看不到某筆補救工作過去試跑幾次、上次跑到哪個 preview、是否有用 MCP、是否仍只讀。Telegram 詳情 / 歷史也還沒有把這段 dry-run history 明確帶出。
|
||||
|
||||
**修正**:
|
||||
- `Adr100RemediationService.history()` 新增 `adr100_remediation_history_v1` read model,從既有 `alert_operation_log` 讀 `adr100_remediation_dry_run_history_v1` context,不新增資料表。
|
||||
- 新增 `GET /api/v1/ai/slo/remediation/history`,支援 `limit / incident_id / work_item_id`,回傳 `items` 與 `by_work_item` 聚合,包含 count、latest preview、agent、tool、scope、writes flags。
|
||||
- `/governance` 補救工作佇列會讀 history endpoint;每筆工作顯示「歷史 N 次;上次時間;agent/tool」,點試跑成功後會重新整理 history,不只依賴當次 UI state。
|
||||
- Telegram `detail:{incident_id}` 與 `history:{incident_id}` 會補上 `ADR-100 補救試跑` 摘要,包含歷史次數、上次 mode/preview、MCP agent/tool/scope、是否寫 incident/auto-repair 狀態。
|
||||
|
||||
**本地驗證**:
|
||||
- `python -m py_compile apps/api/src/services/adr100_remediation_service.py apps/api/src/api/v1/ai_slo.py apps/api/src/services/telegram_gateway.py apps/api/tests/test_adr100_remediation_service.py`:pass。
|
||||
- `ruff check --select F,E9 src/services/adr100_remediation_service.py src/api/v1/ai_slo.py src/services/telegram_gateway.py tests/test_adr100_remediation_service.py`:pass。
|
||||
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test python -m pytest tests/test_adr100_remediation_service.py tests/test_adr100_slo_status_service.py tests/test_ai_governance_endpoints.py -q`:36 passed。
|
||||
- i18n JSON parse / `git diff --check`:pass。
|
||||
- `pnpm --filter @awoooi/web typecheck`:pass。
|
||||
- `pnpm --dir apps/web exec next lint --file src/app/[locale]/governance/tabs/slo-tab.tsx`:pass。
|
||||
- `NEXT_PUBLIC_API_URL=https://awoooi.wooo.work pnpm --filter @awoooi/web build`:pass。
|
||||
|
||||
**推版與 production 驗證**:
|
||||
- 待 T27 commit 推 Gitea main 後驗證。
|
||||
|
||||
**目前整體進度**:
|
||||
- Alertmanager 低風險自動修復主線:約 98%。
|
||||
- 完整 AI 自動化管理產品化:約 92%。
|
||||
- T27 補上「試跑歷史」read model 與 UI/Telegram 可讀摘要。下一段應把 Incident 詳情頁的 stage event 展開成更完整的工作鏈路,而不是只顯示壓縮 ascii timeline。
|
||||
|
||||
## 2026-05-14 | T26 Remediation dry-run 寫入 history,試跑結果不再只停在前端暫存
|
||||
|
||||
**背景**:T25 已讓 Operator 能在 `/governance` 補救佇列點「試跑」,但結果只存在當次 UI state。這仍無法完全回答「這次 dry-run 是否真的發生、跑到哪個流程、MCP 有沒有用到、後續是否能從 Incident history 回看」。
|
||||
|
||||
Reference in New Issue
Block a user