feat(adr100): bridge playbook authoring approvals
All checks were successful
CD Pipeline / tests (push) Successful in 1m20s
Code Review / ai-code-review (push) Successful in 13s
CD Pipeline / build-and-deploy (push) Successful in 7m44s
CD Pipeline / post-deploy-checks (push) Successful in 2m49s

This commit is contained in:
Your Name
2026-06-01 20:48:24 +08:00
parent f0daaccbba
commit 16775bb4fa
9 changed files with 784 additions and 7 deletions

View File

@@ -48,6 +48,13 @@ class RemediationDryRunRequest(BaseModel):
mode: RemediationMode = "auto"
class RemediationApprovalRequest(BaseModel):
    """Request body for an ADR-100 record-only PlayBook authoring approval.

    Attributes are documented inline; the model only carries the work item
    reference and the remediation mode to forward to the service.
    """

    # Identifier of the remediation work item; must be a non-empty string.
    work_item_id: str = Field(..., min_length=1)
    # Remediation mode forwarded to the service; defaults to "approval".
    mode: RemediationMode = "approval"
@router.get("/ai/slo")
async def get_ai_slo(
force_refresh: bool = Query(False, description="忽略快取,強制重算"),
@@ -120,6 +127,21 @@ async def dry_run_ai_slo_remediation(request: RemediationDryRunRequest) -> dict:
raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
@router.post("/ai/slo/remediation/approval-request")
async def create_ai_slo_remediation_approval_request(
    request: RemediationApprovalRequest,
) -> dict:
    """Create a record-only approval request for ADR-100 PlayBook authoring.

    Forwards ``work_item_id`` and ``mode`` from the request body to the
    ADR-100 remediation service and returns the service payload unchanged.

    Raises:
        HTTPException: 404 with detail ``remediation_work_item_not_found``
            when the service raises ``RemediationNotFoundError``.
    """
    try:
        # Service lookup stays inside the ``try`` so the error mapping covers
        # exactly the same calls as before.
        service = get_adr100_remediation_service()
        result = await service.create_approval_request(
            request.work_item_id,
            request.mode,
        )
    except RemediationNotFoundError as exc:
        raise HTTPException(status_code=404, detail="remediation_work_item_not_found") from exc
    return result
@router.get("/ai/slo/remediation/history")
async def list_ai_slo_remediation_history(
limit: int = Query(50, ge=1, le=200),

View File

@@ -11,10 +11,18 @@ T25: remediation queue items are now actionable without mutating incident state:
from __future__ import annotations
import asyncio
from datetime import datetime, timedelta, timezone
from typing import Any, Literal, Protocol
import structlog
from src.models.approval import (
ApprovalRequestCreate,
BlastRadius,
DataImpact,
DryRunCheck,
RiskLevel,
)
from src.models.incident import Incident
from src.repositories.incident_repository import IncidentDBRepository
from src.services.adr100_slo_status_service import (
@@ -31,7 +39,7 @@ from src.services.post_execution_verifier import (
logger = structlog.get_logger(__name__)
RemediationMode = Literal["auto", "reverify", "replay", "ticket"]
RemediationMode = Literal["auto", "reverify", "replay", "ticket", "approval"]
_READY_STATUSES = {"ready_for_replay", "ready_for_reverify"}
_TICKET_STATUSES = {"needs_playbook_ticket"}
@@ -57,6 +65,7 @@ class Adr100RemediationService:
incident_repository: _IncidentRepository | None = None,
auto_repair_service: AutoRepairService | None = None,
verifier: PostExecutionVerifier | None = None,
approval_service: Any | None = None,
timeline_service: Any | None = None,
alert_operation_log_repository: Any | None = None,
record_history: bool = True,
@@ -65,6 +74,7 @@ class Adr100RemediationService:
self._incident_repository = incident_repository or IncidentDBRepository()
self._auto_repair_service = auto_repair_service or AutoRepairService()
self._verifier = verifier or get_post_execution_verifier()
self._approval_service = approval_service
self._timeline_service = timeline_service
self._alert_operation_log_repository = alert_operation_log_repository
self._record_history_enabled = record_history
@@ -116,6 +126,74 @@ class Adr100RemediationService:
return await self._dry_run_replay(item, incident, checks)
return await self._dry_run_reverify(item, incident, checks)
async def create_approval_request(
    self,
    work_item_id: str,
    mode: RemediationMode = "approval",
) -> dict[str, Any]:
    """Create (or reuse) a record-only approval for PlayBook authoring.

    Flow:
      1. Resolve the work item and the effective mode.
      2. Build the pre-check list (base checks, ticket-mode check,
         incident-loaded check).
      3. If the incident is missing or any check failed, return a blocked
         payload whose history is written through the dry-run recorder.
      4. Otherwise look up an existing approval by fingerprint (when the
         approval service offers that lookup) and create a new approval
         only when none was found.
      5. Return the result payload with its approval history attached.

    Args:
        work_item_id: Identifier of the remediation work item.
        mode: Requested remediation mode; defaults to ``"approval"``.

    Returns:
        The blocked payload or the approval result payload, each with a
        ``"history"`` entry added.
    """
    item = await self._find_work_item(work_item_id)
    selected_mode = _select_mode(item, mode)

    checks = _base_checks(item)
    checks.append(
        {
            "name": "playbook_authoring_ticket_required",
            "passed": selected_mode in {"ticket", "approval"},
            "detail": str(item.get("remediation_status") or "unknown"),
        }
    )

    incident = await self._load_incident(item)
    checks.append(
        {
            "name": "incident_loaded",
            "passed": incident is not None,
            "detail": item.get("incident_id") or "missing incident_id",
        }
    )

    if incident is None or not all(entry["passed"] for entry in checks):
        blocked = _approval_blocked_payload(item, selected_mode, checks)
        blocked["history"] = await self._record_dry_run_history(item, blocked)
        return blocked

    approval_request = _approval_request_for_item(item, incident, checks)

    approval_svc = self._approval_service
    if approval_svc is None:
        # Imported here, only when no approval service was injected.
        from src.services.approval_db import get_approval_service

        approval_svc = get_approval_service()

    fingerprint = _approval_fingerprint(item)

    # Best-effort dedupe: a failed lookup is logged and treated as "no
    # existing approval", so a new one is created below.
    existing = None
    if hasattr(approval_svc, "find_by_fingerprint"):
        try:
            existing = await approval_svc.find_by_fingerprint(fingerprint)
        except Exception as exc:
            logger.warning(
                "adr100_remediation_approval_dedupe_lookup_failed",
                fingerprint=fingerprint,
                error=str(exc),
            )

    if existing is not None:
        approval = existing
        approval_created = False
    elif hasattr(approval_svc, "create_approval_with_fingerprint"):
        approval = await approval_svc.create_approval_with_fingerprint(
            approval_request,
            fingerprint=fingerprint,
        )
        approval_created = True
    else:
        approval = await approval_svc.create_approval(approval_request)
        approval_created = True

    payload = _approval_result_payload(
        item=item,
        incident=incident,
        checks=checks,
        approval=approval,
        request=approval_request,
        approval_created=approval_created,
        fingerprint=fingerprint,
    )
    payload["history"] = await self._record_approval_history(item, payload)
    return payload
async def history(
self,
*,
@@ -136,7 +214,7 @@ class Adr100RemediationService:
repo = get_alert_operation_log_repository()
for event_type in ("PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED"):
for event_type in ("PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED", "APPROVAL_ESCALATED"):
try:
batch, _total = await repo.list_recent(
limit=fetch_limit,
@@ -157,7 +235,10 @@ class Adr100RemediationService:
items: list[dict[str, Any]] = []
for row in rows:
context = getattr(row, "context", None) or {}
if context.get("schema_version") != "adr100_remediation_dry_run_history_v1":
if context.get("schema_version") not in {
"adr100_remediation_dry_run_history_v1",
"adr100_remediation_approval_history_v1",
}:
continue
if work_item_id and context.get("work_item_id") != work_item_id:
continue
@@ -383,8 +464,86 @@ class Adr100RemediationService:
)
return history
async def _record_approval_history(
    self,
    item: dict[str, Any],
    payload: dict[str, Any],
) -> dict[str, Any]:
    """Record an approval request in the alert operation log and timeline.

    Both writes are best-effort: a failure in either one is logged as a
    warning and does not prevent the other write or raise to the caller.

    Args:
        item: Remediation work item the approval belongs to.
        payload: Approval result payload; ``approval_id`` is read from it
            and the history context is derived from it.

    Returns:
        ``{"recorded": False, "reason": "disabled"}`` when history recording
        is switched off; otherwise a dict with ``recorded``,
        ``alert_operation_id`` and ``timeline_event_id``. ``recorded`` is
        true when at least one of the two ids is truthy.
    """
    if not self._record_history_enabled:
        return {"recorded": False, "reason": "disabled"}

    incident_id = str(item.get("incident_id") or "")
    approval_id = str(payload.get("approval_id") or "")
    outcome: dict[str, Any] = {
        "recorded": False,
        "alert_operation_id": None,
        "timeline_event_id": None,
    }
    context = _approval_history_context(item, payload)

    # Write 1: alert operation log entry.
    try:
        alert_repo = self._alert_operation_log_repository
        if alert_repo is None:
            from src.repositories.alert_operation_log_repository import (
                get_alert_operation_log_repository,
            )

            alert_repo = get_alert_operation_log_repository()
        log_record = await alert_repo.append(
            "APPROVAL_ESCALATED",
            incident_id=incident_id or None,
            approval_id=approval_id or None,
            auto_repair_id=str(item.get("auto_repair_id") or "") or None,
            actor="adr100_remediation_service",
            action_detail="adr100_playbook_authoring_approval_requested",
            success=True,
            context=context,
        )
        if log_record is not None:
            outcome["alert_operation_id"] = getattr(log_record, "id", None)
    except Exception as exc:
        logger.warning(
            "adr100_remediation_approval_history_failed",
            incident_id=incident_id,
            approval_id=approval_id,
            error=str(exc),
        )

    # Write 2: timeline event.
    try:
        timeline_svc = self._timeline_service
        if timeline_svc is None:
            from src.services.approval_db import get_timeline_service

            timeline_svc = get_timeline_service()
        timeline_event = await timeline_svc.add_event(
            event_type="human",
            status="warning",
            title="ADR-100 PlayBook authoring approval requested",
            description=_approval_history_description(context),
            actor="adr100_remediation_service",
            actor_role="approval",
            approval_id=approval_id or None,
            incident_id=incident_id or None,
        )
        if timeline_event:
            outcome["timeline_event_id"] = timeline_event.get("id")
    except Exception as exc:
        logger.warning(
            "adr100_remediation_approval_timeline_failed",
            incident_id=incident_id,
            approval_id=approval_id,
            error=str(exc),
        )

    outcome["recorded"] = any(
        (outcome.get("alert_operation_id"), outcome.get("timeline_event_id"))
    )
    return outcome
def _select_mode(item: dict[str, Any], requested: RemediationMode) -> Literal["reverify", "replay", "ticket"]:
if requested == "approval":
return "ticket"
if requested in ("reverify", "replay"):
return requested
if requested == "ticket":
@@ -442,6 +601,14 @@ def _plan_for_item(item: dict[str, Any], mode: str) -> dict[str, Any]:
"writes": ["alert_operation_log", "timeline"],
"target_action": item.get("remediation_action"),
}
if mode == "approval":
return {
"step": "request_playbook_authoring_approval",
"agent_id": "openclaw_playbook_planner",
"required_scope": "record_only",
"writes": ["approval_records", "alert_operation_log", "timeline"],
"target_action": item.get("remediation_action"),
}
return {
"step": "validate_supported_executor_route_then_collect_current_state",
"agent_id": "auto_repair_executor",
@@ -475,6 +642,34 @@ def _dry_run_blocked_payload(
}
def _approval_blocked_payload(
    item: dict[str, Any],
    mode: str,
    checks: list[dict[str, Any]],
) -> dict[str, Any]:
    """Build the response payload for an approval request that was blocked.

    Args:
        item: Remediation work item; its ids are copied into the payload.
        mode: Value reported under ``requested_mode``.
        checks: Pre-check results, included as-is (same list object).

    Returns:
        A payload with ``allowed`` and every ``writes_*`` flag set to False,
        no approval attached, and the "approval" plan for the item.
    """
    payload: dict[str, Any] = {"schema_version": "adr100_remediation_approval_v1"}

    # Identity fields copied straight from the work item.
    for key in ("work_item_id", "incident_id", "auto_repair_id"):
        payload[key] = item.get(key)

    payload.update(
        {
            "mode": "approval",
            "requested_mode": mode,
            "allowed": False,
            "executed": False,
            "safety_level": "approval_record_only",
            "writes_incident_state": False,
            "writes_auto_repair_result": False,
            "writes_ticket": False,
            "writes_approval_record": False,
            "creates_external_ticket": False,
            "checks": checks,
            "verification_result_preview": "blocked",
            "approval": None,
            "approval_id": None,
        }
    )
    payload["plan"] = _plan_for_item(item, "approval")
    return payload
def _dry_run_result_payload(
*,
item: dict[str, Any],
@@ -504,6 +699,139 @@ def _dry_run_result_payload(
}
def _approval_request_for_item(
    item: dict[str, Any],
    incident: Incident,
    checks: list[dict[str, Any]],
) -> ApprovalRequestCreate:
    """Build the record-only approval request for a remediation work item.

    Args:
        item: Remediation work item (ids, playbook, alert name, action).
        incident: Loaded incident; its ``affected_services`` and
            ``incident_id`` are read.
        checks: Pre-check results, converted one-to-one into dry-run checks.

    Returns:
        An ``ApprovalRequestCreate`` with medium risk, a zero-pod blast
        radius, a 48-hour expiry and metadata marking the request as
        record-only (``execution_authorized`` is False).
    """
    ticket_preview = _ticket_preview_for_item(item, incident)

    # Drop falsy service entries; fall back to the alert name when none remain.
    services = list(filter(None, incident.affected_services or []))
    if not services:
        services = [str(item.get("alertname") or "unknown_alert")]

    playbook_id = str(item.get("playbook_id") or "unknown_playbook")
    work_item_id = str(item.get("work_item_id") or "")

    action = f"PLAYBOOK_AUTHORING_RECORD_ONLY: ADR-100 promote diagnostic PlayBook {playbook_id}"
    scope_notice = (
        "Approval scope: record-only PlayBook authoring. Signing this request "
        "does not execute a runtime repair, does not resolve the incident, and "
        "does not mark the old diagnostic run as verified_success."
    )
    description = "\n\n".join(
        (
            f"{ticket_preview.get('title')}",
            f"{ticket_preview.get('body_preview')}",
            scope_notice,
        )
    )

    blast_radius = BlastRadius(
        affected_pods=0,
        estimated_downtime="0",
        related_services=services[:6],
        data_impact=DataImpact.READ_ONLY,
    )

    dry_run_checks = []
    for entry in checks:
        dry_run_checks.append(
            DryRunCheck(
                name=str(entry.get("name") or "check"),
                passed=bool(entry.get("passed")),
                message=str(entry.get("detail") or ""),
            )
        )

    metadata = {
        "schema_version": "adr100_playbook_authoring_approval_v1",
        "approval_kind": "adr100_playbook_authoring",
        "execution_kind": "playbook_authoring_record_only",
        "execution_authorized": False,
        "repair_attempted": False,
        "repair_executed": False,
        "work_item_id": work_item_id,
        "auto_repair_id": item.get("auto_repair_id"),
        "source": "adr100.verification_coverage.remediation_queue",
        "ticket_preview": ticket_preview,
        "target_action": item.get("remediation_action"),
        "required_scope": "record_only",
        "next_step": "author_mutating_repair_step",
        "playbook_id": playbook_id,
        "flywheel_node": "approval",
        "agent_id": "openclaw_playbook_planner",
        "mcp_gate": "not_required_record_only",
    }

    # The placeholder playbook id is never reported as a matched playbook.
    matched_playbook_id = None if playbook_id == "unknown_playbook" else playbook_id

    return ApprovalRequestCreate(
        action=action,
        description=description[:4000],
        risk_level=RiskLevel.MEDIUM,
        blast_radius=blast_radius,
        dry_run_checks=dry_run_checks,
        requested_by="adr100_remediation_service",
        expires_at=datetime.now(timezone.utc) + timedelta(hours=48),
        metadata=metadata,
        incident_id=str(item.get("incident_id") or incident.incident_id),
        matched_playbook_id=matched_playbook_id,
    )
def _approval_fingerprint(item: dict[str, Any]) -> str:
    """Return the dedupe fingerprint for a work item's authoring approval.

    The fingerprint is ``adr100_playbook_authoring:<basis>`` truncated to
    240 characters. The basis is the work item id when it is non-empty;
    otherwise it is ``<incident_id>:<playbook_id>:<remediation_action>``
    with missing parts rendered as empty strings.
    """
    prefix = "adr100_playbook_authoring:"

    work_item = str(item.get("work_item_id") or "")
    if work_item:
        return f"{prefix}{work_item}"[:240]

    # No work item id: fall back to the incident/playbook/action triple.
    incident = str(item.get("incident_id") or "")
    playbook = str(item.get("playbook_id") or "")
    action = item.get("remediation_action") or ""
    return f"{prefix}{incident}:{playbook}:{action}"[:240]
def _approval_result_payload(
    *,
    item: dict[str, Any],
    incident: Incident,
    checks: list[dict[str, Any]],
    approval: Any,
    request: ApprovalRequestCreate,
    approval_created: bool,
    fingerprint: str,
) -> dict[str, Any]:
    """Build the response payload for an approval that was created or reused.

    Args:
        item: Remediation work item the approval belongs to.
        incident: Loaded incident; used as the incident-id fallback and for
            rebuilding the ticket preview when the request has none.
        checks: Pre-check results, included as-is.
        approval: Approval object; attributes are read defensively with
            ``getattr`` so missing ones become ``None``.
        request: The approval request; its metadata supplies the ticket
            preview when present.
        approval_created: True when a new approval record was written;
            False when an existing one was reused (reported as deduplicated).
        fingerprint: Dedupe fingerprint echoed in the payload.

    Returns:
        A payload with ``allowed=True``, ``executed=False`` and a summary of
        the approval under ``"approval"``.
    """

    def _value_or_raw(attr: str) -> Any:
        # Prefer an enum-style ``.value``; otherwise use the attribute itself.
        return getattr(getattr(approval, attr, None), "value", None) or getattr(
            approval, attr, None
        )

    ticket_preview = (request.metadata or {}).get("ticket_preview")
    if not ticket_preview:
        ticket_preview = _ticket_preview_for_item(item, incident)

    approval_id = str(getattr(approval, "id", "") or "")
    approval_status = _value_or_raw("status")
    risk_level = _value_or_raw("risk_level")

    approval_summary = {
        "id": approval_id or None,
        "status": str(approval_status or ""),
        "risk_level": str(risk_level or ""),
        "required_signatures": getattr(approval, "required_signatures", None),
        "current_signatures": getattr(approval, "current_signatures", None),
        "requested_by": getattr(approval, "requested_by", None),
        "incident_id": getattr(approval, "incident_id", None),
        "matched_playbook_id": getattr(approval, "matched_playbook_id", None),
    }

    return {
        "schema_version": "adr100_remediation_approval_v1",
        "work_item_id": item.get("work_item_id"),
        "incident_id": item.get("incident_id") or incident.incident_id,
        "auto_repair_id": item.get("auto_repair_id"),
        "mode": "approval",
        "allowed": True,
        "executed": False,
        "safety_level": "approval_record_only",
        "writes_incident_state": False,
        "writes_auto_repair_result": False,
        "writes_ticket": False,
        "writes_approval_record": approval_created,
        "creates_external_ticket": False,
        "deduplicated": not approval_created,
        "fingerprint": fingerprint,
        "checks": checks,
        "verification_result_preview": "approval_requested",
        "approval_id": approval_id or None,
        "approval": approval_summary,
        "ticket_preview": ticket_preview,
        "plan": _plan_for_item(item, "approval"),
    }
def _summarize_post_state(post_state: dict[str, Any]) -> dict[str, Any]:
keys = sorted(post_state.keys())
return {
@@ -537,6 +865,33 @@ def _history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str,
}
def _approval_history_context(item: dict[str, Any], payload: dict[str, Any]) -> dict[str, Any]:
    """Assemble the history context stored for an approval request.

    The context carries a fixed ``schema_version``, four fields copied from
    the work item and the remaining fields copied from the approval payload.
    Keys missing from either source are stored as ``None``.
    """
    item_keys = ("work_item_id", "auto_repair_id", "playbook_id", "alertname")
    payload_keys = (
        "mode",
        "allowed",
        "executed",
        "safety_level",
        "writes_incident_state",
        "writes_auto_repair_result",
        "writes_ticket",
        "writes_approval_record",
        "creates_external_ticket",
        "deduplicated",
        "fingerprint",
        "ticket_preview",
        "approval",
        "approval_id",
        "plan",
        "verification_result_preview",
        "checks",
    )

    context: dict[str, Any] = {"schema_version": "adr100_remediation_approval_history_v1"}
    context.update((key, item.get(key)) for key in item_keys)
    context.update((key, payload.get(key)) for key in payload_keys)
    return context
def _timeline_status(payload: dict[str, Any]) -> str:
if not payload.get("allowed"):
return "warning"
@@ -559,6 +914,18 @@ def _history_description(context: dict[str, Any]) -> str:
)[:500]
def _approval_history_description(context: dict[str, Any]) -> str:
    """Render a one-line summary of an approval history context.

    The line lists the approval id (falling back to the context's
    ``approval_id``, then ``unknown``), its status, the verification preview
    and three ``writes_*`` flags, separated by single spaces and truncated
    to 500 characters.
    """
    approval = context.get("approval") or {}
    approval_ref = approval.get("id") or context.get("approval_id") or "unknown"
    status = approval.get("status") or "unknown"

    fragments = (
        f"approval={approval_ref}",
        f"status={status}",
        f"preview={context.get('verification_result_preview')}",
        f"writes_approval={context.get('writes_approval_record')}",
        f"writes_incident={context.get('writes_incident_state')}",
        f"writes_auto_repair={context.get('writes_auto_repair_result')}",
    )
    return " ".join(fragments)[:500]
def _record_created_at(record: Any) -> str:
value = getattr(record, "created_at", None)
if hasattr(value, "isoformat"):
@@ -569,6 +936,7 @@ def _record_created_at(record: Any) -> str:
def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
route = context.get("mcp_route") or {}
post_state = context.get("post_state_summary") or {}
approval = context.get("approval") or {}
return {
"id": str(getattr(record, "id", "")),
"incident_id": getattr(record, "incident_id", None),
@@ -594,7 +962,13 @@ def _history_item(record: Any, context: dict[str, Any]) -> dict[str, Any]:
"writes_incident_state": context.get("writes_incident_state"),
"writes_auto_repair_result": context.get("writes_auto_repair_result"),
"writes_ticket": context.get("writes_ticket"),
"writes_approval_record": context.get("writes_approval_record"),
"creates_external_ticket": context.get("creates_external_ticket"),
"approval_id": context.get("approval_id") or approval.get("id"),
"approval_status": approval.get("status"),
"approval_risk_level": approval.get("risk_level"),
"deduplicated": context.get("deduplicated"),
"fingerprint": context.get("fingerprint"),
"ticket_preview": context.get("ticket_preview"),
"plan": context.get("plan"),
"checks": context.get("checks") or [],

View File

@@ -72,6 +72,15 @@ _SSH_GATEWAY_TOOL_SCOPES: dict[str, str] = {
}
def _is_playbook_authoring_record_only_approval(approval: ApprovalRequest) -> bool:
    """Tell whether an approval is an ADR-100 record-only authoring approval.

    True only when the approval's metadata marks it as kind
    ``adr100_playbook_authoring`` with execution kind
    ``playbook_authoring_record_only`` and ``execution_authorized`` is
    exactly ``False`` (a missing or ``None`` value does not qualify).
    A missing or empty ``metadata`` attribute is treated as an empty dict.
    """
    metadata = dict(getattr(approval, "metadata", None) or {})
    expected_markers = {
        "approval_kind": "adr100_playbook_authoring",
        "execution_kind": "playbook_authoring_record_only",
    }
    markers_match = all(
        metadata.get(key) == expected for key, expected in expected_markers.items()
    )
    return markers_match and metadata.get("execution_authorized") is False
class ApprovalExecutionService:
"""
授權執行服務 - 編排整個執行流程
@@ -254,6 +263,70 @@ class ApprovalExecutionService:
pass
if operation_type is None or resource_name is None:
if _is_playbook_authoring_record_only_approval(approval):
metadata = dict(getattr(approval, "metadata", None) or {})
logger.info(
"background_execution_playbook_authoring_record_only",
approval_id=str(approval.id),
action=approval.action,
incident_id=getattr(approval, "incident_id", None),
work_item_id=metadata.get("work_item_id"),
playbook_id=metadata.get("playbook_id"),
)
await service.update_execution_status(
approval.id,
success=True,
execution_kind="playbook_authoring_record_only",
repair_executed=False,
repair_attempted=False,
)
await timeline.add_event(
event_type="exec",
status="success",
title="PlayBook authoring approval recorded (no runtime repair)",
description=(
"Approval scope is record-only. No incident state change, "
"no runtime repair, no verified_success promotion."
),
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
incident_id=approval.incident_id,
)
await self._log_aol_completed(
op_id=_aol_op_id,
status="success",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={
"reason": "PLAYBOOK_AUTHORING_RECORD_ONLY",
"execution_kind": "playbook_authoring_record_only",
"repair_executed": False,
"repair_attempted": False,
"work_item_id": metadata.get("work_item_id"),
"playbook_id": metadata.get("playbook_id"),
},
)
await self._log_alert_execution_completed(
approval,
success=True,
execution_kind="playbook_authoring_record_only",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={
"reason": "PLAYBOOK_AUTHORING_RECORD_ONLY",
"repair_executed": False,
"repair_attempted": False,
},
)
await self._push_execution_result_to_alert(
approval,
success=True,
error=None,
execution_kind="playbook_authoring_record_only",
repair_executed=False,
repair_attempted=False,
)
return True
# 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗
# NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED
# 污染 auto_execute 成功率 KPI (MASTER §7.1 #11)