From 1ae8f809af5cf2fb9f5da1809e90cc8ee79a92f7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 4 Jun 2026 15:26:56 +0800 Subject: [PATCH] fix(api): record approval gate timeline events --- apps/api/src/services/approval_db.py | 145 ++++++++++++++++++ .../src/services/incident_approval_service.py | 2 + .../api/tests/test_approval_timeline_event.py | 60 ++++++++ .../tests/test_awooop_truth_chain_service.py | 26 ++++ docs/LOGBOOK.md | 36 +++++ ...-04-navigation-and-ai-flywheel-workplan.md | 12 +- 6 files changed, 277 insertions(+), 4 deletions(-) create mode 100644 apps/api/tests/test_approval_timeline_event.py diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py index 40a26320..71b2a554 100644 --- a/apps/api/src/services/approval_db.py +++ b/apps/api/src/services/approval_db.py @@ -175,6 +175,149 @@ def approval_request_to_record_data( } +def _record_value(value: Any) -> str: + if hasattr(value, "value"): + value = value.value + text = str(value or "").strip() + if "." in text: + text = text.rsplit(".", 1)[-1] + return text.lower() + + +def _record_int(value: Any) -> int: + try: + return int(value or 0) + except (TypeError, ValueError): + return 0 + + +def _approval_gate_stage(record: ApprovalRecord) -> str: + status = _record_value(record.status) + current = _record_int(getattr(record, "current_signatures", 0)) + required = _record_int(getattr(record, "required_signatures", 0)) + + if status == "pending": + return "approval_required" + if status == "approved" and current == 0 and required == 0: + return "approval_auto_approved" + if status == "approved": + return "approval_approved" + if status == "execution_success": + return "execution_verified" + if status == "execution_failed": + return "execution_failed" + if status in {"rejected", "expired"}: + return f"approval_{status}" + return "approval_status_recorded" + + +def _approval_gate_status(record: ApprovalRecord) -> str: + status = _record_value(record.status) + if status == "pending": + return "warning" + if status in {"approved", "execution_success"}: + return "success" + if status in {"rejected", "expired", "execution_failed"}: + return "error" + return "info" + + +def _approval_needs_human(record: ApprovalRecord) -> bool: + status = _record_value(record.status) + current = _record_int(getattr(record, "current_signatures", 0)) + required = _record_int(getattr(record, "required_signatures", 0)) + return status == "pending" and current < required + + +def _approval_next_action(record: ApprovalRecord) -> str: + status = _record_value(record.status) + if status == "pending": + return ( + "operator_approve_or_reject" + if _approval_needs_human(record) + else "execute_or_verify" + ) + if status == "approved": + return "execute_or_verify" + if status == "execution_success": + return "verify_or_close" + if status == "execution_failed": + return "manual_fix_or_rollback" + if status in {"rejected", "expired"}: + return "review_or_close" + return "review_status_chain" + + +def _approval_blocked_reason(record: ApprovalRecord) -> str: + status = _record_value(record.status) + if status == "pending" and _approval_needs_human(record): + return "waiting_for_required_signatures" + if status == "execution_failed": + return "execution_failed" + if status == "rejected": + return "operator_rejected" + if status == "expired": + return "approval_expired" + return "none" + + +def _approval_decision_mode(record: ApprovalRecord) -> str: + current = _record_int(getattr(record, "current_signatures", 0)) + required = _record_int(getattr(record, "required_signatures", 0)) + risk_level = _record_value(record.risk_level) + if _approval_needs_human(record) or current > 0: + return "manual" + if risk_level == "low" and required == 0: + return "auto" + return "manual" + + +def build_approval_created_timeline_event(record: ApprovalRecord) -> TimelineEvent: + """Create the raw audit rail event that mirrors a newly-created approval gate.""" + current = _record_int(getattr(record, "current_signatures", 0)) + required = _record_int(getattr(record, "required_signatures", 0)) + risk_level = _record_value(record.risk_level) + needs_human = _approval_needs_human(record) + stage = _approval_gate_stage(record) + next_action = _approval_next_action(record) + mode = _approval_decision_mode(record) + description = "; ".join( + [ + f"stage={stage}", + f"next_action={next_action}", + f"blocked_reason={_approval_blocked_reason(record)}", + f"auto_or_manual={mode}", + f"needs_human={'yes' if needs_human else 'no'}", + f"risk_level={risk_level}", + f"signatures={current}/{required}", + f"action={str(getattr(record, 'action', '') or '')[:240]}", + ] + ) + + title = ( + "Approval gate waiting for human decision" + if needs_human + else "Approval gate passed" + ) + return TimelineEvent( + event_type="human", + status=_approval_gate_status(record), + title=title, + description=description, + actor=getattr(record, "requested_by", None), + actor_role="approval_gate", + risk_level=risk_level, + approval_id=str(record.id), + incident_id=getattr(record, "incident_id", None), + ) + + +def add_approval_created_timeline_event(db: Any, record: ApprovalRecord) -> TimelineEvent: + event = build_approval_created_timeline_event(record) + db.add(event) + return event + + # ============================================================================= # Database Approval Service # ============================================================================= @@ -224,6 +367,7 @@ class ApprovalDBService: db.add(record) await db.flush() await db.refresh(record) + add_approval_created_timeline_event(db, record) logger.info( "approval_created_db", @@ -275,6 +419,7 @@ class ApprovalDBService: db.add(record) await db.flush() await db.refresh(record) + add_approval_created_timeline_event(db, record) logger.info( "approval_created_with_fingerprint", diff --git a/apps/api/src/services/incident_approval_service.py b/apps/api/src/services/incident_approval_service.py index 756cb59d..8defc0a1 100644 --- a/apps/api/src/services/incident_approval_service.py +++ b/apps/api/src/services/incident_approval_service.py @@ -36,6 +36,7 @@ from src.core.unit_of_work import UnitOfWork from src.db.models import ApprovalRecord, IncidentRecord from src.models.approval import ApprovalRequestCreate, ApprovalStatus from src.models.incident import IncidentStatus +from src.services.approval_db import add_approval_created_timeline_event if TYPE_CHECKING: from redis.asyncio import Redis @@ -167,6 +168,7 @@ class IncidentApprovalService: ) uow.session.add(approval_record) await uow.flush() # 取得 ID + add_approval_created_timeline_event(uow.session, approval_record) # 更新 Incident metadata 連結 Approval if link_metadata: diff --git a/apps/api/tests/test_approval_timeline_event.py b/apps/api/tests/test_approval_timeline_event.py new file mode 100644 index 00000000..069488e6 --- /dev/null +++ b/apps/api/tests/test_approval_timeline_event.py @@ -0,0 +1,60 @@ +from types import SimpleNamespace + +from src.models.approval import ApprovalStatus, RiskLevel +from src.services.approval_db import build_approval_created_timeline_event + + +def test_pending_approval_timeline_event_exposes_operator_gate() -> None: + record = SimpleNamespace( + id="approval-pending-1", + action="kubectl rollout restart deployment/api", + status=ApprovalStatus.PENDING, + risk_level=RiskLevel.MEDIUM, + required_signatures=1, + current_signatures=0, + requested_by="openclaw", + incident_id="INC-20260604-TEST01", + ) + + event = build_approval_created_timeline_event(record) + + assert event.event_type == "human" + assert event.status == "warning" + assert event.title == "Approval gate waiting for human decision" + assert event.actor == "openclaw" + assert event.actor_role == "approval_gate" + assert event.risk_level == "medium" + assert event.approval_id == "approval-pending-1" + assert event.incident_id == "INC-20260604-TEST01" + assert "stage=approval_required" in event.description + assert "next_action=operator_approve_or_reject" in event.description + assert "blocked_reason=waiting_for_required_signatures" in event.description + assert "auto_or_manual=manual" in event.description + assert "needs_human=yes" in event.description + assert "signatures=0/1" in event.description + + +def test_low_risk_auto_approved_timeline_event_exposes_auto_gate() -> None: + record = SimpleNamespace( + id="approval-auto-1", + action="collect read-only diagnostics", + status=ApprovalStatus.APPROVED, + risk_level=RiskLevel.LOW, + required_signatures=0, + current_signatures=0, + requested_by="openclaw", + incident_id="INC-20260604-TEST02", + ) + + event = build_approval_created_timeline_event(record) + + assert event.event_type == "human" + assert event.status == "success" + assert event.title == "Approval gate passed" + assert event.risk_level == "low" + assert "stage=approval_auto_approved" in event.description + assert "next_action=execute_or_verify" in event.description + assert "blocked_reason=none" in event.description + assert "auto_or_manual=auto" in event.description + assert "needs_human=no" in event.description + assert "signatures=0/0" in event.description diff --git a/apps/api/tests/test_awooop_truth_chain_service.py b/apps/api/tests/test_awooop_truth_chain_service.py index d9af182e..40a709cd 100644 --- a/apps/api/tests/test_awooop_truth_chain_service.py +++ b/apps/api/tests/test_awooop_truth_chain_service.py @@ -421,6 +421,32 @@ def test_reconciliation_blocks_open_incident_after_no_action_approval() -> None: assert "timeline_missing_for_approval" in codes +def test_reconciliation_accepts_approval_with_raw_timeline_event() -> None: + reconciliation = build_incident_reconciliation( + incident={"incident_id": "INC-1", "status": "INVESTIGATING"}, + approvals=[ + { + "id": "approval-1", + "status": "PENDING", + "action": "kubectl rollout restart deployment/api", + } + ], + evidence_rows=[], + automation_ops=[], + timeline_events=[ + { + "event_type": "human", + "status": "warning", + "approval_id": "approval-1", + } + ], + ) + + codes = {row["code"] for row in reconciliation["mismatches"]} + assert "timeline_missing_for_approval" not in codes + assert reconciliation["facts"]["timeline_events"] == 1 + + def test_reconciliation_counts_auto_repair_execution_as_real_execution() -> None: reconciliation = build_incident_reconciliation( incident={"incident_id": "INC-2", "status": "INVESTIGATING"}, diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index b02934d9..f7fe4f8e 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -28938,3 +28938,39 @@ production browser smoke: 1. 針對 `timeline_missing_for_approval` 補 pending approval lifecycle event,至少要能看到 stage、handler、AI action、manual need。 2. 針對 Telegram 告警文案補 stage、next action、blocked reason、auto/manual,讓 operator 不開前端也能判讀。 3. 建立 risk level ↔ approval gate 對照表,釐清低 / 中風險何時能自動修復、何時必須人工。 + +## 2026-06-04 — Phase 1 approval gate timeline event 修復 + +**背景**: +- Phase 1 live truth check 發現 `INC-20260603-9B2535` 已有 pending approval,但 raw `timeline_events=0`,truth-chain 標記 `timeline_missing_for_approval`。 +- Runs 詳情雖可由 approval record 合成畫面時間線,但 operator truth-chain 與 Telegram / API 判讀仍需要原始事件表留痕。 + +**本輪完成**: +- `ApprovalDBService.create_approval()` 與 `create_approval_with_fingerprint()` 建立 `ApprovalRecord` 後,會在同一 DB transaction 追加一筆 `TimelineEvent`。 +- `IncidentApprovalService.create_with_approval()` 的 incident + approval 原子建立路徑也會追加同格式 `TimelineEvent`。 +- 新事件以 `event_type=human`、`actor_role=approval_gate` 表示 approval gate,description 固定包含: + - `stage` + - `next_action` + - `blocked_reason` + - `auto_or_manual` + - `needs_human` + - `risk_level` + - `signatures` +- pending gate 會顯示 `stage=approval_required`、`next_action=operator_approve_or_reject`、`blocked_reason=waiting_for_required_signatures`、`needs_human=yes`。 +- low-risk auto gate 會顯示 `stage=approval_auto_approved`、`next_action=execute_or_verify`、`blocked_reason=none`、`auto_or_manual=auto`。 +- 新增 `apps/api/tests/test_approval_timeline_event.py`,鎖定 pending manual gate 與 low-risk auto gate 的事件形狀。 +- `apps/api/tests/test_awooop_truth_chain_service.py` 新增回歸測試:approval 有 raw timeline event 時,不再標記 `timeline_missing_for_approval`。 + +**驗證**: +- `python3 -m py_compile apps/api/src/services/approval_db.py apps/api/src/services/incident_approval_service.py apps/api/tests/test_approval_timeline_event.py apps/api/tests/test_awooop_truth_chain_service.py` → pass +- `PYTHONPATH=apps/api DATABASE_URL=sqlite+aiosqlite:///:memory: pytest apps/api/tests/test_approval_timeline_event.py apps/api/tests/test_awooop_truth_chain_service.py -q` → `53 passed` +- `PYTHONPATH=apps/api DATABASE_URL=sqlite+aiosqlite:///:memory: pytest apps/api/tests/test_incident_timeline_service.py -q` → `6 passed` + +**目前狀態**: +- Phase 1 AI 自動化飛輪真相盤點:`38% → 48%`。 +- 完整 AI 自動化飛輪:維持 `72%`;本輪只完成 approval gate raw timeline 留痕與本地驗證,尚未完成 Telegram 文案、risk gate 對照、executor / verifier / KM 回寫與正式站頁面驗證。 + +**下一步**: +1. 推 Gitea main 後等 CD 上線,production 查同一 incident / 新 incident 是否能看到 raw `timeline_events`,並補 `/zh-TW/awooop/runs` Browser smoke。 +2. 補 Telegram stage / next action / blocked reason / auto-or-manual 文案。 +3. 建立 risk level ↔ approval gate 對照表,並接到 Approvals / Runs 的 operator view。 diff --git a/docs/workplans/2026-06-04-navigation-and-ai-flywheel-workplan.md b/docs/workplans/2026-06-04-navigation-and-ai-flywheel-workplan.md index 83057f34..4904e8ff 100644 --- a/docs/workplans/2026-06-04-navigation-and-ai-flywheel-workplan.md +++ b/docs/workplans/2026-06-04-navigation-and-ai-flywheel-workplan.md @@ -94,14 +94,16 @@ | 欄位 | 內容 | | --- | --- | | 優先級 | P0 | -| 狀態 | 盤點中:SLO 已回綠,流程斷點已定位,待修 timeline / Telegram / gate 對照 | -| 本階段完成度 | 38% | +| 狀態 | 修復中:新建 approval gate 已寫入 raw timeline event 並通過本地測試,待正式站 rollout / page smoke | +| 本階段完成度 | 48% | | 目標完成度 | 完整 AI 自動化飛輪 69% → 75% | 細項: - [x] 查 `auto_execute_success_rate` SLO 違反原因,對照最近自動修復任務。 - [x] 盤點流程是否真跑完:alert ingest → classify → rule match → MCP evidence → PlayBook → approval gate → Ansible/repair → verifier → KM → postmortem。 -- [ ] 補 timeline event:每個 incident 必須看到 stage、handler、AI action、manual need。 +- [x] 補新建 approval gate raw timeline event:同一 transaction 寫入 `stage`、`next_action`、`blocked_reason`、`auto_or_manual`、`needs_human`、risk 與 signature 狀態。 +- [ ] 補既有 pending approval 歷史 backfill / production smoke,確認正式站同一 incident 不再出現 `timeline_missing_for_approval`。 +- [ ] 補 executor / verifier / KM 狀態變更事件:每個 incident 必須看到 stage、handler、AI action、manual need。 - [ ] Telegram 告警補 stage、next action、blocked reason、auto/manual。 - [ ] 建立風險分級與 approval gate 對照表,釐清中低風險是否允許 AI 自動修復。 - [ ] 確認 MCP / 自建 MCP 實際 tool call、evidence、result 有被前端顯示;目前 Runs 表格可見 MCP summary,但 result / blocker 還不夠一眼可讀。 @@ -116,9 +118,11 @@ Live evidence: - Production `/api/v1/platform/ai-route-status?workload_type=deep_rca`:policy order 為 `ollama_gcp_a → ollama_gcp_b → ollama_local → gemini`,目前 selected provider `ollama_gcp_a`,Gemini 仍是 final fallback。 - Browser smoke:`/zh-TW/awooop/runs?project_id=awoooi` 表格載入 50 列,含 `INC-20260603-9B2535` 與 MCP 摘要,`horizontalOverflow=0`;Automation Flow Gate 顯示 28 件、8 個 blocked gate、verified auto repair 0。 - Browser smoke:`/zh-TW/awooop/work-items` 與 `/zh-TW/awooop/approvals` 可開且 `horizontalOverflow=0`,但未直接浮出 `INC-20260603-9B2535` / `INC-20260601-1B3388` 的可操作下一步。 +- Local code verification:`ApprovalDBService.create_approval()`、`create_approval_with_fingerprint()`、`IncidentApprovalService.create_with_approval()` 會在建立 approval 後追加 raw `timeline_events`,描述包含 `stage`、`next_action`、`blocked_reason`、`auto_or_manual`、`needs_human`。 +- Local tests:`test_approval_timeline_event.py` 覆蓋 pending manual gate 與 low-risk auto gate;`test_awooop_truth_chain_service.py` 覆蓋有 raw timeline event 時不再標記 `timeline_missing_for_approval`。 驗收: -- [ ] DB / API / browser 都能查到同一 incident 的 stage timeline。 +- [ ] DB / API / browser 都能查到同一 incident 的 stage timeline;本地程式與單元測試已過,production rollout 後需補實際 incident page smoke。 - [ ] Telegram 訊息可不開前端判斷目前流程狀態。 - [ ] 低風險自動修復權限與人工 gate 條件有可驗證表格。