diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py index 1e6eef24..fcd71664 100644 --- a/apps/api/src/services/approval_db.py +++ b/apps/api/src/services/approval_db.py @@ -610,22 +610,32 @@ class ApprovalDBService: self, approval_id: UUID, success: bool, + error_message: str | None = None, ) -> None: """ 更新執行狀態 + + 2026-04-18 ogt + Claude Opus 4.7: ADR-090 L5 斷鏈修復 — P0.2 + 失敗時必寫 rejection_reason,讓診斷不再黑盒 + (之前 EXECUTION_FAILED 216 筆 reason 全空) """ async with get_db_context() as db: status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED + values: dict = {"status": status} + if not success and error_message: + # 截斷至合理長度,避免爆欄位 + values["rejection_reason"] = str(error_message)[:2000] await db.execute( update(ApprovalRecord) .where(ApprovalRecord.id == str(approval_id)) - .values(status=status) + .values(**values) ) logger.info( "approval_execution_status_updated", id=str(approval_id), success=success, + has_error=bool(error_message), ) async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None: diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index b34d490b..b8d1c5de 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -222,7 +222,12 @@ class ApprovalExecutionService: attempt += 1 # Phase 5: 更新資料庫狀態 - await service.update_execution_status(approval.id, success=result.success) + # 2026-04-18 ADR-090 L5 P0.2: 失敗時帶上 error_message,寫進 rejection_reason + await service.update_execution_status( + approval.id, + success=result.success, + error_message=None if result.success else (result.error or "(executor 未回傳錯誤)"), + ) # Update approval status based on result total_attempts = attempt # attempt 在重試迴圈後為最終嘗試次數 @@ -389,6 +394,19 @@ class ApprovalExecutionService: approval_id=str(approval.id), timeout_sec=30.0, ) + + # 2026-04-18 ogt + Claude Opus 4.7: ADR-090 L6 斷鏈修復 — P0.3 + # 失敗時也跑 verifier,把 verification_result='failed' 回寫 evidence + # 之前 988 筆 evidence 的 verification_result 全 NULL,因 verifier 只在 success 時跑 + from src.core.feature_flags import aiops_flags + if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"): + asyncio.create_task( + self._run_post_execution_verify( + approval=approval, + action_taken=f"{operation_type.value}:{resource_name}:FAILED", + ) + ) + return False # K8s 執行失敗 async def _push_execution_result_to_alert( diff --git a/k8s/awoooi-prod/07-rbac.yaml b/k8s/awoooi-prod/07-rbac.yaml index 75959f2c..09b9fe60 100644 --- a/k8s/awoooi-prod/07-rbac.yaml +++ b/k8s/awoooi-prod/07-rbac.yaml @@ -70,6 +70,25 @@ rules: resources: ["deployments", "deployments/status", "replicasets"] verbs: ["get", "list", "watch"] + # 2026-04-18 Claude Opus 4.7: 補齊 L5/L6 斷鏈 — executor.py 需讀 nodes + HPA + metrics + # 根因: 修 P0.1 — 99% EXECUTION_FAILED 因 RBAC 不足(Forbidden),連 evidence gathering 都斷 + # 2026-04-18 評估: P1 安全 — 純讀取,無寫入,cluster-scope 限定特定資源 + - apiGroups: [""] + resources: ["nodes", "nodes/status"] + verbs: ["get", "list", "watch"] + + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["get", "list", "watch"] + + - apiGroups: ["metrics.k8s.io"] + resources: ["nodes", "pods"] + verbs: ["get", "list"] + + - apiGroups: ["apps"] + resources: ["statefulsets", "daemonsets"] + verbs: ["get", "list", "watch"] + # ============================================================================ # 寫入權限 (Write) - 僅限故障排除操作 # ============================================================================