diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index 871469b0..2318c19c 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -24,6 +24,7 @@ Approval Execution Service - Phase 16 R4.2 瘦身 Router 抽取 """ import asyncio +import time from typing import TYPE_CHECKING import structlog @@ -39,6 +40,11 @@ if TYPE_CHECKING: logger = structlog.get_logger(__name__) +# ADR-090 § 自動化動作回灌 (2026-04-19 ogt + Claude Opus 4.7 亞太): +# PostExecutionVerifier 從 fire-and-forget 改 await,確保 verification_result 必寫入 incident_evidence. +# 上限 60s 涵蓋 verifier warmup(10s) + collect(30s) + 緩衝 20s. +_VERIFIER_AWAIT_TIMEOUT_SEC = 60.0 + class ApprovalExecutionService: """ @@ -134,6 +140,11 @@ class ApprovalExecutionService: action=approval.action, ) + # ADR-090 § 自動化動作回灌 (2026-04-19): 主流程開始即在 aol 留痕, + # 結束時 update。不依賴 fire-and-forget,確保 33 件/7d approval 全部可觀測。 + _aol_op_id = await self._log_aol_started(approval) + _aol_started_ms = time.time() + service = get_approval_service() timeline = get_timeline_service() @@ -181,6 +192,13 @@ class ApprovalExecutionService: approval, success=True, error=None, ) ) + # ADR-090 § aol completed (NO_ACTION 視為成功) + await self._log_aol_completed( + op_id=_aol_op_id, + status="success", + duration_ms=int((time.time() - _aol_started_ms) * 1000), + output={"reason": "NO_ACTION", "action": approval.action[:200]}, + ) return True # NO_ACTION 視為成功完成 # 真解析失敗 (非 NO_ACTION) @@ -215,6 +233,13 @@ class ApprovalExecutionService: error_message="Could not parse operation type", ) ) + # ADR-090 § aol completed (parse 失敗) + await self._log_aol_completed( + op_id=_aol_op_id, + status="failed", + duration_ms=int((time.time() - _aol_started_ms) * 1000), + error=f"parse_fail: {approval.action[:300]}", + ) return False # 解析失敗 → 執行未發生 # ADR-076 Task 3: 執行失敗重試機制 @@ -337,16 +362,25 @@ class ApprovalExecutionService: timeout_sec=30.0, ) - # ADR-081 Phase 1: 執行後驗證 (fire-and-forget) - # PostExecutionVerifier 等待 K8s 收斂後抓取後狀態,補填 EvidenceSnapshot + # ADR-081 Phase 1 + ADR-090 修復 (2026-04-19 ogt + Claude Opus 4.7): + # PostExecutionVerifier 改 await + 60s timeout,確保 verification_result 必寫入。 + # 之前 fire-and-forget 在 Pod recycle 時 task 被殺,導致 1212 筆 evidence 全 NULL. from src.core.feature_flags import aiops_flags if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"): - asyncio.create_task( - self._run_post_execution_verify( - approval=approval, - action_taken=f"{operation_type.value}:{resource_name}", + try: + await asyncio.wait_for( + self._run_post_execution_verify( + approval=approval, + action_taken=f"{operation_type.value}:{resource_name}", + ), + timeout=_VERIFIER_AWAIT_TIMEOUT_SEC, + ) + except asyncio.TimeoutError: + logger.warning( + "post_verify_timeout_exceeded", + approval_id=str(approval.id), + timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC, ) - ) # 2026-04-07 Claude Code: Sprint 4 B3 — 記錄人工批准處置類型 try: @@ -373,6 +407,19 @@ class ApprovalExecutionService: except Exception as _resolve_e: logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e)) + # ADR-090 § aol completed (執行成功) + await self._log_aol_completed( + op_id=_aol_op_id, + status="success", + duration_ms=int((time.time() - _aol_started_ms) * 1000), + output={ + "operation_type": operation_type.value, + "resource_name": resource_name, + "namespace": namespace, + "executor_duration_ms": result.duration_ms, + "total_attempts": total_attempts, + }, + ) return True # K8s 執行成功 else: @@ -438,18 +485,41 @@ class ApprovalExecutionService: timeout_sec=30.0, ) - # 2026-04-18 ogt + Claude Opus 4.7: ADR-090 L6 斷鏈修復 — P0.3 - # 失敗時也跑 verifier,把 verification_result='failed' 回寫 evidence - # 之前 988 筆 evidence 的 verification_result 全 NULL,因 verifier 只在 success 時跑 + # ADR-090 修復 (2026-04-19 ogt + Claude Opus 4.7): + # 失敗時也跑 verifier,把 verification_result='failed' 回寫 evidence。 + # 改 await + 60s timeout (原為 fire-and-forget,task 在 Pod recycle 時被殺)。 from src.core.feature_flags import aiops_flags if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"): - asyncio.create_task( - self._run_post_execution_verify( - approval=approval, - action_taken=f"{operation_type.value}:{resource_name}:FAILED", + try: + await asyncio.wait_for( + self._run_post_execution_verify( + approval=approval, + action_taken=f"{operation_type.value}:{resource_name}:FAILED", + ), + timeout=_VERIFIER_AWAIT_TIMEOUT_SEC, + ) + except asyncio.TimeoutError: + logger.warning( + "post_verify_timeout_exceeded_failed_path", + approval_id=str(approval.id), + timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC, ) - ) + # ADR-090 § aol completed (執行失敗) + await self._log_aol_completed( + op_id=_aol_op_id, + status="failed", + duration_ms=int((time.time() - _aol_started_ms) * 1000), + output={ + "operation_type": operation_type.value, + "resource_name": resource_name, + "namespace": namespace, + "executor_duration_ms": result.duration_ms, + "total_attempts": total_attempts, + }, + error=result.error, + stderr=result.error, # E6 stderr 回灌 — 給 retry/Playbook 負向強化用 + ) return False # K8s 執行失敗 async def _push_execution_result_to_alert( @@ -1014,6 +1084,107 @@ class ApprovalExecutionService: return None + # ========================================================================= + # ADR-090 § AOL Writer (2026-04-19 ogt + Claude Opus 4.7 亞太) + # 把 approval execution 的生命週期回灌 automation_operation_log. + # 之前 33 件/7d approval 動作完全沒寫入 aol,只有 drift_narrator 的 + # 22 筆 notification_formatted。修復後每次執行都留痕。 + # ========================================================================= + + async def _log_aol_started(self, approval: ApprovalRequest) -> str | None: + """ + 在 automation_operation_log 寫一筆 'pending' 紀錄,回傳 op_id 供 _log_aol_completed 更新。 + + 失敗時 (DB 異常) 回 None,主流程繼續 — aol 寫入永不阻塞執行。 + """ + try: + from sqlalchemy import text as _sql + from src.db.base import get_db_context + import json as _json + + input_payload = { + "approval_id": str(approval.id), + "incident_id": approval.incident_id or "", + "action": (approval.action or "")[:500], + "risk_level": getattr(approval, "risk_level", None) or "", + "requested_by": getattr(approval, "requested_by", "") or "", + } + + async with get_db_context() as db: + row = await db.execute( + _sql(""" + INSERT INTO automation_operation_log ( + operation_type, actor, status, + input, output, tags + ) VALUES ( + 'playbook_executed', + 'approval_execution', + 'pending', + CAST(:input AS jsonb), + '{}'::jsonb, + :tags + ) + RETURNING op_id + """), + { + "input": _json.dumps(input_payload, ensure_ascii=False), + "tags": ["approval", "execution", "playbook"], + }, + ) + op_id = row.scalar() + return str(op_id) if op_id else None + except Exception as e: + logger.warning("aol_started_write_failed", approval_id=str(approval.id), error=str(e)) + return None + + async def _log_aol_completed( + self, + op_id: str | None, + status: str, + duration_ms: int, + output: dict | None = None, + error: str | None = None, + stderr: str | None = None, + ) -> None: + """ + UPDATE automation_operation_log 為 success/failed 並寫入結果摘要 + stderr。 + + status 必須是 aol constraint 允許的值: + pending | success | failed | dry_run | rolled_back + + op_id 為 None 時靜默跳過 (started 寫入失敗時不應觸發 update 例外)。 + """ + if not op_id: + return + try: + from sqlalchemy import text as _sql + from src.db.base import get_db_context + import json as _json + + async with get_db_context() as db: + await db.execute( + _sql(""" + UPDATE automation_operation_log + SET status = :status, + duration_ms = :duration_ms, + output = CAST(:output AS jsonb), + error = :error, + stderr_feed_back = :stderr + WHERE op_id = CAST(:op_id AS uuid) + """), + { + "status": status, + "duration_ms": duration_ms, + "output": _json.dumps(output or {}, ensure_ascii=False), + "error": (error or "")[:2000] if error else None, + "stderr": (stderr or "")[:8000] if stderr else None, + "op_id": op_id, + }, + ) + except Exception as e: + logger.warning("aol_completed_write_failed", op_id=op_id, error=str(e)) + + # ============================================================================= # Singleton Instance # ============================================================================= diff --git a/apps/api/src/services/declarative_remediation.py b/apps/api/src/services/declarative_remediation.py index 92746092..52e4dfd2 100644 --- a/apps/api/src/services/declarative_remediation.py +++ b/apps/api/src/services/declarative_remediation.py @@ -168,13 +168,30 @@ class DeclarativeRemediation: ) # 2026-04-18 ADR-090-D: 寫入 remediation_events 表(MASTER §7.1 #6 KPI 資料源) - # fire-and-forget,不阻塞主流程 + # 2026-04-19 ogt + Claude Opus 4.7 修復: 原 fire-and-forget 0 筆寫入。 + # evaluate() 是同步函式,無法直接 await — 改用 named task + done_callback, + # 確保 task 失敗時有 log,後續可診斷為何 0 筆。 try: import asyncio as _a - _a.create_task(_log_remediation_event(spec, action, target, namespace)) + _task = _a.create_task( + _log_remediation_event(spec, action, target, namespace), + name=f"log_remediation:{action[:30]}", + ) + + def _on_done(t: _a.Task) -> None: + if t.cancelled(): + logger.warning("log_remediation_event_cancelled", action=action[:80]) + elif t.exception(): + logger.warning( + "log_remediation_event_failed", + action=action[:80], + error=str(t.exception()), + ) + + _task.add_done_callback(_on_done) except RuntimeError: # 非 async context (正規呼叫都是 async),靜默跳過 - pass + logger.debug("log_remediation_event_no_event_loop", action=action[:80]) return spec