""" Approval Execution Service - Phase 16 R4.2 瘦身 Router 抽取 ============================================================ 從 approvals.py 抽取執行編排邏輯,整合: - OperationParser: 解析操作類型 - K8s Executor: 執行 K8s 操作 - ApprovalDBService: 更新狀態 - TimelineService: 記錄事件 - NotificationManager: 發送通知 - Phase 7.6: Playbook 自動萃取 版本: v1.2 建立: 2026-03-25 (台北時區) 更新: 2026-03-26 (Phase 7.6 自動萃取) 更新: 2026-04-14 (ADR-076 Task 3: 執行失敗重試機制 — Claude Haiku 4.5 Asia/Taipei) 建立者: Claude Code (Phase 16 R4.2) 重試設計 (ADR-076): - MAX_RETRY = 2 次(共最多 3 次嘗試) - RETRY_DELAY_SECONDS = 30 秒 - 只重試瞬態錯誤(connection refused, timeout, i/o error 等) - 永久性錯誤(not found, permission denied, already exists)不重試 """ import asyncio import time from typing import TYPE_CHECKING, Any from uuid import UUID import structlog from src.core.config import settings from src.core.redis_client import get_redis from src.db.base import get_db_context from src.models.approval import ApprovalRequest from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError from src.plugins.mcp.interfaces import MCPToolResult from src.services.approval_db import get_approval_service, get_timeline_service from src.services.executor import ExecutionResult, OperationType, get_executor from src.services.operation_parser import parse_operation_from_action if TYPE_CHECKING: from src.services.notifications import ExecutionStatus logger = structlog.get_logger(__name__) # ADR-090 § 自動化動作回灌 (2026-04-19 ogt + Claude Opus 4.7 亞太): # PostExecutionVerifier 從 fire-and-forget 改 await,確保 verification_result 必寫入 incident_evidence. # 上限 60s 涵蓋 verifier warmup(10s) + collect(30s) + 緩衝 20s. _VERIFIER_AWAIT_TIMEOUT_SEC = 60.0 # T9: approved SSH execution must go through AwoooP MCP Gateway. # ApprovalRequest itself is the human/multi-sig decision artifact; for write/admin # tools we project it into the short-lived Gate 5 Redis key expected by Gateway. 
_SSH_GATEWAY_AGENT_ID = "approval_executor"
_SSH_GATEWAY_PROJECT_ID = "awoooi"
# TTL of the projected approval key written to Redis for write/admin tools.
_SSH_GATEWAY_APPROVAL_TTL_SECONDS = 600
# Gateway scope required by each SSH MCP tool; unknown tools fall back to "read"
# (see _execute_ssh_tool_via_gateway).
_SSH_GATEWAY_TOOL_SCOPES: dict[str, str] = {
    "ssh_diagnose": "read",
    "ssh_docker_restart": "write",
    "ssh_docker_compose_restart": "write",
    "ssh_systemctl_restart": "write",
    "ssh_clear_docker_logs": "write",
    "ssh_renew_ssl": "write",
    "ssh_reload_nginx": "write",
    "ssh_docker_prune": "admin",
}


class ApprovalExecutionService:
    """
    Approval execution service - orchestrates the whole execution flow.

    Responsibilities:
    1. Parse the operation type from the approved action
    2. Run it through the K8s executor (with retry) or the SSH MCP gateway
    3. Update the database status
    4. Record Timeline events
    5. Send notifications
    """

    # ADR-076 Task 3: retry constants
    MAX_RETRY: int = 2
    RETRY_DELAY_SECONDS: int = 30

    # Transient-error keywords (compared in lowercase); any match -> retryable
    _TRANSIENT_ERROR_KEYWORDS: tuple[str, ...] = (
        "connection refused",
        "connection reset",
        "timeout",
        "timed out",
        "i/o error",
        "io error",
        "temporary failure",
        "service unavailable",
        "too many requests",
        "dial tcp",
        "eof",
    )

    # Permanent-error keywords (compared in lowercase); any match -> never retry
    _PERMANENT_ERROR_KEYWORDS: tuple[str, ...] = (
        "not found",
        "forbidden",
        "permission denied",
        "unauthorized",
        "already exists",
        "invalid",
        "immutable",
        "destructive",
        "blocked",
    )

    # Strong references to in-flight fire-and-forget tasks. The event loop only
    # keeps weak references to tasks, so a task nobody references may be
    # garbage-collected before it finishes. Shared at class level on purpose:
    # it is a process-wide registry that empties itself as tasks complete.
    _pending_fire_and_forget_tasks: set[asyncio.Task] = set()

    def _spawn_background_task(self, coro: Any) -> asyncio.Task:
        """
        Schedule ``coro`` as a fire-and-forget task and keep it referenced.

        Args:
            coro: coroutine object to schedule on the running event loop.

        Returns:
            The created task (callers may ignore it).
        """
        task = asyncio.create_task(coro)
        self._pending_fire_and_forget_tasks.add(task)
        # Drop the reference once the task is done so the set cannot grow.
        task.add_done_callback(self._pending_fire_and_forget_tasks.discard)
        return task

    @classmethod
    def _is_transient_error(cls, error_message: str | None) -> bool:
        """
        Decide whether an execution error is transient (retryable).

        Permanent keywords are checked first and win over transient ones, so a
        mixed message such as "connection refused (not found)" is not retried.

        Args:
            error_message: executor error message (may be None or empty).

        Returns:
            True when the error may be retried, False for permanent failures,
            for empty messages, and for messages matching no keyword at all.
        """
        if not error_message:
            return False

        lower = error_message.lower()

        # Permanent error -> no retry
        if any(kw in lower for kw in cls._PERMANENT_ERROR_KEYWORDS):
            return False

        # Transient error -> retry
        return any(kw in lower for kw in cls._TRANSIENT_ERROR_KEYWORDS)

    async def execute_approved_action(self, approval: ApprovalRequest) -> bool:
        """
        Execute an approved action in the background.

        Flow: write the "started" aol record, parse the action, run it (SSH
        gateway / read-only kubectl / K8s executor with retry), persist the
        execution status, add Timeline events, send notifications, trigger
        learning and post-execution verification, resolve the incident on
        success, and finally write the "completed" aol record.

        The boolean return value lets callers (the auto-execute path) report
        the real outcome instead of assuming success.

        Args:
            approval: the approved request to execute.

        Returns:
            bool: True when execution succeeded (NO_ACTION counts as success),
            False when execution failed or the action could not be parsed.
        """
        from src.services.notifications import ExecutionStatus

        logger.info(
            "background_execution_start",
            approval_id=str(approval.id),
            action=approval.action,
        )

        # ADR-090: leave an aol trace as soon as the main flow starts and update
        # it at every exit point, instead of relying on fire-and-forget.
        _aol_op_id = await self._log_aol_started(approval)
        _aol_started_ms = time.time()

        service = get_approval_service()
        timeline = get_timeline_service()

        # Parse operation details
        parsed = parse_operation_from_action(approval.action)
        operation_type = parsed.operation_type
        resource_name = parsed.resource_name
        namespace = parsed.namespace

        # P3.1-T1: after parsing, ask the resource resolver whether the resource
        # exists and adopt its normalized name. Resolver exceptions never block
        # the main flow; miss/suggestion only log a warning and bump a metric.
        # NOTE(review): resource_kind is always "deployment", also for SSH_HOST
        # and INVESTIGATE operations where resource_name may not be a
        # deployment - confirm the resolver cannot rewrite a host name.
        if resource_name is not None and operation_type is not None:
            try:
                from src.services.resource_resolver import get_resource_resolver
                from src.core.metrics import RESOURCE_RESOLVE_TOTAL

                _resolver = get_resource_resolver()
                _resolve = await _resolver.resolve(
                    raw_resource=resource_name,
                    namespace=namespace,
                    resource_kind="deployment",
                )
                if _resolve.success and _resolve.resource_name:
                    if _resolve.resource_name != resource_name:
                        logger.info(
                            "resource_name_normalized",
                            original=resource_name,
                            normalized=_resolve.resource_name,
                            namespace=namespace,
                        )
                    resource_name = _resolve.resource_name
                    RESOURCE_RESOLVE_TOTAL.labels(result="hit").inc()
                elif _resolve.candidates:
                    logger.warning(
                        "resource_not_found_in_k8s",
                        resource=resource_name,
                        namespace=namespace,
                        suggestions=_resolve.candidates,
                    )
                    RESOURCE_RESOLVE_TOTAL.labels(result="suggestion").inc()
                else:
                    logger.warning(
                        "resource_not_found_in_k8s",
                        resource=resource_name,
                        namespace=namespace,
                        suggestions=[],
                    )
                    RESOURCE_RESOLVE_TOTAL.labels(result="miss").inc()
            except Exception as _rr_e:
                logger.warning("resource_resolve_failed", error=str(_rr_e))
                try:
                    from src.core.metrics import RESOURCE_RESOLVE_TOTAL

                    RESOURCE_RESOLVE_TOTAL.labels(result="error").inc()
                except Exception as _metric_e:
                    # Best effort: a broken metric must not block execution.
                    logger.debug(
                        "resource_resolve_metric_failed", error=str(_metric_e)
                    )

        if operation_type is None or resource_name is None:
            # Distinguish NO_ACTION from a real parse failure. NO_ACTION is a
            # deliberate "investigate only" choice and must not be recorded as
            # EXECUTION_FAILED, which would pollute the auto-execute success KPI.
            # Normalise once so a None action cannot raise TypeError below.
            _action_text = approval.action or ""
            _action_upper = _action_text.upper()
            _is_no_action = (
                "NO_ACTION" in _action_upper
                or "NO-ACTION" in _action_upper
                or "NOACTION" in _action_upper
                or "(未設)" in _action_text
                or _action_upper.startswith("OBSERVE")
                or _action_upper.startswith("INVESTIGATE")
            )
            if _is_no_action:
                logger.info(
                    "background_execution_noop",
                    approval_id=str(approval.id),
                    action=approval.action,
                    reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
                    path="no_action",
                )
                # Mark as SUCCESS: observing/investigating is itself completion.
                await service.update_execution_status(approval.id, success=True)
                await timeline.add_event(
                    event_type="exec",
                    status="success",
                    title="✅ 純觀察類動作完成 (NO_ACTION)",
                    description=f"Action: {_action_text[:120]}",
                    actor="leWOOOgo",
                    actor_role="executor",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                )
                # Reply with the execution result under the original alert card.
                self._spawn_background_task(
                    self._push_execution_result_to_alert(
                        approval,
                        success=True,
                        error=None,
                    )
                )
                # ADR-090: aol completed (NO_ACTION counts as success)
                await self._log_aol_completed(
                    op_id=_aol_op_id,
                    status="success",
                    duration_ms=int((time.time() - _aol_started_ms) * 1000),
                    output={"reason": "NO_ACTION", "action": _action_text[:200]},
                )
                # F2: the NO_ACTION path must also move the incident to RESOLVED,
                # otherwise it stays in INVESTIGATING. Per the original author's
                # note resolve_incident is idempotent for already-resolved
                # incidents (not visible in this file).
                if approval.incident_id:
                    try:
                        from src.services.incident_service import get_incident_service

                        await get_incident_service().resolve_incident(
                            approval.incident_id
                        )
                        logger.info(
                            "incident_resolved_after_no_action_execution",
                            incident_id=approval.incident_id,
                            approval_id=str(approval.id),
                            path="no_action",
                        )
                    except Exception as _resolve_e:
                        logger.warning(
                            "incident_resolve_after_no_action_execution_failed",
                            incident_id=approval.incident_id,
                            approval_id=str(approval.id),
                            error=str(_resolve_e),
                        )
                return True  # NO_ACTION counts as successfully completed

            # Real parse failure (not NO_ACTION)
            logger.warning(
                "background_execution_skip",
                approval_id=str(approval.id),
                reason="Could not parse operation type from action",
                action=approval.action,
            )
            # Phase 5: update database status, carrying an error_message (P0.2)
            await service.update_execution_status(
                approval.id,
                success=False,
                error_message=f"Could not parse operation type from action: {_action_text[:150]}",
            )
            await timeline.add_event(
                event_type="exec",
                status="error",
                title="執行失敗: 無法解析操作類型",
                description=f"Action: {approval.action}",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )
            # Phase 6: send failure notification (fire-and-forget)
            self._spawn_background_task(
                self._send_execution_notification(
                    approval=approval,
                    execution_status=ExecutionStatus.FAILED,
                    operation_type="unknown",
                    namespace=namespace,
                    error_message="Could not parse operation type",
                )
            )
            # ADR-090: aol completed (parse failure)
            await self._log_aol_completed(
                op_id=_aol_op_id,
                status="failed",
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                error=f"parse_fail: {_action_text[:300]}",
            )
            return False  # parse failure -> nothing was executed

        executor = get_executor()
        # Attempt counter; only the K8s executor branch retries, so the SSH and
        # INVESTIGATE branches always report 1.
        attempt = 1

        # Host SSH branch: SSH_HOST actions go through the SSH MCP gateway
        # instead of the K8s executor.
        if operation_type == OperationType.SSH_HOST:
            result = await self._execute_ssh_host_action(
                approval=approval,
                host=resource_name or "",
            )
            logger.info(
                "background_execution_ssh_host",
                approval_id=str(approval.id),
                action=approval.action,
                host=resource_name,
                success=result.success,
                message=result.message,
            )
        elif operation_type == OperationType.INVESTIGATE:
            # Gate 11 fix: INVESTIGATE is a read-only query that
            # execute_with_audit does not handle, so run the raw action via
            # execute_kubectl_command. No retry loop for read-only commands.
            result = await executor.execute_kubectl_command(
                command=approval.action,
                timeout_sec=30,
            )
            logger.info(
                "background_execution_investigate",
                approval_id=str(approval.id),
                action=approval.action,
                success=result.success,
                message=result.message,
            )
        else:
            # ADR-076 Task 3: retry on failure. Transient errors are retried up
            # to MAX_RETRY times (MAX_RETRY + 1 attempts in total).
            result = await executor.execute_with_audit(
                approval=approval,
                operation_type=operation_type,
                resource_name=resource_name,
                namespace=namespace,
            )
            while not result.success and attempt <= self.MAX_RETRY:
                if not self._is_transient_error(result.error):
                    logger.info(
                        "execution_retry_skipped_permanent_error",
                        approval_id=str(approval.id),
                        attempt=attempt,
                        error=result.error,
                    )
                    break

                logger.warning(
                    "execution_retry_transient_error",
                    approval_id=str(approval.id),
                    attempt=attempt,
                    max_retry=self.MAX_RETRY,
                    error=result.error,
                    delay_seconds=self.RETRY_DELAY_SECONDS,
                )
                await timeline.add_event(
                    event_type="exec",
                    status="warning",
                    title=f"⚠️ 執行失敗,{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})",
                    description=f"Error: {result.error}",
                    actor="leWOOOgo",
                    actor_role="executor",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                )
                await asyncio.sleep(self.RETRY_DELAY_SECONDS)
                result = await executor.execute_with_audit(
                    approval=approval,
                    operation_type=operation_type,
                    resource_name=resource_name,
                    namespace=namespace,
                )
                attempt += 1

        # Phase 5: update database status.
        # ADR-090 L5 P0.2: on failure always pass an error_message.
        await service.update_execution_status(
            approval.id,
            success=result.success,
            error_message=None
            if result.success
            else (result.error or "(executor 未回傳錯誤)"),
        )

        # After the retry loop `attempt` equals the number of attempts made.
        total_attempts = attempt

        if result.success:
            logger.info(
                "background_execution_success",
                approval_id=str(approval.id),
                operation=operation_type.value,
                target=resource_name,
                namespace=namespace,
                duration_ms=result.duration_ms,
                total_attempts=total_attempts,
            )
            retry_note = (
                f" (重試 {total_attempts - 1} 次後成功)" if total_attempts > 1 else ""
            )
            await timeline.add_event(
                event_type="exec",
                status="success",
                title=f"✅ K8s 執行成功: {operation_type.value}{retry_note}",
                description=f"Target: {resource_name} @ {namespace} ({result.duration_ms}ms)",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )

            # Phase 6: send success notification (fire-and-forget)
            self._spawn_background_task(
                self._send_execution_notification(
                    approval=approval,
                    execution_status=ExecutionStatus.SUCCESS,
                    operation_type=operation_type.value,
                    namespace=namespace,
                    duration_ms=result.duration_ms,
                )
            )

            # Reply with the result under the original alert card. The helper
            # itself skips auto-approved requests.
            self._spawn_background_task(
                self._push_execution_result_to_alert(approval, success=True, error=None)
            )

            # Phase 7.6: trigger Playbook auto-extraction (fire-and-forget)
            self._spawn_background_task(
                self._trigger_playbook_extraction(approval)
            )

            # ADR-030 Phase 5 / ADR-083 Phase 3: trigger the learning service.
            # Awaited with a 30s circuit breaker; on timeout we only log and the
            # main flow continues.
            try:
                await asyncio.wait_for(
                    self._trigger_learning(
                        approval=approval,
                        success=True,
                        duration_seconds=result.duration_ms / 1000
                        if result.duration_ms
                        else 0,
                    ),
                    timeout=30.0,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "learning_trigger_timeout",
                    approval_id=str(approval.id),
                    timeout_sec=30.0,
                )

            # ADR-081 Phase 1 + ADR-090: the post-execution verifier is awaited
            # with a timeout so verification_result gets written; the previous
            # fire-and-forget task could be killed before finishing.
            from src.core.feature_flags import aiops_flags

            if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
                try:
                    await asyncio.wait_for(
                        self._run_post_execution_verify(
                            approval=approval,
                            action_taken=f"{operation_type.value}:{resource_name}",
                        ),
                        timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )
                except asyncio.TimeoutError:
                    logger.warning(
                        "post_verify_timeout_exceeded",
                        approval_id=str(approval.id),
                        timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )

            # Sprint 4 B3: record the human-approved disposition type.
            try:
                anomaly_key = await self._get_anomaly_key_from_approval(approval)
                if anomaly_key:
                    from src.services.anomaly_counter import get_anomaly_counter

                    counter = get_anomaly_counter()
                    await counter.record_disposition(anomaly_key, "human_approved")
            except Exception as _disp_e:
                logger.warning("disposition_record_failed", error=str(_disp_e))

            # ADR-073: execution succeeded -> resolve the incident (which, per
            # the original author's note, triggers the KM conversion).
            if approval.incident_id:
                try:
                    from src.services.incident_service import get_incident_service

                    _inc_svc = get_incident_service()
                    await _inc_svc.resolve_incident(approval.incident_id)
                    logger.info(
                        "incident_resolved_after_execution",
                        incident_id=approval.incident_id,
                        approval_id=str(approval.id),
                    )
                except Exception as _resolve_e:
                    logger.warning(
                        "incident_resolve_after_execution_failed",
                        error=str(_resolve_e),
                    )

            # ADR-090: aol completed (execution succeeded)
            await self._log_aol_completed(
                op_id=_aol_op_id,
                status="success",
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                output={
                    "operation_type": operation_type.value,
                    "resource_name": resource_name,
                    "namespace": namespace,
                    "executor_duration_ms": result.duration_ms,
                    "total_attempts": total_attempts,
                },
            )
            return True  # execution succeeded
        else:
            logger.error(
                "background_execution_failed",
                approval_id=str(approval.id),
                operation=operation_type.value,
                target=resource_name,
                namespace=namespace,
                error=result.error,
            )
            await timeline.add_event(
                event_type="exec",
                status="error",
                title=f"❌ K8s 執行失敗: {operation_type.value}",
                description=f"Error: {result.error}",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )

            # Phase 6: send failure notification (fire-and-forget, includes the
            # dry-run-blocked case).
            # NOTE(review): this "not found" check is case-sensitive, unlike
            # _is_transient_error which lowercases first - confirm intended.
            exec_status = (
                ExecutionStatus.DRY_RUN_BLOCKED
                if "not found" in (result.error or "")
                else ExecutionStatus.FAILED
            )
            self._spawn_background_task(
                self._send_execution_notification(
                    approval=approval,
                    execution_status=exec_status,
                    operation_type=operation_type.value,
                    namespace=namespace,
                    error_message=result.error,
                    duration_ms=result.duration_ms,
                )
            )

            # Reply with the failure under the original alert card.
            self._spawn_background_task(
                self._push_execution_result_to_alert(
                    approval, success=False, error=result.error
                )
            )

            # ADR-030 Phase 5 / ADR-083 Phase 3: trigger learning (failure
            # case), awaited with a 30s circuit breaker.
            try:
                await asyncio.wait_for(
                    self._trigger_learning(
                        approval=approval,
                        success=False,
                        error_message=result.error,
                        duration_seconds=result.duration_ms / 1000
                        if result.duration_ms
                        else 0,
                    ),
                    timeout=30.0,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "learning_trigger_timeout",
                    approval_id=str(approval.id),
                    timeout_sec=30.0,
                )

            # ADR-090: run the verifier on failure too so the failed
            # verification result is written back; awaited with a timeout.
            from src.core.feature_flags import aiops_flags

            if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
                try:
                    await asyncio.wait_for(
                        self._run_post_execution_verify(
                            approval=approval,
                            action_taken=f"{operation_type.value}:{resource_name}:FAILED",
                        ),
                        timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )
                except asyncio.TimeoutError:
                    logger.warning(
                        "post_verify_timeout_exceeded_failed_path",
                        approval_id=str(approval.id),
                        timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )

            # ADR-090: aol completed (execution failed)
            await self._log_aol_completed(
                op_id=_aol_op_id,
                status="failed",
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                output={
                    "operation_type": operation_type.value,
                    "resource_name": resource_name,
                    "namespace": namespace,
                    "executor_duration_ms": result.duration_ms,
                    "total_attempts": total_attempts,
                },
                error=result.error,
                # E6: feed stderr back for retry / negative Playbook reinforcement
                stderr=result.error,
            )
            return False  # execution failed

    async def _execute_ssh_host_action(
        self,
        approval: ApprovalRequest,
        host: str,
    ) -> ExecutionResult:
        """
        Execute an SSH host action (manual-approval path).

        The action text is routed to one SSH MCP tool and executed through
        _execute_ssh_tool_via_gateway. Per the original author's note the
        routing is meant to mirror decision_manager._ssh_execute.

        Routing rules (matched on the lower-cased, stripped action):
        - contains "docker" and "prune"            -> ssh_docker_prune
        - contains "docker restart" + a name       -> ssh_docker_restart
        - contains "systemctl restart" + a name    -> ssh_systemctl_restart
        - contains one of "ps aux", "df -h", "free -h", "top ", "uptime",
          "echo ", "ls -"                          -> ssh_diagnose
        - anything else (or a restart without a name): failure result asking
          for the action to be rewritten

        Args:
            approval: approved request whose ``action`` is routed.
            host: target host passed to the tool as ``params["host"]``.

        Returns:
            ExecutionResult with operation_type SSH_HOST and namespace "host";
            never raises - exceptions are converted into a failed result.
        """
        import re as _re

        start = time.time()
        action = approval.action or ""
        action_lower = action.lower().strip()

        # Route to an SSH MCP tool
        params: dict = {"host": host}
        tool_name: str | None = None

        # NOTE(review): names are extracted from the lower-cased action, so a
        # mixed-case container or unit name is lower-cased before execution -
        # confirm that is acceptable for the targeted hosts.
        if "docker" in action_lower and "prune" in action_lower:
            tool_name = "ssh_docker_prune"
            params["trust_score"] = 0.85
        elif "docker restart" in action_lower:
            tool_name = "ssh_docker_restart"
            # Try to extract the container name
            m = _re.search(r"docker\s+restart\s+([a-z0-9._-]+)", action_lower)
            if m:
                params["container_name"] = m.group(1)
                params["trust_score"] = 0.85
            else:
                tool_name = None  # no container name found -> treat as unrouted
        elif "systemctl restart" in action_lower:
            tool_name = "ssh_systemctl_restart"
            m = _re.search(r"systemctl\s+restart\s+([a-z0-9._-]+)", action_lower)
            if m:
                params["service"] = m.group(1)
                params["trust_score"] = 0.85
            else:
                tool_name = None
        elif any(
            kw in action_lower
            for kw in ("ps aux", "df -h", "free -h", "top ", "uptime", "echo ", "ls -")
        ):
            # Host diagnostics (ssh_diagnose collects everything in one go).
            # NOTE(review): "top " needs a trailing space, so an action that is
            # exactly "top" does not match after strip().
            tool_name = "ssh_diagnose"

        if tool_name is None:
            duration_ms = int((time.time() - start) * 1000)
            err = f"SSH action 無法路由到 SSH MCP tool: {action[:120]}"
            logger.warning(
                "ssh_host_action_unrouted",
                approval_id=str(approval.id),
                action=action,
                host=host,
            )
            return ExecutionResult(
                success=False,
                message="SSH action unrouted",
                operation_type=OperationType.SSH_HOST,
                target_resource=host,
                namespace="host",
                duration_ms=duration_ms,
                error=err,
            )

        try:
            logger.warning(
                "mcp_gateway_approved_ssh_execution_path",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
                tool=tool_name,
                host=host,
                agent_id=_SSH_GATEWAY_AGENT_ID,
            )
            mcp_result = await self._execute_ssh_tool_via_gateway(
                approval=approval,
                tool_name=tool_name,
                params=params,
            )
            duration_ms = int((time.time() - start) * 1000)
            success = bool(mcp_result.success)
            return ExecutionResult(
                success=success,
                message=f"ssh_mcp:{tool_name} {'ok' if success else 'failed'}",
                operation_type=OperationType.SSH_HOST,
                target_resource=host,
                namespace="host",
                duration_ms=duration_ms,
                k8s_response={"tool": tool_name, "output": mcp_result.output}
                if success
                else None,
                error=None
                if success
                else (mcp_result.error or "ssh_mcp execution failed"),
            )
        except Exception as e:
            duration_ms = int((time.time() - start) * 1000)
            logger.warning(
                "ssh_host_action_exception",
                approval_id=str(approval.id),
                tool=tool_name,
                error=str(e),
            )
            return ExecutionResult(
                success=False,
                message="ssh_mcp exception",
                operation_type=OperationType.SSH_HOST,
                target_resource=host,
                namespace="host",
                duration_ms=duration_ms,
                error=str(e),
            )

    async def _execute_ssh_tool_via_gateway(
        self,
        approval: ApprovalRequest,
        tool_name: str,
        params: dict[str, Any],
    ) -> MCPToolResult:
        """
        Call one SSH MCP tool through McpGateway on behalf of an approval.

        For tools whose scope is not "read", an "approved" marker is first
        written to Redis under
        ``mcp_approval:{project}:{agent}:{tool}:{run_id}`` with a TTL of
        _SSH_GATEWAY_APPROVAL_TTL_SECONDS. A failure to write that marker is
        only logged; the gateway call is still attempted.

        Args:
            approval: the approved request; its id becomes the run id.
            tool_name: MCP tool to call; unknown tools default to scope "read".
            params: tool parameters; an ``_mcp_audit`` entry is added to a copy.

        Returns:
            The gateway's MCPToolResult, or a failed MCPToolResult when the
            gateway raises McpGatewayError.

        Raises:
            ValueError: presumably, when ``approval.id`` is not a valid UUID
                string (raised by ``UUID(...)``) - the caller catches it.
        """
        required_scope = _SSH_GATEWAY_TOOL_SCOPES.get(tool_name, "read")
        run_id = approval.id if isinstance(approval.id, UUID) else UUID(str(approval.id))

        if required_scope != "read":
            approval_key = (
                f"mcp_approval:{_SSH_GATEWAY_PROJECT_ID}:{_SSH_GATEWAY_AGENT_ID}:"
                f"{tool_name}:{run_id}"
            )
            try:
                redis = get_redis()
                await redis.set(
                    approval_key,
                    "approved",
                    ex=_SSH_GATEWAY_APPROVAL_TTL_SECONDS,
                )
            except Exception as exc:
                logger.warning(
                    "mcp_gateway_approval_projection_failed",
                    approval_id=str(approval.id),
                    tool=tool_name,
                    approval_key=approval_key,
                    error=str(exc),
                )

        # Copy params so the caller's dict is not mutated.
        params_with_audit = {
            **params,
            "_mcp_audit": {
                "session_id": f"approval:{approval.id}",
                "incident_id": approval.incident_id,
                "agent_role": _SSH_GATEWAY_AGENT_ID,
                "flywheel_node": "execute",
                "approval_id": str(approval.id),
            },
        }

        async with get_db_context(_SSH_GATEWAY_PROJECT_ID) as db:
            ctx = GatewayContext(
                project_id=_SSH_GATEWAY_PROJECT_ID,
                agent_id=_SSH_GATEWAY_AGENT_ID,
                tool_name=tool_name,
                run_id=run_id,
                trace_id=approval.incident_id or str(approval.id),
                is_shadow=False,
                environment={"env": "prod"},
                required_scope=required_scope,
            )
            try:
                return await McpGateway(db).call(ctx, params_with_audit)
            except McpGatewayError as exc:
                logger.warning(
                    "mcp_gateway_approved_ssh_blocked",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                    tool=tool_name,
                    gate=exc.gate,
                    error_code=exc.error_code,
                    error=str(exc),
                )
                return MCPToolResult(
                    success=False,
                    execution_id=f"blocked:{tool_name}:{run_id}",
                    error=f"{exc.error_code}: {exc}",
                )

    async def _push_execution_result_to_alert(
        self,
        approval: ApprovalRequest,
        success: bool,
        error: str | None,
    ) -> None:
        """
        Reply with the execution result under the original Telegram alert card.

        - Auto-approved requests are skipped here (their result message is
          sent by a different path, _push_auto_repair_result).
        - The original message id is read from Redis key
          ``tg_msg:{incident_id}``; when it is missing or not an integer the
          method returns silently.
        - The inline buttons of the original card are removed before replying.

        The message is sent with ``parse_mode="HTML"``, so the action and error
        text are HTML-escaped; otherwise any ``<``, ``>`` or ``&`` in them
        makes Telegram reject the message.

        Args:
            approval: the executed approval request.
            success: whether execution succeeded.
            error: error text shown on failure (may be None).

        Never raises: every failure is logged as ``push_execution_result_failed``.
        """
        try:
            from html import escape as _html_escape

            # Skip the auto-execute path to avoid duplicate messages.
            if self._is_auto_approved_request(approval):
                return
            if not approval.incident_id:
                return

            from src.core.redis_client import get_redis

            redis = get_redis()
            msg_id_raw = await redis.get(f"tg_msg:{approval.incident_id}")
            if not msg_id_raw:
                logger.debug(
                    "push_execution_result_no_msg_id",
                    incident_id=approval.incident_id,
                    approval_id=str(approval.id),
                )
                return

            try:
                orig_msg_id = int(msg_id_raw)
            except (TypeError, ValueError):
                return

            from src.core.config import get_settings
            from src.services.telegram_gateway import get_telegram_gateway

            settings = get_settings()
            gateway = get_telegram_gateway()
            target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID

            # AP-2: besides replying, also edit the original card to remove its
            # buttons so it does not look stuck in "executing".
            try:
                await gateway._send_request(
                    "editMessageReplyMarkup",
                    {
                        "chat_id": target_chat_id,
                        "message_id": orig_msg_id,
                        "reply_markup": {"inline_keyboard": []},
                    },
                )
            except Exception as _edit_e:
                logger.debug(
                    "push_execution_edit_buttons_failed",
                    approval_id=str(approval.id),
                    error=str(_edit_e),
                )

            # Append KM / Playbook deltas.
            # NOTE(review): both queries count rows from the last 2 minutes
            # across ALL incidents - there is no incident filter.
            km_info = ""
            try:
                from sqlalchemy import text as _sql
                from src.db.base import get_db_context

                async with get_db_context() as _db:
                    _km_row = await _db.execute(
                        _sql("""SELECT COUNT(*) FROM knowledge_entries WHERE created_at > NOW() - interval '2 minutes'"""),
                    )
                    _km_count = _km_row.scalar() or 0
                    _pb_row = await _db.execute(
                        _sql("""SELECT COUNT(*) FROM playbooks WHERE updated_at > NOW() - interval '2 minutes'"""),
                    )
                    _pb_count = _pb_row.scalar() or 0
                    if _km_count or _pb_count:
                        km_info = f"\n📚 KM +{_km_count} 🎯 Playbook 更新×{_pb_count}"
            except Exception as _km_e:
                # Best effort: the counters are decoration only.
                logger.debug(
                    "push_execution_km_info_failed",
                    approval_id=str(approval.id),
                    error=str(_km_e),
                )

            # Truncate first, then escape, so an entity is never cut in half.
            action_html = _html_escape((approval.action or "")[:180], quote=False)
            if success:
                text = (
                    f"✅ 執行成功\n"
                    f"{action_html}"
                    f"{km_info}"
                )
            else:
                err_short = _html_escape((error or "未知錯誤")[:150], quote=False)
                text = (
                    f"❌ 執行失敗\n"
                    f"{action_html}\n"
                    f"原因: {err_short}"
                    f"{km_info}"
                )

            await gateway._send_request(
                "sendMessage",
                {
                    "chat_id": target_chat_id,
                    "text": text,
                    "parse_mode": "HTML",
                    "reply_to_message_id": orig_msg_id,
                },
            )
            logger.info(
                "push_execution_result_sent",
                incident_id=approval.incident_id,
                approval_id=str(approval.id),
                success=success,
                orig_msg_id=orig_msg_id,
            )
        except Exception as e:
            logger.warning(
                "push_execution_result_failed",
                approval_id=str(approval.id),
                error=str(e),
            )

    async def _get_anomaly_key_from_approval(self, approval: ApprovalRequest) -> str | None:
        """
        Resolve approval -> incident -> anomaly key.

        The incident is looked up in working memory only and the key is
        derived by AnomalyCounter.derive_key_from_incident().

        Returns:
            The anomaly key, or None when the approval has no incident, the
            incident is not found, or any error occurs (logged as a warning).
        """
        try:
            if not approval.incident_id:
                return None
            from src.services.incident_service import get_incident_service

            incident_service = get_incident_service()
            incident = await incident_service.get_from_working_memory(approval.incident_id)
            if not incident:
                return None
            from src.services.anomaly_counter import AnomalyCounter

            return AnomalyCounter.derive_key_from_incident(incident)
        except Exception as e:
            logger.warning("get_anomaly_key_from_approval_failed", error=str(e))
            return None

    async def _trigger_learning(
        self,
        approval: ApprovalRequest,
        success: bool,
        duration_seconds: float = 0,
        error_message: str | None = None,
    ) -> None:
        """
        ADR-030 Phase 5: hand the execution result to the learning service.

        After that, the result is always written to KM, even when the learning
        step failed.

        Args:
            approval: the executed approval request.
            success: whether execution succeeded.
            duration_seconds: execution duration in seconds.
            error_message: executor error text for failed executions.
        """
        try:
            # Local import: this ExecutionResult is the learning service's own
            # type and shadows the executor's ExecutionResult inside this method.
            from src.services.learning_service import (
                ExecutionResult,
                get_learning_service,
            )

            learning = get_learning_service()
            result = ExecutionResult(
                approval_id=str(approval.id),
                incident_id=approval.incident_id or "",
                action=approval.action,
                success=success,
                error_message=error_message,
                duration_seconds=duration_seconds,
            )
            await learning.process_execution_result(
                approval=approval,
                result=result,
            )
        except Exception as e:
            # A learning failure must not affect the main flow.
            logger.warning(
                "learning_trigger_failed",
                approval_id=str(approval.id),
                error=str(e),
            )

        # Deliberately outside the try/except so the KM write still happens
        # when learning fails. Awaited rather than fire-and-forget; per the
        # original author's note timeout / retry / DLQ are handled by KMWriter.
        await self.write_execution_result_to_km(approval, success, error_message)

    async def _run_post_execution_verify(
        self,
        approval: "ApprovalRequest",
        action_taken: str,
    ) -> None:
        """
        ADR-081 Phase 1: post-execution verification.

        1. Load the incident (working memory first, then episodic memory)
        2. Fetch the latest evidence snapshot for the incident
        3. Call PostExecutionVerifier.verify()
        4. Forward the verification result to the learning service

        Returns immediately when the approval has no incident. Never raises:
        failures are logged and do not affect the execution result.

        Args:
            approval: the executed approval request.
            action_taken: label describing what was executed.
        """
        if not approval.incident_id:
            return

        try:
            from src.services.incident_service import get_incident_service
            from src.services.post_execution_verifier import get_post_execution_verifier

            # get_latest_snapshot is a module-level async function, not a
            # classmethod of EvidenceSnapshot.
            from src.services.evidence_snapshot import get_latest_snapshot

            incident_svc = get_incident_service()
            # IncidentService has no get_incident(); use the memory lookups.
            incident = await incident_svc.get_from_working_memory(approval.incident_id)
            if incident is None:
                incident = await incident_svc.get_from_episodic_memory(approval.incident_id)
            if incident is None:
                logger.warning(
                    "post_verify_incident_not_found",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                )
                return

            # Latest evidence snapshot; per the original author's note one only
            # exists when the Phase 1 flag was enabled.
            snapshot = await get_latest_snapshot(approval.incident_id)

            verifier = get_post_execution_verifier()
            verification_result = await verifier.verify(
                incident=incident,
                snapshot=snapshot,
                action_taken=action_taken,
            )
            logger.info(
                "post_verify_complete",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
                result=verification_result,
                action=action_taken,
            )

            # ADR-083 Phase 3: wire the verification result into learning; the
            # original author considers it a more precise signal than the
            # executor's exit status.
            try:
                from src.services.learning_service import get_learning_service

                _matched_pb_id = getattr(approval, "matched_playbook_id", None)
                await get_learning_service().record_verification_result(
                    incident_id=approval.incident_id,
                    action_taken=action_taken,
                    verification_result=verification_result,
                    matched_playbook_id=_matched_pb_id,
                )
            except Exception as _lerr:
                logger.warning(
                    "post_verify_learning_failed",
                    approval_id=str(approval.id),
                    error=str(_lerr),
                )
        except Exception as _e:
            # A verification failure must not affect the execution result.
            logger.warning(
                "post_verify_failed",
                approval_id=str(approval.id),
                error=str(_e),
            )

    @staticmethod
    def _is_auto_approved_request(approval: "ApprovalRequest") -> bool:
        """Return True when ``requested_by`` starts with "auto_approve" (case-insensitive)."""
        requested_by = (getattr(approval, "requested_by", "") or "").lower()
        return requested_by.startswith("auto_approve")

    @staticmethod
    def _is_observation_only_action(action: str | None) -> bool:
        """
        Return True for actions that only observe and execute nothing.

        Empty/None actions, anything containing NO_ACTION / NO-ACTION /
        NOACTION, and anything starting with OBSERVE or INVESTIGATE qualify
        (case-insensitive, surrounding whitespace ignored).
        """
        action_upper = (action or "").strip().upper()
        return (
            not action_upper
            or "NO_ACTION" in action_upper
            or "NO-ACTION" in action_upper
            or "NOACTION" in action_upper
            or action_upper.startswith("OBSERVE")
            or action_upper.startswith("INVESTIGATE")
        )

    @staticmethod
    def _approval_risk_value(approval: "ApprovalRequest") -> str | None:
        """Return the approval's risk level as a string (enum ``.value`` when present), or None."""
        risk_level = getattr(approval, "risk_level", None)
        if risk_level is None:
            return None
        return getattr(risk_level, "value", str(risk_level))

    async def finalize_auto_approved_execution(
        self,
        approval: "ApprovalRequest",
        *,
        success: bool,
        error_message: str | None = None,
    ) -> None:
        """
        Complete the incident-linked evidence chain for an auto-approved,
        already executed action.

        Per the original author's note, the webhook path calls
        execute_approved_action() before the Incident exists, so the executor
        has no incident_id at that time. This method is called after the
        incident was created and only adds the durable trace; it never
        re-executes the action.

        Steps (each one isolated so a failure does not stop the next):
        record an auto-repair execution row (skipped when an equivalent row
        exists), add a Timeline event, write the result to KM, run the
        post-execution verifier when its flag is on, and resolve the incident
        on success.

        No-op when the request is not auto-approved, has no incident_id, or is
        an observation-only action.

        Args:
            approval: the auto-approved request, now carrying incident_id.
            success: outcome reported by the earlier execution.
            error_message: failure detail; a default text is used when the
                execution failed and none is given.
        """
        if not self._is_auto_approved_request(approval):
            return

        incident_id = getattr(approval, "incident_id", None)
        if not incident_id:
            logger.warning(
                "auto_approved_execution_finalize_skipped_no_incident",
                approval_id=str(getattr(approval, "id", "")),
                requested_by=getattr(approval, "requested_by", None),
            )
            return

        if self._is_observation_only_action(getattr(approval, "action", None)):
            logger.info(
                "auto_approved_execution_finalize_skipped_observation_only",
                approval_id=str(approval.id),
                incident_id=incident_id,
                action=(approval.action or "")[:120],
            )
            return

        parsed = parse_operation_from_action(approval.action)
        operation_type = parsed.operation_type
        resource_name = parsed.resource_name or "unknown"
        namespace = parsed.namespace or "default"
        # Values are truncated to fixed lengths (36 / 200 / 50 characters).
        playbook_id = str(getattr(approval, "matched_playbook_id", None) or approval.id)[:36]
        operation_label = operation_type.value if operation_type else "unknown"
        playbook_name = f"approval_auto_execute:{operation_label}:{resource_name}"[:200]
        triggered_by = (getattr(approval, "requested_by", None) or "auto_approve")[:50]
        action_taken = f"auto_repair_playbook:{playbook_id}:{operation_label}:{resource_name}"
        if not success:
            action_taken = f"{action_taken}:FAILED"
            error_message = error_message or "auto-approved executor returned failure; see approval/aol logs"

        try:
            from src.repositories.audit_log_repository import get_auto_repair_execution_repository

            repo = get_auto_repair_execution_repository()
            existing = await repo.list_by_incident(incident_id)
            # De-duplicate on (playbook_id, triggered_by, action in steps).
            already_recorded = any(
                str(getattr(row, "playbook_id", "")) == playbook_id
                and getattr(row, "triggered_by", "") == triggered_by
                and (approval.action or "") in list(getattr(row, "executed_steps", []) or [])
                for row in existing
            )
            if not already_recorded:
                await repo.create(
                    incident_id=incident_id,
                    playbook_id=playbook_id,
                    playbook_name=playbook_name,
                    success=success,
                    executed_steps=[approval.action],
                    error_message=error_message,
                    triggered_by=triggered_by,
                    risk_level=self._approval_risk_value(approval),
                )
            else:
                logger.info(
                    "auto_approved_execution_record_already_exists",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    playbook_id=playbook_id,
                )
        except Exception as exc:
            logger.warning(
                "auto_approved_execution_record_failed",
                approval_id=str(approval.id),
                incident_id=incident_id,
                error=str(exc),
            )

        try:
            timeline = get_timeline_service()
            await timeline.add_event(
                event_type="exec",
                status="success" if success else "error",
                title=f"{'✅' if success else '❌'} 自動批准執行已補鏈: {operation_label}",
                description=(
                    f"Target: {resource_name} @ {namespace}; "
                    f"source={triggered_by}; action={approval.action[:160]}"
                ),
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=incident_id,
            )
        except Exception as exc:
            logger.warning(
                "auto_approved_execution_timeline_failed",
                approval_id=str(approval.id),
                incident_id=incident_id,
                error=str(exc),
            )

        try:
            await self.write_execution_result_to_km(approval, success, error_message)
        except Exception as exc:
            logger.warning(
                "auto_approved_execution_km_failed",
                approval_id=str(approval.id),
                incident_id=incident_id,
                error=str(exc),
            )

        from src.core.feature_flags import aiops_flags

        if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
            try:
                await asyncio.wait_for(
                    self._run_post_execution_verify(
                        approval=approval,
                        action_taken=action_taken,
                    ),
                    timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "auto_approved_execution_post_verify_timeout",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )

        if success:
            try:
                from src.services.incident_service import get_incident_service

                await get_incident_service().resolve_incident(incident_id)
                logger.info(
                    "incident_resolved_after_auto_approved_execution_finalize",
                    incident_id=incident_id,
                    approval_id=str(approval.id),
                )
            except Exception as exc:
                logger.warning(
                    "incident_resolve_after_auto_approved_execution_finalize_failed",
                    incident_id=incident_id,
                    approval_id=str(approval.id),
                    error=str(exc),
                )

    async def write_execution_result_to_km(
        self,
        approval: "ApprovalRequest",
        success: bool,
        error_message: str | None,
    ) -> None:
        """
        Persist the execution result into KM (the knowledge base).

        Both successful and failed executions must be written back to KM.
        Auto-approved and human-approved paths are distinguished. Per the
        original author's notes the entry also carries alert_category /
        alertname / affected_services for RAG retrieval and the write is
        delegated to KMWriter.
        """
        from src.models.knowledge import EntrySource, EntryType
        from src.services.km_writer import KMWritePayload, km_write_with_flag

        # Source identification (auto vs. human path)
        _is_auto = self._is_auto_approved_request(approval)
        _mode_prefix = "[自動修復]" if _is_auto else "[人工修復]"
        _mode_tag = "auto_executed" if _is_auto else "human_approved"

        status_icon = "✅" if success else "❌"
status_text = "成功" if success else f"失敗: {error_message or '未知原因'}" _status_tag = "success" if success else "failure" # 從關聯 Incident 提取豐富元資料 alertname = "unknown" alert_category = "general" affected_services: list[str] = [] if approval.incident_id: try: from src.services.incident_service import get_incident_service _svc = get_incident_service() # get_from_working_memory (Redis) → fallback get_from_episodic_memory (PG) _inc = await _svc.get_from_working_memory(approval.incident_id) if _inc is None: _inc = await _svc.get_from_episodic_memory(approval.incident_id) if _inc: if _inc.signals: alertname = _inc.signals[0].labels.get("alertname", "unknown") or "unknown" alert_category = getattr(_inc, "alert_category", "") or "general" affected_services = list(_inc.affected_services or []) except Exception as _ie: logger.debug("km_incident_enrich_failed", incident_id=approval.incident_id, error=str(_ie)) _services_str = ", ".join(affected_services) if affected_services else "未關聯" content = ( f"# {status_icon} {_mode_prefix} {alertname}\n\n" f"**告警名稱**: {alertname}\n" f"**告警類別**: {alert_category}\n" f"**受影響服務**: {_services_str}\n" f"**執行命令**: `{approval.action[:200]}`\n" f"**執行結果**: {status_text}\n" f"**風險等級**: {approval.risk_level.value if approval.risk_level else '未知'}\n" f"**執行路徑**: {'自動執行 (confidence >= 0.65)' if _is_auto else '人工審核批准'}\n" f"**Incident ID**: {approval.incident_id or '未關聯'}\n" f"**Approval ID**: {approval.id}\n\n" f"## 操作描述\n{approval.description or '無描述'}\n" ) # Tags: 模式 + 狀態 + 類別(供 RAG 多維度檢索) tags = [_mode_tag, _status_tag, alert_category, "execution"] if not success: tags.append("execution_failed") payload = KMWritePayload( path_type="approval_auto_ok" if (_is_auto and success) else "approval_auto_fail" if (_is_auto and not success) else "approval_manual", entry_create_kwargs=dict( title=f"{_mode_prefix} {alertname}: {approval.action[:50]}", content=content, entry_type=EntryType.INCIDENT_CASE, category=alert_category, tags=tags, 
source=EntrySource.AI_EXTRACTED, related_incident_id=approval.incident_id or None, created_by="auto_execute" if _is_auto else "approval_execution", ), incident_id=approval.incident_id or None, approval_id=str(approval.id), ) await km_write_with_flag(payload) async def _send_execution_notification( self, approval: ApprovalRequest, execution_status: "ExecutionStatus", operation_type: str, namespace: str, duration_ms: int | None = None, error_message: str | None = None, ) -> None: """ Phase 6: 發送執行通知 (Post-Execution Hook) 將執行結果發送至所有已配置的通知頻道 (Discord, Slack, etc.) """ from src.services.notifications import ( NotificationMessage, get_notification_manager, ) if not settings.NOTIFICATION_ENABLED: logger.info("notification_disabled", approval_id=str(approval.id)) return try: # 建構簽核者列表 signers = [ {"name": sig.signer_name, "comment": sig.comment or ""} for sig in approval.signatures ] # 建構通知訊息 message = NotificationMessage( execution_status=execution_status, action_title=approval.action[:100], action_description=approval.description[:200] if approval.description else "", approval_id=str(approval.id), signers=signers, required_signatures=approval.required_signatures, affected_pods=approval.blast_radius.affected_pods if approval.blast_radius else 0, estimated_downtime=approval.blast_radius.estimated_downtime if approval.blast_radius else "N/A", related_services=approval.blast_radius.related_services if approval.blast_radius else [], data_impact=approval.blast_radius.data_impact.value if approval.blast_radius else "none", namespace=namespace, operation_type=operation_type, duration_ms=duration_ms, error_message=error_message, risk_level=approval.risk_level.value, ai_provider=approval.requested_by, ) # 發送通知 manager = get_notification_manager() results = await manager.send_all(message) for result in results: logger.info( "notification_result", approval_id=str(approval.id), provider=result.provider, status=result.status.value, message=result.message, ) except Exception as e: 
logger.exception( "notification_failed", approval_id=str(approval.id), error=str(e), ) async def _trigger_playbook_extraction( self, approval: ApprovalRequest, ) -> None: """ Phase 7.6: 觸發 Playbook 自動萃取 條件: - 執行成功 - 關聯的 Incident 狀態為 RESOLVED 或 CLOSED - effectiveness_score >= 4 此函數為 fire-and-forget,失敗不影響主流程 """ try: # 1. 從 approval.incident_id 直接取得 (Phase 26 修復) # 原本靠 regex 掃文字找 INC- 前綴,中文 action 完全找不到 incident_id = getattr(approval, "incident_id", None) if not incident_id: # Fallback: 嘗試文字解析 (向後兼容舊資料) incident_id = self._extract_incident_id_from_approval(approval) if not incident_id: logger.info( "playbook_extraction_skipped", approval_id=str(approval.id), reason="No incident_id found in approval.incident_id or text", ) return # 2. 取得 Incident from src.services.incident_service import get_incident_service incident_service = get_incident_service() # 2026-04-25 修復 L1:IncidentService 沒有 get_incident() 方法 incident = await incident_service.get_from_working_memory(incident_id) if incident is None: incident = await incident_service.get_from_episodic_memory(incident_id) if not incident: logger.info( "playbook_extraction_skipped", approval_id=str(approval.id), incident_id=incident_id, reason="Incident not found", ) return # 3. 
執行成功後自動設定 outcome (冷啟動關鍵) # 2026-04-04 ogt: 首席架構師 Review — 補上 execution_success + effectiveness_score # 確保 Playbook 萃取前置條件能成立,不再依賴人工填分 from src.models.incident import IncidentOutcome, IncidentStatus from src.utils.timezone import now_taipei if incident.outcome is None: incident.outcome = IncidentOutcome() if not incident.outcome.execution_success: incident.outcome.execution_success = True if incident.outcome.effectiveness_score is None or incident.outcome.effectiveness_score < 4: incident.outcome.effectiveness_score = 4 # 系統判斷:K8s 執行成功 = 有效 if incident.status not in [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]: incident.status = IncidentStatus.RESOLVED incident.resolved_at = now_taipei() # Task 3.3 (2026-04-14): 記錄執行動作供 SSH 路徑 KM 萃取 # approval.action 含實際執行指令(可能是 kubectl 或 ssh ...), # 寫入 learning_notes 供 playbook_service._extract_repair_steps 萃取 SSH RepairStep if not incident.outcome.learning_notes and approval.action: incident.outcome.learning_notes = approval.action # 回存 Incident(fire-and-forget 路徑,失敗不影響主流程) await incident_service.save_to_working_memory(incident) logger.info( "playbook_extraction_incident_updated", approval_id=str(approval.id), incident_id=incident_id, effectiveness_score=incident.outcome.effectiveness_score, status=incident.status.value, ) # 4. 
觸發萃取(effectiveness 已保證 >= 4) from src.services.playbook_service import get_playbook_service playbook_service = get_playbook_service() effectiveness = incident.outcome.effectiveness_score or 4 playbook = await playbook_service.extract_from_incident( incident=incident, auto_approve=effectiveness >= 5, # 滿分自動核准 ) if playbook: logger.info( "playbook_auto_extracted", approval_id=str(approval.id), incident_id=incident_id, playbook_id=playbook.playbook_id, playbook_name=playbook.name, auto_approved=playbook.status.value == "approved", ) else: logger.debug( "playbook_extraction_no_result", approval_id=str(approval.id), incident_id=incident_id, ) except Exception as e: # 萃取失敗不影響主流程 logger.warning( "playbook_extraction_error", approval_id=str(approval.id), error=str(e), ) def _extract_incident_id_from_approval( self, approval: ApprovalRequest, ) -> str | None: """ 從 approval 提取關聯的 incident_id 嘗試以下來源: 1. approval.metadata (如果有) 2. approval.description 中的 INC- 模式 3. approval.requested_by 中的 incident 資訊 """ import re # 從 description 或 action 中尋找 INC-XXXXXX 模式 text = f"{approval.description or ''} {approval.action or ''}" match = re.search(r"INC-([A-Z0-9-]+)", text) if match: return match.group(0) # 返回完整的 INC-XXXXX # 從 requested_by 尋找 if approval.requested_by and "INC-" in approval.requested_by: match = re.search(r"INC-([A-Z0-9-]+)", approval.requested_by) if match: return match.group(0) return None # ========================================================================= # ADR-090 § AOL Writer (2026-04-19 ogt + Claude Opus 4.7 亞太) # 把 approval execution 的生命週期回灌 automation_operation_log. 
# 之前 33 件/7d approval 動作完全沒寫入 aol,只有 drift_narrator 的 # 22 筆 notification_formatted。修復後每次執行都留痕。 # ========================================================================= async def _log_aol_started(self, approval: ApprovalRequest) -> str | None: """ 在 automation_operation_log 寫一筆 'pending' 紀錄,回傳 op_id 供 _log_aol_completed 更新。 失敗時 (DB 異常) 回 None,主流程繼續 — aol 寫入永不阻塞執行。 2026-04-20 P0.3: input 補 target / operation_type / namespace, 失敗時 aol.input 就能直接看到 target 是什麼(追 awoooi-service 類誤判的 source trace)。 """ try: from sqlalchemy import text as _sql from src.db.base import get_db_context import json as _json # 2026-04-20 P0.3: 先嘗試從 action 解析 target / op_type,失敗不阻塞 _parsed_target: str | None = None _parsed_op: str | None = None _parsed_ns: str | None = None try: _parsed = parse_operation_from_action(approval.action or "") _parsed_target = _parsed.resource_name _parsed_op = _parsed.operation_type.value if _parsed.operation_type else None _parsed_ns = _parsed.namespace except Exception: pass input_payload = { "approval_id": str(approval.id), "incident_id": approval.incident_id or "", "action": (approval.action or "")[:500], "risk_level": getattr(approval, "risk_level", None) or "", "requested_by": getattr(approval, "requested_by", "") or "", # 2026-04-20 P0.3: target source trace "parsed_target": _parsed_target or "", "parsed_operation": _parsed_op or "", "parsed_namespace": _parsed_ns or "", } async with get_db_context() as db: row = await db.execute( _sql(""" INSERT INTO automation_operation_log ( operation_type, actor, status, input, output, tags ) VALUES ( 'playbook_executed', 'approval_execution', 'pending', CAST(:input AS jsonb), '{}'::jsonb, :tags ) RETURNING op_id """), { "input": _json.dumps(input_payload, ensure_ascii=False), "tags": ["approval", "execution", "playbook"], }, ) op_id = row.scalar() return str(op_id) if op_id else None except Exception as e: logger.warning("aol_started_write_failed", approval_id=str(approval.id), error=str(e)) return None async def 
_log_aol_completed( self, op_id: str | None, status: str, duration_ms: int, output: dict | None = None, error: str | None = None, stderr: str | None = None, ) -> None: """ UPDATE automation_operation_log 為 success/failed 並寫入結果摘要 + stderr。 status 必須是 aol constraint 允許的值: pending | success | failed | dry_run | rolled_back op_id 為 None 時靜默跳過 (started 寫入失敗時不應觸發 update 例外)。 """ if not op_id: return try: from sqlalchemy import text as _sql from src.db.base import get_db_context import json as _json async with get_db_context() as db: await db.execute( _sql(""" UPDATE automation_operation_log SET status = :status, duration_ms = :duration_ms, output = CAST(:output AS jsonb), error = :error, stderr_feed_back = :stderr WHERE op_id = CAST(:op_id AS uuid) """), { "status": status, "duration_ms": duration_ms, "output": _json.dumps(output or {}, ensure_ascii=False), "error": (error or "")[:2000] if error else None, "stderr": (stderr or "")[:8000] if stderr else None, "op_id": op_id, }, ) except Exception as e: logger.warning("aol_completed_write_failed", op_id=op_id, error=str(e)) # ============================================================================= # Singleton Instance # ============================================================================= _execution_service: ApprovalExecutionService | None = None def get_execution_service() -> ApprovalExecutionService: """ 取得 ApprovalExecutionService 單例 Returns: ApprovalExecutionService: 執行服務實例 """ global _execution_service if _execution_service is None: _execution_service = ApprovalExecutionService() return _execution_service