awoooi/apps/api/src/services/approval_execution.py

"""
Approval Execution Service - Phase 16 R4.2 瘦身 Router 抽取
============================================================

從 approvals.py 抽取執行編排邏輯，整合:
- OperationParser: 解析操作類型
- K8s Executor: 執行 K8s 操作
- ApprovalDBService: 更新狀態
- TimelineService: 記錄事件
- NotificationManager: 發送通知
- Phase 7.6: Playbook 自動萃取

版本: v1.2
建立: 2026-03-25 (台北時區)
更新: 2026-03-26 (Phase 7.6 自動萃取)
更新: 2026-04-14 (ADR-076 Task 3: 執行失敗重試機制 — Claude Haiku 4.5 Asia/Taipei)
建立者: Claude Code (Phase 16 R4.2)

重試設計 (ADR-076):
- MAX_RETRY = 2 次（共最多 3 次嘗試）
- RETRY_DELAY_SECONDS = 30 秒
- 只重試瞬態錯誤（connection refused, timeout, i/o error 等）
- 永久性錯誤（not found, permission denied, already exists）不重試
"""

import asyncio
import time
from typing import TYPE_CHECKING, Any
from uuid import UUID

import structlog

from src.core.config import settings
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.models.approval import ApprovalRequest
from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
from src.plugins.mcp.interfaces import MCPToolResult
from src.services.approval_db import get_approval_service, get_timeline_service
from src.services.executor import ExecutionResult, OperationType, get_executor
from src.services.operation_parser import parse_operation_from_action

if TYPE_CHECKING:
    from src.services.notifications import ExecutionStatus

logger = structlog.get_logger(__name__)

# ADR-090 § automated-action feedback (2026-04-19 ogt + Claude Opus 4.7, APAC):
# PostExecutionVerifier was changed from fire-and-forget to an awaited call so that
# verification_result is always written to incident_evidence.
# The 60s cap covers verifier warmup (10s) + collect (30s) + a 20s buffer.
_VERIFIER_AWAIT_TIMEOUT_SEC = 60.0

# T9: approved SSH execution must go through AwoooP MCP Gateway.
# ApprovalRequest itself is the human/multi-sig decision artifact; for write/admin
# tools we project it into the short-lived Gate 5 Redis key expected by Gateway.
# Agent / project identity used both in the Redis approval key and in GatewayContext.
_SSH_GATEWAY_AGENT_ID = "approval_executor"
_SSH_GATEWAY_PROJECT_ID = "awoooi"
# Expiry (seconds) applied to the projected approval key in Redis.
_SSH_GATEWAY_APPROVAL_TTL_SECONDS = 600
# SSH MCP tool name → scope requested from the Gateway.
# Tools missing from this table are treated as "read" by the lookup in
# _execute_ssh_tool_via_gateway, which also means no approval key is projected for them.
_SSH_GATEWAY_TOOL_SCOPES: dict[str, str] = {
    "ssh_diagnose": "read",
    "ssh_docker_restart": "write",
    "ssh_docker_compose_restart": "write",
    "ssh_systemctl_restart": "write",
    "ssh_clear_docker_logs": "write",
    "ssh_renew_ssl": "write",
    "ssh_reload_nginx": "write",
    "ssh_docker_prune": "admin",
}


class ApprovalExecutionService:
    """
    Approval execution service - orchestrates the whole execution flow.

    Responsibilities:
    1. Parse the operation type
    2. Invoke the K8s executor (with retries)
    3. Update the database state
    4. Record timeline events
    5. Send notifications
    """

    # ADR-076 Task 3: retry constants.
    # MAX_RETRY counts retries after the first attempt, so at most MAX_RETRY + 1
    # attempts are made; RETRY_DELAY_SECONDS is the sleep between attempts.
    MAX_RETRY: int = 2
    RETRY_DELAY_SECONDS: int = 30

    # Transient-error keywords (matched as lowercase substrings); any match → retryable.
    _TRANSIENT_ERROR_KEYWORDS: tuple[str, ...] = (
        "connection refused",
        "connection reset",
        "timeout",
        "timed out",
        "i/o error",
        "io error",
        "temporary failure",
        "service unavailable",
        "too many requests",
        "dial tcp",
        "eof",
    )

    # Permanent-error keywords (matched as lowercase substrings); any match → never retried.
    # These take precedence over the transient keywords in _is_transient_error.
    _PERMANENT_ERROR_KEYWORDS: tuple[str, ...] = (
        "not found",
        "forbidden",
        "permission denied",
        "unauthorized",
        "already exists",
        "invalid",
        "immutable",
        "destructive",
        "blocked",
    )

    @classmethod
    def _is_transient_error(cls, error_message: str | None) -> bool:
        """
        判斷執行錯誤是否為瞬態（可重試）

        優先檢查永久性錯誤（比瞬態錯誤有更高的優先順序），
        避免 "connection refused (not found)" 這類混合訊息誤判。

        Args:
            error_message: 執行錯誤訊息

        Returns:
            True 表示可重試，False 表示永久失敗
        """
        if not error_message:
            return False
        lower = error_message.lower()
        # 永久性錯誤 → 不重試
        if any(kw in lower for kw in cls._PERMANENT_ERROR_KEYWORDS):
            return False
        # 瞬態錯誤 → 可重試
        return any(kw in lower for kw in cls._TRANSIENT_ERROR_KEYWORDS)

    async def execute_approved_action(self, approval: ApprovalRequest) -> bool:
        """
        Execute an already-approved operation in the background.

        Invoked through BackgroundTasks so the API response is not blocked.
        Phase 5: database state is updated after execution.
        Phase 6: notifications are sent after execution (post-execution hook).

        2026-04-17 ogt + Claude Sonnet 4.6: returns a bool reporting whether the
        execution succeeded. Root cause: returning None meant the auto-execute path in
        decision_manager.py could not learn the outcome, always passed success=True to
        _push_auto_repair_result and broadcast false successes. Fix: return
        result.success and let the caller decide the Telegram message.

        Dispatch summary:
        - Unparseable action: NO_ACTION / observe / investigate style actions are
          recorded as a successful no-op; anything else is recorded as a parse failure.
        - OperationType.SSH_HOST: routed to _execute_ssh_host_action (no retry loop).
        - OperationType.INVESTIGATE: routed to executor.execute_kubectl_command
          (no retry loop).
        - Everything else: executor.execute_with_audit, retried up to MAX_RETRY times
          on transient errors, sleeping RETRY_DELAY_SECONDS between attempts.

        Args:
            approval: The approved approval request.

        Returns:
            bool: True when execution succeeded (NO_ACTION counts as success),
            False when execution failed or the action could not be parsed.
        """
        from src.services.notifications import ExecutionStatus

        logger.info(
            "background_execution_start",
            approval_id=str(approval.id),
            action=approval.action,
        )

        # ADR-090 § automated-action feedback (2026-04-19): leave an AOL trace as soon as
        # the main flow starts and update it when it ends. This does not rely on
        # fire-and-forget, so every approval stays observable.
        _aol_op_id = await self._log_aol_started(approval)
        _aol_started_ms = time.time()

        service = get_approval_service()
        timeline = get_timeline_service()

        # Parse operation details
        parsed = parse_operation_from_action(approval.action)
        operation_type = parsed.operation_type
        resource_name = parsed.resource_name
        namespace = parsed.namespace

        # 2026-04-27 P3.1-T1 by Claude — Tier-1 service integration.
        # After the kubectl command is parsed, verify that the resource exists in K8s and
        # apply the normalized name. Exceptions never block the main flow; a miss or a
        # suggestion is only logged and counted, it does not stop execution.
        #
        # SSH_HOST is excluded: there resource_name is a host name, and resolving it as a
        # "deployment" could overwrite the SSH target with an unrelated K8s name and
        # would count every host as a resolver miss.
        if (
            resource_name is not None
            and operation_type is not None
            and operation_type != OperationType.SSH_HOST
        ):
            try:
                from src.services.resource_resolver import get_resource_resolver
                from src.core.metrics import RESOURCE_RESOLVE_TOTAL

                _resolver = get_resource_resolver()
                _resolve = await _resolver.resolve(
                    raw_resource=resource_name,
                    namespace=namespace,
                    resource_kind="deployment",
                )
                if _resolve.success and _resolve.resource_name:
                    if _resolve.resource_name != resource_name:
                        logger.info(
                            "resource_name_normalized",
                            original=resource_name,
                            normalized=_resolve.resource_name,
                            namespace=namespace,
                        )
                    resource_name = _resolve.resource_name
                    RESOURCE_RESOLVE_TOTAL.labels(result="hit").inc()
                elif _resolve.candidates:
                    logger.warning(
                        "resource_not_found_in_k8s",
                        resource=resource_name,
                        namespace=namespace,
                        suggestions=_resolve.candidates,
                    )
                    RESOURCE_RESOLVE_TOTAL.labels(result="suggestion").inc()
                else:
                    logger.warning(
                        "resource_not_found_in_k8s",
                        resource=resource_name,
                        namespace=namespace,
                        suggestions=[],
                    )
                    RESOURCE_RESOLVE_TOTAL.labels(result="miss").inc()
            except Exception as _rr_e:
                logger.warning("resource_resolve_failed", error=str(_rr_e))
                # Best effort: the metric itself may be what failed to import.
                try:
                    from src.core.metrics import RESOURCE_RESOLVE_TOTAL
                    RESOURCE_RESOLVE_TOTAL.labels(result="error").inc()
                except Exception:
                    pass

        if operation_type is None or resource_name is None:
            # 2026-04-19 ogt + Claude Opus 4.7: distinguish NO_ACTION from a genuine
            # parse failure. NO_ACTION is a deliberate "investigate only, change nothing"
            # choice by the AI and must not be flagged EXECUTION_FAILED, which would
            # pollute the auto_execute success-rate KPI (MASTER §7.1 #11).
            #
            # Work from a None-safe copy so that a missing action cannot raise TypeError
            # in the membership test or the slices below.
            action_text = approval.action or ""
            _action_upper = action_text.upper()
            _is_no_action = (
                "NO_ACTION" in _action_upper
                or "NO-ACTION" in _action_upper
                or "NOACTION" in _action_upper
                or "(未設)" in action_text
                or _action_upper.startswith("OBSERVE")
                or _action_upper.startswith("INVESTIGATE")
            )

            if _is_no_action:
                logger.info(
                    "background_execution_noop",
                    approval_id=str(approval.id),
                    action=approval.action,
                    reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
                    path="no_action",
                )
                # Mark as SUCCESS (observing / investigating is itself a completed action)
                await service.update_execution_status(approval.id, success=True)
                await timeline.add_event(
                    event_type="exec",
                    status="success",
                    title="✅ 純觀察類動作完成 (NO_ACTION)",
                    description=f"Action: {action_text[:120]}",
                    actor="leWOOOgo",
                    actor_role="executor",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                )
                # Reply to the original alert card with the execution result.
                # NOTE(review): the Task returned by create_task is not retained here or
                # at the other fire-and-forget call sites below; the asyncio docs advise
                # keeping a reference so the task cannot be garbage-collected mid-run.
                asyncio.create_task(
                    self._push_execution_result_to_alert(
                        approval, success=True, error=None,
                    )
                )
                # ADR-090 § aol completed (NO_ACTION counts as success)
                await self._log_aol_completed(
                    op_id=_aol_op_id,
                    status="success",
                    duration_ms=int((time.time() - _aol_started_ms) * 1000),
                    output={"reason": "NO_ACTION", "action": action_text[:200]},
                )
                # F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
                # The NO_ACTION path must push the incident to RESOLVED, otherwise it
                # stays stuck in INVESTIGATING forever (FlywheelExecutionRateMissing dead
                # alert + root cause #1 of the growing stuck-incident count).
                # resolve_incident already has an idempotency guard for RESOLVED, so a
                # repeated resolve returns the existing incident without re-triggering
                # the postmortem.
                if approval.incident_id:
                    try:
                        from src.services.incident_service import get_incident_service

                        await get_incident_service().resolve_incident(approval.incident_id)
                        logger.info(
                            "incident_resolved_after_no_action_execution",
                            incident_id=approval.incident_id,
                            approval_id=str(approval.id),
                            path="no_action",
                        )
                    except Exception as _resolve_e:
                        logger.warning(
                            "incident_resolve_after_no_action_execution_failed",
                            incident_id=approval.incident_id,
                            approval_id=str(approval.id),
                            error=str(_resolve_e),
                        )
                return True  # NO_ACTION counts as successfully completed

            # Genuine parse failure (not NO_ACTION)
            logger.warning(
                "background_execution_skip",
                approval_id=str(approval.id),
                reason="Could not parse operation type from action",
                action=approval.action,
            )
            # Phase 5: update database state, including error_message (P0.2)
            await service.update_execution_status(
                approval.id, success=False,
                error_message=f"Could not parse operation type from action: {action_text[:150]}",
            )
            await timeline.add_event(
                event_type="exec",
                status="error",
                title="執行失敗: 無法解析操作類型",
                description=f"Action: {approval.action}",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )

            # Phase 6: send failure notification (fire-and-forget)
            asyncio.create_task(
                self._send_execution_notification(
                    approval=approval,
                    execution_status=ExecutionStatus.FAILED,
                    operation_type="unknown",
                    namespace=namespace,
                    error_message="Could not parse operation type",
                )
            )
            # ADR-090 § aol completed (parse failure)
            await self._log_aol_completed(
                op_id=_aol_op_id,
                status="failed",
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                error=f"parse_fail: {action_text[:300]}",
            )
            return False  # Parse failure → nothing was executed

        executor = get_executor()
        # Number of attempts made so far; only the retry loop below increments it, so the
        # SSH_HOST and INVESTIGATE paths always report a single attempt.
        attempt = 1

        # 2026-05-02 ogt + Claude Sonnet 4.6: host SSH operation branch.
        # Root cause: when an ssh action was approved manually, the parser only
        # understood kubectl and returned None → a bogus "Could not parse" failure.
        # Fix: when the SSH_HOST type is detected, go through the SSH path instead of
        # the K8s executor.
        if operation_type == OperationType.SSH_HOST:
            result = await self._execute_ssh_host_action(
                approval=approval,
                host=resource_name or "",
            )
            logger.info(
                "background_execution_ssh_host",
                approval_id=str(approval.id),
                action=approval.action,
                host=resource_name,
                success=result.success,
                message=result.message,
            )
        elif operation_type == OperationType.INVESTIGATE:
            # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 fix — INVESTIGATE read-only query.
            # Root cause: INVESTIGATE is not handled by executor.execute_with_audit and
            # fell through to success=False.
            # Fix: when INVESTIGATE is detected, call execute_kubectl_command with the
            # raw action. Read-only commands skip the retry loop (a failure is final).
            result = await executor.execute_kubectl_command(
                command=approval.action,
                timeout_sec=30,
            )
            logger.info(
                "background_execution_investigate",
                approval_id=str(approval.id),
                action=approval.action,
                success=result.success,
                message=result.message,
            )
        else:
            # ADR-076 Task 3: retry on execution failure.
            # Transient errors (connection refused, timeout, ...) are retried
            # automatically, at most MAX_RETRY times.
            result = await executor.execute_with_audit(
                approval=approval,
                operation_type=operation_type,
                resource_name=resource_name,
                namespace=namespace,
            )

            while not result.success and attempt <= self.MAX_RETRY:
                if not self._is_transient_error(result.error):
                    logger.info(
                        "execution_retry_skipped_permanent_error",
                        approval_id=str(approval.id),
                        attempt=attempt,
                        error=result.error,
                    )
                    break

                logger.warning(
                    "execution_retry_transient_error",
                    approval_id=str(approval.id),
                    attempt=attempt,
                    max_retry=self.MAX_RETRY,
                    error=result.error,
                    delay_seconds=self.RETRY_DELAY_SECONDS,
                )
                await timeline.add_event(
                    event_type="exec",
                    status="warning",
                    title=f"⚠️ 執行失敗，{self.RETRY_DELAY_SECONDS}s 後重試 ({attempt}/{self.MAX_RETRY})",
                    description=f"Error: {result.error}",
                    actor="leWOOOgo",
                    actor_role="executor",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                )
                await asyncio.sleep(self.RETRY_DELAY_SECONDS)
                result = await executor.execute_with_audit(
                    approval=approval,
                    operation_type=operation_type,
                    resource_name=resource_name,
                    namespace=namespace,
                )
                attempt += 1

        # Phase 5: update database state.
        # 2026-04-18 ADR-090 L5 P0.2: on failure pass error_message along so it is
        # written to rejection_reason.
        await service.update_execution_status(
            approval.id,
            success=result.success,
            error_message=None if result.success else (result.error or "(executor 未回傳錯誤)"),
        )

        # After the retry loop, attempt holds the total number of attempts made.
        total_attempts = attempt
        if result.success:
            logger.info(
                "background_execution_success",
                approval_id=str(approval.id),
                operation=operation_type.value,
                target=resource_name,
                namespace=namespace,
                duration_ms=result.duration_ms,
                total_attempts=total_attempts,
            )
            retry_note = f" (重試 {total_attempts - 1} 次後成功)" if total_attempts > 1 else ""
            await timeline.add_event(
                event_type="exec",
                status="success",
                title=f"✅ K8s 執行成功: {operation_type.value}{retry_note}",
                description=f"Target: {resource_name} @ {namespace} ({result.duration_ms}ms)",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )

            # Phase 6: send success notification (fire-and-forget)
            asyncio.create_task(
                self._send_execution_notification(
                    approval=approval,
                    execution_status=ExecutionStatus.SUCCESS,
                    operation_type=operation_type.value,
                    namespace=namespace,
                    duration_ms=result.duration_ms,
                )
            )

            # 2026-04-14 Claude Sonnet 4.6: reply to the original alert card with the result.
            # The auto_approve path is handled by _push_auto_repair_result; this call
            # only serves manual approvals.
            asyncio.create_task(
                self._push_execution_result_to_alert(approval, success=True, error=None)
            )

            # Phase 7.6: trigger automatic Playbook extraction (fire-and-forget)
            asyncio.create_task(
                self._trigger_playbook_extraction(approval)
            )

            # ADR-030 Phase 5 / ADR-083 Phase 3: trigger the learning service.
            # Phase 3 fix: no longer fire-and-forget; awaited with a 30s circuit breaker.
            # On timeout the event is logged and the main flow continues (no crash).
            # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 fire-and-forget fix
            try:
                await asyncio.wait_for(
                    self._trigger_learning(
                        approval=approval,
                        success=True,
                        duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0,
                    ),
                    timeout=30.0,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "learning_trigger_timeout",
                    approval_id=str(approval.id),
                    timeout_sec=30.0,
                )

            # ADR-081 Phase 1 + ADR-090 fix (2026-04-19 ogt + Claude Opus 4.7):
            # PostExecutionVerifier is awaited with a 60s timeout so verification_result
            # is always written. The earlier fire-and-forget task was killed on Pod
            # recycle, leaving evidence rows NULL.
            from src.core.feature_flags import aiops_flags
            if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
                try:
                    await asyncio.wait_for(
                        self._run_post_execution_verify(
                            approval=approval,
                            action_taken=f"{operation_type.value}:{resource_name}",
                        ),
                        timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )
                except asyncio.TimeoutError:
                    logger.warning(
                        "post_verify_timeout_exceeded",
                        approval_id=str(approval.id),
                        timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )

            # 2026-04-07 Claude Code: Sprint 4 B3 — record the human-approved disposition
            try:
                anomaly_key = await self._get_anomaly_key_from_approval(approval)
                if anomaly_key:
                    from src.services.anomaly_counter import get_anomaly_counter
                    counter = get_anomaly_counter()
                    await counter.record_disposition(anomaly_key, "human_approved")
            except Exception as _disp_e:
                logger.warning("disposition_record_failed", error=str(_disp_e))

            # ADR-073 patch: execution succeeded → resolve the Incident → trigger KM
            # conversion. Previously RESOLVED was never reached, so KM entries and
            # Playbooks were never generated.
            if approval.incident_id:
                try:
                    from src.services.incident_service import get_incident_service
                    _inc_svc = get_incident_service()
                    await _inc_svc.resolve_incident(approval.incident_id)
                    logger.info(
                        "incident_resolved_after_execution",
                        incident_id=approval.incident_id,
                        approval_id=str(approval.id),
                    )
                except Exception as _resolve_e:
                    logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e))

            # ADR-090 § aol completed (execution succeeded)
            await self._log_aol_completed(
                op_id=_aol_op_id,
                status="success",
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                output={
                    "operation_type": operation_type.value,
                    "resource_name": resource_name,
                    "namespace": namespace,
                    "executor_duration_ms": result.duration_ms,
                    "total_attempts": total_attempts,
                },
            )
            return True  # Execution succeeded

        else:
            logger.error(
                "background_execution_failed",
                approval_id=str(approval.id),
                operation=operation_type.value,
                target=resource_name,
                namespace=namespace,
                error=result.error,
            )
            await timeline.add_event(
                event_type="exec",
                status="error",
                title=f"❌ K8s 執行失敗: {operation_type.value}",
                description=f"Error: {result.error}",
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
            )

            # Phase 6: send failure notification (fire-and-forget, covers Dry-Run blocks).
            # An error containing "not found" is reported as DRY_RUN_BLOCKED.
            exec_status = (
                ExecutionStatus.DRY_RUN_BLOCKED
                if "not found" in (result.error or "")
                else ExecutionStatus.FAILED
            )
            asyncio.create_task(
                self._send_execution_notification(
                    approval=approval,
                    execution_status=exec_status,
                    operation_type=operation_type.value,
                    namespace=namespace,
                    error_message=result.error,
                    duration_ms=result.duration_ms,
                )
            )

            # 2026-04-14 Claude Sonnet 4.6: reply to the original alert card with the failure
            asyncio.create_task(
                self._push_execution_result_to_alert(
                    approval, success=False, error=result.error
                )
            )

            # ADR-030 Phase 5 / ADR-083 Phase 3: trigger the learning service (failure case).
            # Phase 3 fix: fire-and-forget → awaited with a 30s circuit breaker.
            # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 fire-and-forget fix
            try:
                await asyncio.wait_for(
                    self._trigger_learning(
                        approval=approval,
                        success=False,
                        error_message=result.error,
                        duration_seconds=result.duration_ms / 1000 if result.duration_ms else 0,
                    ),
                    timeout=30.0,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "learning_trigger_timeout",
                    approval_id=str(approval.id),
                    timeout_sec=30.0,
                )

            # ADR-090 fix (2026-04-19 ogt + Claude Opus 4.7):
            # Run the verifier on failure too, so verification_result='failed' is written
            # back to the evidence. Awaited with a 60s timeout (it used to be
            # fire-and-forget and the task was killed on Pod recycle).
            from src.core.feature_flags import aiops_flags
            if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
                try:
                    await asyncio.wait_for(
                        self._run_post_execution_verify(
                            approval=approval,
                            action_taken=f"{operation_type.value}:{resource_name}:FAILED",
                        ),
                        timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )
                except asyncio.TimeoutError:
                    logger.warning(
                        "post_verify_timeout_exceeded_failed_path",
                        approval_id=str(approval.id),
                        timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                    )

            # ADR-090 § aol completed (execution failed)
            await self._log_aol_completed(
                op_id=_aol_op_id,
                status="failed",
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                output={
                    "operation_type": operation_type.value,
                    "resource_name": resource_name,
                    "namespace": namespace,
                    "executor_duration_ms": result.duration_ms,
                    "total_attempts": total_attempts,
                },
                error=result.error,
                stderr=result.error,  # E6 stderr feedback — used for retry / Playbook negative reinforcement
            )
            return False  # Execution failed

    async def _execute_ssh_host_action(
        self,
        approval: ApprovalRequest,
        host: str,
    ) -> ExecutionResult:
        """
        Execute an SSH host action (used by the manual-approval path).

        2026-05-02 ogt + Claude Sonnet 4.6: fixes manually approved SSH actions getting
        stuck. Root cause: parse_operation_from_action only understood kubectl and the
        K8s executor rejected the action. Fix: once SSH_HOST is detected, route the
        action to an SSH MCP tool, aligned with decision_manager._ssh_execute.

        Action routing (keyword tests are case-insensitive):
        - contains "docker" and "prune" → ssh_docker_prune
        - "docker restart <name>" → ssh_docker_restart
        - "systemctl restart <svc>" → ssh_systemctl_restart
        - "ps aux" / "df -h" / "free -h" / "top" / "uptime" / "echo " / "ls -"
          → ssh_diagnose
        - anything else: a failed ExecutionResult asking for the action to be rewritten

        The container / service name is taken from the original action text so its
        letter case is preserved (Docker container names and systemd unit names are
        case-sensitive). Diagnose keywords must start at a word boundary so that
        commands such as "docker stop x" are not mistaken for "top ".

        Args:
            approval: The approved request; only its id, incident_id and action are read.
            host: Target host passed to the SSH tool as the "host" parameter.

        Returns:
            ExecutionResult describing the outcome. This method does not raise:
            exceptions from the gateway call are converted into a failed result.
        """
        import re

        start = time.time()
        action = approval.action or ""
        action_lower = action.lower().strip()

        # Route to an SSH MCP tool (aligned with decision_manager._ssh_execute)
        params: dict[str, Any] = {"host": host}
        tool_name: str | None = None

        if "docker" in action_lower and "prune" in action_lower:
            tool_name = "ssh_docker_prune"
            params["trust_score"] = 0.85
        elif "docker restart" in action_lower:
            # Extract the container name from the un-lowercased text; without a name the
            # action stays unrouted.
            m = re.search(
                r"docker\s+restart\s+([A-Za-z0-9._-]+)", action, re.IGNORECASE
            )
            if m:
                tool_name = "ssh_docker_restart"
                params["container_name"] = m.group(1)
                params["trust_score"] = 0.85
        elif "systemctl restart" in action_lower:
            m = re.search(
                r"systemctl\s+restart\s+([A-Za-z0-9._-]+)", action, re.IGNORECASE
            )
            if m:
                tool_name = "ssh_systemctl_restart"
                params["service"] = m.group(1)
                params["trust_score"] = 0.85
        elif re.search(
            r"(?<![a-z0-9_-])(?:ps aux|df -h|free -h|top(?= |$)|uptime|echo |ls -)",
            action_lower,
        ):
            # Host diagnostics (all collected in one go by ssh_diagnose). The lookbehind
            # keeps "stop " / "desktop " / "tools -" from matching.
            tool_name = "ssh_diagnose"

        if tool_name is None:
            duration_ms = int((time.time() - start) * 1000)
            err = f"SSH action 無法路由到 SSH MCP tool: {action[:120]}"
            logger.warning(
                "ssh_host_action_unrouted",
                approval_id=str(approval.id),
                action=action,
                host=host,
            )
            return ExecutionResult(
                success=False,
                message="SSH action unrouted",
                operation_type=OperationType.SSH_HOST,
                target_resource=host,
                namespace="host",
                duration_ms=duration_ms,
                error=err,
            )

        try:
            logger.warning(
                "mcp_gateway_approved_ssh_execution_path",
                approval_id=str(approval.id),
                incident_id=approval.incident_id,
                tool=tool_name,
                host=host,
                agent_id=_SSH_GATEWAY_AGENT_ID,
            )
            mcp_result = await self._execute_ssh_tool_via_gateway(
                approval=approval,
                tool_name=tool_name,
                params=params,
            )
            duration_ms = int((time.time() - start) * 1000)
            success = bool(mcp_result.success)
            return ExecutionResult(
                success=success,
                message=f"ssh_mcp:{tool_name} {'ok' if success else 'failed'}",
                operation_type=OperationType.SSH_HOST,
                target_resource=host,
                namespace="host",
                duration_ms=duration_ms,
                k8s_response={"tool": tool_name, "output": mcp_result.output} if success else None,
                error=None if success else (mcp_result.error or "ssh_mcp execution failed"),
            )
        except Exception as e:
            # Boundary handler: the caller expects an ExecutionResult, never an exception.
            duration_ms = int((time.time() - start) * 1000)
            logger.warning(
                "ssh_host_action_exception",
                approval_id=str(approval.id),
                tool=tool_name,
                error=str(e),
            )
            return ExecutionResult(
                success=False,
                message="ssh_mcp exception",
                operation_type=OperationType.SSH_HOST,
                target_resource=host,
                namespace="host",
                duration_ms=duration_ms,
                error=str(e),
            )

    async def _execute_ssh_tool_via_gateway(
        self,
        approval: ApprovalRequest,
        tool_name: str,
        params: dict[str, Any],
    ) -> MCPToolResult:
        """
        Call an SSH MCP tool through McpGateway on behalf of an approved request.

        For any scope other than "read", an "approved" marker is first written to
        Redis under the approval key built from project, agent, tool and run id, with
        a TTL of _SSH_GATEWAY_APPROVAL_TTL_SECONDS. A failure to write that marker is
        only logged; the gateway call is still attempted.

        Args:
            approval: Approved request; its id becomes the run id and feeds the audit
                metadata.
            tool_name: SSH MCP tool to invoke; its scope comes from
                _SSH_GATEWAY_TOOL_SCOPES and defaults to "read" when unlisted.
            params: Tool parameters; copied and extended with an "_mcp_audit" entry.

        Returns:
            The gateway's MCPToolResult, or a failed MCPToolResult when the gateway
            raises McpGatewayError.
        """
        scope = _SSH_GATEWAY_TOOL_SCOPES.get(tool_name, "read")

        # The gateway run id must be a UUID; coerce other id representations.
        if isinstance(approval.id, UUID):
            run_id = approval.id
        else:
            run_id = UUID(str(approval.id))

        needs_approval_marker = scope != "read"
        if needs_approval_marker:
            gate_key = (
                f"mcp_approval:{_SSH_GATEWAY_PROJECT_ID}:{_SSH_GATEWAY_AGENT_ID}:"
                f"{tool_name}:{run_id}"
            )
            try:
                redis_client = get_redis()
                await redis_client.set(
                    gate_key,
                    "approved",
                    ex=_SSH_GATEWAY_APPROVAL_TTL_SECONDS,
                )
            except Exception as projection_error:
                # Best effort: log and continue with the gateway call.
                logger.warning(
                    "mcp_gateway_approval_projection_failed",
                    approval_id=str(approval.id),
                    tool=tool_name,
                    approval_key=gate_key,
                    error=str(projection_error),
                )

        # Copy the caller's params and attach the audit metadata.
        gateway_params = dict(params)
        gateway_params["_mcp_audit"] = {
            "session_id": f"approval:{approval.id}",
            "incident_id": approval.incident_id,
            "agent_role": _SSH_GATEWAY_AGENT_ID,
            "flywheel_node": "execute",
            "approval_id": str(approval.id),
        }

        async with get_db_context(_SSH_GATEWAY_PROJECT_ID) as session:
            gateway_ctx = GatewayContext(
                project_id=_SSH_GATEWAY_PROJECT_ID,
                agent_id=_SSH_GATEWAY_AGENT_ID,
                tool_name=tool_name,
                run_id=run_id,
                trace_id=approval.incident_id or str(approval.id),
                is_shadow=False,
                environment={"env": "prod"},
                required_scope=scope,
            )
            try:
                outcome = await McpGateway(session).call(gateway_ctx, gateway_params)
            except McpGatewayError as blocked:
                logger.warning(
                    "mcp_gateway_approved_ssh_blocked",
                    approval_id=str(approval.id),
                    incident_id=approval.incident_id,
                    tool=tool_name,
                    gate=blocked.gate,
                    error_code=blocked.error_code,
                    error=str(blocked),
                )
                return MCPToolResult(
                    success=False,
                    execution_id=f"blocked:{tool_name}:{run_id}",
                    error=f"{blocked.error_code}: {blocked}",
                )
            return outcome

    async def _push_execution_result_to_alert(
        self,
        approval: ApprovalRequest,
        success: bool,
        error: str | None,
    ) -> None:
        """
        Reply to the original alert's Telegram card with the execution result.

        Only the human-approved path is handled here. Auto-approved requests
        return immediately; per the original note they are reported by
        ``_push_auto_repair_result`` and sending here would duplicate the message.

        The original card's message id is read from the Redis key
        ``tg_msg:{incident_id}``. Nothing is sent when the approval has no
        incident id, when no message id is stored, or when the stored value is
        not an integer.

        Besides the reply, the original card's inline keyboard is cleared so the
        card does not keep showing action buttons after execution.

        The action text and the error text are HTML-escaped before being placed
        in the message, because the message is sent with ``parse_mode="HTML"``
        and raw ``<``, ``>`` or ``&`` would make Telegram reject it.

        The whole method is best effort: any exception is logged as
        ``push_execution_result_failed`` and swallowed.

        Args:
            approval: The executed approval request.
            success: Whether the execution succeeded.
            error: Error text for a failed execution; may be ``None``.
        """
        try:
            # Auto-approved path is skipped here to avoid a duplicate message.
            if self._is_auto_approved_request(approval):
                return

            if not approval.incident_id:
                return

            from html import escape as _html_escape

            from src.core.redis_client import get_redis
            redis = get_redis()
            msg_id_raw = await redis.get(f"tg_msg:{approval.incident_id}")
            if not msg_id_raw:
                logger.debug(
                    "push_execution_result_no_msg_id",
                    incident_id=approval.incident_id,
                    approval_id=str(approval.id),
                )
                return

            try:
                orig_msg_id = int(msg_id_raw)
            except (TypeError, ValueError):
                return

            from src.core.config import get_settings
            from src.services.telegram_gateway import get_telegram_gateway
            tg_settings = get_settings()
            gateway = get_telegram_gateway()
            target_chat_id = tg_settings.SRE_GROUP_CHAT_ID or tg_settings.OPENCLAW_TG_CHAT_ID

            # AP-2 fix (2026-04-19): in addition to replying, strip the buttons
            # from the original card so it does not stay in the "executing" state.
            try:
                await gateway._send_request("editMessageReplyMarkup", {
                    "chat_id": target_chat_id,
                    "message_id": orig_msg_id,
                    "reply_markup": {"inline_keyboard": []},
                })
            except Exception as _edit_e:
                logger.debug("push_execution_edit_buttons_failed",
                             approval_id=str(approval.id), error=str(_edit_e))

            # Append a KM / Playbook delta line. NOTE: the queries below count
            # every row touched in the last 2 minutes; they are not filtered by
            # this incident.
            km_info = ""
            try:
                from sqlalchemy import text as _sql
                from src.db.base import get_db_context
                async with get_db_context() as _db:
                    _km_row = await _db.execute(
                        _sql("""SELECT COUNT(*) FROM knowledge_entries
                                WHERE created_at > NOW() - interval '2 minutes'"""),
                    )
                    _km_count = _km_row.scalar() or 0
                    _pb_row = await _db.execute(
                        _sql("""SELECT COUNT(*) FROM playbooks
                                WHERE updated_at > NOW() - interval '2 minutes'"""),
                    )
                    _pb_count = _pb_row.scalar() or 0
                    if _km_count or _pb_count:
                        km_info = f"\n📚 KM +{_km_count}  🎯 Playbook 更新×{_pb_count}"
            except Exception as _km_e:
                # Purely decorative information; keep going without it, but
                # leave a trace instead of failing silently.
                logger.debug("push_execution_km_info_failed",
                             approval_id=str(approval.id), error=str(_km_e))

            # Truncate first, then escape, so an entity is never cut in half.
            action_html = _html_escape((approval.action or "")[:180], quote=False)
            if success:
                text = (
                    f"✅ <b>執行成功</b>\n"
                    f"<code>{action_html}</code>"
                    f"{km_info}"
                )
            else:
                err_short = _html_escape((error or "未知錯誤")[:150], quote=False)
                text = (
                    f"❌ <b>執行失敗</b>\n"
                    f"<code>{action_html}</code>\n"
                    f"原因: {err_short}"
                    f"{km_info}"
                )

            await gateway._send_request(
                "sendMessage",
                {
                    "chat_id": target_chat_id,
                    "text": text,
                    "parse_mode": "HTML",
                    "reply_to_message_id": orig_msg_id,
                },
            )
            logger.info(
                "push_execution_result_sent",
                incident_id=approval.incident_id,
                approval_id=str(approval.id),
                success=success,
                orig_msg_id=orig_msg_id,
            )
        except Exception as e:
            logger.warning(
                "push_execution_result_failed",
                approval_id=str(approval.id),
                error=str(e),
            )

    async def _get_anomaly_key_from_approval(self, approval: ApprovalRequest) -> str | None:
        """
        Resolve approval -> incident -> anomaly key.

        The incident is looked up in working memory only, and the key itself is
        derived by ``AnomalyCounter.derive_key_from_incident()``.

        Returns:
            The anomaly key, or ``None`` when the approval has no incident id,
            the incident is not found, or any step raises (the error is logged).
        """
        try:
            incident_id = approval.incident_id
            if incident_id:
                from src.services.incident_service import get_incident_service

                found = await get_incident_service().get_from_working_memory(incident_id)
                if found:
                    from src.services.anomaly_counter import AnomalyCounter

                    return AnomalyCounter.derive_key_from_incident(found)
            return None
        except Exception as lookup_error:
            logger.warning("get_anomaly_key_from_approval_failed", error=str(lookup_error))
            return None

    async def _trigger_learning(
        self,
        approval: ApprovalRequest,
        success: bool,
        duration_seconds: float = 0,
        error_message: str | None = None,
    ) -> None:
        """
        ADR-030 Phase 5: feed an execution result to the learning service and KM.

        Two independent best-effort steps run in sequence:

        1. ``learning_service.process_execution_result()`` is called with the
           execution outcome.
        2. The execution record is written to the knowledge base through
           ``write_execution_result_to_km()``. This runs even when step 1 failed.

        A failure in either step is logged and swallowed so that neither can
        break the caller's main flow.

        Args:
            approval: The executed approval request.
            success: Whether the execution succeeded.
            duration_seconds: Execution duration passed on to the learning service.
            error_message: Error text for a failed execution, if any.
        """
        try:
            # Local import; note this ExecutionResult is the learning service's
            # own type, not the executor's type imported at module level.
            from src.services.learning_service import (
                ExecutionResult,
                get_learning_service,
            )

            learning = get_learning_service()
            result = ExecutionResult(
                approval_id=str(approval.id),
                incident_id=approval.incident_id or "",
                action=approval.action,
                success=success,
                error_message=error_message,
                duration_seconds=duration_seconds,
            )

            await learning.process_execution_result(
                approval=approval,
                result=result,
            )

        except Exception as e:
            # A learning failure must not affect the main flow.
            logger.warning(
                "learning_trigger_failed",
                approval_id=str(approval.id),
                error=str(e),
            )

        # The KM write is deliberately outside the try block above so that it
        # still happens when learning failed (every execution record must be
        # written back to KM). It is awaited rather than fired and forgotten.
        # It gets its own guard so that a failure while building or writing the
        # entry is reported here instead of propagating to the caller, matching
        # how finalize_auto_approved_execution treats the same call.
        try:
            await self.write_execution_result_to_km(approval, success, error_message)
        except Exception as km_error:
            logger.warning(
                "execution_km_write_failed",
                approval_id=str(approval.id),
                error=str(km_error),
            )

    async def _run_post_execution_verify(
        self,
        approval: "ApprovalRequest",
        action_taken: str,
    ) -> None:
        """
        ADR-081 Phase 1: verify the environment after an action was executed.

        Flow:
        1. Load the incident for ``approval.incident_id`` (working memory first,
           episodic memory as fallback).
        2. Fetch the latest evidence snapshot for that incident.
        3. Call ``PostExecutionVerifier.verify()`` with incident, snapshot and
           ``action_taken``.
        4. Hand the verification result to the learning service (ADR-083 Phase 3).

        Returns immediately when the approval has no incident id. Errors raised
        inside the flow are logged and swallowed so verification can never change
        the execution outcome; a learning-service failure is logged separately.
        """
        incident_id = approval.incident_id
        if not incident_id:
            return

        try:
            from src.services.incident_service import get_incident_service
            from src.services.post_execution_verifier import get_post_execution_verifier
            # get_latest_snapshot is a module-level coroutine function, not a
            # classmethod of EvidenceSnapshot.
            from src.services.evidence_snapshot import get_latest_snapshot

            incidents = get_incident_service()
            # IncidentService has no get_incident(); use the two memory lookups.
            incident = await incidents.get_from_working_memory(incident_id)
            if incident is None:
                incident = await incidents.get_from_episodic_memory(incident_id)
            if incident is None:
                logger.warning(
                    "post_verify_incident_not_found",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                )
                return

            # Latest snapshot; per the original note it only exists when the
            # Phase 1 flag was enabled.
            latest_snapshot = await get_latest_snapshot(incident_id)

            verdict = await get_post_execution_verifier().verify(
                incident=incident,
                snapshot=latest_snapshot,
                action_taken=action_taken,
            )

            logger.info(
                "post_verify_complete",
                approval_id=str(approval.id),
                incident_id=incident_id,
                result=verdict,
                action=action_taken,
            )

            # Environment verification is a more precise learning signal than
            # the executor's exit code, so pass it on to the learning service.
            try:
                from src.services.learning_service import get_learning_service

                playbook_ref = getattr(approval, "matched_playbook_id", None)
                learning = get_learning_service()
                await learning.record_verification_result(
                    incident_id=incident_id,
                    action_taken=action_taken,
                    verification_result=verdict,
                    matched_playbook_id=playbook_ref,
                )
            except Exception as learning_error:
                logger.warning(
                    "post_verify_learning_failed",
                    approval_id=str(approval.id),
                    error=str(learning_error),
                )

        except Exception as verify_error:
            # A verification failure must not affect the execution result.
            logger.warning(
                "post_verify_failed",
                approval_id=str(approval.id),
                error=str(verify_error),
            )

    @staticmethod
    def _is_auto_approved_request(approval: "ApprovalRequest") -> bool:
        requested_by = (getattr(approval, "requested_by", "") or "").lower()
        return requested_by.startswith("auto_approve")

    @staticmethod
    def _is_observation_only_action(action: str | None) -> bool:
        action_upper = (action or "").strip().upper()
        return (
            not action_upper
            or "NO_ACTION" in action_upper
            or "NO-ACTION" in action_upper
            or "NOACTION" in action_upper
            or action_upper.startswith("OBSERVE")
            or action_upper.startswith("INVESTIGATE")
        )

    @staticmethod
    def _approval_risk_value(approval: "ApprovalRequest") -> str | None:
        risk_level = getattr(approval, "risk_level", None)
        if risk_level is None:
            return None
        return getattr(risk_level, "value", str(risk_level))

    async def finalize_auto_approved_execution(
        self,
        approval: "ApprovalRequest",
        *,
        success: bool,
        error_message: str | None = None,
    ) -> None:
        """
        Backfill the incident-linked evidence chain for an auto-approved action
        that has already been executed.

        Background (from the original note): the CS2/CS3 webhook path calls
        execute_approved_action() first, for speed, and creates the Incident
        afterwards. At execution time there is therefore no incident_id, so the
        verifier / KM / auto_repair_executions records cannot be tied back to
        the same alert card. This method is meant to run once the incident
        exists; it only writes the durable trace and does NOT execute the
        action again.

        The method returns without doing anything when the request is not
        auto-approved, when it has no incident id, or when the action is
        observation-only.

        Otherwise it performs, in order:
        1. Create an auto_repair_executions record, unless a matching one
           already exists for this incident.
        2. Add an "exec" timeline event.
        3. Write the execution result to KM.
        4. If the AIOPS_P1_POST_EXECUTION_VERIFIER sub-flag is enabled, run
           post-execution verification bounded by _VERIFIER_AWAIT_TIMEOUT_SEC.
        5. On success, resolve the incident.

        Steps 1-3 and 5 log and swallow their own exceptions; step 4 handles
        only the timeout here.

        Args:
            approval: The already-executed approval request.
            success: Whether the earlier execution succeeded.
            error_message: Error text for a failed execution; a generic message
                is substituted when the execution failed and this is empty.
        """
        if not self._is_auto_approved_request(approval):
            return

        incident_id = getattr(approval, "incident_id", None)
        if not incident_id:
            logger.warning(
                "auto_approved_execution_finalize_skipped_no_incident",
                approval_id=str(getattr(approval, "id", "")),
                requested_by=getattr(approval, "requested_by", None),
            )
            return

        # Observation-only actions (including an empty action) leave no trace here.
        if self._is_observation_only_action(getattr(approval, "action", None)):
            logger.info(
                "auto_approved_execution_finalize_skipped_observation_only",
                approval_id=str(approval.id),
                incident_id=incident_id,
                action=(approval.action or "")[:120],
            )
            return

        parsed = parse_operation_from_action(approval.action)
        operation_type = parsed.operation_type
        resource_name = parsed.resource_name or "unknown"
        namespace = parsed.namespace or "default"

        # Falls back to the approval id when no playbook was matched; the
        # slices below cap each value's length (36 / 200 / 50 characters).
        playbook_id = str(getattr(approval, "matched_playbook_id", None) or approval.id)[:36]
        operation_label = operation_type.value if operation_type else "unknown"
        playbook_name = f"approval_auto_execute:{operation_label}:{resource_name}"[:200]
        triggered_by = (getattr(approval, "requested_by", None) or "auto_approve")[:50]
        action_taken = f"auto_repair_playbook:{playbook_id}:{operation_label}:{resource_name}"
        if not success:
            action_taken = f"{action_taken}:FAILED"
            error_message = error_message or "auto-approved executor returned failure; see approval/aol logs"

        # Step 1: auto_repair_executions record.
        try:
            from src.repositories.audit_log_repository import get_auto_repair_execution_repository

            repo = get_auto_repair_execution_repository()
            existing = await repo.list_by_incident(incident_id)
            # A row counts as the same execution when playbook id, trigger
            # source and the executed action text all match.
            already_recorded = any(
                str(getattr(row, "playbook_id", "")) == playbook_id
                and getattr(row, "triggered_by", "") == triggered_by
                and (approval.action or "") in list(getattr(row, "executed_steps", []) or [])
                for row in existing
            )
            if not already_recorded:
                await repo.create(
                    incident_id=incident_id,
                    playbook_id=playbook_id,
                    playbook_name=playbook_name,
                    success=success,
                    executed_steps=[approval.action],
                    error_message=error_message,
                    triggered_by=triggered_by,
                    risk_level=self._approval_risk_value(approval),
                )
            else:
                logger.info(
                    "auto_approved_execution_record_already_exists",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    playbook_id=playbook_id,
                )
        except Exception as exc:
            logger.warning(
                "auto_approved_execution_record_failed",
                approval_id=str(approval.id),
                incident_id=incident_id,
                error=str(exc),
            )

        # Step 2: timeline event.
        try:
            timeline = get_timeline_service()
            await timeline.add_event(
                event_type="exec",
                status="success" if success else "error",
                title=f"{'✅' if success else '❌'} 自動批准執行已補鏈: {operation_label}",
                description=(
                    f"Target: {resource_name} @ {namespace}; "
                    f"source={triggered_by}; action={approval.action[:160]}"
                ),
                actor="leWOOOgo",
                actor_role="executor",
                approval_id=str(approval.id),
                incident_id=incident_id,
            )
        except Exception as exc:
            logger.warning(
                "auto_approved_execution_timeline_failed",
                approval_id=str(approval.id),
                incident_id=incident_id,
                error=str(exc),
            )

        # Step 3: knowledge-base entry.
        try:
            await self.write_execution_result_to_km(approval, success, error_message)
        except Exception as exc:
            logger.warning(
                "auto_approved_execution_km_failed",
                approval_id=str(approval.id),
                incident_id=incident_id,
                error=str(exc),
            )

        # Step 4: post-execution verification, awaited with an upper time bound.
        from src.core.feature_flags import aiops_flags
        if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
            try:
                await asyncio.wait_for(
                    self._run_post_execution_verify(
                        approval=approval,
                        action_taken=action_taken,
                    ),
                    timeout=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "auto_approved_execution_post_verify_timeout",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    timeout_sec=_VERIFIER_AWAIT_TIMEOUT_SEC,
                )

        # Step 5: a successful execution resolves the incident.
        if success:
            try:
                from src.services.incident_service import get_incident_service

                await get_incident_service().resolve_incident(incident_id)
                logger.info(
                    "incident_resolved_after_auto_approved_execution_finalize",
                    incident_id=incident_id,
                    approval_id=str(approval.id),
                )
            except Exception as exc:
                logger.warning(
                    "incident_resolve_after_auto_approved_execution_finalize_failed",
                    incident_id=incident_id,
                    approval_id=str(approval.id),
                    error=str(exc),
                )

    async def write_execution_result_to_km(
        self,
        approval: "ApprovalRequest",
        success: bool,
        error_message: str | None,
    ) -> None:
        """
        Persist an execution result as a knowledge-base (KM) entry.

        Both successful and failed executions are recorded. The entry
        distinguishes the auto-approved path from the human-approved path and
        carries the alert name, alert category and affected services of the
        linked incident (when one can be loaded) so that it can be retrieved
        along those dimensions. The write itself is delegated to
        ``km_write_with_flag`` with a ``KMWritePayload``.

        A missing action or risk level is tolerated: the action is treated as
        an empty string and the risk level falls back to the "unknown" label.

        Args:
            approval: The executed approval request.
            success: Whether the execution succeeded.
            error_message: Error text for a failed execution; may be ``None``.
        """
        from src.models.knowledge import EntrySource, EntryType
        from src.services.km_writer import KMWritePayload, km_write_with_flag

        # Which path produced this execution.
        _is_auto = self._is_auto_approved_request(approval)
        _mode_prefix = "[自動修復]" if _is_auto else "[人工修復]"
        _mode_tag = "auto_executed" if _is_auto else "human_approved"

        status_icon = "✅" if success else "❌"
        status_text = "成功" if success else f"失敗: {error_message or '未知原因'}"
        _status_tag = "success" if success else "failure"

        # None-safe views of the approval, consistent with the rest of the file.
        action_text = approval.action or ""
        risk_text = self._approval_risk_value(approval) or "未知"

        # Enrich with metadata from the linked incident, if it can be loaded.
        alertname = "unknown"
        alert_category = "general"
        affected_services: list[str] = []
        if approval.incident_id:
            try:
                from src.services.incident_service import get_incident_service
                _svc = get_incident_service()
                # Working memory first, episodic memory as fallback.
                _inc = await _svc.get_from_working_memory(approval.incident_id)
                if _inc is None:
                    _inc = await _svc.get_from_episodic_memory(approval.incident_id)
                if _inc:
                    if _inc.signals:
                        alertname = _inc.signals[0].labels.get("alertname", "unknown") or "unknown"
                    alert_category = getattr(_inc, "alert_category", "") or "general"
                    affected_services = list(_inc.affected_services or [])
            except Exception as _ie:
                # Enrichment is optional; the entry is still written with defaults.
                logger.debug("km_incident_enrich_failed",
                             incident_id=approval.incident_id, error=str(_ie))

        _services_str = ", ".join(affected_services) if affected_services else "未關聯"

        content = (
            f"# {status_icon} {_mode_prefix} {alertname}\n\n"
            f"**告警名稱**: {alertname}\n"
            f"**告警類別**: {alert_category}\n"
            f"**受影響服務**: {_services_str}\n"
            f"**執行命令**: `{action_text[:200]}`\n"
            f"**執行結果**: {status_text}\n"
            f"**風險等級**: {risk_text}\n"
            f"**執行路徑**: {'自動執行 (confidence >= 0.65)' if _is_auto else '人工審核批准'}\n"
            f"**Incident ID**: {approval.incident_id or '未關聯'}\n"
            f"**Approval ID**: {approval.id}\n\n"
            f"## 操作描述\n{approval.description or '無描述'}\n"
        )

        # Tags: mode + status + category, for multi-dimensional retrieval.
        tags = [_mode_tag, _status_tag, alert_category, "execution"]
        if not success:
            tags.append("execution_failed")

        if not _is_auto:
            path_type = "approval_manual"
        elif success:
            path_type = "approval_auto_ok"
        else:
            path_type = "approval_auto_fail"

        payload = KMWritePayload(
            path_type=path_type,
            entry_create_kwargs=dict(
                title=f"{_mode_prefix} {alertname}: {action_text[:50]}",
                content=content,
                entry_type=EntryType.INCIDENT_CASE,
                category=alert_category,
                tags=tags,
                source=EntrySource.AI_EXTRACTED,
                related_incident_id=approval.incident_id or None,
                created_by="auto_execute" if _is_auto else "approval_execution",
            ),
            incident_id=approval.incident_id or None,
            approval_id=str(approval.id),
        )
        await km_write_with_flag(payload)

    async def _send_execution_notification(
        self,
        approval: ApprovalRequest,
        execution_status: "ExecutionStatus",
        operation_type: str,
        namespace: str,
        duration_ms: int | None = None,
        error_message: str | None = None,
    ) -> None:
        """
        Phase 6 post-execution hook: broadcast the execution result.

        Builds a ``NotificationMessage`` from the approval and hands it to the
        notification manager's ``send_all()``, then logs one line per provider
        result. Does nothing (beyond an info log) when
        ``settings.NOTIFICATION_ENABLED`` is false. Errors while building or
        sending the message are logged with a traceback and swallowed.
        """
        from src.services.notifications import (
            NotificationMessage,
            get_notification_manager,
        )

        if not settings.NOTIFICATION_ENABLED:
            logger.info("notification_disabled", approval_id=str(approval.id))
            return

        def _from_blast(read_field, fallback):
            # Blast-radius details are optional on the approval.
            radius = approval.blast_radius
            return read_field(radius) if radius else fallback

        try:
            # Who signed, with their comment (empty string when none).
            signer_entries = []
            for signature in approval.signatures:
                signer_entries.append(
                    {"name": signature.signer_name, "comment": signature.comment or ""}
                )

            # Fields are evaluated in the same order as the message's keywords.
            message_fields = {
                "execution_status": execution_status,
                "action_title": approval.action[:100],
                "action_description": approval.description[:200] if approval.description else "",
                "approval_id": str(approval.id),
                "signers": signer_entries,
                "required_signatures": approval.required_signatures,
                "affected_pods": _from_blast(lambda r: r.affected_pods, 0),
                "estimated_downtime": _from_blast(lambda r: r.estimated_downtime, "N/A"),
                "related_services": _from_blast(lambda r: r.related_services, []),
                "data_impact": _from_blast(lambda r: r.data_impact.value, "none"),
                "namespace": namespace,
                "operation_type": operation_type,
                "duration_ms": duration_ms,
                "error_message": error_message,
                "risk_level": approval.risk_level.value,
                "ai_provider": approval.requested_by,
            }
            outgoing = NotificationMessage(**message_fields)

            delivery_results = await get_notification_manager().send_all(outgoing)

            for delivery in delivery_results:
                logger.info(
                    "notification_result",
                    approval_id=str(approval.id),
                    provider=delivery.provider,
                    status=delivery.status.value,
                    message=delivery.message,
                )

        except Exception as send_error:
            logger.exception(
                "notification_failed",
                approval_id=str(approval.id),
                error=str(send_error),
            )

    async def _trigger_playbook_extraction(
        self,
        approval: ApprovalRequest,
    ) -> None:
        """
        Phase 7.6: automatic Playbook extraction after an execution.

        The incident linked to the approval is loaded (``approval.incident_id``
        first, text parsing as a fallback for older data). Its outcome is then
        brought into a state that satisfies the extraction preconditions:
        ``execution_success`` is set, ``effectiveness_score`` is raised to at
        least 4, the status becomes RESOLVED unless it is already RESOLVED or
        CLOSED, and the executed action is stored as ``learning_notes`` when
        those are empty. The incident is saved back to working memory and
        ``playbook_service.extract_from_incident()`` is called, with
        auto-approval only when the score is 5 or higher.

        Any failure is logged and swallowed so it cannot affect the main flow.
        """
        try:
            # 1. Prefer the explicit field; text parsing only covers legacy data.
            incident_id = (
                getattr(approval, "incident_id", None)
                or self._extract_incident_id_from_approval(approval)
            )
            if not incident_id:
                logger.info(
                    "playbook_extraction_skipped",
                    approval_id=str(approval.id),
                    reason="No incident_id found in approval.incident_id or text",
                )
                return

            # 2. Load the incident: working memory first, then episodic memory.
            from src.services.incident_service import get_incident_service

            incidents = get_incident_service()
            incident = await incidents.get_from_working_memory(incident_id)
            if incident is None:
                incident = await incidents.get_from_episodic_memory(incident_id)

            if not incident:
                logger.info(
                    "playbook_extraction_skipped",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    reason="Incident not found",
                )
                return

            # 3. Fill in the outcome so extraction does not depend on a manual score.
            from src.models.incident import IncidentOutcome, IncidentStatus
            from src.utils.timezone import now_taipei

            if incident.outcome is None:
                incident.outcome = IncidentOutcome()
            outcome = incident.outcome

            if not outcome.execution_success:
                outcome.execution_success = True

            current_score = outcome.effectiveness_score
            if current_score is None or current_score < 4:
                # System judgement: a successful execution counts as effective.
                outcome.effectiveness_score = 4

            finished_states = (IncidentStatus.RESOLVED, IncidentStatus.CLOSED)
            if incident.status not in finished_states:
                incident.status = IncidentStatus.RESOLVED
                incident.resolved_at = now_taipei()

            # Keep the executed command (kubectl or ssh ...) as learning notes so
            # the playbook service can derive repair steps from it.
            if not outcome.learning_notes and approval.action:
                outcome.learning_notes = approval.action

            await incidents.save_to_working_memory(incident)

            logger.info(
                "playbook_extraction_incident_updated",
                approval_id=str(approval.id),
                incident_id=incident_id,
                effectiveness_score=incident.outcome.effectiveness_score,
                status=incident.status.value,
            )

            # 4. Extract; a full score also auto-approves the playbook.
            from src.services.playbook_service import get_playbook_service

            score_for_approval = incident.outcome.effectiveness_score or 4
            extracted = await get_playbook_service().extract_from_incident(
                incident=incident,
                auto_approve=score_for_approval >= 5,
            )

            if not extracted:
                logger.debug(
                    "playbook_extraction_no_result",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                )
            else:
                logger.info(
                    "playbook_auto_extracted",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    playbook_id=extracted.playbook_id,
                    playbook_name=extracted.name,
                    auto_approved=extracted.status.value == "approved",
                )

        except Exception as extraction_error:
            # Extraction problems never affect the main flow.
            logger.warning(
                "playbook_extraction_error",
                approval_id=str(approval.id),
                error=str(extraction_error),
            )

    def _extract_incident_id_from_approval(
        self,
        approval: ApprovalRequest,
    ) -> str | None:
        """
        從 approval 提取關聯的 incident_id

        嘗試以下來源:
        1. approval.metadata (如果有)
        2. approval.description 中的 INC- 模式
        3. approval.requested_by 中的 incident 資訊
        """
        import re

        # 從 description 或 action 中尋找 INC-XXXXXX 模式
        text = f"{approval.description or ''} {approval.action or ''}"
        match = re.search(r"INC-([A-Z0-9-]+)", text)
        if match:
            return match.group(0)  # 返回完整的 INC-XXXXX

        # 從 requested_by 尋找
        if approval.requested_by and "INC-" in approval.requested_by:
            match = re.search(r"INC-([A-Z0-9-]+)", approval.requested_by)
            if match:
                return match.group(0)

        return None


    # =========================================================================
    # ADR-090 § AOL Writer (2026-04-19 ogt + Claude Opus 4.7 亞太)
    # 把 approval execution 的生命週期回灌 automation_operation_log.
    # 之前 33 件/7d approval 動作完全沒寫入 aol,只有 drift_narrator 的
    # 22 筆 notification_formatted。修復後每次執行都留痕。
    # =========================================================================

    async def _log_aol_started(self, approval: ApprovalRequest) -> str | None:
        """
        Insert a 'pending' row into automation_operation_log for this approval.

        The returned op_id is intended to be passed to _log_aol_completed so
        the same row can be updated once execution finishes.

        The row's ``input`` JSON carries the approval id, incident id, the
        action (truncated to 500 characters), risk level and requester, plus
        (2026-04-20 P0.3) the target / operation type / namespace parsed from
        the action, so that a failed run shows which target was resolved.

        Writing to aol is best-effort: any exception is logged as a warning
        and swallowed, so the main execution flow is never blocked.

        Args:
            approval: The approval whose execution is about to start.

        Returns:
            The new row's op_id as a string, or None when the insert failed
            or returned no id.
        """
        try:
            from sqlalchemy import text as _sql
            from src.db.base import get_db_context
            import json as _json

            # 2026-04-20 P0.3: try to derive target / operation type / namespace
            # from the action text. A parse failure must not block the write.
            _parsed_target: str | None = None
            _parsed_op: str | None = None
            _parsed_ns: str | None = None
            try:
                _parsed = parse_operation_from_action(approval.action or "")
                _parsed_target = _parsed.resource_name
                _parsed_op = _parsed.operation_type.value if _parsed.operation_type else None
                _parsed_ns = _parsed.namespace
            except Exception as parse_err:
                # Still non-blocking, but leave a breadcrumb: otherwise an empty
                # parsed_* trace is indistinguishable from a parser crash.
                logger.debug(
                    "aol_target_parse_failed",
                    approval_id=str(approval.id),
                    error=str(parse_err),
                )

            input_payload = {
                "approval_id": str(approval.id),
                "incident_id": approval.incident_id or "",
                "action": (approval.action or "")[:500],
                "risk_level": getattr(approval, "risk_level", None) or "",
                "requested_by": getattr(approval, "requested_by", "") or "",
                # 2026-04-20 P0.3: target source trace
                "parsed_target": _parsed_target or "",
                "parsed_operation": _parsed_op or "",
                "parsed_namespace": _parsed_ns or "",
            }

            async with get_db_context() as db:
                row = await db.execute(
                    _sql("""
                        INSERT INTO automation_operation_log (
                            operation_type, actor, status,
                            input, output, tags
                        ) VALUES (
                            'playbook_executed',
                            'approval_execution',
                            'pending',
                            CAST(:input AS jsonb),
                            '{}'::jsonb,
                            :tags
                        )
                        RETURNING op_id
                    """),
                    {
                        "input": _json.dumps(input_payload, ensure_ascii=False),
                        "tags": ["approval", "execution", "playbook"],
                    },
                )
                op_id = row.scalar()
                return str(op_id) if op_id else None
        except Exception as e:
            # aol is an audit side-channel; never let it break the execution.
            logger.warning("aol_started_write_failed", approval_id=str(approval.id), error=str(e))
            return None

    async def _log_aol_completed(
        self,
        op_id: str | None,
        status: str,
        duration_ms: int,
        output: dict | None = None,
        error: str | None = None,
        stderr: str | None = None,
    ) -> None:
        """
        UPDATE automation_operation_log 為 success/failed 並寫入結果摘要 + stderr。

        status 必須是 aol constraint 允許的值:
        pending | success | failed | dry_run | rolled_back

        op_id 為 None 時靜默跳過 (started 寫入失敗時不應觸發 update 例外)。
        """
        if not op_id:
            return
        try:
            from sqlalchemy import text as _sql
            from src.db.base import get_db_context
            import json as _json

            async with get_db_context() as db:
                await db.execute(
                    _sql("""
                        UPDATE automation_operation_log
                        SET status = :status,
                            duration_ms = :duration_ms,
                            output = CAST(:output AS jsonb),
                            error = :error,
                            stderr_feed_back = :stderr
                        WHERE op_id = CAST(:op_id AS uuid)
                    """),
                    {
                        "status": status,
                        "duration_ms": duration_ms,
                        "output": _json.dumps(output or {}, ensure_ascii=False),
                        "error": (error or "")[:2000] if error else None,
                        "stderr": (stderr or "")[:8000] if stderr else None,
                        "op_id": op_id,
                    },
                )
        except Exception as e:
            logger.warning("aol_completed_write_failed", op_id=op_id, error=str(e))


# =============================================================================
# Singleton Instance
# =============================================================================

_execution_service: ApprovalExecutionService | None = None


def get_execution_service() -> ApprovalExecutionService:
    """
    Return the module-level ApprovalExecutionService, creating it on first use.

    Returns:
        ApprovalExecutionService: the shared execution service instance.
    """
    global _execution_service
    if _execution_service is not None:
        return _execution_service

    # First call: build the instance and cache it for later callers.
    _execution_service = ApprovalExecutionService()
    return _execution_service