# awoooi/apps/api/src/agents/action_planner.py

"""
Action Planner Agent - 執行計畫生成專家
========================================

職責:
- 生成結構化執行計畫
- 定義 rollback 策略
- 設定驗證步驟
- 回傳完整 ActionPlan

符合 ADR-009 ActionPlannerAgent 規範
"""

import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

import structlog

from src.agents.base import AgentResult, AgentStatus, BaseAgent

logger = structlog.get_logger(__name__)


# =============================================================================
# Action Plan Types
# =============================================================================


class ActionType(str, Enum):
    """Kinds of remediation action an execution plan can carry out."""
    RESTART = "restart"       # restart a service
    SCALE = "scale"           # scale replicas up or down
    ROLLBACK = "rollback"     # roll back to a previous version
    DELETE = "delete"         # delete a resource
    PATCH = "patch"           # patch configuration
    EXEC = "exec"             # execute a command
    APPLY = "apply"           # apply a change
    CUSTOM = "custom"         # custom action


class ActionPhase(str, Enum):
    """Phase of a plan that a single step belongs to."""
    PRE_CHECK = "pre_check"     # checks run before the main action
    EXECUTE = "execute"         # the main action itself
    VERIFY = "verify"           # verification of the outcome
    ROLLBACK = "rollback"       # rollback (used if the action fails)


@dataclass
class ActionStep:
    """
    One step of an execution plan.

    Fields:
    - command: the command line to run
    - description: human-readable explanation of the step
    - phase: the plan phase this step belongs to
    - timeout_sec: timeout for the step, in seconds (default 60)
    - can_fail: whether the step is allowed to fail (default False)
    - order: ordering index of the step (default 0)
    """
    command: str
    description: str
    phase: ActionPhase
    timeout_sec: int = 60
    can_fail: bool = False
    order: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the step as a plain dict, with ``phase`` reduced to its value."""
        payload: dict[str, Any] = dict(
            command=self.command,
            description=self.description,
            phase=self.phase.value,
            timeout_sec=self.timeout_sec,
            can_fail=self.can_fail,
            order=self.order,
        )
        return payload


@dataclass
class ActionPlan(AgentResult):
    """
    Result produced by ActionPlannerAgent: a complete execution plan.

    Fields added on top of ``AgentResult``:
    - action_type: kind of action (defaults to ``ActionType.CUSTOM``)
    - pre_check_steps: steps run before the main action
    - execute_steps: the main action steps
    - verify_steps: steps that verify the outcome
    - rollback_steps: steps used to undo the action
    - estimated_duration_sec: estimated run time in seconds
    - requires_approval: whether the plan needs approval (defaults to True)
    - kubectl_commands: kubectl command strings associated with the plan
    """
    action_type: ActionType = ActionType.CUSTOM
    pre_check_steps: list[ActionStep] = field(default_factory=list)
    execute_steps: list[ActionStep] = field(default_factory=list)
    verify_steps: list[ActionStep] = field(default_factory=list)
    rollback_steps: list[ActionStep] = field(default_factory=list)
    estimated_duration_sec: int = 0
    requires_approval: bool = True
    kubectl_commands: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the plan as a dict: the base result's dict plus the plan fields."""
        # NOTE(review): relies on AgentResult.to_dict() returning a mutable dict.
        base = super().to_dict()
        base.update({
            "action_type": self.action_type.value,
            "pre_check_steps": [s.to_dict() for s in self.pre_check_steps],
            "execute_steps": [s.to_dict() for s in self.execute_steps],
            "verify_steps": [s.to_dict() for s in self.verify_steps],
            "rollback_steps": [s.to_dict() for s in self.rollback_steps],
            "estimated_duration_sec": self.estimated_duration_sec,
            "requires_approval": self.requires_approval,
            "kubectl_commands": self.kubectl_commands,
        })
        return base

    def get_all_steps(self) -> list[ActionStep]:
        """
        Return the forward-path steps in execution order.

        Phases are emitted in the order pre-check, execute, verify; within
        each phase the steps are sorted by ``order``. Rollback steps are not
        included.

        ``order`` may restart at 0 in every phase, so sorting the combined
        list by ``order`` alone would interleave the phases (for example the
        first verify step ahead of the second pre-check). Sorting per phase
        keeps the phase sequence intact.
        """
        def by_order(step: ActionStep) -> int:
            return step.order

        return [
            *sorted(self.pre_check_steps, key=by_order),
            *sorted(self.execute_steps, key=by_order),
            *sorted(self.verify_steps, key=by_order),
        ]

    def get_primary_command(self) -> str | None:
        """Return the command of the first execute step, or None if there is none."""
        if self.execute_steps:
            return self.execute_steps[0].command
        return None


# =============================================================================
# Action Templates
# =============================================================================


# Predefined execution-plan templates, keyed by action name.
#
# Every "command" and "description" string is rendered with str.format(), so
# {target}, {namespace}, {replicas} and {original_replicas} are placeholders.
# Any literal brace that must survive rendering (e.g. in a kubectl jsonpath
# expression) has to be doubled: "{{" / "}}".
ACTION_TEMPLATES: dict[str, dict[str, Any]] = {
    "restart": {
        "action_type": ActionType.RESTART,
        "requires_approval": False,  # a restart is considered relatively safe
        "pre_check": [
            {
                "command": "kubectl get deployment {target} -n {namespace} -o wide",
                "description": "確認目標 Deployment 存在且健康",
            },
            {
                "command": "kubectl get pods -l app={target} -n {namespace} --no-headers | wc -l",
                "description": "確認目前 Pod 數量",
            },
        ],
        "execute": [
            {
                "command": "kubectl rollout restart deployment/{target} -n {namespace}",
                "description": "執行滾動重啟",
            },
        ],
        "verify": [
            {
                "command": "kubectl rollout status deployment/{target} -n {namespace} --timeout=120s",
                "description": "等待滾動更新完成",
                "timeout_sec": 120,
            },
            {
                "command": "kubectl get pods -l app={target} -n {namespace} -o wide",
                "description": "確認新 Pod 狀態",
            },
        ],
        "rollback": [
            {
                "command": "kubectl rollout undo deployment/{target} -n {namespace}",
                "description": "回滾到上一個版本",
            },
        ],
    },

    "scale": {
        "action_type": ActionType.SCALE,
        "requires_approval": False,
        "pre_check": [
            {
                # Braces are doubled so str.format() emits the literal
                # jsonpath expression '{.spec.replicas}' instead of treating
                # it as a replacement field (which raises IndexError).
                "command": "kubectl get deployment {target} -n {namespace} -o jsonpath='{{.spec.replicas}}'",
                "description": "記錄目前副本數",
            },
        ],
        "execute": [
            {
                "command": "kubectl scale deployment/{target} --replicas={replicas} -n {namespace}",
                "description": "調整副本數至 {replicas}",
            },
        ],
        "verify": [
            {
                "command": "kubectl rollout status deployment/{target} -n {namespace} --timeout=60s",
                "description": "等待擴縮容完成",
                "timeout_sec": 60,
            },
        ],
        "rollback": [
            {
                "command": "kubectl scale deployment/{target} --replicas={original_replicas} -n {namespace}",
                "description": "恢復原始副本數",
            },
        ],
    },

    "rollback": {
        "action_type": ActionType.ROLLBACK,
        "requires_approval": True,  # a rollback requires review
        "pre_check": [
            {
                "command": "kubectl rollout history deployment/{target} -n {namespace}",
                "description": "查看版本歷史",
            },
        ],
        "execute": [
            {
                "command": "kubectl rollout undo deployment/{target} -n {namespace}",
                "description": "回滾到上一個版本",
            },
        ],
        "verify": [
            {
                "command": "kubectl rollout status deployment/{target} -n {namespace} --timeout=120s",
                "description": "等待回滾完成",
                "timeout_sec": 120,
            },
            {
                "command": "kubectl get pods -l app={target} -n {namespace} -o wide",
                "description": "確認 Pod 狀態",
            },
        ],
        "rollback": [
            {
                "command": "kubectl rollout undo deployment/{target} -n {namespace}",
                "description": "再次回滾 (恢復原版本)",
            },
        ],
    },

    "delete_pod": {
        "action_type": ActionType.DELETE,
        "requires_approval": True,  # a deletion requires review
        "pre_check": [
            {
                "command": "kubectl get pod {target} -n {namespace} -o wide",
                "description": "確認目標 Pod 存在",
            },
        ],
        "execute": [
            {
                "command": "kubectl delete pod {target} -n {namespace}",
                "description": "刪除異常 Pod (觸發重建)",
            },
        ],
        "verify": [
            {
                "command": "kubectl get pods -n {namespace} | grep -v Completed | grep -v Terminating",
                "description": "確認新 Pod 已建立",
                "can_fail": True,
            },
        ],
        # Deleting a pod cannot be undone; the intent noted by the original
        # author is that the owning Deployment recreates it.
        "rollback": [],
    },
}


class ActionPlannerAgent(BaseAgent[ActionPlan]):
    """
    Agent that builds a structured execution plan from predefined templates.

    Planning flow:
    1. Pick an action key from ``suggested_action`` or keywords in ``problem``.
    2. Look up the matching entry in ``ACTION_TEMPLATES``.
    3. Fill the template placeholders with parameters taken from the context.
    4. Estimate how long the plan takes to run.

    Usage:
    ```python
    agent = ActionPlannerAgent()
    result = await agent.analyze({
        "problem": "Pod keeps restarting",
        "target_service": "api",
        "namespace": "awoooi-prod",
    })
    print(result.execute_steps)  # [ActionStep(...), ...]
    ```
    """

    AGENT_NAME = "action-planner"
    AGENT_DESCRIPTION = "行動規劃師，制定修復步驟與回滾方案"
    AGENT_TOOLS = ["Read", "Glob"]

    def __init__(
        self,
        timeout_sec: float = 30.0,
        default_namespace: str = "awoooi-prod",
    ):
        """
        Initialize the agent.

        Args:
            timeout_sec: execution timeout, forwarded to the base class.
            default_namespace: namespace used when the context supplies none.
        """
        super().__init__(timeout_sec)
        self.default_namespace = default_namespace

    async def analyze(self, context: dict[str, Any]) -> ActionPlan:
        """
        Generate an execution plan for the given context.

        Args:
            context: planning input. Keys read here or by the helpers:
                - problem: problem description
                - suggested_action: explicit action key (a key of
                  ``ACTION_TEMPLATES``)
                - target_service: target service name (or a list of names)
                - namespace: Kubernetes namespace
                - replicas: desired replica count (scale)
                - original_replicas: replica count to restore on rollback

        Returns:
            An ``ActionPlan`` with status SUCCESS and the generated steps, or
            with status FAILED and ``error`` set if plan generation raised.
        """
        # perf_counter is monotonic, so the latency cannot go negative or
        # jump when the wall clock is adjusted.
        started = time.perf_counter()

        self.logger.info(
            "action_planning_start",
            # Tolerate a missing/None/non-string problem: this call runs
            # outside the try block, so an error here would escape analyze().
            problem=str(context.get("problem") or "")[:100],
            target=context.get("target_service"),
        )

        try:
            # 1. Decide which action template applies.
            action_type = self._determine_action_type(context)

            # 2. Fetch the template, falling back to "restart" for unknown keys.
            template = ACTION_TEMPLATES.get(action_type, ACTION_TEMPLATES["restart"])

            # 3. Prepare the placeholder values.
            params = self._prepare_params(context)

            # 4. Render the steps of each phase.
            pre_check_steps = self._generate_steps(
                template.get("pre_check", []),
                params,
                ActionPhase.PRE_CHECK,
            )

            execute_steps = self._generate_steps(
                template.get("execute", []),
                params,
                ActionPhase.EXECUTE,
            )

            verify_steps = self._generate_steps(
                template.get("verify", []),
                params,
                ActionPhase.VERIFY,
            )

            rollback_steps = self._generate_steps(
                template.get("rollback", []),
                params,
                ActionPhase.ROLLBACK,
            )

            # 5. Estimate the duration of the forward path (rollback excluded).
            estimated_duration = self._estimate_duration(
                pre_check_steps + execute_steps + verify_steps
            )

            # 6. Collect the kubectl commands of the execute phase.
            kubectl_commands = [
                step.command for step in execute_steps
                if step.command.startswith("kubectl")
            ]

            latency_ms = int((time.perf_counter() - started) * 1000)

            # 7. Build the one-line summary.
            analysis = self._generate_analysis(
                template["action_type"],
                params.get("target", "unknown"),
                len(execute_steps),
            )

            result = ActionPlan(
                agent_name=self.AGENT_NAME,
                status=AgentStatus.SUCCESS,
                confidence=0.0,  # rule/template match, not an AI analysis
                analysis=analysis,
                latency_ms=latency_ms,
                action_type=template["action_type"],
                pre_check_steps=pre_check_steps,
                execute_steps=execute_steps,
                verify_steps=verify_steps,
                rollback_steps=rollback_steps,
                estimated_duration_sec=estimated_duration,
                requires_approval=template.get("requires_approval", True),
                kubectl_commands=kubectl_commands,
            )

            self.logger.info(
                "action_planning_complete",
                action_type=result.action_type.value,
                step_count=len(execute_steps),
                latency_ms=latency_ms,
            )

            return result

        except Exception as e:
            # Boundary handler: any planning failure becomes a FAILED plan
            # that still requires approval, instead of propagating.
            latency_ms = int((time.perf_counter() - started) * 1000)

            self.logger.exception(
                "action_planning_error",
                error=str(e),
            )

            return ActionPlan(
                agent_name=self.AGENT_NAME,
                status=AgentStatus.FAILED,
                confidence=0.0,
                analysis=f"計畫生成失敗: {str(e)}",
                latency_ms=latency_ms,
                error=str(e),
                requires_approval=True,
            )

    def _determine_action_type(self, context: dict[str, Any]) -> str:
        """
        Choose the template key for this context.

        An explicit ``suggested_action`` wins when it names a key of
        ``ACTION_TEMPLATES``; otherwise the key is inferred from keywords in
        ``problem``. Missing or ``None`` values are treated as empty strings.

        Returns:
            A key of ``ACTION_TEMPLATES``; "restart" when nothing matches.
        """
        # `or ""` also covers an explicit None, which dict.get's default
        # argument would not replace.
        suggested = (context.get("suggested_action") or "").lower()
        if suggested in ACTION_TEMPLATES:
            return suggested

        problem = (context.get("problem") or "").lower()

        # Rules are checked top to bottom; the first matching rule wins.
        keyword_rules = (
            (("crash", "restart", "oom", "killed"), "restart"),
            (("slow", "latency", "capacity", "scale"), "scale"),
            (("error", "failed", "rollback", "undo"), "rollback"),
            (("stuck", "pending", "delete pod"), "delete_pod"),
        )
        for keywords, action in keyword_rules:
            if any(kw in problem for kw in keywords):
                return action

        # Default: restart (regarded by the original author as the safest).
        return "restart"

    def _prepare_params(self, context: dict[str, Any]) -> dict[str, str]:
        """
        Build the placeholder values used to render template strings.

        Missing, ``None`` or empty values fall back to defaults: target
        "unknown", namespace ``self.default_namespace``, replicas 3 and
        original_replicas 1. When ``target_service`` is a list or tuple, its
        first element is used.

        Returns:
            Dict with string values for "target", "namespace", "replicas"
            and "original_replicas".
        """
        target = context.get("target_service")
        if isinstance(target, (list, tuple)):
            target = target[0] if target else None

        namespace = context.get("namespace") or self.default_namespace

        # Compare against None (not truthiness) so an explicit 0 is kept.
        replicas = context.get("replicas")
        original_replicas = context.get("original_replicas")

        # NOTE(review): these values are interpolated unescaped into command
        # strings; confirm callers validate them before anything executes
        # the rendered commands.
        return {
            "target": str(target) if target else "unknown",
            "namespace": str(namespace),
            "replicas": str(3 if replicas is None else replicas),
            "original_replicas": str(
                1 if original_replicas is None else original_replicas
            ),
        }

    def _generate_steps(
        self,
        template_steps: list[dict[str, Any]],
        params: dict[str, str],
        phase: ActionPhase,
    ) -> list[ActionStep]:
        """
        Render template entries into concrete steps for one phase.

        Each entry's "command" and "description" are rendered with
        ``str.format(**params)``; "timeout_sec" defaults to 60 and "can_fail"
        to False. ``order`` is the entry's index within this phase.
        """
        return [
            ActionStep(
                command=tmpl["command"].format(**params),
                description=tmpl["description"].format(**params),
                phase=phase,
                timeout_sec=tmpl.get("timeout_sec", 60),
                can_fail=tmpl.get("can_fail", False),
                order=index,
            )
            for index, tmpl in enumerate(template_steps)
        ]

    def _estimate_duration(self, steps: list[ActionStep]) -> int:
        """
        Estimate the run time of the given steps, in seconds.

        Each step is assumed to take one third of its timeout on average;
        the result is never below 30 seconds.
        """
        total = sum(step.timeout_sec // 3 for step in steps)
        return max(total, 30)

    def _generate_analysis(
        self,
        action_type: ActionType,
        target: str,
        step_count: int,
    ) -> str:
        """Build the one-line summary naming the action, target and step count."""
        action_desc = {
            ActionType.RESTART: "滾動重啟",
            ActionType.SCALE: "擴縮容",
            ActionType.ROLLBACK: "版本回滾",
            ActionType.DELETE: "資源清理",
            ActionType.PATCH: "配置修補",
            ActionType.APPLY: "配置應用",
            ActionType.EXEC: "指令執行",
            ActionType.CUSTOM: "自訂操作",
        }

        return (
            f"建議執行 {action_desc.get(action_type, '操作')} "
            f"於 {target}，共 {step_count} 個步驟"
        )

    def _build_prompt(self, context: dict[str, Any]) -> str:
        """
        Build the LLM prompt (Phase 9.4 extension).

        The namespace shown falls back to ``self.default_namespace`` so the
        prompt agrees with the namespace used by the template-based plan.
        Not called by ``analyze`` in this class.
        """
        namespace = context.get("namespace") or self.default_namespace
        return f"""你是 AWOOOI 的行動規劃師。
根據以下問題制定修復計畫：

問題描述: {context.get("problem", "N/A")}
目標服務: {context.get("target_service", "N/A")}
命名空間: {namespace}

注意:
- 所有 kubectl 必須帶 -n {{namespace}}
- 必須包含前置檢查、執行步驟、驗證步驟、回滾方案

輸出 JSON:
```json
{{
  "action_type": "restart|scale|rollback|delete",
  "pre_check_steps": [
    {{"command": "kubectl ...", "description": "..."}}
  ],
  "execute_steps": [
    {{"command": "kubectl ...", "description": "..."}}
  ],
  "verify_steps": [
    {{"command": "kubectl ...", "description": "..."}}
  ],
  "rollback_steps": [
    {{"command": "kubectl ...", "description": "..."}}
  ],
  "estimated_duration_sec": 60,
  "analysis": "一句話摘要",
  "confidence": 0-1
}}
```"""

    def _parse_response(self, response: str) -> dict[str, Any]:
        """Parse an LLM response by delegating to ``self._extract_json``."""
        # NOTE(review): _extract_json is not defined in this class;
        # presumably inherited from BaseAgent — confirm.
        return self._extract_json(response)