# awoooi/apps/api/src/services/agent_service.py

"""
Agent Service - Phase 17 技術債修復
===================================
Phase 17 R4: 從 agents.py Router 抽離 Redis 操作

設計原則:
- Router 層只做 HTTP 路由，不直接存取 Redis
- Service 層封裝所有業務邏輯與 Redis 操作
- 使用 Protocol 定義介面，支援依賴注入

版本: v1.0
建立: 2026-03-26 (台北時區)
建立者: Claude Code (Phase 17 技術債修復)
"""

import json
from enum import Enum
from typing import Any, Protocol, runtime_checkable
from uuid import uuid4

from src.core.logging import get_logger
from src.core.redis_client import get_redis
from src.core.sse import EventType, SSEEvent, get_publisher
from src.models.incident import Incident, IncidentStatus, Severity, Signal
from src.services.consensus_engine import get_consensus_engine
from src.utils.timezone import now_taipei, now_taipei_iso

# Module-level logger obtained from the project's get_logger helper under the
# name "awoooi.agent_service"; every log event in this file goes through it.
logger = get_logger("awoooi.agent_service")


# =============================================================================
# Constants
# =============================================================================

# Redis key prefixes: a record key is the prefix followed by the record's ID.
TASK_PREFIX = "agent_task:"
INCIDENT_PREFIX = "incident:"

# Expiry applied to every task record written to Redis: 7 days, in seconds.
TASK_TTL = 7 * 24 * 60 * 60


# =============================================================================
# Task States
# =============================================================================

class TaskState(str, Enum):
    """Lifecycle states of an analysis task.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value.
    """

    # Waiting: the task exists but analysis has not started.
    PENDING = "pending"
    # Analysis is in progress.
    ANALYZING = "analyzing"
    # Consensus is being calculated.
    CONSENSUS = "consensus"
    # Analysis finished.
    COMPLETED = "completed"
    # Analysis failed.
    FAILED = "failed"


# =============================================================================
# Protocol Interface
# =============================================================================

@runtime_checkable
class IAgentTaskRepository(Protocol):
    """
    Structural interface for agent-task persistence.

    Covers CRUD-style operations on agent task records plus incident lookup.
    Any object providing these five coroutine methods can be injected
    wherever this protocol is expected.
    """

    async def create_task(
        self, task_id: str, incident_id: str, trigger: str = "manual"
    ) -> dict[str, Any]:
        """Create a new analysis task and return its stored data."""
        ...

    async def get_task(self, task_id: str) -> dict[str, Any] | None:
        """Return the task data for ``task_id``, or None when there is none."""
        ...

    async def update_task_state(
        self,
        task_id: str,
        state: TaskState,
        progress: int = 0,
        current_step: str | None = None,
        agents_completed: int = 0,
        **extra_data: Any,
    ) -> bool:
        """Update a task's state and progress fields; return success."""
        ...

    async def save_task_result(
        self, task_id: str, result_data: dict[str, Any]
    ) -> bool:
        """Store the result data of a task; return success."""
        ...

    async def get_incident(self, incident_id: str) -> Incident | None:
        """Return the Incident for ``incident_id``, or None when not found."""
        ...


# =============================================================================
# Repository Implementation
# =============================================================================

class AgentTaskRedisRepository:
    """
    Redis-backed agent task repository.

    Provides the methods declared by the IAgentTaskRepository protocol.
    All direct ``get_redis()`` reads and writes in this file are made here.
    Task records are stored as JSON strings and (re)written with the
    TASK_TTL expiry.
    """

    @staticmethod
    def _task_key(task_id: str) -> str:
        """Return the Redis key under which the given task is stored."""
        return f"{TASK_PREFIX}{task_id}"

    async def create_task(
        self,
        task_id: str,
        incident_id: str,
        trigger: str = "manual",
    ) -> dict[str, Any]:
        """
        Create a new analysis task in the PENDING state.

        Args:
            task_id: Task ID.
            incident_id: ID of the incident the task belongs to.
            trigger: What triggered the task (manual/auto).

        Returns:
            The task data that was written to Redis.
        """
        client = get_redis()
        key = self._task_key(task_id)

        record = {
            "task_id": task_id,
            "state": TaskState.PENDING.value,
            "progress": 0,
            "current_step": "任務已建立",
            "agents_completed": 0,
            "total_agents": 4,
            "incident_id": incident_id,
            "started_at": now_taipei_iso(),
            "trigger": trigger,
        }
        serialized = json.dumps(record)

        await client.set(key, serialized, ex=TASK_TTL)

        logger.info(
            "task_created",
            task_id=task_id,
            incident_id=incident_id,
            trigger=trigger,
        )
        return record

    async def get_task(self, task_id: str) -> dict[str, Any] | None:
        """
        Fetch a task record.

        Args:
            task_id: Task ID.

        Returns:
            The decoded task dictionary, or None when the key does not exist.
        """
        raw = await get_redis().get(self._task_key(task_id))
        return None if raw is None else json.loads(raw)

    async def update_task_state(
        self,
        task_id: str,
        state: TaskState,
        progress: int = 0,
        current_step: str | None = None,
        agents_completed: int = 0,
        **extra_data: Any,
    ) -> bool:
        """
        Merge new state fields into a task record and write it back.

        The four named fields are always overwritten with the values passed
        (or their defaults). When no record exists yet, one is started from
        just the task ID.

        Args:
            task_id: Task ID.
            state: New state.
            progress: Progress percentage (0-100).
            current_step: Description of the current step.
            agents_completed: Number of agents that have finished.
            **extra_data: Additional fields to store on the record.

        Returns:
            True when the record was written, False when any error occurred
            (the error is logged, not raised).
        """
        client = get_redis()
        key = self._task_key(task_id)

        try:
            # Read the current record, if any, so unrelated fields survive.
            stored = await client.get(key)
            record = json.loads(stored) if stored else {"task_id": task_id}

            # Overwrite the tracked fields, then append any extras.
            record["state"] = state.value
            record["progress"] = progress
            record["current_step"] = current_step
            record["agents_completed"] = agents_completed
            record.update(extra_data)

            await client.set(key, json.dumps(record), ex=TASK_TTL)
        except Exception as e:
            logger.exception(
                "update_task_state_error",
                task_id=task_id,
                error=str(e),
            )
            return False

        return True

    async def save_task_result(
        self,
        task_id: str,
        result_data: dict[str, Any],
    ) -> bool:
        """
        Replace a task record with the given result data.

        Args:
            task_id: Task ID.
            result_data: Complete result data; it overwrites the stored record.

        Returns:
            True when the record was written, False when any error occurred
            (the error is logged, not raised).
        """
        client = get_redis()
        key = self._task_key(task_id)

        try:
            serialized = json.dumps(result_data)
            await client.set(key, serialized, ex=TASK_TTL)

            logger.info(
                "task_result_saved",
                task_id=task_id,
                state=result_data.get("state"),
            )
        except Exception as e:
            logger.exception(
                "save_task_result_error",
                task_id=task_id,
                error=str(e),
            )
            return False

        return True

    async def get_incident(self, incident_id: str) -> Incident | None:
        """
        Load an Incident from Redis.

        Args:
            incident_id: Incident ID.

        Returns:
            The Incident parsed from the stored JSON, or None when the key
            does not exist.
        """
        payload = await get_redis().get(f"{INCIDENT_PREFIX}{incident_id}")
        if payload is None:
            return None
        return Incident.model_validate_json(payload)


# =============================================================================
# Agent Service
# =============================================================================

class AgentService:
    """
    Agent Service - business logic for Agent Teams analysis.

    Responsibilities:
    1. Manage the lifecycle of analysis tasks.
    2. Drive the ConsensusEngine through an analysis run.
    3. Publish SSE progress notifications.

    Usage:
        service = get_agent_service()
        task_id = await service.create_analysis_task(incident)
    """

    def __init__(
        self,
        repository: IAgentTaskRepository | None = None,
    ) -> None:
        """
        Initialise the service.

        Args:
            repository: Task repository to use; when omitted, a
                AgentTaskRedisRepository is created.
        """
        self._repository = repository or AgentTaskRedisRepository()

    # =========================================================================
    # Task Management
    # =========================================================================

    def generate_task_id(self) -> str:
        """Return a new task ID: ``TASK-<YYYYMMDD>-<8 uppercase hex chars>``."""
        return f"TASK-{now_taipei().strftime('%Y%m%d')}-{uuid4().hex[:8].upper()}"

    async def create_analysis_task(
        self,
        incident: Incident,
        trigger: str = "manual",
    ) -> str:
        """
        Create an analysis task for an incident.

        Only the task record is created; the analysis itself is started
        separately via ``run_analysis``.

        Args:
            incident: The incident to analyse.
            trigger: What triggered the task (manual/auto).

        Returns:
            The ID of the newly created task.
        """
        task_id = self.generate_task_id()

        await self._repository.create_task(
            task_id=task_id,
            incident_id=incident.incident_id,
            trigger=trigger,
        )

        logger.info(
            "analysis_task_created",
            task_id=task_id,
            incident_id=incident.incident_id,
            severity=incident.severity.value,
        )

        return task_id

    async def get_task_status(self, task_id: str) -> dict[str, Any] | None:
        """Return the stored task record (used for status queries), or None."""
        return await self._repository.get_task(task_id)

    async def get_task_result(self, task_id: str) -> dict[str, Any] | None:
        """Return the stored task record (used for result queries), or None."""
        return await self._repository.get_task(task_id)

    async def get_incident(self, incident_id: str) -> Incident | None:
        """Return the incident with the given ID from the repository, or None."""
        return await self._repository.get_incident(incident_id)

    # =========================================================================
    # Analysis Execution
    # =========================================================================

    @staticmethod
    async def _publish_progress(
        publisher: Any,
        task_id: str,
        state: TaskState,
        progress: int,
        message: str,
        **extra: Any,
    ) -> None:
        """Publish one AI_THINKING SSE event describing task progress."""
        await publisher.publish(SSEEvent(
            type=EventType.AI_THINKING,
            data={
                "task_id": task_id,
                "state": state.value,
                "progress": progress,
                "message": message,
                **extra,
            },
        ))

    async def _report_failure(
        self,
        task_id: str,
        incident: Incident,
        error: Exception,
    ) -> None:
        """Store the FAILED record and publish an ERROR event, best-effort."""
        failure_record = {
            "task_id": task_id,
            "state": TaskState.FAILED.value,
            "progress": 0,
            # save_task_result replaces the whole record, so carry the
            # incident link over explicitly. getattr keeps this handler from
            # raising if the original failure was a malformed incident.
            "incident_id": getattr(incident, "incident_id", None),
            "error": str(error),
            "completed_at": now_taipei_iso(),
        }

        # Each step is guarded on its own: a failure while reporting must not
        # escape run_analysis, and a failed save must not prevent the
        # notification attempt.
        try:
            await self._repository.save_task_result(task_id, failure_record)
        except Exception as save_error:
            logger.exception(
                "analysis_failure_save_error",
                task_id=task_id,
                error=str(save_error),
            )

        try:
            publisher = await get_publisher()
            await publisher.publish(SSEEvent(
                type=EventType.ERROR,
                data={
                    "task_id": task_id,
                    "state": TaskState.FAILED.value,
                    "error": str(error),
                },
            ))
        except Exception as notify_error:
            logger.exception(
                "analysis_failure_notify_error",
                task_id=task_id,
                error=str(notify_error),
            )

    async def run_analysis(
        self,
        task_id: str,
        incident: Incident,
    ) -> None:
        """
        Run the Agent Teams analysis for a task.

        Flow:
        1. Mark the task ANALYZING and publish a progress event.
        2. Gather expert opinions (``timeout_sec=25.0``).
        3. Calculate the consensus.
        4. Generate the final decision and store the full result.
        5. Publish the completion event.

        Any ``Exception`` raised along the way is logged, the task record is
        replaced with a FAILED record and an ERROR event is published; the
        exception is not re-raised.

        Args:
            task_id: Task ID.
            incident: The incident to analyse.
        """
        consensus_engine = get_consensus_engine()

        try:
            # Step 1: mark the task as running.
            await self._repository.update_task_state(
                task_id,
                TaskState.ANALYZING,
                progress=10,
                current_step="正在收集專家意見...",
            )

            publisher = await get_publisher()
            await self._publish_progress(
                publisher,
                task_id,
                TaskState.ANALYZING,
                10,
                "Agent Teams 分析開始",
            )

            # Step 2: gather opinions.
            opinions = await consensus_engine.gather_opinions(
                incident, timeout_sec=25.0
            )
            agents_completed = len(opinions)

            await self._repository.update_task_state(
                task_id,
                TaskState.CONSENSUS,
                progress=60,
                current_step="正在計算共識...",
                agents_completed=agents_completed,
            )

            await self._publish_progress(
                publisher,
                task_id,
                TaskState.CONSENSUS,
                60,
                f"已收集 {agents_completed} 位專家意見",
            )

            # Step 3: calculate the consensus.
            (
                consensus_score,
                recommended_action,
                dissenting,
            ) = consensus_engine.calculate_consensus(opinions)

            # update_task_state always writes agents_completed, so it must be
            # passed again here; otherwise the default (0) would overwrite
            # the count stored at progress=60.
            await self._repository.update_task_state(
                task_id,
                TaskState.CONSENSUS,
                progress=80,
                current_step="正在產生最終決策...",
                agents_completed=agents_completed,
            )

            # Step 4: generate the final decision.
            result = await consensus_engine.generate_final_decision(
                incident=incident,
                opinions=opinions,
                consensus_score=consensus_score,
                recommended_action_type=recommended_action,
                dissenting=dissenting,
            )

            # Step 5: store the complete result (replaces the task record).
            task_data = {
                "task_id": task_id,
                "state": TaskState.COMPLETED.value,
                "progress": 100,
                "current_step": "分析完成",
                "agents_completed": agents_completed,
                "total_agents": 4,
                "consensus_id": result.consensus_id,
                "incident_id": incident.incident_id,
                "consensus_score": result.consensus_score,
                "recommended_action": result.recommended_action,
                "recommended_kubectl": result.recommended_kubectl,
                "risk_level": result.risk_level,
                "final_reasoning": result.final_reasoning,
                "opinions": [op.to_dict() for op in result.opinions],
                "dissenting_opinions": result.dissenting_opinions,
                "completed_at": now_taipei_iso(),
            }

            await self._repository.save_task_result(task_id, task_data)

            # Completion notification.
            await self._publish_progress(
                publisher,
                task_id,
                TaskState.COMPLETED,
                100,
                "分析完成",
                consensus_score=result.consensus_score,
                recommended_action=result.recommended_action,
            )

            logger.info(
                "analysis_completed",
                task_id=task_id,
                consensus_id=result.consensus_id,
                consensus_score=result.consensus_score,
            )

        except Exception as e:
            logger.exception(
                "analysis_failed",
                task_id=task_id,
                error=str(e),
            )
            await self._report_failure(task_id, incident, e)

    # =========================================================================
    # Incident Integration
    # =========================================================================

    def should_trigger_agent_analysis(self, incident: Incident) -> bool:
        """
        Decide whether an incident warrants an Agent Teams analysis.

        True when any of the following holds:
        - severity is P0 or P1;
        - more than 2 services are affected;
        - more than 3 signals are attached.

        Args:
            incident: The incident to evaluate.

        Returns:
            Whether analysis should be triggered.
        """
        return (
            incident.severity in (Severity.P0, Severity.P1)
            or len(incident.affected_services) > 2
            or len(incident.signals) > 3
        )

    async def trigger_for_incident(
        self,
        incident_id: str,
    ) -> tuple[str | None, Incident | None]:
        """
        Create an analysis task for an incident when it qualifies.

        This only creates the task record; it does not run the analysis.

        Args:
            incident_id: Incident ID.

        Returns:
            ``(task_id, incident)``. ``(None, None)`` when the incident was
            not found; ``(None, incident)`` when it did not meet the trigger
            conditions.
        """
        incident = await self._repository.get_incident(incident_id)
        if incident is None:
            logger.warning("trigger_skipped_not_found", incident_id=incident_id)
            return None, None

        if not self.should_trigger_agent_analysis(incident):
            logger.debug(
                "trigger_skipped_simple_case",
                incident_id=incident_id,
                severity=incident.severity.value,
            )
            return None, incident

        task_id = await self.create_analysis_task(incident, trigger="auto")

        logger.info(
            "auto_trigger_success",
            task_id=task_id,
            incident_id=incident_id,
            severity=incident.severity.value,
        )

        return task_id, incident

    # =========================================================================
    # Incident Factory
    # =========================================================================

    def create_adhoc_incident(
        self,
        severity: str,
        affected_services: list[str],
        alert_names: list[str] | None = None,
    ) -> Incident:
        """
        Build an ad-hoc Incident for a direct analysis request.

        The incident is only constructed and returned; this method does not
        store it.

        Args:
            severity: Severity value (P0/P1/P2/P3), converted with
                ``Severity(severity)``.
            affected_services: Names of the affected services.
            alert_names: Alert names; one "manual" Signal is created per name.

        Returns:
            The new Incident, in INVESTIGATING status.
        """
        # Parse once up front: an invalid value fails before any Signal is
        # built, and the same enum member is reused for every signal.
        level = Severity(severity)

        signals = [
            Signal(
                alert_name=alert_name,
                severity=level,
                source="manual",
                fired_at=now_taipei(),
            )
            for alert_name in alert_names or []
        ]

        return Incident(
            severity=level,
            status=IncidentStatus.INVESTIGATING,
            signals=signals,
            affected_services=affected_services,
        )


# =============================================================================
# Singleton
# =============================================================================

# Module-level cache for the shared AgentService; filled on first request.
_agent_service: AgentService | None = None


def get_agent_service() -> AgentService:
    """Return the shared AgentService, creating it on the first call."""
    global _agent_service
    if _agent_service is not None:
        return _agent_service
    _agent_service = AgentService()
    return _agent_service