""" Agent Service - Phase 17 技術債修復 =================================== Phase 17 R4: 從 agents.py Router 抽離 Redis 操作 設計原則: - Router 層只做 HTTP 路由,不直接存取 Redis - Service 層封裝所有業務邏輯與 Redis 操作 - 使用 Protocol 定義介面,支援依賴注入 版本: v1.0 建立: 2026-03-26 (台北時區) 建立者: Claude Code (Phase 17 技術債修復) """ import json from enum import Enum from typing import Any, Protocol, runtime_checkable from uuid import uuid4 from src.core.logging import get_logger from src.core.redis_client import get_redis from src.core.sse import EventType, SSEEvent, get_publisher from src.models.incident import Incident, IncidentStatus, Severity, Signal from src.services.consensus_engine import get_consensus_engine from src.utils.timezone import now_taipei, now_taipei_iso logger = get_logger("awoooi.agent_service") # ============================================================================= # Constants # ============================================================================= TASK_PREFIX = "agent_task:" INCIDENT_PREFIX = "incident:" TASK_TTL = 604800 # 7 天 # ============================================================================= # Task States # ============================================================================= class TaskState(str, Enum): """分析任務狀態""" PENDING = "pending" # 等待中 ANALYZING = "analyzing" # 分析中 CONSENSUS = "consensus" # 共識計算中 COMPLETED = "completed" # 已完成 FAILED = "failed" # 失敗 # ============================================================================= # Protocol Interface # ============================================================================= @runtime_checkable class IAgentTaskRepository(Protocol): """ Agent Task Repository Protocol 職責: Agent Task CRUD 操作 (Redis Working Memory) """ async def create_task( self, task_id: str, incident_id: str, trigger: str = "manual", ) -> dict[str, Any]: """建立新的分析任務""" ... async def get_task(self, task_id: str) -> dict[str, Any] | None: """取得任務資料""" ... async def update_task_state( self, task_id: str, state: TaskState, progress: int = 0, current_step: str | None = None, agents_completed: int = 0, **extra_data: Any, ) -> bool: """更新任務狀態""" ... async def save_task_result( self, task_id: str, result_data: dict[str, Any], ) -> bool: """儲存任務結果""" ... async def get_incident(self, incident_id: str) -> Incident | None: """從 Redis 取得 Incident""" ... # ============================================================================= # Repository Implementation # ============================================================================= class AgentTaskRedisRepository: """ Agent Task Redis Repository 實作 IAgentTaskRepository Protocol 所有 Redis 操作都封裝在此 """ async def create_task( self, task_id: str, incident_id: str, trigger: str = "manual", ) -> dict[str, Any]: """ 建立新的分析任務 Args: task_id: 任務 ID incident_id: 關聯的 Incident ID trigger: 觸發來源 (manual/auto) Returns: 建立的任務資料 """ redis_client = get_redis() task_key = f"{TASK_PREFIX}{task_id}" task_data = { "task_id": task_id, "state": TaskState.PENDING.value, "progress": 0, "current_step": "任務已建立", "agents_completed": 0, "total_agents": 4, "incident_id": incident_id, "started_at": now_taipei_iso(), "trigger": trigger, } await redis_client.set( task_key, json.dumps(task_data), ex=TASK_TTL, ) logger.info( "task_created", task_id=task_id, incident_id=incident_id, trigger=trigger, ) return task_data async def get_task(self, task_id: str) -> dict[str, Any] | None: """ 取得任務資料 Args: task_id: 任務 ID Returns: 任務資料字典,不存在則返回 None """ redis_client = get_redis() task_key = f"{TASK_PREFIX}{task_id}" data = await redis_client.get(task_key) if data is None: return None return json.loads(data) async def update_task_state( self, task_id: str, state: TaskState, progress: int = 0, current_step: str | None = None, agents_completed: int = 0, **extra_data: Any, ) -> bool: """ 更新任務狀態 Args: task_id: 任務 ID state: 新狀態 progress: 進度百分比 (0-100) current_step: 目前步驟描述 agents_completed: 已完成的 Agent 數量 **extra_data: 額外資料 Returns: 是否更新成功 """ redis_client = get_redis() task_key = f"{TASK_PREFIX}{task_id}" try: # 讀取現有資料 existing = await redis_client.get(task_key) if existing: task_data = json.loads(existing) else: task_data = {"task_id": task_id} # 更新欄位 task_data.update({ "state": state.value, "progress": progress, "current_step": current_step, "agents_completed": agents_completed, **extra_data, }) await redis_client.set( task_key, json.dumps(task_data), ex=TASK_TTL, ) return True except Exception as e: logger.exception( "update_task_state_error", task_id=task_id, error=str(e), ) return False async def save_task_result( self, task_id: str, result_data: dict[str, Any], ) -> bool: """ 儲存任務結果 Args: task_id: 任務 ID result_data: 完整結果資料 Returns: 是否儲存成功 """ redis_client = get_redis() task_key = f"{TASK_PREFIX}{task_id}" try: await redis_client.set( task_key, json.dumps(result_data), ex=TASK_TTL, ) logger.info( "task_result_saved", task_id=task_id, state=result_data.get("state"), ) return True except Exception as e: logger.exception( "save_task_result_error", task_id=task_id, error=str(e), ) return False async def get_incident(self, incident_id: str) -> Incident | None: """ 從 Redis 取得 Incident Args: incident_id: Incident ID Returns: Incident 物件,不存在則返回 None """ redis_client = get_redis() key = f"{INCIDENT_PREFIX}{incident_id}" data = await redis_client.get(key) if data is None: return None return Incident.model_validate_json(data) # ============================================================================= # Agent Service # ============================================================================= class AgentService: """ Agent Service - Agent Teams 業務邏輯 職責: 1. 任務生命週期管理 2. 協調 ConsensusEngine 執行分析 3. 推送 SSE 進度通知 使用方式: service = get_agent_service() task_id = await service.create_analysis_task(incident_id) """ def __init__( self, repository: IAgentTaskRepository | None = None, ) -> None: """ 初始化 Agent Service Args: repository: Task Repository (預設使用 Redis 實作) """ self._repository = repository or AgentTaskRedisRepository() # ========================================================================= # Task Management # ========================================================================= def generate_task_id(self) -> str: """產生新的 Task ID""" return f"TASK-{now_taipei().strftime('%Y%m%d')}-{uuid4().hex[:8].upper()}" async def create_analysis_task( self, incident: Incident, trigger: str = "manual", ) -> str: """ 建立分析任務 Args: incident: 要分析的 Incident trigger: 觸發來源 (manual/auto) Returns: task_id """ task_id = self.generate_task_id() await self._repository.create_task( task_id=task_id, incident_id=incident.incident_id, trigger=trigger, ) logger.info( "analysis_task_created", task_id=task_id, incident_id=incident.incident_id, severity=incident.severity.value, ) return task_id async def get_task_status(self, task_id: str) -> dict[str, Any] | None: """取得任務狀態""" return await self._repository.get_task(task_id) async def get_task_result(self, task_id: str) -> dict[str, Any] | None: """取得任務結果""" return await self._repository.get_task(task_id) async def get_incident(self, incident_id: str) -> Incident | None: """取得 Incident""" return await self._repository.get_incident(incident_id) # ========================================================================= # Analysis Execution # ========================================================================= async def run_analysis( self, task_id: str, incident: Incident, ) -> None: """ 執行 Agent Teams 分析 流程: 1. 更新狀態為 ANALYZING 2. 收集各專家意見 3. 計算共識 4. 儲存結果 5. 推送 SSE 通知 Args: task_id: 任務 ID incident: 要分析的 Incident """ consensus_engine = get_consensus_engine() try: # Step 1: 更新狀態 await self._repository.update_task_state( task_id, TaskState.ANALYZING, progress=10, current_step="正在收集專家意見...", ) # 推送 SSE 進度 publisher = await get_publisher() await publisher.publish(SSEEvent( type=EventType.AI_THINKING, data={ "task_id": task_id, "state": TaskState.ANALYZING.value, "progress": 10, "message": "Agent Teams 分析開始", }, )) # Step 2: 收集意見 opinions = await consensus_engine.gather_opinions( incident, timeout_sec=25.0 ) await self._repository.update_task_state( task_id, TaskState.CONSENSUS, progress=60, current_step="正在計算共識...", agents_completed=len(opinions), ) await publisher.publish(SSEEvent( type=EventType.AI_THINKING, data={ "task_id": task_id, "state": TaskState.CONSENSUS.value, "progress": 60, "message": f"已收集 {len(opinions)} 位專家意見", }, )) # Step 3: 計算共識 ( consensus_score, recommended_action, dissenting, ) = consensus_engine.calculate_consensus(opinions) await self._repository.update_task_state( task_id, TaskState.CONSENSUS, progress=80, current_step="正在產生最終決策...", ) # Step 4: 產生最終決策 result = await consensus_engine.generate_final_decision( incident=incident, opinions=opinions, consensus_score=consensus_score, recommended_action_type=recommended_action, dissenting=dissenting, ) # Step 5: 儲存完整結果 task_data = { "task_id": task_id, "state": TaskState.COMPLETED.value, "progress": 100, "current_step": "分析完成", "agents_completed": len(opinions), "total_agents": 4, "consensus_id": result.consensus_id, "incident_id": incident.incident_id, "consensus_score": result.consensus_score, "recommended_action": result.recommended_action, "recommended_kubectl": result.recommended_kubectl, "risk_level": result.risk_level, "final_reasoning": result.final_reasoning, "opinions": [op.to_dict() for op in result.opinions], "dissenting_opinions": result.dissenting_opinions, "completed_at": now_taipei_iso(), } await self._repository.save_task_result(task_id, task_data) # 推送完成通知 await publisher.publish(SSEEvent( type=EventType.AI_THINKING, data={ "task_id": task_id, "state": TaskState.COMPLETED.value, "progress": 100, "message": "分析完成", "consensus_score": result.consensus_score, "recommended_action": result.recommended_action, }, )) logger.info( "analysis_completed", task_id=task_id, consensus_id=result.consensus_id, consensus_score=result.consensus_score, ) except Exception as e: logger.exception( "analysis_failed", task_id=task_id, error=str(e), ) # 更新為失敗狀態 task_data = { "task_id": task_id, "state": TaskState.FAILED.value, "progress": 0, "error": str(e), "completed_at": now_taipei_iso(), } await self._repository.save_task_result(task_id, task_data) # 推送失敗通知 publisher = await get_publisher() await publisher.publish(SSEEvent( type=EventType.ERROR, data={ "task_id": task_id, "state": TaskState.FAILED.value, "error": str(e), }, )) # ========================================================================= # Incident Integration # ========================================================================= def should_trigger_agent_analysis(self, incident: Incident) -> bool: """ 判斷是否需要觸發 Agent Teams 分析 條件 (任一符合): - P0/P1 緊急事件 - 多個服務受影響 (>2) - 多個告警 (>3) Args: incident: 要判斷的 Incident Returns: 是否應觸發分析 """ return ( # P0/P1 緊急事件 incident.severity in (Severity.P0, Severity.P1) # 或多個服務受影響 or len(incident.affected_services) > 2 # 或多個告警 or len(incident.signals) > 3 ) async def trigger_for_incident( self, incident_id: str, ) -> tuple[str | None, Incident | None]: """ 為 Incident 觸發 Agent Teams 分析 (如果符合條件) Args: incident_id: Incident ID Returns: (task_id, incident) - task_id 為 None 表示未觸發 """ # 讀取 Incident incident = await self._repository.get_incident(incident_id) if incident is None: logger.warning("trigger_skipped_not_found", incident_id=incident_id) return None, None # 判斷是否需要 Agent Teams if not self.should_trigger_agent_analysis(incident): logger.debug( "trigger_skipped_simple_case", incident_id=incident_id, severity=incident.severity.value, ) return None, incident # 建立任務 task_id = await self.create_analysis_task(incident, trigger="auto") logger.info( "auto_trigger_success", task_id=task_id, incident_id=incident_id, severity=incident.severity.value, ) return task_id, incident # ========================================================================= # Incident Factory # ========================================================================= def create_adhoc_incident( self, severity: str, affected_services: list[str], alert_names: list[str] | None = None, ) -> Incident: """ 建立臨時 Incident (用於直接分析請求) Args: severity: 嚴重度 (P0/P1/P2/P3) affected_services: 受影響服務列表 alert_names: 告警名稱列表 Returns: 建立的 Incident """ signals = [] if alert_names: for alert_name in alert_names: signals.append(Signal( alert_name=alert_name, severity=Severity(severity), source="manual", fired_at=now_taipei(), )) return Incident( severity=Severity(severity), status=IncidentStatus.INVESTIGATING, signals=signals, affected_services=affected_services, ) # ============================================================================= # Singleton # ============================================================================= _agent_service: AgentService | None = None def get_agent_service() -> AgentService: """取得 Agent Service 實例 (Singleton)""" global _agent_service if _agent_service is None: _agent_service = AgentService() return _agent_service