Files
awoooi/apps/api/src/services/agent_service.py
OG T 30153496d1 fix(api): 修復全部 lint 錯誤 (ruff --fix)
- Import sorting (I001)
- Unused imports (F401)
- f-string without placeholders (F541)
- Loop variable unused (B007)
- zip() strict parameter (B905)
- Exception chaining (B904)
- collections.abc imports (UP035)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-26 16:06:20 +08:00

665 lines
19 KiB
Python

"""
Agent Service - Phase 17 技術債修復
===================================
Phase 17 R4: 從 agents.py Router 抽離 Redis 操作
設計原則:
- Router 層只做 HTTP 路由,不直接存取 Redis
- Service 層封裝所有業務邏輯與 Redis 操作
- 使用 Protocol 定義介面,支援依賴注入
版本: v1.0
建立: 2026-03-26 (台北時區)
建立者: Claude Code (Phase 17 技術債修復)
"""
import json
from enum import Enum
from typing import Any, Protocol, runtime_checkable
from uuid import uuid4
from src.core.logging import get_logger
from src.core.redis_client import get_redis
from src.core.sse import EventType, SSEEvent, get_publisher
from src.models.incident import Incident, IncidentStatus, Severity, Signal
from src.services.consensus_engine import get_consensus_engine
from src.utils.timezone import now_taipei, now_taipei_iso
logger = get_logger("awoooi.agent_service")
# =============================================================================
# Constants
# =============================================================================
TASK_PREFIX = "agent_task:"
INCIDENT_PREFIX = "incident:"
TASK_TTL = 604800 # 7 天
# =============================================================================
# Task States
# =============================================================================
class TaskState(str, Enum):
"""分析任務狀態"""
PENDING = "pending" # 等待中
ANALYZING = "analyzing" # 分析中
CONSENSUS = "consensus" # 共識計算中
COMPLETED = "completed" # 已完成
FAILED = "failed" # 失敗
# =============================================================================
# Protocol Interface
# =============================================================================
@runtime_checkable
class IAgentTaskRepository(Protocol):
"""
Agent Task Repository Protocol
職責: Agent Task CRUD 操作 (Redis Working Memory)
"""
async def create_task(
self,
task_id: str,
incident_id: str,
trigger: str = "manual",
) -> dict[str, Any]:
"""建立新的分析任務"""
...
async def get_task(self, task_id: str) -> dict[str, Any] | None:
"""取得任務資料"""
...
async def update_task_state(
self,
task_id: str,
state: TaskState,
progress: int = 0,
current_step: str | None = None,
agents_completed: int = 0,
**extra_data: Any,
) -> bool:
"""更新任務狀態"""
...
async def save_task_result(
self,
task_id: str,
result_data: dict[str, Any],
) -> bool:
"""儲存任務結果"""
...
async def get_incident(self, incident_id: str) -> Incident | None:
"""從 Redis 取得 Incident"""
...
# =============================================================================
# Repository Implementation
# =============================================================================
class AgentTaskRedisRepository:
"""
Agent Task Redis Repository
實作 IAgentTaskRepository Protocol
所有 Redis 操作都封裝在此
"""
async def create_task(
self,
task_id: str,
incident_id: str,
trigger: str = "manual",
) -> dict[str, Any]:
"""
建立新的分析任務
Args:
task_id: 任務 ID
incident_id: 關聯的 Incident ID
trigger: 觸發來源 (manual/auto)
Returns:
建立的任務資料
"""
redis_client = get_redis()
task_key = f"{TASK_PREFIX}{task_id}"
task_data = {
"task_id": task_id,
"state": TaskState.PENDING.value,
"progress": 0,
"current_step": "任務已建立",
"agents_completed": 0,
"total_agents": 4,
"incident_id": incident_id,
"started_at": now_taipei_iso(),
"trigger": trigger,
}
await redis_client.set(
task_key,
json.dumps(task_data),
ex=TASK_TTL,
)
logger.info(
"task_created",
task_id=task_id,
incident_id=incident_id,
trigger=trigger,
)
return task_data
async def get_task(self, task_id: str) -> dict[str, Any] | None:
"""
取得任務資料
Args:
task_id: 任務 ID
Returns:
任務資料字典,不存在則返回 None
"""
redis_client = get_redis()
task_key = f"{TASK_PREFIX}{task_id}"
data = await redis_client.get(task_key)
if data is None:
return None
return json.loads(data)
async def update_task_state(
self,
task_id: str,
state: TaskState,
progress: int = 0,
current_step: str | None = None,
agents_completed: int = 0,
**extra_data: Any,
) -> bool:
"""
更新任務狀態
Args:
task_id: 任務 ID
state: 新狀態
progress: 進度百分比 (0-100)
current_step: 目前步驟描述
agents_completed: 已完成的 Agent 數量
**extra_data: 額外資料
Returns:
是否更新成功
"""
redis_client = get_redis()
task_key = f"{TASK_PREFIX}{task_id}"
try:
# 讀取現有資料
existing = await redis_client.get(task_key)
if existing:
task_data = json.loads(existing)
else:
task_data = {"task_id": task_id}
# 更新欄位
task_data.update({
"state": state.value,
"progress": progress,
"current_step": current_step,
"agents_completed": agents_completed,
**extra_data,
})
await redis_client.set(
task_key,
json.dumps(task_data),
ex=TASK_TTL,
)
return True
except Exception as e:
logger.exception(
"update_task_state_error",
task_id=task_id,
error=str(e),
)
return False
async def save_task_result(
self,
task_id: str,
result_data: dict[str, Any],
) -> bool:
"""
儲存任務結果
Args:
task_id: 任務 ID
result_data: 完整結果資料
Returns:
是否儲存成功
"""
redis_client = get_redis()
task_key = f"{TASK_PREFIX}{task_id}"
try:
await redis_client.set(
task_key,
json.dumps(result_data),
ex=TASK_TTL,
)
logger.info(
"task_result_saved",
task_id=task_id,
state=result_data.get("state"),
)
return True
except Exception as e:
logger.exception(
"save_task_result_error",
task_id=task_id,
error=str(e),
)
return False
async def get_incident(self, incident_id: str) -> Incident | None:
"""
從 Redis 取得 Incident
Args:
incident_id: Incident ID
Returns:
Incident 物件,不存在則返回 None
"""
redis_client = get_redis()
key = f"{INCIDENT_PREFIX}{incident_id}"
data = await redis_client.get(key)
if data is None:
return None
return Incident.model_validate_json(data)
# =============================================================================
# Agent Service
# =============================================================================
class AgentService:
"""
Agent Service - Agent Teams 業務邏輯
職責:
1. 任務生命週期管理
2. 協調 ConsensusEngine 執行分析
3. 推送 SSE 進度通知
使用方式:
service = get_agent_service()
task_id = await service.create_analysis_task(incident_id)
"""
def __init__(
self,
repository: IAgentTaskRepository | None = None,
) -> None:
"""
初始化 Agent Service
Args:
repository: Task Repository (預設使用 Redis 實作)
"""
self._repository = repository or AgentTaskRedisRepository()
# =========================================================================
# Task Management
# =========================================================================
def generate_task_id(self) -> str:
"""產生新的 Task ID"""
return f"TASK-{now_taipei().strftime('%Y%m%d')}-{uuid4().hex[:8].upper()}"
async def create_analysis_task(
self,
incident: Incident,
trigger: str = "manual",
) -> str:
"""
建立分析任務
Args:
incident: 要分析的 Incident
trigger: 觸發來源 (manual/auto)
Returns:
task_id
"""
task_id = self.generate_task_id()
await self._repository.create_task(
task_id=task_id,
incident_id=incident.incident_id,
trigger=trigger,
)
logger.info(
"analysis_task_created",
task_id=task_id,
incident_id=incident.incident_id,
severity=incident.severity.value,
)
return task_id
async def get_task_status(self, task_id: str) -> dict[str, Any] | None:
"""取得任務狀態"""
return await self._repository.get_task(task_id)
async def get_task_result(self, task_id: str) -> dict[str, Any] | None:
"""取得任務結果"""
return await self._repository.get_task(task_id)
async def get_incident(self, incident_id: str) -> Incident | None:
"""取得 Incident"""
return await self._repository.get_incident(incident_id)
# =========================================================================
# Analysis Execution
# =========================================================================
async def run_analysis(
self,
task_id: str,
incident: Incident,
) -> None:
"""
執行 Agent Teams 分析
流程:
1. 更新狀態為 ANALYZING
2. 收集各專家意見
3. 計算共識
4. 儲存結果
5. 推送 SSE 通知
Args:
task_id: 任務 ID
incident: 要分析的 Incident
"""
consensus_engine = get_consensus_engine()
try:
# Step 1: 更新狀態
await self._repository.update_task_state(
task_id,
TaskState.ANALYZING,
progress=10,
current_step="正在收集專家意見...",
)
# 推送 SSE 進度
publisher = await get_publisher()
await publisher.publish(SSEEvent(
type=EventType.AI_THINKING,
data={
"task_id": task_id,
"state": TaskState.ANALYZING.value,
"progress": 10,
"message": "Agent Teams 分析開始",
},
))
# Step 2: 收集意見
opinions = await consensus_engine.gather_opinions(
incident, timeout_sec=25.0
)
await self._repository.update_task_state(
task_id,
TaskState.CONSENSUS,
progress=60,
current_step="正在計算共識...",
agents_completed=len(opinions),
)
await publisher.publish(SSEEvent(
type=EventType.AI_THINKING,
data={
"task_id": task_id,
"state": TaskState.CONSENSUS.value,
"progress": 60,
"message": f"已收集 {len(opinions)} 位專家意見",
},
))
# Step 3: 計算共識
(
consensus_score,
recommended_action,
dissenting,
) = consensus_engine.calculate_consensus(opinions)
await self._repository.update_task_state(
task_id,
TaskState.CONSENSUS,
progress=80,
current_step="正在產生最終決策...",
)
# Step 4: 產生最終決策
result = await consensus_engine.generate_final_decision(
incident=incident,
opinions=opinions,
consensus_score=consensus_score,
recommended_action_type=recommended_action,
dissenting=dissenting,
)
# Step 5: 儲存完整結果
task_data = {
"task_id": task_id,
"state": TaskState.COMPLETED.value,
"progress": 100,
"current_step": "分析完成",
"agents_completed": len(opinions),
"total_agents": 4,
"consensus_id": result.consensus_id,
"incident_id": incident.incident_id,
"consensus_score": result.consensus_score,
"recommended_action": result.recommended_action,
"recommended_kubectl": result.recommended_kubectl,
"risk_level": result.risk_level,
"final_reasoning": result.final_reasoning,
"opinions": [op.to_dict() for op in result.opinions],
"dissenting_opinions": result.dissenting_opinions,
"completed_at": now_taipei_iso(),
}
await self._repository.save_task_result(task_id, task_data)
# 推送完成通知
await publisher.publish(SSEEvent(
type=EventType.AI_THINKING,
data={
"task_id": task_id,
"state": TaskState.COMPLETED.value,
"progress": 100,
"message": "分析完成",
"consensus_score": result.consensus_score,
"recommended_action": result.recommended_action,
},
))
logger.info(
"analysis_completed",
task_id=task_id,
consensus_id=result.consensus_id,
consensus_score=result.consensus_score,
)
except Exception as e:
logger.exception(
"analysis_failed",
task_id=task_id,
error=str(e),
)
# 更新為失敗狀態
task_data = {
"task_id": task_id,
"state": TaskState.FAILED.value,
"progress": 0,
"error": str(e),
"completed_at": now_taipei_iso(),
}
await self._repository.save_task_result(task_id, task_data)
# 推送失敗通知
publisher = await get_publisher()
await publisher.publish(SSEEvent(
type=EventType.ERROR,
data={
"task_id": task_id,
"state": TaskState.FAILED.value,
"error": str(e),
},
))
# =========================================================================
# Incident Integration
# =========================================================================
def should_trigger_agent_analysis(self, incident: Incident) -> bool:
"""
判斷是否需要觸發 Agent Teams 分析
條件 (任一符合):
- P0/P1 緊急事件
- 多個服務受影響 (>2)
- 多個告警 (>3)
Args:
incident: 要判斷的 Incident
Returns:
是否應觸發分析
"""
return (
# P0/P1 緊急事件
incident.severity in (Severity.P0, Severity.P1)
# 或多個服務受影響
or len(incident.affected_services) > 2
# 或多個告警
or len(incident.signals) > 3
)
async def trigger_for_incident(
self,
incident_id: str,
) -> tuple[str | None, Incident | None]:
"""
為 Incident 觸發 Agent Teams 分析 (如果符合條件)
Args:
incident_id: Incident ID
Returns:
(task_id, incident) - task_id 為 None 表示未觸發
"""
# 讀取 Incident
incident = await self._repository.get_incident(incident_id)
if incident is None:
logger.warning("trigger_skipped_not_found", incident_id=incident_id)
return None, None
# 判斷是否需要 Agent Teams
if not self.should_trigger_agent_analysis(incident):
logger.debug(
"trigger_skipped_simple_case",
incident_id=incident_id,
severity=incident.severity.value,
)
return None, incident
# 建立任務
task_id = await self.create_analysis_task(incident, trigger="auto")
logger.info(
"auto_trigger_success",
task_id=task_id,
incident_id=incident_id,
severity=incident.severity.value,
)
return task_id, incident
# =========================================================================
# Incident Factory
# =========================================================================
def create_adhoc_incident(
self,
severity: str,
affected_services: list[str],
alert_names: list[str] | None = None,
) -> Incident:
"""
建立臨時 Incident (用於直接分析請求)
Args:
severity: 嚴重度 (P0/P1/P2/P3)
affected_services: 受影響服務列表
alert_names: 告警名稱列表
Returns:
建立的 Incident
"""
signals = []
if alert_names:
for alert_name in alert_names:
signals.append(Signal(
alert_name=alert_name,
severity=Severity(severity),
source="manual",
fired_at=now_taipei(),
))
return Incident(
severity=Severity(severity),
status=IncidentStatus.INVESTIGATING,
signals=signals,
affected_services=affected_services,
)
# =============================================================================
# Singleton
# =============================================================================
_agent_service: AgentService | None = None
def get_agent_service() -> AgentService:
"""取得 Agent Service 實例 (Singleton)"""
global _agent_service
if _agent_service is None:
_agent_service = AgentService()
return _agent_service