Files
awoooi/apps/api/src/agents/blast_radius.py
OG T 7478dc0254 feat(phase6-9): Complete modular architecture and Agent Teams
Phase 6.4 - Modular Architecture:
- Add lewooogo-brain adapters for LLM providers
- Add lewooogo-data dual memory (Redis + PostgreSQL)
- Implement consensus engine for multi-agent decisions
- Add incident memory service for historical context

Phase 9 - Agent Teams (Claude Agent SDK):
- Add base agent class with Claude Sonnet 4 integration
- Implement action planner, blast radius, and security agents
- Add agent API endpoints and proposal workflow
- Integrate ADR-009 OpenClaw Agent Teams architecture

DevOps & CI/CD:
- Add GitHub Actions CI/CD workflows (ci.yaml, cd.yaml)
- Add pre-commit hooks and secrets baseline
- Add docker-compose for local development
- Update Kubernetes network policies

Frontend Improvements:
- Add auto-healing error boundary component
- Update i18n messages for agent features
- Enhance dual-state incident card with execution feedback

Documentation:
- Add 7 ADRs covering MCP, design system, architecture decisions
- Update ARCHITECTURE_MEMORY.md with modular design
- Add GLOBAL_RULES.md and SOUL.md for project identity

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-23 18:40:36 +08:00

526 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Blast Radius Agent - 影響範圍分析專家
======================================
職責:
- 評估操作的影響範圍
- 識別受影響的服務和依賴
- 估計使用者影響人數
- 回傳影響等級 (low/medium/high/critical)
符合 ADR-009 BlastRadiusAgent 規範
"""
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
import structlog
from src.agents.base import AgentResult, AgentStatus, BaseAgent
logger = structlog.get_logger(__name__)
# =============================================================================
# Blast Radius Types
# =============================================================================
class ImpactLevel(str, Enum):
    """Severity tier assigned to an operation's blast radius.

    The trailing comments describe the intended service-count / user-count
    band for each tier. Members also subclass ``str``.
    """

    LOW = "low"  # single service, <100 users
    MEDIUM = "medium"  # 2-5 services, 100-1000 users
    HIGH = "high"  # 5-10 services, 1000-10000 users
    CRITICAL = "critical"  # >10 services, >10000 users, or a core service
@dataclass
class AffectedService:
    """One service touched by an operation, with how and why it is affected."""

    name: str
    impact_type: str  # one of: direct, indirect, transitive
    confidence: float
    reason: str

    def to_dict(self) -> dict[str, Any]:
        """Return the four fields as a plain dict, in declaration order."""
        exported = ("name", "impact_type", "confidence", "reason")
        return {key: getattr(self, key) for key in exported}
@dataclass
class BlastRadiusResult(AgentResult):
    """
    Result produced by BlastRadiusAgent.

    Fields added on top of AgentResult:
    - impact_level: impact tier (low/medium/high/critical)
    - affected_services: services touched by the operation
    - estimated_users: estimated number of affected users
    - dependency_chain: names of the services along the dependency chain
    - recovery_time_estimate: estimated recovery time, in minutes
    """

    impact_level: ImpactLevel = ImpactLevel.LOW
    affected_services: list[AffectedService] = field(default_factory=list)
    estimated_users: int = 0
    dependency_chain: list[str] = field(default_factory=list)
    recovery_time_estimate: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict: the parent's payload plus the blast-radius fields."""
        payload = super().to_dict()
        payload["impact_level"] = self.impact_level.value
        payload["affected_services"] = [
            service.to_dict() for service in self.affected_services
        ]
        payload["estimated_users"] = self.estimated_users
        payload["dependency_chain"] = self.dependency_chain
        payload["recovery_time_estimate"] = self.recovery_time_estimate
        return payload
# =============================================================================
# Service Dependency Graph (簡化版)
# =============================================================================
# AWOOOI 服務依賴圖 (簡化版,實際應從 GraphRAG 讀取)
# Static service dependency graph.
#
# Each entry carries:
#   dependencies    - services this one relies on
#   dependents      - services that rely on this one (the reverse edge)
#   criticality     - "low" | "medium" | "high" | "critical"
#   estimated_users - user count attributed to the service
#
# The two edge lists must mirror each other: if A lists B under
# "dependencies", B must list A under "dependents". Impact traversal follows
# "dependents" only, so a missing reverse edge hides an affected service.
SERVICE_DEPENDENCIES: dict[str, dict[str, Any]] = {
    # === Core Services ===
    "api": {
        "dependencies": ["postgres", "redis", "openclaw"],
        # signal-worker declares a dependency on api, so it is listed here too.
        "dependents": ["web", "telegram-gateway", "signal-worker"],
        "criticality": "critical",
        "estimated_users": 5000,
    },
    "web": {
        "dependencies": ["api"],
        "dependents": [],
        "criticality": "high",
        "estimated_users": 3000,
    },
    "openclaw": {
        "dependencies": ["redis", "ollama"],
        "dependents": ["api"],
        "criticality": "critical",
        "estimated_users": 5000,
    },
    # === Infrastructure ===
    "postgres": {
        "dependencies": [],
        "dependents": ["api", "signoz"],
        "criticality": "critical",
        "estimated_users": 10000,
    },
    "redis": {
        "dependencies": [],
        "dependents": ["api", "openclaw", "signal-worker"],
        "criticality": "critical",
        "estimated_users": 8000,
    },
    "ollama": {
        "dependencies": [],
        "dependents": ["openclaw"],
        "criticality": "high",
        "estimated_users": 2000,
    },
    # === Workers ===
    "signal-worker": {
        "dependencies": ["redis", "api"],
        "dependents": [],
        "criticality": "medium",
        "estimated_users": 500,
    },
    "telegram-gateway": {
        "dependencies": ["api"],
        "dependents": [],
        "criticality": "medium",
        "estimated_users": 1000,
    },
    # === Observability ===
    "signoz": {
        "dependencies": ["postgres"],
        "dependents": [],
        "criticality": "low",
        "estimated_users": 100,
    },
    "prometheus": {
        "dependencies": [],
        "dependents": [],
        "criticality": "low",
        "estimated_users": 50,
    },
}
class BlastRadiusAgent(BaseAgent[BlastRadiusResult]):
    """
    Blast-radius analysis agent.

    Analysis flow:
    1. Identify the directly affected services
    2. Walk the dependency graph to find indirectly affected services
    3. Compute the total number of affected users
    4. Decide the impact level

    Usage:
    ```python
    agent = BlastRadiusAgent()
    result = await agent.analyze({
    "target_service": "api",
    "action": "kubectl rollout restart",
    "namespace": "awoooi-prod",
    })
    print(result.impact_level) # ImpactLevel.CRITICAL
    ```
    """

    # Class-level agent metadata.
    AGENT_NAME = "blast-radius"
    AGENT_DESCRIPTION = "影響範圍分析師,評估相依服務與影響範圍"
    AGENT_TOOLS = ["Read", "Glob", "Grep"]
def __init__(
    self,
    timeout_sec: float = 30.0,
    dependency_graph: dict[str, dict[str, Any]] | None = None,
) -> None:
    """
    Initialize the BlastRadiusAgent.

    Args:
        timeout_sec: Execution timeout, forwarded to the base class.
        dependency_graph: Custom dependency graph (intended for tests).
            ``None`` selects the module-level SERVICE_DEPENDENCIES.
    """
    super().__init__(timeout_sec)
    # Test for None explicitly: a truthiness check (``graph or DEFAULT``)
    # would also throw away a deliberately empty graph and silently analyse
    # against the production topology instead.
    self.dependency_graph = (
        SERVICE_DEPENDENCIES if dependency_graph is None else dependency_graph
    )
async def analyze(self, context: dict[str, Any]) -> BlastRadiusResult:
    """
    Run the blast-radius analysis for one or more target services.

    Args:
        context: Analysis context. Keys read by this method:
            - target_service: a service name, or a list of names
            - action: the operation being evaluated (only logged,
              truncated to 50 characters)
            The ``namespace`` key is not read here.

    Returns:
        BlastRadiusResult with the impact level and the detailed analysis.
        Any exception raised during the analysis is caught and reported as
        a FAILED result whose impact level is CRITICAL. An empty target
        list yields a SUCCESS result with no affected services.
    """
    # Monotonic clock: latency must not be skewed by wall-clock adjustments.
    start_time = time.monotonic()
    self.logger.info(
        "blast_radius_analysis_start",
        target=context.get("target_service"),
        # ``action`` may be present but None or a non-string; coerce before
        # slicing so logging can never raise ahead of the try block.
        action=str(context.get("action") or "")[:50],
    )
    try:
        # Accept either a single name or a collection of names.
        target_services = context.get("target_service", [])
        if isinstance(target_services, str):
            target_services = [target_services]

        # Analyse every target and pool the findings.
        all_affected: list[AffectedService] = []
        total_users = 0
        dependency_chain: list[str] = []
        for target in target_services:
            affected, users, chain = self._analyze_service_impact(target)
            all_affected.extend(affected)
            # Keep the largest per-target figure instead of summing, so
            # users shared between targets are not counted twice.
            total_users = max(total_users, users)
            dependency_chain.extend(chain)

        # De-duplicate by service name; the first occurrence wins.
        first_by_name: dict[str, AffectedService] = {}
        for svc in all_affected:
            first_by_name.setdefault(svc.name, svc)
        unique_affected = list(first_by_name.values())

        impact_level = self._calculate_impact_level(
            len(unique_affected),
            total_users,
            unique_affected,
        )
        recovery_time = self._estimate_recovery_time(
            impact_level, len(unique_affected)
        )
        latency_ms = int((time.monotonic() - start_time) * 1000)
        analysis = self._generate_analysis(
            impact_level,
            len(unique_affected),
            total_users,
        )

        result = BlastRadiusResult(
            agent_name=self.AGENT_NAME,
            status=AgentStatus.SUCCESS,
            confidence=0.85,  # fixed score for graph-based analysis
            analysis=analysis,
            latency_ms=latency_ms,
            impact_level=impact_level,
            affected_services=unique_affected,
            estimated_users=total_users,
            # dict.fromkeys de-duplicates while keeping discovery order;
            # a set would make the order vary from run to run.
            dependency_chain=list(dict.fromkeys(dependency_chain)),
            recovery_time_estimate=recovery_time,
        )
        self.logger.info(
            "blast_radius_analysis_complete",
            impact_level=impact_level.value,
            affected_count=len(unique_affected),
            estimated_users=total_users,
            latency_ms=latency_ms,
        )
        return result
    except Exception as e:
        # Top-level boundary: never propagate, report a FAILED result.
        latency_ms = int((time.monotonic() - start_time) * 1000)
        self.logger.exception(
            "blast_radius_analysis_error",
            error=str(e),
        )
        return BlastRadiusResult(
            agent_name=self.AGENT_NAME,
            status=AgentStatus.FAILED,
            confidence=0.0,
            analysis=f"分析失敗: {str(e)}",
            latency_ms=latency_ms,
            error=str(e),
            impact_level=ImpactLevel.CRITICAL,  # assume the worst on failure
        )
def _analyze_service_impact(
    self,
    target_service: str,
) -> tuple[list[AffectedService], int, list[str]]:
    """
    Analyse the impact of operating on a single service.

    Args:
        target_service: Service name as supplied by the caller; it is
            normalized before being looked up in the dependency graph.

    Returns:
        (affected services, estimated user count, dependency chain).
        For a service missing from the graph, a single low-confidence
        "direct" entry is returned with a fixed estimate of 1000 users.
    """
    target_key = self._normalize_service_name(target_service)
    if target_key not in self.dependency_graph:
        # Unknown service: dependencies cannot be determined, so assume a
        # medium-sized impact and report the name exactly as it was given.
        unknown = AffectedService(
            name=target_service,
            impact_type="direct",
            confidence=0.5,
            reason="未知服務,無法確定依賴關係",
        )
        return [unknown], 1000, [target_service]

    # 1. Direct impact: the target itself.
    affected: list[AffectedService] = [
        AffectedService(
            name=target_key,
            impact_type="direct",
            confidence=1.0,
            reason="目標服務",
        )
    ]
    visited: set[str] = {target_key}
    dependency_chain: list[str] = [target_key]

    # 2. Services that depend on the target, up to three levels away.
    self._find_dependents(
        target_key,
        affected,
        visited,
        dependency_chain,
        depth=0,
        max_depth=3,
    )

    # 3. Sum users once per affected service. ``affected`` already contains
    # the target, so it must not be added separately as well (doing so
    # counted the target's users twice).
    total_users = sum(
        self.dependency_graph[svc.name].get("estimated_users", 0)
        for svc in affected
        if svc.name in self.dependency_graph
    )
    return affected, total_users, dependency_chain
def _find_dependents(
    self,
    service: str,
    affected: list[AffectedService],
    visited: set[str],
    chain: list[str],
    depth: int,
    max_depth: int,
) -> None:
    """
    Collect the services that depend, directly or transitively, on ``service``.

    The graph is walked breadth-first, one level at a time, so each service
    is recorded at its shortest distance from ``service``. A depth-first
    walk with a shared ``visited`` set would label a service by whichever
    path happened to reach it first, which can mark a direct dependent as
    "transitive" with a lower confidence.

    Args:
        service: Starting service; it is not itself appended.
        affected: Output list, extended in place with the discovered services.
        visited: Names already recorded; updated in place and used to
            skip services that were seen before.
        chain: Output list of discovered service names, extended in place.
        depth: Level assigned to the immediate dependents of ``service``.
        max_depth: Exclusive upper bound on the level that is explored.
    """
    frontier = [service]
    level = depth
    while frontier and level < max_depth:
        # Everything found on this pass shares one distance from the start.
        impact_type = "indirect" if level == 0 else "transitive"
        confidence = 0.9 - (level * 0.1)
        next_frontier: list[str] = []
        for current in frontier:
            node = self.dependency_graph.get(current)
            if node is None:
                # Not in the graph: nothing is known to depend on it.
                continue
            for dep in node.get("dependents", []):
                if dep in visited:
                    continue
                visited.add(dep)
                chain.append(dep)
                affected.append(AffectedService(
                    name=dep,
                    impact_type=impact_type,
                    confidence=confidence,
                    reason=f"依賴 {current}",
                ))
                next_frontier.append(dep)
        frontier = next_frontier
        level += 1
def _normalize_service_name(self, service: str) -> str:
"""標準化服務名稱"""
# 移除常見後綴
service = service.lower()
for suffix in ["-deployment", "-svc", "-service", "-pod"]:
if service.endswith(suffix):
service = service[: -len(suffix)]
# 處理常見別名
aliases = {
"awoooi-api": "api",
"awoooi-web": "web",
"nginx": "web",
"frontend": "web",
"backend": "api",
"database": "postgres",
"db": "postgres",
"cache": "redis",
}
return aliases.get(service, service)
def _calculate_impact_level(
    self,
    service_count: int,
    user_count: int,
    affected: list[AffectedService],
) -> ImpactLevel:
    """
    Decide the impact level from the size of the blast radius.

    Rules, checked from most to least severe:
    - CRITICAL: any affected service is marked "critical" in the
      dependency graph, or more than 10 services, or more than 10000 users
    - HIGH: more than 5 services, or more than 1000 users
    - MEDIUM: at least 2 services, or at least 100 users
    - LOW: a single service (or none) with fewer than 100 users

    Args:
        service_count: Number of affected services.
        user_count: Estimated number of affected users.
        affected: Affected services, used only for the criticality lookup.

    Returns:
        The matching ImpactLevel.
    """
    # A single critical service escalates the whole operation.
    has_critical = any(
        svc.name in self.dependency_graph
        and self.dependency_graph[svc.name].get("criticality") == "critical"
        for svc in affected
    )
    if has_critical or service_count > 10 or user_count > 10000:
        return ImpactLevel.CRITICAL
    if service_count > 5 or user_count > 1000:
        return ImpactLevel.HIGH
    # Inclusive lower bounds: LOW is documented as one service and <100
    # users, so two services or exactly 100 users already count as MEDIUM.
    if service_count >= 2 or user_count >= 100:
        return ImpactLevel.MEDIUM
    return ImpactLevel.LOW
def _estimate_recovery_time(
    self,
    impact_level: ImpactLevel,
    service_count: int,
) -> int:
    """
    Estimate the recovery time in minutes.

    The estimate is a per-level baseline plus five minutes for every
    affected service.
    """
    minutes_per_service = 5
    baseline_minutes = {
        ImpactLevel.LOW: 5,
        ImpactLevel.MEDIUM: 15,
        ImpactLevel.HIGH: 30,
        ImpactLevel.CRITICAL: 60,
    }
    baseline = baseline_minutes[impact_level]
    per_service_total = service_count * minutes_per_service
    return baseline + per_service_total
def _generate_analysis(
    self,
    impact_level: ImpactLevel,
    service_count: int,
    user_count: int,
) -> str:
    """
    Build the one-line analysis summary.

    The summary is the level label followed by the affected service count
    and the user estimate, the latter formatted with thousands separators.
    """
    labels = {
        ImpactLevel.LOW: "低影響",
        ImpactLevel.MEDIUM: "中等影響",
        ImpactLevel.HIGH: "高影響",
        ImpactLevel.CRITICAL: "嚴重影響",
    }
    label = labels[impact_level]
    return f"{label}: 影響 {service_count} 個服務,預估 {user_count:,} 用戶受影響"
def _build_prompt(self, context: dict[str, Any]) -> str:
"""建構 LLM Prompt (Phase 9.4 擴展)"""
return f"""你是 AWOOOI 的影響範圍分析師。
分析以下操作的影響範圍:
目標服務: {context.get("target_service", "N/A")}
操作: {context.get("action", "N/A")}
命名空間: {context.get("namespace", "N/A")}
評估:
1. 直接影響的服務
2. 間接相依的服務
3. 使用者影響人數估計
輸出 JSON:
```json
{{
"impact_level": "low|medium|high|critical",
"affected_services": [
{{"name": "...", "impact_type": "direct|indirect", "reason": "..."}}
],
"estimated_users": 0,
"dependency_chain": ["service1", "service2"],
"analysis": "一句話摘要",
"confidence": 0-1
}}
```"""
def _parse_response(self, response: str) -> dict[str, Any]:
"""解析 LLM 回應"""
return self._extract_json(response)