""" GraphRAG - 知識圖譜引擎 Phase 3.4: 微服務依賴分析與根本原因追溯 核心功能: 1. TopologyGraph: 建構微服務依賴圖 (Dependency Graph) 2. Blast Radius Analysis: 某服務掛掉時,誰會跟著掛?(向上追溯) 3. Root Cause Analysis: 某服務報錯時,底層哪個依賴有問題?(向下追溯) 圖結構: - Nodes: 微服務 (ingress, frontend, auth-service, postgres-db) - Edges: 依賴關係 (frontend -> depends_on -> auth-service) """ import logging from dataclasses import dataclass, field from datetime import datetime from enum import Enum logger = logging.getLogger(__name__) # ==================== Types ==================== class NodeType(str, Enum): """節點類型""" INGRESS = "ingress" SERVICE = "service" DATABASE = "database" CACHE = "cache" QUEUE = "queue" EXTERNAL = "external" class EdgeType(str, Enum): """邊的類型""" DEPENDS_ON = "depends_on" # A depends_on B (A 依賴 B) CALLS = "calls" # A calls B (同步呼叫) PUBLISHES_TO = "publishes_to" # A publishes_to B (異步訊息) READS_FROM = "reads_from" # A reads_from B (讀取資料) WRITES_TO = "writes_to" # A writes_to B (寫入資料) class HealthStatus(str, Enum): """健康狀態""" HEALTHY = "healthy" DEGRADED = "degraded" UNHEALTHY = "unhealthy" UNKNOWN = "unknown" @dataclass class ServiceNode: """服務節點""" name: str node_type: NodeType namespace: str = "default" health_status: HealthStatus = HealthStatus.HEALTHY last_incident_at: datetime | None = None incident_message: str | None = None metadata: dict = field(default_factory=dict) def to_dict(self) -> dict: return { "name": self.name, "nodeType": self.node_type.value, "namespace": self.namespace, "healthStatus": self.health_status.value, "lastIncidentAt": self.last_incident_at.isoformat() if self.last_incident_at else None, "incidentMessage": self.incident_message, "metadata": self.metadata, } @dataclass class DependencyEdge: """依賴邊""" source: str # 依賴方 (e.g., frontend) target: str # 被依賴方 (e.g., auth-service) edge_type: EdgeType is_critical: bool = False # 是否為關鍵依賴 (掛了就整個掛) latency_p99_ms: float | None = None def to_dict(self) -> dict: return { "source": self.source, "target": self.target, "edgeType": self.edge_type.value, "isCritical": self.is_critical, "latencyP99Ms": self.latency_p99_ms, } @dataclass class BlastRadiusResult: """爆炸半徑分析結果""" target_service: str affected_services: list[str] # 會受影響的上游服務 affected_count: int critical_path: list[str] # 關鍵路徑 (全部是 critical edge) impact_summary: str def to_dict(self) -> dict: return { "targetService": self.target_service, "affectedServices": self.affected_services, "affectedCount": self.affected_count, "criticalPath": self.critical_path, "impactSummary": self.impact_summary, } @dataclass class RootCauseResult: """根本原因分析結果""" target_service: str unhealthy_dependencies: list[ServiceNode] # 有問題的下游依賴 dependency_chain: list[str] # 依賴鏈 probable_root_causes: list[str] # 所有可能的根本原因 (不只一個!) analysis_summary: str def to_dict(self) -> dict: return { "targetService": self.target_service, "unhealthyDependencies": [d.to_dict() for d in self.unhealthy_dependencies], "dependencyChain": self.dependency_chain, "probableRootCauses": self.probable_root_causes, # 陣列,非單一值 "analysisSummary": self.analysis_summary, } @dataclass class FullAnalysisResult: """完整分析結果 (Blast Radius + Root Cause)""" target_service: str blast_radius: BlastRadiusResult root_cause: RootCauseResult analyzed_at: datetime def to_dict(self) -> dict: return { "targetService": self.target_service, "blastRadius": self.blast_radius.to_dict(), "rootCause": self.root_cause.to_dict(), "analyzedAt": self.analyzed_at.isoformat(), } # ==================== Topology Graph ==================== class TopologyGraph: """ 微服務拓撲圖 用於理解服務間的依賴關係,支援: 1. 向上追溯 (Blast Radius): 某服務掛了,誰會受影響 2. 向下追溯 (Root Cause): 某服務報錯,底層誰有問題 """ def __init__(self): # In-memory storage (Phase 4+ 換成 Neo4j/ArangoDB) self._nodes: dict[str, ServiceNode] = {} self._edges: list[DependencyEdge] = [] # 索引: source -> [edges], target -> [edges] self._outgoing: dict[str, list[DependencyEdge]] = {} # source -> edges (我依賴誰) self._incoming: dict[str, list[DependencyEdge]] = {} # target -> edges (誰依賴我) # ==================== Graph Construction ==================== def add_node(self, node: ServiceNode) -> None: """新增節點""" self._nodes[node.name] = node if node.name not in self._outgoing: self._outgoing[node.name] = [] if node.name not in self._incoming: self._incoming[node.name] = [] logger.debug(f"[GraphRAG] Node added: {node.name} ({node.node_type.value})") def add_edge(self, edge: DependencyEdge) -> None: """新增邊""" self._edges.append(edge) # 更新索引 if edge.source not in self._outgoing: self._outgoing[edge.source] = [] self._outgoing[edge.source].append(edge) if edge.target not in self._incoming: self._incoming[edge.target] = [] self._incoming[edge.target].append(edge) logger.debug( f"[GraphRAG] Edge added: {edge.source} --{edge.edge_type.value}--> {edge.target}" f"{' [CRITICAL]' if edge.is_critical else ''}" ) def get_node(self, name: str) -> ServiceNode | None: """取得節點""" return self._nodes.get(name) def update_health( self, service_name: str, status: HealthStatus, incident_message: str | None = None, ) -> None: """更新服務健康狀態""" if service_name in self._nodes: node = self._nodes[service_name] node.health_status = status if status != HealthStatus.HEALTHY: node.last_incident_at = datetime.utcnow() node.incident_message = incident_message logger.info(f"[GraphRAG] Health updated: {service_name} -> {status.value}") # ==================== Blast Radius Analysis (向上追溯) ==================== def get_blast_radius( self, target_service: str, max_depth: int = 3, ) -> BlastRadiusResult: """ 計算爆炸半徑 (Blast Radius) 向上追溯: 如果 target_service 掛了,哪些上游服務會跟著掛? 使用 BFS 從 target 往上找所有依賴它的服務 Args: target_service: 目標服務 max_depth: 最大追溯深度 (預設 3,避免大型叢集無限擴散) """ if target_service not in self._nodes: return BlastRadiusResult( target_service=target_service, affected_services=[], affected_count=0, critical_path=[], impact_summary=f"Service '{target_service}' not found in topology", ) affected = [] critical_path = [] visited = {target_service} # queue 改為 (node, depth) tuple queue: list[tuple[str, int]] = [(target_service, 0)] # BFS 向上追溯 (找誰依賴我) while queue: current, depth = queue.pop(0) # ⚠️ 深度限制: 避免大型叢集無限擴散 if depth >= max_depth: continue # 找所有依賴 current 的服務 (incoming edges) for edge in self._incoming.get(current, []): if edge.source not in visited: visited.add(edge.source) affected.append(edge.source) queue.append((edge.source, depth + 1)) # 記錄關鍵路徑 if edge.is_critical: critical_path.append(f"{edge.source} -> {edge.target}") # 產生摘要 if not affected: summary = f"No upstream services depend on '{target_service}'. Blast radius is contained." else: summary = ( f"If '{target_service}' goes down, {len(affected)} upstream services will be affected: " f"{', '.join(affected[:5])}{'...' if len(affected) > 5 else ''}. " f"Critical dependencies: {len(critical_path)}." ) return BlastRadiusResult( target_service=target_service, affected_services=affected, affected_count=len(affected), critical_path=critical_path, impact_summary=summary, ) # ==================== Root Cause Analysis (向下追溯) ==================== def get_root_cause( self, target_service: str, max_depth: int = 3, ) -> RootCauseResult: """ 根本原因分析 (Root Cause Analysis) 向下追溯: 如果 target_service 報錯,它依賴的底層服務誰目前有異常? 使用 BFS 從 target 往下找所有它依賴的服務, 然後過濾出目前 health != HEALTHY 的 Args: target_service: 目標服務 max_depth: 最大追溯深度 (預設 3,避免大型叢集無限擴散) """ if target_service not in self._nodes: return RootCauseResult( target_service=target_service, unhealthy_dependencies=[], dependency_chain=[], probable_root_causes=[], analysis_summary=f"Service '{target_service}' not found in topology", ) all_dependencies = [] unhealthy = [] visited = {target_service} # queue 改為 (node, depth) tuple queue: list[tuple[str, int]] = [(target_service, 0)] # BFS 向下追溯 (找我依賴誰) while queue: current, depth = queue.pop(0) # ⚠️ 深度限制: 避免大型叢集無限擴散 if depth >= max_depth: continue # 找 current 依賴的所有服務 (outgoing edges) for edge in self._outgoing.get(current, []): if edge.target not in visited: visited.add(edge.target) all_dependencies.append(edge.target) queue.append((edge.target, depth + 1)) # 檢查健康狀態 dep_node = self._nodes.get(edge.target) if dep_node and dep_node.health_status != HealthStatus.HEALTHY: unhealthy.append(dep_node) # ╔════════════════════════════════════════════════════════════════╗ # ║ 收集所有可能的根本原因 (不只一個!) ║ # ║ 優先排序: DATABASE > CACHE > QUEUE > 其他 ║ # ║ ⚠️ 不使用 break,收集全部異常節點 ║ # ╚════════════════════════════════════════════════════════════════╝ probable_roots: list[str] = [] priority_order = [NodeType.DATABASE, NodeType.CACHE, NodeType.QUEUE] if unhealthy: # 先加入高優先級節點 (DB/CACHE/QUEUE) for priority_type in priority_order: for node in unhealthy: if node.node_type == priority_type and node.name not in probable_roots: probable_roots.append(node.name) # 再加入其他類型的異常節點 for node in unhealthy: if node.name not in probable_roots: probable_roots.append(node.name) # 產生摘要 if not unhealthy: summary = ( f"All {len(all_dependencies)} dependencies of '{target_service}' are healthy. " "Issue might be within the service itself." ) else: unhealthy_names = [n.name for n in unhealthy] summary = ( f"Found {len(unhealthy)} unhealthy dependencies for '{target_service}': " f"{', '.join(unhealthy_names)}. " f"Probable root causes: {', '.join(probable_roots)}." ) return RootCauseResult( target_service=target_service, unhealthy_dependencies=unhealthy, dependency_chain=all_dependencies, probable_root_causes=probable_roots, analysis_summary=summary, ) # ==================== Combined Analysis ==================== def get_blast_radius_and_root_cause( self, target_service: str, max_depth: int = 3, ) -> FullAnalysisResult: """ 完整分析: Blast Radius + Root Cause OpenClaw 主要呼叫這個方法,一次取得: 1. 向上追溯: 誰會受影響 2. 向下追溯: 誰是根本原因 Args: target_service: 目標服務 max_depth: 最大追溯深度 (預設 3) """ blast = self.get_blast_radius(target_service, max_depth) root = self.get_root_cause(target_service, max_depth) logger.info( f"[GraphRAG] Full analysis for '{target_service}': " f"blast_radius={blast.affected_count}, " f"unhealthy_deps={len(root.unhealthy_dependencies)}" ) return FullAnalysisResult( target_service=target_service, blast_radius=blast, root_cause=root, analyzed_at=datetime.utcnow(), ) # ==================== Utilities ==================== def get_all_nodes(self) -> list[ServiceNode]: """取得所有節點""" return list(self._nodes.values()) def get_all_edges(self) -> list[DependencyEdge]: """取得所有邊""" return self._edges def to_dict(self) -> dict: """輸出完整圖結構""" return { "nodes": [n.to_dict() for n in self._nodes.values()], "edges": [e.to_dict() for e in self._edges], "nodeCount": len(self._nodes), "edgeCount": len(self._edges), } # ==================== Mock Data Factory ==================== def create_mock_topology() -> TopologyGraph: r""" 建立 Mock 拓撲圖 (Phase 3 用) 典型微服務架構: ingress -> frontend -> auth-service -> postgres-db \-> product-api -> postgres-db \-> order-api -> postgres-db \-> redis-cache """ graph = TopologyGraph() # 建立節點 nodes = [ ServiceNode("ingress", NodeType.INGRESS), ServiceNode("frontend", NodeType.SERVICE), ServiceNode("auth-service", NodeType.SERVICE), ServiceNode("product-api", NodeType.SERVICE), ServiceNode("order-api", NodeType.SERVICE), ServiceNode("postgres-db", NodeType.DATABASE), ServiceNode("redis-cache", NodeType.CACHE), ] for node in nodes: graph.add_node(node) # 建立邊 (依賴關係) edges = [ DependencyEdge("ingress", "frontend", EdgeType.CALLS, is_critical=True), DependencyEdge("frontend", "auth-service", EdgeType.DEPENDS_ON, is_critical=True), DependencyEdge("frontend", "product-api", EdgeType.CALLS), DependencyEdge("frontend", "order-api", EdgeType.CALLS), DependencyEdge("auth-service", "postgres-db", EdgeType.READS_FROM, is_critical=True), DependencyEdge("product-api", "postgres-db", EdgeType.READS_FROM), DependencyEdge("order-api", "postgres-db", EdgeType.WRITES_TO, is_critical=True), DependencyEdge("order-api", "redis-cache", EdgeType.READS_FROM), ] for edge in edges: graph.add_edge(edge) logger.info(f"[GraphRAG] Mock topology created: {len(nodes)} nodes, {len(edges)} edges") return graph # 全域實例 (預載 Mock 資料) topology_graph = create_mock_topology()