Files
awoooi/apps/api/src/services/graph_rag.py
OG T 196d269b92 feat: add all application source code
- apps/api: FastAPI backend with Dockerfile
- apps/web: Next.js frontend with Dockerfile
- apps/sensor: Signal collection agent
- packages: shared packages

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-22 18:57:44 +08:00

488 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
GraphRAG - 知識圖譜引擎
Phase 3.4: 微服務依賴分析與根本原因追溯
核心功能:
1. TopologyGraph: 建構微服務依賴圖 (Dependency Graph)
2. Blast Radius Analysis: 某服務掛掉時,誰會跟著掛?(向上追溯)
3. Root Cause Analysis: 某服務報錯時,底層哪個依賴有問題?(向下追溯)
圖結構:
- Nodes: 微服務 (ingress, frontend, auth-service, postgres-db)
- Edges: 依賴關係 (frontend -> depends_on -> auth-service)
"""
import logging
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
logger = logging.getLogger(__name__)
# ==================== Types ====================
class NodeType(str, Enum):
"""節點類型"""
INGRESS = "ingress"
SERVICE = "service"
DATABASE = "database"
CACHE = "cache"
QUEUE = "queue"
EXTERNAL = "external"
class EdgeType(str, Enum):
"""邊的類型"""
DEPENDS_ON = "depends_on" # A depends_on B (A 依賴 B)
CALLS = "calls" # A calls B (同步呼叫)
PUBLISHES_TO = "publishes_to" # A publishes_to B (異步訊息)
READS_FROM = "reads_from" # A reads_from B (讀取資料)
WRITES_TO = "writes_to" # A writes_to B (寫入資料)
class HealthStatus(str, Enum):
"""健康狀態"""
HEALTHY = "healthy"
DEGRADED = "degraded"
UNHEALTHY = "unhealthy"
UNKNOWN = "unknown"
@dataclass
class ServiceNode:
"""服務節點"""
name: str
node_type: NodeType
namespace: str = "default"
health_status: HealthStatus = HealthStatus.HEALTHY
last_incident_at: datetime | None = None
incident_message: str | None = None
metadata: dict = field(default_factory=dict)
def to_dict(self) -> dict:
return {
"name": self.name,
"nodeType": self.node_type.value,
"namespace": self.namespace,
"healthStatus": self.health_status.value,
"lastIncidentAt": self.last_incident_at.isoformat() if self.last_incident_at else None,
"incidentMessage": self.incident_message,
"metadata": self.metadata,
}
@dataclass
class DependencyEdge:
"""依賴邊"""
source: str # 依賴方 (e.g., frontend)
target: str # 被依賴方 (e.g., auth-service)
edge_type: EdgeType
is_critical: bool = False # 是否為關鍵依賴 (掛了就整個掛)
latency_p99_ms: float | None = None
def to_dict(self) -> dict:
return {
"source": self.source,
"target": self.target,
"edgeType": self.edge_type.value,
"isCritical": self.is_critical,
"latencyP99Ms": self.latency_p99_ms,
}
@dataclass
class BlastRadiusResult:
"""爆炸半徑分析結果"""
target_service: str
affected_services: list[str] # 會受影響的上游服務
affected_count: int
critical_path: list[str] # 關鍵路徑 (全部是 critical edge)
impact_summary: str
def to_dict(self) -> dict:
return {
"targetService": self.target_service,
"affectedServices": self.affected_services,
"affectedCount": self.affected_count,
"criticalPath": self.critical_path,
"impactSummary": self.impact_summary,
}
@dataclass
class RootCauseResult:
"""根本原因分析結果"""
target_service: str
unhealthy_dependencies: list[ServiceNode] # 有問題的下游依賴
dependency_chain: list[str] # 依賴鏈
probable_root_causes: list[str] # 所有可能的根本原因 (不只一個!)
analysis_summary: str
def to_dict(self) -> dict:
return {
"targetService": self.target_service,
"unhealthyDependencies": [d.to_dict() for d in self.unhealthy_dependencies],
"dependencyChain": self.dependency_chain,
"probableRootCauses": self.probable_root_causes, # 陣列,非單一值
"analysisSummary": self.analysis_summary,
}
@dataclass
class FullAnalysisResult:
"""完整分析結果 (Blast Radius + Root Cause)"""
target_service: str
blast_radius: BlastRadiusResult
root_cause: RootCauseResult
analyzed_at: datetime
def to_dict(self) -> dict:
return {
"targetService": self.target_service,
"blastRadius": self.blast_radius.to_dict(),
"rootCause": self.root_cause.to_dict(),
"analyzedAt": self.analyzed_at.isoformat(),
}
# ==================== Topology Graph ====================
class TopologyGraph:
"""
微服務拓撲圖
用於理解服務間的依賴關係,支援:
1. 向上追溯 (Blast Radius): 某服務掛了,誰會受影響
2. 向下追溯 (Root Cause): 某服務報錯,底層誰有問題
"""
def __init__(self):
# In-memory storage (Phase 4+ 換成 Neo4j/ArangoDB)
self._nodes: dict[str, ServiceNode] = {}
self._edges: list[DependencyEdge] = []
# 索引: source -> [edges], target -> [edges]
self._outgoing: dict[str, list[DependencyEdge]] = {} # source -> edges (我依賴誰)
self._incoming: dict[str, list[DependencyEdge]] = {} # target -> edges (誰依賴我)
# ==================== Graph Construction ====================
def add_node(self, node: ServiceNode) -> None:
"""新增節點"""
self._nodes[node.name] = node
if node.name not in self._outgoing:
self._outgoing[node.name] = []
if node.name not in self._incoming:
self._incoming[node.name] = []
logger.debug(f"[GraphRAG] Node added: {node.name} ({node.node_type.value})")
def add_edge(self, edge: DependencyEdge) -> None:
"""新增邊"""
self._edges.append(edge)
# 更新索引
if edge.source not in self._outgoing:
self._outgoing[edge.source] = []
self._outgoing[edge.source].append(edge)
if edge.target not in self._incoming:
self._incoming[edge.target] = []
self._incoming[edge.target].append(edge)
logger.debug(
f"[GraphRAG] Edge added: {edge.source} --{edge.edge_type.value}--> {edge.target}"
f"{' [CRITICAL]' if edge.is_critical else ''}"
)
def get_node(self, name: str) -> ServiceNode | None:
"""取得節點"""
return self._nodes.get(name)
def update_health(
self,
service_name: str,
status: HealthStatus,
incident_message: str | None = None,
) -> None:
"""更新服務健康狀態"""
if service_name in self._nodes:
node = self._nodes[service_name]
node.health_status = status
if status != HealthStatus.HEALTHY:
node.last_incident_at = datetime.utcnow()
node.incident_message = incident_message
logger.info(f"[GraphRAG] Health updated: {service_name} -> {status.value}")
# ==================== Blast Radius Analysis (向上追溯) ====================
def get_blast_radius(
self,
target_service: str,
max_depth: int = 3,
) -> BlastRadiusResult:
"""
計算爆炸半徑 (Blast Radius)
向上追溯: 如果 target_service 掛了,哪些上游服務會跟著掛?
使用 BFS 從 target 往上找所有依賴它的服務
Args:
target_service: 目標服務
max_depth: 最大追溯深度 (預設 3避免大型叢集無限擴散)
"""
if target_service not in self._nodes:
return BlastRadiusResult(
target_service=target_service,
affected_services=[],
affected_count=0,
critical_path=[],
impact_summary=f"Service '{target_service}' not found in topology",
)
affected = []
critical_path = []
visited = {target_service}
# queue 改為 (node, depth) tuple
queue: list[tuple[str, int]] = [(target_service, 0)]
# BFS 向上追溯 (找誰依賴我)
while queue:
current, depth = queue.pop(0)
# ⚠️ 深度限制: 避免大型叢集無限擴散
if depth >= max_depth:
continue
# 找所有依賴 current 的服務 (incoming edges)
for edge in self._incoming.get(current, []):
if edge.source not in visited:
visited.add(edge.source)
affected.append(edge.source)
queue.append((edge.source, depth + 1))
# 記錄關鍵路徑
if edge.is_critical:
critical_path.append(f"{edge.source} -> {edge.target}")
# 產生摘要
if not affected:
summary = f"No upstream services depend on '{target_service}'. Blast radius is contained."
else:
summary = (
f"If '{target_service}' goes down, {len(affected)} upstream services will be affected: "
f"{', '.join(affected[:5])}{'...' if len(affected) > 5 else ''}. "
f"Critical dependencies: {len(critical_path)}."
)
return BlastRadiusResult(
target_service=target_service,
affected_services=affected,
affected_count=len(affected),
critical_path=critical_path,
impact_summary=summary,
)
# ==================== Root Cause Analysis (向下追溯) ====================
def get_root_cause(
self,
target_service: str,
max_depth: int = 3,
) -> RootCauseResult:
"""
根本原因分析 (Root Cause Analysis)
向下追溯: 如果 target_service 報錯,它依賴的底層服務誰目前有異常?
使用 BFS 從 target 往下找所有它依賴的服務,
然後過濾出目前 health != HEALTHY 的
Args:
target_service: 目標服務
max_depth: 最大追溯深度 (預設 3避免大型叢集無限擴散)
"""
if target_service not in self._nodes:
return RootCauseResult(
target_service=target_service,
unhealthy_dependencies=[],
dependency_chain=[],
probable_root_causes=[],
analysis_summary=f"Service '{target_service}' not found in topology",
)
all_dependencies = []
unhealthy = []
visited = {target_service}
# queue 改為 (node, depth) tuple
queue: list[tuple[str, int]] = [(target_service, 0)]
# BFS 向下追溯 (找我依賴誰)
while queue:
current, depth = queue.pop(0)
# ⚠️ 深度限制: 避免大型叢集無限擴散
if depth >= max_depth:
continue
# 找 current 依賴的所有服務 (outgoing edges)
for edge in self._outgoing.get(current, []):
if edge.target not in visited:
visited.add(edge.target)
all_dependencies.append(edge.target)
queue.append((edge.target, depth + 1))
# 檢查健康狀態
dep_node = self._nodes.get(edge.target)
if dep_node and dep_node.health_status != HealthStatus.HEALTHY:
unhealthy.append(dep_node)
# ╔════════════════════════════════════════════════════════════════╗
# ║ 收集所有可能的根本原因 (不只一個!) ║
# ║ 優先排序: DATABASE > CACHE > QUEUE > 其他 ║
# ║ ⚠️ 不使用 break收集全部異常節點 ║
# ╚════════════════════════════════════════════════════════════════╝
probable_roots: list[str] = []
priority_order = [NodeType.DATABASE, NodeType.CACHE, NodeType.QUEUE]
if unhealthy:
# 先加入高優先級節點 (DB/CACHE/QUEUE)
for priority_type in priority_order:
for node in unhealthy:
if node.node_type == priority_type and node.name not in probable_roots:
probable_roots.append(node.name)
# 再加入其他類型的異常節點
for node in unhealthy:
if node.name not in probable_roots:
probable_roots.append(node.name)
# 產生摘要
if not unhealthy:
summary = (
f"All {len(all_dependencies)} dependencies of '{target_service}' are healthy. "
"Issue might be within the service itself."
)
else:
unhealthy_names = [n.name for n in unhealthy]
summary = (
f"Found {len(unhealthy)} unhealthy dependencies for '{target_service}': "
f"{', '.join(unhealthy_names)}. "
f"Probable root causes: {', '.join(probable_roots)}."
)
return RootCauseResult(
target_service=target_service,
unhealthy_dependencies=unhealthy,
dependency_chain=all_dependencies,
probable_root_causes=probable_roots,
analysis_summary=summary,
)
# ==================== Combined Analysis ====================
def get_blast_radius_and_root_cause(
self,
target_service: str,
max_depth: int = 3,
) -> FullAnalysisResult:
"""
完整分析: Blast Radius + Root Cause
ClawBot 主要呼叫這個方法,一次取得:
1. 向上追溯: 誰會受影響
2. 向下追溯: 誰是根本原因
Args:
target_service: 目標服務
max_depth: 最大追溯深度 (預設 3)
"""
blast = self.get_blast_radius(target_service, max_depth)
root = self.get_root_cause(target_service, max_depth)
logger.info(
f"[GraphRAG] Full analysis for '{target_service}': "
f"blast_radius={blast.affected_count}, "
f"unhealthy_deps={len(root.unhealthy_dependencies)}"
)
return FullAnalysisResult(
target_service=target_service,
blast_radius=blast,
root_cause=root,
analyzed_at=datetime.utcnow(),
)
# ==================== Utilities ====================
def get_all_nodes(self) -> list[ServiceNode]:
"""取得所有節點"""
return list(self._nodes.values())
def get_all_edges(self) -> list[DependencyEdge]:
"""取得所有邊"""
return self._edges
def to_dict(self) -> dict:
"""輸出完整圖結構"""
return {
"nodes": [n.to_dict() for n in self._nodes.values()],
"edges": [e.to_dict() for e in self._edges],
"nodeCount": len(self._nodes),
"edgeCount": len(self._edges),
}
# ==================== Mock Data Factory ====================
def create_mock_topology() -> TopologyGraph:
"""
建立 Mock 拓撲圖 (Phase 3 用)
典型微服務架構:
ingress -> frontend -> auth-service -> postgres-db
\-> product-api -> postgres-db
\-> order-api -> postgres-db
\-> redis-cache
"""
graph = TopologyGraph()
# 建立節點
nodes = [
ServiceNode("ingress", NodeType.INGRESS),
ServiceNode("frontend", NodeType.SERVICE),
ServiceNode("auth-service", NodeType.SERVICE),
ServiceNode("product-api", NodeType.SERVICE),
ServiceNode("order-api", NodeType.SERVICE),
ServiceNode("postgres-db", NodeType.DATABASE),
ServiceNode("redis-cache", NodeType.CACHE),
]
for node in nodes:
graph.add_node(node)
# 建立邊 (依賴關係)
edges = [
DependencyEdge("ingress", "frontend", EdgeType.CALLS, is_critical=True),
DependencyEdge("frontend", "auth-service", EdgeType.DEPENDS_ON, is_critical=True),
DependencyEdge("frontend", "product-api", EdgeType.CALLS),
DependencyEdge("frontend", "order-api", EdgeType.CALLS),
DependencyEdge("auth-service", "postgres-db", EdgeType.READS_FROM, is_critical=True),
DependencyEdge("product-api", "postgres-db", EdgeType.READS_FROM),
DependencyEdge("order-api", "postgres-db", EdgeType.WRITES_TO, is_critical=True),
DependencyEdge("order-api", "redis-cache", EdgeType.READS_FROM),
]
for edge in edges:
graph.add_edge(edge)
logger.info(f"[GraphRAG] Mock topology created: {len(nodes)} nodes, {len(edges)} edges")
return graph
# 全域實例 (預載 Mock 資料)
topology_graph = create_mock_topology()