- 刪除舊版 clawbot.py (已有新版 openclaw.py) - 更新 models/ai.py 類型定義 (ClawBotAnalysisRequest/Response) - 更新 api/v1/ai.py import 與註解 - 更新 Discord username - 更新所有註解與文檔 依據: feedback_openclaw_naming.md (統帥 2026-03-20 正式命名決議) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
488 lines
17 KiB
Python
488 lines
17 KiB
Python
"""
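GraphRAG - Knowledge graph engine
Phase 3.4: Microservice dependency analysis and root-cause tracing

Core features:
1. TopologyGraph: builds the microservice dependency graph
2. Blast Radius Analysis: when a service goes down, which services go down
   with it? (upstream traversal)
3. Root Cause Analysis: when a service reports errors, which underlying
   dependency is at fault? (downstream traversal)

Graph structure:
- Nodes: microservices (ingress, frontend, auth-service, postgres-db)
- Edges: dependency relations (frontend -> depends_on -> auth-service)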
"""
|
||
|
||
import logging
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
|
||
|
||
# Module-level logger; the graph classes and the mock factory below log through it.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# ==================== Types ====================
|
||
|
||
|
||
class NodeType(str, Enum):
    """Kind of component a topology node represents.

    Members are also ``str`` instances, so they compare equal to their raw
    string value.
    """

    INGRESS = "ingress"
    SERVICE = "service"
    DATABASE = "database"
    CACHE = "cache"
    QUEUE = "queue"
    EXTERNAL = "external"
|
||
|
||
|
||
class EdgeType(str, Enum):
    """Kind of relationship a dependency edge expresses.

    Members are also ``str`` instances, so they compare equal to their raw
    string value.
    """

    DEPENDS_ON = "depends_on"  # A depends_on B (A depends on B)
    CALLS = "calls"  # A calls B (synchronous call)
    PUBLISHES_TO = "publishes_to"  # A publishes_to B (asynchronous message)
    READS_FROM = "reads_from"  # A reads_from B (reads data)
    WRITES_TO = "writes_to"  # A writes_to B (writes data)
|
||
|
||
|
||
class HealthStatus(str, Enum):
    """Health state recorded on a service node.

    Members are also ``str`` instances, so they compare equal to their raw
    string value.
    """

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
    UNKNOWN = "unknown"
|
||
|
||
|
||
@dataclass
class ServiceNode:
    """A node in the topology graph (a service or a backing resource)."""

    name: str  # unique key of the node inside the graph
    node_type: NodeType
    namespace: str = "default"
    health_status: HealthStatus = HealthStatus.HEALTHY
    # Timestamp and message of the most recent non-healthy status update.
    last_incident_at: datetime | None = None
    incident_message: str | None = None
    metadata: dict = field(default_factory=dict)  # free-form extra attributes

    def to_dict(self) -> dict:
        """Serialize the node to a dict with camelCase keys.

        Enum fields are emitted as their string values and
        ``last_incident_at`` as an ISO-8601 string (``None`` when unset).
        ``metadata`` is returned by reference, not copied.
        """
        incident_at = (
            self.last_incident_at.isoformat()
            if self.last_incident_at is not None
            else None
        )
        return {
            "name": self.name,
            "nodeType": self.node_type.value,
            "namespace": self.namespace,
            "healthStatus": self.health_status.value,
            "lastIncidentAt": incident_at,
            "incidentMessage": self.incident_message,
            "metadata": self.metadata,
        }
|
||
|
||
|
||
@dataclass
class DependencyEdge:
    """A directed dependency edge: ``source`` depends on ``target``."""

    source: str  # the dependent side (e.g., frontend)
    target: str  # the side being depended on (e.g., auth-service)
    edge_type: EdgeType
    is_critical: bool = False  # critical dependency: if it fails, the source fails entirely
    latency_p99_ms: float | None = None

    def to_dict(self) -> dict:
        """Serialize the edge to a dict with camelCase keys.

        ``edge_type`` is emitted as its string value.
        """
        return {
            "source": self.source,
            "target": self.target,
            "edgeType": self.edge_type.value,
            "isCritical": self.is_critical,
            "latencyP99Ms": self.latency_p99_ms,
        }
|
||
|
||
|
||
@dataclass
class BlastRadiusResult:
    """Result of a blast-radius (upstream) analysis."""

    target_service: str
    affected_services: list[str]  # upstream services that would be affected
    affected_count: int
    critical_path: list[str]  # "source -> target" labels; every entry is a critical edge
    impact_summary: str

    def to_dict(self) -> dict:
        """Serialize the result to a dict with camelCase keys.

        List fields are returned by reference, not copied.
        """
        return {
            "targetService": self.target_service,
            "affectedServices": self.affected_services,
            "affectedCount": self.affected_count,
            "criticalPath": self.critical_path,
            "impactSummary": self.impact_summary,
        }
|
||
|
||
|
||
@dataclass
class RootCauseResult:
    """Result of a root-cause (downstream) analysis."""

    target_service: str
    unhealthy_dependencies: list[ServiceNode]  # downstream dependencies that are not healthy
    dependency_chain: list[str]  # names of the downstream dependencies that were traversed
    probable_root_causes: list[str]  # every probable root cause (there can be more than one)
    analysis_summary: str

    def to_dict(self) -> dict:
        """Serialize the result to a dict with camelCase keys.

        Each unhealthy dependency is serialized through its own ``to_dict``;
        ``probableRootCauses`` is always a list, never a single value.
        """
        return {
            "targetService": self.target_service,
            "unhealthyDependencies": [
                dep.to_dict() for dep in self.unhealthy_dependencies
            ],
            "dependencyChain": self.dependency_chain,
            "probableRootCauses": self.probable_root_causes,
            "analysisSummary": self.analysis_summary,
        }
|
||
|
||
|
||
@dataclass
class FullAnalysisResult:
    """Combined analysis result (blast radius + root cause)."""

    target_service: str
    blast_radius: BlastRadiusResult
    root_cause: RootCauseResult
    analyzed_at: datetime

    def to_dict(self) -> dict:
        """Serialize the result to a dict with camelCase keys.

        Nested results are serialized through their own ``to_dict`` and
        ``analyzed_at`` is emitted as an ISO-8601 string.
        """
        return {
            "targetService": self.target_service,
            "blastRadius": self.blast_radius.to_dict(),
            "rootCause": self.root_cause.to_dict(),
            "analyzedAt": self.analyzed_at.isoformat(),
        }
|
||
|
||
|
||
# ==================== Topology Graph ====================
|
||
|
||
|
||
class TopologyGraph:
    """In-memory dependency graph of microservices.

    Supports two traversals over the dependency edges:

    1. Upstream (blast radius): if a service goes down, which services that
       depend on it are affected.
    2. Downstream (root cause): if a service reports errors, which of the
       services it depends on are currently not healthy.
    """

    # Node types listed first, in this order, among probable root causes;
    # every other node type follows in discovery order.
    _ROOT_CAUSE_PRIORITY = (NodeType.DATABASE, NodeType.CACHE, NodeType.QUEUE)

    def __init__(self) -> None:
        """Create an empty graph."""
        # In-memory storage (to be replaced by Neo4j/ArangoDB in Phase 4+).
        self._nodes: dict[str, ServiceNode] = {}
        self._edges: list[DependencyEdge] = []

        # Adjacency indexes keyed by node name.
        self._outgoing: dict[str, list[DependencyEdge]] = {}  # source -> edges (what I depend on)
        self._incoming: dict[str, list[DependencyEdge]] = {}  # target -> edges (who depends on me)

    @staticmethod
    def _utc_now() -> datetime:
        """Return the current UTC time as a naive ``datetime``.

        Stands in for the deprecated ``datetime.utcnow()`` while keeping its
        naive (tzinfo-less) result, so ``isoformat()`` output is unchanged.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    @classmethod
    def _root_cause_rank(cls, node: ServiceNode) -> int:
        """Return the sort rank of ``node`` among probable root causes."""
        try:
            return cls._ROOT_CAUSE_PRIORITY.index(node.node_type)
        except ValueError:
            return len(cls._ROOT_CAUSE_PRIORITY)

    # ==================== Graph Construction ====================

    def add_node(self, node: ServiceNode) -> None:
        """Register ``node`` under its name, replacing any node of that name.

        Edges already indexed for that name are kept.
        """
        self._nodes[node.name] = node
        self._outgoing.setdefault(node.name, [])
        self._incoming.setdefault(node.name, [])
        logger.debug(
            "[GraphRAG] Node added: %s (%s)", node.name, node.node_type.value
        )

    def add_edge(self, edge: DependencyEdge) -> None:
        """Append ``edge`` and index it by source and by target.

        The endpoints do not have to be registered nodes, and duplicate
        edges are not filtered out.
        """
        self._edges.append(edge)
        self._outgoing.setdefault(edge.source, []).append(edge)
        self._incoming.setdefault(edge.target, []).append(edge)
        logger.debug(
            "[GraphRAG] Edge added: %s --%s--> %s%s",
            edge.source,
            edge.edge_type.value,
            edge.target,
            " [CRITICAL]" if edge.is_critical else "",
        )

    def get_node(self, name: str) -> ServiceNode | None:
        """Return the node registered under ``name``, or ``None``."""
        return self._nodes.get(name)

    def update_health(
        self,
        service_name: str,
        status: HealthStatus,
        incident_message: str | None = None,
    ) -> None:
        """Set the health status of a registered service.

        For any status other than ``HEALTHY`` the node's ``last_incident_at``
        is set to the current UTC time and ``incident_message`` is
        overwritten (possibly with ``None``). A ``HEALTHY`` update leaves the
        previously recorded incident fields untouched.

        An unknown ``service_name`` changes nothing; a warning is logged so
        the dropped update is visible.
        """
        node = self._nodes.get(service_name)
        if node is None:
            logger.warning(
                "[GraphRAG] Health update ignored: unknown service '%s'",
                service_name,
            )
            return

        node.health_status = status
        if status != HealthStatus.HEALTHY:
            node.last_incident_at = self._utc_now()
            node.incident_message = incident_message
        logger.info(
            "[GraphRAG] Health updated: %s -> %s", service_name, status.value
        )

    # ==================== Blast Radius Analysis (upstream) ====================

    def get_blast_radius(
        self,
        target_service: str,
        max_depth: int = 3,
    ) -> BlastRadiusResult:
        """Compute which upstream services are affected if the target fails.

        Runs a breadth-first search over incoming edges, starting at
        ``target_service``, collecting every service that depends on it
        directly or transitively.

        Args:
            target_service: Name of the service assumed to go down.
            max_depth: Maximum number of hops to follow (default 3, to keep
                the search bounded on large clusters). Nodes reached at this
                depth are reported but not expanded further.

        Returns:
            A ``BlastRadiusResult``. ``affected_services`` is in discovery
            order. ``critical_path`` holds a ``"source -> target"`` label for
            each critical edge examined, de-duplicated, in traversal order.
            If the service is not a registered node, the result is empty and
            the summary says so.
        """
        if target_service not in self._nodes:
            return BlastRadiusResult(
                target_service=target_service,
                affected_services=[],
                affected_count=0,
                critical_path=[],
                impact_summary=f"Service '{target_service}' not found in topology",
            )

        affected: list[str] = []
        critical_path: list[str] = []
        seen_critical: set[str] = set()
        visited = {target_service}
        # Queue entries are (node name, depth); deque gives O(1) pops.
        queue: deque[tuple[str, int]] = deque([(target_service, 0)])

        while queue:
            current, depth = queue.popleft()

            # Depth limit: do not expand nodes at or beyond max_depth.
            if depth >= max_depth:
                continue

            # Every incoming edge is a service that depends on `current`.
            for edge in self._incoming.get(current, []):
                # Record critical edges independently of discovery, so an
                # edge is not lost when its source was first reached through
                # a different edge. Edges leaving the failing service itself
                # are skipped: it is the origin, not an affected upstream.
                if edge.is_critical and edge.source != target_service:
                    label = f"{edge.source} -> {edge.target}"
                    if label not in seen_critical:
                        seen_critical.add(label)
                        critical_path.append(label)

                if edge.source not in visited:
                    visited.add(edge.source)
                    affected.append(edge.source)
                    queue.append((edge.source, depth + 1))

        if not affected:
            summary = f"No upstream services depend on '{target_service}'. Blast radius is contained."
        else:
            summary = (
                f"If '{target_service}' goes down, {len(affected)} upstream services will be affected: "
                f"{', '.join(affected[:5])}{'...' if len(affected) > 5 else ''}. "
                f"Critical dependencies: {len(critical_path)}."
            )

        return BlastRadiusResult(
            target_service=target_service,
            affected_services=affected,
            affected_count=len(affected),
            critical_path=critical_path,
            impact_summary=summary,
        )

    # ==================== Root Cause Analysis (downstream) ====================

    def get_root_cause(
        self,
        target_service: str,
        max_depth: int = 3,
    ) -> RootCauseResult:
        """Find the downstream dependencies that are currently not healthy.

        Runs a breadth-first search over outgoing edges, starting at
        ``target_service``, and keeps every reached dependency whose health
        status is anything other than ``HEALTHY``.

        Args:
            target_service: Name of the service that is reporting errors.
            max_depth: Maximum number of hops to follow (default 3, to keep
                the search bounded on large clusters). Nodes reached at this
                depth are reported but not expanded further.

        Returns:
            A ``RootCauseResult``. ``dependency_chain`` lists every reached
            dependency in discovery order. ``probable_root_causes`` lists all
            unhealthy dependencies: databases first, then caches, then
            queues, then everything else, each group in discovery order.
            If the service is not a registered node, the result is empty and
            the summary says so.
        """
        if target_service not in self._nodes:
            return RootCauseResult(
                target_service=target_service,
                unhealthy_dependencies=[],
                dependency_chain=[],
                probable_root_causes=[],
                analysis_summary=f"Service '{target_service}' not found in topology",
            )

        all_dependencies: list[str] = []
        unhealthy: list[ServiceNode] = []
        visited = {target_service}
        # Queue entries are (node name, depth); deque gives O(1) pops.
        queue: deque[tuple[str, int]] = deque([(target_service, 0)])

        while queue:
            current, depth = queue.popleft()

            # Depth limit: do not expand nodes at or beyond max_depth.
            if depth >= max_depth:
                continue

            # Every outgoing edge is a service that `current` depends on.
            for edge in self._outgoing.get(current, []):
                if edge.target in visited:
                    continue
                visited.add(edge.target)
                all_dependencies.append(edge.target)
                queue.append((edge.target, depth + 1))

                # Edge targets that were never registered as nodes carry no
                # health information and are skipped here.
                dep_node = self._nodes.get(edge.target)
                if dep_node is not None and dep_node.health_status != HealthStatus.HEALTHY:
                    unhealthy.append(dep_node)

        # Collect every unhealthy node as a probable root cause, not just the
        # first one. sorted() is stable, so discovery order is preserved
        # inside each priority group.
        probable_roots = [
            node.name for node in sorted(unhealthy, key=self._root_cause_rank)
        ]

        if not unhealthy:
            summary = (
                f"All {len(all_dependencies)} dependencies of '{target_service}' are healthy. "
                "Issue might be within the service itself."
            )
        else:
            unhealthy_names = [node.name for node in unhealthy]
            summary = (
                f"Found {len(unhealthy)} unhealthy dependencies for '{target_service}': "
                f"{', '.join(unhealthy_names)}. "
                f"Probable root causes: {', '.join(probable_roots)}."
            )

        return RootCauseResult(
            target_service=target_service,
            unhealthy_dependencies=unhealthy,
            dependency_chain=all_dependencies,
            probable_root_causes=probable_roots,
            analysis_summary=summary,
        )

    # ==================== Combined Analysis ====================

    def get_blast_radius_and_root_cause(
        self,
        target_service: str,
        max_depth: int = 3,
    ) -> FullAnalysisResult:
        """Run the blast-radius and root-cause analyses together.

        This is the main entry point used by OpenClaw; one call returns:

        1. Upstream traversal: who would be affected.
        2. Downstream traversal: who is the probable root cause.

        Args:
            target_service: Name of the service to analyze.
            max_depth: Maximum traversal depth for both analyses (default 3).

        Returns:
            A ``FullAnalysisResult`` stamped with the current UTC time
            (naive ``datetime``).
        """
        blast = self.get_blast_radius(target_service, max_depth)
        root = self.get_root_cause(target_service, max_depth)

        logger.info(
            "[GraphRAG] Full analysis for '%s': blast_radius=%s, unhealthy_deps=%s",
            target_service,
            blast.affected_count,
            len(root.unhealthy_dependencies),
        )

        return FullAnalysisResult(
            target_service=target_service,
            blast_radius=blast,
            root_cause=root,
            analyzed_at=self._utc_now(),
        )

    # ==================== Utilities ====================

    def get_all_nodes(self) -> list[ServiceNode]:
        """Return a new list containing every registered node."""
        return list(self._nodes.values())

    def get_all_edges(self) -> list[DependencyEdge]:
        """Return a new list containing every edge, in insertion order.

        A copy is returned so callers cannot desynchronize the adjacency
        indexes by mutating the internal edge list.
        """
        return list(self._edges)

    def to_dict(self) -> dict:
        """Serialize the whole graph: nodes, edges and their counts."""
        return {
            "nodes": [node.to_dict() for node in self._nodes.values()],
            "edges": [edge.to_dict() for edge in self._edges],
            "nodeCount": len(self._nodes),
            "edgeCount": len(self._edges),
        }
|
||
|
||
|
||
# ==================== Mock Data Factory ====================
|
||
|
||
|
||
def create_mock_topology() -> TopologyGraph:
    r"""
    Build the mock topology graph (used during Phase 3).

    A typical microservice architecture:

        ingress -> frontend -> auth-service -> postgres-db
                           \-> product-api -> postgres-db
                           \-> order-api -> postgres-db
                                         \-> redis-cache

    Returns:
        A new ``TopologyGraph`` holding 7 nodes and 8 dependency edges; all
        nodes start with the default (healthy) status.
    """
    graph = TopologyGraph()

    # Nodes
    nodes = [
        ServiceNode("ingress", NodeType.INGRESS),
        ServiceNode("frontend", NodeType.SERVICE),
        ServiceNode("auth-service", NodeType.SERVICE),
        ServiceNode("product-api", NodeType.SERVICE),
        ServiceNode("order-api", NodeType.SERVICE),
        ServiceNode("postgres-db", NodeType.DATABASE),
        ServiceNode("redis-cache", NodeType.CACHE),
    ]
    for node in nodes:
        graph.add_node(node)

    # Edges (dependency relations)
    edges = [
        DependencyEdge("ingress", "frontend", EdgeType.CALLS, is_critical=True),
        DependencyEdge("frontend", "auth-service", EdgeType.DEPENDS_ON, is_critical=True),
        DependencyEdge("frontend", "product-api", EdgeType.CALLS),
        DependencyEdge("frontend", "order-api", EdgeType.CALLS),
        DependencyEdge("auth-service", "postgres-db", EdgeType.READS_FROM, is_critical=True),
        DependencyEdge("product-api", "postgres-db", EdgeType.READS_FROM),
        DependencyEdge("order-api", "postgres-db", EdgeType.WRITES_TO, is_critical=True),
        DependencyEdge("order-api", "redis-cache", EdgeType.READS_FROM),
    ]
    for edge in edges:
        graph.add_edge(edge)

    logger.info(
        "[GraphRAG] Mock topology created: %s nodes, %s edges",
        len(nodes),
        len(edges),
    )

    return graph
|
||
|
||
|
||
# Module-level instance, preloaded with the mock topology at import time.
topology_graph = create_mock_topology()
|