- Fix 35 Python ruff errors (B904, F841, E722, E741, B007, B008) - Add eslint config for lewooogo-core package - Update pyproject.toml to new ruff lint config format - Relax frontend eslint rules to warnings for unused vars - Allow console.* for debugging (TODO: unified logger) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
361 lines
13 KiB
Python
361 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Tracer Bullet 2.0 - Full closed-loop test script
Phase 4: E2E Integration Test

Test flow:
1. Trigger a mock alert (Mock Alert)
2. GraphRAG analysis (Blast Radius + Root Cause)
3. Generate an ApprovalCard (Dry-Run)
4. Human approval (Multi-Sig)
5. Simulated MCP execution

How to run:
    cd apps/api
    python scripts/tracer_bullet_2.py
|
||
"""
|
||
|
||
import asyncio
import json
from datetime import datetime, timezone
|
||
|
||
# ==================== 模擬模組導入 ====================
|
||
|
||
# 實際運行時這些會從專案導入
|
||
# from src.services import (
|
||
# topology_graph, trust_engine, multi_sig_engine, dry_run_engine
|
||
# )
|
||
# from src.plugins.finops import idle_scanner
|
||
# from src.plugins.mcp import mcp_bridge
|
||
|
||
|
||
# ==================== Test Configuration ====================
|
||
|
||
|
||
class TracerBullet2:
|
||
"""全站閉環測試器"""
|
||
|
||
def __init__(self):
|
||
self.test_id = f"tb2-{datetime.utcnow().strftime('%Y%m%d%H%M%S')}"
|
||
self.results: list[dict] = []
|
||
|
||
def log(self, step: str, status: str, data: dict | None = None):
|
||
"""記錄測試結果"""
|
||
result = {
|
||
"step": step,
|
||
"status": status,
|
||
"timestamp": datetime.utcnow().isoformat(),
|
||
"data": data or {},
|
||
}
|
||
self.results.append(result)
|
||
emoji = "✅" if status == "PASS" else "❌" if status == "FAIL" else "🔄"
|
||
print(f"{emoji} [{step}] {status}")
|
||
if data:
|
||
print(f" {json.dumps(data, indent=2, default=str)}")
|
||
|
||
# ==================== Step 1: Mock Alert ====================
|
||
|
||
async def step1_trigger_alert(self) -> dict:
|
||
"""
|
||
Step 1: 觸發假告警
|
||
|
||
模擬 Prometheus AlertManager 發送告警:
|
||
- frontend 服務 5xx 錯誤率上升
|
||
"""
|
||
print("\n" + "=" * 60)
|
||
print("STEP 1: TRIGGER MOCK ALERT")
|
||
print("=" * 60)
|
||
|
||
alert = {
|
||
"alertname": "HighErrorRate",
|
||
"service": "frontend",
|
||
"namespace": "production",
|
||
"severity": "critical",
|
||
"error_rate": 15.2, # 15% 5xx
|
||
"threshold": 5.0,
|
||
"fired_at": datetime.utcnow().isoformat(),
|
||
}
|
||
|
||
self.log("trigger_alert", "PASS", alert)
|
||
return alert
|
||
|
||
# ==================== Step 2: GraphRAG Analysis ====================
|
||
|
||
async def step2_graphrag_analysis(self, alert: dict) -> dict:
|
||
"""
|
||
Step 2: GraphRAG 分析
|
||
|
||
呼叫 TopologyGraph.get_blast_radius_and_root_cause()
|
||
分析:
|
||
- Blast Radius: frontend 掛了誰會跟著掛
|
||
- Root Cause: frontend 的依賴誰目前有問題
|
||
"""
|
||
print("\n" + "=" * 60)
|
||
print("STEP 2: GRAPHRAG ANALYSIS")
|
||
print("=" * 60)
|
||
|
||
target_service = alert["service"]
|
||
|
||
# Mock GraphRAG 結果 (實際會呼叫 topology_graph)
|
||
analysis = {
|
||
"targetService": target_service,
|
||
"blastRadius": {
|
||
"affectedServices": ["ingress"],
|
||
"affectedCount": 1,
|
||
"criticalPath": ["ingress -> frontend"],
|
||
"impactSummary": "If 'frontend' goes down, 1 upstream service (ingress) will be affected.",
|
||
},
|
||
"rootCause": {
|
||
"unhealthyDependencies": ["postgres-db"],
|
||
"dependencyChain": ["auth-service", "product-api", "order-api", "postgres-db", "redis-cache"],
|
||
"probableRootCauses": ["postgres-db"],
|
||
"analysisSummary": "Found 1 unhealthy dependency: postgres-db. Probable root cause: postgres-db.",
|
||
},
|
||
"analyzedAt": datetime.utcnow().isoformat(),
|
||
}
|
||
|
||
# 視覺化輸出
|
||
print("\n[BLAST RADIUS - Upstream Impact]")
|
||
print(" ┌─────────────────────┐")
|
||
print(" │ ingress │")
|
||
print(" └─────────┬───────────┘")
|
||
print(" │ depends on")
|
||
print(" ▼")
|
||
print(" ┌─────────────────────┐")
|
||
print(" │ frontend │ X")
|
||
print(" └─────────────────────┘")
|
||
|
||
print("\n[ROOT CAUSE - Downstream Chain]")
|
||
print(" ┌─────────────────────┐")
|
||
print(" │ frontend │ !")
|
||
print(" └─────────┬───────────┘")
|
||
print(" │ calls")
|
||
print(" ▼")
|
||
print(" ┌─────────────────────┐")
|
||
print(" │ postgres-db │ X (UNHEALTHY)")
|
||
print(" └─────────────────────┘")
|
||
|
||
self.log("graphrag_analysis", "PASS", analysis)
|
||
return analysis
|
||
|
||
# ==================== Step 3: Dry-Run & ApprovalCard ====================
|
||
|
||
async def step3_generate_approval(self, analysis: dict) -> dict:
|
||
"""
|
||
Step 3: 產生 ApprovalCard
|
||
|
||
根據分析結果,建議重啟 postgres-db
|
||
執行 Dry-Run 檢查
|
||
"""
|
||
print("\n" + "=" * 60)
|
||
print("STEP 3: DRY-RUN & APPROVAL CARD")
|
||
print("=" * 60)
|
||
|
||
root_cause = analysis["rootCause"]["probableRootCauses"][0]
|
||
|
||
# 建議動作
|
||
proposed_action = {
|
||
"operation": "restart_pod",
|
||
"parameters": {
|
||
"pod_name": f"{root_cause}-0",
|
||
"namespace": "production",
|
||
"graceful": True,
|
||
},
|
||
"reason": f"Auto-suggested based on GraphRAG root cause analysis: {root_cause} is unhealthy",
|
||
}
|
||
|
||
# Mock Dry-Run 結果
|
||
dry_run_result = {
|
||
"checks": [
|
||
{"name": "RBAC Permission", "passed": True, "message": "User has pod/delete permission"},
|
||
{"name": "Syntax Validation", "passed": True, "message": "Parameters valid"},
|
||
{"name": "Resource Exists", "passed": True, "message": "Pod postgres-db-0 exists"},
|
||
{"name": "No PDB Violation", "passed": True, "message": "PodDisruptionBudget allows 1 eviction"},
|
||
],
|
||
"overallPassed": True,
|
||
"blastRadius": {
|
||
"affectedPods": 1,
|
||
"affectedServices": ["postgres-db"],
|
||
"dataImpact": "NONE", # Graceful restart
|
||
},
|
||
"riskLevel": "high", # Database 操作
|
||
}
|
||
|
||
# 產生 ApprovalCard
|
||
approval_card = {
|
||
"approvalId": f"approval-{self.test_id}",
|
||
"action": proposed_action,
|
||
"dryRunResult": dry_run_result,
|
||
"requiredSignatures": 2, # HIGH risk = 2-sig
|
||
"allowedRoles": ["admin", "devops", "sre"],
|
||
"createdAt": datetime.utcnow().isoformat(),
|
||
"expiresAt": None, # No expiry for critical ops
|
||
}
|
||
|
||
print("\n[APPROVAL CARD]")
|
||
print(f" Action: {proposed_action['operation']}")
|
||
print(f" Target: {proposed_action['parameters']['pod_name']}")
|
||
print(f" Risk Level: {dry_run_result['riskLevel'].upper()}")
|
||
print(f" Required Signatures: {approval_card['requiredSignatures']}")
|
||
print(f" Dry-Run: {'PASSED' if dry_run_result['overallPassed'] else 'FAILED'}")
|
||
|
||
self.log("generate_approval", "PASS", approval_card)
|
||
return approval_card
|
||
|
||
# ==================== Step 4: Multi-Sig Approval ====================
|
||
|
||
async def step4_multisig_approval(self, approval_card: dict) -> dict:
|
||
"""
|
||
Step 4: 人類批准 (Multi-Sig)
|
||
|
||
模擬兩位管理者簽名:
|
||
1. DevOps Engineer
|
||
2. SRE Lead
|
||
"""
|
||
print("\n" + "=" * 60)
|
||
print("STEP 4: MULTI-SIG APPROVAL")
|
||
print("=" * 60)
|
||
|
||
approval_id = approval_card["approvalId"]
|
||
|
||
# 第一位簽名
|
||
sig1 = {
|
||
"userId": "devops-alice",
|
||
"role": "devops",
|
||
"signedAt": datetime.utcnow().isoformat(),
|
||
"comment": "GraphRAG analysis looks correct. Approving restart.",
|
||
}
|
||
print(f"\n[SIGNATURE 1] {sig1['role']}: {sig1['userId']}")
|
||
print(f" Comment: {sig1['comment']}")
|
||
|
||
# 第二位簽名
|
||
sig2 = {
|
||
"userId": "sre-bob",
|
||
"role": "sre",
|
||
"signedAt": datetime.utcnow().isoformat(),
|
||
"comment": "Verified PDB. Safe to proceed.",
|
||
}
|
||
print(f"\n[SIGNATURE 2] {sig2['role']}: {sig2['userId']}")
|
||
print(f" Comment: {sig2['comment']}")
|
||
|
||
# 批准結果
|
||
approval_result = {
|
||
"approvalId": approval_id,
|
||
"status": "APPROVED",
|
||
"signatures": [sig1, sig2],
|
||
"approvedAt": datetime.utcnow().isoformat(),
|
||
}
|
||
|
||
print(f"\n[APPROVAL STATUS] {approval_result['status']}")
|
||
print(f" Signatures: {len(approval_result['signatures'])}/{approval_card['requiredSignatures']}")
|
||
|
||
self.log("multisig_approval", "PASS", approval_result)
|
||
return approval_result
|
||
|
||
# ==================== Step 5: MCP Execution ====================
|
||
|
||
async def step5_mcp_execution(self, approval_result: dict, approval_card: dict) -> dict:
|
||
"""
|
||
Step 5: MCP 模擬執行
|
||
|
||
透過 MCP Bridge 執行操作
|
||
(Phase 3 為模擬,Phase 4+ 連接真實 K8s)
|
||
"""
|
||
print("\n" + "=" * 60)
|
||
print("STEP 5: MCP EXECUTION")
|
||
print("=" * 60)
|
||
|
||
action = approval_card["action"]
|
||
|
||
# TOCTOU 保護: 再次執行 Dry-Run
|
||
print("\n[TOCTOU CHECK] Re-running dry-run before execution...")
|
||
toctou_passed = True # Mock
|
||
print(f" Result: {'PASSED' if toctou_passed else 'VOIDED'}")
|
||
|
||
if not toctou_passed:
|
||
self.log("mcp_execution", "FAIL", {"reason": "TOCTOU conflict detected"})
|
||
return {"status": "VOIDED"}
|
||
|
||
# MCP 執行
|
||
execution_result = {
|
||
"executionId": f"exec-{self.test_id}",
|
||
"operation": action["operation"],
|
||
"parameters": action["parameters"],
|
||
"status": "SUCCESS",
|
||
"output": {
|
||
"message": f"Pod {action['parameters']['pod_name']} restarted successfully",
|
||
"newPodName": "postgres-db-0", # Same name after restart
|
||
"restartTime": "2.3s",
|
||
},
|
||
"executedAt": datetime.utcnow().isoformat(),
|
||
}
|
||
|
||
print("\n[EXECUTION RESULT]")
|
||
print(f" Status: {execution_result['status']}")
|
||
print(f" Output: {execution_result['output']['message']}")
|
||
print(f" Restart Time: {execution_result['output']['restartTime']}")
|
||
|
||
# 更新 Trust Engine
|
||
print("\n[TRUST ENGINE] Recording approval for progressive autonomy...")
|
||
print(" Action Pattern: restart_pod:postgres-*")
|
||
print(" Trust Score: +1")
|
||
|
||
self.log("mcp_execution", "PASS", execution_result)
|
||
return execution_result
|
||
|
||
# ==================== Run All ====================
|
||
|
||
async def run(self):
|
||
"""執行完整測試流程"""
|
||
print("\n" + "=" * 60)
|
||
print("TRACER BULLET 2.0 - FULL LOOP TEST")
|
||
print(f"Test ID: {self.test_id}")
|
||
print("=" * 60)
|
||
|
||
try:
|
||
# Step 1: Trigger Alert
|
||
alert = await self.step1_trigger_alert()
|
||
|
||
# Step 2: GraphRAG Analysis
|
||
analysis = await self.step2_graphrag_analysis(alert)
|
||
|
||
# Step 3: Dry-Run & Approval Card
|
||
approval_card = await self.step3_generate_approval(analysis)
|
||
|
||
# Step 4: Multi-Sig Approval
|
||
approval_result = await self.step4_multisig_approval(approval_card)
|
||
|
||
# Step 5: MCP Execution
|
||
_execution_result = await self.step5_mcp_execution(approval_result, approval_card)
|
||
|
||
# Summary
|
||
print("\n" + "=" * 60)
|
||
print("TEST SUMMARY")
|
||
print("=" * 60)
|
||
|
||
passed = sum(1 for r in self.results if r["status"] == "PASS")
|
||
failed = sum(1 for r in self.results if r["status"] == "FAIL")
|
||
|
||
print(f" Total Steps: {len(self.results)}")
|
||
print(f" Passed: {passed}")
|
||
print(f" Failed: {failed}")
|
||
print(f" Status: {'ALL PASSED' if failed == 0 else 'SOME FAILED'}")
|
||
|
||
return {
|
||
"testId": self.test_id,
|
||
"status": "PASS" if failed == 0 else "FAIL",
|
||
"results": self.results,
|
||
}
|
||
|
||
except Exception as e:
|
||
self.log("unexpected_error", "FAIL", {"error": str(e)})
|
||
raise
|
||
|
||
|
||
# ==================== Main ====================
|
||
|
||
|
||
if __name__ == "__main__":
    # Local import: only the script entry point needs the process exit code.
    import sys

    tracer = TracerBullet2()
    outcome = asyncio.run(tracer.run())
    # Propagate a failed run to the shell so CI and callers can detect it;
    # previously the script exited 0 even when a stage logged FAIL.
    sys.exit(0 if outcome["status"] == "PASS" else 1)
|