#!/usr/bin/env python3 """ Tracer Bullet 2.0 - 全站閉環測試腳本 Phase 4: E2E Integration Test 測試流程: 1. 觸發假告警 (Mock Alert) 2. GraphRAG 分析 (Blast Radius + Root Cause) 3. 產生 ApprovalCard (Dry-Run) 4. 人類批准 (Multi-Sig) 5. MCP 模擬執行 執行方式: cd apps/api python scripts/tracer_bullet_2.py """ import asyncio import json from datetime import datetime # ==================== 模擬模組導入 ==================== # 實際運行時這些會從專案導入 # from src.services import ( # topology_graph, trust_engine, multi_sig_engine, dry_run_engine # ) # from src.plugins.finops import idle_scanner # from src.plugins.mcp import mcp_bridge # ==================== Test Configuration ==================== class TracerBullet2: """全站閉環測試器""" def __init__(self): self.test_id = f"tb2-{datetime.utcnow().strftime('%Y%m%d%H%M%S')}" self.results: list[dict] = [] def log(self, step: str, status: str, data: dict | None = None): """記錄測試結果""" result = { "step": step, "status": status, "timestamp": datetime.utcnow().isoformat(), "data": data or {}, } self.results.append(result) emoji = "✅" if status == "PASS" else "❌" if status == "FAIL" else "🔄" print(f"{emoji} [{step}] {status}") if data: print(f" {json.dumps(data, indent=2, default=str)}") # ==================== Step 1: Mock Alert ==================== async def step1_trigger_alert(self) -> dict: """ Step 1: 觸發假告警 模擬 Prometheus AlertManager 發送告警: - frontend 服務 5xx 錯誤率上升 """ print("\n" + "=" * 60) print("STEP 1: TRIGGER MOCK ALERT") print("=" * 60) alert = { "alertname": "HighErrorRate", "service": "frontend", "namespace": "production", "severity": "critical", "error_rate": 15.2, # 15% 5xx "threshold": 5.0, "fired_at": datetime.utcnow().isoformat(), } self.log("trigger_alert", "PASS", alert) return alert # ==================== Step 2: GraphRAG Analysis ==================== async def step2_graphrag_analysis(self, alert: dict) -> dict: """ Step 2: GraphRAG 分析 呼叫 TopologyGraph.get_blast_radius_and_root_cause() 分析: - Blast Radius: frontend 掛了誰會跟著掛 - Root Cause: frontend 的依賴誰目前有問題 """ print("\n" + "=" * 60) print("STEP 2: GRAPHRAG ANALYSIS") print("=" * 60) target_service = alert["service"] # Mock GraphRAG 結果 (實際會呼叫 topology_graph) analysis = { "targetService": target_service, "blastRadius": { "affectedServices": ["ingress"], "affectedCount": 1, "criticalPath": ["ingress -> frontend"], "impactSummary": "If 'frontend' goes down, 1 upstream service (ingress) will be affected.", }, "rootCause": { "unhealthyDependencies": ["postgres-db"], "dependencyChain": ["auth-service", "product-api", "order-api", "postgres-db", "redis-cache"], "probableRootCauses": ["postgres-db"], "analysisSummary": "Found 1 unhealthy dependency: postgres-db. Probable root cause: postgres-db.", }, "analyzedAt": datetime.utcnow().isoformat(), } # 視覺化輸出 print("\n[BLAST RADIUS - Upstream Impact]") print(" ┌─────────────────────┐") print(" │ ingress │") print(" └─────────┬───────────┘") print(" │ depends on") print(" ▼") print(" ┌─────────────────────┐") print(" │ frontend │ X") print(" └─────────────────────┘") print("\n[ROOT CAUSE - Downstream Chain]") print(" ┌─────────────────────┐") print(" │ frontend │ !") print(" └─────────┬───────────┘") print(" │ calls") print(" ▼") print(" ┌─────────────────────┐") print(" │ postgres-db │ X (UNHEALTHY)") print(" └─────────────────────┘") self.log("graphrag_analysis", "PASS", analysis) return analysis # ==================== Step 3: Dry-Run & ApprovalCard ==================== async def step3_generate_approval(self, analysis: dict) -> dict: """ Step 3: 產生 ApprovalCard 根據分析結果,建議重啟 postgres-db 執行 Dry-Run 檢查 """ print("\n" + "=" * 60) print("STEP 3: DRY-RUN & APPROVAL CARD") print("=" * 60) root_cause = analysis["rootCause"]["probableRootCauses"][0] # 建議動作 proposed_action = { "operation": "restart_pod", "parameters": { "pod_name": f"{root_cause}-0", "namespace": "production", "graceful": True, }, "reason": f"Auto-suggested based on GraphRAG root cause analysis: {root_cause} is unhealthy", } # Mock Dry-Run 結果 dry_run_result = { "checks": [ {"name": "RBAC Permission", "passed": True, "message": "User has pod/delete permission"}, {"name": "Syntax Validation", "passed": True, "message": "Parameters valid"}, {"name": "Resource Exists", "passed": True, "message": "Pod postgres-db-0 exists"}, {"name": "No PDB Violation", "passed": True, "message": "PodDisruptionBudget allows 1 eviction"}, ], "overallPassed": True, "blastRadius": { "affectedPods": 1, "affectedServices": ["postgres-db"], "dataImpact": "NONE", # Graceful restart }, "riskLevel": "high", # Database 操作 } # 產生 ApprovalCard approval_card = { "approvalId": f"approval-{self.test_id}", "action": proposed_action, "dryRunResult": dry_run_result, "requiredSignatures": 2, # HIGH risk = 2-sig "allowedRoles": ["admin", "devops", "sre"], "createdAt": datetime.utcnow().isoformat(), "expiresAt": None, # No expiry for critical ops } print("\n[APPROVAL CARD]") print(f" Action: {proposed_action['operation']}") print(f" Target: {proposed_action['parameters']['pod_name']}") print(f" Risk Level: {dry_run_result['riskLevel'].upper()}") print(f" Required Signatures: {approval_card['requiredSignatures']}") print(f" Dry-Run: {'PASSED' if dry_run_result['overallPassed'] else 'FAILED'}") self.log("generate_approval", "PASS", approval_card) return approval_card # ==================== Step 4: Multi-Sig Approval ==================== async def step4_multisig_approval(self, approval_card: dict) -> dict: """ Step 4: 人類批准 (Multi-Sig) 模擬兩位管理者簽名: 1. DevOps Engineer 2. SRE Lead """ print("\n" + "=" * 60) print("STEP 4: MULTI-SIG APPROVAL") print("=" * 60) approval_id = approval_card["approvalId"] # 第一位簽名 sig1 = { "userId": "devops-alice", "role": "devops", "signedAt": datetime.utcnow().isoformat(), "comment": "GraphRAG analysis looks correct. Approving restart.", } print(f"\n[SIGNATURE 1] {sig1['role']}: {sig1['userId']}") print(f" Comment: {sig1['comment']}") # 第二位簽名 sig2 = { "userId": "sre-bob", "role": "sre", "signedAt": datetime.utcnow().isoformat(), "comment": "Verified PDB. Safe to proceed.", } print(f"\n[SIGNATURE 2] {sig2['role']}: {sig2['userId']}") print(f" Comment: {sig2['comment']}") # 批准結果 approval_result = { "approvalId": approval_id, "status": "APPROVED", "signatures": [sig1, sig2], "approvedAt": datetime.utcnow().isoformat(), } print(f"\n[APPROVAL STATUS] {approval_result['status']}") print(f" Signatures: {len(approval_result['signatures'])}/{approval_card['requiredSignatures']}") self.log("multisig_approval", "PASS", approval_result) return approval_result # ==================== Step 5: MCP Execution ==================== async def step5_mcp_execution(self, approval_result: dict, approval_card: dict) -> dict: """ Step 5: MCP 模擬執行 透過 MCP Bridge 執行操作 (Phase 3 為模擬,Phase 4+ 連接真實 K8s) """ print("\n" + "=" * 60) print("STEP 5: MCP EXECUTION") print("=" * 60) action = approval_card["action"] # TOCTOU 保護: 再次執行 Dry-Run print("\n[TOCTOU CHECK] Re-running dry-run before execution...") toctou_passed = True # Mock print(f" Result: {'PASSED' if toctou_passed else 'VOIDED'}") if not toctou_passed: self.log("mcp_execution", "FAIL", {"reason": "TOCTOU conflict detected"}) return {"status": "VOIDED"} # MCP 執行 execution_result = { "executionId": f"exec-{self.test_id}", "operation": action["operation"], "parameters": action["parameters"], "status": "SUCCESS", "output": { "message": f"Pod {action['parameters']['pod_name']} restarted successfully", "newPodName": "postgres-db-0", # Same name after restart "restartTime": "2.3s", }, "executedAt": datetime.utcnow().isoformat(), } print(f"\n[EXECUTION RESULT]") print(f" Status: {execution_result['status']}") print(f" Output: {execution_result['output']['message']}") print(f" Restart Time: {execution_result['output']['restartTime']}") # 更新 Trust Engine print("\n[TRUST ENGINE] Recording approval for progressive autonomy...") print(" Action Pattern: restart_pod:postgres-*") print(" Trust Score: +1") self.log("mcp_execution", "PASS", execution_result) return execution_result # ==================== Run All ==================== async def run(self): """執行完整測試流程""" print("\n" + "=" * 60) print("TRACER BULLET 2.0 - FULL LOOP TEST") print(f"Test ID: {self.test_id}") print("=" * 60) try: # Step 1: Trigger Alert alert = await self.step1_trigger_alert() # Step 2: GraphRAG Analysis analysis = await self.step2_graphrag_analysis(alert) # Step 3: Dry-Run & Approval Card approval_card = await self.step3_generate_approval(analysis) # Step 4: Multi-Sig Approval approval_result = await self.step4_multisig_approval(approval_card) # Step 5: MCP Execution execution_result = await self.step5_mcp_execution(approval_result, approval_card) # Summary print("\n" + "=" * 60) print("TEST SUMMARY") print("=" * 60) passed = sum(1 for r in self.results if r["status"] == "PASS") failed = sum(1 for r in self.results if r["status"] == "FAIL") print(f" Total Steps: {len(self.results)}") print(f" Passed: {passed}") print(f" Failed: {failed}") print(f" Status: {'ALL PASSED' if failed == 0 else 'SOME FAILED'}") return { "testId": self.test_id, "status": "PASS" if failed == 0 else "FAIL", "results": self.results, } except Exception as e: self.log("unexpected_error", "FAIL", {"error": str(e)}) raise # ==================== Main ==================== if __name__ == "__main__": tracer = TracerBullet2() asyncio.run(tracer.run())