awoooi/apps/api/scripts/tracer_bullet_2.py

#!/usr/bin/env python3
"""
Tracer Bullet 2.0 - 全站閉環測試腳本
Phase 4: E2E Integration Test

測試流程:
1. 觸發假告警 (Mock Alert)
2. GraphRAG 分析 (Blast Radius + Root Cause)
3. 產生 ApprovalCard (Dry-Run)
4. 人類批准 (Multi-Sig)
5. MCP 模擬執行

執行方式:
  cd apps/api
  python scripts/tracer_bullet_2.py
"""

import asyncio
import json
import sys
from datetime import datetime, timezone

# ==================== 模擬模組導入 ====================

# 實際運行時這些會從專案導入
# from src.services import (
#     topology_graph, trust_engine, multi_sig_engine, dry_run_engine
# )
# from src.plugins.finops import idle_scanner
# from src.plugins.mcp import mcp_bridge


# ==================== Test Configuration ====================


class TracerBullet2:
    """全站閉環測試器"""

    def __init__(self):
        self.test_id = f"tb2-{datetime.utcnow().strftime('%Y%m%d%H%M%S')}"
        self.results: list[dict] = []

    def log(self, step: str, status: str, data: dict | None = None):
        """記錄測試結果"""
        result = {
            "step": step,
            "status": status,
            "timestamp": datetime.utcnow().isoformat(),
            "data": data or {},
        }
        self.results.append(result)
        emoji = "✅" if status == "PASS" else "❌" if status == "FAIL" else "🔄"
        print(f"{emoji} [{step}] {status}")
        if data:
            print(f"   {json.dumps(data, indent=2, default=str)}")

    # ==================== Step 1: Mock Alert ====================

    async def step1_trigger_alert(self) -> dict:
        """
        Step 1: 觸發假告警

        模擬 Prometheus AlertManager 發送告警:
        - frontend 服務 5xx 錯誤率上升
        """
        print("\n" + "=" * 60)
        print("STEP 1: TRIGGER MOCK ALERT")
        print("=" * 60)

        alert = {
            "alertname": "HighErrorRate",
            "service": "frontend",
            "namespace": "production",
            "severity": "critical",
            "error_rate": 15.2,  # 15% 5xx
            "threshold": 5.0,
            "fired_at": datetime.utcnow().isoformat(),
        }

        self.log("trigger_alert", "PASS", alert)
        return alert

    # ==================== Step 2: GraphRAG Analysis ====================

    async def step2_graphrag_analysis(self, alert: dict) -> dict:
        """
        Step 2: GraphRAG 分析

        呼叫 TopologyGraph.get_blast_radius_and_root_cause()
        分析:
        - Blast Radius: frontend 掛了誰會跟著掛
        - Root Cause: frontend 的依賴誰目前有問題
        """
        print("\n" + "=" * 60)
        print("STEP 2: GRAPHRAG ANALYSIS")
        print("=" * 60)

        target_service = alert["service"]

        # Mock GraphRAG 結果 (實際會呼叫 topology_graph)
        analysis = {
            "targetService": target_service,
            "blastRadius": {
                "affectedServices": ["ingress"],
                "affectedCount": 1,
                "criticalPath": ["ingress -> frontend"],
                "impactSummary": "If 'frontend' goes down, 1 upstream service (ingress) will be affected.",
            },
            "rootCause": {
                "unhealthyDependencies": ["postgres-db"],
                "dependencyChain": ["auth-service", "product-api", "order-api", "postgres-db", "redis-cache"],
                "probableRootCauses": ["postgres-db"],
                "analysisSummary": "Found 1 unhealthy dependency: postgres-db. Probable root cause: postgres-db.",
            },
            "analyzedAt": datetime.utcnow().isoformat(),
        }

        # 視覺化輸出
        print("\n[BLAST RADIUS - Upstream Impact]")
        print("    ┌─────────────────────┐")
        print("    │ ingress             │")
        print("    └─────────┬───────────┘")
        print("              │ depends on")
        print("              ▼")
        print("    ┌─────────────────────┐")
        print("    │ frontend            │ X")
        print("    └─────────────────────┘")

        print("\n[ROOT CAUSE - Downstream Chain]")
        print("    ┌─────────────────────┐")
        print("    │ frontend            │ !")
        print("    └─────────┬───────────┘")
        print("              │ calls")
        print("              ▼")
        print("    ┌─────────────────────┐")
        print("    │ postgres-db         │ X (UNHEALTHY)")
        print("    └─────────────────────┘")

        self.log("graphrag_analysis", "PASS", analysis)
        return analysis

    # ==================== Step 3: Dry-Run & ApprovalCard ====================

    async def step3_generate_approval(self, analysis: dict) -> dict:
        """
        Step 3: 產生 ApprovalCard

        根據分析結果，建議重啟 postgres-db
        執行 Dry-Run 檢查
        """
        print("\n" + "=" * 60)
        print("STEP 3: DRY-RUN & APPROVAL CARD")
        print("=" * 60)

        root_cause = analysis["rootCause"]["probableRootCauses"][0]

        # 建議動作
        proposed_action = {
            "operation": "restart_pod",
            "parameters": {
                "pod_name": f"{root_cause}-0",
                "namespace": "production",
                "graceful": True,
            },
            "reason": f"Auto-suggested based on GraphRAG root cause analysis: {root_cause} is unhealthy",
        }

        # Mock Dry-Run 結果
        dry_run_result = {
            "checks": [
                {"name": "RBAC Permission", "passed": True, "message": "User has pod/delete permission"},
                {"name": "Syntax Validation", "passed": True, "message": "Parameters valid"},
                {"name": "Resource Exists", "passed": True, "message": "Pod postgres-db-0 exists"},
                {"name": "No PDB Violation", "passed": True, "message": "PodDisruptionBudget allows 1 eviction"},
            ],
            "overallPassed": True,
            "blastRadius": {
                "affectedPods": 1,
                "affectedServices": ["postgres-db"],
                "dataImpact": "NONE",  # Graceful restart
            },
            "riskLevel": "high",  # Database 操作
        }

        # 產生 ApprovalCard
        approval_card = {
            "approvalId": f"approval-{self.test_id}",
            "action": proposed_action,
            "dryRunResult": dry_run_result,
            "requiredSignatures": 2,  # HIGH risk = 2-sig
            "allowedRoles": ["admin", "devops", "sre"],
            "createdAt": datetime.utcnow().isoformat(),
            "expiresAt": None,  # No expiry for critical ops
        }

        print("\n[APPROVAL CARD]")
        print(f"  Action: {proposed_action['operation']}")
        print(f"  Target: {proposed_action['parameters']['pod_name']}")
        print(f"  Risk Level: {dry_run_result['riskLevel'].upper()}")
        print(f"  Required Signatures: {approval_card['requiredSignatures']}")
        print(f"  Dry-Run: {'PASSED' if dry_run_result['overallPassed'] else 'FAILED'}")

        self.log("generate_approval", "PASS", approval_card)
        return approval_card

    # ==================== Step 4: Multi-Sig Approval ====================

    async def step4_multisig_approval(self, approval_card: dict) -> dict:
        """
        Step 4: 人類批准 (Multi-Sig)

        模擬兩位管理者簽名:
        1. DevOps Engineer
        2. SRE Lead
        """
        print("\n" + "=" * 60)
        print("STEP 4: MULTI-SIG APPROVAL")
        print("=" * 60)

        approval_id = approval_card["approvalId"]

        # 第一位簽名
        sig1 = {
            "userId": "devops-alice",
            "role": "devops",
            "signedAt": datetime.utcnow().isoformat(),
            "comment": "GraphRAG analysis looks correct. Approving restart.",
        }
        print(f"\n[SIGNATURE 1] {sig1['role']}: {sig1['userId']}")
        print(f"  Comment: {sig1['comment']}")

        # 第二位簽名
        sig2 = {
            "userId": "sre-bob",
            "role": "sre",
            "signedAt": datetime.utcnow().isoformat(),
            "comment": "Verified PDB. Safe to proceed.",
        }
        print(f"\n[SIGNATURE 2] {sig2['role']}: {sig2['userId']}")
        print(f"  Comment: {sig2['comment']}")

        # 批准結果
        approval_result = {
            "approvalId": approval_id,
            "status": "APPROVED",
            "signatures": [sig1, sig2],
            "approvedAt": datetime.utcnow().isoformat(),
        }

        print(f"\n[APPROVAL STATUS] {approval_result['status']}")
        print(f"  Signatures: {len(approval_result['signatures'])}/{approval_card['requiredSignatures']}")

        self.log("multisig_approval", "PASS", approval_result)
        return approval_result

    # ==================== Step 5: MCP Execution ====================

    async def step5_mcp_execution(self, approval_result: dict, approval_card: dict) -> dict:
        """
        Step 5: MCP 模擬執行

        透過 MCP Bridge 執行操作
        (Phase 3 為模擬，Phase 4+ 連接真實 K8s)
        """
        print("\n" + "=" * 60)
        print("STEP 5: MCP EXECUTION")
        print("=" * 60)

        action = approval_card["action"]

        # TOCTOU 保護: 再次執行 Dry-Run
        print("\n[TOCTOU CHECK] Re-running dry-run before execution...")
        toctou_passed = True  # Mock
        print(f"  Result: {'PASSED' if toctou_passed else 'VOIDED'}")

        if not toctou_passed:
            self.log("mcp_execution", "FAIL", {"reason": "TOCTOU conflict detected"})
            return {"status": "VOIDED"}

        # MCP 執行
        execution_result = {
            "executionId": f"exec-{self.test_id}",
            "operation": action["operation"],
            "parameters": action["parameters"],
            "status": "SUCCESS",
            "output": {
                "message": f"Pod {action['parameters']['pod_name']} restarted successfully",
                "newPodName": "postgres-db-0",  # Same name after restart
                "restartTime": "2.3s",
            },
            "executedAt": datetime.utcnow().isoformat(),
        }

        print("\n[EXECUTION RESULT]")
        print(f"  Status: {execution_result['status']}")
        print(f"  Output: {execution_result['output']['message']}")
        print(f"  Restart Time: {execution_result['output']['restartTime']}")

        # 更新 Trust Engine
        print("\n[TRUST ENGINE] Recording approval for progressive autonomy...")
        print("  Action Pattern: restart_pod:postgres-*")
        print("  Trust Score: +1")

        self.log("mcp_execution", "PASS", execution_result)
        return execution_result

    # ==================== Run All ====================

    async def run(self):
        """執行完整測試流程"""
        print("\n" + "=" * 60)
        print("TRACER BULLET 2.0 - FULL LOOP TEST")
        print(f"Test ID: {self.test_id}")
        print("=" * 60)

        try:
            # Step 1: Trigger Alert
            alert = await self.step1_trigger_alert()

            # Step 2: GraphRAG Analysis
            analysis = await self.step2_graphrag_analysis(alert)

            # Step 3: Dry-Run & Approval Card
            approval_card = await self.step3_generate_approval(analysis)

            # Step 4: Multi-Sig Approval
            approval_result = await self.step4_multisig_approval(approval_card)

            # Step 5: MCP Execution
            _execution_result = await self.step5_mcp_execution(approval_result, approval_card)

            # Summary
            print("\n" + "=" * 60)
            print("TEST SUMMARY")
            print("=" * 60)

            passed = sum(1 for r in self.results if r["status"] == "PASS")
            failed = sum(1 for r in self.results if r["status"] == "FAIL")

            print(f"  Total Steps: {len(self.results)}")
            print(f"  Passed: {passed}")
            print(f"  Failed: {failed}")
            print(f"  Status: {'ALL PASSED' if failed == 0 else 'SOME FAILED'}")

            return {
                "testId": self.test_id,
                "status": "PASS" if failed == 0 else "FAIL",
                "results": self.results,
            }

        except Exception as e:
            self.log("unexpected_error", "FAIL", {"error": str(e)})
            raise


# ==================== Main ====================


if __name__ == "__main__":
    # Script entry point: run the full loop once.
    tracer = TracerBullet2()
    summary = asyncio.run(tracer.run())
    # Propagate the outcome through the exit code so shells and CI can detect
    # a failed run; the summary was previously discarded and the exit was 0.
    sys.exit(0 if summary["status"] == "PASS" else 1)