feat: add all application source code

- apps/api: FastAPI backend with Dockerfile
- apps/web: Next.js frontend with Dockerfile
- apps/sensor: Signal collection agent
- packages: shared packages

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-22 18:57:44 +08:00
parent a840bf975b
commit 196d269b92
245 changed files with 42207 additions and 6 deletions

View File

@@ -0,0 +1,198 @@
#!/bin/bash
# =============================================================================
# Prometheus Alertmanager 自動對接腳本
# =============================================================================
# Phase 5: Shadow Mode - 自動化環境對接
#
# 功能:
# 1. 建立 Alertmanager ConfigMap
# 2. 套用至 K3s 叢集
# 3. 自動重載 Alertmanager
#
# 使用方式:
# ./scripts/apply_prometheus_config.sh
#
# 前提條件:
# - kubectl 已配置並可連線至 K3s (192.168.0.120)
# - 有權限操作 monitoring namespace
#
# Tier 2 授權: 此腳本會變更 K3s 環境,需統帥授權
# =============================================================================
set -euo pipefail
# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# All settings are constants for the lifetime of the script, hence `readonly`.
readonly NAMESPACE="monitoring"
readonly CONFIGMAP_NAME="alertmanager-awoooi-webhook"
# The webhook target can be overridden from the environment (for example when
# the API runs on another host); when unset, the original address is used.
readonly AWOOOI_WEBHOOK_URL="${AWOOOI_WEBHOOK_URL:-http://192.168.0.188:8000/api/v1/webhooks/alerts}"
# Honour $KUBECONFIG when set, otherwise fall back to the repo-local file.
readonly KUBECONFIG_PATH="${KUBECONFIG:-./k3s-prod.yaml}"
# ANSI colour escapes used by the log_* helpers.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color
# -----------------------------------------------------------------------------
# Functions
# -----------------------------------------------------------------------------
# Print an informational message on stdout, prefixed with a green "[INFO]" tag.
# Arguments: $1 - message text
log_info() {
  local line="${GREEN}[INFO]${NC} $1"
  printf '%b\n' "$line"
}
# Print a warning on stderr, prefixed with a yellow "[WARN]" tag.
# Warnings go to stderr so they never mix with data written to stdout.
# Arguments: $1 - message text (printed literally; backslashes are not
#                 interpreted)
log_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${NC:-}" "$1" >&2
}
# Print an error on stderr, prefixed with a red "[ERROR]" tag.
# Errors go to stderr so they never mix with data written to stdout.
# Arguments: $1 - message text (printed literally; backslashes are not
#                 interpreted)
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "$1" >&2
}
# Verify that the script can talk to the cluster before changing anything.
# Globals:  KUBECONFIG_PATH (read)
# Exits with status 1 (after logging an error) when kubectl is not on PATH,
# the kubeconfig file does not exist, or `kubectl cluster-info` fails.
check_prerequisites() {
  log_info "檢查前提條件..."

  # kubectl must be resolvable as a command.
  command -v kubectl > /dev/null 2>&1 || {
    log_error "kubectl 未安裝"
    exit 1
  }

  # The kubeconfig must exist as a regular file.
  [[ -f "$KUBECONFIG_PATH" ]] || {
    log_error "找不到 kubeconfig: $KUBECONFIG_PATH"
    exit 1
  }

  # The cluster must answer a basic request.
  kubectl --kubeconfig="$KUBECONFIG_PATH" cluster-info > /dev/null 2>&1 || {
    log_error "無法連線至 K3s 叢集"
    exit 1
  }

  log_info "前提條件檢查通過"
}
# Make sure the target namespace exists, creating it when it is missing.
# Globals: NAMESPACE (read), KUBECONFIG_PATH (read)
# Returns: the status of `kubectl create namespace` when a creation is
#          attempted, otherwise the status of the final log call.
create_namespace_if_not_exists() {
  log_info "確認 namespace: $NAMESPACE..."

  # Nothing to do when the namespace can already be fetched.
  if kubectl --kubeconfig="$KUBECONFIG_PATH" get namespace "$NAMESPACE" > /dev/null 2>&1; then
    log_info "Namespace $NAMESPACE 已存在"
    return
  fi

  log_info "建立 namespace: $NAMESPACE"
  kubectl --kubeconfig="$KUBECONFIG_PATH" create namespace "$NAMESPACE"
}
# Render a ConfigMap manifest holding the AWOOOI webhook receiver snippet and
# pipe it to `kubectl apply -f -`.
# Globals: KUBECONFIG_PATH, CONFIGMAP_NAME, NAMESPACE, AWOOOI_WEBHOOK_URL (read)
apply_alertmanager_config() {
log_info "套用 Alertmanager Webhook 設定..."
# The heredoc delimiter is unquoted, so every ${...} below is expanded by the
# shell before kubectl reads the manifest from stdin.
# The receiver definition is stored as ConfigMap data only; per the notes
# embedded in the manifest, it still has to be merged into the main
# alertmanager.yml.
# NOTE(review): the embedded note mentions a 5-second timeout, but no timeout
# key is present under the webhook config - confirm the intended setting.
cat <<EOF | kubectl --kubeconfig="$KUBECONFIG_PATH" apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: ${CONFIGMAP_NAME}
namespace: ${NAMESPACE}
labels:
app: alertmanager
component: awoooi-webhook
data:
alertmanager-webhook.yml: |
# =============================================================================
# AWOOOI Webhook Receiver Configuration
# =============================================================================
# 此設定檔定義 Alertmanager 如何將告警轉發至 AWOOOI OpenClaw
#
# 用法: 將此內容合併至主 alertmanager.yml 的 receivers 區段
# =============================================================================
receivers:
- name: 'awoooi-openclaw'
webhook_configs:
- url: '${AWOOOI_WEBHOOK_URL}'
send_resolved: true
max_alerts: 10
# 5 秒超時
http_config:
follow_redirects: true
# 路由規則範例 (合併至主設定):
# route:
# receiver: 'awoooi-openclaw'
# group_by: ['alertname', 'namespace']
# group_wait: 30s
# group_interval: 5m
# repeat_interval: 4h
# routes:
# - match:
# severity: critical
# receiver: 'awoooi-openclaw'
# group_wait: 10s
EOF
log_info "ConfigMap ${CONFIGMAP_NAME} 已套用"
}
# Ask a running Alertmanager to reload its configuration (best effort).
# Globals: KUBECONFIG_PATH (read), NAMESPACE (read)
# Returns: 0 in every case - a missing pod or a failed reload is reported as a
#          warning rather than aborting the script.
reload_alertmanager() {
  local pod

  log_info "嘗試重載 Alertmanager..."

  # Look up the first pod labelled app=alertmanager. jsonpath fails when the
  # list is empty, so a lookup error is treated the same as "no pod".
  pod=$(kubectl --kubeconfig="$KUBECONFIG_PATH" get pods -n "$NAMESPACE" \
    -l app=alertmanager -o jsonpath='{.items[0].metadata.name}' 2> /dev/null) || pod=""

  if [[ -z "$pod" ]]; then
    log_warn "找不到 Alertmanager Pod (可能尚未部署)"
    log_info "ConfigMap 已建立,待 Alertmanager 部署後可手動合併設定"
    return 0
  fi

  # POST to /-/reload from inside the pod. Success is only reported when the
  # request actually succeeded.
  log_info "觸發 Alertmanager 設定重載..."
  if kubectl --kubeconfig="$KUBECONFIG_PATH" exec -n "$NAMESPACE" "$pod" -- \
    wget -q -O- --post-data='' http://localhost:9093/-/reload 2> /dev/null; then
    log_info "Alertmanager 重載完成"
  else
    log_warn "Alertmanager 重載失敗 (Pod: $pod),請手動觸發 /-/reload"
  fi
  return 0
}
# Print the applied ConfigMap as YAML so the operator can inspect it.
# Globals: KUBECONFIG_PATH, CONFIGMAP_NAME, NAMESPACE (read)
verify_config() {
  local -a get_args=(
    --kubeconfig="$KUBECONFIG_PATH"
    get configmap "$CONFIGMAP_NAME"
    -n "$NAMESPACE"
    -o yaml
  )

  log_info "驗證 ConfigMap..."
  kubectl "${get_args[@]}"
  log_info "驗證完成"
}
# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
# Entry point: print a banner, run every step in order, then print next steps.
# Globals: AWOOOI_WEBHOOK_URL, NAMESPACE, CONFIGMAP_NAME (read)
main() {
  local rule="============================================================"
  local step

  printf '%s\n' \
    "$rule" \
    " AWOOOI Prometheus Alertmanager 自動對接" \
    "$rule" \
    "" \
    "目標: 將 Webhook 設定套用至 K3s 叢集" \
    "Webhook URL: $AWOOOI_WEBHOOK_URL" \
    "Namespace: $NAMESPACE" \
    ""

  # Steps run strictly in this order; each one is a function of this script.
  for step in \
    check_prerequisites \
    create_namespace_if_not_exists \
    apply_alertmanager_config \
    reload_alertmanager \
    verify_config; do
    "$step"
  done

  printf '%s\n' \
    "" \
    "$rule" \
    " 對接完成" \
    "$rule" \
    ""

  log_info "ConfigMap 已建立: $NAMESPACE/$CONFIGMAP_NAME"
  log_info "下一步: 將 receiver 設定合併至 alertmanager.yml"
  log_info "測試: 使用 scripts/fire_live_alert.py 發射測試告警"
}
main "$@"

View File

@@ -0,0 +1,265 @@
#!/usr/bin/env python3
"""
CISO-101 Multi-Sig Demo Script
==============================
展示 CRITICAL 任務從發起到完成的完整信任鏈生命週期
流程:
1. ClawBot 發起 CRITICAL 操作 (DROP TABLE)
2. 第一位簽核者簽核 → 仍為 PENDING (1/2)
3. 第二位簽核者簽核 → 轉為 APPROVED → 觸發執行
執行方式:
cd apps/api
source .venv/bin/activate
python scripts/demo_multisig.py
"""
import sys
from pathlib import Path
from datetime import datetime, timezone, timedelta
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.models.approval import (
ApprovalRequestCreate,
ApprovalStatus,
RiskLevel,
BlastRadius,
DataImpact,
DryRunCheck,
)
from src.core.trust_engine import TrustEngine, get_required_signatures
def print_header(title: str) -> None:
    """Print a blank line, then ``title`` framed by two 60-character '=' rules."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)
def print_approval_status(approval) -> None:
    """Print a summary of an approval, followed by its signatures (if any).

    Reads ``id``, ``action``, ``status.value``, ``risk_level.value``,
    ``required_signatures``, ``current_signatures``, ``is_fully_signed`` and
    ``signatures`` from ``approval``. Each signature contributes one line with
    signer name, signer id and signing time, plus a comment line when the
    signature has a non-empty ``comment``.
    """
    summary_lines = [
        "",
        f"ID: {approval.id}",
        f"Action: {approval.action}",
        f"Status: {approval.status.value.upper()}",
        f"Risk Level: {approval.risk_level.value.upper()}",
        f"Required Sigs: {approval.required_signatures}",
        f"Current Sigs: {approval.current_signatures}",
        f"Is Fully Signed: {approval.is_fully_signed}",
        "",
    ]
    print("\n".join(summary_lines))

    if not approval.signatures:
        return

    print(" Signatures:")
    for signature in approval.signatures:
        signed_time = signature.signed_at.strftime('%H:%M:%S')
        print(f" - {signature.signer_name} ({signature.signer_id}) at {signed_time}")
        if signature.comment:
            print(f" Comment: {signature.comment}")
def main() -> None:
    """Walk through the multi-signature approval lifecycle on an in-process TrustEngine.

    The demo creates a CRITICAL approval request, signs it twice, and asserts
    the state after each signature; it then creates a LOW-risk request and
    asserts that it comes back already APPROVED. Progress is printed to stdout.

    Raises:
        AssertionError: if any of the expected states is not observed.
    """
    print_header("CISO-101 Multi-Sig Trust Engine Demo")
    print("""
This demo shows the complete CRITICAL approval lifecycle:
1. ClawBot initiates a CRITICAL operation (DROP TABLE)
2. First signer signs → Still PENDING (1/2)
3. Second signer signs → APPROVED → Execution triggered
""")
    # ==========================================================================
    # Step 0: Show signature requirements
    # ==========================================================================
    print_header("Step 0: Signature Requirements")
    # This table is static text; the loop below prints the values actually
    # returned by get_required_signatures() for every RiskLevel member.
    print("""
Risk Level Required Signatures
---------- -------------------
LOW 0 (Auto-approve)
MEDIUM 1
CRITICAL 2 (Multi-Sig)
""")
    for level in RiskLevel:
        req = get_required_signatures(level)
        print(f" {level.value.upper():10}{req} signature(s)")
    # ==========================================================================
    # Step 1: Create CRITICAL approval request
    # ==========================================================================
    print_header("Step 1: ClawBot Initiates CRITICAL Operation")
    # Collects every approval handed to the on_approved callback so Step 4 can
    # report how many times it fired.
    approved_requests = []

    def on_approved(approval):
        # Callback passed to TrustEngine; records the approval and announces it.
        approved_requests.append(approval)
        print(f"\n 🚀 EXECUTION TRIGGERED: {approval.action}")

    def on_rejected(approval):
        # Callback passed to TrustEngine; only prints the rejection reason.
        print(f"\n ❌ REJECTED: {approval.rejection_reason}")

    engine = TrustEngine(
        on_approved=on_approved,
        on_rejected=on_rejected,
    )
    # Build the CRITICAL request. Note that one dry-run check is deliberately
    # marked as failed ("Backup Available").
    request = ApprovalRequestCreate(
        action="DROP TABLE user_sessions",
        description="清除所有用戶 session 以強制重新登入。此操作將影響所有線上用戶。",
        risk_level=RiskLevel.CRITICAL,
        blast_radius=BlastRadius(
            affected_pods=0,
            estimated_downtime="0",
            related_services=["auth-service", "api-gateway", "user-service"],
            data_impact=DataImpact.DESTRUCTIVE,
        ),
        dry_run_checks=[
            DryRunCheck(name="RBAC Check", passed=True, message="db-admin"),
            DryRunCheck(name="Syntax Check", passed=True),
            DryRunCheck(name="Backup Available", passed=False, message="No recent backup!"),
        ],
        requested_by="ClawBot",
        # The request is given an expiry one hour from now (UTC).
        expires_at=datetime.now(timezone.utc) + timedelta(hours=1),
    )
    approval = engine.create_approval(request)
    print(f"""
ClawBot 發起 CRITICAL 操作請求:
動作: {request.action}
描述: {request.description}
風險等級: {request.risk_level.value.upper()}
資料影響: {request.blast_radius.data_impact.value.upper()}
""")
    print_approval_status(approval)
    # ==========================================================================
    # Step 2: First signer signs
    # ==========================================================================
    print_header("Step 2: First Signer (Alice) Signs")
    # sign_approval returns a 3-tuple: the updated approval, a message, and a
    # flag that the demo treats as "execution was triggered".
    approval, message, triggered = engine.sign_approval(
        approval_id=approval.id,
        signer_id="alice-001",
        signer_name="Alice Chen (CTO)",
        comment="已確認風險,建議在低流量時段執行",
    )
    print(f"""
Alice (CTO) 已簽核:
結果: {message}
觸發執行: {triggered}
""")
    print_approval_status(approval)
    # One of two signatures: the request must still be pending.
    assert approval.status == ApprovalStatus.PENDING, "Should still be PENDING after first signature"
    assert approval.current_signatures == 1, "Should have 1 signature"
    assert not triggered, "Should not trigger execution yet"
    # ==========================================================================
    # Step 3: Second signer signs
    # ==========================================================================
    print_header("Step 3: Second Signer (Bob) Signs - Multi-Sig Complete")
    approval, message, triggered = engine.sign_approval(
        approval_id=approval.id,
        signer_id="bob-002",
        signer_name="Bob Wu (CISO)",
        comment="CISO 核准。已通知 DBA 團隊待命。",
    )
    print(f"""
Bob (CISO) 已簽核:
結果: {message}
觸發執行: {triggered}
""")
    print_approval_status(approval)
    # Two of two signatures: the request must now be approved and triggered.
    assert approval.status == ApprovalStatus.APPROVED, "Should be APPROVED after second signature"
    assert approval.current_signatures == 2, "Should have 2 signatures"
    assert approval.is_fully_signed, "Should be fully signed"
    assert triggered, "Should trigger execution"
    # ==========================================================================
    # Step 4: Verify final state
    # ==========================================================================
    print_header("Step 4: Verification")
    pending = engine.get_pending_approvals()
    # The expected values are shown in the text only; nothing here asserts them.
    print(f"""
驗證結果:
✅ 待簽核清單數量: {len(pending)} (應為 0)
✅ 已批准請求數量: {len(approved_requests)} (應為 1)
✅ 最終狀態: {approval.status.value.upper()}
✅ 簽核數: {approval.current_signatures}/{approval.required_signatures}
✅ 解決時間: {approval.resolved_at.strftime('%Y-%m-%d %H:%M:%S') if approval.resolved_at else 'N/A'}
""")
    # ==========================================================================
    # Bonus: Demo LOW risk auto-approval
    # ==========================================================================
    print_header("Bonus: LOW Risk Auto-Approval Demo")
    # No expires_at is passed for this request.
    low_request = ApprovalRequestCreate(
        action="Scale deployment api-backend to 5 replicas",
        description="增加後端服務副本數以應對流量增長",
        risk_level=RiskLevel.LOW,
        blast_radius=BlastRadius(
            affected_pods=5,
            estimated_downtime="0",
            related_services=["api-backend"],
            data_impact=DataImpact.NONE,
        ),
        dry_run_checks=[
            DryRunCheck(name="Resource Check", passed=True, message="5/20 pods"),
        ],
        requested_by="ClawBot",
    )
    low_approval = engine.create_approval(low_request)
    print(f"""
LOW 風險操作自動放行:
動作: {low_request.action}
風險等級: LOW
狀態: {low_approval.status.value.upper()} (自動批准!)
簽核數: {low_approval.required_signatures} (不需要簽核)
""")
    # A LOW-risk request is expected to be approved straight from create_approval.
    assert low_approval.status == ApprovalStatus.APPROVED, "LOW risk should be auto-approved"
    # ==========================================================================
    # Summary
    # ==========================================================================
    print_header("Demo Complete!")
    print("""
CISO-101 Multi-Sig Trust Engine 功能驗證完成:
✅ 風險等級分類 (LOW/MEDIUM/CRITICAL)
✅ 簽核數自動判定 (0/1/2)
✅ LOW 風險自動放行
✅ CRITICAL 雙重簽核 (Multi-Sig)
✅ 狀態機正確轉換 (PENDING → APPROVED)
✅ 簽核完成觸發執行回調
信任鏈完整性已驗證。
""")


# Run the demo only when executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,246 @@
#!/usr/bin/env python3
"""
Phase 5 E2E 點火測試 - OpenClaw 全鏈路驗證
==========================================
測試流程:
1. 發射模擬 K8s 告警到 Webhook
2. 驗證告警被正確處理
3. 驗證 ApprovalRecord 被建立
4. 模擬 Telegram 簽核回調
5. 驗證執行觸發
使用方式:
python scripts/e2e_openclaw_test.py
"""
import asyncio
import json
import sys
from datetime import datetime
def print_header(title: str) -> None:
    """Print a test section title framed by two 60-character '=' rules."""
    separator = "=" * 60
    for line in ("\n" + separator, f" {title}", separator):
        print(line)
def print_step(step: int, description: str) -> None:
    """Print a blank line followed by the numbered step heading."""
    heading = f"🔹 Step {step}: {description}"
    print("\n" + heading)
def print_success(message: str) -> None:
    """Print a success message on its own line."""
    text = f"{message}"
    print(text)
def print_error(message: str) -> None:
    """Print an error message on its own line (written to stdout)."""
    text = f"{message}"
    print(text)
def print_info(message: str) -> None:
    """Print an informational message, indented by one space."""
    print(" " + f"{message}")
async def test_phase5_e2e() -> bool:
    """Run the five Phase 5 checks in order and report the result.

    Each step imports the project module it exercises inside its own ``try``
    block, so an import error is reported as that step's failure. The first
    failing step prints an error and ends the run.

    Returns:
        True when all five steps pass, False as soon as one step fails.
    """
    # NOTE(review): nothing in this body is awaited; the coroutine form only
    # matters to the asyncio.run() call in the entry guard.
    print_header("Phase 5 E2E 點火測試 - OpenClaw 全鏈路驗證")
    print(f"執行時間: {datetime.now().isoformat()}")
    # =========================================================================
    # Step 1: LogLevelFilter (log cleaning)
    # =========================================================================
    print_step(1, "日誌清洗模組 (LogLevelFilter)")
    try:
        from src.services.context_gatherer import LogLevelFilter
        # Simulated K8s log excerpt with INFO, DEBUG, ERROR and FATAL lines
        # plus a traceback.
        raw_logs = """
2024-03-21T10:15:23.456Z INFO [harbor.core] Starting Harbor Core
2024-03-21T10:15:24.789Z DEBUG [harbor.core.db] Initializing connection pool
2024-03-21T10:16:45.123Z ERROR [harbor.core.db] Connection lost to PostgreSQL
2024-03-21T10:16:45.456Z FATAL [harbor.core] Unrecoverable error
Traceback (most recent call last):
File "/harbor/core/db.py", line 234, in connect
raise DatabaseConnectionError("Max retries exceeded")
""".strip()
        filtered = LogLevelFilter.filter_logs(raw_logs)
        stats = LogLevelFilter.get_filter_stats(raw_logs, filtered)
        # DEBUG and INFO lines must be gone; ERROR, FATAL and the traceback
        # must survive.
        assert "DEBUG" not in filtered, "DEBUG should be filtered"
        # NOTE(review): the token removed by replace() does not itself contain
        # "INFO", so the purpose of the replace is unclear - confirm intent.
        assert "INFO" not in filtered.replace("DatabaseConnectionError", ""), "INFO should be filtered"
        assert "ERROR" in filtered, "ERROR should be preserved"
        assert "FATAL" in filtered, "FATAL should be preserved"
        assert "Traceback" in filtered, "Stacktrace should be preserved"
        print_success(f"日誌清洗成功: {stats['original_lines']}{stats['filtered_lines']}")
        print_success(f"雜訊移除率: {stats['removal_rate_percent']}%")
    except Exception as e:
        print_error(f"日誌清洗測試失敗: {e}")
        return False
    # =========================================================================
    # Step 2: Security Interceptor (whitelist + nonce)
    # =========================================================================
    print_step(2, "安全攔截器 (Security Interceptor)")
    try:
        from src.services.security_interceptor import (
            TelegramSecurityInterceptor,
            UserNotWhitelistedError,
            NonceReplayError,
        )
        from src.core.config import settings
        interceptor = TelegramSecurityInterceptor()
        # Telegram user id used for the whitelist lookup.
        test_user_id = 5619078117
        # Report the configured whitelist. The whitelist result is only
        # printed; neither outcome fails the step.
        whitelist = settings.OPENCLAW_TG_USER_WHITELIST
        print_info(f"白名單配置: {whitelist}")
        if whitelist:
            is_whitelisted = interceptor.is_whitelisted(test_user_id)
            if is_whitelisted:
                print_success(f"統帥 ID {test_user_id} 在白名單內")
            else:
                print_info(f"統帥 ID {test_user_id} 不在白名單 (需配置)")
        else:
            print_info("白名單為空 (需在環境變數中配置 OPENCLAW_TG_USER_WHITELIST)")
        # Generate callback data for an approval; only its first 30 characters
        # are printed.
        nonce = interceptor.generate_callback_nonce("test-approval-123", "approve")
        print_success(f"Nonce 產生成功: {nonce[:30]}...")
        # Parsing the generated value must give back the action and approval id.
        parsed = interceptor.parse_callback_data(nonce)
        assert parsed["action"] == "approve"
        assert parsed["approval_id"] == "test-approval-123"
        print_success("Nonce 解析成功")
    except Exception as e:
        print_error(f"安全攔截器測試失敗: {e}")
        return False
    # =========================================================================
    # Step 3: Telegram Gateway (message format)
    # =========================================================================
    print_step(3, "Telegram Gateway (SOUL.md 訊息格式)")
    try:
        from src.services.telegram_gateway import TelegramMessage, RISK_EMOJI_MAP
        # Build a sample message for a critical OOMKilled pod.
        message = TelegramMessage(
            status_emoji=RISK_EMOJI_MAP["critical"],
            risk_level="CRITICAL",
            resource_name="harbor-core-7d4b8c9f5-xk2m3",
            root_cause="OOMKilled",
            suggested_action="DELETE_POD (重啟 Pod)",
            estimated_downtime="~30s",
            approval_id="test-approval-123",
        )
        formatted = message.format()
        # The formatted text must contain each expected element and stay
        # within 500 characters.
        assert "🚨" in formatted, "Should have critical emoji"
        assert "CRITICAL" in formatted, "Should have risk level"
        assert "harbor-core" in formatted, "Should have resource name"
        assert "OOMKilled" in formatted, "Should have root cause"
        assert "建議" in formatted, "Should have suggestion"
        assert "停機" in formatted, "Should have downtime"
        assert len(formatted) <= 500, f"Should be <= 500 chars, got {len(formatted)}"
        print_success("SOUL.md 訊息格式驗證通過")
        print_info(f"訊息長度: {len(formatted)} / 500 字元")
        print()
        print(" 📱 訊息預覽:")
        for line in formatted.split("\n"):
            print(f" {line}")
    except Exception as e:
        print_error(f"Telegram Gateway 測試失敗: {e}")
        return False
    # =========================================================================
    # Step 4: OpenClaw module loading
    # =========================================================================
    print_step(4, "OpenClaw AI 模組載入")
    try:
        from src.services.openclaw import get_openclaw, OpenClawService
        openclaw = get_openclaw()
        assert isinstance(openclaw, OpenClawService)
        print_success("OpenClaw 服務載入成功")
        # Print the configured AI fallback order and default model.
        from src.core.config import settings
        print_info(f"AI Fallback 順序: {settings.AI_FALLBACK_ORDER}")
        print_info(f"預設模型: {settings.OPENCLAW_DEFAULT_MODEL}")
    except Exception as e:
        print_error(f"OpenClaw 模組載入失敗: {e}")
        return False
    # =========================================================================
    # Step 5: Signature audit fields
    # =========================================================================
    print_step(5, "Signature 審計欄位 (Telegram 擴充)")
    try:
        from src.models.approval import Signature, SignatureSource
        # Build a signature record that carries the Telegram-specific fields.
        sig = Signature(
            signer_id="tg_5619078117",
            signer_name="統帥",
            comment="Telegram 簽核測試",
            source=SignatureSource.TELEGRAM,
            telegram_user_id=5619078117,
            telegram_message_id=12345,
        )
        assert sig.source == SignatureSource.TELEGRAM
        assert sig.telegram_user_id == 5619078117
        print_success("Telegram 審計欄位驗證通過")
        print_info(f"簽核來源: {sig.source.value}")
        print_info(f"Telegram User ID: {sig.telegram_user_id}")
    except Exception as e:
        print_error(f"Signature 審計欄位測試失敗: {e}")
        return False
    # =========================================================================
    # All steps passed
    # =========================================================================
    # Reaching this point means no step returned early, so the PASSED lines
    # below are printed as fixed text.
    print_header("E2E 測試結果")
    print()
    print(" ✅ Step 1: 日誌清洗 (LogLevelFilter) - PASSED")
    print(" ✅ Step 2: 安全攔截器 (Security Interceptor) - PASSED")
    print(" ✅ Step 3: Telegram Gateway (SOUL.md 格式) - PASSED")
    print(" ✅ Step 4: OpenClaw AI 模組載入 - PASSED")
    print(" ✅ Step 5: Signature 審計欄位 - PASSED")
    print()
    print("=" * 60)
    print(" 🎉 Phase 5 E2E 點火測試 - 全數通過!")
    print("=" * 60)
    return True


# Exit status is 0 when every step passed and 1 otherwise.
if __name__ == "__main__":
    success = asyncio.run(test_phase5_e2e())
    sys.exit(0 if success else 1)

View File

@@ -0,0 +1,372 @@
#!/usr/bin/env python3
"""
AWOOOI 實彈射擊腳本 - 自動化告警測試
=====================================
Phase 5: Shadow Mode - 自動化實彈演習
功能:
1. 模擬 Prometheus 格式的 OOMKilled/PodCrash 告警
2. 自動計算 HMAC-SHA256 簽章
3. 直接打向本地 Webhook 端點
4. 驗證回應並輸出結果
使用方式:
python scripts/fire_live_alert.py
環境變數:
WEBHOOK_HMAC_SECRET: HMAC 簽章密鑰 (必要)
AWOOOI_API_URL: API 端點 (預設: http://192.168.0.188:8000)
Tier 2 授權: 此腳本會觸發 AI 分析流程,需統帥授權
"""
import argparse
import hashlib
import hmac
import json
import os
import sys
from datetime import datetime, timezone
from typing import Literal
import httpx
# =============================================================================
# Configuration
# =============================================================================
# Base URL of the API; taken from AWOOOI_API_URL when that variable is set.
DEFAULT_API_URL = os.environ.get("AWOOOI_API_URL", "http://192.168.0.188:8000")
# Path of the alert webhook, appended to the base URL.
WEBHOOK_ENDPOINT = "/api/v1/webhooks/alerts"
# Signing secret; an empty string means "no secret configured".
HMAC_SECRET = os.environ.get("WEBHOOK_HMAC_SECRET", "")
# =============================================================================
# Alert Templates
# =============================================================================
# Alert payload templates, keyed by the alert-type name accepted on the command
# line. fire_alert() serialises the selected entry as the JSON request body.
ALERT_TEMPLATES = {
    # Critical: pod terminated because the container exceeded its memory limit.
    "oomkilled": {
        "alert_type": "k8s_pod_crash",
        "severity": "critical",
        "source": "prometheus",
        "target_resource": "harbor-core-7d4b8c9f5-xk2m3",
        "namespace": "harbor",
        "message": "Pod terminated due to OOMKilled - Container exceeded memory limit",
        "metrics": {
            "memory_percent": 99.8,
            "restart_count": 5,
            "memory_limit_mb": 512,
            "memory_usage_mb": 520,
        },
        "labels": {
            "app": "harbor-core",
            "deployment": "harbor-core",
            "pod": "harbor-core-7d4b8c9f5-xk2m3",
            "container": "harbor-core",
            "reason": "OOMKilled",
        },
    },
    # Warning: pod in CrashLoopBackOff. Shares alert_type "k8s_pod_crash" with
    # the "oomkilled" template.
    "podcrash": {
        "alert_type": "k8s_pod_crash",
        "severity": "warning",
        "source": "prometheus",
        "target_resource": "nginx-ingress-7d6f8c9b5-abc12",
        "namespace": "ingress-nginx",
        "message": "Pod CrashLoopBackOff - Container restarting repeatedly",
        "metrics": {
            "restart_count": 8,
            "cpu_percent": 15.2,
            "memory_percent": 45.0,
        },
        "labels": {
            "app": "nginx-ingress",
            "deployment": "nginx-ingress-controller",
            "pod": "nginx-ingress-7d6f8c9b5-abc12",
        },
    },
    # Warning: sustained high CPU usage.
    "highcpu": {
        "alert_type": "high_cpu",
        "severity": "warning",
        "source": "prometheus",
        "target_resource": "api-backend-deployment",
        "namespace": "default",
        "message": "High CPU usage detected - Pod using 95% of allocated CPU",
        "metrics": {
            "cpu_percent": 95.5,
            "memory_percent": 60.0,
            "sigma_deviation": 3.2,
        },
        "labels": {
            "app": "api-backend",
            "deployment": "api-backend",
        },
    },
    # Warning: high memory usage.
    "highmemory": {
        "alert_type": "high_memory",
        "severity": "warning",
        "source": "prometheus",
        "target_resource": "redis-master-0",
        "namespace": "redis",
        "message": "High memory usage detected - Pod memory at 92%",
        "metrics": {
            "cpu_percent": 25.0,
            "memory_percent": 92.0,
            "sigma_deviation": 2.8,
        },
        "labels": {
            "app": "redis",
            "statefulset": "redis-master",
        },
    },
}
# =============================================================================
# Helper Functions
# =============================================================================
def compute_hmac_signature(secret: str, payload: bytes) -> str:
    """Return the HMAC-SHA256 of ``payload`` as ``"sha256=<hex digest>"``.

    Args:
        secret: Signing key; encoded with ``str.encode()`` before use.
        payload: Exact request body bytes to sign.
    """
    mac = hmac.new(key=secret.encode(), msg=payload, digestmod=hashlib.sha256)
    return "sha256=" + mac.hexdigest()
def print_header(title: str) -> None:
    """Print a title framed above and below by a 60-character '=' rule."""
    bar = "=" * 60
    print("\n" + bar)
    print(" " + f"{title}")
    print(bar)
def print_success(message: str) -> None:
    """Print a success message on its own line."""
    line = f"{message}"
    print(line)
def print_error(message: str) -> None:
    """Print an error message on its own line (written to stdout)."""
    line = f"{message}"
    print(line)
def print_info(message: str) -> None:
    """Print an informational message, indented by one space."""
    print(" " + f"{message}")
def print_warning(message: str) -> None:
    """Print a warning message prefixed with a space and a warning sign."""
    prefix = " ⚠️ "
    print(prefix + f"{message}")
# =============================================================================
# Main Logic
# =============================================================================
def fire_alert(
    alert_type: str,
    api_url: str = DEFAULT_API_URL,
    hmac_secret: str = HMAC_SECRET,
    dry_run: bool = False,
) -> dict:
    """Send one templated alert to the webhook endpoint and report the outcome.

    Args:
        alert_type: Key into ``ALERT_TEMPLATES`` (oomkilled, podcrash, highcpu,
            highmemory).
        api_url: Base URL of the API; a trailing slash is tolerated.
        hmac_secret: Signing key. When empty, no signature header is sent.
        dry_run: When True, print the payload and signature but send nothing.

    Returns:
        The decoded JSON object returned by the API, or a dict with
        ``"success": False`` and an ``"error"`` entry when the alert type is
        unknown, the request fails, or the response is not a JSON object.
        A dry run returns ``{"success": True, "dry_run": True}``.
    """
    # Strip a trailing slash so the joined URL never contains "//api/...".
    target_url = f"{api_url.rstrip('/')}{WEBHOOK_ENDPOINT}"

    print_header(f"AWOOOI 實彈射擊 - {alert_type.upper()}")
    print(f"執行時間: {datetime.now(timezone.utc).isoformat()}")
    print(f"目標端點: {target_url}")

    # Look up the alert template.
    if alert_type not in ALERT_TEMPLATES:
        print_error(f"未知的告警類型: {alert_type}")
        print_info(f"可用類型: {', '.join(ALERT_TEMPLATES.keys())}")
        return {"success": False, "error": "Unknown alert type"}
    payload = ALERT_TEMPLATES[alert_type].copy()

    # Serialise once and send exactly these bytes, so the signature below is
    # computed over the body that actually goes on the wire.
    payload_json = json.dumps(payload, separators=(",", ":"))
    payload_bytes = payload_json.encode()
    print("\n📦 告警 Payload:")
    print(json.dumps(payload, indent=2, ensure_ascii=False))

    # Sign the body when a secret is configured.
    if hmac_secret:
        signature = compute_hmac_signature(hmac_secret, payload_bytes)
        print_success(f"HMAC 簽章: {signature[:40]}...")
    else:
        signature = None
        print_warning("無 HMAC Secret - 簽章將被跳過 (僅限 dev 環境)")

    # Dry-run mode stops before any network activity.
    if dry_run:
        print("\n🔒 [DRY-RUN MODE] 不實際發送請求")
        print_info("移除 --dry-run 參數以實際發射")
        return {"success": True, "dry_run": True}

    # Send the request.
    print("\n🚀 發射中...")
    headers = {"Content-Type": "application/json"}
    if signature:
        headers["X-Signature-256"] = signature
    try:
        with httpx.Client(timeout=30.0) as client:
            response = client.post(
                target_url,
                content=payload_bytes,
                headers=headers,
            )
        print(f"\n📡 HTTP Status: {response.status_code}")

        # Decode the response. json.JSONDecodeError is a ValueError subclass,
        # so this also covers other decoding failures.
        try:
            result = response.json()
        except ValueError:
            print_error(f"回應解析失敗: {response.text}")
            return {"success": False, "error": "Response parse error", "raw": response.text}

        # A JSON array/scalar cannot be inspected with .get(); report it as an
        # unparseable response instead of failing with AttributeError.
        if not isinstance(result, dict):
            print_error(f"回應解析失敗: {response.text}")
            return {"success": False, "error": "Response parse error", "raw": response.text}

        print("\n📋 API 回應:")
        print(json.dumps(result, indent=2, ensure_ascii=False))
        if response.status_code == 200 and result.get("success"):
            print_success("告警已成功接收並處理!")
            if result.get("converged"):
                print_info(f"告警收斂: 相同指紋已聚合 x{result.get('hit_count', 1)}")
            else:
                print_info(f"風險等級: {result.get('risk_level', 'N/A')}")
                print_info(f"建議操作: {result.get('suggested_action', 'N/A')}")
            if result.get("approval_created"):
                print_success(f"待簽核卡片已建立: {result.get('approval_id', 'N/A')}")
        else:
            print_error(f"處理失敗: {result.get('message', result.get('detail', 'Unknown error'))}")
        return result
    except httpx.ConnectError as e:
        print_error(f"連線失敗: {str(e)}")
        print_info(f"請確認 API 服務正在執行: {api_url}")
        return {"success": False, "error": "Connection failed"}
    except httpx.TimeoutException as e:
        print_error(f"請求超時: {str(e)}")
        return {"success": False, "error": "Timeout"}
    except Exception as e:
        # Last-resort guard so a single failed shot never aborts an --all run.
        print_error(f"未預期錯誤: {str(e)}")
        return {"success": False, "error": str(e)}
def main():
    """Command-line entry point: parse arguments and fire one or all alerts.

    Either a positional ``alert_type`` or ``--all`` must be given. The process
    exits with status 1 when any fired alert did not report success, and with
    status 2 (via argparse) on a usage error.
    """
    parser = argparse.ArgumentParser(
        description="AWOOOI 實彈射擊腳本 - 自動化告警測試",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
告警類型:
oomkilled - Pod OOMKilled (Critical)
podcrash - Pod CrashLoopBackOff (Warning)
highcpu - High CPU Usage (Warning)
highmemory - High Memory Usage (Warning)
範例:
# 發射 OOMKilled 告警
python scripts/fire_live_alert.py oomkilled
# Dry-run 模式 (不實際發送)
python scripts/fire_live_alert.py oomkilled --dry-run
# 指定 HMAC Secret
WEBHOOK_HMAC_SECRET=mysecret python scripts/fire_live_alert.py oomkilled
""",
    )
    # Optional positional, so that `--all` can be used without naming a type.
    parser.add_argument(
        "alert_type",
        nargs="?",
        choices=list(ALERT_TEMPLATES.keys()),
        help="告警類型",
    )
    parser.add_argument(
        "--api-url",
        default=DEFAULT_API_URL,
        help=f"API 端點 URL (預設: {DEFAULT_API_URL})",
    )
    parser.add_argument(
        "--hmac-secret",
        default=HMAC_SECRET,
        help="HMAC 簽章密鑰 (也可用環境變數 WEBHOOK_HMAC_SECRET)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Dry-run 模式 - 僅輸出不實際發送",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="依序發射所有類型的告警",
    )
    args = parser.parse_args()

    # Without --all, a concrete alert type is still mandatory.
    if not args.all and args.alert_type is None:
        parser.error("請指定 alert_type 或使用 --all")

    print_header("AWOOOI 實彈射擊系統")
    print(f"API 端點: {args.api_url}")
    print(f"HMAC 配置: {'已設定' if args.hmac_secret else '未設定 (dev mode)'}")
    print("Shadow Mode: 已啟用 (K8s 操作將被安全攔截)")

    if args.all:
        # Fire every template in definition order.
        print("\n🎯 連續發射所有告警類型...")
        results = {}
        for alert_type in ALERT_TEMPLATES.keys():
            result = fire_alert(
                alert_type=alert_type,
                api_url=args.api_url,
                hmac_secret=args.hmac_secret,
                dry_run=args.dry_run,
            )
            results[alert_type] = result
        # Summary: one line per alert type with a visible pass/fail marker.
        print_header("射擊結果摘要")
        for alert_type, result in results.items():
            status = "✅" if result.get("success") else "❌"
            print(f" {status} {alert_type}: {result.get('message', result.get('error', 'N/A'))}")
        failed = any(not result.get("success") for result in results.values())
    else:
        # Fire a single alert.
        result = fire_alert(
            alert_type=args.alert_type,
            api_url=args.api_url,
            hmac_secret=args.hmac_secret,
            dry_run=args.dry_run,
        )
        failed = not result.get("success")

    print("\n" + "=" * 60)
    print(" 實彈射擊完成")
    print("=" * 60)

    # Let callers (shells, CI jobs) detect a failed delivery.
    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,318 @@
#!/usr/bin/env python3
"""
🚀 AWOOOI Phase 2 導彈腳本 - fire_test_alert.py
===============================================
向系統注入模擬告警,觸發 ClawBot AI 分析流程
用途:
- 驗證全鏈路 (Webhook → ClawBot → ApprovalCard)
- 測試戰情室前端是否即時彈出授權卡片
- 開發除錯用 (無需真實監控系統)
執行方式:
cd apps/api
python -m scripts.fire_test_alert
# 指定告警類型
python -m scripts.fire_test_alert --type db_connection_timeout
python -m scripts.fire_test_alert --type k8s_pod_crash --severity critical
Author: Claude Code
Date: 2026-03-21
"""
import argparse
import asyncio
import sys
from datetime import datetime
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
import httpx
# =============================================================================
# Config
# =============================================================================
# Base URL of the local API server.
API_BASE_URL = "http://localhost:8000"
# Full URL of the alert webhook on that server.
WEBHOOK_ENDPOINT = API_BASE_URL + "/api/v1/webhooks/alerts"
# =============================================================================
# 預定義告警場景 (High-Fidelity Mock Alerts)
# =============================================================================
# Predefined mock alert payloads, keyed by scenario name. In every entry the
# key equals the payload's "alert_type" value.
# NOTE(review): presumably selected through the --type option shown in the
# module docstring; the code that consumes this mapping is not visible here.
ALERT_SCENARIOS = {
    # Critical: PostgreSQL connection pool exhausted.
    "db_connection_timeout": {
        "alert_type": "db_connection_timeout",
        "severity": "critical",
        "source": "prometheus-alertmanager",
        "target_resource": "postgres-primary-0",
        "namespace": "database",
        "message": "PostgreSQL Database OOM - Connection pool exhausted, 47 waiting queries",
        "metrics": {
            "connection_count": 100,
            "waiting_queries": 47,
            "cpu_percent": 89,
            "memory_percent": 95,
            "sigma_deviation": 4.2,
        },
        "labels": {
            "app": "postgres",
            "team": "dba",
            "tier": "critical",
        },
    },
    # Warning: pod in CrashLoopBackOff after being OOMKilled.
    "k8s_pod_crash": {
        "alert_type": "k8s_pod_crash",
        "severity": "warning",
        "source": "k8s-event-watcher",
        "target_resource": "harbor-core-7d4b8c9f5-xk2m3",
        "namespace": "harbor",
        "message": "Pod CrashLoopBackOff detected - OOMKilled after 5 restarts",
        "metrics": {
            "restart_count": 5,
            "last_exit_code": 137,
            "cpu_percent": 95,
            "memory_percent": 100,
            "sigma_deviation": 3.8,
        },
        "labels": {
            "app": "harbor-core",
            "team": "devops",
        },
    },
    # Warning: CPU saturation with a latency spike.
    "high_cpu": {
        "alert_type": "high_cpu",
        "severity": "warning",
        "source": "node-exporter",
        "target_resource": "api-backend-deployment",
        "namespace": "production",
        "message": "Payment API Latency Spike - CPU at 94%, response time > 2s",
        "metrics": {
            "cpu_percent": 94,
            "memory_percent": 72,
            "response_time_ms": 2340,
            "sigma_deviation": 3.2,
        },
        "labels": {
            "app": "payment-api",
            "team": "backend",
            "sla": "critical",
        },
    },
    # Critical: log volume nearly full.
    "disk_full": {
        "alert_type": "disk_full",
        "severity": "critical",
        "source": "node-exporter",
        "target_resource": "logging-node-01",
        "namespace": "kube-system",
        "message": "Disk usage at 97% - /var/log nearly full, risk of logging failure",
        "metrics": {
            "disk_percent": 97,
            "available_gb": 2.3,
            "inode_percent": 89,
        },
        "labels": {
            "node": "logging-node-01",
            "team": "sre",
        },
    },
    # Warning: certificate close to expiry after a failed auto-renewal.
    "ssl_expiry": {
        "alert_type": "ssl_expiry",
        "severity": "warning",
        "source": "cert-manager",
        "target_resource": "awoooi.wooo.work",
        "namespace": "cert-manager",
        "message": "SSL Certificate expiring in 7 days - auto-renewal failed",
        "metrics": {
            "days_until_expiry": 7,
        },
        "labels": {
            "domain": "awoooi.wooo.work",
            "issuer": "letsencrypt",
        },
    },
}
# =============================================================================
# Terminal Output Helpers (漂亮的 Log)
# =============================================================================
class Colors:
    """ANSI escape sequences used to colour terminal output."""

    # Foreground colours.
    HEADER = "\033[95m"
    BLUE = "\033[94m"
    CYAN = "\033[96m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"

    # Reset sequence and text attributes.
    ENDC = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"
def print_banner():
    """Print the AWOOOI ASCII-art banner followed by the script subtitle."""
    art_rows = (
        "█████╗ ██╗ ██╗ ██████╗ ██████╗ ██████╗ ██╗",
        "██╔══██╗██║ ██║██╔═══██╗██╔═══██╗██╔═══██╗██║",
        "███████║██║ █╗ ██║██║ ██║██║ ██║██║ ██║██║",
        "██╔══██║██║███╗██║██║ ██║██║ ██║██║ ██║██║",
        "██║ ██║╚███╔███╔╝╚██████╔╝╚██████╔╝╚██████╔╝██║",
        "╚═╝ ╚═╝ ╚══╝╚══╝ ╚═════╝ ╚═════╝ ╚═════╝ ╚═╝",
    )
    # The empty first and last entries give the banner its leading and
    # trailing newline once the rows are joined.
    rows = [
        "",
        f"{Colors.CYAN}{Colors.BOLD}",
        *art_rows,
        f"{Colors.ENDC}",
        f"{Colors.DIM} 🚀 Phase 2 導彈腳本 - Test Alert Injector{Colors.ENDC}",
        f"{Colors.DIM} ─────────────────────────────────────────{Colors.ENDC}",
        "",
    ]
    print("\n".join(rows))
def print_section(title: str):
    """Print a bold blue section title followed by a dimmed rule line.

    Args:
        title: Heading text to display.
    """
    heading = f"\n{Colors.BLUE}{Colors.BOLD}{title}{Colors.ENDC}"
    # NOTE(review): the repeated string is empty as written, so this line
    # carries only the color codes — the rule glyph may have been lost; confirm.
    rule = f"{Colors.DIM}{'' * 50}{Colors.ENDC}"
    print(heading)
    print(rule)
def print_alert_info(alert: dict):
    """Print the main fields of an alert payload, then its metrics if present.

    Args:
        alert: Alert payload; must contain 'alert_type', 'severity',
            'target_resource', 'namespace' and 'message'. 'metrics' is optional.
    """
    label_on, label_off = Colors.YELLOW, Colors.ENDC
    print(f" {label_on}告警類型:{label_off} {alert['alert_type']}")
    print(f" {label_on}嚴重度:{label_off} {alert['severity']}")
    print(f" {label_on}目標資源:{label_off} {alert['target_resource']}")
    print(f" {label_on}命名空間:{label_off} {alert['namespace']}")
    print(f" {label_on}訊息:{label_off} {alert['message']}")

    # Nothing more to show when metrics are missing or empty.
    if not alert.get('metrics'):
        return

    print(f" {label_on}指標:{label_off}")
    for metric_name, metric_value in alert['metrics'].items():
        print(f"{metric_name}: {metric_value}")
def print_response(response: dict, status_code: int):
    """Print the webhook API reply as either a success or a failure report.

    The reply counts as a success only when the HTTP status is 200 and the
    body has a truthy 'success' field.

    Args:
        response: Decoded JSON body returned by the API.
        status_code: HTTP status code of the reply.
    """
    accepted = status_code == 200 and response.get('success')

    # Failure path first: show the status code and the raw body.
    if not accepted:
        print(f"\n{Colors.RED}{Colors.BOLD}✗ 告警發射失敗!{Colors.ENDC}")
        print(f" {Colors.RED}狀態碼:{Colors.ENDC} {status_code}")
        print(f" {Colors.RED}回應:{Colors.ENDC} {response}")
        return

    print(f"\n{Colors.GREEN}{Colors.BOLD}✓ 告警發射成功!{Colors.ENDC}")
    print(f" {Colors.CYAN}Approval ID:{Colors.ENDC} {response.get('approval_id', 'N/A')}")
    print(f" {Colors.CYAN}風險等級:{Colors.ENDC} {response.get('risk_level', 'N/A')}")
    print(f" {Colors.CYAN}建議動作:{Colors.ENDC} {response.get('suggested_action', 'N/A')}")
    print(f" {Colors.CYAN}聚合次數:{Colors.ENDC} {response.get('hit_count', 1)}")
    if response.get('converged'):
        print(f" {Colors.YELLOW}⚡ 告警已收斂 (跳過 LLM){Colors.ENDC}")
def print_footer():
    """Print the closing hints (frontend URL, expected panel) and a timestamp."""
    # NOTE(review): the repeated string is empty as written, so the rule line
    # carries only the color codes — confirm whether a glyph was lost.
    rule = f"{Colors.DIM}{'' * 50}{Colors.ENDC}"
    print(f"\n{rule}")
    print(f"{Colors.GREEN}📺 請查看戰情室前端:{Colors.ENDC} http://localhost:3000")
    print(f"{Colors.GREEN}📋 右側面板應顯示新的 ApprovalCard{Colors.ENDC}")

    # Local wall-clock time, formatted to the second.
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{Colors.DIM}時間: {stamp}{Colors.ENDC}\n")
# =============================================================================
# Main Logic
# =============================================================================
async def fire_alert(alert_type: str, severity: str | None = None) -> bool:
    """
    Send one predefined test alert to the webhook API.

    Args:
        alert_type: Key into ALERT_SCENARIOS selecting the payload to send.
        severity: Optional override for the scenario's 'severity' field.

    Returns:
        bool: True only when the API answered HTTP 200 and the JSON body has a
        truthy 'success' field (the same criterion print_response uses).
        False for an unknown alert type, a connection failure, any other
        error, or a reply that reports failure.
    """
    # Look up the scenario; list the valid keys when the type is unknown.
    if alert_type not in ALERT_SCENARIOS:
        print(f"{Colors.RED}❌ 未知告警類型: {alert_type}{Colors.ENDC}")
        print(f"{Colors.DIM}可用類型: {', '.join(ALERT_SCENARIOS.keys())}{Colors.ENDC}")
        return False

    # A shallow copy is enough: only the top-level 'severity' key is replaced,
    # so the shared scenario definition is left untouched.
    alert = ALERT_SCENARIOS[alert_type].copy()
    if severity:
        alert['severity'] = severity

    print_section("告警 Payload")
    print_alert_info(alert)
    print_section("發射告警至 Webhook API")
    print(f" {Colors.CYAN}端點:{Colors.ENDC} {WEBHOOK_ENDPOINT}")

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                WEBHOOK_ENDPOINT,
                json=alert,
                headers={"Content-Type": "application/json"},
            )
            result = response.json()
            print_response(result, response.status_code)
            # Agree with what print_response just reported: a 200 whose body
            # says success is false was printed as a failure, so it must not
            # be returned as a success (that would make the script exit 0).
            return response.status_code == 200 and bool(result.get('success'))
    except httpx.ConnectError:
        print(f"\n{Colors.RED}{Colors.BOLD}✗ 連線失敗!{Colors.ENDC}")
        print(f" {Colors.RED}請確認後端 API 正在運行:{Colors.ENDC}")
        print(f" {Colors.DIM}cd apps/api && uvicorn src.main:app --reload{Colors.ENDC}")
        return False
    except Exception as e:
        # Covers non-JSON bodies and anything else unexpected.
        print(f"\n{Colors.RED}{Colors.BOLD}✗ 發生錯誤:{e}{Colors.ENDC}")
        return False
def main():
    """CLI entry point: parse options, fire one test alert, exit 0 or 1."""
    usage_epilog = """
可用告警類型:
db_connection_timeout PostgreSQL Database OOM (CRITICAL)
k8s_pod_crash Pod CrashLoopBackOff (MEDIUM)
high_cpu CPU Spike / Latency (MEDIUM)
disk_full Disk Full Warning (CRITICAL)
ssl_expiry SSL Certificate Expiry (LOW)
範例:
python -m scripts.fire_test_alert
python -m scripts.fire_test_alert --type db_connection_timeout
python -m scripts.fire_test_alert --type k8s_pod_crash --severity critical
"""
    cli = argparse.ArgumentParser(
        description="🚀 AWOOOI Phase 2 導彈腳本 - 發射測試告警",
        # Raw formatter keeps the line breaks of the epilog as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_epilog,
    )
    # Scenario to fire; restricted to the keys of ALERT_SCENARIOS.
    cli.add_argument(
        "--type", "-t",
        type=str,
        default="db_connection_timeout",
        choices=list(ALERT_SCENARIOS.keys()),
        help="告警類型 (預設: db_connection_timeout)",
    )
    # Optional severity override; when omitted the scenario's own value is used.
    cli.add_argument(
        "--severity", "-s",
        type=str,
        choices=["info", "warning", "critical"],
        help="覆蓋嚴重度 (預設使用場景預設值)",
    )
    options = cli.parse_args()

    print_banner()
    delivered = asyncio.run(fire_alert(options.type, options.severity))
    print_footer()

    # Exit status 0 when fire_alert reported success, 1 otherwise.
    if delivered:
        sys.exit(0)
    sys.exit(1)


# Run the CLI only when executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
Phase 6.3 聚合測試腳本
=======================
功能:
1. 連續打入 3 筆「同源但不同名」的測試告警
2. 證明這 3 筆告警被成功「聚合」進同一個 Incident 的 signals 陣列中
3. 驗證 affected_services 有被正確填入
使用方式:
cd apps/api
python scripts/test_phase63_aggregation.py
預期結果:
- 3 筆告警全部聚合到 1 個 Incident
- signals 陣列長度 = 3
- affected_services 包含 "payment-service"
"""
import asyncio
import json
import httpx
from datetime import datetime
import time
# API endpoints
API_BASE = "http://localhost:8000"
SIGNALS_ENDPOINT = f"{API_BASE}/api/v1/webhooks/signals"
# Test alerts: same namespace and same target, but different alert_name.
# Simulates a series of related problems hitting payment-service.
# Note: severity may only be info | warning | critical (per the SignalPayload definition).
TEST_ALERTS = [
    {
        "alert_name": "PaymentServiceHighLatency",
        "severity": "warning",
        "source": "prometheus",
        "namespace": "payment-prod",
        "target": "payment-service",
        "fingerprint": "fp_latency_001",
        "labels": {"namespace": "payment-prod", "pod": "payment-service-abc123"},
        "annotations": {"summary": "Payment service latency > 500ms"},
    },
    {
        "alert_name": "PaymentServiceErrorRate",
        "severity": "warning",  # originally "high", but the API only accepts info|warning|critical
        "source": "prometheus",
        "namespace": "payment-prod",
        "target": "payment-service",
        "fingerprint": "fp_error_001",
        "labels": {"namespace": "payment-prod", "pod": "payment-service-abc123"},
        "annotations": {"summary": "Payment service error rate > 5%"},
    },
    {
        "alert_name": "PaymentServicePodCrash",
        "severity": "critical",
        "source": "alertmanager",
        "namespace": "payment-prod",
        "target": "payment-service",
        "fingerprint": "fp_crash_001",
        "labels": {"namespace": "payment-prod", "pod": "payment-service-abc123"},
        "annotations": {"summary": "Payment service pod crashed"},
    },
]
async def send_alert(client: httpx.AsyncClient, alert: dict, index: int) -> dict:
    """Post a single test alert to the signals endpoint and print the outcome.

    Args:
        client: Open HTTP client used for the request.
        alert: Alert payload; 'alert_name', 'severity', 'namespace' and
            'target' are printed before sending.
        index: Zero-based position of the alert in TEST_ALERTS (shown 1-based).

    Returns:
        dict: The decoded JSON reply, or an empty dict when the body is not
        valid JSON.
    """
    # Derive the total from the alert list instead of hard-coding it, so the
    # progress label stays correct when alerts are added or removed.
    total = len(TEST_ALERTS)
    print(f"\n[{index+1}/{total}] 發送告警: {alert['alert_name']}")
    print(f" severity: {alert['severity']}")
    print(f" namespace: {alert['namespace']}")
    print(f" target: {alert['target']}")
    response = await client.post(
        SIGNALS_ENDPOINT,
        json=alert,
        timeout=10.0,
    )
    # Report the status before decoding so it is visible even if decoding fails.
    print(f" status_code: {response.status_code}")
    try:
        result = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): show it and keep going
        # rather than aborting the whole run with a traceback.
        print(f" response: {response.text}")
        return {}
    print(f" message_id: {result.get('message_id', 'N/A')}")
    return result
async def check_redis_incident(client: httpx.AsyncClient) -> dict | None:
    """Placeholder for looking up the aggregated Incident; always returns None.

    Args:
        client: HTTP client; currently unused.

    Returns:
        None. No lookup is implemented here.
    """
    # Not implemented: the Incident would have to be read from Redis, either
    # through an API or directly. The previous try/except wrapped nothing that
    # could raise, so its except branch was unreachable and has been dropped.
    return None
async def main():
    """Send the three test alerts, wait, then print manual verification steps.

    Returns early (after printing a hint) when the health endpoint cannot be
    reached. No result is verified programmatically; the Redis and log
    commands to run by hand are printed instead.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print(heavy_rule, "Phase 6.3 聚合測試", heavy_rule, sep="\n")
    print(f"時間: {datetime.now().isoformat()}")
    print("目標: 驗證 3 筆同源告警聚合到 1 個 Incident")
    print()

    async with httpx.AsyncClient() as client:
        # Step 0: make sure the API answers at all.
        print("[0] 檢查 API 健康狀態...")
        try:
            health = await client.get(f"{API_BASE}/api/v1/health", timeout=5.0)
            print(f" API status: {health.status_code}")
        except Exception as exc:
            print(f" API 連線失敗: {exc}")
            print(" 請確認 API 已啟動: docker compose up -d")
            return

        # Stage 1: send the alerts one after another.
        print(f"\n{light_rule}", "階段一: 連續發送 3 筆告警", light_rule, sep="\n")
        results = []
        for position, payload in enumerate(TEST_ALERTS):
            results.append(await send_alert(client, payload, position))
            # Short pause so the consumer has time to process each signal.
            await asyncio.sleep(0.5)

        # Stage 2: give the consumer time to finish.
        print(f"\n{light_rule}", "階段二: 等待 Consumer 處理 (3 秒)", light_rule, sep="\n")
        await asyncio.sleep(3)

    # Stages 3+: static instructions, log command and the pass/fail checklist.
    closing_lines = (
        f"\n{light_rule}",
        "階段三: 驗證指令",
        light_rule,
        "",
        "請執行以下 Redis 指令檢查聚合結果:",
        "",
        "# 1. 查看所有 Incident keys",
        "docker exec -it awoooi-redis redis-cli KEYS 'incident:*'",
        "",
        "# 2. 查看特定 Incident 的 JSON (取代 <INCIDENT_ID>)",
        "docker exec -it awoooi-redis redis-cli GET 'incident:INC-XXXXXXXX-XXXXXX'",
        "",
        "# 3. 或直接用以下指令掃描並輸出所有 Incident:",
        """docker exec -it awoooi-redis redis-cli --no-raw KEYS 'incident:INC-*' | xargs -I {} docker exec -i awoooi-redis redis-cli GET {}""",
        "",
        light_rule,
        "檢查 API 日誌:",
        light_rule,
        "docker logs awoooi-api --tail 50 | grep -E '(signal_|incident_|aggregat)'",
        "",
        light_rule,
        "驗證標準 (PASS/FAIL)",
        light_rule,
        "[ ] 只有 1 個 Incident 被建立 (非 3 個)",
        "[ ] signals 陣列長度 = 3",
        "[ ] affected_services 包含 'payment-service'",
        "[ ] severity 升級為 'P0' (因為第三筆是 critical)",
        "",
        heavy_rule,
        "測試腳本執行完成",
        heavy_rule,
    )
    print("\n".join(closing_lines))


# Run the scenario only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,261 @@
#!/usr/bin/env python3
"""
Phase 6.4 全鏈路測試腳本
========================
功能:
1. 觸發假告警 (建立 Incident)
2. 呼叫 /proposal 端點 (產生決策)
3. 呼叫 /approvals/pending (模擬前端撈取待簽核清單)
4. 證明這條鏈路完全暢通
使用方式:
cd apps/api
python scripts/test_phase64_proposal.py
驗收標準:
- Incident 成功建立
- Proposal 成功生成
- Proposal 出現在 /approvals/pending 清單中
- 前端零改動即可渲染
"""
import asyncio
import json
from datetime import datetime
import httpx
# API endpoints (all derived from API_BASE)
API_BASE = "http://localhost:8000"
SIGNALS_ENDPOINT = f"{API_BASE}/api/v1/webhooks/signals"
INCIDENTS_ENDPOINT = f"{API_BASE}/api/v1/incidents"
APPROVALS_ENDPOINT = f"{API_BASE}/api/v1/approvals/pending"
async def send_test_alert() -> dict | None:
    """Post one critical PodCrashLoopBackOff alert to the signals endpoint.

    Returns:
        The decoded JSON reply on HTTP 200; None on any other status or on
        any exception (the problem is printed in both cases).
    """
    # Time-of-day suffix gives each run a different fingerprint.
    run_stamp = datetime.now().strftime('%H%M%S')
    payload = {
        "alert_name": "PodCrashLoopBackOff",
        "severity": "critical",  # P0
        "source": "prometheus",
        "namespace": "production",
        "target": "api-gateway",
        "fingerprint": f"fp_test_{run_stamp}",
        "labels": {
            "namespace": "production",
            "pod": "api-gateway-abc123",
        },
        "annotations": {
            "summary": "Pod api-gateway is in CrashLoopBackOff state",
        },
    }
    async with httpx.AsyncClient() as http:
        try:
            reply = await http.post(
                SIGNALS_ENDPOINT,
                json=payload,
                timeout=10.0,
            )
            if reply.status_code != 200:
                print(f" [ERROR] status_code: {reply.status_code}")
                print(f" [ERROR] response: {reply.text}")
                return None
            return reply.json()
        except Exception as exc:
            print(f" [ERROR] {exc}")
            return None
async def wait_for_incident(namespace: str, timeout: int = 10) -> str | None:
    """Poll the incidents endpoint until the test Incident shows up.

    Args:
        namespace: Accepted but not used by the lookup; incidents are matched
            only by "api-gateway" appearing in their 'affected_services'.
        timeout: Number of polling attempts, with a one-second pause after
            each attempt.

    Returns:
        The 'incident_id' of the first matching incident, or None when no
        match was found within the allowed attempts.
    """
    async with httpx.AsyncClient() as http:
        for _attempt in range(timeout):
            try:
                reply = await http.get(
                    INCIDENTS_ENDPOINT,
                    timeout=5.0,
                )
                if reply.status_code == 200:
                    listing = reply.json()
                    for entry in listing.get("incidents", []):
                        # Match on the target service used by the test alert.
                        services = entry.get("affected_services", [])
                        if "api-gateway" in services:
                            return entry.get("incident_id")
            except Exception:
                # Best-effort polling: ignore this attempt and try again.
                pass
            await asyncio.sleep(1)
    return None
async def generate_proposal(incident_id: str) -> dict | None:
    """Ask the API to generate a decision proposal for an incident.

    Args:
        incident_id: Identifier placed in the /proposal URL.

    Returns:
        The decoded JSON reply on HTTP 200; None on any other status or on
        any exception (the problem is printed in both cases).
    """
    proposal_url = f"{INCIDENTS_ENDPOINT}/{incident_id}/proposal"
    async with httpx.AsyncClient() as http:
        try:
            reply = await http.post(
                proposal_url,
                timeout=10.0,
            )
            if reply.status_code != 200:
                print(f" [ERROR] status_code: {reply.status_code}")
                print(f" [ERROR] response: {reply.text}")
                return None
            return reply.json()
        except Exception as exc:
            print(f" [ERROR] {exc}")
            return None
async def get_pending_approvals() -> dict | None:
    """Fetch the list of approvals that are waiting for sign-off.

    Returns:
        The decoded JSON reply on HTTP 200; None on any other status or on
        any exception (the problem is printed in both cases).
    """
    async with httpx.AsyncClient() as http:
        try:
            reply = await http.get(
                APPROVALS_ENDPOINT,
                timeout=10.0,
            )
            if reply.status_code != 200:
                print(f" [ERROR] status_code: {reply.status_code}")
                return None
            return reply.json()
        except Exception as exc:
            print(f" [ERROR] {exc}")
            return None
async def main():
    """Run the chain alert -> Incident -> Proposal -> pending approvals.

    Each stage prints its outcome; the function returns early with a [FAIL]
    line as soon as a stage cannot proceed. A pass/fail summary is printed
    when every stage completes.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    def section(title: str) -> None:
        # Blank line, rule, title, rule.
        print("\n" + light_rule)
        print(title)
        print(light_rule)

    print(heavy_rule)
    print("Phase 6.4 全鏈路測試: Incident → Proposal → Pending Approvals")
    print(heavy_rule)
    print(f"時間: {datetime.now().isoformat()}")
    print()

    # 0. Health check.
    print("[0] 檢查 API 健康狀態...")
    async with httpx.AsyncClient() as http:
        try:
            health = await http.get(f"{API_BASE}/api/v1/health", timeout=5.0)
            print(f" API status: {health.status_code}")
        except Exception as exc:
            print(f" API 連線失敗: {exc}")
            print(" 請確認 API 已啟動: docker compose up -d")
            return

    # 1. Send the test alert.
    section("[1] 發送測試告警 (建立 Incident)")
    result = await send_test_alert()
    if not result:
        print(" [FAIL] 無法發送告警")
        return
    print(f" message_id: {result.get('message_id', 'N/A')}")
    print(f" success: {result.get('success', False)}")

    # 2. Wait for the Incident to be created.
    section("[2] 等待 Consumer 處理並建立 Incident (最多 10 秒)")
    incident_id = await wait_for_incident("production")
    if not incident_id:
        print(" [FAIL] 無法找到測試 Incident")
        print(" 請檢查 API 日誌: docker logs awoooi-api --tail 50")
        return
    print(f" incident_id: {incident_id}")
    print(" [OK] Incident 已建立")

    # 3. Generate the proposal.
    section("[3] 呼叫 /proposal 端點生成決策")
    proposal_result = await generate_proposal(incident_id)
    if not proposal_result or not proposal_result.get("success"):
        print(" [FAIL] 無法生成 Proposal")
        print(f" message: {proposal_result.get('message') if proposal_result else 'N/A'}")
        return
    proposal = proposal_result.get("proposal", {})
    print(f" proposal_id: {proposal.get('id', 'N/A')}")
    print(f" action: {proposal.get('action', 'N/A')[:60]}...")
    print(f" risk_level: {proposal.get('risk_level', 'N/A')}")
    print(f" required_signatures: {proposal.get('required_signatures', 'N/A')}")
    print(f" incident_status: {proposal_result.get('incident_status', 'N/A')}")
    print(" [OK] Proposal 已生成")

    # 4. Check the pending-approvals list.
    section("[4] 呼叫 /approvals/pending 驗證前端相容性")
    pending = await get_pending_approvals()
    if not pending:
        print(" [FAIL] 無法取得待簽核清單")
        return
    print(f" count: {pending.get('count', 0)}")

    # Look for our proposal among the pending approvals.
    for approval in pending.get("approvals", []):
        if approval.get("id") == proposal.get("id"):
            print(" [FOUND] Proposal 出現在待簽核清單中!")
            print()
            print(" === PendingApprovalsResponse JSON ===")
            print(json.dumps({
                "count": pending.get("count"),
                "target_approval": approval,
            }, indent=2, ensure_ascii=False, default=str))
            break
    else:
        # Loop ended without a match.
        print(" [WARN] Proposal 未出現在待簽核清單中")
        print(" (可能因為 risk_level=LOW 已自動批准)")

    # 5. Final summary.
    print("\n" + heavy_rule)
    print("驗證結果")
    print(heavy_rule)
    checks = [
        ("Incident 建立", incident_id is not None),
        ("Proposal 生成", proposal_result.get("success", False)),
        ("風險評估", proposal.get("risk_level") is not None),
        ("狀態推進 (MITIGATING)", proposal_result.get("incident_status") == "mitigating"),
        ("前端相容 (/approvals/pending)", pending is not None),
    ]
    for name, passed in checks:
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"[{status}] {name}")
    all_passed = all(passed for _name, passed in checks)

    print()
    print(heavy_rule)
    if all_passed:
        print("🎉 Phase 6.4 全鏈路測試 PASSED!")
        print(" 大腦已具備決策輸出能力!")
        print(" Decision Proposal API 已鑄造完成!")
    else:
        print("💥 Phase 6.4 全鏈路測試 FAILED!")
        print(" 請檢查上述失敗項目")
    print(heavy_rule)


# Run the scenario only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
Phase 6.3 Race Condition 測試腳本
==================================
功能:
1. 使用 asyncio.gather 同時發射 20 筆同源告警
2. 證明 Lua Script 原子操作成功擋下 Race Condition
3. 驗證最終 Incident JSON 精準包含 20 筆 Signals
使用方式:
cd apps/api
python scripts/test_race_condition.py
預期結果:
- 只有 1 個 Incident 被建立
- signals 陣列長度 = 20
- 無任何 Signal 遺失
統帥鐵律:
- 嚴禁人工 QA
- 必須程式化驗證
"""
import asyncio
import json
from datetime import datetime
import httpx
# API endpoints
API_BASE = "http://localhost:8000"
SIGNALS_ENDPOINT = f"{API_BASE}/api/v1/webhooks/signals"
# Number of alerts fired concurrently
CONCURRENT_SIGNALS = 20
# Namespace and target shared by every test alert (same origin)
TEST_NAMESPACE = "race-test-ns"
TEST_TARGET = "race-test-service"


def generate_alert(index: int) -> dict:
    """Build one test alert payload for the given index.

    All payloads share TEST_NAMESPACE and TEST_TARGET; alert_name and
    fingerprint differ per index.

    Args:
        index: Sequence number; zero-padded to three digits in the
            alert_name and fingerprint.

    Returns:
        dict: The alert payload.
    """
    padded = f"{index:03d}"
    labels = {
        "namespace": TEST_NAMESPACE,
        "test_index": str(index),
    }
    annotations = {
        "summary": f"Race condition test signal #{index}",
    }
    return {
        "alert_name": f"RaceConditionTest_{padded}",
        "severity": "warning",
        "source": "prometheus",
        "namespace": TEST_NAMESPACE,
        "target": TEST_TARGET,
        # Unique fingerprint per alert, to avoid deduplication.
        "fingerprint": f"fp_race_{padded}",
        "labels": labels,
        "annotations": annotations,
    }
async def send_alert(client: httpx.AsyncClient, index: int) -> dict:
    """Post the alert for one index and summarize the outcome.

    Args:
        client: Open HTTP client shared by all concurrent requests.
        index: Sequence number passed to generate_alert.

    Returns:
        dict: 'index', 'status_code', 'message_id' and 'success'. When an
        exception occurs, status_code is 0, success is False and an 'error'
        key holds the exception text.
    """
    payload = generate_alert(index)
    try:
        reply = await client.post(
            SIGNALS_ENDPOINT,
            json=payload,
            timeout=30.0,
        )
        status = reply.status_code
        message_id = reply.json().get("message_id")
    except Exception as exc:
        return {
            "index": index,
            "status_code": 0,
            "message_id": None,
            "success": False,
            "error": str(exc),
        }
    return {
        "index": index,
        "status_code": status,
        "message_id": message_id,
        "success": status == 200,
    }
async def fire_concurrent_alerts() -> list[dict]:
    """Fire CONCURRENT_SIGNALS alerts at once over a single HTTP client.

    Returns:
        list[dict]: One send_alert summary per index, in index order.
    """
    async with httpx.AsyncClient() as client:
        pending = (send_alert(client, i) for i in range(CONCURRENT_SIGNALS))
        outcomes = await asyncio.gather(*pending)
    return list(outcomes)
async def verify_redis_incident() -> dict | None:
    """Find the test Incident by reading Redis through `docker exec`.

    Lists keys matching 'incident:INC-*', loads each value as JSON and
    returns the first one containing a signal labelled with TEST_NAMESPACE.

    Returns:
        The matching incident dict, or None when there are no keys or no
        incident matches. Values that are not valid JSON are skipped.
    """
    import subprocess

    def redis_cli(*args: str) -> str:
        # Run redis-cli inside the awoooi-redis container; return trimmed stdout.
        completed = subprocess.run(
            ["docker", "exec", "awoooi-redis", "redis-cli", *args],
            capture_output=True,
            text=True,
        )
        return completed.stdout.strip()

    listing = redis_cli("KEYS", "incident:INC-*")
    keys = [entry.strip() for entry in listing.split("\n") if entry.strip()]

    # Check every incident and pick the one that carries our test namespace.
    for key in keys:
        raw = redis_cli("GET", key)
        if not raw:
            continue
        try:
            incident = json.loads(raw)
        except json.JSONDecodeError:
            continue
        is_ours = any(
            signal.get("labels", {}).get("namespace") == TEST_NAMESPACE
            for signal in incident.get("signals", [])
        )
        if is_ours:
            return incident
    return None
async def main():
    """Fire the concurrent alerts, then check the aggregated Incident in Redis.

    Prints a PASS/FAIL line for each of three checks (signal count, unique
    alert names, affected_services) and an overall verdict. Returns early
    when the API is unreachable or no test Incident is found.
    """
    import subprocess

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    def section(title: str) -> None:
        # Blank line, rule, title, rule.
        print("\n" + light_rule)
        print(title)
        print(light_rule)

    print(heavy_rule)
    print("Phase 6.3 Race Condition 併發測試")
    print(heavy_rule)
    print(f"時間: {datetime.now().isoformat()}")
    print(f"併發數量: {CONCURRENT_SIGNALS} 筆告警")
    print(f"測試 Namespace: {TEST_NAMESPACE}")
    print(f"測試 Target: {TEST_TARGET}")
    print()

    # 0. Delete the index keys left by a previous run, if any.
    print("[0] 準備測試環境...")
    subprocess.run(
        [
            "docker", "exec", "awoooi-redis", "redis-cli",
            "DEL",
            f"incident:idx:ns:{TEST_NAMESPACE}",
            f"incident:idx:target:{TEST_TARGET}",
        ],
        capture_output=True,
    )
    print(" 已清除舊索引")

    # 1. Health check.
    print("\n[1] 檢查 API 健康狀態...")
    async with httpx.AsyncClient() as http:
        try:
            health = await http.get(f"{API_BASE}/api/v1/health", timeout=5.0)
            print(f" API status: {health.status_code}")
        except Exception as exc:
            print(f" API 連線失敗: {exc}")
            print(" 請確認 API 已啟動: docker compose up -d")
            return

    # 2. Fire everything concurrently and time it.
    section("[2] 併發發射 20 筆告警 (asyncio.gather)")
    started = datetime.now()
    results = await fire_concurrent_alerts()
    finished = datetime.now()
    duration = (finished - started).total_seconds()
    success_count = sum(1 for r in results if r["success"])
    fail_count = sum(1 for r in results if not r["success"])
    print("\n發射結果:")
    print(f" 成功: {success_count}/{CONCURRENT_SIGNALS}")
    print(f" 失敗: {fail_count}/{CONCURRENT_SIGNALS}")
    print(f" 耗時: {duration:.3f}")
    if fail_count > 0:
        print("\n失敗詳情:")
        for r in results:
            if not r["success"]:
                print(f" - Index {r['index']}: {r.get('error', 'Unknown')}")

    # 3. Give the consumer time to process.
    section("[3] 等待 Consumer 處理 (5 秒)")
    await asyncio.sleep(5)

    # 4. Read the Incident back from Redis.
    section("[4] 驗證 Redis Incident")
    incident = await verify_redis_incident()
    if not incident:
        print("\n❌ 錯誤: 找不到測試 Incident!")
        print(" 請檢查 API 日誌: docker logs awoooi-api --tail 100")
        return
    incident_id = incident.get("incident_id", "N/A")
    signals = incident.get("signals", [])
    signal_count = len(signals)
    severity = incident.get("severity", "N/A")
    affected_services = incident.get("affected_services", [])
    print("\n找到 Incident:")
    print(f" incident_id: {incident_id}")
    print(f" signal_count: {signal_count}")
    print(f" severity: {severity}")
    print(f" affected_services: {affected_services}")

    # 5. Verdicts.
    print("\n" + heavy_rule)
    print("驗證結果")
    print(heavy_rule)

    # Only signals produced by this test count towards the total.
    race_signals = [
        s for s in signals
        if s.get("alert_name", "").startswith("RaceConditionTest_")
    ]
    race_signal_count = len(race_signals)
    unique_names = {s.get("alert_name") for s in race_signals}
    print()

    # Check 1: every fired signal was aggregated.
    count_ok = race_signal_count == CONCURRENT_SIGNALS
    if not count_ok:
        print(f"[❌ FAIL] Signal 數量: {race_signal_count}/{CONCURRENT_SIGNALS}")
        print(f" 遺失 {CONCURRENT_SIGNALS - race_signal_count} 筆 Signal!")
    else:
        print(f"[✅ PASS] Signal 數量: {race_signal_count}/{CONCURRENT_SIGNALS}")

    # Check 2: no alert name appears more than once.
    names_ok = len(unique_names) == race_signal_count
    if not names_ok:
        print(f"[❌ FAIL] 唯一告警名稱: {len(unique_names)} 個 (有重複被覆蓋)")
    else:
        print(f"[✅ PASS] 唯一告警名稱: {len(unique_names)} 個 (無重複)")

    # Check 3: the target service is recorded as affected.
    services_ok = TEST_TARGET in affected_services
    if not services_ok:
        print(f"[❌ FAIL] affected_services 不包含 '{TEST_TARGET}'")
    else:
        print(f"[✅ PASS] affected_services 包含 '{TEST_TARGET}'")

    passed = count_ok and names_ok and services_ok

    # Overall verdict.
    print()
    print(heavy_rule)
    if passed:
        print("🎉 Race Condition 測試 PASSED!")
        print(f" {CONCURRENT_SIGNALS} 筆併發告警全部成功聚合!")
        print(" Lua Script 原子操作有效防止了資料遺失!")
    else:
        print("💥 Race Condition 測試 FAILED!")
        print(" 存在資料遺失,需要進一步調查!")
    print(heavy_rule)

    # Command for inspecting the detailed logs.
    print("\n檢查詳細日誌:")
    print("docker logs awoooi-api --tail 100 | grep -E '(atomic|aggregate|race)'")


# Run the scenario only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Phase 6.1 測試腳本: Redis Streams Signal 流程驗證
=================================================
功能:
1. 發送測試 Signal 到 /api/v1/webhooks/signals
2. 驗證 Redis Stream 中有新訊息
3. 輸出 Stream 狀態
使用:
python scripts/test_signal_stream.py
環境變數:
API_BASE_URL: API 基礎 URL (預設: http://localhost:8000)
"""
import asyncio
import json
import os
import sys
import httpx
# Base URL of the API; overridable through the API_BASE_URL environment variable.
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
# Full URL of the signals webhook that the test signal is posted to.
SIGNAL_ENDPOINT = f"{API_BASE_URL}/api/v1/webhooks/signals"
async def send_test_signal() -> dict:
    """Post one fixed test signal to SIGNAL_ENDPOINT.

    Returns:
        dict: The decoded JSON reply.

    Raises:
        httpx.HTTPStatusError: When the reply has an error status
            (raised by raise_for_status).
    """
    labels = {"team": "devops", "env": "test"}
    annotations = {"runbook_url": "https://wiki.example.com/runbook"}
    signal = {
        "source": "test-script",
        "alert_name": "TestSignal",
        "severity": "warning",
        "namespace": "awoooi-test",
        "target": "test-pod-123",
        "message": "Phase 6.1 Event Bus 驗證測試",
        "labels": labels,
        "annotations": annotations,
    }
    async with httpx.AsyncClient(timeout=10.0) as http:
        reply = await http.post(SIGNAL_ENDPOINT, json=signal)
        reply.raise_for_status()
        return reply.json()
async def main():
    """Send the test signal, then print the manual follow-up checks.

    Exits with status 1 when sending fails for any reason.
    """
    rule = "=" * 60

    print(rule, "Phase 6.1 Event Bus 測試", rule, sep="\n")
    print()
    print(f"[1] 發送測試 Signal 到 {SIGNAL_ENDPOINT}")
    try:
        result = await send_test_signal()
        print(" ✅ 成功!")
        print(f" Message ID: {result.get('message_id')}")
        print(f" Stream: {result.get('stream')}")
    except httpx.HTTPStatusError as http_error:
        # The API answered, but with an error status.
        print(f" ❌ HTTP 錯誤: {http_error.response.status_code}")
        print(f" {http_error.response.text}")
        sys.exit(1)
    except Exception as exc:
        print(f" ❌ 錯誤: {exc}")
        sys.exit(1)

    # Steps 2 and 3 are manual: only the commands to run are printed.
    follow_up = (
        "",
        "[2] 驗證 Signal Worker (Consumer) 是否收到訊息",
        " 查看 API 日誌: docker logs awoooi-api | grep signal_received",
        "",
        "[3] 手動檢查 Redis Stream 狀態",
        " redis-cli XINFO STREAM stream:awoooi_signals",
        " redis-cli XINFO GROUPS stream:awoooi_signals",
        "",
        rule,
        "測試完成!",
        rule,
    )
    print("\n".join(follow_up))


# Run the check only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,360 @@
#!/usr/bin/env python3
"""
Tracer Bullet 2.0 - 全站閉環測試腳本
Phase 4: E2E Integration Test
測試流程:
1. 觸發假告警 (Mock Alert)
2. GraphRAG 分析 (Blast Radius + Root Cause)
3. 產生 ApprovalCard (Dry-Run)
4. 人類批准 (Multi-Sig)
5. MCP 模擬執行
執行方式:
cd apps/api
python scripts/tracer_bullet_2.py
"""
import asyncio
import json
from datetime import datetime
# ==================== 模擬模組導入 ====================
# 實際運行時這些會從專案導入
# from src.services import (
# topology_graph, trust_engine, multi_sig_engine, dry_run_engine
# )
# from src.plugins.finops import idle_scanner
# from src.plugins.mcp import mcp_bridge
# ==================== Test Configuration ====================
class TracerBullet2:
"""全站閉環測試器"""
def __init__(self):
self.test_id = f"tb2-{datetime.utcnow().strftime('%Y%m%d%H%M%S')}"
self.results: list[dict] = []
def log(self, step: str, status: str, data: dict | None = None):
"""記錄測試結果"""
result = {
"step": step,
"status": status,
"timestamp": datetime.utcnow().isoformat(),
"data": data or {},
}
self.results.append(result)
emoji = "" if status == "PASS" else "" if status == "FAIL" else "🔄"
print(f"{emoji} [{step}] {status}")
if data:
print(f" {json.dumps(data, indent=2, default=str)}")
# ==================== Step 1: Mock Alert ====================
async def step1_trigger_alert(self) -> dict:
"""
Step 1: 觸發假告警
模擬 Prometheus AlertManager 發送告警:
- frontend 服務 5xx 錯誤率上升
"""
print("\n" + "=" * 60)
print("STEP 1: TRIGGER MOCK ALERT")
print("=" * 60)
alert = {
"alertname": "HighErrorRate",
"service": "frontend",
"namespace": "production",
"severity": "critical",
"error_rate": 15.2, # 15% 5xx
"threshold": 5.0,
"fired_at": datetime.utcnow().isoformat(),
}
self.log("trigger_alert", "PASS", alert)
return alert
# ==================== Step 2: GraphRAG Analysis ====================
async def step2_graphrag_analysis(self, alert: dict) -> dict:
"""
Step 2: GraphRAG 分析
呼叫 TopologyGraph.get_blast_radius_and_root_cause()
分析:
- Blast Radius: frontend 掛了誰會跟著掛
- Root Cause: frontend 的依賴誰目前有問題
"""
print("\n" + "=" * 60)
print("STEP 2: GRAPHRAG ANALYSIS")
print("=" * 60)
target_service = alert["service"]
# Mock GraphRAG 結果 (實際會呼叫 topology_graph)
analysis = {
"targetService": target_service,
"blastRadius": {
"affectedServices": ["ingress"],
"affectedCount": 1,
"criticalPath": ["ingress -> frontend"],
"impactSummary": "If 'frontend' goes down, 1 upstream service (ingress) will be affected.",
},
"rootCause": {
"unhealthyDependencies": ["postgres-db"],
"dependencyChain": ["auth-service", "product-api", "order-api", "postgres-db", "redis-cache"],
"probableRootCauses": ["postgres-db"],
"analysisSummary": "Found 1 unhealthy dependency: postgres-db. Probable root cause: postgres-db.",
},
"analyzedAt": datetime.utcnow().isoformat(),
}
# 視覺化輸出
print("\n[BLAST RADIUS - Upstream Impact]")
print(" ┌─────────────────────┐")
print(" │ ingress │")
print(" └─────────┬───────────┘")
print(" │ depends on")
print("")
print(" ┌─────────────────────┐")
print(" │ frontend │ X")
print(" └─────────────────────┘")
print("\n[ROOT CAUSE - Downstream Chain]")
print(" ┌─────────────────────┐")
print(" │ frontend │ !")
print(" └─────────┬───────────┘")
print(" │ calls")
print("")
print(" ┌─────────────────────┐")
print(" │ postgres-db │ X (UNHEALTHY)")
print(" └─────────────────────┘")
self.log("graphrag_analysis", "PASS", analysis)
return analysis
# ==================== Step 3: Dry-Run & ApprovalCard ====================
async def step3_generate_approval(self, analysis: dict) -> dict:
"""
Step 3: 產生 ApprovalCard
根據分析結果,建議重啟 postgres-db
執行 Dry-Run 檢查
"""
print("\n" + "=" * 60)
print("STEP 3: DRY-RUN & APPROVAL CARD")
print("=" * 60)
root_cause = analysis["rootCause"]["probableRootCauses"][0]
# 建議動作
proposed_action = {
"operation": "restart_pod",
"parameters": {
"pod_name": f"{root_cause}-0",
"namespace": "production",
"graceful": True,
},
"reason": f"Auto-suggested based on GraphRAG root cause analysis: {root_cause} is unhealthy",
}
# Mock Dry-Run 結果
dry_run_result = {
"checks": [
{"name": "RBAC Permission", "passed": True, "message": "User has pod/delete permission"},
{"name": "Syntax Validation", "passed": True, "message": "Parameters valid"},
{"name": "Resource Exists", "passed": True, "message": "Pod postgres-db-0 exists"},
{"name": "No PDB Violation", "passed": True, "message": "PodDisruptionBudget allows 1 eviction"},
],
"overallPassed": True,
"blastRadius": {
"affectedPods": 1,
"affectedServices": ["postgres-db"],
"dataImpact": "NONE", # Graceful restart
},
"riskLevel": "high", # Database 操作
}
# 產生 ApprovalCard
approval_card = {
"approvalId": f"approval-{self.test_id}",
"action": proposed_action,
"dryRunResult": dry_run_result,
"requiredSignatures": 2, # HIGH risk = 2-sig
"allowedRoles": ["admin", "devops", "sre"],
"createdAt": datetime.utcnow().isoformat(),
"expiresAt": None, # No expiry for critical ops
}
print("\n[APPROVAL CARD]")
print(f" Action: {proposed_action['operation']}")
print(f" Target: {proposed_action['parameters']['pod_name']}")
print(f" Risk Level: {dry_run_result['riskLevel'].upper()}")
print(f" Required Signatures: {approval_card['requiredSignatures']}")
print(f" Dry-Run: {'PASSED' if dry_run_result['overallPassed'] else 'FAILED'}")
self.log("generate_approval", "PASS", approval_card)
return approval_card
# ==================== Step 4: Multi-Sig Approval ====================
async def step4_multisig_approval(self, approval_card: dict) -> dict:
"""
Step 4: 人類批准 (Multi-Sig)
模擬兩位管理者簽名:
1. DevOps Engineer
2. SRE Lead
"""
print("\n" + "=" * 60)
print("STEP 4: MULTI-SIG APPROVAL")
print("=" * 60)
approval_id = approval_card["approvalId"]
# 第一位簽名
sig1 = {
"userId": "devops-alice",
"role": "devops",
"signedAt": datetime.utcnow().isoformat(),
"comment": "GraphRAG analysis looks correct. Approving restart.",
}
print(f"\n[SIGNATURE 1] {sig1['role']}: {sig1['userId']}")
print(f" Comment: {sig1['comment']}")
# 第二位簽名
sig2 = {
"userId": "sre-bob",
"role": "sre",
"signedAt": datetime.utcnow().isoformat(),
"comment": "Verified PDB. Safe to proceed.",
}
print(f"\n[SIGNATURE 2] {sig2['role']}: {sig2['userId']}")
print(f" Comment: {sig2['comment']}")
# 批准結果
approval_result = {
"approvalId": approval_id,
"status": "APPROVED",
"signatures": [sig1, sig2],
"approvedAt": datetime.utcnow().isoformat(),
}
print(f"\n[APPROVAL STATUS] {approval_result['status']}")
print(f" Signatures: {len(approval_result['signatures'])}/{approval_card['requiredSignatures']}")
self.log("multisig_approval", "PASS", approval_result)
return approval_result
# ==================== Step 5: MCP Execution ====================
async def step5_mcp_execution(self, approval_result: dict, approval_card: dict) -> dict:
"""
Step 5: MCP 模擬執行
透過 MCP Bridge 執行操作
(Phase 3 為模擬Phase 4+ 連接真實 K8s)
"""
print("\n" + "=" * 60)
print("STEP 5: MCP EXECUTION")
print("=" * 60)
action = approval_card["action"]
# TOCTOU 保護: 再次執行 Dry-Run
print("\n[TOCTOU CHECK] Re-running dry-run before execution...")
toctou_passed = True # Mock
print(f" Result: {'PASSED' if toctou_passed else 'VOIDED'}")
if not toctou_passed:
self.log("mcp_execution", "FAIL", {"reason": "TOCTOU conflict detected"})
return {"status": "VOIDED"}
# MCP 執行
execution_result = {
"executionId": f"exec-{self.test_id}",
"operation": action["operation"],
"parameters": action["parameters"],
"status": "SUCCESS",
"output": {
"message": f"Pod {action['parameters']['pod_name']} restarted successfully",
"newPodName": "postgres-db-0", # Same name after restart
"restartTime": "2.3s",
},
"executedAt": datetime.utcnow().isoformat(),
}
print(f"\n[EXECUTION RESULT]")
print(f" Status: {execution_result['status']}")
print(f" Output: {execution_result['output']['message']}")
print(f" Restart Time: {execution_result['output']['restartTime']}")
# 更新 Trust Engine
print("\n[TRUST ENGINE] Recording approval for progressive autonomy...")
print(" Action Pattern: restart_pod:postgres-*")
print(" Trust Score: +1")
self.log("mcp_execution", "PASS", execution_result)
return execution_result
# ==================== Run All ====================
async def run(self):
"""執行完整測試流程"""
print("\n" + "=" * 60)
print("TRACER BULLET 2.0 - FULL LOOP TEST")
print(f"Test ID: {self.test_id}")
print("=" * 60)
try:
# Step 1: Trigger Alert
alert = await self.step1_trigger_alert()
# Step 2: GraphRAG Analysis
analysis = await self.step2_graphrag_analysis(alert)
# Step 3: Dry-Run & Approval Card
approval_card = await self.step3_generate_approval(analysis)
# Step 4: Multi-Sig Approval
approval_result = await self.step4_multisig_approval(approval_card)
# Step 5: MCP Execution
execution_result = await self.step5_mcp_execution(approval_result, approval_card)
# Summary
print("\n" + "=" * 60)
print("TEST SUMMARY")
print("=" * 60)
passed = sum(1 for r in self.results if r["status"] == "PASS")
failed = sum(1 for r in self.results if r["status"] == "FAIL")
print(f" Total Steps: {len(self.results)}")
print(f" Passed: {passed}")
print(f" Failed: {failed}")
print(f" Status: {'ALL PASSED' if failed == 0 else 'SOME FAILED'}")
return {
"testId": self.test_id,
"status": "PASS" if failed == 0 else "FAIL",
"results": self.results,
}
except Exception as e:
self.log("unexpected_error", "FAIL", {"error": str(e)})
raise
# ==================== Main ====================
# Script entry point: build one tester and run the full five-step loop.
if __name__ == "__main__":
    tracer = TracerBullet2()
    asyncio.run(tracer.run())