feat: add all application source code
- apps/api: FastAPI backend with Dockerfile - apps/web: Next.js frontend with Dockerfile - apps/sensor: Signal collection agent - packages: shared packages Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
198
apps/api/scripts/apply_prometheus_config.sh
Executable file
198
apps/api/scripts/apply_prometheus_config.sh
Executable file
@@ -0,0 +1,198 @@
|
||||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# Prometheus Alertmanager 自動對接腳本
|
||||
# =============================================================================
|
||||
# Phase 5: Shadow Mode - 自動化環境對接
|
||||
#
|
||||
# 功能:
|
||||
# 1. 建立 Alertmanager ConfigMap
|
||||
# 2. 套用至 K3s 叢集
|
||||
# 3. 自動重載 Alertmanager
|
||||
#
|
||||
# 使用方式:
|
||||
# ./scripts/apply_prometheus_config.sh
|
||||
#
|
||||
# 前提條件:
|
||||
# - kubectl 已配置並可連線至 K3s (192.168.0.120)
|
||||
# - 有權限操作 monitoring namespace
|
||||
#
|
||||
# Tier 2 授權: 此腳本會變更 K3s 環境,需統帥授權
|
||||
# =============================================================================
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# -----------------------------------------------------------------------------
|
||||
# Target namespace and name of the ConfigMap this script manages.
NAMESPACE="monitoring"
CONFIGMAP_NAME="alertmanager-awoooi-webhook"
# Webhook receiver URL written into the ConfigMap. An exported
# AWOOOI_WEBHOOK_URL takes precedence over the built-in default.
AWOOOI_WEBHOOK_URL="${AWOOOI_WEBHOOK_URL:-http://192.168.0.188:8000/api/v1/webhooks/alerts}"
# kubeconfig file handed to every kubectl call; falls back to a file in the
# current working directory when KUBECONFIG is unset or empty.
# NOTE(review): a colon-separated KUBECONFIG list would fail the later -f
# file check - confirm single-file usage.
KUBECONFIG_PATH="${KUBECONFIG:-./k3s-prod.yaml}"

# Colors for output (escape sequences are expanded by the log helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Functions
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Print an informational message to stdout, prefixed with a green [INFO] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text (printed literally; backslashes are not
#                 interpreted, unlike with `echo -e`)
log_info() {
    printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1"
}
|
||||
|
||||
# Print a warning to stderr, prefixed with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text (printed literally; backslashes are not
#                 interpreted, unlike with `echo -e`)
log_warn() {
    printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2
}
|
||||
|
||||
# Print an error to stderr, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text (printed literally; backslashes are not
#                 interpreted, unlike with `echo -e`)
log_error() {
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
|
||||
|
||||
# Verify everything the script needs before it touches the cluster:
# kubectl is on PATH, the kubeconfig file exists, and `kubectl cluster-info`
# succeeds with that kubeconfig.
# Globals: KUBECONFIG_PATH (read)
# Exits the whole script with status 1 on the first failed check.
check_prerequisites() {
    log_info "檢查前提條件..."

    # Check kubectl
    if ! command -v kubectl &> /dev/null; then
        log_error "kubectl 未安裝"
        exit 1
    fi

    # Check kubeconfig
    if [[ ! -f "$KUBECONFIG_PATH" ]]; then
        log_error "找不到 kubeconfig: $KUBECONFIG_PATH"
        exit 1
    fi

    # Test connection (kubectl's own output is discarded; only the status matters)
    if ! kubectl --kubeconfig="$KUBECONFIG_PATH" cluster-info &> /dev/null; then
        log_error "無法連線至 K3s 叢集"
        exit 1
    fi

    log_info "前提條件檢查通過"
}
|
||||
|
||||
# Ensure the target namespace exists, creating it when `kubectl get namespace`
# does not succeed.
# Globals: KUBECONFIG_PATH, NAMESPACE (read)
# A failing `kubectl create` propagates as a non-zero status to the caller.
create_namespace_if_not_exists() {
    log_info "確認 namespace: $NAMESPACE..."

    if ! kubectl --kubeconfig="$KUBECONFIG_PATH" get namespace "$NAMESPACE" &> /dev/null; then
        log_info "建立 namespace: $NAMESPACE"
        kubectl --kubeconfig="$KUBECONFIG_PATH" create namespace "$NAMESPACE"
    else
        log_info "Namespace $NAMESPACE 已存在"
    fi
}
|
||||
|
||||
# Render the webhook-receiver ConfigMap and apply it to the cluster.
# Globals: KUBECONFIG_PATH, CONFIGMAP_NAME, NAMESPACE, AWOOOI_WEBHOOK_URL (read)
# The heredoc delimiter is unquoted on purpose so the ${...} placeholders are
# expanded before the manifest reaches kubectl. The manifest is fed to
# `kubectl apply -f -` on stdin.
# NOTE(review): the embedded comment "5 秒超時" (5-second timeout) has no
# matching timeout key in the receiver definition - confirm whether a timeout
# was meant to be configured.
apply_alertmanager_config() {
    log_info "套用 Alertmanager Webhook 設定..."

    # Create ConfigMap YAML
    kubectl --kubeconfig="$KUBECONFIG_PATH" apply -f - <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: ${CONFIGMAP_NAME}
  namespace: ${NAMESPACE}
  labels:
    app: alertmanager
    component: awoooi-webhook
data:
  alertmanager-webhook.yml: |
    # =============================================================================
    # AWOOOI Webhook Receiver Configuration
    # =============================================================================
    # 此設定檔定義 Alertmanager 如何將告警轉發至 AWOOOI OpenClaw
    #
    # 用法: 將此內容合併至主 alertmanager.yml 的 receivers 區段
    # =============================================================================

    receivers:
      - name: 'awoooi-openclaw'
        webhook_configs:
          - url: '${AWOOOI_WEBHOOK_URL}'
            send_resolved: true
            max_alerts: 10
            # 5 秒超時
            http_config:
              follow_redirects: true

    # 路由規則範例 (合併至主設定):
    # route:
    #   receiver: 'awoooi-openclaw'
    #   group_by: ['alertname', 'namespace']
    #   group_wait: 30s
    #   group_interval: 5m
    #   repeat_interval: 4h
    #   routes:
    #     - match:
    #         severity: critical
    #       receiver: 'awoooi-openclaw'
    #       group_wait: 10s
EOF

    log_info "ConfigMap ${CONFIGMAP_NAME} 已套用"
}
|
||||
|
||||
# Best-effort reload of a running Alertmanager via its /-/reload endpoint.
# Globals: KUBECONFIG_PATH, NAMESPACE (read)
# Returns: 0 in every case - a missing pod or a failed reload is reported as
#          a warning and never aborts the script.
reload_alertmanager() {
    log_info "嘗試重載 Alertmanager..."

    # Find Alertmanager pod. Any lookup failure (including the jsonpath error
    # on an empty pod list) is mapped to an empty name.
    local alertmanager_pod
    alertmanager_pod=$(kubectl --kubeconfig="$KUBECONFIG_PATH" get pods -n "$NAMESPACE" \
        -l app=alertmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")

    if [[ -z "$alertmanager_pod" ]]; then
        log_warn "找不到 Alertmanager Pod (可能尚未部署)"
        log_info "ConfigMap 已建立,待 Alertmanager 部署後可手動合併設定"
        return 0
    fi

    # Trigger reload via /-/reload endpoint. The outcome is checked so that
    # success is only reported when the POST inside the pod actually worked.
    log_info "觸發 Alertmanager 設定重載..."
    if kubectl --kubeconfig="$KUBECONFIG_PATH" exec -n "$NAMESPACE" "$alertmanager_pod" -- \
        wget -q -O- --post-data='' http://localhost:9093/-/reload 2>/dev/null; then
        log_info "Alertmanager 重載完成"
    else
        log_warn "Alertmanager 重載失敗 (可稍後手動觸發 /-/reload)"
    fi

    return 0
}
|
||||
|
||||
# Show the applied ConfigMap by dumping it as YAML to stdout.
# Globals: KUBECONFIG_PATH, CONFIGMAP_NAME, NAMESPACE (read)
# A failing `kubectl get` propagates as a non-zero status to the caller.
verify_config() {
    log_info "驗證 ConfigMap..."

    kubectl --kubeconfig="$KUBECONFIG_PATH" get configmap "$CONFIGMAP_NAME" -n "$NAMESPACE" -o yaml

    log_info "驗證完成"
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Main
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Entry point: print a banner, then run the steps in order - prerequisite
# checks, namespace creation, ConfigMap apply, Alertmanager reload and
# verification - and finish with a summary of next steps.
# Globals: AWOOOI_WEBHOOK_URL, NAMESPACE, CONFIGMAP_NAME (read)
main() {
    echo "============================================================"
    echo " AWOOOI Prometheus Alertmanager 自動對接"
    echo "============================================================"
    echo ""
    echo "目標: 將 Webhook 設定套用至 K3s 叢集"
    echo "Webhook URL: $AWOOOI_WEBHOOK_URL"
    echo "Namespace: $NAMESPACE"
    echo ""

    check_prerequisites
    create_namespace_if_not_exists
    apply_alertmanager_config
    reload_alertmanager
    verify_config

    echo ""
    echo "============================================================"
    echo " 對接完成"
    echo "============================================================"
    echo ""
    log_info "ConfigMap 已建立: $NAMESPACE/$CONFIGMAP_NAME"
    log_info "下一步: 將 receiver 設定合併至 alertmanager.yml"
    log_info "測試: 使用 scripts/fire_live_alert.py 發射測試告警"
}
|
||||
|
||||
main "$@"
|
||||
265
apps/api/scripts/demo_multisig.py
Normal file
265
apps/api/scripts/demo_multisig.py
Normal file
@@ -0,0 +1,265 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
CISO-101 Multi-Sig Demo Script
|
||||
==============================
|
||||
展示 CRITICAL 任務從發起到完成的完整信任鏈生命週期
|
||||
|
||||
流程:
|
||||
1. ClawBot 發起 CRITICAL 操作 (DROP TABLE)
|
||||
2. 第一位簽核者簽核 → 仍為 PENDING (1/2)
|
||||
3. 第二位簽核者簽核 → 轉為 APPROVED → 觸發執行
|
||||
|
||||
執行方式:
|
||||
cd apps/api
|
||||
source .venv/bin/activate
|
||||
python scripts/demo_multisig.py
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone, timedelta
|
||||
|
||||
# Add parent to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from src.models.approval import (
|
||||
ApprovalRequestCreate,
|
||||
ApprovalStatus,
|
||||
RiskLevel,
|
||||
BlastRadius,
|
||||
DataImpact,
|
||||
DryRunCheck,
|
||||
)
|
||||
from src.core.trust_engine import TrustEngine, get_required_signatures
|
||||
|
||||
|
||||
def print_header(title: str) -> None:
    """Print ``title`` between two 60-character ``=`` rules, after a blank line."""
    print("\n" + "=" * 60)
    print(f" {title}")
    print("=" * 60)
|
||||
|
||||
|
||||
def print_approval_status(approval) -> None:
    """Print a summary of an approval followed by its signatures, if any.

    Args:
        approval: object exposing ``id``, ``action``, ``status.value``,
            ``risk_level.value``, ``required_signatures``,
            ``current_signatures``, ``is_fully_signed`` and ``signatures``.
            Each signature exposes ``signer_name``, ``signer_id``,
            ``signed_at`` (must support ``strftime``) and ``comment``.
    """
    print(f"""
ID: {approval.id}
Action: {approval.action}
Status: {approval.status.value.upper()}
Risk Level: {approval.risk_level.value.upper()}
Required Sigs: {approval.required_signatures}
Current Sigs: {approval.current_signatures}
Is Fully Signed: {approval.is_fully_signed}
""")

    if approval.signatures:
        print(" Signatures:")
        for sig in approval.signatures:
            print(f" - {sig.signer_name} ({sig.signer_id}) at {sig.signed_at.strftime('%H:%M:%S')}")
            # The comment line is only shown when the signer left one.
            if sig.comment:
                print(f" Comment: {sig.comment}")
|
||||
|
||||
|
||||
def main():
    """Run the Multi-Sig demo.

    Walks one CRITICAL request through two signatures, asserting the state
    after each step, then shows that a LOW risk request is approved on
    creation. Any failed expectation raises ``AssertionError``.
    """

    print_header("CISO-101 Multi-Sig Trust Engine Demo")
    print("""
This demo shows the complete CRITICAL approval lifecycle:

1. ClawBot initiates a CRITICAL operation (DROP TABLE)
2. First signer signs → Still PENDING (1/2)
3. Second signer signs → APPROVED → Execution triggered
""")

    # ==========================================================================
    # Step 0: Show signature requirements
    # ==========================================================================
    print_header("Step 0: Signature Requirements")
    print("""
Risk Level Required Signatures
---------- -------------------
LOW 0 (Auto-approve)
MEDIUM 1
CRITICAL 2 (Multi-Sig)
""")

    for level in RiskLevel:
        req = get_required_signatures(level)
        print(f" {level.value.upper():10} → {req} signature(s)")

    # ==========================================================================
    # Step 1: Create CRITICAL approval request
    # ==========================================================================
    print_header("Step 1: ClawBot Initiates CRITICAL Operation")

    # Track approved requests
    approved_requests = []

    def on_approved(approval):
        approved_requests.append(approval)
        print(f"\n 🚀 EXECUTION TRIGGERED: {approval.action}")

    def on_rejected(approval):
        print(f"\n ❌ REJECTED: {approval.rejection_reason}")

    engine = TrustEngine(
        on_approved=on_approved,
        on_rejected=on_rejected,
    )

    # Create the CRITICAL request
    request = ApprovalRequestCreate(
        action="DROP TABLE user_sessions",
        description="清除所有用戶 session 以強制重新登入。此操作將影響所有線上用戶。",
        risk_level=RiskLevel.CRITICAL,
        blast_radius=BlastRadius(
            affected_pods=0,
            estimated_downtime="0",
            related_services=["auth-service", "api-gateway", "user-service"],
            data_impact=DataImpact.DESTRUCTIVE,
        ),
        dry_run_checks=[
            DryRunCheck(name="RBAC Check", passed=True, message="db-admin"),
            DryRunCheck(name="Syntax Check", passed=True),
            DryRunCheck(name="Backup Available", passed=False, message="No recent backup!"),
        ],
        requested_by="ClawBot",
        expires_at=datetime.now(timezone.utc) + timedelta(hours=1),
    )

    approval = engine.create_approval(request)

    print(f"""
ClawBot 發起 CRITICAL 操作請求:

動作: {request.action}
描述: {request.description}
風險等級: {request.risk_level.value.upper()}
資料影響: {request.blast_radius.data_impact.value.upper()}
""")

    print_approval_status(approval)

    # ==========================================================================
    # Step 2: First signer signs
    # ==========================================================================
    print_header("Step 2: First Signer (Alice) Signs")

    approval, message, triggered = engine.sign_approval(
        approval_id=approval.id,
        signer_id="alice-001",
        signer_name="Alice Chen (CTO)",
        comment="已確認風險,建議在低流量時段執行",
    )

    print(f"""
Alice (CTO) 已簽核:

結果: {message}
觸發執行: {triggered}
""")

    print_approval_status(approval)

    assert approval.status == ApprovalStatus.PENDING, "Should still be PENDING after first signature"
    assert approval.current_signatures == 1, "Should have 1 signature"
    assert not triggered, "Should not trigger execution yet"

    # ==========================================================================
    # Step 3: Second signer signs
    # ==========================================================================
    print_header("Step 3: Second Signer (Bob) Signs - Multi-Sig Complete")

    approval, message, triggered = engine.sign_approval(
        approval_id=approval.id,
        signer_id="bob-002",
        signer_name="Bob Wu (CISO)",
        comment="CISO 核准。已通知 DBA 團隊待命。",
    )

    print(f"""
Bob (CISO) 已簽核:

結果: {message}
觸發執行: {triggered}
""")

    print_approval_status(approval)

    assert approval.status == ApprovalStatus.APPROVED, "Should be APPROVED after second signature"
    assert approval.current_signatures == 2, "Should have 2 signatures"
    assert approval.is_fully_signed, "Should be fully signed"
    assert triggered, "Should trigger execution"

    # ==========================================================================
    # Step 4: Verify final state
    # ==========================================================================
    print_header("Step 4: Verification")

    pending = engine.get_pending_approvals()

    # Enforce the expectations the summary below states, so the check marks
    # cannot be printed for a state that does not actually hold.
    assert len(pending) == 0, "No approval should remain pending"
    assert len(approved_requests) == 1, "Exactly one approval callback should have fired"

    print(f"""
驗證結果:

✅ 待簽核清單數量: {len(pending)} (應為 0)
✅ 已批准請求數量: {len(approved_requests)} (應為 1)
✅ 最終狀態: {approval.status.value.upper()}
✅ 簽核數: {approval.current_signatures}/{approval.required_signatures}
✅ 解決時間: {approval.resolved_at.strftime('%Y-%m-%d %H:%M:%S') if approval.resolved_at else 'N/A'}
""")

    # ==========================================================================
    # Bonus: Demo LOW risk auto-approval
    # ==========================================================================
    print_header("Bonus: LOW Risk Auto-Approval Demo")

    low_request = ApprovalRequestCreate(
        action="Scale deployment api-backend to 5 replicas",
        description="增加後端服務副本數以應對流量增長",
        risk_level=RiskLevel.LOW,
        blast_radius=BlastRadius(
            affected_pods=5,
            estimated_downtime="0",
            related_services=["api-backend"],
            data_impact=DataImpact.NONE,
        ),
        dry_run_checks=[
            DryRunCheck(name="Resource Check", passed=True, message="5/20 pods"),
        ],
        requested_by="ClawBot",
    )

    low_approval = engine.create_approval(low_request)

    print(f"""
LOW 風險操作自動放行:

動作: {low_request.action}
風險等級: LOW
狀態: {low_approval.status.value.upper()} (自動批准!)
簽核數: {low_approval.required_signatures} (不需要簽核)
""")

    assert low_approval.status == ApprovalStatus.APPROVED, "LOW risk should be auto-approved"

    # ==========================================================================
    # Summary
    # ==========================================================================
    print_header("Demo Complete!")
    print("""
CISO-101 Multi-Sig Trust Engine 功能驗證完成:

✅ 風險等級分類 (LOW/MEDIUM/CRITICAL)
✅ 簽核數自動判定 (0/1/2)
✅ LOW 風險自動放行
✅ CRITICAL 雙重簽核 (Multi-Sig)
✅ 狀態機正確轉換 (PENDING → APPROVED)
✅ 簽核完成觸發執行回調

信任鏈完整性已驗證。
""")
|
||||
|
||||
|
||||
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
246
apps/api/scripts/e2e_openclaw_test.py
Normal file
246
apps/api/scripts/e2e_openclaw_test.py
Normal file
@@ -0,0 +1,246 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 5 E2E 點火測試 - OpenClaw 全鏈路驗證
|
||||
==========================================
|
||||
|
||||
測試流程:
|
||||
1. 發射模擬 K8s 告警到 Webhook
|
||||
2. 驗證告警被正確處理
|
||||
3. 驗證 ApprovalRecord 被建立
|
||||
4. 模擬 Telegram 簽核回調
|
||||
5. 驗證執行觸發
|
||||
|
||||
使用方式:
|
||||
python scripts/e2e_openclaw_test.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def print_header(title: str) -> None:
    """Print a test title between two 60-character ``=`` rules, after a blank line."""
    print("\n" + "=" * 60)
    print(f" {title}")
    print("=" * 60)
|
||||
|
||||
|
||||
def print_step(step: int, description: str) -> None:
    """Print a numbered test-step heading, preceded by a blank line."""
    print(f"\n🔹 Step {step}: {description}")
|
||||
|
||||
|
||||
def print_success(message: str) -> None:
    """Print ``message`` prefixed with a success check mark."""
    print(f" ✅ {message}")
|
||||
|
||||
|
||||
def print_error(message: str) -> None:
    """Print ``message`` prefixed with a failure cross mark."""
    print(f" ❌ {message}")
|
||||
|
||||
|
||||
def print_info(message: str) -> None:
    """Print ``message`` prefixed with an information symbol."""
    print(f" ℹ️ {message}")
|
||||
|
||||
|
||||
async def test_phase5_e2e():
    """Run the Phase 5 end-to-end smoke checks.

    Five steps run in order: log filtering, the Telegram security
    interceptor, Telegram message formatting, OpenClaw service loading and
    the Telegram audit fields on ``Signature``. Each step imports what it
    needs inside its own ``try`` block; any exception in a step (including a
    failed import or assertion) is printed and ends the run.

    Returns:
        bool: ``True`` when all five steps pass, ``False`` at the first
        failing step.
    """
    # Make the `src` package importable when this file is started as
    # `python scripts/e2e_openclaw_test.py`: in that case only scripts/ is on
    # sys.path, so the API root (the parent of scripts/) is added explicitly.
    from pathlib import Path

    api_root = str(Path(__file__).resolve().parent.parent)
    if api_root not in sys.path:
        sys.path.insert(0, api_root)

    print_header("Phase 5 E2E 點火測試 - OpenClaw 全鏈路驗證")
    print(f"執行時間: {datetime.now().isoformat()}")

    # =========================================================================
    # Step 1: LogLevelFilter (log cleaning)
    # =========================================================================
    print_step(1, "日誌清洗模組 (LogLevelFilter)")

    try:
        from src.services.context_gatherer import LogLevelFilter

        # Simulated K8s log
        raw_logs = """
2024-03-21T10:15:23.456Z INFO [harbor.core] Starting Harbor Core
2024-03-21T10:15:24.789Z DEBUG [harbor.core.db] Initializing connection pool
2024-03-21T10:16:45.123Z ERROR [harbor.core.db] Connection lost to PostgreSQL
2024-03-21T10:16:45.456Z FATAL [harbor.core] Unrecoverable error
Traceback (most recent call last):
  File "/harbor/core/db.py", line 234, in connect
    raise DatabaseConnectionError("Max retries exceeded")
""".strip()

        filtered = LogLevelFilter.filter_logs(raw_logs)
        stats = LogLevelFilter.get_filter_stats(raw_logs, filtered)

        # DEBUG/INFO lines must be dropped; errors and the traceback kept
        assert "DEBUG" not in filtered, "DEBUG should be filtered"
        assert "INFO" not in filtered.replace("DatabaseConnectionError", ""), "INFO should be filtered"
        assert "ERROR" in filtered, "ERROR should be preserved"
        assert "FATAL" in filtered, "FATAL should be preserved"
        assert "Traceback" in filtered, "Stacktrace should be preserved"

        print_success(f"日誌清洗成功: {stats['original_lines']} → {stats['filtered_lines']} 行")
        print_success(f"雜訊移除率: {stats['removal_rate_percent']}%")

    except Exception as e:
        print_error(f"日誌清洗測試失敗: {e}")
        return False

    # =========================================================================
    # Step 2: Security Interceptor (whitelist + nonce)
    # =========================================================================
    print_step(2, "安全攔截器 (Security Interceptor)")

    try:
        from src.services.security_interceptor import (
            TelegramSecurityInterceptor,
            UserNotWhitelistedError,
            NonceReplayError,
        )
        from src.core.config import settings

        interceptor = TelegramSecurityInterceptor()

        # Whitelist probe (uses a fixed sample user ID)
        test_user_id = 5619078117

        # Report the configured whitelist
        whitelist = settings.OPENCLAW_TG_USER_WHITELIST
        print_info(f"白名單配置: {whitelist}")

        if whitelist:
            is_whitelisted = interceptor.is_whitelisted(test_user_id)
            if is_whitelisted:
                print_success(f"統帥 ID {test_user_id} 在白名單內")
            else:
                print_info(f"統帥 ID {test_user_id} 不在白名單 (需配置)")
        else:
            print_info("白名單為空 (需在環境變數中配置 OPENCLAW_TG_USER_WHITELIST)")

        # Nonce generation
        nonce = interceptor.generate_callback_nonce("test-approval-123", "approve")
        print_success(f"Nonce 產生成功: {nonce[:30]}...")

        # Nonce parsing must round-trip the action and approval ID
        parsed = interceptor.parse_callback_data(nonce)
        assert parsed["action"] == "approve"
        assert parsed["approval_id"] == "test-approval-123"
        print_success("Nonce 解析成功")

    except Exception as e:
        print_error(f"安全攔截器測試失敗: {e}")
        return False

    # =========================================================================
    # Step 3: Telegram Gateway (message format)
    # =========================================================================
    print_step(3, "Telegram Gateway (SOUL.md 訊息格式)")

    try:
        from src.services.telegram_gateway import TelegramMessage, RISK_EMOJI_MAP

        # Build a sample message
        message = TelegramMessage(
            status_emoji=RISK_EMOJI_MAP["critical"],
            risk_level="CRITICAL",
            resource_name="harbor-core-7d4b8c9f5-xk2m3",
            root_cause="OOMKilled",
            suggested_action="DELETE_POD (重啟 Pod)",
            estimated_downtime="~30s",
            approval_id="test-approval-123",
        )

        formatted = message.format()

        # Check the formatted text for the expected parts and length limit
        assert "🚨" in formatted, "Should have critical emoji"
        assert "CRITICAL" in formatted, "Should have risk level"
        assert "harbor-core" in formatted, "Should have resource name"
        assert "OOMKilled" in formatted, "Should have root cause"
        assert "建議" in formatted, "Should have suggestion"
        assert "停機" in formatted, "Should have downtime"
        assert len(formatted) <= 500, f"Should be <= 500 chars, got {len(formatted)}"

        print_success("SOUL.md 訊息格式驗證通過")
        print_info(f"訊息長度: {len(formatted)} / 500 字元")
        print()
        print(" 📱 訊息預覽:")
        for line in formatted.split("\n"):
            print(f" {line}")

    except Exception as e:
        print_error(f"Telegram Gateway 測試失敗: {e}")
        return False

    # =========================================================================
    # Step 4: OpenClaw module loading
    # =========================================================================
    print_step(4, "OpenClaw AI 模組載入")

    try:
        from src.services.openclaw import get_openclaw, OpenClawService

        openclaw = get_openclaw()
        assert isinstance(openclaw, OpenClawService)
        print_success("OpenClaw 服務載入成功")

        # Report the configured AI fallback order and default model
        from src.core.config import settings
        print_info(f"AI Fallback 順序: {settings.AI_FALLBACK_ORDER}")
        print_info(f"預設模型: {settings.OPENCLAW_DEFAULT_MODEL}")

    except Exception as e:
        print_error(f"OpenClaw 模組載入失敗: {e}")
        return False

    # =========================================================================
    # Step 5: Signature audit fields
    # =========================================================================
    print_step(5, "Signature 審計欄位 (Telegram 擴充)")

    try:
        from src.models.approval import Signature, SignatureSource

        # Build a Telegram-sourced signature record
        sig = Signature(
            signer_id="tg_5619078117",
            signer_name="統帥",
            comment="Telegram 簽核測試",
            source=SignatureSource.TELEGRAM,
            telegram_user_id=5619078117,
            telegram_message_id=12345,
        )

        assert sig.source == SignatureSource.TELEGRAM
        assert sig.telegram_user_id == 5619078117
        print_success("Telegram 審計欄位驗證通過")
        print_info(f"簽核來源: {sig.source.value}")
        print_info(f"Telegram User ID: {sig.telegram_user_id}")

    except Exception as e:
        print_error(f"Signature 審計欄位測試失敗: {e}")
        return False

    # =========================================================================
    # All steps passed
    # =========================================================================
    print_header("E2E 測試結果")
    print()
    print(" ✅ Step 1: 日誌清洗 (LogLevelFilter) - PASSED")
    print(" ✅ Step 2: 安全攔截器 (Security Interceptor) - PASSED")
    print(" ✅ Step 3: Telegram Gateway (SOUL.md 格式) - PASSED")
    print(" ✅ Step 4: OpenClaw AI 模組載入 - PASSED")
    print(" ✅ Step 5: Signature 審計欄位 - PASSED")
    print()
    print("=" * 60)
    print(" 🎉 Phase 5 E2E 點火測試 - 全數通過!")
    print("=" * 60)

    return True
|
||||
|
||||
|
||||
# Script entry point: exit status 0 when every step passed, 1 otherwise.
if __name__ == "__main__":
    success = asyncio.run(test_phase5_e2e())
    sys.exit(0 if success else 1)
|
||||
372
apps/api/scripts/fire_live_alert.py
Executable file
372
apps/api/scripts/fire_live_alert.py
Executable file
@@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AWOOOI 實彈射擊腳本 - 自動化告警測試
|
||||
=====================================
|
||||
Phase 5: Shadow Mode - 自動化實彈演習
|
||||
|
||||
功能:
|
||||
1. 模擬 Prometheus 格式的 OOMKilled/PodCrash 告警
|
||||
2. 自動計算 HMAC-SHA256 簽章
|
||||
3. 直接打向本地 Webhook 端點
|
||||
4. 驗證回應並輸出結果
|
||||
|
||||
使用方式:
|
||||
python scripts/fire_live_alert.py
|
||||
|
||||
環境變數:
|
||||
WEBHOOK_HMAC_SECRET: HMAC 簽章密鑰 (必要)
|
||||
AWOOOI_API_URL: API 端點 (預設: http://192.168.0.188:8000)
|
||||
|
||||
Tier 2 授權: 此腳本會觸發 AI 分析流程,需統帥授權
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from typing import Literal
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
# Base URL of the API; overridable through the AWOOOI_API_URL environment variable.
DEFAULT_API_URL = os.getenv("AWOOOI_API_URL", "http://192.168.0.188:8000")
# Path of the alert webhook, appended to the base URL.
WEBHOOK_ENDPOINT = "/api/v1/webhooks/alerts"
# HMAC signing secret; an empty string means requests are sent unsigned.
HMAC_SECRET = os.getenv("WEBHOOK_HMAC_SECRET", "")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Alert Templates
|
||||
# =============================================================================
|
||||
|
||||
# Canned alert payloads keyed by the alert name accepted on the command line.
# Every entry carries the same top-level fields (alert_type, severity, source,
# target_resource, namespace, message, metrics, labels).
ALERT_TEMPLATES = {
    "oomkilled": {
        "alert_type": "k8s_pod_crash",
        "severity": "critical",
        "source": "prometheus",
        "target_resource": "harbor-core-7d4b8c9f5-xk2m3",
        "namespace": "harbor",
        "message": "Pod terminated due to OOMKilled - Container exceeded memory limit",
        "metrics": {
            "memory_percent": 99.8,
            "restart_count": 5,
            "memory_limit_mb": 512,
            "memory_usage_mb": 520,
        },
        "labels": {
            "app": "harbor-core",
            "deployment": "harbor-core",
            "pod": "harbor-core-7d4b8c9f5-xk2m3",
            "container": "harbor-core",
            "reason": "OOMKilled",
        },
    },
    "podcrash": {
        "alert_type": "k8s_pod_crash",
        "severity": "warning",
        "source": "prometheus",
        "target_resource": "nginx-ingress-7d6f8c9b5-abc12",
        "namespace": "ingress-nginx",
        "message": "Pod CrashLoopBackOff - Container restarting repeatedly",
        "metrics": {
            "restart_count": 8,
            "cpu_percent": 15.2,
            "memory_percent": 45.0,
        },
        "labels": {
            "app": "nginx-ingress",
            "deployment": "nginx-ingress-controller",
            "pod": "nginx-ingress-7d6f8c9b5-abc12",
        },
    },
    "highcpu": {
        "alert_type": "high_cpu",
        "severity": "warning",
        "source": "prometheus",
        "target_resource": "api-backend-deployment",
        "namespace": "default",
        "message": "High CPU usage detected - Pod using 95% of allocated CPU",
        "metrics": {
            "cpu_percent": 95.5,
            "memory_percent": 60.0,
            "sigma_deviation": 3.2,
        },
        "labels": {
            "app": "api-backend",
            "deployment": "api-backend",
        },
    },
    "highmemory": {
        "alert_type": "high_memory",
        "severity": "warning",
        "source": "prometheus",
        "target_resource": "redis-master-0",
        "namespace": "redis",
        "message": "High memory usage detected - Pod memory at 92%",
        "metrics": {
            "cpu_percent": 25.0,
            "memory_percent": 92.0,
            "sigma_deviation": 2.8,
        },
        "labels": {
            "app": "redis",
            "statefulset": "redis-master",
        },
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def compute_hmac_signature(secret: str, payload: bytes) -> str:
    """Compute the HMAC-SHA256 signature of ``payload``.

    Args:
        secret: signing key; encoded with the default ``str.encode()`` (UTF-8).
        payload: exact bytes to sign.

    Returns:
        The lowercase hex digest prefixed with ``"sha256="``.
    """
    signature = hmac.new(
        secret.encode(),
        payload,
        hashlib.sha256,
    ).hexdigest()
    return f"sha256={signature}"
|
||||
|
||||
|
||||
def print_header(title: str) -> None:
    """Print ``title`` between two 60-character ``=`` rules, after a blank line."""
    print("\n" + "=" * 60)
    print(f" {title}")
    print("=" * 60)
|
||||
|
||||
|
||||
def print_success(message: str) -> None:
    """Print ``message`` prefixed with a success check mark."""
    print(f" ✅ {message}")
|
||||
|
||||
|
||||
def print_error(message: str) -> None:
    """Print ``message`` prefixed with a failure cross mark."""
    print(f" ❌ {message}")
|
||||
|
||||
|
||||
def print_info(message: str) -> None:
    """Print ``message`` prefixed with an information symbol."""
    print(f" ℹ️ {message}")
|
||||
|
||||
|
||||
def print_warning(message: str) -> None:
    """Print ``message`` prefixed with a warning symbol."""
    print(f" ⚠️ {message}")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Logic
|
||||
# =============================================================================
|
||||
|
||||
def fire_alert(
    alert_type: str,
    api_url: str = DEFAULT_API_URL,
    hmac_secret: str = HMAC_SECRET,
    dry_run: bool = False,
) -> dict:
    """Send one simulated alert to the webhook endpoint and report the result.

    Args:
        alert_type: key into ``ALERT_TEMPLATES``
            (oomkilled, podcrash, highcpu, highmemory).
        api_url: base URL of the API; a trailing slash is tolerated.
        hmac_secret: HMAC signing secret; when empty the request is sent
            without a signature header.
        dry_run: when true, print the payload and signature but send nothing.

    Returns:
        dict: the decoded JSON body of the API response, or a dict with
        ``"success": False`` and an ``"error"`` entry for an unknown alert
        type, an unparsable response or a request failure. A dry run returns
        ``{"success": True, "dry_run": True}``.
    """
    # Strip trailing slashes so "http://host/" + "/api/..." does not produce
    # a double slash in the request URL.
    endpoint_url = f"{api_url.rstrip('/')}{WEBHOOK_ENDPOINT}"

    print_header(f"AWOOOI 實彈射擊 - {alert_type.upper()}")
    print(f"執行時間: {datetime.now(timezone.utc).isoformat()}")
    print(f"目標端點: {endpoint_url}")

    # Look up the alert template
    if alert_type not in ALERT_TEMPLATES:
        print_error(f"未知的告警類型: {alert_type}")
        print_info(f"可用類型: {', '.join(ALERT_TEMPLATES.keys())}")
        return {"success": False, "error": "Unknown alert type"}

    payload = ALERT_TEMPLATES[alert_type].copy()

    # Serialize once: the very same bytes are signed and then sent, so the
    # signature always matches the request body.
    payload_json = json.dumps(payload, separators=(",", ":"))
    payload_bytes = payload_json.encode()

    print("\n📦 告警 Payload:")
    print(json.dumps(payload, indent=2, ensure_ascii=False))

    # Compute the HMAC signature (skipped when no secret is configured)
    if hmac_secret:
        signature = compute_hmac_signature(hmac_secret, payload_bytes)
        print_success(f"HMAC 簽章: {signature[:40]}...")
    else:
        signature = None
        print_warning("無 HMAC Secret - 簽章將被跳過 (僅限 dev 環境)")

    # Dry-run mode: stop before any network traffic
    if dry_run:
        print("\n🔒 [DRY-RUN MODE] 不實際發送請求")
        print_info("移除 --dry-run 參數以實際發射")
        return {"success": True, "dry_run": True}

    # Send the request
    print("\n🚀 發射中...")

    headers = {"Content-Type": "application/json"}
    if signature:
        headers["X-Signature-256"] = signature

    try:
        with httpx.Client(timeout=30.0) as client:
            response = client.post(
                endpoint_url,
                content=payload_bytes,
                headers=headers,
            )

        # Report the response
        print(f"\n📡 HTTP Status: {response.status_code}")

        try:
            result = response.json()
            print("\n📋 API 回應:")
            print(json.dumps(result, indent=2, ensure_ascii=False))

            if response.status_code == 200 and result.get("success"):
                print_success("告警已成功接收並處理!")

                if result.get("converged"):
                    print_info(f"告警收斂: 相同指紋已聚合 x{result.get('hit_count', 1)} 次")
                else:
                    print_info(f"風險等級: {result.get('risk_level', 'N/A')}")
                    print_info(f"建議操作: {result.get('suggested_action', 'N/A')}")

                if result.get("approval_created"):
                    print_success(f"待簽核卡片已建立: {result.get('approval_id', 'N/A')}")
            else:
                print_error(f"處理失敗: {result.get('message', result.get('detail', 'Unknown error'))}")

            return result

        except json.JSONDecodeError:
            print_error(f"回應解析失敗: {response.text}")
            return {"success": False, "error": "Response parse error", "raw": response.text}

    except httpx.ConnectError as e:
        print_error(f"連線失敗: {str(e)}")
        print_info(f"請確認 API 服務正在執行: {api_url}")
        return {"success": False, "error": "Connection failed"}

    except httpx.TimeoutException as e:
        print_error(f"請求超時: {str(e)}")
        return {"success": False, "error": "Timeout"}

    except Exception as e:
        print_error(f"未預期錯誤: {str(e)}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and fire one or all alert types.

    The positional ``alert_type`` is optional so that ``--all`` can be used
    on its own; when neither is supplied the parser exits with a usage error.
    Existing invocations that pass an alert type keep working unchanged.
    """
    parser = argparse.ArgumentParser(
        description="AWOOOI 實彈射擊腳本 - 自動化告警測試",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
告警類型:
oomkilled - Pod OOMKilled (Critical)
podcrash - Pod CrashLoopBackOff (Warning)
highcpu - High CPU Usage (Warning)
highmemory - High Memory Usage (Warning)

範例:
# 發射 OOMKilled 告警
python scripts/fire_live_alert.py oomkilled

# Dry-run 模式 (不實際發送)
python scripts/fire_live_alert.py oomkilled --dry-run

# 指定 HMAC Secret
WEBHOOK_HMAC_SECRET=mysecret python scripts/fire_live_alert.py oomkilled
""",
    )

    parser.add_argument(
        "alert_type",
        # Optional so that "--all" does not need a throw-away alert type.
        nargs="?",
        choices=list(ALERT_TEMPLATES.keys()),
        help="告警類型",
    )

    parser.add_argument(
        "--api-url",
        default=DEFAULT_API_URL,
        help=f"API 端點 URL (預設: {DEFAULT_API_URL})",
    )

    parser.add_argument(
        "--hmac-secret",
        default=HMAC_SECRET,
        help="HMAC 簽章密鑰 (也可用環境變數 WEBHOOK_HMAC_SECRET)",
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Dry-run 模式 - 僅輸出不實際發送",
    )

    parser.add_argument(
        "--all",
        action="store_true",
        help="依序發射所有類型的告警",
    )

    args = parser.parse_args()

    # Either a concrete alert type or --all must be given.
    if not args.all and args.alert_type is None:
        parser.error("必須指定 alert_type,或使用 --all")

    print_header("AWOOOI 實彈射擊系統")
    print(f"API 端點: {args.api_url}")
    print(f"HMAC 配置: {'已設定' if args.hmac_secret else '未設定 (dev mode)'}")
    print("Shadow Mode: 已啟用 (K8s 操作將被安全攔截)")

    if args.all:
        # Fire every known alert type in turn and collect the results.
        print("\n🎯 連續發射所有告警類型...")
        results = {}
        for alert_type in ALERT_TEMPLATES.keys():
            results[alert_type] = fire_alert(
                alert_type=alert_type,
                api_url=args.api_url,
                hmac_secret=args.hmac_secret,
                dry_run=args.dry_run,
            )

        # Summary: one line per alert type with a pass/fail marker.
        print_header("射擊結果摘要")
        for alert_type, result in results.items():
            status = "✅" if result.get("success") else "❌"
            print(f" {status} {alert_type}: {result.get('message', result.get('error', 'N/A'))}")
    else:
        # Fire the single requested alert type.
        fire_alert(
            alert_type=args.alert_type,
            api_url=args.api_url,
            hmac_secret=args.hmac_secret,
            dry_run=args.dry_run,
        )

    print("\n" + "=" * 60)
    print(" 實彈射擊完成")
    print("=" * 60)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
318
apps/api/scripts/fire_test_alert.py
Normal file
318
apps/api/scripts/fire_test_alert.py
Normal file
@@ -0,0 +1,318 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
🚀 AWOOOI Phase 2 導彈腳本 - fire_test_alert.py
|
||||
===============================================
|
||||
向系統注入模擬告警,觸發 ClawBot AI 分析流程
|
||||
|
||||
用途:
|
||||
- 驗證全鏈路 (Webhook → ClawBot → ApprovalCard)
|
||||
- 測試戰情室前端是否即時彈出授權卡片
|
||||
- 開發除錯用 (無需真實監控系統)
|
||||
|
||||
執行方式:
|
||||
cd apps/api
|
||||
python -m scripts.fire_test_alert
|
||||
|
||||
# 指定告警類型
|
||||
python -m scripts.fire_test_alert --type db_connection_timeout
|
||||
python -m scripts.fire_test_alert --type k8s_pod_crash --severity critical
|
||||
|
||||
Author: Claude Code
|
||||
Date: 2026-03-21
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
import httpx
|
||||
|
||||
# =============================================================================
|
||||
# Config
|
||||
# =============================================================================
|
||||
|
||||
# Base URL of the backend API (local instance on port 8000).
API_BASE_URL = "http://localhost:8000"
# Full URL of the alert webhook this script POSTs test alerts to.
WEBHOOK_ENDPOINT = f"{API_BASE_URL}/api/v1/webhooks/alerts"
|
||||
|
||||
# =============================================================================
|
||||
# 預定義告警場景 (High-Fidelity Mock Alerts)
|
||||
# =============================================================================
|
||||
|
||||
# Predefined mock alert payloads, keyed by scenario name. Each value is the
# JSON body posted to the webhook; the key is also the value accepted by the
# --type command-line option.
ALERT_SCENARIOS = {
    # Critical database alert: exhausted connection pool.
    "db_connection_timeout": {
        "alert_type": "db_connection_timeout",
        "severity": "critical",
        "source": "prometheus-alertmanager",
        "target_resource": "postgres-primary-0",
        "namespace": "database",
        "message": "PostgreSQL Database OOM - Connection pool exhausted, 47 waiting queries",
        "metrics": {
            "connection_count": 100,
            "waiting_queries": 47,
            "cpu_percent": 89,
            "memory_percent": 95,
            "sigma_deviation": 4.2,
        },
        "labels": {
            "app": "postgres",
            "team": "dba",
            "tier": "critical",
        },
    },
    # Warning-level pod crash loop (exit code 137 after 5 restarts).
    "k8s_pod_crash": {
        "alert_type": "k8s_pod_crash",
        "severity": "warning",
        "source": "k8s-event-watcher",
        "target_resource": "harbor-core-7d4b8c9f5-xk2m3",
        "namespace": "harbor",
        "message": "Pod CrashLoopBackOff detected - OOMKilled after 5 restarts",
        "metrics": {
            "restart_count": 5,
            "last_exit_code": 137,
            "cpu_percent": 95,
            "memory_percent": 100,
            "sigma_deviation": 3.8,
        },
        "labels": {
            "app": "harbor-core",
            "team": "devops",
        },
    },
    # Warning-level CPU spike with elevated response time.
    "high_cpu": {
        "alert_type": "high_cpu",
        "severity": "warning",
        "source": "node-exporter",
        "target_resource": "api-backend-deployment",
        "namespace": "production",
        "message": "Payment API Latency Spike - CPU at 94%, response time > 2s",
        "metrics": {
            "cpu_percent": 94,
            "memory_percent": 72,
            "response_time_ms": 2340,
            "sigma_deviation": 3.2,
        },
        "labels": {
            "app": "payment-api",
            "team": "backend",
            "sla": "critical",
        },
    },
    # Critical disk-usage alert on a logging node.
    "disk_full": {
        "alert_type": "disk_full",
        "severity": "critical",
        "source": "node-exporter",
        "target_resource": "logging-node-01",
        "namespace": "kube-system",
        "message": "Disk usage at 97% - /var/log nearly full, risk of logging failure",
        "metrics": {
            "disk_percent": 97,
            "available_gb": 2.3,
            "inode_percent": 89,
        },
        "labels": {
            "node": "logging-node-01",
            "team": "sre",
        },
    },
    # Warning-level certificate expiry alert.
    "ssl_expiry": {
        "alert_type": "ssl_expiry",
        "severity": "warning",
        "source": "cert-manager",
        "target_resource": "awoooi.wooo.work",
        "namespace": "cert-manager",
        "message": "SSL Certificate expiring in 7 days - auto-renewal failed",
        "metrics": {
            "days_until_expiry": 7,
        },
        "labels": {
            "domain": "awoooi.wooo.work",
            "issuer": "letsencrypt",
        },
    },
}
|
||||
|
||||
# =============================================================================
|
||||
# Terminal Output Helpers (漂亮的 Log)
|
||||
# =============================================================================
|
||||
|
||||
class Colors:
    """ANSI escape sequences used to colorize and style terminal output."""
    HEADER = '\033[95m'  # bright magenta
    BLUE = '\033[94m'  # bright blue
    CYAN = '\033[96m'  # bright cyan
    GREEN = '\033[92m'  # bright green
    YELLOW = '\033[93m'  # bright yellow
    RED = '\033[91m'  # bright red
    ENDC = '\033[0m'  # reset all attributes
    BOLD = '\033[1m'  # bold
    DIM = '\033[2m'  # dim / faint
|
||||
|
||||
|
||||
def print_banner():
    """Print the AWOOOI ASCII-art banner and the script subtitle to stdout."""
    # One f-string holds the whole banner so the color codes wrap the art;
    # the art lines are whitespace-sensitive and must not be re-indented.
    banner = f"""
{Colors.CYAN}{Colors.BOLD}
█████╗ ██╗ ██╗ ██████╗ ██████╗ ██████╗ ██╗
██╔══██╗██║ ██║██╔═══██╗██╔═══██╗██╔═══██╗██║
███████║██║ █╗ ██║██║ ██║██║ ██║██║ ██║██║
██╔══██║██║███╗██║██║ ██║██║ ██║██║ ██║██║
██║ ██║╚███╔███╔╝╚██████╔╝╚██████╔╝╚██████╔╝██║
╚═╝ ╚═╝ ╚══╝╚══╝ ╚═════╝ ╚═════╝ ╚═════╝ ╚═╝
{Colors.ENDC}
{Colors.DIM} 🚀 Phase 2 導彈腳本 - Test Alert Injector{Colors.ENDC}
{Colors.DIM} ─────────────────────────────────────────{Colors.ENDC}
"""
    print(banner)
|
||||
|
||||
|
||||
def print_section(title: str):
    """Emit a bold blue "▶ title" heading followed by a dim 50-character rule.

    Args:
        title: Text shown after the arrow marker.
    """
    heading = f"\n{Colors.BLUE}{Colors.BOLD}▶ {title}{Colors.ENDC}"
    underline = f"{Colors.DIM}{'─' * 50}{Colors.ENDC}"
    print(heading)
    print(underline)
|
||||
|
||||
|
||||
def print_alert_info(alert: dict):
    """Print the main fields of an alert payload, then its metrics if any.

    Args:
        alert: Alert payload; must contain the keys ``alert_type``,
            ``severity``, ``target_resource``, ``namespace`` and ``message``.
            A truthy ``metrics`` mapping is listed item by item.
    """
    # (label, payload key) pairs, printed in this order.
    labelled_fields = (
        ("告警類型:", 'alert_type'),
        ("嚴重度:", 'severity'),
        ("目標資源:", 'target_resource'),
        ("命名空間:", 'namespace'),
        ("訊息:", 'message'),
    )
    for label, key in labelled_fields:
        print(f" {Colors.YELLOW}{label}{Colors.ENDC} {alert[key]}")

    metrics = alert.get('metrics')
    if not metrics:
        return

    print(f" {Colors.YELLOW}指標:{Colors.ENDC}")
    for metric_name, metric_value in metrics.items():
        print(f" • {metric_name}: {metric_value}")
|
||||
|
||||
|
||||
def print_response(response: dict, status_code: int):
    """Print a success or failure summary for a webhook response.

    Args:
        response: Decoded JSON body returned by the API.
        status_code: HTTP status code of the response.

    Success requires both HTTP 200 and a truthy ``success`` field; anything
    else is reported as a failure together with the status and raw body.
    """
    succeeded = status_code == 200 and response.get('success')

    # Failure path first, so the success details read straight through.
    if not succeeded:
        print(f"\n{Colors.RED}{Colors.BOLD}✗ 告警發射失敗!{Colors.ENDC}")
        print(f" {Colors.RED}狀態碼:{Colors.ENDC} {status_code}")
        print(f" {Colors.RED}回應:{Colors.ENDC} {response}")
        return

    print(f"\n{Colors.GREEN}{Colors.BOLD}✓ 告警發射成功!{Colors.ENDC}")
    detail_rows = (
        ("Approval ID:", response.get('approval_id', 'N/A')),
        ("風險等級:", response.get('risk_level', 'N/A')),
        ("建議動作:", response.get('suggested_action', 'N/A')),
        ("聚合次數:", response.get('hit_count', 1)),
    )
    for label, value in detail_rows:
        print(f" {Colors.CYAN}{label}{Colors.ENDC} {value}")

    if response.get('converged'):
        print(f" {Colors.YELLOW}⚡ 告警已收斂 (跳過 LLM){Colors.ENDC}")
|
||||
|
||||
|
||||
def print_footer():
    """Print the closing rule, where to look in the frontend, and a timestamp."""
    rule = '─' * 50
    print(f"\n{Colors.DIM}{rule}{Colors.ENDC}")
    for hint in (
        f"{Colors.GREEN}📺 請查看戰情室前端:{Colors.ENDC} http://localhost:3000",
        f"{Colors.GREEN}📋 右側面板應顯示新的 ApprovalCard{Colors.ENDC}",
    ):
        print(hint)
    # Take the timestamp immediately before printing it.
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(f"{Colors.DIM}時間: {stamp}{Colors.ENDC}\n")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Logic
|
||||
# =============================================================================
|
||||
|
||||
async def fire_alert(alert_type: str, severity: str | None = None) -> bool:
    """Post one predefined test alert to the webhook and report the outcome.

    Args:
        alert_type: Key into ``ALERT_SCENARIOS`` (for example
            ``db_connection_timeout`` or ``k8s_pod_crash``).
        severity: Optional override for the scenario's default severity.

    Returns:
        True only when the API answers HTTP 200 *and* the JSON body has a
        truthy ``success`` field, i.e. exactly when ``print_response`` shows
        the success banner. False for unknown alert types, connection
        failures, non-JSON bodies and any other error.
    """
    # Look up the scenario.
    if alert_type not in ALERT_SCENARIOS:
        print(f"{Colors.RED}❌ 未知告警類型: {alert_type}{Colors.ENDC}")
        print(f"{Colors.DIM}可用類型: {', '.join(ALERT_SCENARIOS.keys())}{Colors.ENDC}")
        return False

    # Shallow copy: only the top-level "severity" key is replaced below, so
    # the shared scenario template is left untouched.
    alert = ALERT_SCENARIOS[alert_type].copy()

    # Override severity when requested.
    if severity:
        alert['severity'] = severity

    print_section("告警 Payload")
    print_alert_info(alert)

    print_section("發射告警至 Webhook API")
    print(f" {Colors.CYAN}端點:{Colors.ENDC} {WEBHOOK_ENDPOINT}")

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                WEBHOOK_ENDPOINT,
                json=alert,
                headers={"Content-Type": "application/json"},
            )

            try:
                result = response.json()
            except ValueError:
                # Body is not JSON (e.g. an HTML error page); keep the raw
                # text so the failure report still shows what came back.
                result = {"raw": response.text}
            if not isinstance(result, dict):
                # print_response expects a mapping.
                result = {"raw": result}

            print_response(result, response.status_code)

            # Match print_response's success criterion so the process exit
            # code never contradicts the banner that was just printed.
            return response.status_code == 200 and bool(result.get('success'))

    except httpx.ConnectError:
        print(f"\n{Colors.RED}{Colors.BOLD}✗ 連線失敗!{Colors.ENDC}")
        print(f" {Colors.RED}請確認後端 API 正在運行:{Colors.ENDC}")
        print(f" {Colors.DIM}cd apps/api && uvicorn src.main:app --reload{Colors.ENDC}")
        return False

    except Exception as e:
        print(f"\n{Colors.RED}{Colors.BOLD}✗ 發生錯誤: {e}{Colors.ENDC}")
        return False
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse options, fire one alert, set exit code.

    Exits with status 0 when ``fire_alert`` reports success, 1 otherwise.
    """
    usage_notes = """
可用告警類型:
db_connection_timeout PostgreSQL Database OOM (CRITICAL)
k8s_pod_crash Pod CrashLoopBackOff (MEDIUM)
high_cpu CPU Spike / Latency (MEDIUM)
disk_full Disk Full Warning (CRITICAL)
ssl_expiry SSL Certificate Expiry (LOW)

範例:
python -m scripts.fire_test_alert
python -m scripts.fire_test_alert --type db_connection_timeout
python -m scripts.fire_test_alert --type k8s_pod_crash --severity critical
"""
    cli = argparse.ArgumentParser(
        description="🚀 AWOOOI Phase 2 導彈腳本 - 發射測試告警",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )

    scenario_names = list(ALERT_SCENARIOS.keys())
    cli.add_argument(
        "--type", "-t",
        type=str,
        default="db_connection_timeout",
        choices=scenario_names,
        help="告警類型 (預設: db_connection_timeout)",
    )
    cli.add_argument(
        "--severity", "-s",
        type=str,
        choices=["info", "warning", "critical"],
        help="覆蓋嚴重度 (預設使用場景預設值)",
    )

    options = cli.parse_args()

    print_banner()
    fired_ok = asyncio.run(fire_alert(options.type, options.severity))
    print_footer()

    exit_status = 0 if fired_ok else 1
    sys.exit(exit_status)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
180
apps/api/scripts/test_phase63_aggregation.py
Executable file
180
apps/api/scripts/test_phase63_aggregation.py
Executable file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 6.3 聚合測試腳本
|
||||
=======================
|
||||
|
||||
功能:
|
||||
1. 連續打入 3 筆「同源但不同名」的測試告警
|
||||
2. 證明這 3 筆告警被成功「聚合」進同一個 Incident 的 signals 陣列中
|
||||
3. 驗證 affected_services 有被正確填入
|
||||
|
||||
使用方式:
|
||||
cd apps/api
|
||||
python scripts/test_phase63_aggregation.py
|
||||
|
||||
預期結果:
|
||||
- 3 筆告警全部聚合到 1 個 Incident
|
||||
- signals 陣列長度 = 3
|
||||
- affected_services 包含 "payment-service"
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import httpx
|
||||
from datetime import datetime
|
||||
import time
|
||||
|
||||
# API endpoints
API_BASE = "http://localhost:8000"
SIGNALS_ENDPOINT = f"{API_BASE}/api/v1/webhooks/signals"
|
||||
|
||||
# Test alerts: same namespace and same target, but different alert_name.
# Simulates a series of related problems on payment-service.
# Note: severity must be one of info | warning | critical (per SignalPayload).
TEST_ALERTS = [
    {
        "alert_name": "PaymentServiceHighLatency",
        "severity": "warning",
        "source": "prometheus",
        "namespace": "payment-prod",
        "target": "payment-service",
        "fingerprint": "fp_latency_001",
        "labels": {"namespace": "payment-prod", "pod": "payment-service-abc123"},
        "annotations": {"summary": "Payment service latency > 500ms"},
    },
    {
        "alert_name": "PaymentServiceErrorRate",
        "severity": "warning",  # was "high", but the API only accepts info|warning|critical
        "source": "prometheus",
        "namespace": "payment-prod",
        "target": "payment-service",
        "fingerprint": "fp_error_001",
        "labels": {"namespace": "payment-prod", "pod": "payment-service-abc123"},
        "annotations": {"summary": "Payment service error rate > 5%"},
    },
    {
        "alert_name": "PaymentServicePodCrash",
        "severity": "critical",
        "source": "alertmanager",
        "namespace": "payment-prod",
        "target": "payment-service",
        "fingerprint": "fp_crash_001",
        "labels": {"namespace": "payment-prod", "pod": "payment-service-abc123"},
        "annotations": {"summary": "Payment service pod crashed"},
    },
]
|
||||
|
||||
|
||||
async def send_alert(client: httpx.AsyncClient, alert: dict, index: int) -> dict:
    """POST one test alert to the signals endpoint and echo the outcome.

    Args:
        client: Open HTTP client used for the request.
        alert: Alert payload sent as the JSON body.
        index: Zero-based position of the alert, shown one-based in the log.

    Returns:
        The decoded JSON body of the response.
    """
    ordinal = index + 1
    print(f"\n[{ordinal}/3] 發送告警: {alert['alert_name']}")
    for field in ("severity", "namespace", "target"):
        print(f" {field}: {alert[field]}")

    response = await client.post(SIGNALS_ENDPOINT, json=alert, timeout=10.0)
    body = response.json()

    print(f" status_code: {response.status_code}")
    print(f" message_id: {body.get('message_id', 'N/A')}")
    return body
|
||||
|
||||
|
||||
async def check_redis_incident(client: httpx.AsyncClient) -> dict | None:
    """Placeholder for looking up the aggregated Incident; always returns None.

    Args:
        client: HTTP client; currently unused.

    Returns:
        None. No lookup is implemented here; verification is done manually
        with the Redis commands that ``main`` prints.
    """
    # No API or direct Redis query is wired up yet.
    return None
|
||||
|
||||
|
||||
async def main():
    """Run the aggregation scenario and print manual verification steps.

    Checks that the API is reachable, posts every alert in ``TEST_ALERTS``
    with a short pause between them, waits for the consumer, then prints the
    Redis / log commands and the pass criteria for a human to check.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def emit(*lines: str) -> None:
        # One print per line, exactly as if each were printed separately.
        for line in lines:
            print(line)

    emit(heavy_rule, "Phase 6.3 聚合測試", heavy_rule)
    print(f"時間: {datetime.now().isoformat()}")
    emit("目標: 驗證 3 筆同源告警聚合到 1 個 Incident", "")

    async with httpx.AsyncClient() as client:
        # Step 0: make sure the API answers at all.
        print("[0] 檢查 API 健康狀態...")
        try:
            health = await client.get(f"{API_BASE}/api/v1/health", timeout=5.0)
            print(f" API status: {health.status_code}")
        except Exception as e:
            print(f" API 連線失敗: {e}")
            print(" 請確認 API 已啟動: docker compose up -d")
            return

        # Phase one: send the alerts one after another.
        emit("\n" + light_rule, "階段一: 連續發送 3 筆告警", light_rule)
        results = []
        for position, alert in enumerate(TEST_ALERTS):
            results.append(await send_alert(client, alert, position))
            # Brief pause so the consumer has time to process each signal.
            await asyncio.sleep(0.5)

        # Phase two: give the consumer time to finish.
        emit("\n" + light_rule, "階段二: 等待 Consumer 處理 (3 秒)", light_rule)
        await asyncio.sleep(3)

        # Phase three: commands for checking the aggregation result by hand.
        emit(
            "\n" + light_rule,
            "階段三: 驗證指令",
            light_rule,
            "",
            "請執行以下 Redis 指令檢查聚合結果:",
            "",
            "# 1. 查看所有 Incident keys",
            "docker exec -it awoooi-redis redis-cli KEYS 'incident:*'",
            "",
            "# 2. 查看特定 Incident 的 JSON (取代 <INCIDENT_ID>)",
            "docker exec -it awoooi-redis redis-cli GET 'incident:INC-XXXXXXXX-XXXXXX'",
            "",
            "# 3. 或直接用以下指令掃描並輸出所有 Incident:",
            """docker exec -it awoooi-redis redis-cli --no-raw KEYS 'incident:INC-*' | xargs -I {} docker exec -i awoooi-redis redis-cli GET {}""",
            "",
        )

        # API log command.
        emit(
            light_rule,
            "檢查 API 日誌:",
            light_rule,
            "docker logs awoooi-api --tail 50 | grep -E '(signal_|incident_|aggregat)'",
            "",
        )

        # Pass/fail checklist.
        emit(
            light_rule,
            "驗證標準 (PASS/FAIL)",
            light_rule,
            "[ ] 只有 1 個 Incident 被建立 (非 3 個)",
            "[ ] signals 陣列長度 = 3",
            "[ ] affected_services 包含 'payment-service'",
            "[ ] severity 升級為 'P0' (因為第三筆是 critical)",
            "",
        )

        emit(heavy_rule, "測試腳本執行完成", heavy_rule)
|
||||
|
||||
|
||||
# Run the async test flow only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
261
apps/api/scripts/test_phase64_proposal.py
Executable file
261
apps/api/scripts/test_phase64_proposal.py
Executable file
@@ -0,0 +1,261 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 6.4 全鏈路測試腳本
|
||||
========================
|
||||
|
||||
功能:
|
||||
1. 觸發假告警 (建立 Incident)
|
||||
2. 呼叫 /proposal 端點 (產生決策)
|
||||
3. 呼叫 /approvals/pending (模擬前端撈取待簽核清單)
|
||||
4. 證明這條鏈路完全暢通
|
||||
|
||||
使用方式:
|
||||
cd apps/api
|
||||
python scripts/test_phase64_proposal.py
|
||||
|
||||
驗收標準:
|
||||
- Incident 成功建立
|
||||
- Proposal 成功生成
|
||||
- Proposal 出現在 /approvals/pending 清單中
|
||||
- 前端零改動即可渲染
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
|
||||
# API endpoints
API_BASE = "http://localhost:8000"
SIGNALS_ENDPOINT = f"{API_BASE}/api/v1/webhooks/signals"
INCIDENTS_ENDPOINT = f"{API_BASE}/api/v1/incidents"
APPROVALS_ENDPOINT = f"{API_BASE}/api/v1/approvals/pending"
|
||||
|
||||
|
||||
async def send_test_alert() -> dict | None:
    """POST a critical CrashLoopBackOff test alert to the signals endpoint.

    Returns:
        The decoded JSON body on HTTP 200; None on any other status or on
        any exception (the error is printed).
    """
    # Fingerprint carries the current time (HHMMSS).
    fingerprint = f"fp_test_{datetime.now().strftime('%H%M%S')}"
    payload = {
        "alert_name": "PodCrashLoopBackOff",
        "severity": "critical",  # P0
        "source": "prometheus",
        "namespace": "production",
        "target": "api-gateway",
        "fingerprint": fingerprint,
        "labels": {
            "namespace": "production",
            "pod": "api-gateway-abc123",
        },
        "annotations": {
            "summary": "Pod api-gateway is in CrashLoopBackOff state",
        },
    }

    async with httpx.AsyncClient() as client:
        try:
            reply = await client.post(SIGNALS_ENDPOINT, json=payload, timeout=10.0)
            if reply.status_code != 200:
                print(f" [ERROR] status_code: {reply.status_code}")
                print(f" [ERROR] response: {reply.text}")
                return None
            return reply.json()
        except Exception as e:
            print(f" [ERROR] {e}")
            return None
|
||||
|
||||
|
||||
async def wait_for_incident(namespace: str, timeout: int = 10) -> str | None:
    """Poll the incidents endpoint until the test Incident shows up.

    Args:
        namespace: Accepted for the caller's convenience; not used to filter
            results here. Matching is done on ``affected_services`` only.
        timeout: Number of polling attempts, with a one-second sleep after
            each unsuccessful attempt.

    Returns:
        The ``incident_id`` of the first listed incident whose
        ``affected_services`` contains "api-gateway", or None if none was
        found in any attempt.
    """
    async with httpx.AsyncClient() as client:
        attempts_made = 0
        while attempts_made < timeout:
            attempts_made += 1
            try:
                reply = await client.get(INCIDENTS_ENDPOINT, timeout=5.0)
                if reply.status_code == 200:
                    listing = reply.json()
                    # First incident that mentions our test service.
                    match = next(
                        (
                            candidate
                            for candidate in listing.get("incidents", [])
                            if "api-gateway" in candidate.get("affected_services", [])
                        ),
                        None,
                    )
                    if match is not None:
                        return match.get("incident_id")
            except Exception:
                # Best-effort polling: ignore the error and retry.
                pass
            await asyncio.sleep(1)
    return None
|
||||
|
||||
|
||||
async def generate_proposal(incident_id: str) -> dict | None:
    """Request a decision proposal for the given incident.

    Args:
        incident_id: Identifier inserted into the ``/proposal`` URL.

    Returns:
        The decoded JSON body on HTTP 200; None on any other status or on
        any exception (the error is printed).
    """
    proposal_url = f"{INCIDENTS_ENDPOINT}/{incident_id}/proposal"
    async with httpx.AsyncClient() as client:
        try:
            reply = await client.post(proposal_url, timeout=10.0)
            if reply.status_code != 200:
                print(f" [ERROR] status_code: {reply.status_code}")
                print(f" [ERROR] response: {reply.text}")
                return None
            return reply.json()
        except Exception as e:
            print(f" [ERROR] {e}")
            return None
|
||||
|
||||
|
||||
async def get_pending_approvals() -> dict | None:
    """Fetch the list of pending approvals.

    Returns:
        The decoded JSON body on HTTP 200; None on any other status or on
        any exception (the error is printed).
    """
    async with httpx.AsyncClient() as client:
        try:
            reply = await client.get(APPROVALS_ENDPOINT, timeout=10.0)
            if reply.status_code != 200:
                print(f" [ERROR] status_code: {reply.status_code}")
                return None
            return reply.json()
        except Exception as e:
            print(f" [ERROR] {e}")
            return None
|
||||
|
||||
|
||||
async def main():
    """Drive the Incident → Proposal → Pending Approvals chain end to end.

    Steps: health check, send a test alert, wait for the Incident, request a
    proposal, fetch the pending approvals, then print a pass/fail table.
    Returns early (after printing a failure line) when a step cannot proceed.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    def step_header(title: str) -> None:
        # Blank line, rule, title, rule.
        print("\n" + light_rule)
        print(title)
        print(light_rule)

    print(heavy_rule)
    print("Phase 6.4 全鏈路測試: Incident → Proposal → Pending Approvals")
    print(heavy_rule)
    print(f"時間: {datetime.now().isoformat()}")
    print()

    # Step 0: health check.
    print("[0] 檢查 API 健康狀態...")
    async with httpx.AsyncClient() as client:
        try:
            health = await client.get(f"{API_BASE}/api/v1/health", timeout=5.0)
            print(f" API status: {health.status_code}")
        except Exception as e:
            print(f" API 連線失敗: {e}")
            print(" 請確認 API 已啟動: docker compose up -d")
            return

    # Step 1: send the test alert.
    step_header("[1] 發送測試告警 (建立 Incident)")
    alert_reply = await send_test_alert()
    if not alert_reply:
        print(" [FAIL] 無法發送告警")
        return
    print(f" message_id: {alert_reply.get('message_id', 'N/A')}")
    print(f" success: {alert_reply.get('success', False)}")

    # Step 2: wait for the Incident to be created.
    step_header("[2] 等待 Consumer 處理並建立 Incident (最多 10 秒)")
    incident_id = await wait_for_incident("production")
    if not incident_id:
        print(" [FAIL] 無法找到測試 Incident")
        print(" 請檢查 API 日誌: docker logs awoooi-api --tail 50")
        return
    print(f" incident_id: {incident_id}")
    print(" [OK] Incident 已建立")

    # Step 3: generate the proposal.
    step_header("[3] 呼叫 /proposal 端點生成決策")
    proposal_result = await generate_proposal(incident_id)
    if not proposal_result or not proposal_result.get("success"):
        print(" [FAIL] 無法生成 Proposal")
        print(f" message: {proposal_result.get('message') if proposal_result else 'N/A'}")
        return

    proposal = proposal_result.get("proposal", {})
    print(f" proposal_id: {proposal.get('id', 'N/A')}")
    print(f" action: {proposal.get('action', 'N/A')[:60]}...")
    print(f" risk_level: {proposal.get('risk_level', 'N/A')}")
    print(f" required_signatures: {proposal.get('required_signatures', 'N/A')}")
    print(f" incident_status: {proposal_result.get('incident_status', 'N/A')}")
    print(" [OK] Proposal 已生成")

    # Step 4: verify the pending-approvals listing.
    step_header("[4] 呼叫 /approvals/pending 驗證前端相容性")
    pending = await get_pending_approvals()
    if not pending:
        print(" [FAIL] 無法取得待簽核清單")
        return
    print(f" count: {pending.get('count', 0)}")

    # Look for our proposal among the pending approvals.
    for approval in pending.get("approvals", []):
        if approval.get("id") != proposal.get("id"):
            continue
        print(" [FOUND] Proposal 出現在待簽核清單中!")
        print()
        print(" === PendingApprovalsResponse JSON ===")
        excerpt = {
            "count": pending.get("count"),
            "target_approval": approval,
        }
        print(json.dumps(excerpt, indent=2, ensure_ascii=False, default=str))
        break
    else:
        # Loop finished without a match.
        print(" [WARN] Proposal 未出現在待簽核清單中")
        print(" (可能因為 risk_level=LOW 已自動批准)")

    # Step 5: final verdict.
    print("\n" + heavy_rule)
    print("驗證結果")
    print(heavy_rule)

    checks = [
        ("Incident 建立", incident_id is not None),
        ("Proposal 生成", proposal_result.get("success", False)),
        ("風險評估", proposal.get("risk_level") is not None),
        ("狀態推進 (MITIGATING)", proposal_result.get("incident_status") == "mitigating"),
        ("前端相容 (/approvals/pending)", pending is not None),
    ]
    for label, outcome in checks:
        verdict = "✅ PASS" if outcome else "❌ FAIL"
        print(f"[{verdict}] {label}")
    all_passed = all(outcome for _, outcome in checks)

    print()
    print(heavy_rule)
    if all_passed:
        closing_lines = (
            "🎉 Phase 6.4 全鏈路測試 PASSED!",
            " 大腦已具備決策輸出能力!",
            " Decision Proposal API 已鑄造完成!",
        )
    else:
        closing_lines = (
            "💥 Phase 6.4 全鏈路測試 FAILED!",
            " 請檢查上述失敗項目",
        )
    for line in closing_lines:
        print(line)
    print(heavy_rule)
|
||||
|
||||
|
||||
# Run the async test flow only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
286
apps/api/scripts/test_race_condition.py
Executable file
286
apps/api/scripts/test_race_condition.py
Executable file
@@ -0,0 +1,286 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 6.3 Race Condition 測試腳本
|
||||
==================================
|
||||
|
||||
功能:
|
||||
1. 使用 asyncio.gather 同時發射 20 筆同源告警
|
||||
2. 證明 Lua Script 原子操作成功擋下 Race Condition
|
||||
3. 驗證最終 Incident JSON 精準包含 20 筆 Signals
|
||||
|
||||
使用方式:
|
||||
cd apps/api
|
||||
python scripts/test_race_condition.py
|
||||
|
||||
預期結果:
|
||||
- 只有 1 個 Incident 被建立
|
||||
- signals 陣列長度 = 20
|
||||
- 無任何 Signal 遺失
|
||||
|
||||
統帥鐵律:
|
||||
- 嚴禁人工 QA
|
||||
- 必須程式化驗證
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
|
||||
# API endpoints
API_BASE = "http://localhost:8000"
SIGNALS_ENDPOINT = f"{API_BASE}/api/v1/webhooks/signals"

# Number of signals fired concurrently
CONCURRENT_SIGNALS = 20

# Namespace and target shared by every test signal (same origin)
TEST_NAMESPACE = "race-test-ns"
TEST_TARGET = "race-test-service"
|
||||
|
||||
|
||||
def generate_alert(index: int) -> dict:
    """Build the payload for test signal number ``index``.

    Every payload shares ``TEST_NAMESPACE`` and ``TEST_TARGET``; the alert
    name and fingerprint embed the zero-padded index so each one is distinct.

    Args:
        index: Sequence number of the signal.

    Returns:
        The alert payload as a dict.
    """
    padded = f"{index:03d}"
    labels = {
        "namespace": TEST_NAMESPACE,
        "test_index": str(index),
    }
    annotations = {
        "summary": f"Race condition test signal #{index}",
    }
    return {
        "alert_name": f"RaceConditionTest_{padded}",
        "severity": "warning",
        "source": "prometheus",
        "namespace": TEST_NAMESPACE,
        "target": TEST_TARGET,
        "fingerprint": f"fp_race_{padded}",  # unique fingerprint to avoid de-duplication
        "labels": labels,
        "annotations": annotations,
    }
|
||||
|
||||
|
||||
async def send_alert(client: httpx.AsyncClient, index: int) -> dict:
    """POST test signal ``index`` and summarize the result without raising.

    Args:
        client: Open HTTP client shared by all concurrent sends.
        index: Sequence number passed to ``generate_alert``.

    Returns:
        A dict with ``index``, ``status_code``, ``message_id`` and
        ``success``. On any exception ``status_code`` is 0, ``success`` is
        False and an ``error`` key holds the exception text.
    """
    payload = generate_alert(index)
    try:
        reply = await client.post(SIGNALS_ENDPOINT, json=payload, timeout=30.0)
        outcome = {
            "index": index,
            "status_code": reply.status_code,
            "message_id": reply.json().get("message_id"),
            "success": reply.status_code == 200,
        }
    except Exception as e:
        outcome = {
            "index": index,
            "status_code": 0,
            "message_id": None,
            "success": False,
            "error": str(e),
        }
    return outcome
|
||||
|
||||
|
||||
async def fire_concurrent_alerts() -> list[dict]:
    """Send ``CONCURRENT_SIGNALS`` alerts at once over a single client.

    Returns:
        One result dict per signal, in index order (as returned by
        ``asyncio.gather``).
    """
    async with httpx.AsyncClient() as client:
        pending_sends = (send_alert(client, n) for n in range(CONCURRENT_SIGNALS))
        return list(await asyncio.gather(*pending_sends))
|
||||
|
||||
|
||||
async def verify_redis_incident() -> dict | None:
    """Find the test Incident by reading Redis through ``docker exec``.

    Lists keys matching ``incident:INC-*`` in the ``awoooi-redis`` container,
    loads each value as JSON and returns the first incident that has a signal
    labelled with ``TEST_NAMESPACE``.

    Returns:
        The decoded incident dict, or None when there are no keys or no
        incident matches. Values that are not valid JSON are skipped.
    """
    import subprocess

    def redis_cli(*command: str) -> str:
        # Run one redis-cli command in the container; return stripped stdout.
        completed = subprocess.run(
            ["docker", "exec", "awoooi-redis", "redis-cli", *command],
            capture_output=True,
            text=True,
        )
        return completed.stdout.strip()

    # All incident keys, one per output line.
    listing = redis_cli("KEYS", "incident:INC-*")
    keys = [entry.strip() for entry in listing.split("\n") if entry.strip()]
    if not keys:
        return None

    # Scan every incident for one carrying our test namespace.
    for key in keys:
        raw_value = redis_cli("GET", key)
        if not raw_value:
            continue
        try:
            incident = json.loads(raw_value)
            belongs_to_test = any(
                signal.get("labels", {}).get("namespace") == TEST_NAMESPACE
                for signal in incident.get("signals", [])
            )
            if belongs_to_test:
                return incident
        except json.JSONDecodeError:
            continue

    return None
|
||||
|
||||
|
||||
async def main():
    """Run the race-condition concurrency test end to end.

    Steps: clear the old Redis indexes for the test namespace/target, check
    the API health endpoint, fire ``CONCURRENT_SIGNALS`` alerts concurrently,
    wait for the consumer, then read the aggregated incident back from Redis
    and verify signal count, name uniqueness and ``affected_services``.
    The function only prints its verdict; it returns ``None`` in every case.
    """
    print("=" * 70)
    print("Phase 6.3 Race Condition 併發測試")
    print("=" * 70)
    print(f"時間: {datetime.now().isoformat()}")
    print(f"併發數量: {CONCURRENT_SIGNALS} 筆告警")
    print(f"測試 Namespace: {TEST_NAMESPACE}")
    print(f"測試 Target: {TEST_TARGET}")
    print()

    # 0. Remove stale test indexes (optional).
    print("[0] 準備測試環境...")
    import subprocess

    # Delete the old indexes if they exist.
    subprocess.run(
        [
            "docker", "exec", "awoooi-redis", "redis-cli",
            "DEL",
            f"incident:idx:ns:{TEST_NAMESPACE}",
            f"incident:idx:target:{TEST_TARGET}",
        ],
        capture_output=True,
    )
    print(" 已清除舊索引")

    # 1. Check the API.
    print("\n[1] 檢查 API 健康狀態...")
    async with httpx.AsyncClient() as client:
        try:
            health = await client.get(f"{API_BASE}/api/v1/health", timeout=5.0)
            print(f" API status: {health.status_code}")
        except Exception as e:
            print(f" API 連線失敗: {e}")
            print(" 請確認 API 已啟動: docker compose up -d")
            return

    # 2. Fire the alerts concurrently.
    print("\n" + "-" * 70)
    # Report the configured count rather than a hard-coded number.
    print(f"[2] 併發發射 {CONCURRENT_SIGNALS} 筆告警 (asyncio.gather)")
    print("-" * 70)

    start_time = datetime.now()
    results = await fire_concurrent_alerts()
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    success_count = sum(1 for r in results if r["success"])
    fail_count = sum(1 for r in results if not r["success"])

    print("\n發射結果:")
    print(f" 成功: {success_count}/{CONCURRENT_SIGNALS}")
    print(f" 失敗: {fail_count}/{CONCURRENT_SIGNALS}")
    print(f" 耗時: {duration:.3f} 秒")

    if fail_count > 0:
        print("\n失敗詳情:")
        for r in results:
            if not r["success"]:
                print(f" - Index {r['index']}: {r.get('error', 'Unknown')}")

    # 3. Give the consumer time to process the stream.
    print("\n" + "-" * 70)
    print("[3] 等待 Consumer 處理 (5 秒)")
    print("-" * 70)
    await asyncio.sleep(5)

    # 4. Read the incident back from Redis.
    print("\n" + "-" * 70)
    print("[4] 驗證 Redis Incident")
    print("-" * 70)

    incident = await verify_redis_incident()

    if not incident:
        print("\n❌ 錯誤: 找不到測試 Incident!")
        print(" 請檢查 API 日誌: docker logs awoooi-api --tail 100")
        return

    incident_id = incident.get("incident_id", "N/A")
    signals = incident.get("signals", [])
    signal_count = len(signals)
    severity = incident.get("severity", "N/A")
    affected_services = incident.get("affected_services", [])

    print("\n找到 Incident:")
    print(f" incident_id: {incident_id}")
    print(f" signal_count: {signal_count}")
    print(f" severity: {severity}")
    print(f" affected_services: {affected_services}")

    # 5. Evaluate the result.
    print("\n" + "=" * 70)
    print("驗證結果")
    print("=" * 70)

    # Only signals emitted by this test count towards the aggregate.
    race_signals = [
        s for s in signals
        if s.get("alert_name", "").startswith("RaceConditionTest_")
    ]
    race_signal_count = len(race_signals)

    # Distribution of alert names among the aggregated signals.
    alert_names = [s.get("alert_name") for s in race_signals]
    unique_names = set(alert_names)

    print()
    passed = True

    # Check 1: every fired signal was aggregated.
    if race_signal_count == CONCURRENT_SIGNALS:
        print(f"[✅ PASS] Signal 數量: {race_signal_count}/{CONCURRENT_SIGNALS}")
    else:
        print(f"[❌ FAIL] Signal 數量: {race_signal_count}/{CONCURRENT_SIGNALS}")
        print(f" 遺失 {CONCURRENT_SIGNALS - race_signal_count} 筆 Signal!")
        passed = False

    # Check 2: alert names are unique (nothing was overwritten or duplicated).
    if len(unique_names) == race_signal_count:
        print(f"[✅ PASS] 唯一告警名稱: {len(unique_names)} 個 (無重複)")
    else:
        print(f"[❌ FAIL] 唯一告警名稱: {len(unique_names)} 個 (有重複被覆蓋)")
        passed = False

    # Check 3: the test target is listed in affected_services.
    if TEST_TARGET in affected_services:
        print(f"[✅ PASS] affected_services 包含 '{TEST_TARGET}'")
    else:
        print(f"[❌ FAIL] affected_services 不包含 '{TEST_TARGET}'")
        passed = False

    # Final verdict.
    print()
    print("=" * 70)
    if passed:
        print("🎉 Race Condition 測試 PASSED!")
        print(f" {CONCURRENT_SIGNALS} 筆併發告警全部成功聚合!")
        print(" Lua Script 原子操作有效防止了資料遺失!")
    else:
        print("💥 Race Condition 測試 FAILED!")
        print(" 存在資料遺失,需要進一步調查!")
    print("=" * 70)

    # Hint for inspecting the detailed logs.
    print("\n檢查詳細日誌:")
    print("docker logs awoooi-api --tail 100 | grep -E '(atomic|aggregate|race)'")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive the async test flow to completion.
    entry_coro = main()
    asyncio.run(entry_coro)
|
||||
83
apps/api/scripts/test_signal_stream.py
Normal file
83
apps/api/scripts/test_signal_stream.py
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 6.1 測試腳本: Redis Streams Signal 流程驗證
|
||||
=================================================
|
||||
|
||||
功能:
|
||||
1. 發送測試 Signal 到 /api/v1/webhooks/signals
|
||||
2. 驗證 Redis Stream 中有新訊息
|
||||
3. 輸出 Stream 狀態
|
||||
|
||||
使用:
|
||||
python scripts/test_signal_stream.py
|
||||
|
||||
環境變數:
|
||||
API_BASE_URL: API 基礎 URL (預設: http://localhost:8000)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
# Base URL of the API under test. A trailing slash is dropped so that the
# endpoint built below never contains a double slash.
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000").rstrip("/")
# Webhook endpoint the test signal is posted to.
SIGNAL_ENDPOINT = f"{API_BASE_URL}/api/v1/webhooks/signals"
|
||||
|
||||
|
||||
async def send_test_signal() -> dict:
    """POST one fixed test signal to ``SIGNAL_ENDPOINT``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        httpx.HTTPStatusError: when the response status is 4xx/5xx.
    """
    signal_body = dict(
        source="test-script",
        alert_name="TestSignal",
        severity="warning",
        namespace="awoooi-test",
        target="test-pod-123",
        message="Phase 6.1 Event Bus 驗證測試",
        labels={"team": "devops", "env": "test"},
        annotations={"runbook_url": "https://wiki.example.com/runbook"},
    )

    async with httpx.AsyncClient(timeout=10.0) as http:
        reply = await http.post(SIGNAL_ENDPOINT, json=signal_body)
        # Turn 4xx/5xx into an exception before decoding the body.
        reply.raise_for_status()
        return reply.json()
|
||||
|
||||
|
||||
async def main():
    """Send one test signal and print follow-up verification hints.

    Exits the process with status 1 when sending fails; otherwise prints the
    returned message id and stream, followed by the manual commands for
    checking the consumer log and the Redis stream.
    """
    rule = "=" * 60
    for banner_line in (rule, "Phase 6.1 Event Bus 測試", rule, ""):
        print(banner_line)

    print(f"[1] 發送測試 Signal 到 {SIGNAL_ENDPOINT}")
    try:
        reply = await send_test_signal()
        print(" ✅ 成功!")
        print(f" Message ID: {reply.get('message_id')}")
        print(f" Stream: {reply.get('stream')}")
    except httpx.HTTPStatusError as http_err:
        print(f" ❌ HTTP 錯誤: {http_err.response.status_code}")
        print(f" {http_err.response.text}")
        sys.exit(1)
    except Exception as err:
        print(f" ❌ 錯誤: {err}")
        sys.exit(1)

    # Remaining steps are manual; only the instructions are printed.
    follow_up = (
        "",
        "[2] 驗證 Signal Worker (Consumer) 是否收到訊息",
        " 查看 API 日誌: docker logs awoooi-api | grep signal_received",
        "",
        "[3] 手動檢查 Redis Stream 狀態",
        " redis-cli XINFO STREAM stream:awoooi_signals",
        " redis-cli XINFO GROUPS stream:awoooi_signals",
        "",
        rule,
        "測試完成!",
        rule,
    )
    for text in follow_up:
        print(text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive the async test flow to completion.
    entry_coro = main()
    asyncio.run(entry_coro)
|
||||
360
apps/api/scripts/tracer_bullet_2.py
Normal file
360
apps/api/scripts/tracer_bullet_2.py
Normal file
@@ -0,0 +1,360 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tracer Bullet 2.0 - 全站閉環測試腳本
|
||||
Phase 4: E2E Integration Test
|
||||
|
||||
測試流程:
|
||||
1. 觸發假告警 (Mock Alert)
|
||||
2. GraphRAG 分析 (Blast Radius + Root Cause)
|
||||
3. 產生 ApprovalCard (Dry-Run)
|
||||
4. 人類批准 (Multi-Sig)
|
||||
5. MCP 模擬執行
|
||||
|
||||
執行方式:
|
||||
cd apps/api
|
||||
python scripts/tracer_bullet_2.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
# ==================== 模擬模組導入 ====================
|
||||
|
||||
# 實際運行時這些會從專案導入
|
||||
# from src.services import (
|
||||
# topology_graph, trust_engine, multi_sig_engine, dry_run_engine
|
||||
# )
|
||||
# from src.plugins.finops import idle_scanner
|
||||
# from src.plugins.mcp import mcp_bridge
|
||||
|
||||
|
||||
# ==================== Test Configuration ====================
|
||||
|
||||
|
||||
class TracerBullet2:
|
||||
"""全站閉環測試器"""
|
||||
|
||||
def __init__(self):
|
||||
self.test_id = f"tb2-{datetime.utcnow().strftime('%Y%m%d%H%M%S')}"
|
||||
self.results: list[dict] = []
|
||||
|
||||
def log(self, step: str, status: str, data: dict | None = None):
|
||||
"""記錄測試結果"""
|
||||
result = {
|
||||
"step": step,
|
||||
"status": status,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"data": data or {},
|
||||
}
|
||||
self.results.append(result)
|
||||
emoji = "✅" if status == "PASS" else "❌" if status == "FAIL" else "🔄"
|
||||
print(f"{emoji} [{step}] {status}")
|
||||
if data:
|
||||
print(f" {json.dumps(data, indent=2, default=str)}")
|
||||
|
||||
# ==================== Step 1: Mock Alert ====================
|
||||
|
||||
async def step1_trigger_alert(self) -> dict:
|
||||
"""
|
||||
Step 1: 觸發假告警
|
||||
|
||||
模擬 Prometheus AlertManager 發送告警:
|
||||
- frontend 服務 5xx 錯誤率上升
|
||||
"""
|
||||
print("\n" + "=" * 60)
|
||||
print("STEP 1: TRIGGER MOCK ALERT")
|
||||
print("=" * 60)
|
||||
|
||||
alert = {
|
||||
"alertname": "HighErrorRate",
|
||||
"service": "frontend",
|
||||
"namespace": "production",
|
||||
"severity": "critical",
|
||||
"error_rate": 15.2, # 15% 5xx
|
||||
"threshold": 5.0,
|
||||
"fired_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
self.log("trigger_alert", "PASS", alert)
|
||||
return alert
|
||||
|
||||
# ==================== Step 2: GraphRAG Analysis ====================
|
||||
|
||||
async def step2_graphrag_analysis(self, alert: dict) -> dict:
|
||||
"""
|
||||
Step 2: GraphRAG 分析
|
||||
|
||||
呼叫 TopologyGraph.get_blast_radius_and_root_cause()
|
||||
分析:
|
||||
- Blast Radius: frontend 掛了誰會跟著掛
|
||||
- Root Cause: frontend 的依賴誰目前有問題
|
||||
"""
|
||||
print("\n" + "=" * 60)
|
||||
print("STEP 2: GRAPHRAG ANALYSIS")
|
||||
print("=" * 60)
|
||||
|
||||
target_service = alert["service"]
|
||||
|
||||
# Mock GraphRAG 結果 (實際會呼叫 topology_graph)
|
||||
analysis = {
|
||||
"targetService": target_service,
|
||||
"blastRadius": {
|
||||
"affectedServices": ["ingress"],
|
||||
"affectedCount": 1,
|
||||
"criticalPath": ["ingress -> frontend"],
|
||||
"impactSummary": "If 'frontend' goes down, 1 upstream service (ingress) will be affected.",
|
||||
},
|
||||
"rootCause": {
|
||||
"unhealthyDependencies": ["postgres-db"],
|
||||
"dependencyChain": ["auth-service", "product-api", "order-api", "postgres-db", "redis-cache"],
|
||||
"probableRootCauses": ["postgres-db"],
|
||||
"analysisSummary": "Found 1 unhealthy dependency: postgres-db. Probable root cause: postgres-db.",
|
||||
},
|
||||
"analyzedAt": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
# 視覺化輸出
|
||||
print("\n[BLAST RADIUS - Upstream Impact]")
|
||||
print(" ┌─────────────────────┐")
|
||||
print(" │ ingress │")
|
||||
print(" └─────────┬───────────┘")
|
||||
print(" │ depends on")
|
||||
print(" ▼")
|
||||
print(" ┌─────────────────────┐")
|
||||
print(" │ frontend │ X")
|
||||
print(" └─────────────────────┘")
|
||||
|
||||
print("\n[ROOT CAUSE - Downstream Chain]")
|
||||
print(" ┌─────────────────────┐")
|
||||
print(" │ frontend │ !")
|
||||
print(" └─────────┬───────────┘")
|
||||
print(" │ calls")
|
||||
print(" ▼")
|
||||
print(" ┌─────────────────────┐")
|
||||
print(" │ postgres-db │ X (UNHEALTHY)")
|
||||
print(" └─────────────────────┘")
|
||||
|
||||
self.log("graphrag_analysis", "PASS", analysis)
|
||||
return analysis
|
||||
|
||||
# ==================== Step 3: Dry-Run & ApprovalCard ====================
|
||||
|
||||
async def step3_generate_approval(self, analysis: dict) -> dict:
|
||||
"""
|
||||
Step 3: 產生 ApprovalCard
|
||||
|
||||
根據分析結果,建議重啟 postgres-db
|
||||
執行 Dry-Run 檢查
|
||||
"""
|
||||
print("\n" + "=" * 60)
|
||||
print("STEP 3: DRY-RUN & APPROVAL CARD")
|
||||
print("=" * 60)
|
||||
|
||||
root_cause = analysis["rootCause"]["probableRootCauses"][0]
|
||||
|
||||
# 建議動作
|
||||
proposed_action = {
|
||||
"operation": "restart_pod",
|
||||
"parameters": {
|
||||
"pod_name": f"{root_cause}-0",
|
||||
"namespace": "production",
|
||||
"graceful": True,
|
||||
},
|
||||
"reason": f"Auto-suggested based on GraphRAG root cause analysis: {root_cause} is unhealthy",
|
||||
}
|
||||
|
||||
# Mock Dry-Run 結果
|
||||
dry_run_result = {
|
||||
"checks": [
|
||||
{"name": "RBAC Permission", "passed": True, "message": "User has pod/delete permission"},
|
||||
{"name": "Syntax Validation", "passed": True, "message": "Parameters valid"},
|
||||
{"name": "Resource Exists", "passed": True, "message": "Pod postgres-db-0 exists"},
|
||||
{"name": "No PDB Violation", "passed": True, "message": "PodDisruptionBudget allows 1 eviction"},
|
||||
],
|
||||
"overallPassed": True,
|
||||
"blastRadius": {
|
||||
"affectedPods": 1,
|
||||
"affectedServices": ["postgres-db"],
|
||||
"dataImpact": "NONE", # Graceful restart
|
||||
},
|
||||
"riskLevel": "high", # Database 操作
|
||||
}
|
||||
|
||||
# 產生 ApprovalCard
|
||||
approval_card = {
|
||||
"approvalId": f"approval-{self.test_id}",
|
||||
"action": proposed_action,
|
||||
"dryRunResult": dry_run_result,
|
||||
"requiredSignatures": 2, # HIGH risk = 2-sig
|
||||
"allowedRoles": ["admin", "devops", "sre"],
|
||||
"createdAt": datetime.utcnow().isoformat(),
|
||||
"expiresAt": None, # No expiry for critical ops
|
||||
}
|
||||
|
||||
print("\n[APPROVAL CARD]")
|
||||
print(f" Action: {proposed_action['operation']}")
|
||||
print(f" Target: {proposed_action['parameters']['pod_name']}")
|
||||
print(f" Risk Level: {dry_run_result['riskLevel'].upper()}")
|
||||
print(f" Required Signatures: {approval_card['requiredSignatures']}")
|
||||
print(f" Dry-Run: {'PASSED' if dry_run_result['overallPassed'] else 'FAILED'}")
|
||||
|
||||
self.log("generate_approval", "PASS", approval_card)
|
||||
return approval_card
|
||||
|
||||
# ==================== Step 4: Multi-Sig Approval ====================
|
||||
|
||||
async def step4_multisig_approval(self, approval_card: dict) -> dict:
|
||||
"""
|
||||
Step 4: 人類批准 (Multi-Sig)
|
||||
|
||||
模擬兩位管理者簽名:
|
||||
1. DevOps Engineer
|
||||
2. SRE Lead
|
||||
"""
|
||||
print("\n" + "=" * 60)
|
||||
print("STEP 4: MULTI-SIG APPROVAL")
|
||||
print("=" * 60)
|
||||
|
||||
approval_id = approval_card["approvalId"]
|
||||
|
||||
# 第一位簽名
|
||||
sig1 = {
|
||||
"userId": "devops-alice",
|
||||
"role": "devops",
|
||||
"signedAt": datetime.utcnow().isoformat(),
|
||||
"comment": "GraphRAG analysis looks correct. Approving restart.",
|
||||
}
|
||||
print(f"\n[SIGNATURE 1] {sig1['role']}: {sig1['userId']}")
|
||||
print(f" Comment: {sig1['comment']}")
|
||||
|
||||
# 第二位簽名
|
||||
sig2 = {
|
||||
"userId": "sre-bob",
|
||||
"role": "sre",
|
||||
"signedAt": datetime.utcnow().isoformat(),
|
||||
"comment": "Verified PDB. Safe to proceed.",
|
||||
}
|
||||
print(f"\n[SIGNATURE 2] {sig2['role']}: {sig2['userId']}")
|
||||
print(f" Comment: {sig2['comment']}")
|
||||
|
||||
# 批准結果
|
||||
approval_result = {
|
||||
"approvalId": approval_id,
|
||||
"status": "APPROVED",
|
||||
"signatures": [sig1, sig2],
|
||||
"approvedAt": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
print(f"\n[APPROVAL STATUS] {approval_result['status']}")
|
||||
print(f" Signatures: {len(approval_result['signatures'])}/{approval_card['requiredSignatures']}")
|
||||
|
||||
self.log("multisig_approval", "PASS", approval_result)
|
||||
return approval_result
|
||||
|
||||
# ==================== Step 5: MCP Execution ====================
|
||||
|
||||
async def step5_mcp_execution(self, approval_result: dict, approval_card: dict) -> dict:
|
||||
"""
|
||||
Step 5: MCP 模擬執行
|
||||
|
||||
透過 MCP Bridge 執行操作
|
||||
(Phase 3 為模擬,Phase 4+ 連接真實 K8s)
|
||||
"""
|
||||
print("\n" + "=" * 60)
|
||||
print("STEP 5: MCP EXECUTION")
|
||||
print("=" * 60)
|
||||
|
||||
action = approval_card["action"]
|
||||
|
||||
# TOCTOU 保護: 再次執行 Dry-Run
|
||||
print("\n[TOCTOU CHECK] Re-running dry-run before execution...")
|
||||
toctou_passed = True # Mock
|
||||
print(f" Result: {'PASSED' if toctou_passed else 'VOIDED'}")
|
||||
|
||||
if not toctou_passed:
|
||||
self.log("mcp_execution", "FAIL", {"reason": "TOCTOU conflict detected"})
|
||||
return {"status": "VOIDED"}
|
||||
|
||||
# MCP 執行
|
||||
execution_result = {
|
||||
"executionId": f"exec-{self.test_id}",
|
||||
"operation": action["operation"],
|
||||
"parameters": action["parameters"],
|
||||
"status": "SUCCESS",
|
||||
"output": {
|
||||
"message": f"Pod {action['parameters']['pod_name']} restarted successfully",
|
||||
"newPodName": "postgres-db-0", # Same name after restart
|
||||
"restartTime": "2.3s",
|
||||
},
|
||||
"executedAt": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
print(f"\n[EXECUTION RESULT]")
|
||||
print(f" Status: {execution_result['status']}")
|
||||
print(f" Output: {execution_result['output']['message']}")
|
||||
print(f" Restart Time: {execution_result['output']['restartTime']}")
|
||||
|
||||
# 更新 Trust Engine
|
||||
print("\n[TRUST ENGINE] Recording approval for progressive autonomy...")
|
||||
print(" Action Pattern: restart_pod:postgres-*")
|
||||
print(" Trust Score: +1")
|
||||
|
||||
self.log("mcp_execution", "PASS", execution_result)
|
||||
return execution_result
|
||||
|
||||
# ==================== Run All ====================
|
||||
|
||||
async def run(self):
|
||||
"""執行完整測試流程"""
|
||||
print("\n" + "=" * 60)
|
||||
print("TRACER BULLET 2.0 - FULL LOOP TEST")
|
||||
print(f"Test ID: {self.test_id}")
|
||||
print("=" * 60)
|
||||
|
||||
try:
|
||||
# Step 1: Trigger Alert
|
||||
alert = await self.step1_trigger_alert()
|
||||
|
||||
# Step 2: GraphRAG Analysis
|
||||
analysis = await self.step2_graphrag_analysis(alert)
|
||||
|
||||
# Step 3: Dry-Run & Approval Card
|
||||
approval_card = await self.step3_generate_approval(analysis)
|
||||
|
||||
# Step 4: Multi-Sig Approval
|
||||
approval_result = await self.step4_multisig_approval(approval_card)
|
||||
|
||||
# Step 5: MCP Execution
|
||||
execution_result = await self.step5_mcp_execution(approval_result, approval_card)
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 60)
|
||||
print("TEST SUMMARY")
|
||||
print("=" * 60)
|
||||
|
||||
passed = sum(1 for r in self.results if r["status"] == "PASS")
|
||||
failed = sum(1 for r in self.results if r["status"] == "FAIL")
|
||||
|
||||
print(f" Total Steps: {len(self.results)}")
|
||||
print(f" Passed: {passed}")
|
||||
print(f" Failed: {failed}")
|
||||
print(f" Status: {'ALL PASSED' if failed == 0 else 'SOME FAILED'}")
|
||||
|
||||
return {
|
||||
"testId": self.test_id,
|
||||
"status": "PASS" if failed == 0 else "FAIL",
|
||||
"results": self.results,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
self.log("unexpected_error", "FAIL", {"error": str(e)})
|
||||
raise
|
||||
|
||||
|
||||
# ==================== Main ====================
|
||||
|
||||
|
||||
if __name__ == "__main__":
    tracer = TracerBullet2()
    outcome = asyncio.run(tracer.run())
    # Propagate the verdict to the shell so a failed run is detectable
    # from the exit status.
    raise SystemExit(0 if outcome["status"] == "PASS" else 1)
|
||||
Reference in New Issue
Block a user