#!/usr/bin/env bash
# scripts/ops/dr-drill.sh
# Sprint 5.2: monthly DR drill — restore the latest Velero backup into a test
# namespace, verify it, then clean up.
#
# Deploy: cron 0 3 1-7 * 0 (intended: first Sunday of each month, 03:00) on 121
#   NOTE(review): in standard cron, when both day-of-month and day-of-week are
#   restricted the job runs when EITHER field matches, so this expression also
#   fires on every day 1-7 and on every Sunday. Confirm the installed schedule.
# Usage: bash dr-drill.sh [--dry-run]
#
# Environment (all optional):
#   KUBECONFIG           kubeconfig path (default /etc/rancher/k3s/k3s.yaml)
#   RESTORE_TIMEOUT      seconds to wait for the Velero restore (default 600)
#   SECRETS_FILE         env file sourced for TELEGRAM_* / chat-id variables
#   DR_SOURCE_NAMESPACE  namespace inside the backup to restore from; when
#                        unset, the backup's first includedNamespaces entry
#                        is used
#
# Flow:
#   1. Find the latest Velero backup
#   2. Restore it into the awoooi-dr-test namespace
#   3. Wait for the restore (up to RESTORE_TIMEOUT s) and for the Pods to be
#      Running (up to 300 s)
#   4. Verify the API health endpoint
#   5. Clean up the dr-test namespace and the Restore resource
#   6. Send the result notification (AWOOI API first, Telegram as fallback)
#
# 2026-04-09 Claude Sonnet 4.6 Asia/Taipei

set -euo pipefail

KUBECONFIG="${KUBECONFIG:-/etc/rancher/k3s/k3s.yaml}"
DR_NAMESPACE="awoooi-dr-test"
RESTORE_TIMEOUT="${RESTORE_TIMEOUT:-600}"  # 10 minutes
SECRETS_FILE="${SECRETS_FILE:-/home/wooo/awoooi-ops-secrets/secrets.env}"
DR_SOURCE_NAMESPACE="${DR_SOURCE_NAMESPACE:-}"
DRY_RUN="${1:-}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

if [[ -f "$SECRETS_FILE" ]]; then
  # shellcheck source=/dev/null
  source "$SECRETS_FILE"
fi

TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
RESTORE_NAME="dr-drill-${TIMESTAMP}"
START_TIME=$(date +%s)

# Print a timestamped log line to stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S %z')] $*"
}

# Report the drill status through the notify-awoooi-ops.sh helper that lives
# next to this script.
# Arguments: $1 - status string, $2 - detail message
# Returns:   1 when the helper is missing or not executable, otherwise the
#            helper's exit status.
notify_awoooi_ops() {
  local status="$1"
  local msg="$2"
  local helper="${SCRIPT_DIR}/notify-awoooi-ops.sh"

  [[ -x "$helper" ]] || return 1

  AWOOI_OPS_ALERTNAME="DRDrillStatus" \
  AWOOI_OPS_JOB_NAME="DR Drill 月度演練" \
  AWOOI_OPS_STATUS="$status" \
  AWOOI_OPS_SEVERITY="info" \
  AWOOI_OPS_SOURCE="dr-drill" \
  AWOOI_OPS_COMPONENT="disaster-recovery" \
  AWOOI_OPS_SUMMARY="DR Drill ${status}" \
  AWOOI_OPS_DETAIL="$msg" \
  AWOOI_OPS_DURATION_SECONDS="$(elapsed)" \
    "$helper" >/dev/null
}

# Send a notification.
# Arguments: $1 - message text, $2 - status (default "success")
# Returns:   0 always (the Telegram fallback is best-effort).
notify_telegram() {
  local msg="$1"
  local status="${2:-success}"

  # Primary path: hand the message to the AWOOI API, which sends it through
  # TelegramGateway and mirrors it to AwoooP. Only when the API is unreachable
  # or the helper is not deployed do we fall back to posting to Telegram
  # directly.
  notify_awoooi_ops "$status" "$msg" && return 0

  local chat_id="${TELEGRAM_ALERT_CHAT_ID:-${SRE_GROUP_CHAT_ID:--1003711974679}}"
  if [[ -n "${TELEGRAM_BOT_TOKEN:-}" && -n "$chat_id" ]]; then
    # Best-effort: a failed fallback notification must not fail the drill.
    curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      -d "chat_id=${chat_id}" \
      -d "parse_mode=HTML" \
      --data-urlencode "text=${msg}" \
      >/dev/null 2>&1 || true
  fi
}

# Run kubectl via sudo against the configured kubeconfig.
kubectl_cmd() {
  sudo kubectl --kubeconfig="$KUBECONFIG" "$@"
}

# Print the number of seconds elapsed since the script started.
elapsed() {
  local end
  end=$(date +%s)
  echo $(( end - START_TIME ))
}

# ── Step 1: find the latest Velero backup ────────────────────────────────────
# Outputs: the backup name on stdout; diagnostics on stderr, because callers
#          capture stdout and would otherwise swallow the error message.
# Returns: 1 when there is no backup or the newest one is not Completed.
find_latest_backup() {
  local latest
  latest=$(kubectl_cmd get backups -n velero \
    --sort-by='.metadata.creationTimestamp' \
    -o jsonpath='{.items[-1].metadata.name}' 2>/dev/null) || latest=""

  if [[ -z "$latest" ]]; then
    log "❌ 找不到任何 Velero backup" >&2
    return 1
  fi

  local phase
  phase=$(kubectl_cmd get backup "$latest" -n velero \
    -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""

  if [[ "$phase" != "Completed" ]]; then
    log "❌ 最新 backup ${latest} 狀態 ${phase},非 Completed" >&2
    return 1
  fi

  echo "$latest"
}

# ── Step 2: remove a leftover dr-test namespace (if any) ─────────────────────
cleanup_dr_namespace() {
  if ! kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; then
    return 0
  fi

  log "🗑️ 清理舊的 ${DR_NAMESPACE} namespace..."
  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || true

  # Wait for the deletion to finish (30 polls, 5 s apart).
  local polls=0
  while kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null && (( polls < 30 )); do
    sleep 5
    # Plain assignment: `((polls++))` returns status 1 when the old value is 0
    # and would abort the script under `set -e`.
    polls=$(( polls + 1 ))
  done
  log "✅ 舊 namespace 已清理"
}

# ── Step 3: run the Velero restore ───────────────────────────────────────────
# Arguments: $1 - name of the Velero backup to restore
# Returns:   0 when the Restore reaches Completed; 1 on failure or timeout.
#
# NOTE(review): the Restore manifest and the head of the polling loop were
# missing from the reviewed copy of this file. They are reconstructed here
# from the names used elsewhere in the script (RESTORE_NAME, RESTORE_TIMEOUT,
# DR_NAMESPACE, the "velero" namespace). Confirm the spec — in particular the
# source namespace — against the deployed script before relying on it.
run_restore() {
  local backup_name="$1"
  log "🔄 開始還原: backup=${backup_name} → namespace=${DR_NAMESPACE}"

  # Source namespace: explicit override first, otherwise the first namespace
  # the backup declares in spec.includedNamespaces.
  local source_ns="$DR_SOURCE_NAMESPACE"
  if [[ -z "$source_ns" ]]; then
    source_ns=$(kubectl_cmd get backup "$backup_name" -n velero \
      -o jsonpath='{.spec.includedNamespaces[0]}' 2>/dev/null || echo "")
  fi
  if [[ -z "$source_ns" || "$source_ns" == "*" ]]; then
    log "❌ 無法判定來源 namespace,請設定 DR_SOURCE_NAMESPACE"
    return 1
  fi

  if ! kubectl_cmd create -f - <<EOF
apiVersion: velero.io/v1
kind: Restore
metadata:
  name: ${RESTORE_NAME}
  namespace: velero
spec:
  backupName: ${backup_name}
  includedNamespaces:
    - ${source_ns}
  namespaceMapping:
    ${source_ns}: ${DR_NAMESPACE}
EOF
  then
    log "❌ 建立 Restore 資源失敗"
    return 1
  fi

  # Poll the Restore phase every 10 s until it is terminal or we time out.
  local waited=0
  local phase
  while (( waited < RESTORE_TIMEOUT )); do
    phase=$(kubectl_cmd get restore "$RESTORE_NAME" -n velero \
      -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
    case "$phase" in
      Completed)
        log "✅ Restore Completed"
        return 0
        ;;
      Failed|PartiallyFailed|FailedValidation)
        log "❌ Restore 失敗: ${phase}"
        return 1
        ;;
    esac
    sleep 10
    waited=$(( waited + 10 ))
    log "⏳ Restore 狀態: ${phase:-Pending} (${waited}s)"
  done

  log "❌ Restore timeout"
  return 1
}

# ── Step 4: wait for the Pods ────────────────────────────────────────────────
# Polls every 10 s for up to 300 s until every Pod in the DR namespace reports
# STATUS "Running".
# Returns: 0 always — a timeout does not abort; the health check decides.
wait_pods_ready() {
  log "⏳ 等待 ${DR_NAMESPACE} Pod Ready..."

  local waited=0
  local pod_lines running total
  while (( waited < 300 )); do
    # One kubectl call per poll. Counting with awk always yields a single
    # integer, unlike `grep -c ... || echo 0`, which prints "0" twice when
    # nothing matches and then breaks the arithmetic test below.
    pod_lines=$(kubectl_cmd get pods -n "$DR_NAMESPACE" --no-headers 2>/dev/null || true)
    total=$(awk 'NF { n++ } END { print n + 0 }' <<<"$pod_lines")
    # Column 3 of `kubectl get pods --no-headers` is STATUS.
    running=$(awk '$3 == "Running" { n++ } END { print n + 0 }' <<<"$pod_lines")

    log "Pod 狀態: ${running}/${total} Running"
    if (( total > 0 && running == total )); then
      log "✅ 所有 Pod Ready"
      return 0
    fi
    sleep 10
    waited=$(( waited + 10 ))
  done

  log "⚠️ Pod 等待 timeout,繼續驗證"
  return 0
}

# ── Step 5: verify API health ────────────────────────────────────────────────
# Calls /api/v1/health on port 8000 of the first Pod labelled app=awoooi-api.
# Returns: 0 on HTTP 200, or when no API Pod IP is found (check skipped);
#          1 on any other response.
verify_api_health() {
  log "🩺 驗證 DR namespace API health..."

  # Look up the API Pod IP.
  local api_ip
  api_ip=$(kubectl_cmd get pods -n "$DR_NAMESPACE" \
    -l app=awoooi-api \
    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || echo "")

  if [[ -z "$api_ip" ]]; then
    log "⚠️ 找不到 API Pod IP,跳過 health check"
    return 0
  fi

  # curl already prints "000" via -w when the request fails, so only default
  # the value when nothing was printed at all (e.g. curl is missing).
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 10 --max-time 30 \
    "http://${api_ip}:8000/api/v1/health" 2>/dev/null || true)
  http_code="${http_code:-000}"

  if [[ "$http_code" == "200" ]]; then
    log "✅ API health: 200 OK (${api_ip})"
    return 0
  fi

  log "❌ API health: ${http_code} (${api_ip})"
  return 1
}

# ── Step 6: clean up the dr-test resources ───────────────────────────────────
# Best-effort: deletion errors are ignored so the final report is still sent.
cleanup_all() {
  log "🗑️ 清理 DR test 資源..."
  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || true
  kubectl_cmd delete restore "$RESTORE_NAME" -n velero 2>/dev/null || true
  log "✅ 清理完成"
}

# ── Main ─────────────────────────────────────────────────────────────────────
# Exits 1 when no valid backup exists or the drill fails; returns 0 otherwise.
main() {
  log "=== DR Drill 開始 (${TIMESTAMP}) ==="

  local backup

  if [[ "$DRY_RUN" == "--dry-run" ]]; then
    log "🔍 DRY RUN 模式 — 只檢查 backup,不執行還原"
    if ! backup=$(find_latest_backup); then
      notify_telegram "❌ DR Drill 失敗: 找不到有效 backup" "failed"
      exit 1
    fi
    log "✅ 最新 backup: ${backup}"
    notify_telegram "🔍 DR Drill DRY RUN
├ 最新 backup: ${backup}
└ 狀態: Completed ✅ (未執行還原)" "success"
    return 0
  fi

  if ! backup=$(find_latest_backup); then
    notify_telegram "❌ DR Drill 失敗
└ 找不到有效 Velero backup" "failed"
    exit 1
  fi
  log "📦 使用 backup: ${backup}"

  cleanup_dr_namespace

  # "⚠️" means "not evaluated"; the health check only runs after a
  # successful restore.
  local restore_status="⚠️"
  local health_status="⚠️"

  if run_restore "$backup"; then
    restore_status="✅"
    wait_pods_ready
    if verify_api_health; then
      health_status="✅"
    else
      health_status="❌"
    fi
  else
    restore_status="❌"
  fi

  cleanup_all

  local duration
  duration=$(elapsed)
  local minutes=$(( duration / 60 ))
  local seconds=$(( duration % 60 ))

  local overall="✅ PASS"
  local notify_status="success"
  if [[ "$health_status" == "❌" || "$restore_status" == "❌" ]]; then
    overall="❌ FAIL"
    notify_status="failed"
  fi

  log "=== DR Drill 完成: ${overall} (${minutes}m${seconds}s) ==="

  notify_telegram "${overall} DR Drill 月度演練
├ 備份: ${backup}
├ Restore: ${restore_status}
├ API Health: ${health_status}
├ 耗時: ${minutes}m${seconds}s
└ 時間: $(date '+%Y-%m-%d %H:%M') +0800" "$notify_status"

  if [[ "$notify_status" == "failed" ]]; then
    exit 1
  fi
  return 0
}

main "$@"