From 5ead01abf7f7fe702d80f56703f30b8a40ae6be2 Mon Sep 17 00:00:00 2001
From: OG T
Date: Thu, 9 Apr 2026 10:42:12 +0800
Subject: [PATCH] =?UTF-8?q?feat(ops):=20dr-drill.sh=20=E2=80=94=20?=
 =?UTF-8?q?=E6=AF=8F=E6=9C=88=20DR=20Drill=20=E8=87=AA=E5=8B=95=E6=BC=94?=
 =?UTF-8?q?=E7=B7=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

每月第一個週日 03:00 (121 cron) 執行:
1. 找最新 Velero backup (Completed)
2. 還原到 awoooi-dr-test namespace
3. 等待 Pod Ready + API health 驗證
4. 清理 dr-test namespace + restore 資源
5. Telegram 通知 PASS/FAIL + 耗時

支援 --dry-run 模式 (只檢查 backup,不還原)。
dry-run 驗證通過: daily-awoooi-prod-20260409020003

Co-Authored-By: Claude Sonnet 4.6
---
 scripts/ops/dr-drill.sh | 313 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 313 insertions(+)
 create mode 100644 scripts/ops/dr-drill.sh

diff --git a/scripts/ops/dr-drill.sh b/scripts/ops/dr-drill.sh
new file mode 100644
index 00000000..051f75c1
--- /dev/null
+++ b/scripts/ops/dr-drill.sh
@@ -0,0 +1,313 @@
+#!/usr/bin/env bash
+# scripts/ops/dr-drill.sh
+# Sprint 5.2: monthly DR drill — restore the latest Velero backup into a
+# scratch namespace, verify it, then clean up.
+#
+# Schedule (host 121): 03:00 on the first Sunday of each month. cron ORs the
+# day-of-month and day-of-week fields when both are restricted, so
+# "0 3 1-7 * 0" fires on days 1-7 AND on every Sunday. Gate on the weekday in
+# the command instead:
+#   0 3 1-7 * * [ "$(date +\%u)" = 7 ] && bash /path/to/dr-drill.sh
+# Usage: bash dr-drill.sh [--dry-run]
+#
+# Flow:
+#   1. Find the latest Velero backup (it must be Completed)
+#   2. Restore it into the awoooi-dr-test namespace
+#   3. Wait for the pods to become Ready (up to 5 minutes)
+#   4. Check the API health endpoint
+#   5. Delete the dr-test namespace and the Restore object
+#   6. Report the result to Telegram
+#
+# Environment overrides: KUBECONFIG, RESTORE_TIMEOUT, SECRETS_FILE,
+# SOURCE_NAMESPACE. TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID come from the
+# environment or from SECRETS_FILE; without them notifications are skipped.
+#
+# 2026-04-09 Claude Sonnet 4.6 Asia/Taipei
+
+set -euo pipefail
+
+KUBECONFIG="${KUBECONFIG:-/etc/rancher/k3s/k3s.yaml}"
+DR_NAMESPACE="awoooi-dr-test"
+# NOTE(review): the Restore manifest was missing from the copy of this patch
+# that was reviewed; the source namespace is inferred from the backup name in
+# the commit message (daily-awoooi-prod-*). Confirm before merging.
+SOURCE_NAMESPACE="${SOURCE_NAMESPACE:-awoooi-prod}"
+RESTORE_TIMEOUT="${RESTORE_TIMEOUT:-600}"  # seconds to wait for the Restore
+SECRETS_FILE="${SECRETS_FILE:-/home/wooo/awoooi-ops-secrets/secrets.env}"
+DRY_RUN="${1:-}"
+
+# The secrets file is optional and its path is only known at runtime.
+# shellcheck disable=SC1090
+[[ -f "$SECRETS_FILE" ]] && source "$SECRETS_FILE"
+
+TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
+# Kubernetes object names must be DNS-1123 subdomains, which forbid "_", so
+# the Restore name uses the timestamp with "-" instead.
+RESTORE_NAME="dr-drill-${TIMESTAMP//_/-}"
+START_TIME=$(date +%s)
+
+# Print a timestamped message to stdout.
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S %z')] $*"; }
+
+# Send $1 to Telegram; a no-op without credentials and never fatal.
+notify_telegram() {
+  local msg="$1"
+  if [[ -n "${TELEGRAM_BOT_TOKEN:-}" && -n "${TELEGRAM_CHAT_ID:-}" ]]; then
+    # Form-encode the fields: the messages contain raw newlines, which are
+    # not valid inside a hand-built JSON string literal.
+    curl -s --max-time 15 -X POST \
+      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
+      --data-urlencode "text=${msg}" \
+      --data-urlencode "parse_mode=HTML" \
+      > /dev/null 2>&1 || true
+  fi
+}
+
+# Run kubectl through sudo against the configured kubeconfig.
+kubectl_cmd() {
+  sudo kubectl --kubeconfig="$KUBECONFIG" "$@"
+}
+
+# Print the number of seconds elapsed since the script started.
+elapsed() {
+  local end
+  end=$(date +%s)
+  echo $(( end - START_TIME ))
+}
+
+# ── Step 1: find the latest Velero backup ────────────────────────────────────
+# Prints the backup name on stdout. Diagnostics go to stderr because callers
+# capture stdout. Returns 1 when there is no backup or the newest one is not
+# Completed.
+find_latest_backup() {
+  local latest
+  latest=$(kubectl_cmd get backups -n velero \
+    --sort-by='.metadata.creationTimestamp' \
+    -o jsonpath='{.items[-1].metadata.name}' 2>/dev/null) || latest=""
+  if [[ -z "$latest" ]]; then
+    log "❌ 找不到任何 Velero backup" >&2
+    return 1
+  fi
+  local phase
+  phase=$(kubectl_cmd get backup "$latest" -n velero \
+    -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
+  if [[ "$phase" != "Completed" ]]; then
+    log "❌ 最新 backup ${latest} 狀態 ${phase},非 Completed" >&2
+    return 1
+  fi
+  echo "$latest"
+}
+
+# ── Step 2: remove a leftover dr-test namespace, if any ──────────────────────
+# Polls for up to 150 s after the delete; never fails the run.
+cleanup_dr_namespace() {
+  if ! kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; then
+    return 0
+  fi
+  log "🗑️ 清理舊的 ${DR_NAMESPACE} namespace..."
+  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || true
+  local i=0
+  while kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null && (( i < 30 )); do
+    sleep 5
+    # Not ((i++)): that evaluates to 0 on the first pass, returns status 1
+    # and would abort the script under set -e.
+    i=$(( i + 1 ))
+  done
+  if kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; then
+    log "⚠️ ${DR_NAMESPACE} namespace 仍存在,繼續執行"
+  else
+    log "✅ 舊 namespace 已清理"
+  fi
+}
+
+# ── Step 3: run the Velero restore ───────────────────────────────────────────
+# $1 is the backup name. Creates the Restore object and polls its phase every
+# 10 s for up to RESTORE_TIMEOUT seconds. Returns 0 only on Completed.
+run_restore() {
+  local backup_name="$1"
+  log "🔄 開始還原: backup=${backup_name} → namespace=${DR_NAMESPACE}"
+
+  # NOTE(review): manifest reconstructed (see SOURCE_NAMESPACE above); check
+  # it against the original before merging.
+  if ! printf '%s\n' \
+      "apiVersion: velero.io/v1" \
+      "kind: Restore" \
+      "metadata:" \
+      "  name: ${RESTORE_NAME}" \
+      "  namespace: velero" \
+      "spec:" \
+      "  backupName: ${backup_name}" \
+      "  includedNamespaces:" \
+      "    - ${SOURCE_NAMESPACE}" \
+      "  namespaceMapping:" \
+      "    ${SOURCE_NAMESPACE}: ${DR_NAMESPACE}" \
+    | kubectl_cmd create -f -; then
+    log "❌ Restore 建立失敗"
+    return 1
+  fi
+
+  local i=0
+  local phase
+  while (( i < RESTORE_TIMEOUT )); do
+    phase=$(kubectl_cmd get restore "$RESTORE_NAME" -n velero \
+      -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
+    case "$phase" in
+      Completed)
+        log "✅ Restore Completed"
+        return 0
+        ;;
+      Failed|PartiallyFailed|FailedValidation)
+        log "❌ Restore 失敗: ${phase}"
+        return 1
+        ;;
+    esac
+    sleep 10
+    i=$(( i + 10 ))
+    log "⏳ Restore 狀態: ${phase:-Pending} (${i}s)"
+  done
+  log "❌ Restore timeout"
+  return 1
+}
+
+# ── Step 4: wait for the pods to become Ready ────────────────────────────────
+# Polls every 10 s for up to 300 s. A pod counts as ready when it is Running
+# with every container ready (READY column n/n); Completed pods are ignored.
+# Always returns 0 so that the health check decides the outcome.
+wait_pods_ready() {
+  log "⏳ 等待 ${DR_NAMESPACE} Pod Ready..."
+  local i=0
+  local counts ready total
+  while (( i < 300 )); do
+    # One kubectl call per poll. awk always prints exactly two integers,
+    # unlike "grep -c ... || echo 0", which emits "0" twice when nothing
+    # matches and breaks the arithmetic test below.
+    counts=$(kubectl_cmd get pods -n "$DR_NAMESPACE" --no-headers 2>/dev/null \
+      | awk '$3 != "Completed" {
+               total++
+               split($2, r, "/")
+               if ($3 == "Running" && r[1] == r[2]) ready++
+             }
+             END { printf "%d %d\n", ready, total }') || counts="0 0"
+    ready=${counts%% *}
+    total=${counts##* }
+    log "Pod 狀態: ${ready}/${total} Ready"
+    if (( total > 0 && ready == total )); then
+      log "✅ 所有 Pod Ready"
+      return 0
+    fi
+    sleep 10
+    i=$(( i + 10 ))
+  done
+  log "⚠️ Pod 等待 timeout,繼續驗證"
+  return 0
+}
+
+# ── Step 5: verify API health ────────────────────────────────────────────────
+# Returns 0 when /api/v1/health answers 200, 1 on any other answer, and 2
+# when no API pod IP was found (check skipped).
+verify_api_health() {
+  log "🩺 驗證 DR namespace API health..."
+  local api_ip
+  api_ip=$(kubectl_cmd get pods -n "$DR_NAMESPACE" \
+    -l app=awoooi-api \
+    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || echo "")
+
+  if [[ -z "$api_ip" ]]; then
+    log "⚠️ 找不到 API Pod IP,跳過 health check"
+    return 2
+  fi
+
+  # On a transport error curl itself prints "000" for %{http_code}.
+  local http_code
+  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
+    --connect-timeout 10 --max-time 30 \
+    "http://${api_ip}:8000/api/v1/health" 2>/dev/null) || http_code="000"
+
+  if [[ "$http_code" == "200" ]]; then
+    log "✅ API health: 200 OK (${api_ip})"
+    return 0
+  fi
+  log "❌ API health: ${http_code} (${api_ip})"
+  return 1
+}
+
+# ── Step 6: remove the DR test resources ─────────────────────────────────────
+# Best effort: deletion errors are ignored so that the report is still sent.
+cleanup_all() {
+  log "🗑️ 清理 DR test 資源..."
+  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || true
+  kubectl_cmd delete restore "$RESTORE_NAME" -n velero 2>/dev/null || true
+  log "✅ 清理完成"
+}
+
+# ── Main ─────────────────────────────────────────────────────────────────────
+# Exits 1 when no usable backup exists or when the drill fails.
+main() {
+  log "=== DR Drill 開始 (${TIMESTAMP}) ==="
+
+  local backup
+  if [[ "$DRY_RUN" == "--dry-run" ]]; then
+    log "🔍 DRY RUN 模式 — 只檢查 backup,不執行還原"
+    backup=$(find_latest_backup) || {
+      notify_telegram "❌ DR Drill 失敗: 找不到有效 backup"
+      exit 1
+    }
+    log "✅ 最新 backup: ${backup}"
+    notify_telegram "🔍 DR Drill DRY RUN
+├ 最新 backup: ${backup}
+└ 狀態: Completed ✅ (未執行還原)"
+    return 0
+  fi
+
+  backup=$(find_latest_backup) || {
+    notify_telegram "❌ DR Drill 失敗
+└ 找不到有效 Velero backup"
+    exit 1
+  }
+  log "📦 使用 backup: ${backup}"
+
+  cleanup_dr_namespace
+
+  # pod_status feeds the "Restore" line of the report.
+  local pod_status="❌"
+  local health_status="⚠️"
+
+  if run_restore "$backup"; then
+    pod_status="✅"
+    wait_pods_ready
+    local health_rc=0
+    verify_api_health || health_rc=$?
+    case "$health_rc" in
+      0) health_status="✅" ;;
+      2) health_status="⚠️" ;;  # skipped: no API pod found
+      *) health_status="❌" ;;
+    esac
+  fi
+
+  cleanup_all
+
+  local duration
+  duration=$(elapsed)
+  local minutes=$(( duration / 60 ))
+  local seconds=$(( duration % 60 ))
+
+  local overall="✅ PASS"
+  if [[ "$health_status" == "❌" || "$pod_status" == "❌" ]]; then
+    overall="❌ FAIL"
+  fi
+
+  log "=== DR Drill 完成: ${overall} (${minutes}m${seconds}s) ==="
+
+  notify_telegram "${overall} DR Drill 月度演練
+├ 備份: ${backup}
+├ Restore: ${pod_status}
+├ API Health: ${health_status}
+├ 耗時: ${minutes}m${seconds}s
+└ 時間: $(date '+%Y-%m-%d %H:%M %z')"
+
+  [[ "$overall" == *"FAIL"* ]] && exit 1
+  return 0
+}
+
+main "$@"