feat(ops): dr-drill.sh — 每月 DR Drill 自動演練

每月第一個週日 03:00 (121 cron) 執行:
1. 找最新 Velero backup (Completed)
2. 還原到 awoooi-dr-test namespace
3. 等待 Pod Ready + API health 驗證
4. 清理 dr-test namespace + restore 資源
5. Telegram 通知 PASS/FAIL + 耗時

支援 --dry-run 模式 (只檢查 backup,不還原)。
dry-run 驗證通過: daily-awoooi-prod-20260409020003

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-09 10:42:12 +08:00
parent 770667eed4
commit 5ead01abf7

246
scripts/ops/dr-drill.sh Normal file
View File

@@ -0,0 +1,246 @@
#!/usr/bin/env bash
# scripts/ops/dr-drill.sh
# Sprint 5.2: monthly DR drill — restore the latest Velero backup into a test namespace, verify it, then clean up
#
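# Deploy (host 121): first Sunday of every month at 03:00.
# NOTE: cron ORs day-of-month with day-of-week when both are restricted, so
# "0 3 1-7 * 0" fires on every day 1-7 AND on every Sunday. Restrict the
# weekday inside the command instead:
#   0 3 1-7 * * [ "$(date +\%u)" = 7 ] && bash dr-drill.sh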
# 用法: bash dr-drill.sh [--dry-run]
#
# 流程:
# 1. 找最新 Velero backup
# 2. 還原到 awoooi-dr-test namespace
# 3. Wait for pods to become Ready (up to 5 minutes; the restore itself may take up to 10)
# 4. 驗證 API health endpoint
# 5. 清理 dr-test namespace + restore 資源
# 6. Telegram 通知結果
#
# 2026-04-09 Claude Sonnet 4.6 Asia/Taipei
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
# KUBECONFIG, RESTORE_TIMEOUT and SECRETS_FILE may be overridden via the environment.
KUBECONFIG="${KUBECONFIG:-/etc/rancher/k3s/k3s.yaml}"
DR_NAMESPACE="awoooi-dr-test"
RESTORE_TIMEOUT="${RESTORE_TIMEOUT:-600}"  # seconds (10 minutes)
SECRETS_FILE="${SECRETS_FILE:-/home/wooo/awoooi-ops-secrets/secrets.env}"
DRY_RUN="${1:-}"  # first CLI argument; "--dry-run" selects the check-only mode

# Load the optional secrets file (presumably provides TELEGRAM_BOT_TOKEN and
# TELEGRAM_CHAT_ID, which notify_telegram reads — confirm against the file).
# shellcheck source=/dev/null
if [[ -f "$SECRETS_FILE" ]]; then
  source "$SECRETS_FILE"
fi

TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
# Kubernetes object names must be lowercase DNS-1123 subdomains (alphanumerics,
# '-' and '.'), so the '_' of TIMESTAMP is replaced; otherwise the API server
# rejects the Restore object and every real drill fails.
RESTORE_NAME="dr-drill-${TIMESTAMP//_/-}"
START_TIME=$(date +%s)  # epoch seconds, used by elapsed()
# Write one log line to stdout: "[<local date time utc-offset>] <all arguments>".
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}
# Send a message to Telegram (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read; if either is unset or
#            empty the function does nothing)
# Arguments: $1 - message text (Telegram HTML markup, may span several lines)
# Returns:   always 0; a delivery failure only produces a warning on stderr
notify_telegram() {
  local msg="$1"

  if [[ -z "${TELEGRAM_BOT_TOKEN:-}" || -z "${TELEGRAM_CHAT_ID:-}" ]]; then
    return 0
  fi

  # Form-encoded fields instead of a hand-built JSON string: the messages
  # contain newlines (and may contain quotes), which are not valid inside a
  # JSON string literal unless escaped. curl does the encoding here.
  # Timeouts keep a hung connection from stalling the drill; --fail turns
  # HTTP error responses into a non-zero exit status.
  if ! curl -s --fail -X POST \
    --connect-timeout 10 --max-time 30 \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${msg}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1; then
    printf 'warning: Telegram notification could not be delivered\n' >&2
  fi
  return 0
}
# Run kubectl through sudo against the cluster selected by $KUBECONFIG.
# Arguments: forwarded to kubectl unchanged (stdin is passed through as well).
# Returns:   the exit status of the sudo/kubectl invocation
kubectl_cmd() {
  local -a argv=(kubectl "--kubeconfig=${KUBECONFIG}" "$@")
  sudo "${argv[@]}"
}
# Print the number of whole seconds that have passed since START_TIME
# (epoch seconds) to stdout.
elapsed() {
  printf '%d\n' "$(( $(date +%s) - START_TIME ))"
}
# ── Step 1: find the latest Velero backup ────────────────────────────────────
# Looks up the most recently created Backup object in the "velero" namespace
# and checks that its phase is "Completed".
# Outputs:  the backup name on stdout (success only); diagnostics on stderr
# Returns:  0 when a Completed backup was found, 1 otherwise
find_latest_backup() {
  local latest phase

  # A kubectl error (including an empty backup list) leaves $latest empty.
  latest=$(kubectl_cmd get backups -n velero \
    --sort-by='.metadata.creationTimestamp' \
    -o jsonpath='{.items[-1].metadata.name}' 2>/dev/null) || true
  if [[ -z "$latest" ]]; then
    # Callers capture stdout via $(...), so the reason must go to stderr or it
    # would be swallowed into the "backup name" and never shown.
    log "❌ 找不到任何 Velero backup" >&2
    return 1
  fi

  phase=$(kubectl_cmd get backup "$latest" -n velero \
    -o jsonpath='{.status.phase}' 2>/dev/null) || true
  if [[ "$phase" != "Completed" ]]; then
    log "❌ 最新 backup ${latest} 狀態 ${phase:-unknown},非 Completed" >&2
    return 1
  fi

  printf '%s\n' "$latest"
}
# ── Step 2: remove a leftover dr-test namespace (if present) ─────────────────
# Deletes $DR_NAMESPACE when it exists and polls (every 5 s, at most 30 times)
# until it is gone.
# Globals:  DR_NAMESPACE (read)
# Returns:  always 0; if the namespace still exists after the wait a warning
#           is logged instead of the success message
cleanup_dr_namespace() {
  if ! kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; then
    return 0
  fi

  log "🗑️ 清理舊的 ${DR_NAMESPACE} namespace..."
  # Best effort: the poll below decides whether the deletion actually finished.
  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || true

  local attempts=0
  while kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; do
    if (( attempts >= 30 )); then
      log "⚠️ ${DR_NAMESPACE} namespace 仍未刪除 (已等待 $(( attempts * 5 ))s),後續 restore 可能失敗"
      return 0
    fi
    sleep 5
    # Not ((attempts++)): that expression evaluates to 0 on the first pass,
    # returns status 1 and would abort the whole script under `set -e`.
    attempts=$(( attempts + 1 ))
  done
  log "✅ 舊 namespace 已清理"
}
# ── Step 3: run the Velero restore ───────────────────────────────────────────
# Creates a Velero Restore object that maps namespace awoooi-prod onto
# $DR_NAMESPACE (without PVs) and polls its phase every 10 s.
# Globals:   RESTORE_NAME, DR_NAMESPACE, RESTORE_TIMEOUT (read)
# Arguments: $1 - name of the Velero backup to restore from
# Returns:   0 when the restore reaches "Completed"; 1 when the object cannot
#            be created, the restore ends in a failure phase, or the timeout
#            (RESTORE_TIMEOUT seconds) expires
run_restore() {
  local backup_name="$1"
  log "🔄 開始還原: backup=${backup_name} → namespace=${DR_NAMESPACE}"

  # Fail fast when the API server rejects the manifest; otherwise the loop
  # below would poll a non-existent object until the timeout.
  if ! kubectl_cmd create -f - <<EOF
apiVersion: velero.io/v1
kind: Restore
metadata:
  name: ${RESTORE_NAME}
  namespace: velero
spec:
  backupName: ${backup_name}
  includedNamespaces:
    - awoooi-prod
  namespaceMapping:
    awoooi-prod: ${DR_NAMESPACE}
  restorePVs: false
  existingResourcePolicy: none
EOF
  then
    log "❌ 無法建立 Restore 資源 ${RESTORE_NAME}"
    return 1
  fi

  log "⏳ 等待 restore 完成timeout ${RESTORE_TIMEOUT}s..."
  local waited=0
  local phase=""
  while (( waited < RESTORE_TIMEOUT )); do
    phase=$(kubectl_cmd get restore "$RESTORE_NAME" -n velero \
      -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
    case "$phase" in
      Completed)
        log "✅ Restore Completed"
        return 0
        ;;
      # FailedValidation is terminal as well: Velero never starts such a restore.
      Failed | PartiallyFailed | FailedValidation)
        log "❌ Restore 失敗: ${phase}"
        return 1
        ;;
    esac
    sleep 10
    waited=$(( waited + 10 ))
    log "⏳ Restore 狀態: ${phase:-Pending} (${waited}s)"
  done

  log "❌ Restore timeout"
  return 1
}
# ── Step 4: wait for the restored pods to become Ready ───────────────────────
# Polls the pods of $DR_NAMESPACE every 10 s until every pod is Running with
# all of its containers ready (READY column "n/n"). Pods in status Completed
# are ignored because they can never become Ready.
# Globals:  DR_NAMESPACE (read); POD_READY_TIMEOUT (read, optional, seconds,
#           default 300)
# Returns:  always 0 — on timeout only a warning is logged, the caller's
#           health check decides the outcome
wait_pods_ready() {
  local timeout="${POD_READY_TIMEOUT:-300}"
  local waited=0
  local ready total

  log "⏳ 等待 ${DR_NAMESPACE} Pod Ready..."
  while (( waited < timeout )); do
    # One kubectl snapshot feeds both counters. awk always prints two
    # integers, also when kubectl fails or lists nothing. (The previous
    # `grep -c ... || echo "0"` produced "0<newline>0" when nothing matched.)
    # Relies on kubectl's default column order: NAME READY STATUS ...
    read -r ready total < <(
      kubectl_cmd get pods -n "$DR_NAMESPACE" --no-headers 2>/dev/null \
        | awk '
            $3 == "Completed" { next }
            {
              total++
              split($2, c, "/")
              if ($3 == "Running" && c[1] == c[2]) ready++
            }
            END { printf "%d %d\n", ready, total }'
    ) || true
    ready="${ready:-0}"
    total="${total:-0}"

    log "Pod 狀態: ${ready}/${total} Running"
    if (( total > 0 && ready == total )); then
      log "✅ 所有 Pod Ready"
      return 0
    fi
    sleep 10
    waited=$(( waited + 10 ))
  done

  log "⚠️ Pod 等待 timeout繼續驗證"
  return 0 # non-blocking: let the health check decide
}
# ── Step 5: verify the API health endpoint ───────────────────────────────────
# Looks up the IP of the first pod labelled app=awoooi-api in $DR_NAMESPACE and
# requests http://<ip>:8000/api/v1/health.
# Globals:  DR_NAMESPACE (read)
# Returns:  0 when the endpoint answers 200, or when no API pod IP is found
#           (check skipped); 1 for any other HTTP status or a connection error
verify_api_health() {
  log "🩺 驗證 DR namespace API health..."

  # Locate the API pod IP.
  local api_ip
  api_ip=$(kubectl_cmd get pods -n "$DR_NAMESPACE" \
    -l app=awoooi-api \
    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || echo "")
  if [[ -z "$api_ip" ]]; then
    # NOTE(review): skipping returns success, so the caller reports the health
    # check as passed even though no API pod was restored — confirm that this
    # leniency is intended for a DR drill.
    log "⚠️ 找不到 API Pod IP跳過 health check"
    return 0
  fi

  # curl's -w already prints "000" when no HTTP response was received, so no
  # extra fallback text is appended on failure (that used to yield "0000").
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 10 --max-time 30 \
    "http://${api_ip}:8000/api/v1/health" 2>/dev/null) || true
  http_code="${http_code:-000}"

  if [[ "$http_code" == "200" ]]; then
    log "✅ API health: 200 OK (${api_ip})"
    return 0
  fi
  log "❌ API health: ${http_code} (${api_ip})"
  return 1
}
# ── Step 6: clean up the dr-test resources ───────────────────────────────────
# Deletes the $DR_NAMESPACE namespace and the Velero Restore object named
# $RESTORE_NAME. Both deletions are best effort: their errors are hidden and
# ignored, and the completion message is logged regardless.
# Globals:  DR_NAMESPACE, RESTORE_NAME (read)
# Returns:  the status of the final log call
cleanup_all() {
  local -a drop_namespace=(delete namespace "$DR_NAMESPACE" --timeout=120s)
  local -a drop_restore=(delete restore "$RESTORE_NAME" -n velero)

  log "🗑️ 清理 DR test 資源..."
  kubectl_cmd "${drop_namespace[@]}" 2>/dev/null || :
  kubectl_cmd "${drop_restore[@]}" 2>/dev/null || :
  log "✅ 清理完成"
}
# ── Main ──────────────────────────────────────────────────────────────────────
# Orchestrates the drill: pick the latest backup, restore it into the test
# namespace, wait for pods, check API health, clean up, and report the result
# via log and Telegram.
# Globals:  DRY_RUN (read; "" = full drill, "--dry-run" = only check that a
#           Completed backup exists), TIMESTAMP (read)
# Exits:    2 on an unknown argument; 1 when no valid backup exists or the
#           drill fails; returns 0 on PASS or a successful dry run
main() {
  log "=== DR Drill 開始 (${TIMESTAMP}) ==="

  # Reject anything but "--dry-run": a mistyped flag must not silently start a
  # real restore.
  local mode="${DRY_RUN:-}"
  if [[ -n "$mode" && "$mode" != "--dry-run" ]]; then
    log "❌ 未知參數: ${mode} (用法: bash dr-drill.sh [--dry-run])"
    exit 2
  fi

  local backup
  if [[ "$mode" == "--dry-run" ]]; then
    log "🔍 DRY RUN 模式 — 只檢查 backup不執行還原"
    backup=$(find_latest_backup) || {
      notify_telegram "❌ DR Drill 失敗: 找不到有效 backup"
      exit 1
    }
    log "✅ 最新 backup: ${backup}"
    notify_telegram "🔍 <b>DR Drill DRY RUN</b>
├ 最新 backup: ${backup}
└ 狀態: Completed ✅ (未執行還原)"
    return 0
  fi

  backup=$(find_latest_backup) || {
    notify_telegram "❌ <b>DR Drill 失敗</b>
└ 找不到有效 Velero backup"
    exit 1
  }
  log "📦 使用 backup: ${backup}"

  cleanup_dr_namespace

  # "⚠️" stays in place for a step that was never reached.
  local pod_status="⚠️"
  local health_status="⚠️"
  if run_restore "$backup"; then
    pod_status="✅"
    wait_pods_ready
    if verify_api_health; then
      health_status="✅"
    else
      health_status="❌"
    fi
  else
    pod_status="❌"
  fi

  cleanup_all

  local duration
  duration=$(elapsed)
  local minutes=$(( duration / 60 ))
  local seconds=$(( duration % 60 ))

  local overall="✅ PASS"
  if [[ "$health_status" == "❌" || "$pod_status" == "❌" ]]; then
    overall="❌ FAIL"
  fi

  log "=== DR Drill 完成: ${overall} (${minutes}m${seconds}s) ==="
  # %z prints the host's real UTC offset instead of a hard-coded "+0800".
  notify_telegram "${overall} <b>DR Drill 月度演練</b>
├ 備份: ${backup}
├ Restore: ${pod_status}
├ API Health: ${health_status}
├ 耗時: ${minutes}m${seconds}s
└ 時間: $(date '+%Y-%m-%d %H:%M %z')"

  if [[ "$overall" == *"FAIL"* ]]; then
    exit 1
  fi
  return 0
}
# Entry point: run the drill, forwarding the command-line arguments to main
# (the run mode itself is taken from DRY_RUN, which is set from $1 above).
main "$@"