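#!/usr/bin/env bash
# scripts/ops/dr-drill.sh
# Sprint 5.2: Monthly DR drill — restore the latest Velero backup into a test
# namespace, verify it, then clean up.
#
# Deployment: cron `0 3 1-7 * 0` on 121 (intended: first Sunday of the month, 03:00).
#   NOTE: when both day-of-month and day-of-week are restricted, cron runs the
#   job when EITHER field matches, so this entry fires on days 1-7 AND on every
#   Sunday. For "first Sunday only" use `0 3 1-7 * *` plus a
#   `[ "$(date +%u)" = 7 ]` guard in the command.
# Usage: bash dr-drill.sh [--dry-run]
#
# Flow:
#   1. Find the latest Velero backup
#   2. Restore it into the awoooi-dr-test namespace
#      (waits up to RESTORE_TIMEOUT seconds, default 600 = 10 minutes)
#   3. Wait for the pods to be Running (up to 300 seconds)
#   4. Verify the API health endpoint
#   5. Clean up the dr-test namespace and the Restore resource
#   6. Send the result to Telegram
#
# 2026-04-09 Claude Sonnet 4.6 Asia/Taipei
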
set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
# KUBECONFIG, RESTORE_TIMEOUT and SECRETS_FILE keep a value supplied by the
# environment; otherwise they fall back to the defaults below.
: "${KUBECONFIG:=/etc/rancher/k3s/k3s.yaml}"
DR_NAMESPACE="awoooi-dr-test"
: "${RESTORE_TIMEOUT:=600}" # seconds (10 minutes)
: "${SECRETS_FILE:=/home/wooo/awoooi-ops-secrets/secrets.env}"
DRY_RUN="${1:-}"

# Load the optional secrets file when it exists.
# NOTE(review): presumably this is where the TELEGRAM_* variables come from — confirm.
if [[ -f "$SECRETS_FILE" ]]; then
  # shellcheck source=/dev/null
  source "$SECRETS_FILE"
fi

# Per-run identifiers: the Restore resource name embeds the start timestamp,
# START_TIME (epoch seconds) is the reference point for elapsed().
TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
RESTORE_NAME="dr-drill-${TIMESTAMP}"
START_TIME=$(date +%s)
|
||
|
||
# Print the arguments as one log line on stdout, prefixed with the local
# timestamp and its UTC offset.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}
|
||
|
||
#######################################
# Send a message to Telegram (best-effort: failures never abort the drill).
# Globals:   TELEGRAM_BOT_TOKEN (read; nothing is sent when unset/empty)
#            TELEGRAM_ALERT_CHAT_ID, SRE_GROUP_CHAT_ID (read; chat id fallbacks)
# Arguments: $1 - message text (may span several lines, HTML parse mode)
# Returns:   0 always
#######################################
notify_telegram() {
  local msg="$1"
  local chat_id="${TELEGRAM_ALERT_CHAT_ID:-${SRE_GROUP_CHAT_ID:--1003711974679}}"

  if [[ -z "${TELEGRAM_BOT_TOKEN:-}" || -z "$chat_id" ]]; then
    return 0
  fi

  # Send form fields instead of a hand-built JSON string: the messages contain
  # raw newlines (and could contain quotes/backslashes), which are not valid
  # inside a JSON string literal. curl URL-encodes each value for us.
  # The timeouts keep an unreachable API from stalling the cron job.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --connect-timeout 10 --max-time 30 \
    --data-urlencode "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}
|
||
|
||
# Run kubectl through sudo against the kubeconfig named by $KUBECONFIG,
# passing every argument through unchanged.
kubectl_cmd() {
  local -a kubectl_base=(kubectl "--kubeconfig=${KUBECONFIG}")
  sudo "${kubectl_base[@]}" "$@"
}
|
||
|
||
# Print the number of whole seconds between START_TIME (epoch seconds) and now.
elapsed() {
  local now
  now=$(date +%s)
  printf '%s\n' "$(( now - START_TIME ))"
}
|
||
|
||
# ── Step 1: find the latest Velero backup ────────────────────────────────────
#######################################
# Look up the most recently created Velero backup and require it to be Completed.
# Arguments: none
# Outputs:   stdout - the backup name only (callers capture it with $(...));
#            stderr - diagnostics, so they are not swallowed by that capture
# Returns:   0 on success; 1 when no backup exists or the newest one is not
#            in phase Completed
#######################################
find_latest_backup() {
  local latest phase

  # items[-1] of the creation-time-sorted list is the newest backup. A kubectl
  # failure (including an empty list) is treated the same as "no backup".
  latest=$(kubectl_cmd get backups -n velero \
    --sort-by='.metadata.creationTimestamp' \
    -o jsonpath='{.items[-1].metadata.name}' 2>/dev/null) || latest=""
  if [[ -z "$latest" ]]; then
    log "❌ 找不到任何 Velero backup" >&2
    return 1
  fi

  phase=$(kubectl_cmd get backup "$latest" -n velero \
    -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
  if [[ "$phase" != "Completed" ]]; then
    log "❌ 最新 backup ${latest} 狀態 ${phase},非 Completed" >&2
    return 1
  fi

  printf '%s\n' "$latest"
}
|
||
|
||
# ── Step 2: remove a leftover dr-test namespace (if present) ─────────────────
#######################################
# Delete a DR test namespace left behind by an earlier run and wait for it to
# disappear (polls every 5 seconds, at most 30 times).
# Globals:   DR_NAMESPACE (read)
# Arguments: none
# Returns:   0 always (the cleanup is best-effort)
#######################################
cleanup_dr_namespace() {
  if ! kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; then
    return 0
  fi

  log "🗑️ 清理舊的 ${DR_NAMESPACE} namespace..."
  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || true

  # Wait for the deletion to finish.
  local polls=0
  while kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; do
    if (( polls >= 30 )); then
      # Do not claim success when the namespace is still there.
      log "⚠️ ${DR_NAMESPACE} namespace 仍未刪除完成,繼續執行"
      return 0
    fi
    sleep 5
    # Plain assignment on purpose: `((polls++))` evaluates to 0 on the first
    # pass, returns status 1, and would abort the script under `set -e`.
    polls=$((polls + 1))
  done
  log "✅ 舊 namespace 已清理"
}
|
||
|
||
# ── Step 3: run the Velero restore ───────────────────────────────────────────
#######################################
# Create a Velero Restore that maps awoooi-prod into the DR test namespace and
# poll it every 10 seconds until it reaches a terminal phase or times out.
# Globals:   RESTORE_NAME, DR_NAMESPACE, RESTORE_TIMEOUT (read)
# Arguments: $1 - name of the Velero backup to restore from
# Returns:   0 when the restore reaches Completed; 1 when the Restore cannot be
#            created, ends in a failure phase, or the timeout expires
#######################################
run_restore() {
  local backup_name="$1"
  log "🔄 開始還原: backup=${backup_name} → namespace=${DR_NAMESPACE}"

  # Fail fast when the Restore object cannot be created; otherwise the loop
  # below would poll a non-existent resource for the whole timeout.
  if ! kubectl_cmd create -f - <<EOF
apiVersion: velero.io/v1
kind: Restore
metadata:
  name: ${RESTORE_NAME}
  namespace: velero
spec:
  backupName: ${backup_name}
  includedNamespaces:
    - awoooi-prod
  namespaceMapping:
    awoooi-prod: ${DR_NAMESPACE}
  restorePVs: false
  existingResourcePolicy: none
EOF
  then
    log "❌ 無法建立 Restore 資源: ${RESTORE_NAME}"
    return 1
  fi

  log "⏳ 等待 restore 完成(timeout ${RESTORE_TIMEOUT}s)..."
  local waited=0
  local phase=""
  while (( waited < RESTORE_TIMEOUT )); do
    phase=$(kubectl_cmd get restore "$RESTORE_NAME" -n velero \
      -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
    case "$phase" in
      Completed)
        log "✅ Restore Completed"
        return 0
        ;;
      # FailedValidation is terminal as well; waiting on it only burns the timeout.
      Failed | PartiallyFailed | FailedValidation)
        log "❌ Restore 失敗: ${phase}"
        return 1
        ;;
    esac
    sleep 10
    waited=$((waited + 10))
    log "⏳ Restore 狀態: ${phase:-Pending} (${waited}s)"
  done
  log "❌ Restore timeout"
  return 1
}
|
||
|
||
# ── Step 4: wait for the pods ────────────────────────────────────────────────
#######################################
# Poll the DR namespace every 10 seconds until every listed pod shows
# "Running", or until the timeout expires.
# Globals:   DR_NAMESPACE (read)
#            POD_READY_TIMEOUT (read, optional; seconds, default 300)
# Arguments: none
# Returns:   0 always — a timeout is not fatal, the health check decides
#######################################
wait_pods_ready() {
  log "⏳ 等待 ${DR_NAMESPACE} Pod Ready..."
  local timeout="${POD_READY_TIMEOUT:-300}"
  local waited=0
  local pods ready total
  while (( waited < timeout )); do
    # One snapshot per iteration so both counts describe the same pod list.
    pods=$(kubectl_cmd get pods -n "$DR_NAMESPACE" --no-headers 2>/dev/null) || pods=""
    # grep -c already prints 0 (and exits 1) when nothing matches, so only the
    # exit status is neutralised; appending `echo 0` would yield "0<newline>0".
    total=$(grep -c . <<<"$pods" || true)
    ready=$(grep -c "Running" <<<"$pods" || true)
    log "Pod 狀態: ${ready}/${total} Running"
    if (( total > 0 && ready == total )); then
      log "✅ 所有 Pod Ready"
      return 0
    fi
    sleep 10
    waited=$((waited + 10))
  done
  log "⚠️ Pod 等待 timeout,繼續驗證"
  return 0 # non-blocking: let the health check decide
}
|
||
|
||
# ── Step 5: verify API health ────────────────────────────────────────────────
#######################################
# Call the health endpoint of the first app=awoooi-api pod in the DR namespace.
# Globals:   DR_NAMESPACE (read)
# Arguments: none
# Returns:   0 when the endpoint answers HTTP 200; 1 when no API pod IP can be
#            found or the endpoint answers anything else / is unreachable
#######################################
verify_api_health() {
  log "🩺 驗證 DR namespace API health..."

  # Find the API pod IP.
  local api_ip
  api_ip=$(kubectl_cmd get pods -n "$DR_NAMESPACE" \
    -l app=awoooi-api \
    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null) || api_ip=""

  # A restore that produced no reachable API pod has not been verified;
  # returning success here would make the drill report a false PASS.
  if [[ -z "$api_ip" ]]; then
    log "❌ 找不到 API Pod IP,health check 失敗"
    return 1
  fi

  # curl still prints the -w value ("000") when the request fails, so keep its
  # output as is instead of appending another "0" to it.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 10 --max-time 30 \
    "http://${api_ip}:8000/api/v1/health" 2>/dev/null) || true
  if [[ -z "$http_code" ]]; then
    http_code="000"
  fi

  if [[ "$http_code" == "200" ]]; then
    log "✅ API health: 200 OK (${api_ip})"
    return 0
  fi
  log "❌ API health: ${http_code} (${api_ip})"
  return 1
}
|
||
|
||
# ── Step 6: clean up the dr-test resources ───────────────────────────────────
#######################################
# Delete the DR test namespace and this run's Restore object. Both deletions
# are best-effort: their errors are discarded and never fail the function.
# Globals:   DR_NAMESPACE, RESTORE_NAME (read)
# Returns:   0 always
#######################################
cleanup_all() {
  log "🗑️ 清理 DR test 資源..."
  if ! kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null; then
    : # ignored on purpose
  fi
  if ! kubectl_cmd delete restore "$RESTORE_NAME" -n velero 2>/dev/null; then
    : # ignored on purpose
  fi
  log "✅ 清理完成"
}
|
||
|
||
# ── Main ─────────────────────────────────────────────────────────────────────
#######################################
# Run the DR drill: find the latest backup, restore it into the test
# namespace, verify, clean up and report the result to Telegram.
# Globals:   DRY_RUN (read; "--dry-run" only checks the backup), TIMESTAMP (read)
# Arguments: ignored — the mode is taken from the DRY_RUN global
# Returns:   0 on PASS (or a successful dry run); exits 1 on FAIL or when no
#            valid backup exists; exits 2 on an unrecognised argument
#######################################
main() {
  # Refuse unknown flags: a mistyped "--dry-run" would otherwise fall through
  # and perform a real restore.
  if [[ -n "$DRY_RUN" && "$DRY_RUN" != "--dry-run" ]]; then
    log "❌ 未知參數: ${DRY_RUN}(用法: bash dr-drill.sh [--dry-run])" >&2
    exit 2
  fi

  log "=== DR Drill 開始 (${TIMESTAMP}) ==="

  local backup
  if [[ "$DRY_RUN" == "--dry-run" ]]; then
    log "🔍 DRY RUN 模式 — 只檢查 backup,不執行還原"
    backup=$(find_latest_backup) || {
      notify_telegram "❌ DR Drill 失敗: 找不到有效 backup"
      exit 1
    }
    log "✅ 最新 backup: ${backup}"
    notify_telegram "🔍 <b>DR Drill DRY RUN</b>
├ 最新 backup: ${backup}
└ 狀態: Completed ✅ (未執行還原)"
    return 0
  fi

  backup=$(find_latest_backup) || {
    notify_telegram "❌ <b>DR Drill 失敗</b>
└ 找不到有效 Velero backup"
    exit 1
  }
  log "📦 使用 backup: ${backup}"

  cleanup_dr_namespace

  # restore_status feeds the "Restore:" line of the report; health_status
  # stays at the warning sign when the health check was never reached.
  local restore_status="❌"
  local health_status="⚠️"
  if run_restore "$backup"; then
    restore_status="✅"
    # Never fatal: the health check decides the outcome.
    wait_pods_ready || true
    if verify_api_health; then
      health_status="✅"
    else
      health_status="❌"
    fi
  fi

  cleanup_all

  local duration
  duration=$(elapsed)
  local minutes=$(( duration / 60 ))
  local seconds=$(( duration % 60 ))

  local overall="✅ PASS"
  if [[ "$health_status" == "❌" || "$restore_status" == "❌" ]]; then
    overall="❌ FAIL"
  fi

  log "=== DR Drill 完成: ${overall} (${minutes}m${seconds}s) ==="

  # %z reports the host's real UTC offset instead of a hard-coded +0800.
  notify_telegram "${overall} <b>DR Drill 月度演練</b>
├ 備份: ${backup}
├ Restore: ${restore_status}
├ API Health: ${health_status}
├ 耗時: ${minutes}m${seconds}s
└ 時間: $(date '+%Y-%m-%d %H:%M %z')"

  if [[ "$overall" == *"FAIL"* ]]; then
    exit 1
  fi
  return 0
}
|
||
|
||
# Script entry point: invoke main, forwarding the command-line arguments.
main "$@"
|