Files
awoooi/scripts/ops/dr-drill.sh
Your Name ee2cc2bfc3
Some checks failed
CD Pipeline / tests (push) Failing after 1m23s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 15s
fix(alerts): 收斂 Telegram 告警到 SRE 戰情室
2026-06-12 11:06:16 +08:00

277 lines
9.2 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# scripts/ops/dr-drill.sh
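# Sprint 5.2: Monthly DR drill — restore the latest Velero backup into a test
# namespace, verify it, then clean up.
#
# Deployment: cron 0 3 1-7 * 0 (intended: first Sunday of each month, 03:00) on 121
#   NOTE(review): standard cron ORs the day-of-month and day-of-week fields when
#   both are restricted, so this expression would fire on days 1-7 AND on every
#   Sunday. Confirm the installed crontab guards on the weekday.
# Usage: bash dr-drill.sh [--dry-run]
#
# Flow:
# 1. Find the latest Velero backup
# 2. Restore it into the awoooi-dr-test namespace
# 3. Wait for Pods to become Ready (up to 10 minutes)
# 4. Verify the API health endpoint
# 5. Clean up the dr-test namespace and the Restore resource
# 6. Report the result via Telegram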
#
# 2026-04-09 Claude Sonnet 4.6 Asia/Taipei
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Drill parameters. KUBECONFIG, RESTORE_TIMEOUT and SECRETS_FILE may be
# overridden from the environment; the rest are fixed for this script.
: "${KUBECONFIG:=/etc/rancher/k3s/k3s.yaml}"
DR_NAMESPACE="awoooi-dr-test"
: "${RESTORE_TIMEOUT:=600}" # seconds (10 minutes)
: "${SECRETS_FILE:=/home/wooo/awoooi-ops-secrets/secrets.env}"
DRY_RUN="${1:-}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Load secrets (e.g. the Telegram bot token read later) when the file exists.
if [[ -f "$SECRETS_FILE" ]]; then
  source "$SECRETS_FILE"
fi

# Per-run identifiers: the Restore resource name embeds the start timestamp,
# and START_TIME (epoch seconds) is the reference point for elapsed().
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
RESTORE_NAME="dr-drill-${TIMESTAMP}"
START_TIME="$(date +%s)"
# Print a message to stdout prefixed with a local timestamp and UTC offset.
# Arguments: the message words (joined with spaces).
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}
# Hand a drill status event to the notify-awoooi-ops.sh helper that sits next
# to this script. The event fields are passed as AWOOI_OPS_* environment
# variables and the helper's stdout is discarded.
# Globals:   SCRIPT_DIR (read); elapsed() supplies the duration
# Arguments: $1 - status string, $2 - detail message
# Returns:   1 when the helper is missing or not executable, otherwise the
#            helper's own exit status
notify_awoooi_ops() {
  local -r state="$1"
  local -r detail="$2"
  local -r helper_path="${SCRIPT_DIR}/notify-awoooi-ops.sh"

  if [[ ! -x "$helper_path" ]]; then
    return 1
  fi

  AWOOI_OPS_ALERTNAME="DRDrillStatus" \
  AWOOI_OPS_JOB_NAME="DR Drill 月度演練" \
  AWOOI_OPS_SOURCE="dr-drill" \
  AWOOI_OPS_COMPONENT="disaster-recovery" \
  AWOOI_OPS_SEVERITY="info" \
  AWOOI_OPS_STATUS="$state" \
  AWOOI_OPS_SUMMARY="DR Drill ${state}" \
  AWOOI_OPS_DETAIL="$detail" \
  AWOOI_OPS_DURATION_SECONDS="$(elapsed)" \
  "$helper_path" >/dev/null
}
# Send a drill notification.
# Primary path: notify_awoooi_ops (the helper script). Only when that fails
# (helper missing or returning non-zero) is the message posted directly to the
# Telegram Bot API as a best-effort fallback.
# Globals:   SRE_GROUP_CHAT_ID (read, optional), TELEGRAM_BOT_TOKEN (read, optional)
# Arguments: $1 - message text (HTML parse mode), $2 - status, default "success"
# Returns:   always 0; delivery failures on the fallback path are ignored
notify_telegram() {
  local msg="$1"
  local status="${2:-success}"

  if notify_awoooi_ops "$status" "$msg"; then
    return 0
  fi

  local chat_id="${SRE_GROUP_CHAT_ID:--1003711974679}"
  if [[ -z "${TELEGRAM_BOT_TOKEN:-}" || -z "$chat_id" ]]; then
    return 0
  fi

  # Bounded timeouts so a stalled connection cannot hang the drill; these are
  # the same limits the health check in this script uses.
  curl -s --connect-timeout 10 --max-time 30 \
    -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -d "chat_id=${chat_id}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=${msg}" \
    >/dev/null 2>&1 || true
}
# Run kubectl through sudo against the kubeconfig named by $KUBECONFIG.
# Arguments: passed through to kubectl unchanged.
# Returns:   the exit status of the sudo/kubectl invocation.
kubectl_cmd() {
  local -a base_cmd=(sudo kubectl "--kubeconfig=${KUBECONFIG}")
  "${base_cmd[@]}" "$@"
}
# Print the number of whole seconds since START_TIME (epoch seconds).
elapsed() {
  local now
  now="$(date +%s)"
  printf '%s\n' "$(( now - START_TIME ))"
}
# ── Step 1: 找最新 Velero backup ─────────────────────────────────────────────
# Resolve the most recently created Velero backup and require that it is in
# phase "Completed".
# Outputs:  the backup name on stdout (and nothing else); diagnostics go to
#           stderr. Callers capture stdout with $(...), so anything logged on
#           stdout would be swallowed into the captured value and never shown.
# Returns:  0 on success, 1 when no backup exists or the newest one is not
#           Completed.
find_latest_backup() {
  local latest phase

  latest=$(kubectl_cmd get backups -n velero \
    --sort-by='.metadata.creationTimestamp' \
    -o jsonpath='{.items[-1].metadata.name}' 2>/dev/null) || latest=""
  if [[ -z "$latest" ]]; then
    log "❌ 找不到任何 Velero backup" >&2
    return 1
  fi

  phase=$(kubectl_cmd get backup "$latest" -n velero \
    -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
  if [[ "$phase" != "Completed" ]]; then
    log "❌ 最新 backup ${latest} 狀態 ${phase},非 Completed" >&2
    return 1
  fi

  printf '%s\n' "$latest"
}
# ── Step 2: 清理舊的 dr-test namespace若存在──────────────────────────────
# Remove a leftover DR test namespace from a previous run, if one exists, and
# wait (up to 30 polls, 5 s apart) for it to disappear.
# Globals:  DR_NAMESPACE (read)
# Returns:  0 in all cases; a namespace that is still present after the wait
#           is reported with a warning instead of a success message.
cleanup_dr_namespace() {
  if ! kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; then
    return 0
  fi

  log "🗑️ 清理舊的 ${DR_NAMESPACE} namespace..."
  # Best effort: the delete may time out while finalizers run; the poll below
  # covers that case.
  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || true

  local attempts=0
  while (( attempts < 30 )) \
    && kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; do
    sleep 5
    # Plain assignment instead of ((attempts++)): the post-increment form
    # evaluates to 0 on the first pass, returns status 1, and would abort the
    # whole script under `set -e`.
    attempts=$(( attempts + 1 ))
  done

  if kubectl_cmd get namespace "$DR_NAMESPACE" &>/dev/null; then
    log "⚠️ ${DR_NAMESPACE} namespace 仍未刪除完成"
  else
    log "✅ 舊 namespace 已清理"
  fi
}
# ── Step 3: 執行 Velero restore ──────────────────────────────────────────────
# Create a Velero Restore that maps awoooi-prod into the DR test namespace
# (without restoring PVs) and poll its phase until it finishes.
# Globals:   RESTORE_NAME, DR_NAMESPACE, RESTORE_TIMEOUT (read)
# Arguments: $1 - name of the Velero backup to restore from
# Returns:   0 when the Restore reaches Completed; 1 when the Restore cannot
#            be created, ends Failed/PartiallyFailed, or the timeout expires.
run_restore() {
  local backup_name="$1"
  local waited=0
  local phase=""

  log "🔄 開始還原: backup=${backup_name} → namespace=${DR_NAMESPACE}"

  # The caller invokes this function on the left of `||`, which disables
  # errexit inside it, so a failed create must be checked explicitly;
  # otherwise the loop below would poll a non-existent Restore until timeout.
  if ! kubectl_cmd create -f - <<EOF
apiVersion: velero.io/v1
kind: Restore
metadata:
  name: ${RESTORE_NAME}
  namespace: velero
spec:
  backupName: ${backup_name}
  includedNamespaces:
    - awoooi-prod
  namespaceMapping:
    awoooi-prod: ${DR_NAMESPACE}
  restorePVs: false
  existingResourcePolicy: none
EOF
  then
    log "❌ 建立 Restore 資源失敗: ${RESTORE_NAME}"
    return 1
  fi

  log "⏳ 等待 restore 完成timeout ${RESTORE_TIMEOUT}s..."
  while (( waited < RESTORE_TIMEOUT )); do
    # A transient lookup failure is treated as "no phase yet".
    phase=$(kubectl_cmd get restore "$RESTORE_NAME" -n velero \
      -o jsonpath='{.status.phase}' 2>/dev/null) || phase=""
    case "$phase" in
      Completed)
        log "✅ Restore Completed"
        return 0
        ;;
      Failed | PartiallyFailed)
        log "❌ Restore 失敗: ${phase}"
        return 1
        ;;
    esac
    sleep 10
    waited=$(( waited + 10 ))
    log "⏳ Restore 狀態: ${phase:-Pending} (${waited}s)"
  done

  log "❌ Restore timeout"
  return 1
}
# ── Step 4: 等待 Pod Ready ───────────────────────────────────────────────────
# Poll the DR namespace every 10 s until every listed pod line contains
# "Running", or the wait budget is used up.
# Globals:  DR_NAMESPACE (read); POD_READY_TIMEOUT (read, optional, seconds,
#           default 300)
# Returns:  always 0 - a timeout is only logged, the subsequent health check
#           decides the drill result.
wait_pods_ready() {
  log "⏳ 等待 ${DR_NAMESPACE} Pod Ready..."

  local -r max_wait="${POD_READY_TIMEOUT:-300}"
  local waited=0
  local pod_lines ready total

  while (( waited < max_wait )); do
    # One listing per poll so both counts describe the same snapshot.
    pod_lines=$(kubectl_cmd get pods -n "$DR_NAMESPACE" --no-headers \
      2>/dev/null) || pod_lines=""
    # grep -c prints "0" AND exits 1 when nothing matches; `|| true` keeps the
    # printed count without appending a second "0" to it.
    total=$(grep -c . <<<"$pod_lines") || true
    ready=$(grep -c "Running" <<<"$pod_lines") || true

    log "Pod 狀態: ${ready}/${total} Running"
    if (( total > 0 && ready == total )); then
      log "✅ 所有 Pod Ready"
      return 0
    fi

    sleep 10
    waited=$(( waited + 10 ))
  done

  log "⚠️ Pod 等待 timeout繼續驗證"
  return 0 # non-blocking: the health check decides
}
# ── Step 5: 驗證 API health ──────────────────────────────────────────────────
# Probe the health endpoint of the first pod labelled app=awoooi-api in the
# DR namespace.
# Globals:  DR_NAMESPACE (read)
# Returns:  0 when the endpoint answers HTTP 200, and also when no API pod IP
#           can be found (the check is skipped); 1 for any other HTTP
#           code or a curl failure.
# NOTE(review): a skipped check is reported as healthy by the caller - confirm
# that a restore without an API pod should really count as a pass.
verify_api_health() {
  log "🩺 驗證 DR namespace API health..."

  local api_ip
  api_ip=$(kubectl_cmd get pods -n "$DR_NAMESPACE" \
    -l app=awoooi-api \
    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null) || api_ip=""
  if [[ -z "$api_ip" ]]; then
    log "⚠️ 找不到 API Pod IP跳過 health check"
    return 0
  fi

  # On a transfer failure curl already prints "000" via -w, so the fallback
  # must not append another digit to the captured value.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 10 --max-time 30 \
    "http://${api_ip}:8000/api/v1/health" 2>/dev/null) || true
  [[ -n "$http_code" ]] || http_code="000"

  if [[ "$http_code" == "200" ]]; then
    log "✅ API health: 200 OK (${api_ip})"
    return 0
  fi
  log "❌ API health: ${http_code} (${api_ip})"
  return 1
}
# ── Step 6: 清理 dr-test 資源 ────────────────────────────────────────────────
# Tear down what the drill created: the DR test namespace and the Velero
# Restore resource. Both deletions are best effort - errors are ignored and
# their stderr is suppressed.
# Globals:  DR_NAMESPACE, RESTORE_NAME (read)
cleanup_all() {
  log "🗑️ 清理 DR test 資源..."

  kubectl_cmd delete namespace "$DR_NAMESPACE" --timeout=120s 2>/dev/null || :
  kubectl_cmd delete restore "$RESTORE_NAME" -n velero 2>/dev/null || :

  log "✅ 清理完成"
}
# ── Main ──────────────────────────────────────────────────────────────────────
# Entry point: run the drill end to end and report the outcome.
# With DRY_RUN set to "--dry-run" only the latest backup is checked; otherwise
# the backup is restored into the DR namespace, pods and API health are
# checked, the test resources are removed, and a summary notification is sent.
# Globals:  DRY_RUN, TIMESTAMP (read)
# Returns:  0 when the drill passes; exits the script with status 1 when no
#           valid backup exists, the restore fails, or the health check fails.
main() {
  log "=== DR Drill 開始 (${TIMESTAMP}) ==="

  local backup
  if [[ "$DRY_RUN" == "--dry-run" ]]; then
    log "🔍 DRY RUN 模式 — 只檢查 backup不執行還原"
    backup=$(find_latest_backup) || {
      notify_telegram "❌ DR Drill 失敗: 找不到有效 backup" "failed"
      exit 1
    }
    log "✅ 最新 backup: ${backup}"
    notify_telegram "🔍 <b>DR Drill DRY RUN</b>
├ 最新 backup: ${backup}
└ 狀態: Completed ✅ (未執行還原)" "success"
    return 0
  fi

  backup=$(find_latest_backup) || {
    notify_telegram "❌ <b>DR Drill 失敗</b>
└ 找不到有效 Velero backup" "failed"
    exit 1
  }
  log "📦 使用 backup: ${backup}"

  cleanup_dr_namespace

  # pod_status is shown as the "Restore" line of the report: it reflects
  # whether the restore step succeeded, not the outcome of the pod wait.
  local pod_status="⚠️"
  local health_status="⚠️"
  if run_restore "$backup"; then
    wait_pods_ready
    if verify_api_health; then
      health_status="✅"
    else
      health_status="❌"
    fi
    pod_status="✅"
  else
    pod_status="❌"
  fi

  cleanup_all

  local duration
  duration=$(elapsed)
  local minutes=$(( duration / 60 ))
  local seconds=$(( duration % 60 ))

  local overall="✅ PASS"
  local notify_status="success"
  if [[ "$health_status" == "❌" || "$pod_status" == "❌" ]]; then
    overall="❌ FAIL"
    notify_status="failed"
  fi
  log "=== DR Drill 完成: ${overall} (${minutes}m${seconds}s) ==="

  # %z prints the host's real UTC offset (as log() does) instead of a
  # hard-coded "+0800" that would mislabel the time on a host in another zone.
  notify_telegram "${overall} <b>DR Drill 月度演練</b>
├ 備份: ${backup}
├ Restore: ${pod_status}
├ API Health: ${health_status}
├ 耗時: ${minutes}m${seconds}s
└ 時間: $(date '+%Y-%m-%d %H:%M %z')" "$notify_status"

  if [[ "$overall" == *"FAIL"* ]]; then
    exit 1
  fi
  return 0
}
main "$@"