- scripts/cron_backup_restore_test.sh: Velero restore dry-run 腳本
- k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml: 每週日 02:00 台北執行
- k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml: 腳本 ConfigMap
- flywheel-alerts.yaml: BackupRestoreTestFailed + BackupRestoreTestStale 告警

失敗時寫入 node-exporter textfile → Prometheus 告警 → TYPE-3 Incident

2026-04-12 ogt (ADR-074 M4)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
61 lines
2.3 KiB
Bash
Executable File
61 lines
2.3 KiB
Bash
Executable File
#!/bin/sh
# =============================================================================
# Backup restore verification CronJob — ADR-074 M4
# =============================================================================
# Runs a Velero restore dry-run every Sunday at 02:00 Taipei time to verify
# that the PVC snapshots are readable.
# The outcome is written as Prometheus textfile metrics; a failed run sets
# awoooi_backup_restore_test_success to 0, which triggers the
# BackupRestoreTestFailed alert.
#
# Textfile path: /var/lib/node_exporter/textfile_collector/backup_restore_test.prom
#   (scanned by node-exporter via --collector.textfile.directory)
#
# Environment:
#   VELERO_NAMESPACE    source namespace of the namespace mapping (default: velero)
#   VELERO_BACKUP_NAME  backup to restore from (default: awoooi-daily)
#
# Exit status: 0 when the dry-run succeeded, 1 otherwise.
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================

set -eu

TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
# Staging file in the same directory so the final mv is a rename on the same
# filesystem. It does not end in ".prom", so the collector never picks it up.
# A plain redirect (not mktemp) keeps the umask-derived permissions that the
# in-place write used to produce.
TEXTFILE_TMP="${TEXTFILE}.$$"
NAMESPACE="${VELERO_NAMESPACE:-velero}"
BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
EXIT_CODE=0

# Remove a leftover staging file on every exit path; a no-op after the mv.
trap 'rm -f -- "${TEXTFILE_TMP}"' EXIT

echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"

# --- Velero restore dry-run ---
# "|| EXIT_CODE=$?" records the failure without tripping set -e, so the
# failure metric below still gets written.
# NOTE(review): confirm that the installed Velero CLI accepts --dry-run on
# "restore create"; an unknown flag would make every run report failure.
velero restore create \
  --from-backup "${BACKUP_NAME}" \
  --namespace-mappings "${NAMESPACE}:restore-test-dry" \
  --dry-run \
  --wait \
  2>&1 || EXIT_CODE=$?

if [ "${EXIT_CODE}" -eq 0 ]; then
  SUCCESS=1
  echo "backup restore dry-run OK"
else
  SUCCESS=0
  echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
fi

# --- Write textfile metrics ---
NOW=$(date +%s)
mkdir -p "$(dirname "${TEXTFILE}")"

# Samples deliberately carry no trailing timestamp: the node-exporter textfile
# collector does not support client-side timestamps and skips files that
# contain them. The time of the run is exposed as its own gauge instead.
cat > "${TEXTFILE_TMP}" <<PROM
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
# TYPE awoooi_backup_restore_test_success gauge
awoooi_backup_restore_test_success ${SUCCESS}
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
awoooi_backup_restore_test_timestamp_seconds ${NOW}
PROM

# Publish with a rename so a concurrent scrape never reads a partial file.
mv -f -- "${TEXTFILE_TMP}" "${TEXTFILE}"

if [ "${SUCCESS}" -eq 1 ]; then
  echo "Textfile written: success"
  exit 0
fi

echo "Textfile written: failure"
exit 1