feat(m4): ADR-074 M4 — 備份還原週排程驗證 CronJob
- scripts/cron_backup_restore_test.sh: Velero restore dry-run 腳本
- k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml: 每週日 02:00 台北執行
- k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml: 腳本 ConfigMap
- flywheel-alerts.yaml: BackupRestoreTestFailed + BackupRestoreTestStale 告警

失敗時寫入 node-exporter textfile → Prometheus 告警 → TYPE-3 Incident

2026-04-12 ogt (ADR-074 M4)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
60
scripts/cron_backup_restore_test.sh
Executable file
60
scripts/cron_backup_restore_test.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/sh
# =============================================================================
# Backup restore verification CronJob — ADR-074 M4
# =============================================================================
# Runs a Velero restore dry-run (scheduled every Sunday 02:00 Taipei time) to
# verify that the PVC snapshots are readable, then records the outcome as
# Prometheus textfile metrics. A failed run sets the success gauge to 0, which
# is what the BackupRestoreTestFailed alert is meant to fire on.
#
# Textfile path: /var/lib/node_exporter/textfile_collector/backup_restore_test.prom
#   (scanned by node-exporter via --collector.textfile.directory)
#
# Environment:
#   VELERO_NAMESPACE    namespace used in the namespace mapping (default: velero)
#   VELERO_BACKUP_NAME  backup to restore from (default: awoooi-daily)
#
# Exit status: 0 when the dry-run succeeded, 1 when it failed.
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================

set -eu

TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
# Temp file lives in the same directory so the final mv is an atomic rename.
# It does not end in ".prom", so node-exporter ignores it while it is written.
TMPFILE="${TEXTFILE}.$$"
NAMESPACE="${VELERO_NAMESPACE:-velero}"
BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
EXIT_CODE=0

# Remove a leftover temp file if the script dies between write and rename.
trap 'rm -f -- "${TMPFILE}"' EXIT

# -----------------------------------------------------------------------------
# write_metrics SUCCESS
#   Writes the result gauges to ${TEXTFILE}.
#   SUCCESS is 1 when the dry-run succeeded, 0 otherwise.
#
#   Samples are written WITHOUT a trailing timestamp: the node-exporter
#   textfile collector does not support client-side timestamps and skips any
#   file that contains them. The run time is exposed through the dedicated
#   *_timestamp_seconds gauge instead.
# -----------------------------------------------------------------------------
write_metrics() {
  wm_success="$1"
  wm_now="$(date +%s)"

  mkdir -p "$(dirname "${TEXTFILE}")"
  {
    printf '%s\n' "# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded"
    printf '%s\n' "# TYPE awoooi_backup_restore_test_success gauge"
    printf '%s\n' "awoooi_backup_restore_test_success ${wm_success}"
    printf '%s\n' "# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run"
    printf '%s\n' "# TYPE awoooi_backup_restore_test_timestamp_seconds gauge"
    printf '%s\n' "awoooi_backup_restore_test_timestamp_seconds ${wm_now}"
  } > "${TMPFILE}"
  # Publish atomically so a concurrent scrape never reads a half-written file.
  mv -f -- "${TMPFILE}" "${TEXTFILE}"
}

echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"

# --- Velero restore dry-run ---
# "|| EXIT_CODE=$?" captures the failure status without tripping `set -e`, so
# the failure metric below is still written.
# NOTE(review): confirm the installed Velero CLI accepts --dry-run on
# `restore create`; an unknown flag would make every run report failure.
# NOTE(review): NAMESPACE defaults to the Velero namespace and is used as the
# source side of --namespace-mappings — verify this is the intended namespace.
velero restore create \
  --from-backup "${BACKUP_NAME}" \
  --namespace-mappings "${NAMESPACE}:restore-test-dry" \
  --dry-run \
  --wait \
  2>&1 || EXIT_CODE=$?

# --- Write textfile metric ---
if [ "${EXIT_CODE}" -eq 0 ]; then
  echo "backup restore dry-run OK"
  write_metrics 1
  echo "Textfile written: success"
  exit 0
else
  echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
  write_metrics 0
  echo "Textfile written: failure"
  exit 1
fi
|
||||
Reference in New Issue
Block a user