Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
P0-1: incident_service.py — 刪除 classify_alert_early 死碼 L131-132 P0-2: cron_backup_restore_test.sh — date +%s%3N→+%s,修正毫秒時間戳 P1-2: gitea_webhook.py — fingerprint 移除 sha_short,收斂同 branch 失敗 heartbeat: 還原原始空格對齊格式(統帥要求原本怎樣就怎樣) P1-1(積木化)/P1-3(TYPE-4)/P2-1(timeZone)/P2-2(IP)/P2-3(WS重連) 待後續處理 2026-04-12 ogt Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
63 lines
2.4 KiB
Bash
Executable File
63 lines
2.4 KiB
Bash
Executable File
#!/bin/sh
# =============================================================================
# Backup restore verification CronJob — ADR-074 M4
# =============================================================================
# Runs a Velero restore dry-run against the configured backup and records the
# outcome as Prometheus textfile metrics.
# Per the original header: scheduled for Sundays 02:00 Taipei time to verify
# that PVC snapshots are readable; a failed run is meant to fire the
# BackupRestoreTestFailed alert (schedule and alert rule live outside this file).
#
# Textfile path: /var/lib/node_exporter/textfile_collector/backup_restore_test.prom
# (scanned by node-exporter via --collector.textfile.directory)
#
# Environment:
#   VELERO_NAMESPACE    namespace used in the mapping below (default: velero)
#   VELERO_BACKUP_NAME  backup to restore from (default: awoooi-daily)
#
# Exit status: 0 when the dry-run succeeded, 1 otherwise.
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================

# -e: abort on any unhandled command failure; -u: treat unset variables as errors.
set -eu

TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
NAMESPACE="${VELERO_NAMESPACE:-velero}"
BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
EXIT_CODE=0

echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"

# --- Velero restore dry-run ---
# stderr is merged into stdout. A non-zero status is captured in EXIT_CODE
# instead of aborting under `set -e`, so the failure metric can still be written.
# NOTE(review): confirm that the installed Velero CLI accepts --dry-run on
# `restore create`, and that mapping the Velero namespace itself (rather than
# the application namespaces contained in the backup) is what is intended.
velero restore create \
  --from-backup "${BACKUP_NAME}" \
  --namespace-mappings "${NAMESPACE}:restore-test-dry" \
  --dry-run \
  --wait \
  2>&1 || EXIT_CODE=$?

# --- Prepare the textfile metric ---
# TS becomes the VALUE of the *_timestamp_seconds gauge, so it must be whole
# Unix seconds (`date +%s`, 10 digits), not milliseconds (13 digits).
# The metric lines themselves carry no per-sample timestamp.
TS=$(date +%s)
mkdir -p "$(dirname "${TEXTFILE}")"

# --- Publish the result as Prometheus textfile metrics ---

# write_metrics SUCCESS
#   SUCCESS  1 when the restore dry-run succeeded, 0 otherwise.
# Reads TEXTFILE (destination path) and TS (Unix seconds of this run).
# The metrics are rendered into a temp file in the same directory and then
# renamed over TEXTFILE, so a scrape never observes a half-written file.
# The temp name does not end in ".prom", so the collector's *.prom glob
# should skip it if a run is interrupted before the rename.
write_metrics() {
  metrics_tmp="${TEXTFILE}.$$"
  # Unquoted heredoc delimiter: $1 and ${TS} are expanded; the "# HELP" and
  # "# TYPE" lines are file content, not shell comments.
  cat > "${metrics_tmp}" <<PROM
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
# TYPE awoooi_backup_restore_test_success gauge
awoooi_backup_restore_test_success $1
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
awoooi_backup_restore_test_timestamp_seconds ${TS}
PROM
  mv -f "${metrics_tmp}" "${TEXTFILE}"
}

if [ "${EXIT_CODE}" -eq 0 ]; then
  echo "backup restore dry-run OK"
  write_metrics 1
  echo "Textfile written: success"
  exit 0
fi

# Any non-zero Velero status is reported as a failure; the script itself
# always exits 1 in that case, regardless of the original status value.
echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
write_metrics 0
echo "Textfile written: failure"
exit 1