feat(m4): ADR-074 M4 — 備份還原週排程驗證 CronJob

- scripts/cron_backup_restore_test.sh: Velero restore dry-run 腳本
- k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml: 每週日 02:00 台北執行
- k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml: 腳本 ConfigMap
- flywheel-alerts.yaml: BackupRestoreTestFailed + BackupRestoreTestStale 告警

失敗時寫入 node-exporter textfile → Prometheus 告警 → TYPE-3 Incident

2026-04-12 ogt (ADR-074 M4)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 15:36:30 +08:00
parent 3489e05c84
commit c1c96ab47b
4 changed files with 215 additions and 0 deletions

View File

@@ -0,0 +1,76 @@
# =============================================================================
# Backup-restore verification CronJob — ADR-074 M4
# =============================================================================
# Runs a Velero restore dry-run every Sunday at 02:00 Asia/Taipei
# (= Saturday 18:00 UTC).
# Failure -> node-exporter textfile metric -> BackupRestoreTestFailed alert.
#
# Prerequisites:
#   - node-exporter must mount /var/lib/node_exporter/textfile_collector
#   - the Velero CLI must be available in the velero/velero image
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================
apiVersion: batch/v1
kind: CronJob
metadata:
  name: backup-restore-test
  namespace: velero
  labels:
    app: awoooi
    component: backup-restore-test
    adr: "074-m4"
spec:
  # The cron expression is evaluated in `timeZone`, so it is written in Taipei
  # local time: Sunday (0) at 02:00. The previous "0 18 * * 6" was the UTC
  # equivalent and, combined with timeZone, fired on Saturday 18:00 Taipei.
  schedule: "0 2 * * 0"
  timeZone: "Asia/Taipei"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  startingDeadlineSeconds: 600
  jobTemplate:
    spec:
      # No retries: a failed run must surface as a failed Job.
      backoffLimit: 0
      activeDeadlineSeconds: 600
      template:
        metadata:
          labels:
            app: awoooi
            component: backup-restore-test
        spec:
          restartPolicy: Never
          serviceAccountName: velero
          containers:
            - name: backup-restore-test
              # NOTE(review): confirm this image actually ships /bin/sh and the
              # basic utilities the script calls (date, mkdir, mv) — TODO verify.
              image: velero/velero:v1.13.0
              command:
                - /bin/sh
                - /scripts/cron_backup_restore_test.sh
              env:
                - name: TZ
                  value: "Asia/Taipei"
                - name: VELERO_NAMESPACE
                  value: "velero"
                - name: VELERO_BACKUP_NAME
                  value: "awoooi-daily"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
                  readOnly: true
                - name: textfile-collector
                  mountPath: /var/lib/node_exporter/textfile_collector
              resources:
                requests:
                  cpu: "50m"
                  memory: "64Mi"
                limits:
                  cpu: "200m"
                  memory: "128Mi"
          volumes:
            # Script source: ConfigMap backup-restore-test-scripts.
            - name: scripts
              configMap:
                name: backup-restore-test-scripts
                defaultMode: 0755
            # Host directory shared with node-exporter's textfile collector.
            - name: textfile-collector
              hostPath:
                path: /var/lib/node_exporter/textfile_collector
                type: DirectoryOrCreate

View File

@@ -0,0 +1,49 @@
# =============================================================================
# ConfigMap: backup-restore-test-scripts — ADR-074 M4
# =============================================================================
# Provides the backup-restore verification script that is mounted into the
# CronJob Pod.
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: backup-restore-test-scripts
  namespace: velero
  labels:
    app: awoooi
    component: backup-restore-test
    adr: "074-m4"
data:
  cron_backup_restore_test.sh: |
    #!/bin/sh
    # Runs a Velero restore dry-run and records the outcome as Prometheus
    # textfile metrics. Exits 0 on success, 1 on failure.
    set -e

    TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
    NAMESPACE="${VELERO_NAMESPACE:-velero}"
    BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
    EXIT_CODE=0

    echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
    echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"

    # `|| EXIT_CODE=$?` keeps `set -e` from aborting before the metric is written.
    # NOTE(review): confirm the installed Velero CLI accepts --dry-run on
    # `restore create` — TODO verify.
    velero restore create \
      --from-backup "${BACKUP_NAME}" \
      --namespace-mappings "${NAMESPACE}:restore-test-dry" \
      --dry-run \
      --wait \
      2>&1 || EXIT_CODE=$?

    if [ "${EXIT_CODE}" -eq 0 ]; then
      SUCCESS=1
      echo "backup restore dry-run OK"
    else
      SUCCESS=0
      echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
    fi

    # Samples are written WITHOUT a trailing timestamp: the node-exporter
    # textfile collector does not support client-side timestamps. This also
    # removes the GNU-only `date +%s%3N` call.
    NOW=$(date +%s)

    # Write to a temp name (not matching *.prom) and rename, so node-exporter
    # never reads a half-written file.
    TMP_FILE="${TEXTFILE}.$$"
    mkdir -p "$(dirname "${TEXTFILE}")"
    {
      echo '# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded'
      echo '# TYPE awoooi_backup_restore_test_success gauge'
      echo "awoooi_backup_restore_test_success ${SUCCESS}"
      echo '# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run'
      echo '# TYPE awoooi_backup_restore_test_timestamp_seconds gauge'
      echo "awoooi_backup_restore_test_timestamp_seconds ${NOW}"
    } > "${TMP_FILE}"
    mv "${TMP_FILE}" "${TEXTFILE}"

    if [ "${SUCCESS}" -eq 1 ]; then
      exit 0
    fi
    exit 1

View File

@@ -87,6 +87,36 @@ spec:
summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。"
- name: awoooi_backup_restore
  interval: 1h
  rules:
    # P0: the backup-restore dry-run reported a failure (success gauge is 0).
    - alert: BackupRestoreTestFailed
      for: 5m
      expr: awoooi_backup_restore_test_success == 0
      annotations:
        summary: '備份還原 dry-run 測試失敗'
        description: 'Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。'
        runbook: '執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run'
      labels:
        severity: critical
        alert_category: infrastructure
        notification_type: TYPE-3

    # P1: the last recorded test run is more than 8 days old
    # (691200 s = 8 * 86400 s), i.e. the weekly schedule has stopped running.
    - alert: BackupRestoreTestStale
      for: 10m
      expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
      annotations:
        summary: '備份還原測試超過 8 天未執行'
        description: '上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。'
        runbook: '檢查 velero namespace 中的 backup-restore-test CronJob 狀態'
      labels:
        severity: warning
        alert_category: infrastructure
        notification_type: TYPE-3
- name: awoooi_host_connectivity
interval: 60s
rules:

View File

@@ -0,0 +1,60 @@
#!/bin/sh
# =============================================================================
# Backup-restore verification script — ADR-074 M4
# =============================================================================
# Executed every Sunday at 02:00 Taipei time: runs a Velero restore dry-run to
# verify that the backup can be read.
# The outcome is written as Prometheus textfile metrics; a failure triggers the
# BackupRestoreTestFailed alert.
#
# Textfile path: /var/lib/node_exporter/textfile_collector/backup_restore_test.prom
# (scanned by node-exporter via --collector.textfile.directory)
#
# Environment:
#   VELERO_NAMESPACE    namespace used in the mapping (default: velero)
#   VELERO_BACKUP_NAME  backup to restore from (default: awoooi-daily)
#
# Exit status: 0 when the dry-run succeeded, 1 otherwise.
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================
set -e

TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
NAMESPACE="${VELERO_NAMESPACE:-velero}"
BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
EXIT_CODE=0

echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"

# --- Velero restore dry-run ---
# `|| EXIT_CODE=$?` keeps `set -e` from aborting before the metric is written.
# NOTE(review): confirm the installed Velero CLI accepts --dry-run on
# `restore create` — TODO verify.
velero restore create \
  --from-backup "${BACKUP_NAME}" \
  --namespace-mappings "${NAMESPACE}:restore-test-dry" \
  --dry-run \
  --wait \
  2>&1 || EXIT_CODE=$?

if [ "${EXIT_CODE}" -eq 0 ]; then
  SUCCESS=1
  RESULT="success"
  echo "backup restore dry-run OK"
else
  SUCCESS=0
  RESULT="failure"
  echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
fi

# --- Write textfile metrics ---
# Samples are written WITHOUT a trailing timestamp: the node-exporter textfile
# collector does not support client-side timestamps. This also removes the
# GNU-only `date +%s%3N` call.
NOW=$(date +%s)

# Write to a temp name (not matching *.prom) and rename, so node-exporter never
# reads a half-written file.
TMP_FILE="${TEXTFILE}.$$"
mkdir -p "$(dirname "${TEXTFILE}")"
cat > "${TMP_FILE}" <<PROM
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
# TYPE awoooi_backup_restore_test_success gauge
awoooi_backup_restore_test_success ${SUCCESS}
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
awoooi_backup_restore_test_timestamp_seconds ${NOW}
PROM
mv "${TMP_FILE}" "${TEXTFILE}"
echo "Textfile written: ${RESULT}"

if [ "${SUCCESS}" -eq 1 ]; then
  exit 0
fi
exit 1