feat(m4): ADR-074 M4 — 備份還原週排程驗證 CronJob
- scripts/cron_backup_restore_test.sh: Velero restore dry-run 腳本 - k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml: 每週日 02:00 台北執行 - k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml: 腳本 ConfigMap - flywheel-alerts.yaml: BackupRestoreTestFailed + BackupRestoreTestStale 告警 失敗時寫入 node-exporter textfile → Prometheus 告警 → TYPE-3 Incident 2026-04-12 ogt (ADR-074 M4) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
76
k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml
Normal file
76
k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml
Normal file
@@ -0,0 +1,76 @@
|
||||
# =============================================================================
# Backup restore verification CronJob — ADR-074 M4
# =============================================================================
# Runs a Velero restore dry-run every Sunday at 02:00 Asia/Taipei
# (Saturday 18:00 UTC).
# On failure the script writes a node-exporter textfile metric, which drives
# the BackupRestoreTestFailed alert.
#
# Prerequisites: node-exporter must mount /var/lib/node_exporter/textfile_collector
#                the Velero CLI must be available in the velero/velero image
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================

apiVersion: batch/v1
kind: CronJob
metadata:
  name: backup-restore-test
  namespace: velero
  labels:
    app: awoooi
    component: backup-restore-test
    adr: "074-m4"
spec:
  # Because timeZone is set, the cron expression is evaluated in Asia/Taipei
  # local time, so it is written as Sunday (0) at 02:00 local.
  # The earlier value "0 18 * * 6" was the UTC equivalent; combined with
  # timeZone it fired on Saturday 18:00 Taipei instead of Sunday 02:00.
  schedule: "0 2 * * 0"
  timeZone: "Asia/Taipei"
  # Never start a new run while a previous one is still active.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  # A run that cannot start within 10 minutes of its slot is skipped.
  startingDeadlineSeconds: 600
  jobTemplate:
    spec:
      # No retries: the script exits 1 on failure and that result is reported
      # as-is through the textfile metric.
      backoffLimit: 0
      activeDeadlineSeconds: 600
      template:
        metadata:
          labels:
            app: awoooi
            component: backup-restore-test
        spec:
          restartPolicy: Never
          serviceAccountName: velero
          containers:
            - name: backup-restore-test
              # NOTE(review): confirm this image provides /bin/sh and `date`;
              # the command and the mounted script depend on both.
              image: velero/velero:v1.13.0
              command:
                - /bin/sh
                - /scripts/cron_backup_restore_test.sh
              env:
                - name: TZ
                  value: "Asia/Taipei"
                # Read by the script as the source namespace of the mapping.
                - name: VELERO_NAMESPACE
                  value: "velero"
                # Read by the script as the backup to restore from.
                - name: VELERO_BACKUP_NAME
                  value: "awoooi-daily"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
                  readOnly: true
                # Writable: the script drops backup_restore_test.prom here.
                - name: textfile-collector
                  mountPath: /var/lib/node_exporter/textfile_collector
              resources:
                requests:
                  cpu: "50m"
                  memory: "64Mi"
                limits:
                  cpu: "200m"
                  memory: "128Mi"
          volumes:
            - name: scripts
              configMap:
                name: backup-restore-test-scripts
                defaultMode: 0755
            # Host directory listed in the prerequisites above; created on the
            # node if it does not exist yet.
            - name: textfile-collector
              hostPath:
                path: /var/lib/node_exporter/textfile_collector
                type: DirectoryOrCreate
|
||||
49
k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml
Normal file
49
k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml
Normal file
@@ -0,0 +1,49 @@
|
||||
# =============================================================================
# ConfigMap: backup-restore-test-scripts — ADR-074 M4
# =============================================================================
# Provides the backup restore verification script that is mounted into the
# CronJob Pod.
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================

apiVersion: v1
kind: ConfigMap
metadata:
  name: backup-restore-test-scripts
  namespace: velero
  labels:
    app: awoooi
    component: backup-restore-test
    adr: "074-m4"
data:
  cron_backup_restore_test.sh: |
    #!/bin/sh
    # Runs a Velero restore dry-run and records the outcome as Prometheus
    # textfile metrics. Exits 0 when the dry-run succeeded, 1 otherwise.
    set -e

    TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
    NAMESPACE="${VELERO_NAMESPACE:-velero}"
    BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
    EXIT_CODE=0

    # write_metrics <0|1>
    # Writes both gauges to a temporary file and renames it over TEXTFILE so
    # the collector never reads a half-written file. Samples carry no
    # explicit timestamp: the node-exporter textfile collector does not
    # support client-side timestamps.
    write_metrics() {
      tmp_file="${TEXTFILE}.$$"
      {
        printf '# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded\n'
        printf '# TYPE awoooi_backup_restore_test_success gauge\n'
        printf 'awoooi_backup_restore_test_success %s\n' "$1"
        printf '# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run\n'
        printf '# TYPE awoooi_backup_restore_test_timestamp_seconds gauge\n'
        printf 'awoooi_backup_restore_test_timestamp_seconds %s\n' "$(date +%s)"
      } > "${tmp_file}"
      mv "${tmp_file}" "${TEXTFILE}"
    }

    echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
    echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"

    # `|| EXIT_CODE=$?` captures the failure instead of letting `set -e`
    # abort before the metric is written.
    # NOTE(review): confirm that this Velero CLI version accepts --dry-run
    # for `restore create`; if the flag is rejected, every run is reported
    # as a failure.
    velero restore create \
      --from-backup "${BACKUP_NAME}" \
      --namespace-mappings "${NAMESPACE}:restore-test-dry" \
      --dry-run \
      --wait \
      2>&1 || EXIT_CODE=$?

    mkdir -p "$(dirname "${TEXTFILE}")"

    if [ "${EXIT_CODE}" -eq 0 ]; then
      echo "backup restore dry-run OK"
      write_metrics 1
      exit 0
    else
      echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
      write_metrics 0
      exit 1
    fi
|
||||
@@ -87,6 +87,36 @@ spec:
|
||||
summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
|
||||
description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。"
|
||||
|
||||
- name: awoooi_backup_restore
  interval: 1h
  rules:

    # P0: the most recent backup restore dry-run reported failure
    # (the script sets the success gauge to 0).
    - alert: BackupRestoreTestFailed
      expr: awoooi_backup_restore_test_success == 0
      for: 5m
      labels:
        severity: critical
        alert_category: infrastructure
        notification_type: TYPE-3
      annotations:
        summary: "備份還原 dry-run 測試失敗"
        description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
        runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"

    # P1: no test run recorded for more than 8 days (691200 s), i.e. the
    # weekly schedule appears to be broken.
    # Note: when the timestamp metric does not exist at all, the expression
    # returns no series and this alert does not fire.
    - alert: BackupRestoreTestStale
      expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
      for: 10m
      labels:
        severity: warning
        alert_category: infrastructure
        notification_type: TYPE-3
      annotations:
        summary: "備份還原測試超過 8 天未執行"
        description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
        runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
|
||||
|
||||
- name: awoooi_host_connectivity
|
||||
interval: 60s
|
||||
rules:
|
||||
|
||||
60
scripts/cron_backup_restore_test.sh
Executable file
60
scripts/cron_backup_restore_test.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/sh
# =============================================================================
# Backup restore verification CronJob — ADR-074 M4
# =============================================================================
# Scheduled for every Sunday 02:00 Taipei: runs a Velero restore dry-run,
# intended to verify that the PVC snapshots are readable.
# After each run it writes Prometheus textfile metrics; a failed run sets the
# success gauge to 0, which triggers the BackupRestoreTestFailed alert.
#
# Textfile path: /var/lib/node_exporter/textfile_collector/backup_restore_test.prom
# (scanned by node-exporter via --collector.textfile.directory)
#
# Environment:
#   VELERO_NAMESPACE    source namespace of the mapping (default: velero)
#   VELERO_BACKUP_NAME  backup to restore from (default: awoooi-daily)
#
# Exit status: 0 when the dry-run succeeded, 1 otherwise.
#
# 2026-04-12 ogt (ADR-074 M4)
# =============================================================================

set -e

TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
NAMESPACE="${VELERO_NAMESPACE:-velero}"
BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
EXIT_CODE=0

# write_metrics <0|1>
# Writes both gauges to a temporary file and renames it over TEXTFILE so the
# collector never reads a half-written file. Samples carry no explicit
# timestamp: the node-exporter textfile collector does not support
# client-side timestamps.
write_metrics() {
    tmp_file="${TEXTFILE}.$$"
    cat > "${tmp_file}" <<PROM
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
# TYPE awoooi_backup_restore_test_success gauge
awoooi_backup_restore_test_success $1
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
awoooi_backup_restore_test_timestamp_seconds $(date +%s)
PROM
    mv "${tmp_file}" "${TEXTFILE}"
}

echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"

# --- Velero restore dry-run ---
# `|| EXIT_CODE=$?` captures the failure instead of letting `set -e` abort
# before the metric is written.
# NOTE(review): confirm that this Velero CLI version accepts --dry-run for
# `restore create`; if the flag is rejected, every run is reported as a failure.
velero restore create \
    --from-backup "${BACKUP_NAME}" \
    --namespace-mappings "${NAMESPACE}:restore-test-dry" \
    --dry-run \
    --wait \
    2>&1 || EXIT_CODE=$?

# --- Write textfile metric ---
mkdir -p "$(dirname "${TEXTFILE}")"

if [ "${EXIT_CODE}" -eq 0 ]; then
    echo "backup restore dry-run OK"
    write_metrics 1
    echo "Textfile written: success"
    exit 0
else
    echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
    write_metrics 0
    echo "Textfile written: failure"
    exit 1
fi
|
||||
Reference in New Issue
Block a user