From c1c96ab47b3828c40286b4edd4e2de1a1f2eaaa5 Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 12 Apr 2026 15:36:30 +0800 Subject: [PATCH] =?UTF-8?q?feat(m4):=20ADR-074=20M4=20=E2=80=94=20?= =?UTF-8?q?=E5=82=99=E4=BB=BD=E9=82=84=E5=8E=9F=E9=80=B1=E6=8E=92=E7=A8=8B?= =?UTF-8?q?=E9=A9=97=E8=AD=89=20CronJob?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - scripts/cron_backup_restore_test.sh: Velero restore dry-run 腳本 - k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml: 每週日 02:00 台北執行 - k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml: 腳本 ConfigMap - flywheel-alerts.yaml: BackupRestoreTestFailed + BackupRestoreTestStale 告警 失敗時寫入 node-exporter textfile → Prometheus 告警 → TYPE-3 Incident 2026-04-12 ogt (ADR-074 M4) Co-Authored-By: Claude Sonnet 4.6 --- .../16-cronjob-backup-restore-test.yaml | 76 +++++++++++++++++++ .../17-configmap-backup-restore-scripts.yaml | 49 ++++++++++++ k8s/monitoring/flywheel-alerts.yaml | 30 ++++++++ scripts/cron_backup_restore_test.sh | 60 +++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml create mode 100644 k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml create mode 100755 scripts/cron_backup_restore_test.sh diff --git a/k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml b/k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml new file mode 100644 index 00000000..5a572038 --- /dev/null +++ b/k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml @@ -0,0 +1,76 @@ +# ============================================================================= +# 備份還原驗證 CronJob — ADR-074 M4 +# ============================================================================= +# 每週日 02:00 台北(18:00 UTC 週六)執行 Velero restore dry-run。 +# 失敗 → 寫入 node-exporter textfile → BackupRestoreTestFailed 告警。 +# +# 前提:node-exporter 須掛載 /var/lib/node_exporter/textfile_collector +# Velero CLI 需在 velero/velero image 中可用 +# +# 2026-04-12 ogt (ADR-074 M4) +# 
=============================================================================
+
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: backup-restore-test
+  namespace: velero
+  labels:
+    app: awoooi
+    component: backup-restore-test
+    adr: "074-m4"
+spec:
+  # Sunday 02:00 Asia/Taipei (= Saturday 18:00 UTC). The schedule is evaluated in spec.timeZone, so it must NOT be pre-converted to UTC.
+  schedule: "0 2 * * 0"
+  timeZone: "Asia/Taipei"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 3
+  failedJobsHistoryLimit: 5
+  startingDeadlineSeconds: 600
+  jobTemplate:
+    spec:
+      backoffLimit: 0
+      activeDeadlineSeconds: 600
+      template:
+        metadata:
+          labels:
+            app: awoooi
+            component: backup-restore-test
+        spec:
+          restartPolicy: Never
+          serviceAccountName: velero
+          containers:
+            - name: backup-restore-test
+              image: velero/velero:v1.13.0  # NOTE(review): confirm this image provides /bin/sh for the command below
+              command:
+                - /bin/sh
+                - /scripts/cron_backup_restore_test.sh
+              env:
+                - name: TZ
+                  value: "Asia/Taipei"
+                - name: VELERO_NAMESPACE
+                  value: "velero"
+                - name: VELERO_BACKUP_NAME
+                  value: "awoooi-daily"
+              volumeMounts:
+                - name: scripts
+                  mountPath: /scripts
+                  readOnly: true
+                - name: textfile-collector
+                  mountPath: /var/lib/node_exporter/textfile_collector
+              resources:
+                requests:
+                  cpu: "50m"
+                  memory: "64Mi"
+                limits:
+                  cpu: "200m"
+                  memory: "128Mi"
+          volumes:
+            - name: scripts
+              configMap:
+                name: backup-restore-test-scripts
+                defaultMode: 0755
+            - name: textfile-collector
+              hostPath:
+                path: /var/lib/node_exporter/textfile_collector
+                type: DirectoryOrCreate
diff --git a/k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml b/k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml
new file mode 100644
index 00000000..d42e891a
--- /dev/null
+++ b/k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml
@@ -0,0 +1,49 @@
+# =============================================================================
+# ConfigMap: backup-restore-test-scripts — ADR-074 M4
+# =============================================================================
+# Mounts the backup-restore verification script into the CronJob Pod
+#
+# 2026-04-12 ogt (ADR-074 M4)
+# 
=============================================================================
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: backup-restore-test-scripts
+  namespace: velero
+  labels:
+    app: awoooi
+    component: backup-restore-test
+    adr: "074-m4"
+data:
+  cron_backup_restore_test.sh: |
+    #!/bin/sh
+    set -e
+
+    TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom"
+    NAMESPACE="${VELERO_NAMESPACE:-velero}"
+    BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}"
+    EXIT_CODE=0
+
+    echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
+    echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}"
+    # NOTE(review): confirm that this velero release accepts --dry-run on "restore create".
+    velero restore create \
+      --from-backup "${BACKUP_NAME}" \
+      --namespace-mappings "${NAMESPACE}:restore-test-dry" \
+      --dry-run \
+      --wait \
+      2>&1 || EXIT_CODE=$?
+    # Samples carry no explicit timestamp: the node-exporter textfile collector skips any file that contains one.
+    TMPFILE="${TEXTFILE}.$$"  # written first, then renamed, so a half-written file is never scraped
+    mkdir -p "$(dirname "${TEXTFILE}")"
+
+    if [ "${EXIT_CODE}" -eq 0 ]; then
+      echo "backup restore dry-run OK"
+      printf '# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded\n# TYPE awoooi_backup_restore_test_success gauge\nawoooi_backup_restore_test_success 1\n# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run\n# TYPE awoooi_backup_restore_test_timestamp_seconds gauge\nawoooi_backup_restore_test_timestamp_seconds %s\n' "$(date +%s)" > "${TMPFILE}"; mv "${TMPFILE}" "${TEXTFILE}"
+      exit 0
+    else
+      echo "backup restore dry-run FAILED (exit ${EXIT_CODE})"
+      printf '# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded\n# TYPE awoooi_backup_restore_test_success gauge\nawoooi_backup_restore_test_success 0\n# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run\n# TYPE awoooi_backup_restore_test_timestamp_seconds gauge\nawoooi_backup_restore_test_timestamp_seconds %s\n' "$(date +%s)" > "${TMPFILE}"; mv "${TMPFILE}" "${TEXTFILE}"
+      exit 1
+    fi
diff --git a/k8s/monitoring/flywheel-alerts.yaml b/k8s/monitoring/flywheel-alerts.yaml
index 6f4d85d7..e7f854e7 100644
--- a/k8s/monitoring/flywheel-alerts.yaml +++ b/k8s/monitoring/flywheel-alerts.yaml @@ -87,6 +87,36 @@ spec: summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時" description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。" + - name: awoooi_backup_restore + interval: 1h + rules: + + # P0: backup restore dry-run failed + - alert: BackupRestoreTestFailed + expr: awoooi_backup_restore_test_success == 0 + for: 5m + labels: + severity: critical + alert_category: infrastructure + notification_type: TYPE-3 + annotations: + summary: "備份還原 dry-run 測試失敗" + description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。" + runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run" + + # P1: backup restore test has not run for more than 8 days (weekly schedule broken) + - alert: BackupRestoreTestStale + expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200 + for: 10m + labels: + severity: warning + alert_category: infrastructure + notification_type: TYPE-3 + annotations: + summary: "備份還原測試超過 8 天未執行" + description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。" + runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態" + - name: awoooi_host_connectivity interval: 60s rules: diff --git a/scripts/cron_backup_restore_test.sh b/scripts/cron_backup_restore_test.sh new file mode 100755 index 00000000..9d68666c --- /dev/null +++ b/scripts/cron_backup_restore_test.sh @@ -0,0 +1,60 @@ +#!/bin/sh +# ============================================================================= +# Backup restore verification CronJob — ADR-074 M4 +# ============================================================================= +# Runs a Velero restore dry-run every Sunday at 02:00 Taipei time to verify that PVC snapshots are readable. +# On failure, writes Prometheus textfile metrics → triggers the BackupRestoreTestFailed alert. +# +# Textfile path: /var/lib/node_exporter/textfile_collector/backup_restore_test.prom +# (scanned by node-exporter via --collector.textfile.directory) +# +# 2026-04-12 ogt (ADR-074 M4) +# ============================================================================= + +set -e
+ +TEXTFILE="/var/lib/node_exporter/textfile_collector/backup_restore_test.prom" +NAMESPACE="${VELERO_NAMESPACE:-velero}" +BACKUP_NAME="${VELERO_BACKUP_NAME:-awoooi-daily}" +EXIT_CODE=0 + +echo "=== backup-restore-test: $(date '+%Y-%m-%d %H:%M:%S %Z') ===" +echo "Backup: ${BACKUP_NAME} Namespace: ${NAMESPACE}" + +# --- Velero restore dry-run --- +velero restore create \ + --from-backup "${BACKUP_NAME}" \ + --namespace-mappings "${NAMESPACE}:restore-test-dry" \ + --dry-run \ + --wait \ + 2>&1 || EXIT_CODE=$? + +# --- 寫入 textfile metric --- +TS=$(date +%s%3N) +mkdir -p "$(dirname "${TEXTFILE}")" + +if [ "${EXIT_CODE}" -eq 0 ]; then + echo "backup restore dry-run OK" + cat > "${TEXTFILE}" < "${TEXTFILE}" <