#!/bin/bash
# =============================================================================
# WOOO AIOps - backup repository integrity check and sampled restore drill
# 2026-05-06 ogt + Codex: upgrade "a backup exists" to "the backup is readable
# and can be sample-restored".
#
# Modes:
#   --mode check          weekly `restic check`, default read-data-subset=1%
#   --mode restore-drill  monthly: dump one small file from every repo into a
#                         0700 temporary directory
#
# Options:
#   --read-data-subset V  value passed to `restic check --read-data-subset`
#   --repos "a b c"       whitespace-separated repo names under BACKUP_BASE
#
# Environment overrides:
#   RESTIC_CHECK_READ_DATA_SUBSET, RESTIC_RESTORE_DRILL_MAX_SAMPLE_BYTES,
#   BACKUP_INTEGRITY_REPOS
#
# Safety:
#   - Never restores into a production path.
#   - Never prints secret contents; sampled files are written only to a 0700
#     directory under /tmp and are removed again.
# =============================================================================

set -euo pipefail

# BACKUP_BASE, BACKUP_LOG_DIR, RESTIC_PASSWORD_FILE, the log_* helpers and
# notify_clawbot are used below but not defined in this file; they are expected
# to come from common.sh (or the environment).
# shellcheck source=/dev/null
source "$(dirname "$0")/common.sh"

MODE="check"
READ_DATA_SUBSET="${RESTIC_CHECK_READ_DATA_SUBSET:-1%}"
MAX_SAMPLE_BYTES="${RESTIC_RESTORE_DRILL_MAX_SAMPLE_BYTES:-20971520}"
STATE_DIR="${BACKUP_BASE}/integrity"
LOG_FILE="${BACKUP_LOG_DIR}/backup-integrity.log"
# Filled in by main() via mktemp for restore-drill runs only. A predictable
# /tmp/<name>-$$ path could be pre-created by another local user, so the
# directory is never derived from the PID. Empty means "nothing to clean up".
RESTORE_DIR=""
REPOS_DEFAULT="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes"
REPOS="${BACKUP_INTEGRITY_REPOS:-${REPOS_DEFAULT}}"

# Print a usage/configuration error to stderr and exit with status 2.
usage_error() {
  echo "$1" >&2
  exit 2
}

while [ "$#" -gt 0 ]; do
  case "$1" in
    --mode)
      # Without this guard `shift 2` fails under `set -e` and the script dies
      # without any message when the value is missing.
      [ "$#" -ge 2 ] || usage_error "Missing value for $1"
      MODE="$2"
      shift 2
      ;;
    --read-data-subset)
      [ "$#" -ge 2 ] || usage_error "Missing value for $1"
      READ_DATA_SUBSET="$2"
      shift 2
      ;;
    --repos)
      [ "$#" -ge 2 ] || usage_error "Missing value for $1"
      REPOS="$2"
      shift 2
      ;;
    -h|--help)
      cat <<'USAGE'
Usage:
  check-backup-integrity.sh --mode check [--read-data-subset 1%]
  check-backup-integrity.sh --mode restore-drill
USAGE
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 2
      ;;
  esac
done

case "${MODE}" in
  check|restore-drill) ;;
  *)
    echo "MODE must be check or restore-drill" >&2
    exit 2
    ;;
esac

# An empty subset would only surface later as a restic failure on every repo.
[ -n "${READ_DATA_SUBSET}" ] || usage_error "read-data-subset must not be empty"

# An empty repo list would iterate zero times and report success with nothing
# verified.
[ -n "${REPOS//[[:space:]]/}" ] || usage_error "repo list must not be empty"

# The sampler converts this value with int(); a non-numeric value would crash
# it and, through pipefail + set -e, abort the run before a status is written.
case "${MAX_SAMPLE_BYTES}" in
  ''|*[!0-9]*)
    usage_error "RESTIC_RESTORE_DRILL_MAX_SAMPLE_BYTES must be a non-negative integer"
    ;;
esac

# Print the path of the status file that belongs to the current MODE.
status_file() {
  case "${MODE}" in
    check) echo "${STATE_DIR}/check.status" ;;
    restore-drill) echo "${STATE_DIR}/restore-drill.status" ;;
  esac
}

# Remove the temporary restore directory, if one was created.
cleanup() {
  if [ -n "${RESTORE_DIR}" ]; then
    rm -rf -- "${RESTORE_DIR}"
  fi
}

# Run the given command with reduced CPU priority, and reduced I/O priority as
# well when ionice is installed.
low_priority() {
  if command -v ionice >/dev/null 2>&1; then
    ionice -c2 -n7 nice -n 10 "$@"
  else
    nice -n 10 "$@"
  fi
}

# Print the number of snapshots in the repo at $1; prints 0 on any failure.
# Always prints exactly one non-negative integer.
latest_snapshot_count() {
  local repo="$1"
  local count

  # Capture first, default afterwards: with pipefail a bare `pipeline || echo 0`
  # can emit both the parsed value and the fallback 0.
  count=$(
    restic -r "${repo}" snapshots --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null |
      python3 -c 'import json,sys; rows=json.load(sys.stdin); print(len(rows))' 2>/dev/null
  ) || count=0

  case "${count}" in
    ''|*[!0-9]*) count=0 ;;
  esac
  echo "${count}"
}

# Print the epoch time (seconds) of the newest snapshot in the repo at $1;
# prints 0 when there is no snapshot or on any failure.
# Always prints exactly one non-negative integer.
latest_snapshot_timestamp() {
  local repo="$1"
  local stamp

  # `--latest 1` may return one row per host/path group, so the newest of all
  # returned rows is used. Fractional seconds are normalised to six digits
  # because datetime.fromisoformat on Python older than 3.11 rejects other
  # lengths.
  stamp=$(
    restic -r "${repo}" snapshots --latest 1 --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null |
      python3 -c 'import datetime as dt, json, re, sys

def epoch(raw):
    value = str(raw or "").replace("Z", "+00:00")
    value = re.sub(r"\.(\d+)", lambda m: "." + (m.group(1) + "000000")[:6], value, count=1)
    return int(dt.datetime.fromisoformat(value).timestamp())

rows = json.load(sys.stdin)
print(max((epoch(row.get("time")) for row in rows), default=0))' 2>/dev/null
  ) || stamp=0

  case "${stamp}" in
    ''|*[!0-9]*) stamp=0 ;;
  esac
  echo "${stamp}"
}

# Print the path of the first file in the latest snapshot of the repo at $1
# that is non-empty, no larger than MAX_SAMPLE_BYTES (read from the
# environment) and whose path contains none of the blocked secret-related
# substrings. Prints an empty line when no such file is found.
sample_path_for_repo() {
  local repo="$1"

  # restic errors are deliberately ignored here: the sampler stops reading
  # after the first match, and an empty result is handled by the caller.
  { restic -r "${repo}" ls latest --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null || true; } |
    python3 -c 'import json, os, sys

limit = int(os.environ.get("MAX_SAMPLE_BYTES", "20971520"))
blocked = (".restic-password", "runtime-secrets", "secrets.yaml")
for line in sys.stdin:
    try:
        item = json.loads(line)
    except json.JSONDecodeError:
        continue
    if item.get("type") != "file":
        continue
    path = item.get("path") or ""
    size = int(item.get("size") or 0)
    if size <= 0 or size > limit:
        continue
    if any(token in path for token in blocked):
        continue
    print(path)
    raise SystemExit
print("")' 2>/dev/null
}

# Write the result of this run to the status file of the current MODE.
# Arguments: $1 timestamp, $2 success flag, $3 failed repo count,
#            $4 checked repo count.
write_status() {
  local timestamp="$1"
  local success="$2"
  local failed_count="$3"
  local checked_count="$4"
  local status
  status=$(status_file)
  install -d -m 700 "${STATE_DIR}"
  # NOTE(review): the heredoc body was not visible in the reviewed text. The
  # key=value layout below is reconstructed from this function's parameters;
  # confirm it against whatever consumes the status file before merging.
  cat > "${status}" <<EOF
timestamp=${timestamp}
success=${success}
failed_count=${failed_count}
checked_count=${checked_count}
EOF
}

# Check every repo in REPOS according to MODE, append per-repo results to
# LOG_FILE, write the status file and send a notification.
# Returns the number of failed repos (capped at 255), 0 when all passed.
main() {
  local failed=0
  local checked=0
  local success=0
  local now start_time duration
  local name repo count latest_ts sample sample_out sample_bytes

  # NOTE(review): the start of main() (counter/timestamp initialisation, the
  # start banner and the first line appended to LOG_FILE) was not visible in
  # the reviewed text and is reconstructed here; confirm `now` is meant to be
  # epoch seconds and adjust the two start-of-run messages to the originals.
  start_time=$(date +%s)
  now="${start_time}"

  mkdir -p -- "$(dirname -- "${LOG_FILE}")"

  trap cleanup EXIT
  if [ "${MODE}" = "restore-drill" ]; then
    # mktemp -d creates an unpredictable directory accessible to the owner only.
    RESTORE_DIR=$(mktemp -d /tmp/backup-restore-drill-XXXXXX)
  fi

  log_info "========== 備份完整性檢查開始 mode=${MODE} =========="
  echo "run_start mode=${MODE} timestamp=${now}" >> "${LOG_FILE}"

  # REPOS is a whitespace-separated list; word splitting is intentional.
  # shellcheck disable=SC2086
  for name in ${REPOS}; do
    repo="${BACKUP_BASE}/${name}"
    checked=$((checked + 1))

    if [ ! -d "${repo}/data" ]; then
      log_error "Restic repo 不存在或未初始化: ${repo}"
      echo "repo=${name} status=missing" >> "${LOG_FILE}"
      failed=$((failed + 1))
      continue
    fi

    count=$(latest_snapshot_count "${repo}")
    latest_ts=$(latest_snapshot_timestamp "${repo}")
    if [ "${count}" -le 0 ] || [ "${latest_ts}" -le 0 ]; then
      log_error "Restic repo 沒有可用 snapshot: ${repo}"
      echo "repo=${name} status=no_snapshot count=${count}" >> "${LOG_FILE}"
      failed=$((failed + 1))
      continue
    fi

    if [ "${MODE}" = "check" ]; then
      log_info "restic check: ${name} (${repo})"
      if low_priority restic -r "${repo}" check --read-data-subset="${READ_DATA_SUBSET}" --password-file "${RESTIC_PASSWORD_FILE}" >> "${LOG_FILE}" 2>&1; then
        log_success "repo ${name} check OK"
        echo "repo=${name} status=check_ok snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      else
        log_error "repo ${name} check failed"
        echo "repo=${name} status=check_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
        failed=$((failed + 1))
      fi
      continue
    fi

    # restore-drill: a sampler failure is treated like "no sample found" so
    # the run continues with the fallback check instead of aborting.
    sample=$(MAX_SAMPLE_BYTES="${MAX_SAMPLE_BYTES}" sample_path_for_repo "${repo}") || sample=""
    if [ -z "${sample}" ]; then
      log_warn "repo ${name} 找不到適合抽樣 dump 的小檔案,改用 read-data-subset fallback"
      if low_priority restic -r "${repo}" check --read-data-subset=0.1% --password-file "${RESTIC_PASSWORD_FILE}" >> "${LOG_FILE}" 2>&1; then
        log_success "repo ${name} restore drill fallback OK"
        echo "repo=${name} status=restore_drill_fallback_check_ok snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      else
        log_error "repo ${name} restore drill fallback failed"
        echo "repo=${name} status=restore_drill_fallback_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
        failed=$((failed + 1))
      fi
      continue
    fi

    sample_out="${RESTORE_DIR}/${name}.sample"
    log_info "restore drill sample dump: ${name}"
    # The drill passes only when the dump succeeds and produced a non-empty file.
    if low_priority restic -r "${repo}" dump latest "${sample}" --password-file "${RESTIC_PASSWORD_FILE}" > "${sample_out}" 2>> "${LOG_FILE}" && [ -s "${sample_out}" ]; then
      sample_bytes=$(wc -c < "${sample_out}")
      log_success "repo ${name} restore drill OK (${sample_bytes} bytes)"
      echo "repo=${name} status=restore_drill_ok snapshots=${count} latest=${latest_ts} sample_bytes=${sample_bytes}" >> "${LOG_FILE}"
    else
      log_error "repo ${name} restore drill failed"
      echo "repo=${name} status=restore_drill_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      failed=$((failed + 1))
    fi
    # Drop the sample right away rather than keeping every repo's sample
    # around until exit.
    rm -f -- "${sample_out}"
  done

  if [ "${failed}" -eq 0 ]; then
    success=1
  fi
  write_status "${now}" "${success}" "${failed}" "${checked}"

  duration=$(($(date +%s) - start_time))
  if [ "${failed}" -eq 0 ]; then
    log_success "========== 備份完整性檢查完成 mode=${MODE} (${duration}s) =========="
    notify_clawbot "success" "backup-integrity" "備份完整性檢查完成 mode=${MODE}" "${duration}"
  else
    log_error "========== 備份完整性檢查 mode=${MODE} 有 ${failed}/${checked} 個 repo 失敗 (${duration}s) =========="
    notify_clawbot "failed" "backup-integrity" "備份完整性檢查 mode=${MODE} 有 ${failed}/${checked} 個 repo 失敗" "${duration}"
  fi

  # Exit statuses wrap modulo 256; cap so 256 failures cannot look like success.
  if [ "${failed}" -gt 255 ]; then
    return 255
  fi
  return "${failed}"
}

main "$@"