239 lines
8.4 KiB
Bash
Executable File
239 lines
8.4 KiB
Bash
Executable File
#!/bin/bash
# =============================================================================
# WOOO AIOps - Backup repository integrity check and sampled restore drill
# 2026-05-06 ogt + Codex: upgrade "a backup exists" to "the backup is readable
#                         and can be restored from a sample".
#
# Modes:
#   --mode check          weekly restic check, default read-data-subset=1%
#   --mode restore-drill  monthly: dump one small file from each repo into a
#                         0700 temporary directory
#
# Safety:
#   - Never restores into a production path.
#   - Never prints secret contents; sampled files are written only to a 0700
#     directory under /tmp and are deleted when the script ends.
# =============================================================================
|
||
|
||
# Strict mode: abort on command failure, on unset variables and on a failing
# pipeline stage.
set -euo pipefail

# Shared helpers and settings. This script uses BACKUP_BASE, BACKUP_LOG_DIR,
# RESTIC_PASSWORD_FILE, log_info/log_warn/log_error/log_success and
# notify_clawbot without defining them, so they are presumably provided by
# common.sh (or the environment) -- TODO confirm against common.sh.
source "$(dirname "$0")/common.sh"

# Operating mode: "check" or "restore-drill" (override with --mode).
MODE="check"
# Value passed to `restic check --read-data-subset` in check mode.
READ_DATA_SUBSET="${RESTIC_CHECK_READ_DATA_SUBSET:-1%}"
# Largest file size in bytes eligible as a restore-drill sample (default 20 MiB).
MAX_SAMPLE_BYTES="${RESTIC_RESTORE_DRILL_MAX_SAMPLE_BYTES:-20971520}"
# Directory holding the per-mode status files.
STATE_DIR="${BACKUP_BASE}/integrity"
# Log file that receives restic output and the per-repo result lines.
LOG_FILE="${BACKUP_LOG_DIR}/backup-integrity.log"
# Scratch directory for dumped sample files.
RESTORE_DIR="/tmp/backup-restore-drill-$$"
# Repository names (sub-directories of BACKUP_BASE), whitespace separated.
REPOS_DEFAULT="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes"
REPOS="${BACKUP_INTEGRITY_REPOS:-${REPOS_DEFAULT}}"

# Command-line parsing. Every value-taking option is checked for its value
# first: without the check, `shift 2` with a single remaining argument fails
# and `set -e` ends the script with status 1 and no message at all.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --mode | --read-data-subset | --repos)
      if [ "$#" -lt 2 ]; then
        echo "Missing value for $1" >&2
        exit 2
      fi
      case "$1" in
        --mode) MODE="$2" ;;
        --read-data-subset) READ_DATA_SUBSET="$2" ;;
        --repos) REPOS="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      cat <<'USAGE'
Usage:
  check-backup-integrity.sh --mode check [--read-data-subset 1%]
  check-backup-integrity.sh --mode restore-drill
USAGE
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 2
      ;;
  esac
done

# Reject anything but the two supported modes before doing any work.
case "${MODE}" in
  check | restore-drill) ;;
  *)
    echo "MODE must be check or restore-drill" >&2
    exit 2
    ;;
esac
|
||
|
||
# Print the status-file path that belongs to the active MODE.
# Globals: MODE (read), STATE_DIR (read)
# Outputs: the path on stdout; nothing when MODE is neither known mode.
status_file() {
  if [ "${MODE}" = "check" ]; then
    echo "${STATE_DIR}/check.status"
  elif [ "${MODE}" = "restore-drill" ]; then
    echo "${STATE_DIR}/restore-drill.status"
  fi
}
|
||
|
||
# Remove the restore-drill scratch directory and everything inside it.
# Globals: RESTORE_DIR (read)
# Returns: 0 when there is nothing to remove, otherwise the status of rm.
cleanup() {
  # Nothing to do when the path is unset or empty.
  [ -n "${RESTORE_DIR:-}" ] || return 0
  # "--" keeps a path that starts with "-" from being parsed as rm options.
  rm -rf -- "${RESTORE_DIR}"
}
|
||
|
||
# Run a command at reduced CPU priority, and at reduced I/O priority as well
# when ionice is installed.
# Arguments: the command and its arguments
# Returns:   the exit status of the wrapped command line
low_priority() {
  local -a launcher=(nice -n 10)
  if command -v ionice >/dev/null 2>&1; then
    launcher=(ionice -c2 -n7 "${launcher[@]}")
  fi
  "${launcher[@]}" "$@"
}
|
||
|
||
# Print the number of snapshots stored in a restic repository.
# Globals:   RESTIC_PASSWORD_FILE (read)
# Arguments: $1 - repository path
# Outputs:   exactly one integer on stdout; 0 when restic or the JSON parsing
#            fails, or when nothing was printed.
# Returns:   always 0
latest_snapshot_count() {
  local repo="$1"
  local count
  # The pipeline output is captured first so that a pipeline which fails only
  # after python already printed a number (possible when pipefail is on) cannot
  # emit "<n>" followed by a second "0" line.
  # stderr is discarded on purpose: any failure is reported as a count of 0.
  if count=$(
    restic -r "${repo}" snapshots --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
      | python3 -c 'import json,sys; rows=json.load(sys.stdin); print(len(rows))' 2>/dev/null
  ) && [ -n "${count}" ]; then
    echo "${count}"
  else
    echo 0
  fi
}
|
||
|
||
# Print the Unix timestamp (seconds) of the last snapshot entry that
# `restic snapshots --latest 1 --json` reports for a repository.
# Globals:   RESTIC_PASSWORD_FILE (read)
# Arguments: $1 - repository path
# Outputs:   exactly one integer on stdout; 0 when the repository has no
#            snapshot or when restic / the time parsing fails.
# Returns:   always 0
latest_snapshot_timestamp() {
  local repo="$1"
  local stamp
  # Captured first so a pipeline that fails after python printed a value
  # (possible when pipefail is on) cannot yield two output lines.
  # stderr is discarded on purpose: any failure is reported as timestamp 0.
  if stamp=$(
    restic -r "${repo}" snapshots --latest 1 --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
      | python3 -c 'import datetime as dt, json, re, sys
rows = json.load(sys.stdin)
if not rows:
    print(0)
    raise SystemExit
value = str(rows[-1].get("time", "")).replace("Z", "+00:00")
# Normalise the fractional seconds to exactly six digits. Older Python
# versions of fromisoformat() reject any other length, and the timestamps may
# carry anything from one to nine digits.
value = re.sub(r"\.(\d+)", lambda m: "." + (m.group(1) + "000000")[:6], value, count=1)
print(int(dt.datetime.fromisoformat(value).timestamp()))' 2>/dev/null
  ) && [ -n "${stamp}" ]; then
    echo "${stamp}"
  else
    echo 0
  fi
}
|
||
|
||
# Print the path of the first file in the latest snapshot that is suitable as
# a restore-drill sample: non-empty, no larger than MAX_SAMPLE_BYTES, and
# without any of the secret-related tokens in its path.
# Globals:   RESTIC_PASSWORD_FILE (read); MAX_SAMPLE_BYTES is read from the
#            environment of the python process (default 20971520)
# Arguments: $1 - repository path
# Outputs:   the chosen path, or an empty line when no file qualifies
# Returns:   the exit status of the python selector
sample_path_for_repo() {
  local repo="$1"
  local picker
  picker='import json, os, sys

max_bytes = int(os.environ.get("MAX_SAMPLE_BYTES", "20971520"))
deny = (".restic-password", "runtime-secrets", "secrets.yaml")


def eligible(node):
    # Only regular files whose size is within (0, max_bytes] qualify.
    if node.get("type") != "file":
        return False
    size = int(node.get("size") or 0)
    if not 0 < size <= max_bytes:
        return False
    return not any(tok in (node.get("path") or "") for tok in deny)


chosen = ""
for raw in sys.stdin:
    try:
        node = json.loads(raw)
    except json.JSONDecodeError:
        continue
    if eligible(node):
        chosen = node.get("path") or ""
        break
print(chosen)'

  # A failing listing is deliberately tolerated: the selector then sees no
  # input and prints an empty line.
  {
    restic -r "${repo}" ls latest --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null || :
  } | python3 -c "${picker}" 2>/dev/null
}
|
||
|
||
# Write the key=value status file for the current mode, replacing any
# previous content.
# Globals:   STATE_DIR, MODE, READ_DATA_SUBSET (read)
# Arguments: $1 - run timestamp
#            $2 - success flag
#            $3 - number of failed repositories
#            $4 - number of repositories checked
write_status() {
  local timestamp="$1" success="$2" failed_count="$3" checked_count="$4"
  local target

  # The state directory is created with mode 0700 if it does not exist yet.
  install -d -m 700 "${STATE_DIR}"
  target=$(status_file)

  printf '%s\n' \
    "timestamp=${timestamp}" \
    "mode=${MODE}" \
    "success=${success}" \
    "failed_count=${failed_count}" \
    "checked_repo_count=${checked_count}" \
    "read_data_subset=${READ_DATA_SUBSET}" \
    > "${target}"
}
|
||
|
||
# Run the selected mode against every repository named in REPOS, append the
# per-repo results to LOG_FILE, write the status file and send a notification.
#
# Globals:
#   read:    MODE, REPOS, BACKUP_BASE, STATE_DIR, LOG_FILE, READ_DATA_SUBSET,
#            MAX_SAMPLE_BYTES, RESTIC_PASSWORD_FILE
#   written: RESTORE_DIR (replaced by a freshly created private directory)
# Returns:   number of repositories that failed, capped at 255; 0 on success.
main() {
  local start_time now duration
  local failed=0 checked=0 success=0
  local name repo count latest_ts
  local sample sample_out sample_bytes
  local -a repo_names=()

  start_time=$(date +%s)
  now="${start_time}"

  # Create the scratch directory with mktemp instead of reusing a predictable
  # /tmp/<name>-<pid> path that another local user could pre-create or
  # symlink. RESTORE_DIR stays global so the EXIT trap removes this directory.
  RESTORE_DIR=$(mktemp -d /tmp/backup-restore-drill.XXXXXX)
  trap cleanup EXIT
  install -d -m 700 "${STATE_DIR}"

  log_info "========== 開始備份完整性檢查 mode=${MODE} =========="
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] mode=${MODE} repos=${REPOS}" >> "${LOG_FILE}"

  # Split REPOS on whitespace into an array. Unlike an unquoted expansion this
  # never glob-expands a name such as "*". read returns non-zero at end of
  # input, hence "|| true".
  IFS=$' \t\n' read -r -d '' -a repo_names <<< "${REPOS}" || true

  # The ${arr[@]+...} form keeps an empty list from tripping "set -u" on
  # older bash releases.
  for name in ${repo_names[@]+"${repo_names[@]}"}; do
    repo="${BACKUP_BASE}/${name}"
    checked=$((checked + 1))

    # An initialised restic repository has a data/ sub-directory.
    if [ ! -d "${repo}/data" ]; then
      log_error "Restic repo 不存在或未初始化: ${repo}"
      echo "repo=${name} status=missing" >> "${LOG_FILE}"
      failed=$((failed + 1))
      continue
    fi

    count=$(latest_snapshot_count "${repo}")
    latest_ts=$(latest_snapshot_timestamp "${repo}")
    if [ "${count}" -le 0 ] || [ "${latest_ts}" -le 0 ]; then
      log_error "Restic repo 沒有可用 snapshot: ${repo}"
      echo "repo=${name} status=no_snapshot count=${count}" >> "${LOG_FILE}"
      failed=$((failed + 1))
      continue
    fi

    if [ "${MODE}" = "check" ]; then
      log_info "restic check: ${name} (${repo})"
      if low_priority restic -r "${repo}" check --read-data-subset="${READ_DATA_SUBSET}" --password-file "${RESTIC_PASSWORD_FILE}" >> "${LOG_FILE}" 2>&1; then
        log_success "repo ${name} check OK"
        echo "repo=${name} status=check_ok snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      else
        log_error "repo ${name} check failed"
        echo "repo=${name} status=check_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
        failed=$((failed + 1))
      fi
      continue
    fi

    # restore-drill mode. A selector failure is treated like "no sample found"
    # so that one odd repository cannot abort the whole run under "set -e"
    # before the status file is written.
    sample=$(MAX_SAMPLE_BYTES="${MAX_SAMPLE_BYTES}" sample_path_for_repo "${repo}") || sample=""
    if [ -z "${sample}" ]; then
      log_warn "repo ${name} 找不到適合抽樣 dump 的小檔案,改用 read-data-subset fallback"
      if low_priority restic -r "${repo}" check --read-data-subset=0.1% --password-file "${RESTIC_PASSWORD_FILE}" >> "${LOG_FILE}" 2>&1; then
        log_success "repo ${name} restore drill fallback OK"
        echo "repo=${name} status=restore_drill_fallback_check_ok snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      else
        log_error "repo ${name} restore drill fallback failed"
        echo "repo=${name} status=restore_drill_fallback_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
        failed=$((failed + 1))
      fi
      continue
    fi

    sample_out="${RESTORE_DIR}/${name}.sample"
    log_info "restore drill sample dump: ${name}"
    # The drill passes only when restic succeeds AND the dumped file is non-empty.
    if low_priority restic -r "${repo}" dump latest "${sample}" --password-file "${RESTIC_PASSWORD_FILE}" > "${sample_out}" 2>> "${LOG_FILE}" && [ -s "${sample_out}" ]; then
      sample_bytes=$(wc -c < "${sample_out}")
      log_success "repo ${name} restore drill OK (${sample_bytes} bytes)"
      echo "repo=${name} status=restore_drill_ok snapshots=${count} latest=${latest_ts} sample_bytes=${sample_bytes}" >> "${LOG_FILE}"
    else
      log_error "repo ${name} restore drill failed"
      echo "repo=${name} status=restore_drill_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      failed=$((failed + 1))
    fi
  done

  if [ "${failed}" -eq 0 ]; then
    success=1
  fi
  write_status "${now}" "${success}" "${failed}" "${checked}"

  duration=$(($(date +%s) - start_time))
  if [ "${failed}" -eq 0 ]; then
    log_success "========== 備份完整性檢查完成 mode=${MODE} (${duration}s) =========="
    notify_clawbot "success" "backup-integrity" "備份完整性檢查完成 mode=${MODE}" "${duration}"
  else
    log_error "========== 備份完整性檢查 mode=${MODE} 有 ${failed}/${checked} 個 repo 失敗 (${duration}s) =========="
    notify_clawbot "failed" "backup-integrity" "備份完整性檢查 mode=${MODE} 有 ${failed}/${checked} 個 repo 失敗" "${duration}"
  fi

  # Exit statuses wrap modulo 256; cap the value so a large failure count can
  # never be reported as 0.
  if [ "${failed}" -gt 255 ]; then
    return 255
  fi
  return "${failed}"
}
|
||
|
||
# Entry point. The option loop earlier in this script has already shifted
# every argument away, so "$@" is empty here; main's return value becomes the
# exit status of the script.
main "$@"
|