Files
awoooi/scripts/backup/check-backup-integrity.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

239 lines
8.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# WOOO AIOps - 備份倉庫完整性與抽樣還原演練
# 2026-05-06 ogt + Codex: 將「有備份」升級為「可讀、可抽樣還原」。
#
# 模式:
# --mode check 每週 restic check,預設 read-data-subset=1%
# --mode restore-drill 每月從每個 repo 抽一個小檔案 dump 到 0700 暫存目錄
#
# 安全:
# - 不還原到 production path。
# - 不輸出 Secret 內容;抽樣檔只寫入 /tmp 0700 目錄,結束即刪。
# =============================================================================
# Strict mode: abort on command errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Load the shared helper library that lives next to this script.
# NOTE(review): BACKUP_BASE, BACKUP_LOG_DIR, RESTIC_PASSWORD_FILE and the
# log_* / notify_clawbot helpers used further down are presumably provided by
# common.sh (not visible here) - confirm.
. "$(dirname "$0")/common.sh"

# Run-time settings. MODE, READ_DATA_SUBSET and REPOS may be overridden by
# command-line options; the others only by the environment variables shown.
MODE="check"
READ_DATA_SUBSET="${RESTIC_CHECK_READ_DATA_SUBSET:-1%}"
MAX_SAMPLE_BYTES="${RESTIC_RESTORE_DRILL_MAX_SAMPLE_BYTES:-20971520}" # 20 MiB

# Where status files, the log and the restore-drill scratch files go.
STATE_DIR="${BACKUP_BASE}/integrity"
LOG_FILE="${BACKUP_LOG_DIR}/backup-integrity.log"
RESTORE_DIR="/tmp/backup-restore-drill-$$"

# Whitespace-separated repo names, each resolved under BACKUP_BASE.
REPOS_DEFAULT="awoooi configs gitea harbor momo langfuse monitoring signoz"
REPOS_DEFAULT+=" open-webui clawbot sentry ai-artifacts public-routes"
REPOS="${BACKUP_INTEGRITY_REPOS:-${REPOS_DEFAULT}}"
#######################################
# Parse command-line options into the MODE, READ_DATA_SUBSET and REPOS globals.
# Globals:
#   Writes: MODE, READ_DATA_SUBSET, REPOS
# Arguments:
#   The script's command-line arguments.
# Outputs:
#   Usage text on stdout for -h/--help; error messages on stderr.
# Exits:
#   0 after printing help; 2 on an unknown option or an option that is
#   missing its value.
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --mode | --read-data-subset | --repos)
        # Without this check `shift 2` fails under `set -e` and the script
        # dies with status 1 and no explanation.
        if [ "$#" -lt 2 ]; then
          echo "Missing value for $1" >&2
          exit 2
        fi
        case "$1" in
          --mode) MODE="$2" ;;
          --read-data-subset) READ_DATA_SUBSET="$2" ;;
          --repos) REPOS="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        cat <<'USAGE'
Usage:
check-backup-integrity.sh --mode check [--read-data-subset 1%] [--repos "name ..."]
check-backup-integrity.sh --mode restore-drill [--repos "name ..."]
USAGE
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Refuse to continue unless MODE is one of the two supported modes.
if [[ "${MODE}" != "check" && "${MODE}" != "restore-drill" ]]; then
  echo "MODE must be check or restore-drill" >&2
  exit 2
fi
#######################################
# Print the path of the status file that belongs to the current MODE.
# Globals:
#   Reads: MODE, STATE_DIR
# Outputs:
#   The status file path on stdout; nothing when MODE is not a known mode.
#######################################
status_file() {
  local known_mode
  for known_mode in check restore-drill; do
    if [[ "${MODE}" == "${known_mode}" ]]; then
      echo "${STATE_DIR}/${known_mode}.status"
      return 0
    fi
  done
}
#######################################
# Delete the restore-drill scratch directory and everything inside it.
# Globals:
#   Reads: RESTORE_DIR
#######################################
cleanup() {
  local scratch="${RESTORE_DIR}"
  rm -r -f "${scratch}"
}
#######################################
# Run a command under `nice -n 10`, additionally wrapped in
# `ionice -c2 -n7` when the ionice command is available.
# Arguments:
#   The command and its arguments.
# Returns:
#   The exit status of the wrapped command.
#######################################
low_priority() {
  local -a wrapper=(nice -n 10)
  if command -v ionice >/dev/null 2>&1; then
    wrapper=(ionice -c2 -n7 "${wrapper[@]}")
  fi
  "${wrapper[@]}" "$@"
}
#######################################
# Print how many snapshots `restic snapshots --json` lists for a repo.
# Globals:
#   Reads: RESTIC_PASSWORD_FILE
# Arguments:
#   $1 - path of the restic repository
# Outputs:
#   The number of entries in the JSON list; 0 when the pipeline fails
#   (restic error, unparsable JSON, ...).
#######################################
latest_snapshot_count() {
  local repo="$1"
  local counter='import json,sys; rows=json.load(sys.stdin); print(len(rows))'
  if ! restic -r "${repo}" snapshots --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
    | python3 -c "${counter}" 2>/dev/null; then
    echo 0
  fi
}
#######################################
# Print the time of the last snapshot row reported by
# `restic snapshots --latest 1 --json`, as Unix epoch seconds.
# Globals:
#   Reads: RESTIC_PASSWORD_FILE
# Arguments:
#   $1 - path of the restic repository
# Outputs:
#   Exactly one line holding a non-negative integer; 0 when there is no
#   snapshot or when restic/Python fail or the time cannot be parsed.
#######################################
latest_snapshot_timestamp() {
  local repo="$1"
  local ts
  # The fractional-seconds part is padded or truncated to exactly six digits
  # because datetime.fromisoformat() on Python < 3.11 only accepts 3 or 6.
  local parser='
import datetime as dt, json, re, sys

rows = json.load(sys.stdin)
if not rows:
    print(0)
    raise SystemExit
value = str(rows[-1].get("time", "")).replace("Z", "+00:00")
value = re.sub(
    r"\.(\d+)([+-]\d\d:\d\d)$",
    lambda m: "." + m.group(1)[:6].ljust(6, "0") + m.group(2),
    value,
)
print(int(dt.datetime.fromisoformat(value).timestamp()))
'
  # Capture first, then validate: this guarantees a single integer line even
  # when a pipeline stage fails after the parser has already printed a value.
  if ts=$(
    restic -r "${repo}" snapshots --latest 1 --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
      | python3 -c "${parser}" 2>/dev/null
  ) && [[ "${ts}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${ts}"
  else
    echo 0
  fi
}
#######################################
# Pick one small, non-secret file from the latest snapshot of a repo so it can
# be used as a restore-drill sample.
# Globals:
#   Reads: RESTIC_PASSWORD_FILE, MAX_SAMPLE_BYTES (default 20971520)
# Arguments:
#   $1 - path of the restic repository
# Outputs:
#   The path of the first listed regular file whose size is between 1 and
#   MAX_SAMPLE_BYTES bytes and whose path contains none of the blocked
#   tokens; nothing when no such file is found.
# Returns:
#   Always 0, so a `var=$(sample_path_for_repo ...)` assignment cannot abort a
#   caller running under `set -e`; callers test for empty output instead.
#######################################
sample_path_for_repo() {
  local repo="$1"
  local picker='
import json, os, sys

limit = int(os.environ.get("MAX_SAMPLE_BYTES", "20971520"))
blocked = (".restic-password", "runtime-secrets", "secrets.yaml")
for line in sys.stdin:
    try:
        item = json.loads(line)
    except json.JSONDecodeError:
        continue
    if not isinstance(item, dict) or item.get("type") != "file":
        continue
    path = item.get("path") or ""
    if not path:
        continue
    try:
        size = int(item.get("size") or 0)
    except (TypeError, ValueError):
        continue
    if size <= 0 or size > limit:
        continue
    if any(token in path for token in blocked):
        continue
    print(path)
    break
'
  # restic may be cut off by SIGPIPE once the picker stops reading, hence the
  # inner `|| true`. The limit is handed to Python explicitly so it works even
  # when the caller has not exported MAX_SAMPLE_BYTES.
  { restic -r "${repo}" ls latest --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null || true; } \
    | MAX_SAMPLE_BYTES="${MAX_SAMPLE_BYTES:-20971520}" python3 -c "${picker}" 2>/dev/null \
    || true
}
#######################################
# Record the outcome of a run as key=value lines in the status file that
# status_file reports for the current mode.
# Globals:
#   Reads: STATE_DIR, MODE, READ_DATA_SUBSET
# Arguments:
#   $1 - timestamp value to record
#   $2 - success flag
#   $3 - number of failed repos
#   $4 - number of checked repos
#######################################
write_status() {
  local timestamp="$1" success="$2" failed_count="$3" checked_count="$4"
  local target
  target="$(status_file)"
  # Make sure the state directory exists (created with mode 0700).
  install -d -m 700 "${STATE_DIR}"
  printf '%s\n' \
    "timestamp=${timestamp}" \
    "mode=${MODE}" \
    "success=${success}" \
    "failed_count=${failed_count}" \
    "checked_repo_count=${checked_count}" \
    "read_data_subset=${READ_DATA_SUBSET}" \
    > "${target}"
}
#######################################
# Run the selected integrity mode over every repo named in REPOS, append the
# per-repo results to LOG_FILE, write the status file and send a notification.
# Globals:
#   Reads:  MODE, REPOS, BACKUP_BASE, STATE_DIR, LOG_FILE, READ_DATA_SUBSET,
#           MAX_SAMPLE_BYTES, RESTIC_PASSWORD_FILE
#   Writes: RESTORE_DIR (replaced by a freshly created private directory)
# Arguments:
#   None (any arguments are ignored).
# Returns:
#   The number of repos that failed, capped at 255; 0 when every repo passed.
#######################################
main() {
  local start_time now duration
  local failed=0
  local checked=0
  local success=0
  local name repo count latest_ts
  local sample sample_out sample_bytes

  start_time=$(date +%s)
  now="${start_time}"

  # A PID-based name in /tmp is predictable and could be pre-created by
  # another local user; mktemp -d creates a new, unguessable directory that
  # only the current user can access. The trap is installed afterwards so
  # cleanup only ever removes the directory created here.
  RESTORE_DIR=$(mktemp -d /tmp/backup-restore-drill.XXXXXX)
  trap cleanup EXIT

  install -d -m 700 "${STATE_DIR}"
  log_info "========== 開始備份完整性檢查 mode=${MODE} =========="
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] mode=${MODE} repos=${REPOS}" >> "${LOG_FILE}"

  # REPOS is a whitespace-separated list, so word splitting is intentional.
  # shellcheck disable=SC2086
  for name in ${REPOS}; do
    repo="${BACKUP_BASE}/${name}"
    checked=$((checked + 1))

    if [ ! -d "${repo}/data" ]; then
      log_error "Restic repo 不存在或未初始化: ${repo}"
      echo "repo=${name} status=missing" >> "${LOG_FILE}"
      failed=$((failed + 1))
      continue
    fi

    count=$(latest_snapshot_count "${repo}") || count=0
    latest_ts=$(latest_snapshot_timestamp "${repo}") || latest_ts=0
    # Anything that is not a plain non-negative integer counts as "no
    # snapshot" instead of breaking the numeric tests below.
    [[ "${count}" =~ ^[0-9]+$ ]] || count=0
    [[ "${latest_ts}" =~ ^[0-9]+$ ]] || latest_ts=0
    if [ "${count}" -le 0 ] || [ "${latest_ts}" -le 0 ]; then
      log_error "Restic repo 沒有可用 snapshot: ${repo}"
      echo "repo=${name} status=no_snapshot count=${count}" >> "${LOG_FILE}"
      failed=$((failed + 1))
      continue
    fi

    if [ "${MODE}" = "check" ]; then
      log_info "restic check: ${name} (${repo})"
      if low_priority restic -r "${repo}" check --read-data-subset="${READ_DATA_SUBSET}" --password-file "${RESTIC_PASSWORD_FILE}" >> "${LOG_FILE}" 2>&1; then
        log_success "repo ${name} check OK"
        echo "repo=${name} status=check_ok snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      else
        log_error "repo ${name} check failed"
        echo "repo=${name} status=check_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
        failed=$((failed + 1))
      fi
      continue
    fi

    # restore-drill mode. A failing sample lookup must not abort the whole
    # run under `set -e`; it is treated like "no suitable sample found".
    sample=$(MAX_SAMPLE_BYTES="${MAX_SAMPLE_BYTES}" sample_path_for_repo "${repo}") || sample=""
    if [ -z "${sample}" ]; then
      log_warn "repo ${name} 找不到適合抽樣 dump 的小檔案,改用 read-data-subset fallback"
      if low_priority restic -r "${repo}" check --read-data-subset=0.1% --password-file "${RESTIC_PASSWORD_FILE}" >> "${LOG_FILE}" 2>&1; then
        log_success "repo ${name} restore drill fallback OK"
        echo "repo=${name} status=restore_drill_fallback_check_ok snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      else
        log_error "repo ${name} restore drill fallback failed"
        echo "repo=${name} status=restore_drill_fallback_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
        failed=$((failed + 1))
      fi
      continue
    fi

    sample_out="${RESTORE_DIR}/${name}.sample"
    log_info "restore drill sample dump: ${name}"
    # The drill passes only when restic succeeds AND the dumped file is non-empty.
    if low_priority restic -r "${repo}" dump latest "${sample}" --password-file "${RESTIC_PASSWORD_FILE}" > "${sample_out}" 2>> "${LOG_FILE}" && [ -s "${sample_out}" ]; then
      sample_bytes=$(wc -c < "${sample_out}")
      log_success "repo ${name} restore drill OK (${sample_bytes} bytes)"
      echo "repo=${name} status=restore_drill_ok snapshots=${count} latest=${latest_ts} sample_bytes=${sample_bytes}" >> "${LOG_FILE}"
    else
      log_error "repo ${name} restore drill failed"
      echo "repo=${name} status=restore_drill_failed snapshots=${count} latest=${latest_ts}" >> "${LOG_FILE}"
      failed=$((failed + 1))
    fi
  done

  if [ "${failed}" -eq 0 ]; then
    success=1
  fi
  write_status "${now}" "${success}" "${failed}" "${checked}"

  duration=$(($(date +%s) - start_time))
  if [ "${failed}" -eq 0 ]; then
    log_success "========== 備份完整性檢查完成 mode=${MODE} (${duration}s) =========="
    notify_clawbot "success" "backup-integrity" "備份完整性檢查完成 mode=${MODE}" "${duration}"
  else
    # Separator added: the mode and the failure ratio used to run together
    # (e.g. "mode=check2/13").
    log_error "========== 備份完整性檢查 mode=${MODE}, ${failed}/${checked} 個 repo 失敗 (${duration}s) =========="
    notify_clawbot "failed" "backup-integrity" "備份完整性檢查 mode=${MODE}, ${failed}/${checked} 個 repo 失敗" "${duration}"
  fi

  # Exit statuses wrap modulo 256; never let a large failure count read as 0.
  if [ "${failed}" -gt 255 ]; then
    return 255
  fi
  return "${failed}"
}
# Entry point: run the integrity check. As the last command of the script,
# main's return value becomes the script's exit status.
main "$@"