#!/bin/bash
# =============================================================================
# WOOO AIOps - Offsite backup copy controller
# 2026-05-06 ogt + Codex: turn the offsite-backup gap from a verbal agreement
#                         into an auditable script.
#
# Modes:
#   --mode status    Only check that the local repos, rclone and the offsite
#                    remote can be listed; never writes a success marker.
#   --mode dry-run   Run an rclone dry-run for the selected repos; never writes
#                    a success marker.
#   --mode sync      Mirror the selected repos with rclone; the marker is
#                    written only when every repo succeeds.
#
# Safety:
#   - Provider/rclone credentials are never printed.
#   - The default mode is status, so 80GB+ is never uploaded by accident.
#   - Under the latest-only policy, sync mode uses `rclone sync` to mirror the
#     local repo and, on success, deletes files on Google Drive that no longer
#     exist in the local repo.
#   - Child backup scripts must still never delete from the remote directly;
#     this script is the only offsite entry point that removes old data.
#   - restic locks are not copied.
# =============================================================================

set -euo pipefail

# Shared helpers and settings come from common.sh, located next to this script.
source "$(dirname "$0")/common.sh"

# --- Identity and defaults ---------------------------------------------------
SERVICE="offsite-backup"
MODE="status"

# --- Provider / remote location ----------------------------------------------
PROVIDER="${OFFSITE_PROVIDER:-rclone}"
RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
OFFSITE_REMOTE_ROOT_VALUE="${OFFSITE_REMOTE_ROOT:-${RCLONE_REMOTE}:awoooi-backups/restic}"

# --- Local paths ---------------------------------------------------------------
OFFSITE_DIR="${BACKUP_BASE}/offsite"
LOCK_DIR="/tmp/awoooi-offsite-backup.lock"

# --- rclone tuning (each value can be overridden from the environment) ---------
: "${RCLONE_TRANSFERS:=2}"
: "${RCLONE_CHECKERS:=4}"
: "${RCLONE_BWLIMIT:=8M}"
: "${OFFSITE_RCLONE_BACKEND:=drive}"
: "${RCLONE_FAST_LIST:=1}"
: "${RCLONE_DRIVE_USE_TRASH:=false}"

# --- Sync policy and resource guards ---------------------------------------------
: "${OFFSITE_SYNC_DELETE_OLD:=1}"
: "${OFFSITE_SYNC_MAX_LOAD_1:=12}"
: "${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT:=92}"
: "${OFFSITE_SYNC_REQUIRE_ENABLE_MARKER_FOR_FULL:=1}"
: "${OFFSITE_SYNC_ENABLE_MARKER:=${OFFSITE_DIR}/enable-rclone-sync}"
: "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES:=270}"
# Schedule entries are minutes since midnight.
: "${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES:=120 480 840 1200}"

# --- Notifications ---------------------------------------------------------------
: "${OFFSITE_SYNC_NOTIFY_SKIPPED:=0}"
OFFSITE_SYNC_NOTIFY_SUCCESS="${OFFSITE_SYNC_NOTIFY_SUCCESS:-0}" EXPECTED_REPOS_DEFAULT="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes" REPOS="${OFFSITE_REPOS:-${EXPECTED_REPOS_DEFAULT}}" DRY_RUN_ARGS=() usage() { cat <<'USAGE' Usage: sync-offsite-backups.sh --mode status sync-offsite-backups.sh --mode dry-run [--repos "ai-artifacts public-routes"] sync-offsite-backups.sh --mode sync [--repos "ai-artifacts public-routes"] Notes: - Default provider is rclone, with Google Drive remote root gdrive:awoooi-backups/restic. - --mode sync writes /backup/offsite/-last-success only when all expected repos are selected and mirrored successfully. - Partial sync writes /backup/offsite/-partial-last-success and per-repo markers. - OFFSITE_SYNC_DELETE_OLD=1 makes sync mode mirror local restic repos and delete old remote files after local retention has pruned them. - For Google Drive, RCLONE_DRIVE_USE_TRASH=false makes deletes permanent instead of moving old backup packs to Trash. USAGE } while [ "$#" -gt 0 ]; do case "$1" in --mode) MODE="${2:-}" shift 2 ;; --repos) REPOS="${2:-}" shift 2 ;; -h|--help) usage exit 0 ;; *) echo "Unknown argument: $1" >&2 usage >&2 exit 2 ;; esac done case "${MODE}" in status|dry-run|sync) ;; *) echo "MODE must be status, dry-run, or sync" >&2 exit 2 ;; esac cleanup() { rmdir "${LOCK_DIR}" 2>/dev/null || true } low_priority() { if command -v ionice >/dev/null 2>&1; then ionice -c2 -n7 nice -n 10 "$@" else nice -n 10 "$@" fi } require_lock() { if ! mkdir "${LOCK_DIR}" 2>/dev/null; then log_error "Offsite sync 已有執行中的 lock: ${LOCK_DIR}" exit 1 fi trap cleanup EXIT } prepare_rclone() { if ! command -v rclone >/dev/null 2>&1; then log_error "rclone 未安裝,無法執行 offsite copy" return 1 fi if [ "${PROVIDER}" = "b2" ]; then if ! 
check_b2_config; then return 1 fi # 不依賴本機 rclone.conf;用環境變數把 common.sh 的 B2 值交給 rclone。 export RCLONE_CONFIG_B2_TYPE="b2" export RCLONE_CONFIG_B2_ACCOUNT="${B2_ACCOUNT_ID}" export RCLONE_CONFIG_B2_KEY="${B2_APPLICATION_KEY}" return 0 fi if ! rclone listremotes 2>/dev/null | grep -Fxq "${RCLONE_REMOTE}:"; then log_error "rclone remote 未設定: ${RCLONE_REMOTE}:;請先在 110 執行 configure-offsite-rclone.sh --interactive" return 1 fi return 0 } remote_root() { if [ "${PROVIDER}" = "b2" ]; then printf 'b2:%s/restic' "${B2_BUCKET}" return fi printf '%s' "${OFFSITE_REMOTE_ROOT_VALUE}" } remote_status_target() { if [ "${PROVIDER}" = "b2" ]; then remote_root return fi printf '%s:' "${RCLONE_REMOTE}" } repo_count() { local count=0 for _repo in $1; do count=$((count + 1)) done echo "${count}" } is_full_scope() { [ "$(repo_count "${REPOS}")" -eq "$(repo_count "${EXPECTED_REPOS_DEFAULT}")" ] } float_le() { awk -v left="$1" -v right="$2" 'BEGIN { exit !(left <= right) }' } current_load_1() { awk '{print $1}' /proc/loadavg 2>/dev/null || echo 0 } backup_disk_used_pct() { df -P "${BACKUP_BASE}" 2>/dev/null | awk 'NR==2 {gsub("%", "", $5); print $5 + 0}' || echo 100 } active_backup_processes() { ps -eo pid=,args= | awk -v self="$$" ' $1 == self { next } /\/backup\/scripts\/backup-(all|awoooi|awoooi-frequent|gitea|harbor|momo|langfuse|monitoring|signoz|open-webui|clawbot|sentry|ai-artifacts|public-routes|configs)\.sh/ { print } ' } minutes_until_next_backup_schedule() { local now_h local now_m local now local sched local delta local best=1440 now_h="$(date +%H)" now_m="$(date +%M)" now=$((10#${now_h} * 60 + 10#${now_m})) for sched in ${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES}; do delta=$((sched - now)) if [ "${delta}" -le 0 ]; then delta=$((delta + 1440)) fi if [ "${delta}" -lt "${best}" ]; then best="${delta}" fi done echo "${best}" } resource_preflight() { local load_1 local disk_pct local active_backups local runway_minutes [ "${MODE}" = "sync" ] || return 0 if is_full_scope && [ 
"${OFFSITE_SYNC_REQUIRE_ENABLE_MARKER_FOR_FULL}" = "1" ] && [ ! -f "${OFFSITE_SYNC_ENABLE_MARKER}" ]; then log_error "Full offsite sync 需要明確啟用 marker: ${OFFSITE_SYNC_ENABLE_MARKER}" return 1 fi if is_full_scope; then active_backups="$(active_backup_processes || true)" if [ -n "${active_backups}" ]; then log_warn "略過 full offsite sync:偵測到正在執行的備份程序" printf '%s\n' "${active_backups}" | tee -a "${BACKUP_LOG_DIR}/backup.log" >/dev/null return 1 fi runway_minutes="$(minutes_until_next_backup_schedule)" if [ "${runway_minutes}" -lt "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}" ]; then log_warn "略過 full offsite sync:距離下一次備份排程 ${runway_minutes} 分鐘,低於 runway ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES} 分鐘" return 1 fi fi load_1="$(current_load_1)" if ! float_le "${load_1}" "${OFFSITE_SYNC_MAX_LOAD_1}"; then log_warn "略過 offsite sync:1m load=${load_1} 高於上限 ${OFFSITE_SYNC_MAX_LOAD_1}" return 1 fi disk_pct="$(backup_disk_used_pct)" if [ "${disk_pct}" -gt "${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}" ]; then log_warn "略過 offsite sync:${BACKUP_BASE} 使用率 ${disk_pct}% 高於上限 ${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}%" return 1 fi log_info "Offsite sync resource preflight OK load_1=${load_1}/${OFFSITE_SYNC_MAX_LOAD_1} backup_disk_used=${disk_pct}%/${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}%" } write_marker() { local path="$1" local scope="$2" local timestamp timestamp=$(date +%s) install -d -m 750 "${OFFSITE_DIR}" cat > "${path}" </tmp/awoooi-offsite-rclone-lsd.log 2>&1 || return 1 } copy_repo() { local name="$1" local local_repo="${BACKUP_BASE}/${name}" local remote_repo local rclone_verb="copy" local rclone_extra_args=() remote_repo="$(remote_root)/${name}" if [ ! 
-d "${local_repo}/data" ]; then log_error "Restic repo 不存在或未初始化: ${local_repo}" return 1 fi if [ "${OFFSITE_SYNC_DELETE_OLD}" = "1" ] && [ "${MODE}" != "status" ]; then rclone_verb="sync" fi if [ "${RCLONE_FAST_LIST}" = "1" ]; then rclone_extra_args+=(--fast-list) fi if [ "${OFFSITE_RCLONE_BACKEND}" = "drive" ]; then rclone_extra_args+=("--drive-use-trash=${RCLONE_DRIVE_USE_TRASH}") fi log_info "Offsite ${MODE}: ${name} -> ${remote_repo} (rclone=${rclone_verb}, delete_old=${OFFSITE_SYNC_DELETE_OLD}, backend=${OFFSITE_RCLONE_BACKEND}, drive_trash=${RCLONE_DRIVE_USE_TRASH})" low_priority rclone "${rclone_verb}" "${local_repo}" "${remote_repo}" \ "${DRY_RUN_ARGS[@]}" \ "${rclone_extra_args[@]}" \ --exclude 'locks/**' \ --transfers "${RCLONE_TRANSFERS}" \ --checkers "${RCLONE_CHECKERS}" \ --bwlimit "${RCLONE_BWLIMIT}" \ --contimeout 15s \ --timeout 5m \ --retries 2 \ --stats 30s \ --stats-one-line \ >> "${BACKUP_LOG_DIR}/offsite-sync.log" 2>&1 } main() { local start_time local failed=0 local checked=0 local scope="partial" local remote_prepared=0 start_time=$(date +%s) require_lock install -d -m 750 "${OFFSITE_DIR}" log_info "========== Offsite backup ${MODE} 開始 ==========" log_info "provider=${PROVIDER} remote_root=$(remote_root) repos=$(repo_count "${REPOS}") bwlimit=${RCLONE_BWLIMIT} transfers=${RCLONE_TRANSFERS} max_load_1=${OFFSITE_SYNC_MAX_LOAD_1} full_runway_minutes=${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES} delete_old=${OFFSITE_SYNC_DELETE_OLD} backend=${OFFSITE_RCLONE_BACKEND} drive_trash=${RCLONE_DRIVE_USE_TRASH}" resource_preflight || { if [ "${MODE}" = "sync" ] && [ "${OFFSITE_SYNC_NOTIFY_SKIPPED}" = "1" ]; then notify_clawbot "warning" "${SERVICE}" "Offsite backup sync 略過:主機負載或前置條件未達安全門檻" 0 fi exit 1 } if prepare_rclone; then remote_prepared=1 elif [ "${MODE}" != "status" ]; then notify_clawbot "warning" "${SERVICE}" "Offsite rclone provider 未配置或不可用" 0 exit 1 else log_warn "Offsite provider 尚未配置;status 模式只檢查本地 repo,配置缺口交由 backup health metric 告警" fi if [ 
"${remote_prepared}" -eq 1 ]; then if status_remote; then log_success "Offsite remote 可列出" else log_warn "Offsite remote 尚不可列出或目前為空;copy 模式仍可建立路徑" fi fi if [ "${MODE}" = "status" ]; then for name in ${REPOS}; do checked=$((checked + 1)) if [ -d "${BACKUP_BASE}/${name}/data" ]; then log_success "本地 repo 存在: ${name}" else log_error "本地 repo 缺失: ${name}" failed=$((failed + 1)) fi done else [ "${MODE}" = "dry-run" ] && DRY_RUN_ARGS=(--dry-run) for name in ${REPOS}; do checked=$((checked + 1)) if copy_repo "${name}"; then log_success "Offsite ${MODE} 成功: ${name}" if [ "${MODE}" = "sync" ]; then write_marker "${OFFSITE_DIR}/${PROVIDER}-${name}.last_success" "repo" fi else log_error "Offsite ${MODE} 失敗: ${name}" failed=$((failed + 1)) fi done fi if is_full_scope; then scope="full" fi local duration duration=$(($(date +%s) - start_time)) if [ "${failed}" -eq 0 ]; then if [ "${MODE}" = "sync" ]; then if [ "${scope}" = "full" ]; then write_marker "${OFFSITE_DIR}/${PROVIDER}-last-success" "full" else write_marker "${OFFSITE_DIR}/${PROVIDER}-partial-last-success" "partial" fi fi log_success "========== Offsite backup ${MODE} 完成 (${duration}s, ${checked}/${checked}) ==========" if [ "${MODE}" != "status" ] && [ "${OFFSITE_SYNC_NOTIFY_SUCCESS}" = "1" ]; then notify_clawbot "success" "${SERVICE}" "Offsite backup ${MODE} 完成 scope=${scope} (${checked}/${checked})" "${duration}" fi else log_error "========== Offsite backup ${MODE} 失敗 ${failed}/${checked} (${duration}s) ==========" notify_clawbot "failed" "${SERVICE}" "Offsite backup ${MODE} 失敗 ${failed}/${checked}" "${duration}" fi return "${failed}" } main "$@"