415 lines
14 KiB
Bash
Executable File
415 lines
14 KiB
Bash
Executable File
#!/bin/bash
|
||
# =============================================================================
# WOOO AIOps - Offsite backup copy controller
# 2026-05-06 ogt + Codex: 將離機備份從口頭缺口變成可審計腳本。
#
# 模式:
# --mode status 只檢查本地 repo、rclone 與離機遠端可列出;不寫 success marker。
# --mode dry-run 對指定 repo 做 rclone dry-run;不寫 success marker。
# --mode sync 對指定 repo 做 rclone mirror;全部成功才寫 marker。
#
# 安全:
# - 不輸出 provider/rclone credential。
# - 預設只跑 status;不會無意間上傳 80GB+。
# - latest-only 策略下,sync 模式使用 rclone sync 鏡像本地 repo,
# 成功後刪除 Google Drive 上已不存在於本地 repo 的舊檔。
# - 子備份腳本仍不得直接刪遠端;本腳本是唯一 offsite 刪舊入口。
# - 不複製 restic locks。
# =============================================================================
|
||
|
||
# Strict mode: abort on an unhandled command failure (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
|
||
|
||
# Load the common.sh that sits in the same directory as this script.
# NOTE(review): this file uses BACKUP_BASE, BACKUP_LOG_DIR, log_info, log_warn,
# log_error, log_success, notify_clawbot, check_b2_config and the B2_* variables
# without defining them, so they are presumably provided by common.sh - confirm.
source "$(dirname "$0")/common.sh"
|
||
|
||
# ---------------------------------------------------------------------------
# Run state. MODE and REPOS may be overridden later by --mode / --repos.
# ---------------------------------------------------------------------------
SERVICE="offsite-backup"
MODE="status"
DRY_RUN_ARGS=()

# ---------------------------------------------------------------------------
# Provider and remote location (each overridable through the OFFSITE_* env var
# named on the right-hand side).
# ---------------------------------------------------------------------------
PROVIDER="${OFFSITE_PROVIDER:-rclone}"
RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
OFFSITE_REMOTE_ROOT_VALUE="${OFFSITE_REMOTE_ROOT:-${RCLONE_REMOTE}:awoooi-backups/restic}"

# ---------------------------------------------------------------------------
# Local paths: marker directory under BACKUP_BASE and the directory-based lock.
# ---------------------------------------------------------------------------
OFFSITE_DIR="${BACKUP_BASE}/offsite"
LOCK_DIR="/tmp/awoooi-offsite-backup.lock"

# ---------------------------------------------------------------------------
# rclone tuning. Each variable keeps a non-empty value inherited from the
# environment and otherwise falls back to the default given here.
# ---------------------------------------------------------------------------
: "${RCLONE_TRANSFERS:=2}"
: "${RCLONE_CHECKERS:=4}"
: "${RCLONE_BWLIMIT:=8M}"
: "${OFFSITE_RCLONE_BACKEND:=drive}"
: "${RCLONE_FAST_LIST:=1}"
: "${RCLONE_DRIVE_USE_TRASH:=false}"

# ---------------------------------------------------------------------------
# Sync policy and safety gates consulted by the sync preflight and copy steps.
# ---------------------------------------------------------------------------
: "${OFFSITE_SYNC_DELETE_OLD:=1}"
: "${OFFSITE_SYNC_MAX_LOAD_1:=12}"
: "${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT:=92}"
: "${OFFSITE_SYNC_REQUIRE_ENABLE_MARKER_FOR_FULL:=1}"
: "${OFFSITE_SYNC_ENABLE_MARKER:=${OFFSITE_DIR}/enable-rclone-sync}"
: "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES:=270}"
# Backup start times expressed as minutes after midnight.
: "${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES:=120 480 840 1200}"
: "${OFFSITE_SYNC_NOTIFY_SKIPPED:=0}"
: "${OFFSITE_SYNC_NOTIFY_SUCCESS:=0}"

# ---------------------------------------------------------------------------
# Repository selection: the expected full set, and the set chosen for this run.
# ---------------------------------------------------------------------------
EXPECTED_REPOS_DEFAULT="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes"
REPOS="${OFFSITE_REPOS:-${EXPECTED_REPOS_DEFAULT}}"
|
||
|
||
# Print the command-line help text on stdout.
# Takes no arguments. Callers that report an argument error redirect the
# output to stderr themselves.
usage() {
  printf '%s\n' \
    'Usage:' \
    'sync-offsite-backups.sh --mode status' \
    'sync-offsite-backups.sh --mode dry-run [--repos "ai-artifacts public-routes"]' \
    'sync-offsite-backups.sh --mode sync [--repos "ai-artifacts public-routes"]' \
    '' \
    'Notes:' \
    '- Default provider is rclone, with Google Drive remote root gdrive:awoooi-backups/restic.' \
    '- --mode sync writes /backup/offsite/<provider>-last-success only when all expected' \
    'repos are selected and mirrored successfully.' \
    '- Partial sync writes /backup/offsite/<provider>-partial-last-success and per-repo markers.' \
    '- OFFSITE_SYNC_DELETE_OLD=1 makes sync mode mirror local restic repos and delete old' \
    'remote files after local retention has pruned them.' \
    '- For Google Drive, RCLONE_DRIVE_USE_TRASH=false makes deletes permanent instead of moving old backup packs to Trash.'
}
|
||
|
||
# Parse command-line options into MODE and REPOS.
#   --mode VALUE    run mode (validated separately after parsing)
#   --repos VALUE   whitespace-separated repo names for this run
#   -h | --help     print usage and exit 0
# Anything else, or an option given without its value, prints usage on stderr
# and exits 2.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --mode | --repos)
      # Both options consume a value. Without this check a trailing
      # "--mode" makes "shift 2" fail, and under "set -e" the script would
      # die silently instead of telling the operator what is wrong.
      if [ "$#" -lt 2 ]; then
        echo "Missing value for $1" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --mode) MODE="$2" ;;
        --repos) REPOS="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
|
||
|
||
# Accept only the three supported modes; anything else is a usage error (exit 2).
if [ "${MODE}" != "status" ] && [ "${MODE}" != "dry-run" ] && [ "${MODE}" != "sync" ]; then
  echo "MODE must be status, dry-run, or sync" >&2
  exit 2
fi
|
||
|
||
# Release the directory lock. Best effort: a missing or non-empty LOCK_DIR is
# ignored, errors are silenced, and the function always returns 0.
cleanup() {
  if ! rmdir "${LOCK_DIR}" 2>/dev/null; then
    return 0
  fi
}
|
||
|
||
# Run the given command at reduced scheduling priority.
# Arguments: the command and its arguments.
# Returns:   the exit status of the wrapped command.
# Uses nice level 10 always, and additionally ionice class 2 / level 7 when
# the ionice binary is available.
low_priority() {
  if ! command -v ionice >/dev/null 2>&1; then
    nice -n 10 "$@"
    return
  fi
  ionice -c2 -n7 nice -n 10 "$@"
}
|
||
|
||
# Take the single-instance lock by creating LOCK_DIR.
# On success an EXIT trap is installed so cleanup removes the lock again.
# If the directory cannot be created, logs an error and exits the script with
# status 1; the trap is deliberately not installed in that case, so a losing
# instance never removes a lock it does not own.
require_lock() {
  if mkdir "${LOCK_DIR}" 2>/dev/null; then
    trap cleanup EXIT
    return 0
  fi

  log_error "Offsite sync 已有執行中的 lock: ${LOCK_DIR}"
  exit 1
}
|
||
|
||
# Verify that rclone can be used for the selected provider.
# Globals:  PROVIDER, RCLONE_REMOTE (read); for PROVIDER=b2 also B2_ACCOUNT_ID
#           and B2_APPLICATION_KEY (read) and RCLONE_CONFIG_B2_* (exported).
# Returns:  0 when rclone is installed and the provider is configured,
#           1 otherwise (an error is logged, except when check_b2_config fails,
#           which is expected to report on its own - TODO confirm).
prepare_rclone() {
  local configured_remotes

  if ! command -v rclone >/dev/null 2>&1; then
    log_error "rclone 未安裝,無法執行 offsite copy"
    return 1
  fi

  if [ "${PROVIDER}" = "b2" ]; then
    if ! check_b2_config; then
      return 1
    fi

    # Do not depend on a local rclone.conf: define the "b2" remote purely
    # through environment variables built from the B2 settings.
    export RCLONE_CONFIG_B2_TYPE="b2"
    export RCLONE_CONFIG_B2_ACCOUNT="${B2_ACCOUNT_ID}"
    export RCLONE_CONFIG_B2_KEY="${B2_APPLICATION_KEY}"
    return 0
  fi

  # Capture the remote list before searching it. Piping straight into
  # "grep -q" is unsafe under "set -o pipefail": grep exits on the first match,
  # the producer can then be killed by SIGPIPE, and the pipeline reports a
  # failure even though the remote is configured.
  configured_remotes="$(rclone listremotes 2>/dev/null)" || configured_remotes=""

  if ! grep -Fxq -- "${RCLONE_REMOTE}:" <<< "${configured_remotes}"; then
    log_error "rclone remote 未設定: ${RCLONE_REMOTE}:;請先在 110 執行 configure-offsite-rclone.sh --interactive"
    return 1
  fi

  return 0
}
|
||
|
||
# Print (without trailing newline) the remote path under which every repo is
# stored: "b2:<B2_BUCKET>/restic" for the b2 provider, otherwise the
# configured OFFSITE_REMOTE_ROOT_VALUE.
remote_root() {
  if [ "${PROVIDER}" != "b2" ]; then
    printf '%s' "${OFFSITE_REMOTE_ROOT_VALUE}"
    return
  fi

  printf 'b2:%s/restic' "${B2_BUCKET}"
}
|
||
|
||
# Print the rclone path used for the "can the remote be listed" probe:
# the full remote root for the b2 provider, otherwise just "<RCLONE_REMOTE>:".
remote_status_target() {
  case "${PROVIDER}" in
    b2)
      remote_root
      ;;
    *)
      printf '%s:' "${RCLONE_REMOTE}"
      ;;
  esac
}
|
||
|
||
# Print how many whitespace-separated words the given list contains.
# Arguments: $1 - repo list as a single string (may be empty or omitted).
# Outputs:   the word count on stdout.
# The list is split by the shell exactly as the callers' "for name in ${REPOS}"
# loops split it, so the count matches the number of loop iterations.
repo_count() {
  # Re-use the function's own positional parameters as the word list. This
  # keeps everything function-local instead of leaking a loop variable into
  # the global scope, and "${1-}" tolerates a missing argument under "set -u".
  # shellcheck disable=SC2086 # word splitting is the point here
  set -- ${1-}
  printf '%s\n' "$#"
}
|
||
|
||
# Succeed only when the selected REPOS are exactly the expected full set.
# Globals: REPOS, EXPECTED_REPOS_DEFAULT (read).
# Returns: 0 when every expected repo is selected and the selection has the
#          same number of entries as the expected list; 1 otherwise.
# Comparing only the counts is not enough: a selection with the right number
# of entries but wrong or duplicated names would be treated as a full run and
# would earn the full success marker.
is_full_scope() {
  local expected
  local selected
  local found
  local expected_total=0
  local selected_total=0

  for selected in ${REPOS}; do
    selected_total=$((selected_total + 1))
  done

  for expected in ${EXPECTED_REPOS_DEFAULT}; do
    expected_total=$((expected_total + 1))
    found=0
    for selected in ${REPOS}; do
      if [ "${selected}" = "${expected}" ]; then
        found=1
        break
      fi
    done
    if [ "${found}" -ne 1 ]; then
      return 1
    fi
  done

  # Equal size on top of full coverage rules out extras and duplicates.
  [ "${selected_total}" -eq "${expected_total}" ]
}
|
||
|
||
# Compare two values with awk's "<=" operator, which allows decimal numbers.
# Arguments: $1 - left operand, $2 - right operand.
# Returns:   0 when $1 <= $2, 1 otherwise.
float_le() {
  awk -v left="$1" -v right="$2" 'BEGIN {
    if (left <= right) {
      exit 0
    }
    exit 1
  }'
}
|
||
|
||
# Print the first field of /proc/loadavg (the 1-minute load average).
# Prints 0 instead when awk cannot read the file.
current_load_1() {
  if ! awk '{print $1}' /proc/loadavg 2>/dev/null; then
    echo 0
  fi
}
|
||
|
||
# Print the used-space percentage (integer, no "%") of the filesystem that
# holds BACKUP_BASE, taken from column 5 of the second line of "df -P".
# Prints 100 when the df/awk pipeline reports failure; with "pipefail" active
# that includes df itself failing, so an unreadable path counts as full.
backup_disk_used_pct() {
  if ! df -P "${BACKUP_BASE}" 2>/dev/null \
    | awk 'NR==2 {gsub("%", "", $5); print $5 + 0}'; then
    echo 100
  fi
}
|
||
|
||
# Print the "ps" lines (pid and full command line) of running backup scripts.
# A process is reported when its command line matches one of the known
# /backup/scripts/backup-*.sh names; the line whose pid equals this shell's
# own pid ($$) is never reported. Prints nothing when no such process exists.
active_backup_processes() {
  local own_pid="$$"

  ps -eo pid=,args= \
    | awk -v self="${own_pid}" '
        $1 != self && /\/backup\/scripts\/backup-(all|awoooi|awoooi-frequent|gitea|harbor|momo|langfuse|monitoring|signoz|open-webui|clawbot|sentry|ai-artifacts|public-routes|configs)\.sh/
      '
}
|
||
|
||
# Print the number of minutes from now until the next scheduled backup start.
# Globals: OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES (read) - whitespace-separated
#          start times, each given as minutes after midnight.
# Outputs: an integer in 1..1440; 1440 when the list is empty.
# A start time equal to the current minute is treated as already begun, so it
# counts as a full day away and the following entry decides the result.
minutes_until_next_backup_schedule() {
  local clock
  local now
  local sched
  local delta
  local best=1440

  # Read hour and minute with one date call. Two separate calls can straddle
  # an hour rollover (hour read at 08:59, minute read at 09:00) and yield a
  # time that is an hour off.
  clock="$(date +%H:%M)"
  # Force base 10: zero-padded values such as "08" are not valid octal.
  now=$((10#${clock%%:*} * 60 + 10#${clock##*:}))

  for sched in ${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES}; do
    # Base 10 here as well, so a zero-padded entry like "0480" means 480.
    delta=$((10#${sched} - now))
    if [ "${delta}" -le 0 ]; then
      delta=$((delta + 1440))
    fi
    if [ "${delta}" -lt "${best}" ]; then
      best="${delta}"
    fi
  done

  echo "${best}"
}
|
||
|
||
# Decide whether a sync run may start right now.
# Returns 0 immediately for any mode other than "sync". For sync mode returns
# 1 (after logging the reason) when one of these gates fails, else 0:
#   full scope only: the enable marker file is required but missing,
#                    a backup script is currently running,
#                    the next scheduled backup is closer than the runway;
#   always:          1-minute load above OFFSITE_SYNC_MAX_LOAD_1,
#                    backup disk usage above the configured percentage.
resource_preflight() {
  local full_scope=0
  local running_backups
  local runway_left
  local load_now
  local disk_used

  if [ "${MODE}" != "sync" ]; then
    return 0
  fi

  if is_full_scope; then
    full_scope=1
  fi

  if [ "${full_scope}" -eq 1 ]; then
    # Gate 1: a full mirror must be explicitly enabled by a marker file.
    if [ "${OFFSITE_SYNC_REQUIRE_ENABLE_MARKER_FOR_FULL}" = "1" ] \
      && [ ! -f "${OFFSITE_SYNC_ENABLE_MARKER}" ]; then
      log_error "Full offsite sync 需要明確啟用 marker: ${OFFSITE_SYNC_ENABLE_MARKER}"
      return 1
    fi

    # Gate 2: never run a full mirror next to a running backup script.
    running_backups="$(active_backup_processes || true)"
    if [ -n "${running_backups}" ]; then
      log_warn "略過 full offsite sync:偵測到正在執行的備份程序"
      # Record the offending process lines in the backup log only.
      printf '%s\n' "${running_backups}" | tee -a "${BACKUP_LOG_DIR}/backup.log" >/dev/null
      return 1
    fi

    # Gate 3: require enough time before the next scheduled backup.
    runway_left="$(minutes_until_next_backup_schedule)"
    if [ "${runway_left}" -lt "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}" ]; then
      log_warn "略過 full offsite sync:距離下一次備份排程 ${runway_left} 分鐘,低於 runway ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES} 分鐘"
      return 1
    fi
  fi

  # Gate 4: host load.
  load_now="$(current_load_1)"
  if ! float_le "${load_now}" "${OFFSITE_SYNC_MAX_LOAD_1}"; then
    log_warn "略過 offsite sync:1m load=${load_now} 高於上限 ${OFFSITE_SYNC_MAX_LOAD_1}"
    return 1
  fi

  # Gate 5: backup disk usage.
  disk_used="$(backup_disk_used_pct)"
  if [ "${disk_used}" -gt "${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}" ]; then
    log_warn "略過 offsite sync:${BACKUP_BASE} 使用率 ${disk_used}% 高於上限 ${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}%"
    return 1
  fi

  log_info "Offsite sync resource preflight OK load_1=${load_now}/${OFFSITE_SYNC_MAX_LOAD_1} backup_disk_used=${disk_used}%/${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}%"
}
|
||
|
||
# Write a success marker file, replacing any previous content.
# Arguments: $1 - marker file path, $2 - scope label stored in the file.
# Globals:   OFFSITE_DIR (created with mode 750 if missing), PROVIDER, MODE,
#            REPOS (read).
# The file holds five key=value lines: timestamp (epoch seconds), provider,
# mode, scope and repos.
write_marker() {
  local marker_file="$1"
  local scope_label="$2"
  local epoch_now

  epoch_now=$(date +%s)
  install -d -m 750 "${OFFSITE_DIR}"

  printf '%s\n' \
    "timestamp=${epoch_now}" \
    "provider=${PROVIDER}" \
    "mode=${MODE}" \
    "scope=${scope_label}" \
    "repos=${REPOS}" \
    > "${marker_file}"
}
|
||
|
||
# Probe whether the offsite remote can be listed.
# Runs "rclone lsd" at low priority against remote_status_target with short
# timeouts and a single retry. All rclone output goes to
# /tmp/awoooi-offsite-rclone-lsd.log rather than the console.
# Returns: 0 when the listing succeeds, 1 otherwise.
status_remote() {
  local -a lsd_opts=(
    --max-depth 1
    --contimeout 15s
    --timeout 30s
    --retries 1
  )

  log_info "檢查 offsite remote 可列出狀態 provider=${PROVIDER}(不輸出 credential)"

  if low_priority rclone lsd "$(remote_status_target)" "${lsd_opts[@]}" \
    >/tmp/awoooi-offsite-rclone-lsd.log 2>&1; then
    return 0
  fi
  return 1
}
|
||
|
||
# Copy or mirror one local restic repo to the offsite remote.
# Arguments: $1 - repo name (directory under BACKUP_BASE and under the remote root).
# Globals:   BACKUP_BASE, BACKUP_LOG_DIR, MODE, DRY_RUN_ARGS,
#            OFFSITE_SYNC_DELETE_OLD, OFFSITE_RCLONE_BACKEND, RCLONE_FAST_LIST,
#            RCLONE_DRIVE_USE_TRASH, RCLONE_TRANSFERS, RCLONE_CHECKERS,
#            RCLONE_BWLIMIT (all read).
# Outputs:   rclone output is appended to ${BACKUP_LOG_DIR}/offsite-sync.log.
# Returns:   1 when the local repo has no data/ directory, otherwise the exit
#            status of rclone.
copy_repo() {
  local name="$1"
  local local_repo="${BACKUP_BASE}/${name}"
  local remote_repo
  local rclone_verb="copy"
  local -a rclone_args=()

  remote_repo="$(remote_root)/${name}"

  if [ ! -d "${local_repo}/data" ]; then
    log_error "Restic repo 不存在或未初始化: ${local_repo}"
    return 1
  fi

  # "sync" makes the remote mirror the local repo, deleting remote files that
  # no longer exist locally; "copy" only adds and updates.
  if [ "${OFFSITE_SYNC_DELETE_OLD}" = "1" ] && [ "${MODE}" != "status" ]; then
    rclone_verb="sync"
  fi

  # Build a single argument vector. The ${arr[@]+...} form expands to nothing
  # for an empty or unset array; a plain "${arr[@]}" would abort under
  # "set -u" on bash releases older than 4.4, and DRY_RUN_ARGS is empty on
  # every real sync run.
  rclone_args=("${rclone_verb}" "${local_repo}" "${remote_repo}")
  rclone_args+=(${DRY_RUN_ARGS[@]+"${DRY_RUN_ARGS[@]}"})
  if [ "${RCLONE_FAST_LIST}" = "1" ]; then
    rclone_args+=(--fast-list)
  fi
  if [ "${OFFSITE_RCLONE_BACKEND}" = "drive" ]; then
    rclone_args+=("--drive-use-trash=${RCLONE_DRIVE_USE_TRASH}")
  fi
  rclone_args+=(
    # Lock files under locks/ are never transferred.
    --exclude 'locks/**'
    --transfers "${RCLONE_TRANSFERS}"
    --checkers "${RCLONE_CHECKERS}"
    --bwlimit "${RCLONE_BWLIMIT}"
    --contimeout 15s
    --timeout 5m
    --retries 2
    --stats 30s
    --stats-one-line
  )

  log_info "Offsite ${MODE}: ${name} -> ${remote_repo} (rclone=${rclone_verb}, delete_old=${OFFSITE_SYNC_DELETE_OLD}, backend=${OFFSITE_RCLONE_BACKEND}, drive_trash=${RCLONE_DRIVE_USE_TRASH})"
  low_priority rclone "${rclone_args[@]}" \
    >> "${BACKUP_LOG_DIR}/offsite-sync.log" 2>&1
}
|
||
|
||
# Entry point: run the selected mode over the selected repos.
# Globals: MODE, REPOS, PROVIDER, SERVICE, OFFSITE_DIR and the OFFSITE_SYNC_* /
#          RCLONE_* settings (read); DRY_RUN_ARGS (set in dry-run mode).
# Flow:    reject an empty repo list -> take the lock -> resource preflight ->
#          prepare rclone -> probe the remote -> per-repo check (status) or
#          copy (dry-run/sync) -> write success markers (sync only) -> report.
# Returns: 2 when no repo is selected, otherwise the number of failed repos
#          (0 = success). Exits 1 directly when the preflight fails or when
#          rclone is unusable in dry-run/sync mode.
main() {
  local start_time
  local duration
  local name
  local failed=0
  local checked=0
  local scope="partial"
  local remote_prepared=0
  start_time=$(date +%s)

  # With nothing selected the loops below would run zero times, report
  # success, and in sync mode write a success marker although nothing was
  # copied. Treat that as a usage error before touching the lock or the disk.
  if [ "$(repo_count "${REPOS}")" -eq 0 ]; then
    log_error "未指定任何 repo,無法執行 offsite ${MODE}"
    return 2
  fi

  require_lock
  install -d -m 750 "${OFFSITE_DIR}"

  log_info "========== Offsite backup ${MODE} 開始 =========="
  log_info "provider=${PROVIDER} remote_root=$(remote_root) repos=$(repo_count "${REPOS}") bwlimit=${RCLONE_BWLIMIT} transfers=${RCLONE_TRANSFERS} max_load_1=${OFFSITE_SYNC_MAX_LOAD_1} full_runway_minutes=${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES} delete_old=${OFFSITE_SYNC_DELETE_OLD} backend=${OFFSITE_RCLONE_BACKEND} drive_trash=${RCLONE_DRIVE_USE_TRASH}"

  # A failed preflight skips the run; notifying about it is opt-in.
  resource_preflight || {
    if [ "${MODE}" = "sync" ] && [ "${OFFSITE_SYNC_NOTIFY_SKIPPED}" = "1" ]; then
      notify_clawbot "warning" "${SERVICE}" "Offsite backup sync 略過:主機負載或前置條件未達安全門檻" 0
    fi
    exit 1
  }

  # Status mode tolerates an unconfigured provider and still checks the local
  # repos; dry-run and sync cannot proceed without rclone.
  if prepare_rclone; then
    remote_prepared=1
  elif [ "${MODE}" != "status" ]; then
    notify_clawbot "warning" "${SERVICE}" "Offsite rclone provider 未配置或不可用" 0
    exit 1
  else
    log_warn "Offsite provider 尚未配置;status 模式只檢查本地 repo,配置缺口交由 backup health metric 告警"
  fi

  # The listing probe is informational only: a failure is logged as a warning
  # and does not count as a failed repo.
  if [ "${remote_prepared}" -eq 1 ]; then
    if status_remote; then
      log_success "Offsite remote 可列出"
    else
      log_warn "Offsite remote 尚不可列出或目前為空;copy 模式仍可建立路徑"
    fi
  fi

  if [ "${MODE}" = "status" ]; then
    for name in ${REPOS}; do
      checked=$((checked + 1))
      if [ -d "${BACKUP_BASE}/${name}/data" ]; then
        log_success "本地 repo 存在: ${name}"
      else
        log_error "本地 repo 缺失: ${name}"
        failed=$((failed + 1))
      fi
    done
  else
    if [ "${MODE}" = "dry-run" ]; then
      DRY_RUN_ARGS=(--dry-run)
    fi
    for name in ${REPOS}; do
      checked=$((checked + 1))
      if copy_repo "${name}"; then
        log_success "Offsite ${MODE} 成功: ${name}"
        # Per-repo markers are written only for real sync runs.
        if [ "${MODE}" = "sync" ]; then
          write_marker "${OFFSITE_DIR}/${PROVIDER}-${name}.last_success" "repo"
        fi
      else
        log_error "Offsite ${MODE} 失敗: ${name}"
        failed=$((failed + 1))
      fi
    done
  fi

  if is_full_scope; then
    scope="full"
  fi

  duration=$(($(date +%s) - start_time))
  if [ "${failed}" -eq 0 ]; then
    if [ "${MODE}" = "sync" ]; then
      # The provider-wide marker is reserved for a fully successful full-scope
      # run; anything smaller gets the partial marker.
      if [ "${scope}" = "full" ]; then
        write_marker "${OFFSITE_DIR}/${PROVIDER}-last-success" "full"
      else
        write_marker "${OFFSITE_DIR}/${PROVIDER}-partial-last-success" "partial"
      fi
    fi
    log_success "========== Offsite backup ${MODE} 完成 (${duration}s, ${checked}/${checked}) =========="
    if [ "${MODE}" != "status" ] && [ "${OFFSITE_SYNC_NOTIFY_SUCCESS}" = "1" ]; then
      notify_clawbot "success" "${SERVICE}" "Offsite backup ${MODE} 完成 scope=${scope} (${checked}/${checked})" "${duration}"
    fi
  else
    log_error "========== Offsite backup ${MODE} 失敗 ${failed}/${checked} (${duration}s) =========="
    notify_clawbot "failed" "${SERVICE}" "Offsite backup ${MODE} 失敗 ${failed}/${checked}" "${duration}"
  fi

  return "${failed}"
}
|
||
|
||
# Run the controller. The option loop above has already shifted away every
# argument it accepted, so "$@" is empty by the time main is called.
main "$@"
|