Files
awoooi/scripts/backup/sync-offsite-backups.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

415 lines
14 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# WOOO AIOps - Offsite backup copy controller
# 2026-05-06 ogt + Codex: 將離機備份從口頭缺口變成可審計腳本。
#
# 模式:
# --mode status 只檢查本地 repo、rclone 與離機遠端可列出;不寫 success marker。
# --mode dry-run 對指定 repo 做 rclone dry-run,不寫 success marker。
# --mode sync 對指定 repo 做 rclone mirror,全部成功才寫 marker。
#
# 安全:
# - 不輸出 provider/rclone credential。
# - 預設只跑 status,不會無意間上傳 80GB+。
# - latest-only 策略下,sync 模式使用 rclone sync 鏡像本地 repo,
# 成功後刪除 Google Drive 上已不存在於本地 repo 的舊檔。
# - 子備份腳本仍不得直接刪遠端;本腳本是唯一 offsite 刪舊入口。
# - 不複製 restic locks。
# =============================================================================
# Strict mode: abort on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Load shared helpers from the script's own directory. The log_* functions,
# notify_clawbot, check_b2_config, BACKUP_BASE, BACKUP_LOG_DIR and the B2_*
# values used below are not defined in this file, so they presumably come from
# common.sh -- confirm there.
source "$(dirname "$0")/common.sh"
# Service label passed to notify_clawbot.
SERVICE="offsite-backup"
# Run mode; overridden by --mode. The default only inspects state.
MODE="status"
# Offsite provider. The value "b2" selects the B2 code paths; anything else
# uses the named rclone remote.
PROVIDER="${OFFSITE_PROVIDER:-rclone}"
# Name of the rclone remote (without the trailing colon).
RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
# Remote path under which each repo gets its own sub-directory.
OFFSITE_REMOTE_ROOT_VALUE="${OFFSITE_REMOTE_ROOT:-${RCLONE_REMOTE}:awoooi-backups/restic}"
# Local directory holding the success markers and the enable marker.
OFFSITE_DIR="${BACKUP_BASE}/offsite"
# Directory used as a mutex: require_lock creates it with mkdir, cleanup removes it.
LOCK_DIR="/tmp/awoooi-offsite-backup.lock"
# Values forwarded to rclone as --transfers / --checkers / --bwlimit.
RCLONE_TRANSFERS="${RCLONE_TRANSFERS:-2}"
RCLONE_CHECKERS="${RCLONE_CHECKERS:-4}"
RCLONE_BWLIMIT="${RCLONE_BWLIMIT:-8M}"
# When "drive", copy_repo adds --drive-use-trash=<RCLONE_DRIVE_USE_TRASH>.
OFFSITE_RCLONE_BACKEND="${OFFSITE_RCLONE_BACKEND:-drive}"
# "1" makes copy_repo pass --fast-list to rclone.
RCLONE_FAST_LIST="${RCLONE_FAST_LIST:-1}"
RCLONE_DRIVE_USE_TRASH="${RCLONE_DRIVE_USE_TRASH:-false}"
# "1" makes copy_repo use "rclone sync" (mirror, deletes remote leftovers)
# instead of "rclone copy".
OFFSITE_SYNC_DELETE_OLD="${OFFSITE_SYNC_DELETE_OLD:-1}"
# resource_preflight thresholds: maximum 1-minute load average and maximum
# used percentage of the filesystem holding BACKUP_BASE.
OFFSITE_SYNC_MAX_LOAD_1="${OFFSITE_SYNC_MAX_LOAD_1:-12}"
OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT="${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT:-92}"
# "1" means a full-scope sync is refused unless the enable marker file exists.
OFFSITE_SYNC_REQUIRE_ENABLE_MARKER_FOR_FULL="${OFFSITE_SYNC_REQUIRE_ENABLE_MARKER_FOR_FULL:-1}"
OFFSITE_SYNC_ENABLE_MARKER="${OFFSITE_SYNC_ENABLE_MARKER:-${OFFSITE_DIR}/enable-rclone-sync}"
# A full-scope sync is skipped when the next scheduled backup is closer than
# this many minutes.
OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES="${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES:-270}"
# Backup start times as minutes since midnight (defaults are 02:00, 08:00,
# 14:00 and 20:00), compared against hour*60+minute of the current time.
OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES="${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES:-120 480 840 1200}"
# "1" enables notify_clawbot messages for skipped / successful runs.
OFFSITE_SYNC_NOTIFY_SKIPPED="${OFFSITE_SYNC_NOTIFY_SKIPPED:-0}"
OFFSITE_SYNC_NOTIFY_SUCCESS="${OFFSITE_SYNC_NOTIFY_SUCCESS:-0}"
# Repo list that defines "full scope"; REPOS is the whitespace-separated list
# actually processed (overridable via OFFSITE_REPOS or --repos).
EXPECTED_REPOS_DEFAULT="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes"
REPOS="${OFFSITE_REPOS:-${EXPECTED_REPOS_DEFAULT}}"
# Extra rclone arguments; main sets this to (--dry-run) in dry-run mode.
DRY_RUN_ARGS=()
# Print the command-line help text to stdout, one line per argument.
usage() {
    printf '%s\n' \
        'Usage:' \
        'sync-offsite-backups.sh --mode status' \
        'sync-offsite-backups.sh --mode dry-run [--repos "ai-artifacts public-routes"]' \
        'sync-offsite-backups.sh --mode sync [--repos "ai-artifacts public-routes"]' \
        'Notes:' \
        '- Default provider is rclone, with Google Drive remote root gdrive:awoooi-backups/restic.' \
        '- --mode sync writes /backup/offsite/<provider>-last-success only when all expected' \
        'repos are selected and mirrored successfully.' \
        '- Partial sync writes /backup/offsite/<provider>-partial-last-success and per-repo markers.' \
        '- OFFSITE_SYNC_DELETE_OLD=1 makes sync mode mirror local restic repos and delete old' \
        'remote files after local retention has pruned them.' \
        '- For Google Drive, RCLONE_DRIVE_USE_TRASH=false makes deletes permanent instead of moving old backup packs to Trash.'
}
# Parse command-line options into MODE and REPOS, then validate them.
#   --mode <status|dry-run|sync>   run mode
#   --repos "<name> <name> ..."    whitespace-separated repo list
#   -h | --help                    print usage and exit 0
# Every usage error exits with status 2 and a message on stderr.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --mode|--repos)
            # Check for the value explicitly: "shift 2" with a single remaining
            # argument fails, and under "set -e" that used to terminate the
            # script with status 1 and no diagnostic at all.
            if [ "$#" -lt 2 ]; then
                echo "Missing value for $1" >&2
                usage >&2
                exit 2
            fi
            if [ "$1" = "--mode" ]; then
                MODE="$2"
            else
                REPOS="$2"
            fi
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
    esac
done
case "${MODE}" in
    status|dry-run|sync) ;;
    *)
        echo "MODE must be status, dry-run, or sync" >&2
        exit 2
        ;;
esac
# A blank repo list would make the run iterate over nothing and still report
# success, so reject it up front.
if [ -z "${REPOS//[[:space:]]/}" ]; then
    echo "Repo list is empty; pass --repos or set OFFSITE_REPOS" >&2
    exit 2
fi
# EXIT-trap handler: drop the lock directory. Always returns 0, even when the
# directory is already gone or cannot be removed.
cleanup() {
    if ! rmdir "${LOCK_DIR}" 2>/dev/null; then
        : # best effort; nothing to report
    fi
}
# Run the given command at reduced priority: always under "nice -n 10", and
# additionally under "ionice -c2 -n7" when ionice is available.
# Arguments: the command and its arguments.
# Returns:   the exit status of the wrapped command line.
low_priority() {
    local -a launcher=(nice -n 10)
    if command -v ionice >/dev/null 2>&1; then
        launcher=(ionice -c2 -n7 "${launcher[@]}")
    fi
    "${launcher[@]}" "$@"
}
# Take the run lock by creating LOCK_DIR (mkdir succeeds for one caller only).
# Exits the script with status 1 when the directory already exists; otherwise
# registers cleanup as the EXIT trap so the lock is released on exit.
require_lock() {
    mkdir "${LOCK_DIR}" 2>/dev/null || {
        log_error "Offsite sync 已有執行中的 lock: ${LOCK_DIR}"
        exit 1
    }
    trap cleanup EXIT
}
# Verify that rclone is usable for the selected provider.
# Globals:  PROVIDER, RCLONE_REMOTE (read); for PROVIDER=b2 also B2_ACCOUNT_ID
#           and B2_APPLICATION_KEY (read) and RCLONE_CONFIG_B2_* (exported).
# Returns:  0 when rclone is installed and the provider is configured, else 1
#           (after logging the reason, or after check_b2_config failed).
prepare_rclone() {
    local configured_remotes

    if ! command -v rclone >/dev/null 2>&1; then
        log_error "rclone 未安裝,無法執行 offsite copy"
        return 1
    fi
    if [ "${PROVIDER}" = "b2" ]; then
        if ! check_b2_config; then
            return 1
        fi
        # Do not rely on a local rclone.conf: define the "b2" remote through
        # environment variables built from the B2 credentials.
        export RCLONE_CONFIG_B2_TYPE="b2"
        export RCLONE_CONFIG_B2_ACCOUNT="${B2_ACCOUNT_ID}"
        export RCLONE_CONFIG_B2_KEY="${B2_APPLICATION_KEY}"
        return 0
    fi
    # Capture the list before matching. Piping straight into "grep -q" lets grep
    # exit on the first hit; if rclone is then killed by SIGPIPE, "pipefail"
    # turns a successful match into a failed pipeline and a configured remote
    # would be reported as missing.
    configured_remotes="$(rclone listremotes 2>/dev/null)" || configured_remotes=""
    if ! grep -Fxq -- "${RCLONE_REMOTE}:" <<<"${configured_remotes}"; then
        log_error "rclone remote 未設定: ${RCLONE_REMOTE}:;請先在 110 執行 configure-offsite-rclone.sh --interactive"
        return 1
    fi
    return 0
}
# Print (without trailing newline) the remote path that contains all repos:
# "b2:<B2_BUCKET>/restic" for the b2 provider, OFFSITE_REMOTE_ROOT_VALUE otherwise.
remote_root() {
    case "${PROVIDER}" in
        b2)
            printf 'b2:%s/restic' "${B2_BUCKET}"
            ;;
        *)
            printf '%s' "${OFFSITE_REMOTE_ROOT_VALUE}"
            ;;
    esac
}
# Print the target that status_remote lists: the bare "<RCLONE_REMOTE>:" for
# rclone-configured remotes, or the output of remote_root for the b2 provider.
remote_status_target() {
    if [ "${PROVIDER}" != "b2" ]; then
        printf '%s:' "${RCLONE_REMOTE}"
    else
        remote_root
    fi
}
# Print the number of whitespace-separated words in $1.
# The string is split with "read -a" instead of an unquoted expansion, so a
# word such as "*" is counted as one word rather than being expanded to the
# files in the current directory, and no loop variable leaks into the caller.
# Arguments: $1 - whitespace-separated list (may be empty)
# Outputs:   the word count on stdout
repo_count() {
    local -a words=()
    # -d '' reads up to end of input (all lines); read then reports EOF with a
    # non-zero status even though the array has been filled.
    read -r -d '' -a words <<<"${1-}" || true
    printf '%s\n' "${#words[@]}"
}
# Succeed when REPOS selects every repo named in EXPECTED_REPOS_DEFAULT.
# Membership is checked name by name (order and extra names do not matter).
# Comparing only the number of words would accept any list of the same length,
# including duplicates or unknown repos, as a full run.
# Globals: REPOS, EXPECTED_REPOS_DEFAULT (read)
# Returns: 0 for full scope, 1 otherwise
is_full_scope() {
    local -a expected=()
    local -a selected=()
    local wanted candidate present

    # read reports EOF with a non-zero status even though the arrays are filled.
    read -r -d '' -a expected <<<"${EXPECTED_REPOS_DEFAULT}" || true
    read -r -d '' -a selected <<<"${REPOS}" || true

    for wanted in "${expected[@]}"; do
        present=0
        for candidate in "${selected[@]}"; do
            if [ "${candidate}" = "${wanted}" ]; then
                present=1
                break
            fi
        done
        if [ "${present}" -ne 1 ]; then
            return 1
        fi
    done
    return 0
}
# Compare two values with awk's "<=" operator.
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 when left <= right, 1 otherwise
float_le() {
    local lhs="$1"
    local rhs="$2"
    awk -v left="${lhs}" -v right="${rhs}" 'BEGIN { if (left <= right) exit 0; exit 1 }'
}
# Print the first field of /proc/loadavg; print 0 instead when awk cannot
# read that file.
current_load_1() {
    if ! awk '{ print $1 }' /proc/loadavg 2>/dev/null; then
        echo 0
    fi
}
# Print the used percentage (integer, no "%" sign) of the filesystem holding
# BACKUP_BASE, taken from column 5 of the second line of "df -P".
# Fail-safe: prints 100 when df fails or its output cannot be parsed, so that a
# threshold check treats "unknown" as "full". df's status is checked directly
# so the fallback works whether or not the caller enabled "pipefail".
# Globals: BACKUP_BASE (read)
# Outputs: one integer on stdout
backup_disk_used_pct() {
    local df_report
    local used_pct

    if ! df_report="$(df -P "${BACKUP_BASE}" 2>/dev/null)"; then
        echo 100
        return 0
    fi
    used_pct="$(awk 'NR==2 {gsub("%", "", $5); print $5 + 0}' <<<"${df_report}")"
    case "${used_pct}" in
        ''|*[!0-9]*)
            # Missing data row or a non-integer value: assume the worst.
            echo 100
            ;;
        *)
            echo "${used_pct}"
            ;;
    esac
}
# List running processes (as "pid args" lines from ps) whose command line
# contains one of the /backup/scripts/backup-<name>.sh paths matched below.
# The line whose pid equals this shell's $$ is never reported.
active_backup_processes() {
    ps -eo pid=,args= | awk -v self="$$" '
        $1 != self && /\/backup\/scripts\/backup-(all|awoooi|awoooi-frequent|gitea|harbor|momo|langfuse|monitoring|signoz|open-webui|clawbot|sentry|ai-artifacts|public-routes|configs)\.sh/ {
            print $0
        }
    '
}
# Print the number of minutes from now until the nearest upcoming entry of
# OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES (minutes since midnight, local time).
# An entry equal to the current minute counts as 1440 (its next occurrence is
# tomorrow); with an empty schedule the result is 1440.
# The clock is read with a single date call: reading hour and minute separately
# can straddle an hour rollover (hour taken at hh:59, minute at hh+1:00) and
# produce a time that is an hour off.
# Globals: OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES (read)
# Outputs: minutes (1..1440) on stdout
minutes_until_next_backup_schedule() {
    local clock
    local hour
    local minute
    local now_minutes
    local slot
    local gap
    local nearest=1440

    clock="$(date +%H:%M)"
    hour="${clock%%:*}"
    minute="${clock##*:}"
    # 10# forces base 10 so "08" and "09" are not rejected as invalid octal.
    now_minutes=$((10#${hour} * 60 + 10#${minute}))
    # Intentionally unquoted: the schedule is a whitespace-separated list.
    for slot in ${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES}; do
        gap=$((slot - now_minutes))
        if [ "${gap}" -le 0 ]; then
            gap=$((gap + 1440))
        fi
        if [ "${gap}" -lt "${nearest}" ]; then
            nearest="${gap}"
        fi
    done
    echo "${nearest}"
}
# Decide whether a sync may start right now. Modes other than "sync" always pass.
# Checks, in order (the first failing one logs its reason and returns 1):
#   full scope only: enable-marker file present (when required), no backup
#                    script currently running, enough minutes before the next
#                    scheduled backup;
#   always:          1-minute load within OFFSITE_SYNC_MAX_LOAD_1, backup disk
#                    usage within OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT.
# Returns: 0 when the run may proceed, 1 when it must be skipped.
resource_preflight() {
    local full_scope=0
    local load_now
    local disk_used
    local running_backups
    local runway_left

    if [ "${MODE}" != "sync" ]; then
        return 0
    fi

    if is_full_scope; then
        full_scope=1
    fi

    if [ "${full_scope}" -eq 1 ]; then
        if [ "${OFFSITE_SYNC_REQUIRE_ENABLE_MARKER_FOR_FULL}" = "1" ] && [ ! -f "${OFFSITE_SYNC_ENABLE_MARKER}" ]; then
            log_error "Full offsite sync 需要明確啟用 marker: ${OFFSITE_SYNC_ENABLE_MARKER}"
            return 1
        fi

        running_backups="$(active_backup_processes || true)"
        if [ -n "${running_backups}" ]; then
            log_warn "略過 full offsite sync偵測到正在執行的備份程序"
            # Record the offending processes in the backup log only.
            printf '%s\n' "${running_backups}" | tee -a "${BACKUP_LOG_DIR}/backup.log" >/dev/null
            return 1
        fi

        runway_left="$(minutes_until_next_backup_schedule)"
        if [ "${runway_left}" -lt "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}" ]; then
            log_warn "略過 full offsite sync距離下一次備份排程 ${runway_left} 分鐘,低於 runway ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES} 分鐘"
            return 1
        fi
    fi

    load_now="$(current_load_1)"
    float_le "${load_now}" "${OFFSITE_SYNC_MAX_LOAD_1}" || {
        log_warn "略過 offsite sync1m load=${load_now} 高於上限 ${OFFSITE_SYNC_MAX_LOAD_1}"
        return 1
    }

    disk_used="$(backup_disk_used_pct)"
    if [ "${disk_used}" -gt "${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}" ]; then
        log_warn "略過 offsite sync${BACKUP_BASE} 使用率 ${disk_used}% 高於上限 ${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}%"
        return 1
    fi

    log_info "Offsite sync resource preflight OK load_1=${load_now}/${OFFSITE_SYNC_MAX_LOAD_1} backup_disk_used=${disk_used}%/${OFFSITE_SYNC_MAX_BACKUP_DISK_USED_PCT}%"
}
# Write a key=value marker file describing the current run.
# Arguments: $1 - marker file path, $2 - scope label stored in the file
# Globals:   OFFSITE_DIR (created with mode 750 if missing), PROVIDER, MODE, REPOS
# Outputs:   overwrites $1 with timestamp (epoch seconds), provider, mode,
#            scope and repos lines
write_marker() {
    local marker_file="$1"
    local marker_scope="$2"
    local now_epoch

    now_epoch="$(date +%s)"
    install -d -m 750 "${OFFSITE_DIR}"
    printf '%s\n' \
        "timestamp=${now_epoch}" \
        "provider=${PROVIDER}" \
        "mode=${MODE}" \
        "scope=${marker_scope}" \
        "repos=${REPOS}" \
        > "${marker_file}"
}
# Probe the remote with a short, low-priority "rclone lsd".
# All rclone output goes to /tmp/awoooi-offsite-rclone-lsd.log, never to the
# terminal. NOTE(review): that is a fixed, predictable path in /tmp.
# Returns: 0 when the listing succeeded, 1 otherwise.
status_remote() {
    local -a probe

    log_info "檢查 offsite remote 可列出狀態 provider=${PROVIDER}(不輸出 credential"
    probe=(
        rclone lsd "$(remote_status_target)"
        --max-depth 1
        --contimeout 15s
        --timeout 30s
        --retries 1
    )
    if ! low_priority "${probe[@]}" >/tmp/awoooi-offsite-rclone-lsd.log 2>&1; then
        return 1
    fi
}
# Transfer one local restic repo to "<remote_root>/<name>" with rclone.
# Uses "rclone sync" (mirror, removes remote files missing locally) when
# OFFSITE_SYNC_DELETE_OLD=1 and MODE is not "status", otherwise "rclone copy".
# The locks/ directory is always excluded. rclone output is appended to
# ${BACKUP_LOG_DIR}/offsite-sync.log.
# Arguments: $1 - repo name (sub-directory of BACKUP_BASE)
# Globals:   DRY_RUN_ARGS is inserted right after the source/destination pair
# Returns:   1 when ${BACKUP_BASE}/<name>/data is missing, else rclone's status
copy_repo() {
    local name="$1"
    local source_dir="${BACKUP_BASE}/${name}"
    local target
    local verb
    local -a argv

    target="$(remote_root)/${name}"

    [ -d "${source_dir}/data" ] || {
        log_error "Restic repo 不存在或未初始化: ${source_dir}"
        return 1
    }

    verb="copy"
    if [ "${OFFSITE_SYNC_DELETE_OLD}" = "1" ] && [ "${MODE}" != "status" ]; then
        verb="sync"
    fi

    # Assemble the whole command line once, in the order rclone receives it.
    argv=(rclone "${verb}" "${source_dir}" "${target}" "${DRY_RUN_ARGS[@]}")
    if [ "${RCLONE_FAST_LIST}" = "1" ]; then
        argv+=(--fast-list)
    fi
    if [ "${OFFSITE_RCLONE_BACKEND}" = "drive" ]; then
        argv+=("--drive-use-trash=${RCLONE_DRIVE_USE_TRASH}")
    fi
    argv+=(
        --exclude 'locks/**'
        --transfers "${RCLONE_TRANSFERS}"
        --checkers "${RCLONE_CHECKERS}"
        --bwlimit "${RCLONE_BWLIMIT}"
        --contimeout 15s
        --timeout 5m
        --retries 2
        --stats 30s
        --stats-one-line
    )

    log_info "Offsite ${MODE}: ${name} -> ${target} (rclone=${verb}, delete_old=${OFFSITE_SYNC_DELETE_OLD}, backend=${OFFSITE_RCLONE_BACKEND}, drive_trash=${RCLONE_DRIVE_USE_TRASH})"
    low_priority "${argv[@]}" >> "${BACKUP_LOG_DIR}/offsite-sync.log" 2>&1
}
# Run one offsite pass in the mode given by MODE.
#   status  - check that each repo in REPOS has a local data/ directory
#   dry-run - run copy_repo for each repo with --dry-run
#   sync    - run copy_repo for each repo; write a per-repo marker after each
#             success and, when every repo succeeded, the provider-wide
#             "last-success" (full scope) or "partial-last-success" marker
# Exits 1 when the lock is held, when resource_preflight refuses the run, or
# when rclone is unusable outside status mode.
# Returns: 0 on success, the number of failed repos otherwise, or 2 when REPOS
#          names no repo at all.
main() {
    local start_time
    local failed=0
    local checked=0
    local scope="partial"
    local remote_prepared=0
    local duration
    # Loop variable; declared local so it does not leak into the global scope.
    local name

    # A blank repo list would loop zero times, count as "0 failures" and, in
    # sync mode, write a success marker without having copied anything.
    if [ -z "${REPOS//[[:space:]]/}" ]; then
        log_error "Offsite backup ${MODE}: repo list is empty, nothing to do"
        return 2
    fi

    start_time=$(date +%s)
    require_lock
    install -d -m 750 "${OFFSITE_DIR}"
    log_info "========== Offsite backup ${MODE} 開始 =========="
    log_info "provider=${PROVIDER} remote_root=$(remote_root) repos=$(repo_count "${REPOS}") bwlimit=${RCLONE_BWLIMIT} transfers=${RCLONE_TRANSFERS} max_load_1=${OFFSITE_SYNC_MAX_LOAD_1} full_runway_minutes=${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES} delete_old=${OFFSITE_SYNC_DELETE_OLD} backend=${OFFSITE_RCLONE_BACKEND} drive_trash=${RCLONE_DRIVE_USE_TRASH}"

    # A refused preflight ends the run with status 1, optionally with a notice.
    resource_preflight || {
        if [ "${MODE}" = "sync" ] && [ "${OFFSITE_SYNC_NOTIFY_SKIPPED}" = "1" ]; then
            notify_clawbot "warning" "${SERVICE}" "Offsite backup sync 略過:主機負載或前置條件未達安全門檻" 0
        fi
        exit 1
    }

    # An unusable provider is fatal for dry-run/sync, but only a warning in
    # status mode, which can still check the local repos.
    if prepare_rclone; then
        remote_prepared=1
    elif [ "${MODE}" != "status" ]; then
        notify_clawbot "warning" "${SERVICE}" "Offsite rclone provider 未配置或不可用" 0
        exit 1
    else
        log_warn "Offsite provider 尚未配置status 模式只檢查本地 repo配置缺口交由 backup health metric 告警"
    fi

    # The remote listing is informational only; a failure does not stop the run.
    if [ "${remote_prepared}" -eq 1 ]; then
        if status_remote; then
            log_success "Offsite remote 可列出"
        else
            log_warn "Offsite remote 尚不可列出或目前為空copy 模式仍可建立路徑"
        fi
    fi

    if [ "${MODE}" = "status" ]; then
        for name in ${REPOS}; do
            checked=$((checked + 1))
            if [ -d "${BACKUP_BASE}/${name}/data" ]; then
                log_success "本地 repo 存在: ${name}"
            else
                log_error "本地 repo 缺失: ${name}"
                failed=$((failed + 1))
            fi
        done
    else
        if [ "${MODE}" = "dry-run" ]; then
            DRY_RUN_ARGS=(--dry-run)
        fi
        for name in ${REPOS}; do
            checked=$((checked + 1))
            if copy_repo "${name}"; then
                log_success "Offsite ${MODE} 成功: ${name}"
                if [ "${MODE}" = "sync" ]; then
                    write_marker "${OFFSITE_DIR}/${PROVIDER}-${name}.last_success" "repo"
                fi
            else
                log_error "Offsite ${MODE} 失敗: ${name}"
                failed=$((failed + 1))
            fi
        done
    fi

    if is_full_scope; then
        scope="full"
    fi
    duration=$(($(date +%s) - start_time))

    if [ "${failed}" -eq 0 ]; then
        if [ "${MODE}" = "sync" ]; then
            if [ "${scope}" = "full" ]; then
                write_marker "${OFFSITE_DIR}/${PROVIDER}-last-success" "full"
            else
                write_marker "${OFFSITE_DIR}/${PROVIDER}-partial-last-success" "partial"
            fi
        fi
        log_success "========== Offsite backup ${MODE} 完成 (${duration}s, ${checked}/${checked}) =========="
        if [ "${MODE}" != "status" ] && [ "${OFFSITE_SYNC_NOTIFY_SUCCESS}" = "1" ]; then
            notify_clawbot "success" "${SERVICE}" "Offsite backup ${MODE} 完成 scope=${scope} (${checked}/${checked})" "${duration}"
        fi
    else
        log_error "========== Offsite backup ${MODE} 失敗 ${failed}/${checked} (${duration}s) =========="
        notify_clawbot "failed" "${SERVICE}" "Offsite backup ${MODE} 失敗 ${failed}/${checked}" "${duration}"
    fi
    return "${failed}"
}
# Entry point. As the last command, main's return value becomes the script's
# exit status.
main "$@"