#!/bin/bash
# =============================================================================
# WOOO AIOps - Offsite backup readiness gate
# 2026-05-06 ogt + Codex: release gate for offsite backups and credential escrow.
#
# Defaults to a read-only status check; never reads or prints any secret.
# Google Drive/rclone is the currently preferred provider; B2 is kept only as a
# compatibility path.
# =============================================================================

# Strict mode: stop on unhandled errors, unset variables, and failed pipeline
# stages.
set -euo pipefail

# Filesystem locations. Each one can be overridden through the environment
# variable named inside its expansion.
BACKUP_BASE="${BACKUP_BASE:-/backup}"
OFFSITE_ENV_FILE="${BACKUP_OFFSITE_ENV_FILE:-${BACKUP_BASE}/scripts/offsite.env}"
OFFSITE_DIR="${BACKUP_OFFSITE_STATUS_DIR:-${BACKUP_BASE}/offsite}"
ESCROW_DIR="${BACKUP_ESCROW_EVIDENCE_DIR:-${BACKUP_BASE}/escrow-evidence}"
SYNC_SCRIPT="${BACKUP_SYNC_SCRIPT:-${BACKUP_BASE}/scripts/sync-offsite-backups.sh}"

# Provider selection. The value "b2" switches the config checks to the B2
# branch; anything else is checked as an rclone remote.
OFFSITE_PROVIDER="${OFFSITE_PROVIDER:-rclone}"
OFFSITE_RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"

# Full-sync runway: minimum free minutes required before the next scheduled
# backup, and the schedule itself as minutes after midnight.
OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES="${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES:-270}"
OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES="${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES:-120 480 840 1200}"

# Command-line state; the argument parser overwrites these.
MODE="status"
REQUIRE_CONFIGURED=0
REQUIRE_ESCROW=0
NO_COLOR=0

# Space-separated repo lists: the small set used outside pre-full-sync mode and
# the full set verified in pre-full-sync mode.
SMALL_REPOS="ai-artifacts public-routes"
EXPECTED_REPOS="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes"

# Result counters, incremented by ok / warning / block.
pass=0
warn=0
blocked_count=0
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (callers redirect to stderr on errors)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'USAGE'
Usage:
backup-offsite-readiness-gate.sh [--status] [--no-color]
backup-offsite-readiness-gate.sh --dry-run-small [--repos "ai-artifacts public-routes"]
backup-offsite-readiness-gate.sh --pre-full-sync

Options:
--require-configured Treat missing rclone/offsite config as BLOCKED.
--require-escrow Treat stale/missing credential escrow markers as BLOCKED.

Rules:
- This gate never prints credential values.
- --dry-run-small runs rclone dry-run only for the selected small repos.
- --pre-full-sync does not upload data; it checks config, local repos, and load.
USAGE
}
#######################################
# Parse command-line arguments into the global option variables.
# Globals:   MODE, REQUIRE_CONFIGURED, REQUIRE_ESCROW, NO_COLOR, SMALL_REPOS
#            (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for -h/--help; error plus help on stderr for
#            an unknown argument or a --repos flag with no value
# Returns:   exits 0 after printing help, exits 2 on a usage error
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --status)
        MODE="status"
        shift
        ;;
      --dry-run-small)
        MODE="dry-run-small"
        REQUIRE_CONFIGURED=1
        shift
        ;;
      --pre-full-sync)
        MODE="pre-full-sync"
        REQUIRE_CONFIGURED=1
        shift
        ;;
      --repos)
        # Without this guard `shift 2` fails when the value is missing and
        # `set -e` terminates the script with no explanation.
        if [ "$#" -lt 2 ]; then
          echo "Missing value for --repos" >&2
          usage >&2
          exit 2
        fi
        SMALL_REPOS="$2"
        shift 2
        ;;
      --require-configured)
        REQUIRE_CONFIGURED=1
        shift
        ;;
      --require-escrow)
        REQUIRE_ESCROW=1
        shift
        ;;
      --no-color)
        NO_COLOR=1
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Defensive check: refuse to continue unless MODE is one of the three supported
# values.
case "${MODE}" in
  status|dry-run-small|pre-full-sync) ;;
  *)
    echo "Invalid mode: ${MODE}" >&2
    exit 2
    ;;
esac
# ANSI colour escapes used by ok / warning / block; all empty when colour is
# disabled so the printf formats stay unchanged.
if [ "${NO_COLOR}" = "1" ]; then
  green=""
  yellow=""
  red=""
  reset=""
else
  green=$'\033[32m'
  yellow=$'\033[33m'
  red=$'\033[31m'
  reset=$'\033[0m'
fi
#######################################
# Record a passed check and print it.
# Globals:   pass (incremented), green, reset (read)
# Arguments: message words, joined with spaces
# Outputs:   one "OK <message>" line on stdout
#######################################
ok() {
  pass=$((pass + 1))
  printf "%sOK%s %s\n" "${green}" "${reset}" "$*"
}
#######################################
# Record a warning and print it.
# Globals:   warn (incremented), yellow, reset (read)
# Arguments: message words, joined with spaces
# Outputs:   one "WARN <message>" line on stdout
#######################################
warning() {
  warn=$((warn + 1))
  printf "%sWARN%s %s\n" "${yellow}" "${reset}" "$*"
}
#######################################
# Record a blocking failure and print it.
# Globals:   blocked_count (incremented), red, reset (read)
# Arguments: message words, joined with spaces
# Outputs:   one "BLOCKED <message>" line on stdout
#######################################
block() {
  blocked_count=$((blocked_count + 1))
  printf "%sBLOCKED%s %s\n" "${red}" "${reset}" "$*"
}
#######################################
# Report a finding as BLOCKED or WARN depending on a strictness flag.
# Arguments: $1 - "1" to report through block, anything else through warning
#            $2.. - message words
# Outputs:   whatever block / warning print
#######################################
warn_or_block() {
  local require="$1"
  shift
  if [ "${require}" = "1" ]; then
    block "$@"
  else
    warning "$@"
  fi
}
#######################################
# Decide whether a value looks like a real setting rather than a placeholder.
# The value is only compared, never printed.
# Arguments: $1 - value to inspect (may be omitted)
# Returns:   0 when non-empty and not one of the placeholder strings
#            CHANGE_ME, CHANGEME, TODO, REDACTED; 1 otherwise
#######################################
configured_secret() {
  local value="${1:-}"
  case "${value}" in
    "" | CHANGE_ME | CHANGEME | TODO | REDACTED)
      return 1
      ;;
    *)
      return 0
      ;;
  esac
}
#######################################
# Print the octal permission bits of a file.
# Tries the `stat -c` form first, then the `stat -f` form, so either stat
# flavour works.
# Arguments: $1 - path to inspect
# Outputs:   the mode (e.g. 600) on stdout, or "unknown" if both forms fail
#######################################
file_mode() {
  # `--` keeps a path that starts with "-" from being parsed as an option.
  stat -c '%a' -- "$1" 2>/dev/null \
    || stat -f '%Lp' -- "$1" 2>/dev/null \
    || echo unknown
}
#######################################
# Source the offsite env file when it exists, then re-apply the provider
# defaults in case the file left them empty or unset.
# Globals:   OFFSITE_ENV_FILE (read); OFFSITE_PROVIDER, OFFSITE_RCLONE_REMOTE
#            (written), plus anything the sourced file assigns
# Arguments: none
# NOTE(review): the file is executed as shell code and is sourced before its
# permissions are checked elsewhere - confirm that ordering is intended.
#######################################
load_offsite_env() {
  if [ -f "${OFFSITE_ENV_FILE}" ]; then
    # shellcheck disable=SC1090
    source "${OFFSITE_ENV_FILE}"
    OFFSITE_PROVIDER="${OFFSITE_PROVIDER:-rclone}"
    OFFSITE_RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
  fi
}
#######################################
# Count the whitespace-separated words in a repo list.
# Arguments: $1 - list of repo names separated by spaces, tabs or newlines
# Outputs:   the number of words on stdout
#######################################
repo_count() {
  local -a words=()
  # Split into an array instead of looping over an unquoted expansion, so a
  # word such as "*" is counted as one entry and never glob-expanded.
  # `read -d ''` reports failure at end of input, hence `|| true`.
  IFS=$' \t\n' read -r -d '' -a words <<<"${1:-}" || true
  echo "${#words[@]}"
}
#######################################
# Read the integer timestamp recorded in a marker file.
# Arguments: $1 - path to a marker file containing a "timestamp=<n>" line
# Outputs:   the integer part of the first timestamp value on stdout;
#            0 when the file is missing, has no timestamp line, or awk fails
#######################################
marker_timestamp() {
  local path="$1"
  if [ ! -f "${path}" ]; then
    echo 0
    return
  fi
  # Only the first "timestamp=" line counts; the END rule supplies the 0
  # default when no such line was seen.
  awk -F= '/^timestamp=/ {print int($2); found=1; exit} END {if (!found) print 0}' "${path}" 2>/dev/null || echo 0
}
#######################################
# Check that the offsite env file exists with private permissions.
# Globals:   OFFSITE_ENV_FILE, OFFSITE_PROVIDER, REQUIRE_CONFIGURED (read)
# Arguments: none
# Outputs:   one OK / WARN / BLOCKED line on stdout
#######################################
check_offsite_env() {
  # Declared local so the permission string does not leak into global scope.
  local mode
  load_offsite_env
  if [ -f "${OFFSITE_ENV_FILE}" ]; then
    mode="$(file_mode "${OFFSITE_ENV_FILE}")"
    if [ "${mode}" = "600" ]; then
      ok "offsite.env exists with private mode 0600"
    else
      block "offsite.env mode must be 0600; current mode=${mode}"
    fi
  elif [ "${OFFSITE_PROVIDER}" = "b2" ]; then
    warn_or_block "${REQUIRE_CONFIGURED}" "offsite.env missing; B2 provider not configured yet"
  else
    # For the rclone provider a missing env file is only ever a warning,
    # regardless of REQUIRE_CONFIGURED.
    warning "offsite.env missing; Google Drive/rclone 可先用 rclone config 建 remote,再用 configure-offsite-rclone.sh 寫入非 secret 設定"
  fi
}
#######################################
# Check that the tooling and provider configuration for offsite sync exist.
# Covers: the rclone binary, either the B2 settings or the rclone remote, and
# the sync controller script.
# Globals:   OFFSITE_PROVIDER, OFFSITE_RCLONE_REMOTE, REQUIRE_CONFIGURED,
#            SYNC_SCRIPT, B2_ACCOUNT_ID, B2_APPLICATION_KEY, B2_BUCKET (read)
# Arguments: none
# Outputs:   OK / WARN / BLOCKED lines on stdout; values are never printed
#######################################
check_configured() {
  local rclone_available=0
  local b2_ready=0
  local remotes=""

  load_offsite_env

  if command -v rclone >/dev/null 2>&1; then
    rclone_available=1
    ok "rclone command is available"
  else
    warn_or_block "${REQUIRE_CONFIGURED}" "rclone command is missing"
  fi

  if [ "${OFFSITE_PROVIDER}" = "b2" ]; then
    if configured_secret "${B2_ACCOUNT_ID:-}" \
      && configured_secret "${B2_APPLICATION_KEY:-}" \
      && configured_secret "${B2_BUCKET:-}"; then
      b2_ready=1
    fi

    if [ "${b2_ready}" = "1" ]; then
      ok "B2 account/application key/bucket are configured without exposing values"
    else
      warn_or_block "${REQUIRE_CONFIGURED}" "B2 account/application key/bucket not fully configured"
    fi
  else
    # Capture the listing first rather than piping into `grep -q`: with
    # pipefail, grep exiting on the first match can SIGPIPE rclone and turn a
    # configured remote into a false "not configured". A failing listremotes
    # is still treated as "no remotes".
    if [ "${rclone_available}" = "1" ]; then
      remotes="$(rclone listremotes 2>/dev/null)" || remotes=""
    fi

    if [ -n "${remotes}" ] && grep -Fxq -- "${OFFSITE_RCLONE_REMOTE}:" <<<"${remotes}"; then
      ok "rclone remote is configured without exposing tokens: ${OFFSITE_RCLONE_REMOTE}:"
    else
      warn_or_block "${REQUIRE_CONFIGURED}" "Google Drive/rclone remote not configured: ${OFFSITE_RCLONE_REMOTE}:"
    fi
  fi

  if [ -x "${SYNC_SCRIPT}" ]; then
    ok "offsite sync controller is executable: ${SYNC_SCRIPT}"
  else
    block "offsite sync controller missing or not executable: ${SYNC_SCRIPT}"
  fi
}
#######################################
# Verify that each listed repo has a data directory under BACKUP_BASE.
# Globals:   BACKUP_BASE (read)
# Arguments: $1 - space-separated repo names
# Outputs:   one OK or BLOCKED line per repo on stdout
# Returns:   0 when every repo is present, 1 when at least one is missing.
#            Under `set -e`, call with `|| true` if execution must continue.
#######################################
check_local_repos() {
  local repos="$1"
  local missing=0
  # Declared local so the loop variable does not leak into global scope.
  local repo

  # Word splitting is intentional: the list is space-separated.
  # shellcheck disable=SC2086
  for repo in ${repos}; do
    if [ -d "${BACKUP_BASE}/${repo}/data" ]; then
      ok "local restic repo exists: ${repo}"
    else
      block "local restic repo missing or uninitialized: ${BACKUP_BASE}/${repo}"
      missing=$((missing + 1))
    fi
  done

  [ "${missing}" -eq 0 ]
}
#######################################
# Report on the full and partial offsite success markers.
# For each kind the rclone marker is consulted first, then b2; the first one
# with a positive timestamp wins.
# Globals:   OFFSITE_DIR (read)
# Arguments: none
# Outputs:   two lines on stdout (full marker, then partial marker). A full
#            marker older than 48 hours is a warning; a partial marker is
#            accepted at any age. Missing markers are warnings.
#######################################
check_offsite_marker() {
  local now
  local ts
  local age
  local provider
  local max_full_age=$((48 * 3600))

  now="$(date +%s)"

  for provider in rclone b2; do
    ts="$(marker_timestamp "${OFFSITE_DIR}/${provider}-last-success")"
    # On break, ${provider} still names the marker that was found.
    [ "${ts}" -gt 0 ] && break
  done
  if [ "${ts}" -gt 0 ]; then
    age=$((now - ts))
    if [ "${age}" -le "${max_full_age}" ]; then
      ok "full offsite success marker is fresh provider=${provider} age=${age}s"
    else
      warning "full offsite success marker stale provider=${provider} age=${age}s"
    fi
  else
    warning "full offsite success marker missing; full remote copy has not been proven"
  fi

  for provider in rclone b2; do
    ts="$(marker_timestamp "${OFFSITE_DIR}/${provider}-partial-last-success")"
    [ "${ts}" -gt 0 ] && break
  done
  if [ "${ts}" -gt 0 ]; then
    age=$((now - ts))
    ok "partial offsite marker exists provider=${provider} age=${age}s"
  else
    warning "partial offsite marker missing; small-repo sync has not been proven"
  fi
}
#######################################
# Check the "last verified" marker of every credential escrow item.
# A marker counts as fresh when it is at most 744 hours (31 days) old.
# Globals:   ESCROW_DIR, REQUIRE_ESCROW (read)
# Arguments: none
# Outputs:   one line per escrow item on stdout; stale or missing markers are
#            BLOCKED when REQUIRE_ESCROW is 1, warnings otherwise
#######################################
check_escrow_markers() {
  local now
  local item
  local path
  local ts
  local age
  local max_age=$((744 * 3600))

  now="$(date +%s)"

  for item in \
    restic_repository_password \
    offsite_provider_credentials \
    break_glass_admin_credentials \
    dns_registrar_recovery \
    oauth_ai_provider_recovery; do
    path="${ESCROW_DIR}/${item}.last_verified"
    ts="$(marker_timestamp "${path}")"

    if [ "${ts}" -le 0 ]; then
      warn_or_block "${REQUIRE_ESCROW}" "credential escrow marker missing: ${item}"
      continue
    fi

    age=$((now - ts))
    if [ "${age}" -le "${max_age}" ]; then
      ok "credential escrow marker fresh: ${item}"
    else
      warn_or_block "${REQUIRE_ESCROW}" "credential escrow marker stale: ${item} age=${age}s"
    fi
  done
}
#######################################
# Block a full sync when the host is busy.
# Reads the second field of /proc/loadavg, divides it by the number of
# "processor" lines in /proc/cpuinfo (minimum 1), and blocks when the per-core
# value exceeds 0.7.
# Arguments: none
# Outputs:   the measured figures, then one OK / BLOCKED line; a warning when
#            /proc/loadavg is not readable
#######################################
check_load_for_full_sync() {
  local rc=0

  if [ ! -r /proc/loadavg ]; then
    warning "load check skipped; /proc/loadavg unavailable"
    return 0
  fi

  # awk signals "load too high" with exit status 42. The status is captured
  # through `||` because a bare failing command would terminate the whole
  # script under `set -e` before the BLOCKED line and summary are printed.
  awk '
    {
      load5=$2
      cores=0
      while ((getline line < "/proc/cpuinfo") > 0) {
        if (line ~ /^processor/) cores++
      }
      if (cores < 1) cores=1
      ratio=load5/cores
      printf "LOAD5 %.4f CORES %d LOAD5_PER_CORE %.6f\n", load5, cores, ratio
      if (ratio > 0.7) exit 42
    }
  ' /proc/loadavg || rc=$?

  # Any non-zero status (threshold exceeded or awk failure) blocks.
  if [ "${rc}" -eq 0 ]; then
    ok "host load is low enough for pre-full-sync review"
  else
    block "host load too high for full offsite sync review"
  fi
}
#######################################
# List running processes whose command line references one of the known
# backup scripts under /backup/scripts.
# Arguments: none
# Outputs:   matching "pid args" lines from ps on stdout; nothing when no
#            backup script is running. The shell's own pid ($$) is skipped.
# NOTE(review): the path in the pattern is fixed and does not follow
# BACKUP_BASE - confirm that is intended.
#######################################
active_backup_processes() {
  ps -eo pid=,args= \
    | awk -v self="$$" '
      $1 == self { next }
      /\/backup\/scripts\/backup-(all|awoooi|awoooi-frequent|gitea|harbor|momo|langfuse|monitoring|signoz|open-webui|clawbot|sentry|ai-artifacts|public-routes|configs)\.sh/ {
        print
      }
    '
}
#######################################
# Compute how many minutes remain until the next scheduled local backup.
# Globals:   OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES (read) - space-separated
#            schedule entries in minutes after midnight
# Arguments: none
# Outputs:   minutes until the nearest upcoming entry on stdout, in 1..1440.
#            An entry equal to the current minute counts as a full day away.
#            1440 is printed when the schedule is empty.
#######################################
minutes_until_next_backup_schedule() {
  local now_h
  local now_m
  local now
  local sched
  local delta
  local best=1440

  # Sample the clock once: two separate date calls could straddle an hour
  # boundary and produce a time that is almost an hour off.
  read -r now_h now_m <<<"$(date '+%H %M')"
  # 10# forces base 10 so values such as "08" are not parsed as octal.
  now=$((10#${now_h} * 60 + 10#${now_m}))

  # Word splitting is intentional: the schedule is space-separated.
  # shellcheck disable=SC2086
  for sched in ${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES}; do
    delta=$((10#${sched} - now))
    # Entries at or before the current minute roll over to tomorrow.
    if [ "${delta}" -le 0 ]; then
      delta=$((delta + 1440))
    fi
    if [ "${delta}" -lt "${best}" ]; then
      best="${delta}"
    fi
  done

  echo "${best}"
}
#######################################
# Make sure a full offsite sync would not collide with local backups.
# Blocks when a backup process is currently running, and when fewer than
# OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES remain before the next scheduled one.
# Globals:   OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES (read)
# Arguments: none
# Outputs:   OK / BLOCKED lines on stdout; the matching process lines are
#            printed after the BLOCKED line when a backup is running
#######################################
check_full_sync_runway() {
  local active_backups
  local runway_minutes

  # A failure while listing processes is treated as "nothing running".
  active_backups="$(active_backup_processes || true)"
  if [ -z "${active_backups}" ]; then
    ok "no active local backup process detected"
  else
    block "active backup process detected; full offsite sync must not overlap local backups"
    printf '%s\n' "${active_backups}"
  fi

  runway_minutes="$(minutes_until_next_backup_schedule)"
  if [ "${runway_minutes}" -ge "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}" ]; then
    ok "enough runway before next backup schedule: ${runway_minutes}m >= ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}m"
  else
    block "not enough runway before next backup schedule: ${runway_minutes}m < ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}m"
  fi
}
#######################################
# Run the sync controller in dry-run mode for the selected small repos.
# Globals:   SYNC_SCRIPT, SMALL_REPOS (read)
# Arguments: none
# Outputs:   a section header, the controller's own output, then one OK or
#            BLOCKED line; only a BLOCKED line when the controller is missing
#            or not executable
#######################################
run_small_dry_run() {
  if [ ! -x "${SYNC_SCRIPT}" ]; then
    block "cannot run dry-run; sync controller missing"
    return
  fi

  echo
  echo "== small repo rclone dry-run =="
  # The repo list is passed as a single argument; the controller's exit
  # status alone decides pass or fail.
  if "${SYNC_SCRIPT}" --mode dry-run --repos "${SMALL_REPOS}"; then
    ok "small repo offsite dry-run passed: ${SMALL_REPOS}"
  else
    block "small repo offsite dry-run failed: ${SMALL_REPOS}"
  fi
}
# ---------------------------------------------------------------------------
# Main flow: print a header, run every check section, then summarise.
# Exit status: 1 when any check is BLOCKED, 0 otherwise (warnings included).
# ---------------------------------------------------------------------------
echo "AWOOOI offsite backup readiness gate"
date
echo "BACKUP_BASE=${BACKUP_BASE}"
echo "OFFSITE_ENV_FILE=${OFFSITE_ENV_FILE}"
echo "MODE=${MODE}"
echo

echo "== config =="
check_offsite_env
check_configured

echo
echo "== local repos =="
# check_local_repos returns non-zero when a repo is missing. The failure is
# already counted through block, so the status is discarded here; otherwise
# `set -e` would stop the script before the remaining checks and the summary.
if [ "${MODE}" = "pre-full-sync" ]; then
  echo "EXPECTED_REPO_COUNT=$(repo_count "${EXPECTED_REPOS}")"
  check_local_repos "${EXPECTED_REPOS}" || true
else
  check_local_repos "${SMALL_REPOS}" || true
fi

echo
echo "== markers =="
check_offsite_marker
check_escrow_markers

if [ "${MODE}" = "pre-full-sync" ]; then
  echo
  echo "== pre-full-sync safety =="
  check_load_for_full_sync
  check_full_sync_runway
fi

if [ "${MODE}" = "dry-run-small" ]; then
  run_small_dry_run
fi

echo
echo "== summary =="
echo "PASS=${pass} WARN=${warn} BLOCKED=${blocked_count}"

if [ "${blocked_count}" -gt 0 ]; then
  echo "Result: BLOCKED. Do not run offsite sync until blocked items are fixed."
  exit 1
fi

if [ "${warn}" -gt 0 ]; then
  echo "Result: READY_WITH_WARNINGS. Local backups are checkable, but offsite/escrow proof is incomplete."
  exit 0
fi

echo "Result: READY. Offsite and credential escrow readiness checks are green."