#!/bin/bash
# =============================================================================
# WOOO AIOps - Offsite backup readiness gate
# 2026-05-06 ogt + Codex: release gate for offsite backups and credential escrow.
#
# Defaults to read-only status mode; secret values are never printed.
# Google Drive/rclone is the currently preferred provider; B2 is kept only as
# a compatibility path.
#
# Exit status: 0 when READY or READY_WITH_WARNINGS, 1 when any check is
# BLOCKED, 2 on a usage error.
# =============================================================================
set -euo pipefail

BACKUP_BASE="${BACKUP_BASE:-/backup}"
OFFSITE_ENV_FILE="${BACKUP_OFFSITE_ENV_FILE:-${BACKUP_BASE}/scripts/offsite.env}"
OFFSITE_DIR="${BACKUP_OFFSITE_STATUS_DIR:-${BACKUP_BASE}/offsite}"
ESCROW_DIR="${BACKUP_ESCROW_EVIDENCE_DIR:-${BACKUP_BASE}/escrow-evidence}"
SYNC_SCRIPT="${BACKUP_SYNC_SCRIPT:-${BACKUP_BASE}/scripts/sync-offsite-backups.sh}"
OFFSITE_PROVIDER="${OFFSITE_PROVIDER:-rclone}"
OFFSITE_RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES="${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES:-270}"
# Whitespace-separated schedule slots, each expressed as minutes after midnight.
OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES="${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES:-120 480 840 1200}"

MODE="status"
REQUIRE_CONFIGURED=0
REQUIRE_ESCROW=0
NO_COLOR=0
SMALL_REPOS="ai-artifacts public-routes"
EXPECTED_REPOS="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes"

# Result counters, incremented by ok/warning/block and reported in the summary.
pass=0
warn=0
blocked_count=0

# Print the command-line help text to stdout.
usage() {
  cat <<'USAGE'
Usage:
  backup-offsite-readiness-gate.sh [--status] [--no-color]
  backup-offsite-readiness-gate.sh --dry-run-small [--repos "ai-artifacts public-routes"]
  backup-offsite-readiness-gate.sh --pre-full-sync

Options:
  --require-configured  Treat missing rclone/offsite config as BLOCKED.
  --require-escrow      Treat stale/missing credential escrow markers as BLOCKED.

Rules:
  - This gate never prints credential values.
  - --dry-run-small runs rclone dry-run only for the selected small repos.
  - --pre-full-sync does not upload data; it checks config, local repos, and load.
USAGE
}

while [ "$#" -gt 0 ]; do
  case "$1" in
    --status)
      MODE="status"
      shift
      ;;
    --dry-run-small)
      MODE="dry-run-small"
      REQUIRE_CONFIGURED=1
      shift
      ;;
    --pre-full-sync)
      MODE="pre-full-sync"
      REQUIRE_CONFIGURED=1
      shift
      ;;
    --repos)
      # Without this guard `shift 2` fails and `set -e` ends the script with
      # no explanation when --repos is the last argument.
      if [ "$#" -lt 2 ]; then
        echo "--repos requires a value" >&2
        usage >&2
        exit 2
      fi
      SMALL_REPOS="$2"
      shift 2
      ;;
    --require-configured)
      REQUIRE_CONFIGURED=1
      shift
      ;;
    --require-escrow)
      REQUIRE_ESCROW=1
      shift
      ;;
    --no-color)
      NO_COLOR=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

case "${MODE}" in
  status|dry-run-small|pre-full-sync) ;;
  *)
    echo "Invalid mode: ${MODE}" >&2
    exit 2
    ;;
esac

if [ "${NO_COLOR}" = "1" ]; then
  green=""
  yellow=""
  red=""
  reset=""
else
  green="$(printf '\033[32m')"
  yellow="$(printf '\033[33m')"
  red="$(printf '\033[31m')"
  reset="$(printf '\033[0m')"
fi

# Record a passing check and print it with an OK prefix.
ok() {
  pass=$((pass + 1))
  printf "%sOK%s %s\n" "${green}" "${reset}" "$*"
}

# Record a non-fatal finding and print it with a WARN prefix.
warning() {
  warn=$((warn + 1))
  printf "%sWARN%s %s\n" "${yellow}" "${reset}" "$*"
}

# Record a blocking finding and print it with a BLOCKED prefix.
block() {
  blocked_count=$((blocked_count + 1))
  printf "%sBLOCKED%s %s\n" "${red}" "${reset}" "$*"
}

# Report a finding as BLOCKED when $1 is "1", otherwise as WARN.
# Arguments: $1 - require flag ("1" escalates); remaining args - message.
warn_or_block() {
  local require="$1"
  shift
  if [ "${require}" = "1" ]; then
    block "$@"
  else
    warning "$@"
  fi
}

# Succeed when $1 is non-empty and is not one of the known placeholder values.
# The value itself is never printed.
configured_secret() {
  local value="${1:-}"
  [ -n "${value}" ] \
    && [ "${value}" != "CHANGE_ME" ] \
    && [ "${value}" != "CHANGEME" ] \
    && [ "${value}" != "TODO" ] \
    && [ "${value}" != "REDACTED" ]
}

# Print the permission bits of file $1, or "unknown" when both stat forms fail.
# `stat -c` is tried first and `stat -f` second, to cover differing stat
# implementations.
file_mode() {
  stat -c '%a' "$1" 2>/dev/null || stat -f '%Lp' "$1" 2>/dev/null || echo unknown
}

# Source OFFSITE_ENV_FILE when it exists, then re-apply the provider/remote
# defaults in case the file left them empty. Sourcing loads whatever the file
# defines into this shell's variables; nothing is echoed.
load_offsite_env() {
  if [ -f "${OFFSITE_ENV_FILE}" ]; then
    # shellcheck disable=SC1090
    source "${OFFSITE_ENV_FILE}"
    OFFSITE_PROVIDER="${OFFSITE_PROVIDER:-rclone}"
    OFFSITE_RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
  fi
}

# Print the number of whitespace-separated words in $1.
repo_count() {
  local count=0
  local _repo
  # Intentionally unquoted: the repo list is split on whitespace.
  for _repo in $1; do
    count=$((count + 1))
  done
  echo "${count}"
}

# Print the integer value of the first `timestamp=` line in marker file $1,
# or 0 when the file is missing, has no such line, or awk fails.
marker_timestamp() {
  local path="$1"
  [ -f "${path}" ] || {
    echo 0
    return
  }
  awk -F= '/^timestamp=/ {print int($2); found=1; exit} END {if (!found) print 0}' "${path}" 2>/dev/null || echo 0
}

# Check that offsite.env exists and has mode 600. A wrong mode is always
# BLOCKED; a missing file is WARN/BLOCKED for the b2 provider (depending on
# REQUIRE_CONFIGURED) and WARN otherwise.
check_offsite_env() {
  local mode
  load_offsite_env
  if [ -f "${OFFSITE_ENV_FILE}" ]; then
    mode="$(file_mode "${OFFSITE_ENV_FILE}")"
    if [ "${mode}" = "600" ]; then
      ok "offsite.env exists with private mode 0600"
    else
      block "offsite.env mode must be 0600; current mode=${mode}"
    fi
  elif [ "${OFFSITE_PROVIDER}" = "b2" ]; then
    warn_or_block "${REQUIRE_CONFIGURED}" "offsite.env missing; B2 provider not configured yet"
  else
    warning "offsite.env missing; Google Drive/rclone 可先用 rclone config 建 remote,再用 configure-offsite-rclone.sh 寫入非 secret 設定"
  fi
}

# Succeed when `rclone listremotes` exits 0 and lists "$1:" as a full line.
# The listing is captured before matching: piping straight into `grep -q`
# under pipefail can fail spuriously, because grep exits on the first match
# and rclone may then be killed by SIGPIPE.
rclone_remote_listed() {
  local remotes
  remotes="$(rclone listremotes 2>/dev/null)" || return 1
  grep -Fxq -- "$1:" <<<"${remotes}"
}

# Check tool and provider configuration: rclone availability, B2 settings or
# the rclone remote (depending on OFFSITE_PROVIDER), and that the sync
# controller script is executable.
check_configured() {
  local have_rclone=0
  local b2_ready=0

  load_offsite_env

  if command -v rclone >/dev/null 2>&1; then
    have_rclone=1
    ok "rclone command is available"
  else
    warn_or_block "${REQUIRE_CONFIGURED}" "rclone command is missing"
  fi

  if [ "${OFFSITE_PROVIDER}" = "b2" ]; then
    if configured_secret "${B2_ACCOUNT_ID:-}" \
      && configured_secret "${B2_APPLICATION_KEY:-}" \
      && configured_secret "${B2_BUCKET:-}"; then
      b2_ready=1
    fi
    if [ "${b2_ready}" = "1" ]; then
      ok "B2 account/application key/bucket are configured without exposing values"
    else
      warn_or_block "${REQUIRE_CONFIGURED}" "B2 account/application key/bucket not fully configured"
    fi
  elif [ "${have_rclone}" = "1" ] && rclone_remote_listed "${OFFSITE_RCLONE_REMOTE}"; then
    ok "rclone remote is configured without exposing tokens: ${OFFSITE_RCLONE_REMOTE}:"
  else
    warn_or_block "${REQUIRE_CONFIGURED}" "Google Drive/rclone remote not configured: ${OFFSITE_RCLONE_REMOTE}:"
  fi

  if [ -x "${SYNC_SCRIPT}" ]; then
    ok "offsite sync controller is executable: ${SYNC_SCRIPT}"
  else
    block "offsite sync controller missing or not executable: ${SYNC_SCRIPT}"
  fi
}

# Check that every repo named in $1 has a data/ directory under BACKUP_BASE.
# Arguments: $1 - whitespace-separated repo names.
# Returns:   0 when all repos exist, non-zero when at least one is missing.
#            Callers running under `set -e` must guard the call.
check_local_repos() {
  local repos="$1"
  local missing=0
  local repo
  # Intentionally unquoted: the repo list is split on whitespace.
  for repo in ${repos}; do
    if [ -d "${BACKUP_BASE}/${repo}/data" ]; then
      ok "local restic repo exists: ${repo}"
    else
      block "local restic repo missing or uninitialized: ${BACKUP_BASE}/${repo}"
      missing=$((missing + 1))
    fi
  done
  [ "${missing}" -eq 0 ]
}

# Report on the full and partial offsite success markers. For each kind the
# rclone marker is tried before the b2 marker and the first one with a
# non-zero timestamp wins. A full marker older than 48 hours is reported as
# stale.
check_offsite_marker() {
  local now
  local ts=0
  local age
  local provider

  now="$(date +%s)"

  for provider in rclone b2; do
    ts="$(marker_timestamp "${OFFSITE_DIR}/${provider}-last-success")"
    if [ "${ts}" -gt 0 ]; then
      break
    fi
  done
  if [ "${ts}" -gt 0 ]; then
    age=$((now - ts))
    if [ "${age}" -le $((48 * 3600)) ]; then
      ok "full offsite success marker is fresh provider=${provider} age=${age}s"
    else
      warning "full offsite success marker stale provider=${provider} age=${age}s"
    fi
  else
    warning "full offsite success marker missing; full remote copy has not been proven"
  fi

  for provider in rclone b2; do
    ts="$(marker_timestamp "${OFFSITE_DIR}/${provider}-partial-last-success")"
    if [ "${ts}" -gt 0 ]; then
      break
    fi
  done
  if [ "${ts}" -gt 0 ]; then
    age=$((now - ts))
    ok "partial offsite marker exists provider=${provider} age=${age}s"
  else
    warning "partial offsite marker missing; small-repo sync has not been proven"
  fi
}

# Check each credential escrow marker under ESCROW_DIR. A marker counts as
# fresh when its timestamp is at most 744 hours (31 days) old; stale or
# missing markers are WARN, or BLOCKED when REQUIRE_ESCROW is 1.
check_escrow_markers() {
  local now
  local item
  local path
  local ts
  local age

  now="$(date +%s)"
  for item in restic_repository_password offsite_provider_credentials break_glass_admin_credentials dns_registrar_recovery oauth_ai_provider_recovery; do
    path="${ESCROW_DIR}/${item}.last_verified"
    ts="$(marker_timestamp "${path}")"
    if [ "${ts}" -gt 0 ]; then
      age=$((now - ts))
      if [ "${age}" -le $((744 * 3600)) ]; then
        ok "credential escrow marker fresh: ${item}"
      else
        warn_or_block "${REQUIRE_ESCROW}" "credential escrow marker stale: ${item} age=${age}s"
      fi
    else
      warn_or_block "${REQUIRE_ESCROW}" "credential escrow marker missing: ${item}"
    fi
  done
}

# Block when the second field of /proc/loadavg divided by the number of
# "processor" lines in /proc/cpuinfo exceeds 0.7. The check is skipped with a
# warning when /proc/loadavg is not readable.
check_load_for_full_sync() {
  local rc=0
  if [ -r /proc/loadavg ]; then
    # awk signals "load too high" with exit status 42. The status is captured
    # with `||` so `set -e` does not end the script before it is reported.
    awk '
      {
        load5=$2
        cores=0
        while ((getline line < "/proc/cpuinfo") > 0) {
          if (line ~ /^processor/) cores++
        }
        if (cores < 1) cores=1
        ratio=load5/cores
        printf "LOAD5 %.4f CORES %d LOAD5_PER_CORE %.6f\n", load5, cores, ratio
        if (ratio > 0.7) exit 42
      }
    ' /proc/loadavg || rc=$?
    if [ "${rc}" -eq 0 ]; then
      ok "host load is low enough for pre-full-sync review"
    else
      block "host load too high for full offsite sync review"
    fi
  else
    warning "load check skipped; /proc/loadavg unavailable"
  fi
}

# Print the `ps` lines (pid and args) of processes whose command line matches
# one of the known /backup/scripts/backup-*.sh scripts, excluding this shell.
active_backup_processes() {
  ps -eo pid=,args= | awk -v self="$$" '
    $1 == self { next }
    /\/backup\/scripts\/backup-(all|awoooi|awoooi-frequent|gitea|harbor|momo|langfuse|monitoring|signoz|open-webui|clawbot|sentry|ai-artifacts|public-routes|configs)\.sh/ { print }
  '
}

# Print the number of minutes from the current local time until the nearest
# slot in OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES. A slot equal to the current
# minute counts as the next day's occurrence (1440 minutes away).
minutes_until_next_backup_schedule() {
  local now_h
  local now_m
  local now
  local sched
  local delta
  local best=1440

  now_h="$(date +%H)"
  now_m="$(date +%M)"
  # 10# forces base 10 so values such as "08" are not parsed as octal.
  now=$((10#${now_h} * 60 + 10#${now_m}))

  # Intentionally unquoted: the schedule list is split on whitespace.
  for sched in ${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES}; do
    delta=$((sched - now))
    if [ "${delta}" -le 0 ]; then
      delta=$((delta + 1440))
    fi
    if [ "${delta}" -lt "${best}" ]; then
      best="${delta}"
    fi
  done
  echo "${best}"
}

# Block when a local backup process is running, or when fewer than
# OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES remain before the next scheduled slot.
check_full_sync_runway() {
  local active_backups
  local runway_minutes

  active_backups="$(active_backup_processes || true)"
  if [ -n "${active_backups}" ]; then
    block "active backup process detected; full offsite sync must not overlap local backups"
    printf '%s\n' "${active_backups}"
  else
    ok "no active local backup process detected"
  fi

  runway_minutes="$(minutes_until_next_backup_schedule)"
  if [ "${runway_minutes}" -lt "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}" ]; then
    block "not enough runway before next backup schedule: ${runway_minutes}m < ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}m"
  else
    ok "enough runway before next backup schedule: ${runway_minutes}m >= ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}m"
  fi
}

# Invoke the sync controller with `--mode dry-run` for SMALL_REPOS and record
# the outcome. Blocks without running anything when the controller is not
# executable.
run_small_dry_run() {
  if [ ! -x "${SYNC_SCRIPT}" ]; then
    block "cannot run dry-run; sync controller missing"
    return
  fi
  echo
  echo "== small repo rclone dry-run =="
  if "${SYNC_SCRIPT}" --mode dry-run --repos "${SMALL_REPOS}"; then
    ok "small repo offsite dry-run passed: ${SMALL_REPOS}"
  else
    block "small repo offsite dry-run failed: ${SMALL_REPOS}"
  fi
}

echo "AWOOOI offsite backup readiness gate"
date
echo "BACKUP_BASE=${BACKUP_BASE}"
echo "OFFSITE_ENV_FILE=${OFFSITE_ENV_FILE}"
echo "MODE=${MODE}"
echo

echo "== config =="
check_offsite_env
check_configured

echo
echo "== local repos =="
# check_local_repos returns non-zero when a repo is missing. The finding is
# already counted via block(), so the status is discarded here; otherwise
# `set -e` would end the run before the remaining checks and the summary.
if [ "${MODE}" = "pre-full-sync" ]; then
  echo "EXPECTED_REPO_COUNT=$(repo_count "${EXPECTED_REPOS}")"
  check_local_repos "${EXPECTED_REPOS}" || true
else
  check_local_repos "${SMALL_REPOS}" || true
fi

echo
echo "== markers =="
check_offsite_marker
check_escrow_markers

if [ "${MODE}" = "pre-full-sync" ]; then
  echo
  echo "== pre-full-sync safety =="
  check_load_for_full_sync
  check_full_sync_runway
fi

if [ "${MODE}" = "dry-run-small" ]; then
  run_small_dry_run
fi

echo
echo "== summary =="
echo "PASS=${pass} WARN=${warn} BLOCKED=${blocked_count}"

if [ "${blocked_count}" -gt 0 ]; then
  echo "Result: BLOCKED. Do not run offsite sync until blocked items are fixed."
  exit 1
fi

if [ "${warn}" -gt 0 ]; then
  echo "Result: READY_WITH_WARNINGS. Local backups are checkable, but offsite/escrow proof is incomplete."
  exit 0
fi

echo "Result: READY. Offsite and credential escrow readiness checks are green."