Files
awoooi/scripts/backup/backup-offsite-readiness-gate.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

437 lines
12 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# WOOO AIOps - Offsite backup readiness gate
# 2026-05-06 ogt + Codex: 離機備份與 credential escrow 放行檢查。
#
# 預設為 read-only status,不讀、不列印任何 secret。
# Google Drive/rclone 是目前優先 provider,B2 只保留相容路徑。
# =============================================================================
# Strict mode: stop on unhandled errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Filesystem locations. Each one can be overridden from the environment; the
# derived paths default to locations underneath BACKUP_BASE.
: "${BACKUP_BASE:=/backup}"
OFFSITE_ENV_FILE="${BACKUP_OFFSITE_ENV_FILE:-${BACKUP_BASE}/scripts/offsite.env}"
OFFSITE_DIR="${BACKUP_OFFSITE_STATUS_DIR:-${BACKUP_BASE}/offsite}"
ESCROW_DIR="${BACKUP_ESCROW_EVIDENCE_DIR:-${BACKUP_BASE}/escrow-evidence}"
SYNC_SCRIPT="${BACKUP_SYNC_SCRIPT:-${BACKUP_BASE}/scripts/sync-offsite-backups.sh}"

# Offsite provider selection and full-sync scheduling knobs. The schedule is a
# space-separated list of minutes after midnight.
: "${OFFSITE_PROVIDER:=rclone}"
: "${OFFSITE_RCLONE_REMOTE:=gdrive}"
: "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES:=270}"
: "${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES:=120 480 840 1200}"

# Command-line state; the flag parser overwrites these.
MODE="status"
REQUIRE_CONFIGURED=0
REQUIRE_ESCROW=0
NO_COLOR=0

# Repository lists: the small set is the default for status/dry-run, the full
# set is what --pre-full-sync verifies.
SMALL_REPOS="ai-artifacts public-routes"
EXPECTED_REPOS="awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes"

# Result counters shown in the summary.
pass=0
warn=0
blocked_count=0
# Print the command-line help text to stdout, one line per printf argument.
usage() {
  printf '%s\n' \
    'Usage:' \
    'backup-offsite-readiness-gate.sh [--status] [--no-color]' \
    'backup-offsite-readiness-gate.sh --dry-run-small [--repos "ai-artifacts public-routes"]' \
    'backup-offsite-readiness-gate.sh --pre-full-sync' \
    'Options:' \
    '--require-configured Treat missing rclone/offsite config as BLOCKED.' \
    '--require-escrow Treat stale/missing credential escrow markers as BLOCKED.' \
    'Rules:' \
    '- This gate never prints credential values.' \
    '- --dry-run-small runs rclone dry-run only for the selected small repos.' \
    '- --pre-full-sync does not upload data; it checks config, local repos, and load.'
}
# Parse command-line flags into MODE, REQUIRE_CONFIGURED, REQUIRE_ESCROW,
# NO_COLOR and SMALL_REPOS. --dry-run-small and --pre-full-sync also switch on
# REQUIRE_CONFIGURED. Invalid usage prints the help text to stderr and exits
# with status 2; -h/--help prints it to stdout and exits 0.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --status)
      MODE="status"
      shift
      ;;
    --dry-run-small)
      MODE="dry-run-small"
      REQUIRE_CONFIGURED=1
      shift
      ;;
    --pre-full-sync)
      MODE="pre-full-sync"
      REQUIRE_CONFIGURED=1
      shift
      ;;
    --repos)
      # Require a real value. Without this guard a trailing `--repos` makes
      # `shift 2` fail and `set -e` ends the script with no message, while an
      # empty list would leave the local repo check with nothing to verify.
      if [ "$#" -lt 2 ] || [ -z "$2" ]; then
        echo "--repos requires a non-empty repository list" >&2
        usage >&2
        exit 2
      fi
      case "$2" in
        -*)
          # Most likely the value was forgotten and this is the next flag.
          echo "--repos requires a repository list, got option: $2" >&2
          usage >&2
          exit 2
          ;;
      esac
      SMALL_REPOS="$2"
      shift 2
      ;;
    --require-configured)
      REQUIRE_CONFIGURED=1
      shift
      ;;
    --require-escrow)
      REQUIRE_ESCROW=1
      shift
      ;;
    --no-color)
      NO_COLOR=1
      shift
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# Defensive check: refuse to continue with a mode this script does not know.
case "${MODE}" in
  status | dry-run-small | pre-full-sync) ;;
  *)
    echo "Invalid mode: ${MODE}" >&2
    exit 2
    ;;
esac
# ANSI escape sequences used for the status labels. With NO_COLOR=1 they are
# all empty, so the labels print as plain text.
case "${NO_COLOR}" in
  1)
    green=""
    yellow=""
    red=""
    reset=""
    ;;
  *)
    green=$'\033[32m'
    yellow=$'\033[33m'
    red=$'\033[31m'
    reset=$'\033[0m'
    ;;
esac
# Record a passing check: increment the global `pass` counter and print the
# message (all arguments joined) behind a green "OK" label.
ok() {
  : "$((pass += 1))"
  printf '%s\n' "${green}OK${reset} $*"
}
# Record a non-blocking finding: increment the global `warn` counter and print
# the message (all arguments joined) behind a yellow "WARN" label.
warning() {
  : "$((warn += 1))"
  printf '%s\n' "${yellow}WARN${reset} $*"
}
# Record a blocking finding: increment the global `blocked_count` counter and
# print the message (all arguments joined) behind a red "BLOCKED" label.
block() {
  : "$((blocked_count += 1))"
  printf '%s\n' "${red}BLOCKED${reset} $*"
}
# Report a finding whose severity depends on a flag.
# Arguments: $1 - "1" to report via block(), anything else via warning()
#            $2.. - message words, passed through unchanged
warn_or_block() {
  local strict="$1"
  shift
  case "${strict}" in
    1) block "$@" ;;
    *) warning "$@" ;;
  esac
}
# Succeed when $1 looks like a real configured value: non-empty and not one of
# the known placeholder strings. The value itself is never printed.
configured_secret() {
  case "${1:-}" in
    "" | CHANGE_ME | CHANGEME | TODO | REDACTED)
      return 1
      ;;
  esac
  return 0
}
# Print the octal permission bits of the file at $1. Tries the `stat -c` form
# first, then the `stat -f` form; prints "unknown" when both fail.
file_mode() {
  local target="$1"
  if stat -c '%a' "${target}" 2>/dev/null; then
    return 0
  fi
  if stat -f '%Lp' "${target}" 2>/dev/null; then
    return 0
  fi
  echo unknown
}
# Source OFFSITE_ENV_FILE into the current shell when it exists, then re-apply
# the provider/remote defaults in case the file left them empty. Does nothing
# when the file is absent.
load_offsite_env() {
  [ -f "${OFFSITE_ENV_FILE}" ] || return 0
  # shellcheck disable=SC1090
  . "${OFFSITE_ENV_FILE}"
  : "${OFFSITE_PROVIDER:=rclone}"
  : "${OFFSITE_RCLONE_REMOTE:=gdrive}"
}
# Print how many whitespace-separated words the list in $1 contains.
repo_count() {
  # Intentional unquoted expansion: the list is split into positional
  # parameters and the shell counts them.
  # shellcheck disable=SC2086
  set -- $1
  echo "$#"
}
# Print the integer value of the first `timestamp=<n>` line in a marker file.
# Arguments: $1 - path to the marker file
# Outputs:   exactly one line holding one integer; 0 when the file is missing,
#            has no timestamp line, cannot be read, or holds a value that is
#            not a plain integer of at most 18 digits.
marker_timestamp() {
  local path="$1"
  local raw=""
  if [ -f "${path}" ]; then
    # Capture instead of streaming: when awk fails on an existing file, an
    # END-rule "0" plus an `|| echo 0` fallback could both reach stdout and
    # hand callers a two-line value their integer tests cannot parse.
    raw="$(awk -F= '/^timestamp=/ {print int($2); exit}' "${path}" 2>/dev/null)" || raw=""
  fi
  # Pass through only what `[ ... -gt ... ]` and $(( )) can consume safely.
  if [[ "${raw}" =~ ^-?[0-9]{1,18}$ ]]; then
    echo "${raw}"
  else
    echo 0
  fi
}
# Check that the offsite env file exists with private permissions.
# Globals:  OFFSITE_ENV_FILE, OFFSITE_PROVIDER, REQUIRE_CONFIGURED (read)
# Outputs:  one OK/WARN/BLOCKED line via ok/warning/block/warn_or_block
# - file present with mode 600      -> OK
# - file present with another mode  -> BLOCKED
# - file absent, provider is b2     -> WARN, or BLOCKED when REQUIRE_CONFIGURED=1
# - file absent, any other provider -> WARN
check_offsite_env() {
  # Kept local so it cannot leak into global scope next to the unrelated MODE.
  local env_mode
  load_offsite_env
  if [ -f "${OFFSITE_ENV_FILE}" ]; then
    env_mode="$(file_mode "${OFFSITE_ENV_FILE}")"
    if [ "${env_mode}" = "600" ]; then
      ok "offsite.env exists with private mode 0600"
    else
      block "offsite.env mode must be 0600; current mode=${env_mode}"
    fi
  elif [ "${OFFSITE_PROVIDER}" = "b2" ]; then
    warn_or_block "${REQUIRE_CONFIGURED}" "offsite.env missing; B2 provider not configured yet"
  else
    warning "offsite.env missing; Google Drive/rclone 可先用 rclone config 建 remote再用 configure-offsite-rclone.sh 寫入非 secret 設定"
  fi
}
# Check that the tooling and settings needed for an offsite sync are in place:
# the rclone binary, the selected provider's configuration (B2 settings or an
# rclone remote), and an executable sync controller script.
# Globals:  OFFSITE_PROVIDER, OFFSITE_RCLONE_REMOTE, REQUIRE_CONFIGURED,
#           SYNC_SCRIPT, B2_ACCOUNT_ID, B2_APPLICATION_KEY, B2_BUCKET (read)
# Outputs:  OK/WARN/BLOCKED lines; secret values are tested but never printed.
check_configured() {
  local have_rclone=0
  local b2_ready=0
  local remotes=""

  load_offsite_env

  if command -v rclone >/dev/null 2>&1; then
    have_rclone=1
    ok "rclone command is available"
  else
    warn_or_block "${REQUIRE_CONFIGURED}" "rclone command is missing"
  fi

  if [ "${OFFSITE_PROVIDER}" = "b2" ]; then
    if configured_secret "${B2_ACCOUNT_ID:-}" \
      && configured_secret "${B2_APPLICATION_KEY:-}" \
      && configured_secret "${B2_BUCKET:-}"; then
      b2_ready=1
    fi
    if [ "${b2_ready}" = "1" ]; then
      ok "B2 account/application key/bucket are configured without exposing values"
    else
      warn_or_block "${REQUIRE_CONFIGURED}" "B2 account/application key/bucket not fully configured"
    fi
  # Capture the remote list before matching it. Piping rclone straight into
  # `grep -q` under `set -o pipefail` can report a false negative: grep exits
  # on the first match, rclone may then die of SIGPIPE, and the pipeline fails
  # even though the remote exists. A failing `rclone listremotes` still counts
  # as "not configured".
  elif [ "${have_rclone}" = "1" ] \
    && remotes="$(rclone listremotes 2>/dev/null)" \
    && grep -Fxq -- "${OFFSITE_RCLONE_REMOTE}:" <<<"${remotes}"; then
    ok "rclone remote is configured without exposing tokens: ${OFFSITE_RCLONE_REMOTE}:"
  else
    warn_or_block "${REQUIRE_CONFIGURED}" "Google Drive/rclone remote not configured: ${OFFSITE_RCLONE_REMOTE}:"
  fi

  if [ -x "${SYNC_SCRIPT}" ]; then
    ok "offsite sync controller is executable: ${SYNC_SCRIPT}"
  else
    block "offsite sync controller missing or not executable: ${SYNC_SCRIPT}"
  fi
}
# Check that every repo named in the list has a `data` directory under
# BACKUP_BASE.
# Arguments: $1 - whitespace-separated repo names
# Outputs:   one OK or BLOCKED line per repo
# Returns:   0 when every repo is present, 1 when at least one is missing.
#            Callers running under `set -e` must handle the non-zero status.
check_local_repos() {
  local repos="$1"
  local repo # declared local so the loop variable does not leak globally
  local missing=0
  # Intentional word splitting of the list.
  # shellcheck disable=SC2086
  for repo in ${repos}; do
    if [ -d "${BACKUP_BASE}/${repo}/data" ]; then
      ok "local restic repo exists: ${repo}"
    else
      block "local restic repo missing or uninitialized: ${BACKUP_BASE}/${repo}"
      missing=$((missing + 1))
    fi
  done
  [ "${missing}" -eq 0 ]
}
# Report on the "last success" markers written after offsite syncs.
# For both the full and the partial marker, the rclone marker is consulted
# first and the b2 marker second; the first one with a positive timestamp wins.
# Globals:  OFFSITE_DIR (read)
# Outputs:  OK/WARN lines. Nothing here is ever BLOCKED.
# - full marker: OK when at most 48h old, WARN when older, missing, or dated
#   in the future
# - partial marker: OK when present, WARN when missing or dated in the future
check_offsite_marker() {
  local now
  local ts=0
  local age
  local provider
  # A marker dated later than "now" cannot prove a past success. Without this
  # guard its negative age would pass the freshness test indefinitely (for
  # example a timestamp written in milliseconds). A few minutes of clock
  # adjustment are tolerated.
  local max_future_skew=300

  now="$(date +%s)"

  for provider in rclone b2; do
    ts="$(marker_timestamp "${OFFSITE_DIR}/${provider}-last-success")"
    if [ "${ts}" -gt 0 ]; then
      break
    fi
  done
  if [ "${ts}" -gt 0 ]; then
    age=$((now - ts))
    if [ "${age}" -lt $((-max_future_skew)) ]; then
      warning "full offsite success marker timestamp is in the future provider=${provider} age=${age}s"
    elif [ "${age}" -le $((48 * 3600)) ]; then
      ok "full offsite success marker is fresh provider=${provider} age=${age}s"
    else
      warning "full offsite success marker stale provider=${provider} age=${age}s"
    fi
  else
    warning "full offsite success marker missing; full remote copy has not been proven"
  fi

  for provider in rclone b2; do
    ts="$(marker_timestamp "${OFFSITE_DIR}/${provider}-partial-last-success")"
    if [ "${ts}" -gt 0 ]; then
      break
    fi
  done
  if [ "${ts}" -gt 0 ]; then
    age=$((now - ts))
    if [ "${age}" -lt $((-max_future_skew)) ]; then
      warning "partial offsite marker timestamp is in the future provider=${provider} age=${age}s"
    else
      ok "partial offsite marker exists provider=${provider} age=${age}s"
    fi
  else
    warning "partial offsite marker missing; small-repo sync has not been proven"
  fi
}
# Check the "<item>.last_verified" evidence markers for each credential escrow
# item under ESCROW_DIR.
# Globals:  ESCROW_DIR, REQUIRE_ESCROW (read)
# Outputs:  per item, OK when the marker is at most 744h (31 days) old;
#           otherwise WARN, or BLOCKED when REQUIRE_ESCROW=1, for a marker that
#           is stale, missing, or dated in the future.
check_escrow_markers() {
  local now
  local item
  local path
  local ts
  local age
  # A marker dated later than "now" cannot be evidence of a past verification.
  # Without this guard its negative age would count as fresh indefinitely. A
  # few minutes of clock adjustment are tolerated.
  local max_future_skew=300

  now="$(date +%s)"
  for item in restic_repository_password offsite_provider_credentials break_glass_admin_credentials dns_registrar_recovery oauth_ai_provider_recovery; do
    path="${ESCROW_DIR}/${item}.last_verified"
    ts="$(marker_timestamp "${path}")"
    if [ "${ts}" -gt 0 ]; then
      age=$((now - ts))
      if [ "${age}" -lt $((-max_future_skew)) ]; then
        warn_or_block "${REQUIRE_ESCROW}" "credential escrow marker timestamp is in the future: ${item} age=${age}s"
      elif [ "${age}" -le $((744 * 3600)) ]; then
        ok "credential escrow marker fresh: ${item}"
      else
        warn_or_block "${REQUIRE_ESCROW}" "credential escrow marker stale: ${item} age=${age}s"
      fi
    else
      warn_or_block "${REQUIRE_ESCROW}" "credential escrow marker missing: ${item}"
    fi
  done
}
# Check that the host is idle enough to review a full offsite sync: the 5-minute
# load average divided by the number of CPU cores must not exceed 0.7.
# Arguments: $1 - load average file (optional, default /proc/loadavg)
#            $2 - cpuinfo file      (optional, default /proc/cpuinfo)
# Outputs:   a "LOAD5 ... CORES ... LOAD5_PER_CORE ..." line, then OK or
#            BLOCKED; WARN when the load average file is not readable.
check_load_for_full_sync() {
  local loadavg_file="${1:-/proc/loadavg}"
  local cpuinfo_file="${2:-/proc/cpuinfo}"
  local rc=0
  if [ -r "${loadavg_file}" ]; then
    # `|| rc=$?` is required: awk signals "too busy" with exit status 42, and a
    # bare non-zero status under `set -e` would end the whole script before the
    # BLOCKED line and the summary are printed.
    awk -v cpuinfo="${cpuinfo_file}" '
      {
        load5 = $2
        cores = 0
        # Count "processor" entries; fall back to 1 core when none are found.
        while ((getline line < cpuinfo) > 0) {
          if (line ~ /^processor/) cores++
        }
        if (cores < 1) cores = 1
        ratio = load5 / cores
        printf "LOAD5 %.4f CORES %d LOAD5_PER_CORE %.6f\n", load5, cores, ratio
        if (ratio > 0.7) exit 42
      }
    ' "${loadavg_file}" || rc=$?
    if [ "${rc}" -eq 0 ]; then
      ok "host load is low enough for pre-full-sync review"
    else
      block "host load too high for full offsite sync review"
    fi
  else
    warning "load check skipped; /proc/loadavg unavailable"
  fi
}
# Print the `ps` lines (pid and full command line) of processes whose command
# line contains one of the known /backup/scripts/backup-<name>.sh scripts.
# The line whose pid equals this shell's own pid ($$) is always skipped.
active_backup_processes() {
  local own_pid="$$"
  ps -eo pid=,args= \
    | awk -v self="${own_pid}" '
        $1 != self && /\/backup\/scripts\/backup-(all|awoooi|awoooi-frequent|gitea|harbor|momo|langfuse|monitoring|signoz|open-webui|clawbot|sentry|ai-artifacts|public-routes|configs)\.sh/
      '
}
# Print the number of minutes until the next scheduled local backup.
# Globals:   OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES (read) - space-separated
#            start times, each in minutes after midnight
# Arguments: $1 - current time in minutes after midnight (optional; defaults
#                 to the local wall clock)
# Outputs:   an integer from 1 to 1440. A slot equal to "now" counts as the
#            next day's run (1440); an empty schedule also yields 1440.
minutes_until_next_backup_schedule() {
  local now="${1:-}"
  local clock
  local sched
  local delta
  local best=1440

  if [ -z "${now}" ]; then
    # Read hour and minute from one `date` call. Two separate calls can
    # straddle an hour boundary (hour read at 13:59:59, minute at 14:00:00)
    # and put "now" roughly an hour off.
    clock="$(date +%H:%M)"
    # 10# forces base 10 so "08" and "09" are not rejected as octal.
    now=$((10#${clock%%:*} * 60 + 10#${clock##*:}))
  fi

  # Intentional word splitting of the schedule list.
  # shellcheck disable=SC2086
  for sched in ${OFFSITE_SYNC_BACKUP_SCHEDULE_MINUTES}; do
    # 10# also keeps entries written with leading zeros (e.g. 0480) decimal.
    delta=$((10#${sched} - now))
    if [ "${delta}" -le 0 ]; then
      delta=$((delta + 1440))
    fi
    if [ "${delta}" -lt "${best}" ]; then
      best="${delta}"
    fi
  done
  echo "${best}"
}
# Check that a full offsite sync would not collide with local backups: no
# backup script may be running now, and the time until the next scheduled
# backup must be at least OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES.
# Outputs: OK/BLOCKED lines; the matching process lines are listed after the
#          BLOCKED line when a backup is running.
check_full_sync_runway() {
  local running
  local minutes_left

  running="$(active_backup_processes || true)"
  if [ -z "${running}" ]; then
    ok "no active local backup process detected"
  else
    block "active backup process detected; full offsite sync must not overlap local backups"
    printf '%s\n' "${running}"
  fi

  minutes_left="$(minutes_until_next_backup_schedule)"
  if ! [ "${minutes_left}" -lt "${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}" ]; then
    ok "enough runway before next backup schedule: ${minutes_left}m >= ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}m"
  else
    block "not enough runway before next backup schedule: ${minutes_left}m < ${OFFSITE_SYNC_FULL_MIN_RUNWAY_MINUTES}m"
  fi
}
# Run the sync controller in dry-run mode for the repos in SMALL_REPOS and
# report the outcome. Reports BLOCKED without running anything when the
# controller script is missing or not executable.
run_small_dry_run() {
  if [ -x "${SYNC_SCRIPT}" ]; then
    printf '\n%s\n' "== small repo rclone dry-run =="
    if ! "${SYNC_SCRIPT}" --mode dry-run --repos "${SMALL_REPOS}"; then
      block "small repo offsite dry-run failed: ${SMALL_REPOS}"
    else
      ok "small repo offsite dry-run passed: ${SMALL_REPOS}"
    fi
  else
    block "cannot run dry-run; sync controller missing"
  fi
}
# ---------------------------------------------------------------------------
# Main flow: print a header, run each group of checks for the selected MODE,
# then print the summary. Exit status is 1 when anything was BLOCKED and 0
# otherwise (including READY_WITH_WARNINGS).
# ---------------------------------------------------------------------------
echo "AWOOOI offsite backup readiness gate"
date
echo "BACKUP_BASE=${BACKUP_BASE}"
echo "OFFSITE_ENV_FILE=${OFFSITE_ENV_FILE}"
echo "MODE=${MODE}"

echo
echo "== config =="
check_offsite_env
check_configured

echo
echo "== local repos =="
# check_local_repos returns non-zero when a repo is missing. Each missing repo
# has already been counted through block(), so the status is discarded here:
# left bare, `set -e` would end the script at this point and skip the marker
# checks, the pre-full-sync checks and the summary.
if [ "${MODE}" = "pre-full-sync" ]; then
  echo "EXPECTED_REPO_COUNT=$(repo_count "${EXPECTED_REPOS}")"
  check_local_repos "${EXPECTED_REPOS}" || true
else
  check_local_repos "${SMALL_REPOS}" || true
fi

echo
echo "== markers =="
check_offsite_marker
check_escrow_markers

if [ "${MODE}" = "pre-full-sync" ]; then
  echo
  echo "== pre-full-sync safety =="
  check_load_for_full_sync
  check_full_sync_runway
fi

if [ "${MODE}" = "dry-run-small" ]; then
  run_small_dry_run
fi

echo
echo "== summary =="
echo "PASS=${pass} WARN=${warn} BLOCKED=${blocked_count}"
if [ "${blocked_count}" -gt 0 ]; then
  echo "Result: BLOCKED. Do not run offsite sync until blocked items are fixed."
  exit 1
fi
if [ "${warn}" -gt 0 ]; then
  echo "Result: READY_WITH_WARNINGS. Local backups are checkable, but offsite/escrow proof is incomplete."
  exit 0
fi
echo "Result: READY. Offsite and credential escrow readiness checks are green."