Files
awoooi/scripts/backup/verify-offsite-full-sync.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

297 lines
11 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# WOOO AIOps - Offsite full sync verifier
# 2026-05-19 ogt + Codex: full sync 後驗證 Google Drive/rclone 遠端仍符合
# latest-only,13 個 repo 都可列出,且 snapshots/ 只保留 1 份。
#
# 規則:
# - 只讀 Google Drive/rclone remote,不讀、不輸出 token 或 rclone.conf。
# - 預設印出人可讀報告;--write-textfile 會寫 node-exporter 指標。
# - full marker 未 fresh 時可執行,但結果會標示 verify_ok=0。
# =============================================================================
# Strict mode: exit on command failure (-e), on use of unset variables (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Load shared settings/helpers from the common.sh that sits next to this script.
# BACKUP_BASE (used below) is not assigned in this file, so it is presumably
# provided by common.sh or the environment — TODO confirm.
source "$(dirname "$0")/common.sh"
# Service identifier. It is not referenced again in this script; presumably it
# is consumed by helpers from common.sh — verify.
SERVICE="offsite-full-sync-verify"
# Offsite provider. main only supports "rclone" and reports anything else as BLOCKED.
PROVIDER="${OFFSITE_PROVIDER:-rclone}"
# Name of the rclone remote, without the trailing colon.
RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
# Remote root that holds one directory per repo; defaults to
# "<remote>:awoooi-backups/restic" unless OFFSITE_REMOTE_ROOT is set.
OFFSITE_REMOTE_ROOT_VALUE="${OFFSITE_REMOTE_ROOT:-${RCLONE_REMOTE}:awoooi-backups/restic}"
# Local directory containing the "<provider>-last-success" and
# "<provider>-full-verify-last-success" marker files.
OFFSITE_DIR="${BACKUP_BASE}/offsite"
# node-exporter textfile directory and the metrics file written by --write-textfile.
TEXTFILE_DIR="${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}"
TEXTFILE_PATH="${TEXTFILE_DIR}/offsite_full_sync_verify.prom"
# Value of the host="..." label attached to every emitted metric.
HOST_LABEL="${AIOPS_HOST_LABEL:-110}"
# Space-separated list of repo names to verify (13 names by default).
EXPECTED_REPOS="${OFFSITE_REPOS:-awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes}"
# Maximum marker age, in hours, that still counts as fresh.
MAX_AGE_HOURS="${OFFSITE_FULL_VERIFY_MAX_AGE_HOURS:-48}"
# Flag defaults; set to 1 by --write-textfile and --no-color respectively.
WRITE_TEXTFILE=0
NO_COLOR=0
# Print the command-line help (synopsis, the checks performed, and the
# no-secrets guarantee) to stdout, one line per argument.
usage() {
  printf '%s\n' \
    'Usage:' \
    'verify-offsite-full-sync.sh [--write-textfile] [--no-color]' \
    'Checks:' \
    '- Google Drive/rclone remote exists.' \
    '- /backup/offsite/rclone-last-success is fresh.' \
    '- Every expected remote restic repo has exactly one snapshots/ entry.' \
    'This script never prints OAuth tokens, rclone.conf, restic passwords, or provider secrets.'
}
# Consume the command-line flags from left to right.
# Recognised flags only flip their variable; -h/--help prints the usage text
# and exits 0; anything else is reported on stderr with the usage text and
# exits 2. Every accepted flag is shifted away, so no positional parameters
# remain once the loop finishes.
while (($# > 0)); do
  case "$1" in
    --write-textfile) WRITE_TEXTFILE=1 ;;
    --no-color) NO_COLOR=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done
# Status-label colours. Start with colour disabled and switch on the ANSI
# escape sequences (green / yellow / red / reset) unless --no-color set NO_COLOR=1.
green=""
yellow=""
red=""
reset=""
if [ "${NO_COLOR}" != "1" ]; then
  green=$'\033[32m'
  yellow=$'\033[33m'
  red=$'\033[31m'
  reset=$'\033[0m'
fi
# Escape a string for use as a Prometheus label value.
# The text exposition format requires backslash, double quote and line feed
# to be written as \\, \" and \n inside a label value.
# Arguments: $1 - raw label value
# Outputs:   the escaped value on stdout, without a trailing newline
label_escape() {
  local value="$1"
  # Backslashes must be doubled first so the escapes added below are not re-escaped.
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  # A raw newline would split the sample line and corrupt the whole textfile.
  value=${value//$'\n'/\\n}
  printf '%s' "${value}"
}
# Write the configured remote root (OFFSITE_REMOTE_ROOT_VALUE) to stdout
# without a trailing newline.
remote_root() {
  local root="${OFFSITE_REMOTE_ROOT_VALUE}"
  printf '%s' "${root}"
}
# Build the remote path of one repo: "<remote root>/<repo>".
# Arguments: $1 - repo name
# Outputs:   the joined path on stdout, without a trailing newline
remote_repo_path() {
  local root
  root="$(remote_root)"
  printf '%s/%s' "${root}" "$1"
}
# Read the Unix timestamp stored in a marker file.
# The first line starting with "timestamp=" supplies the value.
# Arguments: $1 - path to the marker file
# Outputs:   exactly one line on stdout: the timestamp as a non-negative
#            integer, or 0 when the file is missing, unreadable, has no
#            timestamp line, or the value is not a plain non-negative integer.
marker_timestamp() {
  local path="$1"
  local ts=""
  if [ -f "${path}" ] && [ -r "${path}" ]; then
    # Capture the output instead of streaming it: if awk prints a value and
    # then fails, a trailing "|| echo 0" would emit a second line and the
    # caller's integer comparisons would break on the two-line result.
    ts="$(awk -F= '/^timestamp=/ {print int($2); exit}' "${path}" 2>/dev/null)" || ts=""
  fi
  # Anything that is not purely digits (empty, negative, exponent notation)
  # is reported as "no timestamp".
  case "${ts}" in
    '' | *[!0-9]*) ts=0 ;;
  esac
  printf '%s\n' "${ts}"
}
# Print how many whitespace-separated repo names EXPECTED_REPOS contains.
repo_count() {
  local -a repos
  # Intentionally unquoted so the list is word-split exactly as the
  # verification loop splits it.
  # shellcheck disable=SC2206
  repos=(${EXPECTED_REPOS})
  echo "${#repos[@]}"
}
# Run a command at reduced scheduling priority.
# Always wraps the command in `nice -n 10`; when ionice is installed it is
# additionally wrapped in `ionice -c2 -n7`.
# Arguments: the command and its arguments
# Returns:   the exit status of the wrapped command line
low_priority() {
  if ! command -v ionice >/dev/null 2>&1; then
    nice -n 10 "$@"
    return
  fi
  ionice -c2 -n7 nice -n 10 "$@"
}
# Check that rclone is installed and that the configured remote is defined.
# Globals:  RCLONE_REMOTE (read)
# Returns:  0 when `rclone listremotes` succeeds and lists "<RCLONE_REMOTE>:"
#           as an exact line; non-zero otherwise.
rclone_ready() {
  local remotes
  command -v rclone >/dev/null 2>&1 || return 1
  # Capture the listing before matching. Piping straight into `grep -q` under
  # `set -o pipefail` can fail spuriously: grep exits on the first match, the
  # producer may then die from SIGPIPE, and the pipeline reports that failure.
  remotes="$(rclone listremotes 2>/dev/null)" || return 1
  grep -Fxq -- "${RCLONE_REMOTE}:" <<<"${remotes}"
}
# Count the files directly inside "<remote repo path>/snapshots".
# The listing is fetched with `rclone lsf` (files only, depth 1) under a
# 60 second timeout at low priority; rclone's stderr is discarded.
# Arguments: $1 - repo name
# Outputs:   the number of non-blank listing lines, or -1 when the listing fails
# Returns:   0 on success, 1 when the listing command fails
count_remote_snapshots() {
  local snapshots_dir listing entry
  local total=0
  snapshots_dir="$(remote_repo_path "$1")/snapshots"
  listing="$(low_priority timeout 60s rclone lsf "${snapshots_dir}" --files-only --max-depth 1 2>/dev/null)" || {
    echo -1
    return 1
  }
  # Blank or whitespace-only lines are not snapshot entries; an empty listing
  # therefore counts as 0.
  while IFS= read -r entry; do
    if [[ ! "${entry}" =~ ^[[:space:]]*$ ]]; then
      total=$((total + 1))
    fi
  done <<<"${listing}"
  echo "${total}"
}
# Print the "# HELP" / "# TYPE ... gauge" metadata pair for one metric.
# Arguments: $1 - metric name, $2 - help text
_textfile_gauge_meta() {
  printf '# HELP %s %s\n# TYPE %s gauge\n' "$1" "$2" "$1"
}

# Print the metadata pair followed by a single sample of a gauge metric.
# Arguments: $1 - metric name, $2 - help text,
#            $3 - label set (already escaped, without braces), $4 - value
_textfile_gauge() {
  _textfile_gauge_meta "$1" "$2"
  printf '%s{%s} %s\n' "$1" "$3" "$4"
}

# Write the verification result as node-exporter textfile metrics.
# Globals:   HOST_LABEL, PROVIDER, MAX_AGE_HOURS, TEXTFILE_DIR, TEXTFILE_PATH (read)
# Arguments: $1 now            - Unix time of this run
#            $2 full_ts        - timestamp from the full offsite success marker
#            $3 full_age       - age of that marker in seconds
#            $4 full_fresh     - 1 when that marker is fresh, else 0
#            $5 verify_ok      - 1 when this run verified successfully, else 0
#            $6 failed         - 1 when this run failed, else 0
#            $7 success_ts     - timestamp of the last successful verification
#            $8 success_age    - age of the last successful verification in seconds
#            $9 success_fresh  - 1 when the last successful verification is fresh
#            $10.. rows        - zero or more "repo|count|ok" records
# Outputs:   replaces TEXTFILE_PATH (mode 0644); writes nothing to stdout
# Returns:   0 on success, 1 when the file could not be written or published
write_textfile() {
  local now="$1"
  local full_ts="$2"
  local full_age="$3"
  local full_fresh="$4"
  local verify_ok="$5"
  local failed="$6"
  local success_ts="$7"
  local success_age="$8"
  local success_fresh="$9"
  shift 9
  local host provider base aged tmp
  # Declared local so parsing the rows cannot overwrite same-named variables
  # of the caller.
  local row repo count ok
  host="$(label_escape "${HOST_LABEL}")"
  provider="$(label_escape "${PROVIDER}")"
  base="host=\"${host}\",provider=\"${provider}\""
  aged="${base},max_age_hours=\"${MAX_AGE_HOURS}\""
  install -d -m 755 "${TEXTFILE_DIR}"
  tmp="$(mktemp "${TEXTFILE_PATH}.tmp.XXXXXX")" || return 1
  # Each metric's HELP/TYPE lines are emitted directly before its samples so
  # that every metric forms one contiguous group in the exposition output.
  if ! {
    _textfile_gauge "awoooi_backup_offsite_full_verify_last_run_timestamp" \
      "Unix timestamp of the last full offsite verification run." "${base}" "${now}"
    _textfile_gauge "awoooi_backup_offsite_full_verify_last_success_timestamp" \
      "Unix timestamp of the last successful full offsite verification run." "${base}" "${success_ts}"
    _textfile_gauge "awoooi_backup_offsite_full_verify_age_seconds" \
      "Age of the last successful full offsite verification run." "${aged}" "${success_age}"
    _textfile_gauge "awoooi_backup_offsite_full_verify_fresh" \
      "Whether the last successful full offsite verification is within max_age_hours." "${aged}" "${success_fresh}"
    _textfile_gauge "awoooi_backup_offsite_full_verify_last_run_failed" \
      "Whether the latest full offsite verification run failed." "${base}" "${failed}"
    _textfile_gauge "awoooi_backup_offsite_remote_verify_ok" \
      "Whether full offsite remote state currently matches latest-only expectations." "${base}" "${verify_ok}"
    _textfile_gauge "awoooi_backup_offsite_full_marker_fresh" \
      "Whether the full offsite success marker is fresh." "${aged}" "${full_fresh}"
    _textfile_gauge "awoooi_backup_offsite_full_marker_timestamp" \
      "Unix timestamp recorded in the full offsite success marker." "${base}" "${full_ts}"
    _textfile_gauge "awoooi_backup_offsite_full_marker_age_seconds" \
      "Age in seconds of the full offsite success marker." "${aged}" "${full_age}"
    _textfile_gauge_meta "awoooi_backup_offsite_remote_snapshot_count" \
      "Count of remote restic snapshots for each repo."
    for row in "$@"; do
      IFS='|' read -r repo count ok <<<"${row}"
      printf '%s{%s,repo="%s"} %s\n' "awoooi_backup_offsite_remote_snapshot_count" \
        "${base}" "$(label_escape "${repo}")" "${count}"
    done
    _textfile_gauge_meta "awoooi_backup_offsite_remote_snapshot_latest_only" \
      "Whether the remote repo has exactly one snapshot."
    for row in "$@"; do
      IFS='|' read -r repo count ok <<<"${row}"
      printf '%s{%s,repo="%s"} %s\n' "awoooi_backup_offsite_remote_snapshot_latest_only" \
        "${base}" "$(label_escape "${repo}")" "${ok}"
    done
  } >"${tmp}"; then
    rm -f -- "${tmp}"
    return 1
  fi
  # mktemp creates the file with mode 0600. Fix the mode BEFORE the rename so
  # the published path never exists in a state the scraper cannot read.
  if ! chmod 0644 "${tmp}" || ! mv -f -- "${tmp}" "${TEXTFILE_PATH}"; then
    rm -f -- "${tmp}"
    return 1
  fi
}
# Run the full offsite verification and print a human-readable report.
# Steps, in order:
#   1. Read the "<provider>-last-success" marker and decide whether it is
#      within MAX_AGE_HOURS.
#   2. Check the provider (only "rclone" is supported) and the rclone remote.
#   3. Count the remote snapshots of every repo in EXPECTED_REPOS; each repo
#      must have exactly one.
#   4. When nothing failed, write the "<provider>-full-verify-last-success" marker.
#   5. Optionally write node-exporter metrics (WRITE_TEXTFILE=1).
# Returns: 0 when every check passed, 1 otherwise.
main() {
  local now full_ts success_ts
  local full_age=0 success_age=0
  local full_fresh=0 success_fresh=0
  local failed=0 latest_only_ok=1 verify_ok=0
  local repo count ok
  local full_marker="${OFFSITE_DIR}/${PROVIDER}-last-success"
  local success_marker="${OFFSITE_DIR}/${PROVIDER}-full-verify-last-success"
  local rows=()

  now="$(date +%s)"
  full_ts="$(marker_timestamp "${full_marker}")"
  # A timestamp of 0 means "no marker"; the age then stays 0 and the marker
  # is treated as not fresh.
  if [ "${full_ts}" -gt 0 ]; then
    full_age=$((now - full_ts))
    if [ "${full_age}" -le $((MAX_AGE_HOURS * 3600)) ]; then
      full_fresh=1
    fi
  fi

  printf '%s\n' "AWOOOI offsite full sync verifier"
  date
  printf 'PROVIDER=%s\n' "${PROVIDER}"
  printf 'REMOTE_ROOT=%s\n' "$(remote_root)"
  printf 'EXPECTED_REPO_COUNT=%s\n' "$(repo_count)"
  printf 'WRITE_TEXTFILE=%s\n\n' "${WRITE_TEXTFILE}"

  if [ "${PROVIDER}" = "rclone" ]; then
    if rclone_ready; then
      printf "%sOK%s rclone remote configured: %s:\n" "${green}" "${reset}" "${RCLONE_REMOTE}"
    else
      printf "%sBLOCKED%s rclone remote unavailable: %s:\n" "${red}" "${reset}" "${RCLONE_REMOTE}"
      failed=1
    fi
  else
    printf "%sBLOCKED%s unsupported provider for remote snapshot verification: %s\n" "${red}" "${reset}" "${PROVIDER}"
    failed=1
  fi

  if [ "${full_fresh}" = "1" ]; then
    printf "%sOK%s full offsite marker fresh age=%ss\n" "${green}" "${reset}" "${full_age}"
  else
    printf "%sWARN%s full offsite marker missing or stale age=%ss\n" "${yellow}" "${reset}" "${full_age}"
    failed=1
  fi

  printf '\n%s\n' "== remote snapshot counts =="
  # The repos are checked even when an earlier step already failed, so the
  # report and the metrics always contain one row per repo.
  for repo in ${EXPECTED_REPOS}; do
    # "|| true" keeps the -1 printed on a listing failure without aborting
    # the script under set -e.
    count="$(count_remote_snapshots "${repo}" || true)"
    if [ "${count}" = "1" ]; then
      ok=1
      printf "%sOK%s %s remote snapshots=%s\n" "${green}" "${reset}" "${repo}" "${count}"
    else
      ok=0
      latest_only_ok=0
      failed=1
      printf "%sWARN%s %s remote snapshots=%s expected=1\n" "${yellow}" "${reset}" "${repo}" "${count}"
    fi
    rows+=("${repo}|${count}|${ok}")
  done

  if [ "${failed}" -eq 0 ] && [ "${latest_only_ok}" -eq 1 ] && [ "${full_fresh}" = "1" ]; then
    verify_ok=1
    install -d -m 750 "${OFFSITE_DIR}"
    printf 'timestamp=%s\nprovider=%s\nscope=full-verify\nrepos=%s\n' \
      "${now}" "${PROVIDER}" "${EXPECTED_REPOS}" >"${success_marker}"
  fi

  # Re-read the success marker: after a failed run this reports the age of
  # the previous successful verification, if there was one.
  success_ts="$(marker_timestamp "${success_marker}")"
  if [ "${success_ts}" -gt 0 ]; then
    success_age=$((now - success_ts))
    if [ "${success_age}" -le $((MAX_AGE_HOURS * 3600)) ]; then
      success_fresh=1
    fi
  fi

  if [ "${WRITE_TEXTFILE}" = "1" ]; then
    write_textfile "${now}" "${full_ts}" "${full_age}" "${full_fresh}" "${verify_ok}" "${failed}" "${success_ts}" "${success_age}" "${success_fresh}" "${rows[@]}"
    printf '\nTEXTFILE_WRITTEN=%s\n' "${TEXTFILE_PATH}"
  fi

  printf '\nREMOTE_LATEST_ONLY_OK=%s\nFULL_MARKER_FRESH=%s\nVERIFY_OK=%s\nFAILED=%s\n' \
    "${latest_only_ok}" "${full_fresh}" "${verify_ok}" "${failed}"
  return "${failed}"
}
# Entry point. main is the last command, so its return status (0 when every
# check passed, 1 otherwise) becomes the script's exit status.
main "$@"