#!/bin/bash
# =============================================================================
# WOOO AIOps - Offsite full sync verifier
# 2026-05-19 ogt + Codex: after a full sync, verify that the Google Drive/rclone
# remote still conforms to latest-only: all 13 repos can be listed, and each
# snapshots/ directory keeps exactly 1 entry.
#
# Rules:
# - Only reads the Google Drive/rclone remote; never reads or prints tokens or
#   rclone.conf.
# - Prints a human-readable report by default; --write-textfile also writes
#   node-exporter metrics.
# - May run while the full marker is not fresh, but the result is then reported
#   as verify_ok=0.
# =============================================================================

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Load the helper file that sits next to this script.
# NOTE(review): BACKUP_BASE (used for OFFSITE_DIR below) is not assigned in this
# file; presumably common.sh or the environment provides it -- confirm.
source "$(dirname "$0")/common.sh"

# Name of this job.
SERVICE="offsite-full-sync-verify"

# Offsite provider and the remote location that holds the restic repos.
# Each value can be overridden through the environment variable named in its
# default expansion.
PROVIDER="${OFFSITE_PROVIDER:-rclone}"
RCLONE_REMOTE="${OFFSITE_RCLONE_REMOTE:-gdrive}"
OFFSITE_REMOTE_ROOT_VALUE="${OFFSITE_REMOTE_ROOT:-${RCLONE_REMOTE}:awoooi-backups/restic}"

# Local directory that holds the success marker files.
OFFSITE_DIR="${BACKUP_BASE}/offsite"

# node-exporter textfile output location and the host label put on each metric.
TEXTFILE_DIR="${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}"
TEXTFILE_PATH="${TEXTFILE_DIR}/offsite_full_sync_verify.prom"
HOST_LABEL="${AIOPS_HOST_LABEL:-110}"

# Space-separated list of repos that must each have exactly one remote snapshot.
EXPECTED_REPOS="${OFFSITE_REPOS:-awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes}"

# A marker older than this many hours is treated as stale.
MAX_AGE_HOURS="${OFFSITE_FULL_VERIFY_MAX_AGE_HOURS:-48}"

# Command-line switches; both default to off and are set by the option loop.
WRITE_TEXTFILE=0
NO_COLOR=0

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (callers redirect it when needed)
#######################################
usage() {
  printf '%s\n' \
    'Usage:' \
    'verify-offsite-full-sync.sh [--write-textfile] [--no-color]' \
    '' \
    'Checks:' \
    '- Google Drive/rclone remote exists.' \
    '- /backup/offsite/rclone-last-success is fresh.' \
    '- Every expected remote restic repo has exactly one snapshots/ entry.' \
    '' \
    'This script never prints OAuth tokens, rclone.conf, restic passwords, or provider secrets.'
}

# Consume the command-line options one at a time. Recognised flags only set a
# switch; --help prints the usage text and exits 0; anything else is reported
# on stderr together with the usage text and the script exits with status 2.
while (($# > 0)); do
  case "$1" in
    --write-textfile) WRITE_TEXTFILE=1 ;;
    --no-color) NO_COLOR=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
  # Only the two flag arms fall through to here; the other arms exit.
  shift
done

# Colour escape sequences used by the report. With --no-color (NO_COLOR=1) all
# four are empty strings so the printf calls emit plain text.
case "${NO_COLOR}" in
  1)
    green=''
    yellow=''
    red=''
    reset=''
    ;;
  *)
    green=$'\033[32m'
    yellow=$'\033[33m'
    red=$'\033[31m'
    reset=$'\033[0m'
    ;;
esac

#######################################
# Escape a string for use as a Prometheus label value.
# The text exposition format requires backslash, double quote and line feed
# inside a label value to be written as \\, \" and \n respectively.
# Arguments: $1 - raw label value
# Outputs:   the escaped value on stdout, without a trailing newline
#######################################
label_escape() {
  local value="$1"

  # Backslashes first, so the escapes added below are not escaped again.
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  # A raw newline would split the sample line and corrupt the whole textfile.
  value=${value//$'\n'/\\n}

  printf '%s' "${value}"
}

#######################################
# Print the configured remote root under which the repos live.
# Globals:   OFFSITE_REMOTE_ROOT_VALUE (read)
# Outputs:   the root on stdout, without a trailing newline
#######################################
remote_root() {
  local -r root="${OFFSITE_REMOTE_ROOT_VALUE}"
  printf '%s' "${root}"
}

#######################################
# Print the remote path of one repo: "<remote root>/<repo>".
# Arguments: $1 - repo name
# Outputs:   the joined path on stdout, without a trailing newline
#######################################
remote_repo_path() {
  printf '%s/%s' "$(remote_root)" "${1}"
}

#######################################
# Read the integer from the first "timestamp=" line of a marker file.
# Arguments: $1 - path of the marker file
# Outputs:   exactly one line on stdout: the timestamp, or 0 when the file is
#            missing, has no timestamp line, cannot be read, or the value is
#            not a non-negative integer
# Returns:   0
#######################################
marker_timestamp() {
  local path="$1"
  local value

  if [[ ! -f "${path}" ]]; then
    echo 0
    return 0
  fi

  # Capture awk's output rather than chaining "|| echo 0" after it: an awk that
  # prints a fallback and then exits non-zero would otherwise yield two lines,
  # which breaks the integer comparisons done by the caller.
  value="$(awk -F= '/^timestamp=/ {print int($2); exit}' "${path}" 2>/dev/null)" || value=0

  # Anything that is not a plain run of digits (empty, negative, multi-line)
  # is reported as "no timestamp".
  [[ "${value}" =~ ^[0-9]+$ ]] || value=0

  printf '%s\n' "${value}"
}

#######################################
# Print how many repo names EXPECTED_REPOS contains.
# Globals:   EXPECTED_REPOS (read) - whitespace-separated repo names
# Outputs:   the number of names on stdout
#######################################
repo_count() {
  # Unquoted on purpose: the list is split into words exactly as a for-loop
  # over it would be. Only this function's own positional parameters change.
  # shellcheck disable=SC2086
  set -- ${EXPECTED_REPOS}
  echo "$#"
}

#######################################
# Run a command at reduced CPU priority, and at reduced I/O priority too when
# ionice is installed.
# Arguments: the command and its arguments
# Returns:   the exit status of the wrapped command line
#######################################
low_priority() {
  local -a wrapper=(nice -n 10)

  if command -v ionice >/dev/null 2>&1; then
    wrapper=(ionice -c2 -n7 "${wrapper[@]}")
  fi

  "${wrapper[@]}" "$@"
}

#######################################
# Check that rclone is installed and that the configured remote is defined.
# Globals:   RCLONE_REMOTE (read)
# Outputs:   nothing (rclone's stderr is discarded)
# Returns:   0 when "rclone listremotes" succeeds and lists "<RCLONE_REMOTE>:",
#            non-zero otherwise
#######################################
rclone_ready() {
  local remotes

  command -v rclone >/dev/null 2>&1 || return 1

  # Read the whole list before matching. Piping straight into "grep -q" lets
  # grep exit on the first hit; if rclone is then killed by SIGPIPE, pipefail
  # turns a successful match into a failure.
  remotes="$(rclone listremotes 2>/dev/null)" || return 1

  grep -Fxq -- "${RCLONE_REMOTE}:" <<<"${remotes}"
}

#######################################
# Count the files directly inside a repo's remote snapshots/ directory.
# Arguments: $1 - repo name
# Outputs:   the number of non-blank listing lines on stdout, or -1 when the
#            listing command fails (its stderr is discarded)
# Returns:   0 on success, 1 when the listing fails
#######################################
count_remote_snapshots() {
  local repo="$1"
  local snapshots_dir
  local listing
  local line
  local total=0

  snapshots_dir="$(remote_repo_path "${repo}")/snapshots"

  listing="$(low_priority timeout 60s rclone lsf "${snapshots_dir}" --files-only --max-depth 1 2>/dev/null)" || {
    echo -1
    return 1
  }

  # The here-string supplies the listing plus a final newline; empty and
  # whitespace-only lines are not counted.
  while IFS= read -r line; do
    if [[ ! "${line}" =~ ^[[:space:]]*$ ]]; then
      total=$((total + 1))
    fi
  done <<<"${listing}"

  echo "${total}"
}

# Print the "# HELP" and "# TYPE ... gauge" header lines for one metric.
#   $1 - metric name
#   $2 - help text
_textfile_gauge_header() {
  printf '# HELP %s %s\n# TYPE %s gauge\n' "$1" "$2" "$1"
}

# Print one sample line: NAME{LABELS} VALUE.
#   $1 - metric name
#   $2 - label list, already escaped, without the surrounding braces
#   $3 - sample value
_textfile_sample() {
  printf '%s{%s} %s\n' "$1" "$2" "$3"
}

#######################################
# Write the verification result as a node-exporter textfile.
# Globals:   HOST_LABEL, PROVIDER, MAX_AGE_HOURS, TEXTFILE_DIR, TEXTFILE_PATH (read)
# Arguments: $1 - timestamp of this run
#            $2 - timestamp read from the full offsite marker
#            $3 - age of the full offsite marker in seconds
#            $4 - 1 when the full offsite marker is fresh, else 0
#            $5 - 1 when the remote state verified OK, else 0
#            $6 - 1 when this run failed, else 0
#            $7 - timestamp of the last successful verification
#            $8 - age of the last successful verification in seconds
#            $9 - 1 when the last successful verification is fresh, else 0
#            $10... - one "repo|count|ok" row per repo
# Outputs:   replaces TEXTFILE_PATH; prints nothing
# Returns:   0 on success, 1 when the file could not be written or published
#######################################
write_textfile() {
  local now="$1"
  local full_ts="$2"
  local full_age="$3"
  local full_fresh="$4"
  local verify_ok="$5"
  local failed="$6"
  local success_ts="$7"
  local success_age="$8"
  local success_fresh="$9"
  shift 9
  local -a rows=("$@")
  local host provider base_labels aged_labels payload tmp
  local row repo count ok

  host="$(label_escape "${HOST_LABEL}")"
  provider="$(label_escape "${PROVIDER}")"
  base_labels="host=\"${host}\",provider=\"${provider}\""
  aged_labels="${base_labels},max_age_hours=\"${MAX_AGE_HOURS}\""

  # Render everything first so the file is produced by one checked write.
  # Each metric's header is followed directly by all of its samples, which keeps
  # every metric family in a single group as the text format expects.
  payload="$(
    _textfile_gauge_header awoooi_backup_offsite_full_verify_last_run_timestamp \
      "Unix timestamp of the last full offsite verification run."
    _textfile_sample awoooi_backup_offsite_full_verify_last_run_timestamp "${base_labels}" "${now}"

    _textfile_gauge_header awoooi_backup_offsite_full_verify_last_success_timestamp \
      "Unix timestamp of the last successful full offsite verification run."
    _textfile_sample awoooi_backup_offsite_full_verify_last_success_timestamp "${base_labels}" "${success_ts}"

    _textfile_gauge_header awoooi_backup_offsite_full_verify_age_seconds \
      "Age of the last successful full offsite verification run."
    _textfile_sample awoooi_backup_offsite_full_verify_age_seconds "${aged_labels}" "${success_age}"

    _textfile_gauge_header awoooi_backup_offsite_full_verify_fresh \
      "Whether the last successful full offsite verification is within max_age_hours."
    _textfile_sample awoooi_backup_offsite_full_verify_fresh "${aged_labels}" "${success_fresh}"

    _textfile_gauge_header awoooi_backup_offsite_full_verify_last_run_failed \
      "Whether the latest full offsite verification run failed."
    _textfile_sample awoooi_backup_offsite_full_verify_last_run_failed "${base_labels}" "${failed}"

    _textfile_gauge_header awoooi_backup_offsite_remote_verify_ok \
      "Whether full offsite remote state currently matches latest-only expectations."
    _textfile_sample awoooi_backup_offsite_remote_verify_ok "${base_labels}" "${verify_ok}"

    _textfile_gauge_header awoooi_backup_offsite_full_marker_fresh \
      "Whether the full offsite success marker is fresh."
    _textfile_sample awoooi_backup_offsite_full_marker_fresh "${aged_labels}" "${full_fresh}"

    _textfile_gauge_header awoooi_backup_offsite_full_marker_timestamp \
      "Unix timestamp recorded in the full offsite success marker."
    _textfile_sample awoooi_backup_offsite_full_marker_timestamp "${base_labels}" "${full_ts}"

    _textfile_gauge_header awoooi_backup_offsite_full_marker_age_seconds \
      "Age in seconds of the full offsite success marker."
    _textfile_sample awoooi_backup_offsite_full_marker_age_seconds "${aged_labels}" "${full_age}"

    _textfile_gauge_header awoooi_backup_offsite_remote_snapshot_count \
      "Count of remote restic snapshots for each repo."
    for row in ${rows[@]+"${rows[@]}"}; do
      IFS='|' read -r repo count ok <<<"${row}"
      _textfile_sample awoooi_backup_offsite_remote_snapshot_count \
        "${base_labels},repo=\"$(label_escape "${repo}")\"" "${count}"
    done

    _textfile_gauge_header awoooi_backup_offsite_remote_snapshot_latest_only \
      "Whether the remote repo has exactly one snapshot."
    for row in ${rows[@]+"${rows[@]}"}; do
      IFS='|' read -r repo count ok <<<"${row}"
      _textfile_sample awoooi_backup_offsite_remote_snapshot_latest_only \
        "${base_labels},repo=\"$(label_escape "${repo}")\"" "${ok}"
    done
  )"

  install -d -m 755 "${TEXTFILE_DIR}"
  tmp="$(mktemp "${TEXTFILE_PATH}.tmp.XXXXXX")"

  # mktemp creates the file with mode 0600, so widen the mode BEFORE the rename:
  # the published file is then never visible in an unreadable state. On any
  # failure the temporary file is removed instead of being left behind.
  if ! printf '%s\n' "${payload}" >"${tmp}" ||
    ! chmod 0644 "${tmp}" ||
    ! mv -f -- "${tmp}" "${TEXTFILE_PATH}"; then
    rm -f -- "${tmp}"
    return 1
  fi
}

#######################################
# Run the verification and print the report.
# Steps: judge the freshness of the full offsite marker, check that the rclone
# remote is configured, count the remote snapshots of every expected repo,
# record a success marker when everything passed, optionally write the
# node-exporter textfile, and print a summary.
# Globals:   OFFSITE_DIR, PROVIDER, RCLONE_REMOTE, EXPECTED_REPOS, MAX_AGE_HOURS,
#            WRITE_TEXTFILE, TEXTFILE_PATH, green, yellow, red, reset (read)
# Outputs:   the report on stdout; on full success the file
#            "${OFFSITE_DIR}/${PROVIDER}-full-verify-last-success"
# Returns:   0 when every check passed, 1 otherwise
#######################################
main() {
  local now full_ts
  local full_age=0 full_fresh=0
  local success_ts
  local success_age=0 success_fresh=0
  local failed=0 latest_only_ok=1 verify_ok=0
  local repo count ok
  local success_marker="${OFFSITE_DIR}/${PROVIDER}-full-verify-last-success"
  local -a rows=()

  # Freshness of the marker left by the full sync itself.
  now="$(date +%s)"
  full_ts="$(marker_timestamp "${OFFSITE_DIR}/${PROVIDER}-last-success")"
  if ((full_ts > 0)); then
    full_age=$((now - full_ts))
    if ((full_age <= MAX_AGE_HOURS * 3600)); then
      full_fresh=1
    fi
  fi

  printf '%s\n' "AWOOOI offsite full sync verifier"
  date
  printf 'PROVIDER=%s\n' "${PROVIDER}"
  printf 'REMOTE_ROOT=%s\n' "$(remote_root)"
  printf 'EXPECTED_REPO_COUNT=%s\n' "$(repo_count)"
  printf 'WRITE_TEXTFILE=%s\n\n' "${WRITE_TEXTFILE}"

  # Only the rclone provider can be verified; anything else is a failure.
  case "${PROVIDER}" in
    rclone)
      if rclone_ready; then
        printf "%sOK%s rclone remote configured: %s:\n" "${green}" "${reset}" "${RCLONE_REMOTE}"
      else
        printf "%sBLOCKED%s rclone remote unavailable: %s:\n" "${red}" "${reset}" "${RCLONE_REMOTE}"
        failed=1
      fi
      ;;
    *)
      printf "%sBLOCKED%s unsupported provider for remote snapshot verification: %s\n" "${red}" "${reset}" "${PROVIDER}"
      failed=1
      ;;
  esac

  # A stale or missing marker is printed as a warning but still fails the run.
  if ((full_fresh == 1)); then
    printf "%sOK%s full offsite marker fresh age=%ss\n" "${green}" "${reset}" "${full_age}"
  else
    printf "%sWARN%s full offsite marker missing or stale age=%ss\n" "${yellow}" "${reset}" "${full_age}"
    failed=1
  fi

  printf '\n%s\n' "== remote snapshot counts =="
  for repo in ${EXPECTED_REPOS}; do
    # "|| true": a failed listing is reported through its -1 output rather
    # than by aborting the script.
    count="$(count_remote_snapshots "${repo}" || true)"
    if [[ "${count}" == "1" ]]; then
      ok=1
      printf "%sOK%s %s remote snapshots=%s\n" "${green}" "${reset}" "${repo}" "${count}"
    else
      ok=0
      latest_only_ok=0
      failed=1
      printf "%sWARN%s %s remote snapshots=%s expected=1\n" "${yellow}" "${reset}" "${repo}" "${count}"
    fi
    rows+=("${repo}|${count}|${ok}")
  done

  # Record the success marker only when every check passed.
  if ((failed == 0 && latest_only_ok == 1 && full_fresh == 1)); then
    verify_ok=1
    install -d -m 750 "${OFFSITE_DIR}"
    printf 'timestamp=%s\nprovider=%s\nscope=full-verify\nrepos=%s\n' \
      "${now}" "${PROVIDER}" "${EXPECTED_REPOS}" >"${success_marker}"
  fi

  # Age of the last successful verification, which may be the one just written.
  success_ts="$(marker_timestamp "${success_marker}")"
  if ((success_ts > 0)); then
    success_age=$((now - success_ts))
    if ((success_age <= MAX_AGE_HOURS * 3600)); then
      success_fresh=1
    fi
  fi

  if [[ "${WRITE_TEXTFILE}" == "1" ]]; then
    write_textfile "${now}" "${full_ts}" "${full_age}" "${full_fresh}" "${verify_ok}" "${failed}" "${success_ts}" "${success_age}" "${success_fresh}" "${rows[@]}"
    printf '\nTEXTFILE_WRITTEN=%s\n' "${TEXTFILE_PATH}"
  fi

  printf '\nREMOTE_LATEST_ONLY_OK=%s\n' "${latest_only_ok}"
  printf 'FULL_MARKER_FRESH=%s\n' "${full_fresh}"
  printf 'VERIFY_OK=%s\n' "${verify_ok}"
  printf 'FAILED=%s\n' "${failed}"

  return "${failed}"
}

# Entry point: run the verifier with the script's current positional parameters.
main "$@"