#!/usr/bin/env bash
# Read-only operator checklist for completing AWOOOI DR offsite readiness.
#
# 2026-05-07 ogt + Codex:
# - Read-only roll-up of the Google Drive/rclone offsite and escrow status on
#   110 together with the Prometheus scorecard.
# - Never reads, prints, or writes any credential.
# - Never uploads data and never writes a success marker; every write action
#   is only printed so the operator can run it explicitly on the 110 TTY.

set -euo pipefail

# Repository root: two directories above the directory holding this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Endpoints and SSH behaviour; each may be overridden from the environment.
REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}"
PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}"
ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://192.168.0.110:9093}"
SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}"
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"

# Defaults for the command-line flags (changed by the argument parser).
MODE="check"
REQUIRE_DR=0
NO_COLOR=0

#######################################
# Print the help text (invocations, purpose, safety rules, environment).
# Arguments: none
# Outputs:   help text on stdout; callers redirect to stderr when needed
#######################################
usage() {
  cat <<'USAGE'
Usage:
  bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh [--check] [--no-color]
  bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh --commands-only
  bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh --require-dr

Purpose:
  Produce a read-only, secret-safe handoff for finishing Google Drive/rclone offsite backup and
  credential escrow after core reboot recovery is already green.

Rules:
  - This script never prints credential values.
  - This script never uploads backup data.
  - This script never writes provider credentials, escrow, partial-sync, or full-sync markers.
  - Operator must run the printed write commands directly on 110 TTY.
  - --require-dr is the final post-escrow gate: it also requires the repo scorecard,
    Prometheus recovery recording rule, and backup alert visibility contract to agree.

Environment:
  REMOTE_110, PROMETHEUS_URL, ALERTMANAGER_URL, SSH_BATCH_MODE,
  SSH_STRICT_HOST_KEY_CHECKING.
USAGE
}

#######################################
# Parse command-line flags.
# Globals:   MODE, REQUIRE_DR, NO_COLOR (written)
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for -h/--help; error plus help on stderr
#            for an unknown argument
# Returns:   exits 0 after -h/--help and 2 on an unknown argument
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --check)
        MODE="check"
        ;;
      --commands-only)
        MODE="commands-only"
        ;;
      --require-dr)
        # The final gate always runs the full check flow.
        MODE="check"
        REQUIRE_DR=1
        ;;
      --no-color)
        NO_COLOR=1
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
    shift
  done
}

parse_args "$@"

# ANSI colour sequences used by ok/warn/block. All four are empty when
# NO_COLOR is "1", so the status helpers print plain text.
if [ "${NO_COLOR}" = "1" ]; then
  green=""
  yellow=""
  red=""
  reset=""
else
  green=$'\033[32m'
  yellow=$'\033[33m'
  red=$'\033[31m'
  reset=$'\033[0m'
fi

#######################################
# Print an "OK" status line.
# Globals:   green, reset (read)
# Arguments: message words, joined with spaces
# Outputs:   "<green>OK<reset> <message>" on stdout
#######################################
ok() {
  printf '%sOK%s %s\n' "${green}" "${reset}" "$*"
}

#######################################
# Print a "WARN" status line.
# Globals:   yellow, reset (read)
# Arguments: message words, joined with spaces
# Outputs:   "<yellow>WARN<reset> <message>" on stdout
#######################################
warn() {
  printf '%sWARN%s %s\n' "${yellow}" "${reset}" "$*"
}

#######################################
# Print a "BLOCKED" status line.
# Globals:   red, reset (read)
# Arguments: message words, joined with spaces
# Outputs:   "<red>BLOCKED<reset> <message>" on stdout
#######################################
block() {
  printf '%sBLOCKED%s %s\n' "${red}" "${reset}" "$*"
}

#######################################
# Print a section heading preceded by a blank line.
# Arguments: heading words, joined with spaces
# Outputs:   an empty line, then "== <heading> ==" on stdout
#######################################
section() {
  printf '\n== %s ==\n' "$*"
}

#######################################
# Print the value of the first "KEY=value" line in a file.
# The line is split on its FIRST "=" only, so values that themselves
# contain "=" are returned whole instead of being cut at the second "=".
# A line without "=" is treated as a key with an empty value.
# Arguments: $1 - path of the file to scan
#            $2 - exact key to look up
# Outputs:   the value on stdout, or an empty line when the key is absent
#######################################
kv_from_file() {
  local path="$1"
  local key="$2"
  awk -v key="$key" '
    {
      eq = index($0, "=")
      if (eq == 0) {
        name = $0
        value = ""
      } else {
        name = substr($0, 1, eq - 1)
        value = substr($0, eq + 1)
      }
    }
    name == key { print value; found = 1; exit }
    END { if (!found) print "" }
  ' "$path"
}

#######################################
# Print the secret-handling rules the operator must follow.
# Arguments: none
# Outputs:   a section heading and three rule lines on stdout
#######################################
print_secret_rules() {
  section "安全邊界"
  cat <<'TEXT'
- 不要把 Google Drive OAuth token、rclone.conf、restic password、OAuth recovery code、private key 貼到聊天、repo、LOGBOOK、Telegram 或 Prometheus label。
- evidence-id 只能是密碼管理器項目 ID、工單 ID、sealed envelope ID 或 recovery checklist ID。
- 這份 checklist 只讀;看到命令後,仍需 operator 在 110 TTY 明確執行。
TEXT
}

#######################################
# Print the complete, ordered command sequence for the operator: the steps
# to run on the 110 TTY, then the final gate to run from the repo checkout.
# Nothing is executed here; the commands are only printed.
# Globals:   REMOTE_110, PROMETHEUS_URL (read; the built-in defaults are
#            used when unset so the printed commands match the host and
#            Prometheus endpoint this script itself checks)
# Arguments: none
# Outputs:   two sections of commands on stdout
#######################################
print_all_commands() {
  local remote="${REMOTE_110:-wooo@192.168.0.110}"
  local prometheus_url="${PROMETHEUS_URL:-http://192.168.0.110:9090}"

  section "完整 110 TTY 命令順序"
  printf '%s\n' \
    '# 0. 登入 110;以下命令都在 110 本機跑。' \
    "ssh ${remote}" \
    ''
  # Quoted delimiter: the text below is printed literally, with no expansion.
  cat <<'COMMANDS'
# 1. 先產出 redacted 狀態,不查 remote、不上傳、不寫 marker。
/backup/scripts/offsite-escrow-evidence-report.sh --no-color

# 2. 設定 Google Drive/rclone。OAuth token 只留在 110 host-local rclone.conf。
/backup/scripts/configure-offsite-rclone.sh --interactive
/backup/scripts/configure-offsite-rclone.sh --status

# 3. Google Drive/rclone 設定後 gate;不可有 BLOCKED。
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --no-color

# 4. 小範圍 dry-run;不會上傳、不寫 marker。
/backup/scripts/backup-offsite-readiness-gate.sh --dry-run-small --no-color
/backup/scripts/sync-offsite-backups.sh --mode dry-run --repos "ai-artifacts public-routes"

# 5. dry-run 成功後才做小範圍 partial sync;這一步會上傳小 repo 並寫 partial marker。
/backup/scripts/sync-offsite-backups.sh --mode sync --repos "ai-artifacts public-routes"
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --no-color

# 6. 人工確認 credential escrow。先列出缺失項目,再把 EVIDENCE_ID_FOR_* 換成不含 secret 的證據 ID。
/backup/scripts/mark-credential-escrow-verified.sh --status
/backup/scripts/mark-credential-escrow-verified.sh --missing-commands

# 7. 全量 offsite sync 前只讀檢查;全綠後才安排低峰 full sync。
/backup/scripts/backup-offsite-readiness-gate.sh --pre-full-sync --require-configured --require-escrow --no-color

# 8. 低峰窗口 full sync;先放明確啟用 marker,這一步會上傳全 13 repo,成功才寫 full marker。
install -d -m 750 /backup/offsite
touch /backup/offsite/enable-rclone-sync
/backup/scripts/sync-offsite-backups.sh --mode sync

# 9. 完成後證據檢查。
/backup/scripts/offsite-escrow-evidence-report.sh --include-remote-status --no-color
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --require-escrow --no-color
grep -E 'awoooi_backup_offsite_|awoooi_backup_credential_escrow_' /home/wooo/node_exporter_textfiles/backup_health.prom
COMMANDS

  section "repo 工作站最終 gate"
  cat <<'COMMANDS'
# 在 /Users/ogt/awoooi repo 工作站跑;DR 完成前 --require-dr 必須失敗。
bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-dr
COMMANDS
  printf 'python3 scripts/ops/recovery-scorecard-contract-check.py --prometheus-url %s --expect-core-ready --expect-dr-ready\n' \
    "${prometheus_url}"
  printf '%s\n' \
    'bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --timeout-seconds 900 --interval-seconds 30 --no-color'
}

#######################################
# Print only the commands that belong to the current NEXT_STEP value.
# Nothing is executed here; the commands are only printed.
# Globals:   REMOTE_110, PROMETHEUS_URL (read; the built-in defaults are
#            used when unset so the printed commands match the host and
#            Prometheus endpoint this script itself checks)
# Arguments: $1 - NEXT_STEP value reported by the scorecard/evidence report
# Outputs:   a section heading and the matching commands on stdout; a WARN
#            line when the value is not one of the known steps
#######################################
print_next_step_commands() {
  local next_step="$1"
  local remote="${REMOTE_110:-wooo@192.168.0.110}"
  local prometheus_url="${PROMETHEUS_URL:-http://192.168.0.110:9090}"

  section "依目前 NEXT_STEP 的下一段命令"
  case "${next_step}" in
    configure_google_drive_rclone_on_110_tty|configure_b2_on_110_tty)
      printf 'ssh %s\n' "${remote}"
      cat <<'COMMANDS'
/backup/scripts/offsite-escrow-evidence-report.sh --no-color
/backup/scripts/configure-offsite-rclone.sh --interactive
/backup/scripts/configure-offsite-rclone.sh --status
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --no-color
COMMANDS
      ;;
    run_small_dry_run_then_partial_sync)
      printf 'ssh %s\n' "${remote}"
      cat <<'COMMANDS'
/backup/scripts/backup-offsite-readiness-gate.sh --dry-run-small --no-color
/backup/scripts/sync-offsite-backups.sh --mode dry-run --repos "ai-artifacts public-routes"
# 上面兩條都成功後才執行:
/backup/scripts/sync-offsite-backups.sh --mode sync --repos "ai-artifacts public-routes"
/backup/scripts/offsite-escrow-evidence-report.sh --no-color
COMMANDS
      ;;
    complete_credential_escrow_review)
      printf 'ssh %s\n' "${remote}"
      cat <<'COMMANDS'
/backup/scripts/mark-credential-escrow-verified.sh --status
# 將輸出的 EVIDENCE_ID_FOR_* 換成不含 secret 的密碼管理器項目 ID、工單 ID、sealed envelope ID 或 recovery checklist ID。
/backup/scripts/mark-credential-escrow-verified.sh --missing-commands
/backup/scripts/offsite-escrow-evidence-report.sh --no-color

# 5 個 marker 寫完後,回到 repo 工作站等待 Prometheus / Alertmanager 收斂:
bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --timeout-seconds 900 --interval-seconds 30 --no-color
COMMANDS
      ;;
    pre_full_sync_review)
      printf 'ssh %s\n' "${remote}"
      cat <<'COMMANDS'
/backup/scripts/backup-offsite-readiness-gate.sh --pre-full-sync --require-configured --require-escrow --no-color
# 上面全綠,且已確認低峰窗口後才執行:
install -d -m 750 /backup/offsite
touch /backup/offsite/enable-rclone-sync
/backup/scripts/sync-offsite-backups.sh --mode sync
/backup/scripts/offsite-escrow-evidence-report.sh --include-remote-status --no-color
COMMANDS
      ;;
    offsite_and_escrow_ready)
      cat <<'COMMANDS'
# 110 側維持每日 evidence report、每週 integrity check、每月 restore drill。
/backup/scripts/offsite-escrow-evidence-report.sh --include-remote-status --no-color

# repo 側確認 DR recording rule 變成 1。
bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-dr
COMMANDS
      printf 'python3 scripts/ops/recovery-scorecard-contract-check.py --prometheus-url %s --expect-core-ready --expect-dr-ready\n' \
        "${prometheus_url}"
      printf '%s\n' \
        'bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --once --no-color'
      ;;
    *)
      warn "NEXT_STEP unknown=${next_step:-empty}; 請照完整 110 TTY 命令順序逐段執行。"
      ;;
  esac
}

# --commands-only: print the banner, the secret-handling rules and the full
# command sequence, then exit without running the checks that follow.
if [ "${MODE}" = "commands-only" ]; then
  echo "AWOOOI DR offsite operator checklist"
  date '+%Y-%m-%d %H:%M:%S %Z'
  print_secret_rules
  print_all_commands
  exit 0
fi

# Scratch directory holding the captured output of each check. The EXIT trap
# removes it on every exit path; ":?" makes the rm refuse an empty path.
tmpdir="$(mktemp -d)"
trap 'rm -rf -- "${tmpdir:?}"' EXIT

# One log file per check so each output can be printed after its verdict.
scorecard_log="${tmpdir}/scorecard.log"
require_dr_scorecard_log="${tmpdir}/scorecard-require-dr.log"
contract_log="${tmpdir}/recovery-scorecard-contract.log"
dr_contract_log="${tmpdir}/recovery-scorecard-contract-dr.log"
backup_visibility_log="${tmpdir}/backup-alert-live-visibility.log"
evidence_log="${tmpdir}/offsite-evidence-report.log"

# Report header: title, timestamp and the endpoints this run will query.
echo "AWOOOI DR offsite operator checklist"
date '+%Y-%m-%d %H:%M:%S %Z'
echo "REMOTE_110=${REMOTE_110}"
echo "PROMETHEUS_URL=${PROMETHEUS_URL}"
echo "ALERTMANAGER_URL=${ALERTMANAGER_URL}"

print_secret_rules

section "repo scorecard"
# A non-zero exit is only a warning here: the output collected so far is
# still printed and parsed below.
if bash "${ROOT_DIR}/scripts/reboot-recovery/full-stack-recovery-scorecard.sh" >"${scorecard_log}" 2>&1; then
  ok "full-stack-recovery-scorecard.sh completed"
else
  warn "full-stack-recovery-scorecard.sh returned non-zero; continuing with collected output"
fi
cat "${scorecard_log}"

# Pull the KEY=value verdict lines out of the scorecard output.
recovery_state="$(kv_from_file "${scorecard_log}" RECOVERY_STATE)"
next_step="$(kv_from_file "${scorecard_log}" NEXT_STEP)"

section "Prometheus recovery recording rule"
# Only core readiness is expected at this point; DR readiness is checked
# separately when --require-dr is given.
if python3 "${ROOT_DIR}/scripts/ops/recovery-scorecard-contract-check.py" \
  --prometheus-url "${PROMETHEUS_URL}" \
  --expect-core-ready \
  >"${contract_log}" 2>&1; then
  ok "recovery scorecard live contract passed"
else
  block "recovery scorecard live contract failed"
fi
cat "${contract_log}"

section "110 redacted evidence report"
# ssh options are kept in an array so each option stays a separate word.
ssh_opts=(
  -o BatchMode="${SSH_BATCH_MODE}"
  -o ConnectTimeout=6
  -o StrictHostKeyChecking="${SSH_STRICT_HOST_KEY_CHECKING}"
)
if ssh "${ssh_opts[@]}" "${REMOTE_110}" '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' >"${evidence_log}" 2>&1; then
  ok "110 offsite evidence report collected"
else
  warn "110 offsite evidence report unavailable; SSH 或 /backup/scripts 需先恢復"
fi
cat "${evidence_log}"

# A NEXT_STEP reported by the evidence report replaces the scorecard's value;
# when the report has none, the scorecard's value is kept.
evidence_next_step="$(kv_from_file "${evidence_log}" NEXT_STEP)"
if [ -n "${evidence_next_step}" ]; then
  next_step="${evidence_next_step}"
fi

section "目前判定"
echo "RECOVERY_STATE=${recovery_state:-unknown}"
echo "NEXT_STEP=${next_step:-unknown}"

# Either signal is enough to report the DR offsite work as complete here.
if [ "${recovery_state:-}" = "CORE_READY_DR_OFFSITE_READY" ] || [ "${next_step:-}" = "offsite_and_escrow_ready" ]; then
  ok "核心恢復與 DR offsite gate 看起來都已完成"
else
  warn "核心恢復可用,但 DR offsite 仍需 operator 完成人工段落"
fi

# Print the commands for the current step first, then the full sequence.
print_next_step_commands "${next_step:-unknown}"
print_all_commands

# --require-dr: final gate. All three checks below always run, so the
# operator sees every failure; the script exits 1 if any of them, or the
# state check, failed and exits 0 otherwise.
if [ "${REQUIRE_DR}" = "1" ]; then
  section "require-dr final contract"
  require_dr_failed=0

  # 1. Repo scorecard in --require-dr mode.
  if bash "${ROOT_DIR}/scripts/reboot-recovery/full-stack-recovery-scorecard.sh" \
    --require-dr \
    >"${require_dr_scorecard_log}" 2>&1; then
    ok "full-stack-recovery-scorecard.sh --require-dr passed"
  else
    require_dr_failed=1
    block "full-stack-recovery-scorecard.sh --require-dr failed"
  fi
  cat "${require_dr_scorecard_log}"

  # 2. Prometheus contract check, this time also expecting DR readiness.
  if python3 "${ROOT_DIR}/scripts/ops/recovery-scorecard-contract-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --expect-core-ready \
    --expect-dr-ready \
    >"${dr_contract_log}" 2>&1; then
    ok "Prometheus recovery recording rule confirms DR ready"
  else
    require_dr_failed=1
    block "Prometheus recovery recording rule does not confirm DR ready"
  fi
  cat "${dr_contract_log}"

  # 3. Backup alert visibility check against Prometheus and Alertmanager.
  if python3 "${ROOT_DIR}/scripts/ops/backup-alert-live-visibility-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --alertmanager-url "${ALERTMANAGER_URL}" \
    >"${backup_visibility_log}" 2>&1; then
    ok "backup alert visibility contract passed"
  else
    require_dr_failed=1
    block "backup alert visibility contract failed"
  fi
  cat "${backup_visibility_log}"

  # 4. State check: fails only when neither collected signal reports ready.
  if [ "${recovery_state:-}" != "CORE_READY_DR_OFFSITE_READY" ] && [ "${next_step:-}" != "offsite_and_escrow_ready" ]; then
    require_dr_failed=1
    block "require-dr state check failed: ${recovery_state:-unknown}; NEXT_STEP=${next_step:-unknown}"
  fi

  if [ "${require_dr_failed}" -eq 0 ]; then
    ok "DR offsite final gate passed"
    exit 0
  fi
  block "require-dr failed: ${recovery_state:-unknown}; NEXT_STEP=${next_step:-unknown}"
  exit 1
fi