Files
awoooi/scripts/reboot-recovery/dr-offsite-operator-checklist.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

359 lines
13 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Read-only operator checklist for completing AWOOOI DR offsite readiness.
#
# 2026-05-07 ogt + Codex:
# - Read-only roll-up of the Google Drive/rclone offsite/escrow status on 110
#   and of the Prometheus scorecard.
# - Never reads, prints, or writes any credential.
# - Never uploads data and never writes a success marker; every write action is
#   only printed so the operator can run it explicitly on the 110 TTY.
set -euo pipefail

# Repository root: two directories above the directory that holds this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Environment-overridable settings. `:=` assigns the default only when the
# variable is unset or empty, so values exported by the caller win.
: "${REMOTE_110:=wooo@192.168.0.110}"
: "${PROMETHEUS_URL:=http://192.168.0.110:9090}"
: "${ALERTMANAGER_URL:=http://192.168.0.110:9093}"
: "${SSH_BATCH_MODE:=yes}"
: "${SSH_STRICT_HOST_KEY_CHECKING:=accept-new}"

# Run-mode defaults; the command-line flags parsed below may change them.
MODE="check"
REQUIRE_DR=0
NO_COLOR=0
# Print the command-line help text to stdout.
# Callers redirect it to stderr themselves when reporting a bad argument.
usage() {
  cat <<'USAGE_TEXT'
Usage:
bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh [--check] [--no-color]
bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh --commands-only
bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh --require-dr
Purpose:
Produce a read-only, secret-safe handoff for finishing Google Drive/rclone offsite backup and
credential escrow after core reboot recovery is already green.
Rules:
- This script never prints credential values.
- This script never uploads backup data.
- This script never writes provider credentials, escrow, partial-sync, or full-sync markers.
- Operator must run the printed write commands directly on 110 TTY.
- --require-dr is the final post-escrow gate: it also requires the repo scorecard,
Prometheus recovery recording rule, and backup alert visibility contract to agree.
Environment:
REMOTE_110, PROMETHEUS_URL, ALERTMANAGER_URL, SSH_BATCH_MODE,
SSH_STRICT_HOST_KEY_CHECKING.
USAGE_TEXT
}
# Parse command-line flags left to right. A later mode flag overrides an
# earlier one; --require-dr forces check mode and arms the final gate.
while (( $# > 0 )); do
  case "$1" in
    --check) MODE="check" ;;
    --commands-only) MODE="commands-only" ;;
    --require-dr)
      MODE="check"
      REQUIRE_DR=1
      ;;
    --no-color) NO_COLOR=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      # Unknown flag: report on stderr and exit with status 2.
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done
# ANSI colour prefixes used by ok/warn/block. They stay empty when --no-color
# was given, so the status helpers emit plain text.
green=""
yellow=""
red=""
reset=""
if [[ "${NO_COLOR}" != "1" ]]; then
  green=$'\033[32m'
  yellow=$'\033[33m'
  red=$'\033[31m'
  reset=$'\033[0m'
fi
# Print an "OK" status line to stdout; all arguments are joined into the message.
# Uses the global colour prefixes ${green} and ${reset}.
ok() {
  local message="$*"
  printf '%sOK%s %s\n' "${green}" "${reset}" "${message}"
}
# Print a "WARN" status line to stdout; all arguments are joined into the message.
# Uses the global colour prefixes ${yellow} and ${reset}.
warn() {
  local message="$*"
  printf '%sWARN%s %s\n' "${yellow}" "${reset}" "${message}"
}
# Print a "BLOCKED" status line to stdout; all arguments are joined into the message.
# Uses the global colour prefixes ${red} and ${reset}.
block() {
  local message="$*"
  printf '%sBLOCKED%s %s\n' "${red}" "${reset}" "${message}"
}
# Print a section heading: one blank line followed by "== <title> ==".
section() {
  printf '\n== %s ==\n' "$*"
}
# Print the value of the first "KEY=value" line in a file.
#
# Arguments:
#   $1 - path of the file to scan
#   $2 - exact key to look up (compared as a string)
# Outputs:
#   The text after the FIRST "=" on the first matching line, so values that
#   themselves contain "=" (for example URLs with a query string) are returned
#   whole. Prints an empty line when the key is absent, or when the matching
#   line has no "=" at all.
# Returns:
#   awk's exit status (non-zero if the file cannot be read).
kv_from_file() {
  local path="$1"
  local key="$2"
  awk -v key="$key" '
    {
      # Split on the first "=" only; a field separator of "=" would cut the
      # value at every later "=" as well.
      sep = index($0, "=")
      name = (sep > 0) ? substr($0, 1, sep - 1) : $0
      # Appending "" forces a string comparison even for numeric-looking keys.
      if ((name "") == (key "")) {
        value = (sep > 0) ? substr($0, sep + 1) : ""
        print value
        found = 1
        exit
      }
    }
    END { if (!found) print "" }
  ' "$path"
}
# Print the "security boundary" section: three fixed reminder lines about
# keeping secrets out of chat/repo/logs and about the read-only nature of
# this checklist. Relies on the section() helper for the heading.
print_secret_rules() {
  section "安全邊界"
  printf '%s\n' \
    '- 不要把 Google Drive OAuth token、rclone.conf、restic password、OAuth recovery code、private key 貼到聊天、repo、LOGBOOK、Telegram 或 Prometheus label。' \
    '- evidence-id 只能是密碼管理器項目 ID、工單 ID、sealed envelope ID 或 recovery checklist ID。' \
    '- 這份 checklist 只讀；看到命令後，仍需 operator 在 110 TTY 明確執行。'
}
# Print the complete operator runbook: the ordered commands to run on the 110
# TTY, followed by the final gate commands to run from the repo workstation.
#
# Globals (read):
#   REMOTE_110     - SSH target shown in the "ssh" line
#   PROMETHEUS_URL - URL shown in the contract-check command
#   Both fall back to the script's built-in defaults when unset or empty, so
#   the printed commands always match the host the script itself inspects.
# Outputs:
#   The runbook on stdout. This function only prints; it runs none of the
#   listed commands.
print_all_commands() {
  local remote="${REMOTE_110:-wooo@192.168.0.110}"
  local prometheus_url="${PROMETHEUS_URL:-http://192.168.0.110:9090}"

  section "完整 110 TTY 命令順序"
  # Unquoted heredoc so ${remote} expands. The rest of the text must stay free
  # of "$", backticks and backslashes, which would otherwise be interpreted.
  cat <<COMMANDS
# 0. 登入 110以下命令都在 110 本機跑。
ssh ${remote}
# 1. 先產出 redacted 狀態，不查 remote、不上傳、不寫 marker。
/backup/scripts/offsite-escrow-evidence-report.sh --no-color
# 2. 設定 Google Drive/rclone。OAuth token 只留在 110 host-local rclone.conf。
/backup/scripts/configure-offsite-rclone.sh --interactive
/backup/scripts/configure-offsite-rclone.sh --status
# 3. Google Drive/rclone 設定後 gate不可有 BLOCKED。
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --no-color
# 4. 小範圍 dry-run不會上傳、不寫 marker。
/backup/scripts/backup-offsite-readiness-gate.sh --dry-run-small --no-color
/backup/scripts/sync-offsite-backups.sh --mode dry-run --repos "ai-artifacts public-routes"
# 5. dry-run 成功後才做小範圍 partial sync這一步會上傳小 repo 並寫 partial marker。
/backup/scripts/sync-offsite-backups.sh --mode sync --repos "ai-artifacts public-routes"
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --no-color
# 6. 人工確認 credential escrow。先列出缺失項目再把 EVIDENCE_ID_FOR_* 換成不含 secret 的證據 ID。
/backup/scripts/mark-credential-escrow-verified.sh --status
/backup/scripts/mark-credential-escrow-verified.sh --missing-commands
# 7. 全量 offsite sync 前只讀檢查；全綠後才安排低峰 full sync。
/backup/scripts/backup-offsite-readiness-gate.sh --pre-full-sync --require-configured --require-escrow --no-color
# 8. 低峰窗口 full sync先放明確啟用 marker這一步會上傳全 13 repo成功才寫 full marker。
install -d -m 750 /backup/offsite
touch /backup/offsite/enable-rclone-sync
/backup/scripts/sync-offsite-backups.sh --mode sync
# 9. 完成後證據檢查。
/backup/scripts/offsite-escrow-evidence-report.sh --include-remote-status --no-color
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --require-escrow --no-color
grep -E 'awoooi_backup_offsite_|awoooi_backup_credential_escrow_' /home/wooo/node_exporter_textfiles/backup_health.prom
COMMANDS

  section "repo 工作站最終 gate"
  # Unquoted heredoc so ${prometheus_url} expands; same restriction as above.
  cat <<COMMANDS
# 在 /Users/ogt/awoooi repo 工作站跑DR 完成前 --require-dr 必須失敗。
bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-dr
python3 scripts/ops/recovery-scorecard-contract-check.py --prometheus-url ${prometheus_url} --expect-core-ready --expect-dr-ready
bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --timeout-seconds 900 --interval-seconds 30 --no-color
COMMANDS
}
# Print only the command group that matches the current NEXT_STEP value.
#
# Arguments:
#   $1 - NEXT_STEP token; an unrecognised or empty token produces a warning
#        that points the operator at the full command sequence instead.
# Globals (read):
#   REMOTE_110     - SSH target shown in the "ssh" lines
#   PROMETHEUS_URL - URL shown in the contract-check command
#   Both fall back to the script's built-in defaults when unset or empty, so
#   the printed commands always match the host the script itself inspects.
# Outputs:
#   The selected commands on stdout. This function only prints; it runs none
#   of the listed commands.
print_next_step_commands() {
  local next_step="$1"
  local remote="${REMOTE_110:-wooo@192.168.0.110}"
  local prometheus_url="${PROMETHEUS_URL:-http://192.168.0.110:9090}"

  section "依目前 NEXT_STEP 的下一段命令"
  # The heredocs below are unquoted so ${remote} / ${prometheus_url} expand.
  # Their text must stay free of "$", backticks and backslashes.
  case "${next_step}" in
    configure_google_drive_rclone_on_110_tty | configure_b2_on_110_tty)
      cat <<COMMANDS
ssh ${remote}
/backup/scripts/offsite-escrow-evidence-report.sh --no-color
/backup/scripts/configure-offsite-rclone.sh --interactive
/backup/scripts/configure-offsite-rclone.sh --status
/backup/scripts/backup-offsite-readiness-gate.sh --status --require-configured --no-color
COMMANDS
      ;;
    run_small_dry_run_then_partial_sync)
      cat <<COMMANDS
ssh ${remote}
/backup/scripts/backup-offsite-readiness-gate.sh --dry-run-small --no-color
/backup/scripts/sync-offsite-backups.sh --mode dry-run --repos "ai-artifacts public-routes"
# 上面兩條都成功後才執行：
/backup/scripts/sync-offsite-backups.sh --mode sync --repos "ai-artifacts public-routes"
/backup/scripts/offsite-escrow-evidence-report.sh --no-color
COMMANDS
      ;;
    complete_credential_escrow_review)
      cat <<COMMANDS
ssh ${remote}
/backup/scripts/mark-credential-escrow-verified.sh --status
# 將輸出的 EVIDENCE_ID_FOR_* 換成不含 secret 的密碼管理器項目 ID、工單 ID、sealed envelope ID 或 recovery checklist ID。
/backup/scripts/mark-credential-escrow-verified.sh --missing-commands
/backup/scripts/offsite-escrow-evidence-report.sh --no-color
# 5 個 marker 寫完後，回到 repo 工作站等待 Prometheus / Alertmanager 收斂：
bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --timeout-seconds 900 --interval-seconds 30 --no-color
COMMANDS
      ;;
    pre_full_sync_review)
      cat <<COMMANDS
ssh ${remote}
/backup/scripts/backup-offsite-readiness-gate.sh --pre-full-sync --require-configured --require-escrow --no-color
# 上面全綠，且已確認低峰窗口後才執行：
install -d -m 750 /backup/offsite
touch /backup/offsite/enable-rclone-sync
/backup/scripts/sync-offsite-backups.sh --mode sync
/backup/scripts/offsite-escrow-evidence-report.sh --include-remote-status --no-color
COMMANDS
      ;;
    offsite_and_escrow_ready)
      cat <<COMMANDS
# 110 側維持每日 evidence report、每週 integrity check、每月 restore drill。
/backup/scripts/offsite-escrow-evidence-report.sh --include-remote-status --no-color
# repo 側確認 DR recording rule 變成 1。
bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-dr
python3 scripts/ops/recovery-scorecard-contract-check.py --prometheus-url ${prometheus_url} --expect-core-ready --expect-dr-ready
bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --once --no-color
COMMANDS
      ;;
    *)
      warn "NEXT_STEP unknown=${next_step:-empty}; 請照完整 110 TTY 命令順序逐段執行。"
      ;;
  esac
}
# --commands-only: print the banner, the secret-handling rules and the full
# runbook, then exit 0 before any of the checks below run.
if [[ "${MODE}" == "commands-only" ]]; then
  printf '%s\n' "AWOOOI DR offsite operator checklist"
  date '+%Y-%m-%d %H:%M:%S %Z'
  print_secret_rules
  print_all_commands
  exit 0
fi
# ---------------------------------------------------------------------------
# Check mode: collect the repo scorecard, the Prometheus contract check and the
# evidence report from 110, then print the verdict and the matching commands.
# ---------------------------------------------------------------------------

# Scratch directory for captured logs; removed again when the script exits.
tmpdir="$(mktemp -d)"
cleanup_tmpdir() {
  rm -rf "${tmpdir}"
}
trap cleanup_tmpdir EXIT

# One log file per collected check. The last three are written by the
# --require-dr gate further down.
scorecard_log="${tmpdir}/scorecard.log"
contract_log="${tmpdir}/recovery-scorecard-contract.log"
evidence_log="${tmpdir}/offsite-evidence-report.log"
require_dr_scorecard_log="${tmpdir}/scorecard-require-dr.log"
dr_contract_log="${tmpdir}/recovery-scorecard-contract-dr.log"
backup_visibility_log="${tmpdir}/backup-alert-live-visibility.log"

# Banner plus the effective endpoints, so the report is self-describing.
printf '%s\n' "AWOOOI DR offsite operator checklist"
date '+%Y-%m-%d %H:%M:%S %Z'
printf 'REMOTE_110=%s\n' "${REMOTE_110}"
printf 'PROMETHEUS_URL=%s\n' "${PROMETHEUS_URL}"
printf 'ALERTMANAGER_URL=%s\n' "${ALERTMANAGER_URL}"
print_secret_rules

# 1) Repo scorecard. A non-zero exit is only a warning here because the
#    captured output is still parsed for RECOVERY_STATE / NEXT_STEP.
section "repo scorecard"
scorecard_cmd=(bash "${ROOT_DIR}/scripts/reboot-recovery/full-stack-recovery-scorecard.sh")
if "${scorecard_cmd[@]}" >"${scorecard_log}" 2>&1; then
  ok "full-stack-recovery-scorecard.sh completed"
else
  warn "full-stack-recovery-scorecard.sh returned non-zero; continuing with collected output"
fi
cat "${scorecard_log}"
recovery_state="$(kv_from_file "${scorecard_log}" RECOVERY_STATE)"
next_step="$(kv_from_file "${scorecard_log}" NEXT_STEP)"

# 2) Contract check against Prometheus, expecting core readiness only.
section "Prometheus recovery recording rule"
contract_cmd=(
  python3 "${ROOT_DIR}/scripts/ops/recovery-scorecard-contract-check.py"
  --prometheus-url "${PROMETHEUS_URL}"
  --expect-core-ready
)
if "${contract_cmd[@]}" >"${contract_log}" 2>&1; then
  ok "recovery scorecard live contract passed"
else
  block "recovery scorecard live contract failed"
fi
cat "${contract_log}"

# 3) Evidence report fetched from 110 over SSH. Failure is a warning; whatever
#    was captured (including any SSH error text) is still shown.
section "110 redacted evidence report"
ssh_opts=(
  -o BatchMode="${SSH_BATCH_MODE}"
  -o ConnectTimeout=6
  -o StrictHostKeyChecking="${SSH_STRICT_HOST_KEY_CHECKING}"
)
if ssh "${ssh_opts[@]}" "${REMOTE_110}" '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' >"${evidence_log}" 2>&1; then
  ok "110 offsite evidence report collected"
else
  warn "110 offsite evidence report unavailable; SSH 或 /backup/scripts 需先恢復"
fi
cat "${evidence_log}"

# A NEXT_STEP reported by 110 takes precedence over the scorecard's value.
evidence_next_step="$(kv_from_file "${evidence_log}" NEXT_STEP)"
next_step="${evidence_next_step:-${next_step}}"

# Verdict: either signal on its own is enough to report DR as complete.
section "目前判定"
printf 'RECOVERY_STATE=%s\n' "${recovery_state:-unknown}"
printf 'NEXT_STEP=%s\n' "${next_step:-unknown}"
if [[ "${recovery_state:-}" == "CORE_READY_DR_OFFSITE_READY" || "${next_step:-}" == "offsite_and_escrow_ready" ]]; then
  ok "核心恢復與 DR offsite gate 看起來都已完成"
else
  warn "核心恢復可用，但 DR offsite 仍需 operator 完成人工段落"
fi

print_next_step_commands "${next_step:-unknown}"
print_all_commands
# --require-dr: final gate. Runs three extra checks and re-validates the state
# collected above; exits 0 only when every one of them passes, otherwise 1.
if [[ "${REQUIRE_DR}" == "1" ]]; then
  section "require-dr final contract"
  require_dr_failed=0

  # Check 1: repo scorecard in --require-dr mode.
  if ! bash "${ROOT_DIR}/scripts/reboot-recovery/full-stack-recovery-scorecard.sh" \
    --require-dr \
    >"${require_dr_scorecard_log}" 2>&1; then
    require_dr_failed=1
    block "full-stack-recovery-scorecard.sh --require-dr failed"
  else
    ok "full-stack-recovery-scorecard.sh --require-dr passed"
  fi
  cat "${require_dr_scorecard_log}"

  # Check 2: contract check against Prometheus, now also expecting DR readiness.
  if ! python3 "${ROOT_DIR}/scripts/ops/recovery-scorecard-contract-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --expect-core-ready \
    --expect-dr-ready \
    >"${dr_contract_log}" 2>&1; then
    require_dr_failed=1
    block "Prometheus recovery recording rule does not confirm DR ready"
  else
    ok "Prometheus recovery recording rule confirms DR ready"
  fi
  cat "${dr_contract_log}"

  # Check 3: backup alert visibility check against Prometheus and Alertmanager.
  if ! python3 "${ROOT_DIR}/scripts/ops/backup-alert-live-visibility-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --alertmanager-url "${ALERTMANAGER_URL}" \
    >"${backup_visibility_log}" 2>&1; then
    require_dr_failed=1
    block "backup alert visibility contract failed"
  else
    ok "backup alert visibility contract passed"
  fi
  cat "${backup_visibility_log}"

  # Check 4: the state gathered earlier must report DR ready through at least
  # one of the two signals (RECOVERY_STATE or NEXT_STEP).
  if ! [[ "${recovery_state:-}" == "CORE_READY_DR_OFFSITE_READY" || "${next_step:-}" == "offsite_and_escrow_ready" ]]; then
    require_dr_failed=1
    block "require-dr state check failed: ${recovery_state:-unknown}; NEXT_STEP=${next_step:-unknown}"
  fi

  if (( require_dr_failed != 0 )); then
    block "require-dr failed: ${recovery_state:-unknown}; NEXT_STEP=${next_step:-unknown}"
    exit 1
  fi
  ok "DR offsite final gate passed"
  exit 0
fi