#!/usr/bin/env bash
# Wait for the post-escrow DR offsite gate to converge.
#
# 2026-05-20 ogt + Codex:
# - Read-only: after the human-written credential escrow markers are in place,
#   waits until the repo scorecard, the Prometheus recording rule, Alertmanager
#   visibility and the final checklist all agree.
# - Never reads, writes or prints any secret; never creates markers; never
#   triggers an offsite sync.

set -euo pipefail

# Repository root: two directories above the directory that holds this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Endpoints; each may be overridden from the environment.
PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}"
ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://192.168.0.110:9093}"

# Defaults for --timeout-seconds, --interval-seconds, --no-color and --once.
TIMEOUT_SECONDS=900
INTERVAL_SECONDS=30
NO_COLOR=0
ONCE=0

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   writes the help text to stdout (callers redirect it as needed)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'USAGE'
Usage:
  bash scripts/reboot-recovery/wait-dr-offsite-ready.sh [--timeout-seconds N] [--interval-seconds N] [--no-color]
  bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --once [--no-color]

Purpose:
  After the human operator writes the five credential escrow markers on 110,
  wait until all read-only DR gates converge:
    1. full-stack-recovery-scorecard.sh --require-dr
    2. Prometheus recovery recording rule with --expect-dr-ready
    3. backup-alert-live-visibility-check.py
    4. dr-offsite-operator-checklist.sh --require-dr

Rules:
  - This script never writes escrow markers.
  - This script never uploads or deletes backup data.
  - This script never prints credential values.
  - It only waits for scrape/rule/Alertmanager convergence after a real human escrow review.

Environment:
  PROMETHEUS_URL, ALERTMANAGER_URL, REMOTE_110, SSH_BATCH_MODE,
  SSH_STRICT_HOST_KEY_CHECKING.
USAGE
}

#######################################
# Parse command-line options into the script-level settings.
# Globals:   TIMEOUT_SECONDS, INTERVAL_SECONDS, ONCE, NO_COLOR (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for -h/--help; diagnostics on stderr
# Exits:     0 after printing help; 2 on an unknown option or when an option
#            that takes a value is the last argument
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --timeout-seconds)
        # Without this guard `shift 2` fails when the value is missing and
        # `set -e` terminates the script with no explanation.
        if [ "$#" -lt 2 ]; then
          echo "Missing value for $1" >&2
          exit 2
        fi
        TIMEOUT_SECONDS="$2"
        shift 2
        ;;
      --interval-seconds)
        if [ "$#" -lt 2 ]; then
          echo "Missing value for $1" >&2
          exit 2
        fi
        INTERVAL_SECONDS="$2"
        shift 2
        ;;
      --once)
        ONCE=1
        shift
        ;;
      --no-color)
        NO_COLOR=1
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# Both numeric options must be positive integers. The pattern requires digits
# only, with at least one non-zero digit, so positivity is decided without an
# arithmetic comparison (which errors out on values beyond the integer range).
if ! [[ "${TIMEOUT_SECONDS}" =~ ^0*[1-9][0-9]*$ ]]; then
  echo "--timeout-seconds 必須是正整數" >&2
  exit 2
fi

if ! [[ "${INTERVAL_SECONDS}" =~ ^0*[1-9][0-9]*$ ]]; then
  echo "--interval-seconds 必須是正整數" >&2
  exit 2
fi

# Colour prefixes/suffix used by the status helpers. When NO_COLOR is "1"
# they are all empty, so the helpers print plain text.
if [ "${NO_COLOR}" = "1" ]; then
  green=""
  yellow=""
  red=""
  reset=""
else
  green="$(printf '\033[32m')"
  yellow="$(printf '\033[33m')"
  red="$(printf '\033[31m')"
  reset="$(printf '\033[0m')"
fi

#######################################
# Print a success line: "OK <message>", with OK wrapped in the green codes.
# Globals:   green, reset (read)
# Arguments: message words; joined into one string
# Outputs:   one line on stdout
#######################################
ok() {
  printf "%sOK%s %s\n" "${green}" "${reset}" "$*"
}

#######################################
# Print a progress line: "PENDING <message>", with PENDING wrapped in the
# yellow codes.
# Globals:   yellow, reset (read)
# Arguments: message words; joined into one string
# Outputs:   one line on stdout
#######################################
pending() {
  printf "%sPENDING%s %s\n" "${yellow}" "${reset}" "$*"
}

#######################################
# Print a failure line: "BLOCKED <message>", with BLOCKED wrapped in the red
# codes.
# Globals:   red, reset (read)
# Arguments: message words; joined into one string
# Outputs:   one line on stdout
#######################################
block() {
  printf "%sBLOCKED%s %s\n" "${red}" "${reset}" "$*"
}

#######################################
# Look up the first KEY=VALUE line for a key in a file.
# Arguments: $1 - path of the file to scan
#            $2 - key to match exactly (the text before the first '=')
# Outputs:   the value on stdout; an empty line when the key is absent
#######################################
kv_from_file() {
  local path="$1"
  local key="$2"
  # Split on the FIRST '=' only, so values that themselves contain '='
  # (for example a command line with --flag=value) are returned whole.
  awk -v key="$key" '
    {
      sep = index($0, "=")
      if (sep > 0 && substr($0, 1, sep - 1) == key) {
        print substr($0, sep + 1)
        found = 1
        exit
      }
    }
    END { if (!found) print "" }
  ' "$path"
}

#######################################
# Run one gate command, capture its output, and report pass/fail.
# Arguments: $1 - label used in the "<label>=1" / "<label>=0" report line
#            $2 - file that receives the command's stdout and stderr
#            $3.. - the command to run, with its arguments
# Outputs:   "<label>=1" on success or "<label>=0" on failure, on stdout
# Returns:   0 when the command succeeds, 1 otherwise
#######################################
run_gate() {
  local label="$1"
  local output="$2"
  shift 2
  if "$@" >"${output}" 2>&1; then
    printf '%s=1\n' "${label}"
    return 0
  fi
  printf '%s=0\n' "${label}"
  return 1
}

# Per-run log directory under ${TMPDIR:-/tmp}/awoooi-dr-offsite-wait.
# The run id combines a timestamp with this shell's PID so that two runs
# started within the same second do not write into the same directory.
log_root="${TMPDIR:-/tmp}/awoooi-dr-offsite-wait"
mkdir -p "${log_root}"
run_id="$(date +%Y%m%d-%H%M%S)-$$"
log_dir="${log_root}/${run_id}"
mkdir -p "${log_dir}"

# Run banner: title, local timestamp, the endpoints in use and the log
# directory for this run, followed by a blank line.
echo "AWOOOI DR offsite convergence wait"
date '+%Y-%m-%d %H:%M:%S %Z'
echo "PROMETHEUS_URL=${PROMETHEUS_URL}"
echo "ALERTMANAGER_URL=${ALERTMANAGER_URL}"
echo "LOG_DIR=${log_dir}"
echo

# Epoch seconds at start (basis for elapsed-time checks) and attempt counter.
started_at="$(date +%s)"
attempt=0

# Main wait loop. Each pass runs the three read-only gates, prints a one-line
# summary, and runs the final checklist only when all three gates passed.
# Exits 0 as soon as the final checklist passes; exits 1 after a single pass
# when ONCE is "1", or once TIMEOUT_SECONDS have elapsed; otherwise sleeps
# INTERVAL_SECONDS and tries again.
while :; do
  attempt=$((attempt + 1))
  now="$(date +%s)"
  elapsed=$((now - started_at))
  attempt_dir="${log_dir}/attempt-${attempt}"
  mkdir -p "${attempt_dir}"

  # One log file per gate, kept per attempt.
  scorecard_log="${attempt_dir}/scorecard-require-dr.log"
  prom_log="${attempt_dir}/prometheus-dr-ready.log"
  visibility_log="${attempt_dir}/backup-alert-visibility.log"
  final_log="${attempt_dir}/final-checklist-require-dr.log"

  scorecard_ok=0
  prometheus_ok=0
  visibility_ok=0
  final_ok=0

  # All three gates run on every pass, regardless of earlier failures.
  if run_gate SCORECARD_READY "${scorecard_log}" \
    bash "${ROOT_DIR}/scripts/reboot-recovery/full-stack-recovery-scorecard.sh" --require-dr; then
    scorecard_ok=1
  fi

  if run_gate PROMETHEUS_READY "${prom_log}" \
    python3 "${ROOT_DIR}/scripts/ops/recovery-scorecard-contract-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --expect-core-ready \
    --expect-dr-ready; then
    prometheus_ok=1
  fi

  if run_gate BACKUP_VISIBILITY_READY "${visibility_log}" \
    python3 "${ROOT_DIR}/scripts/ops/backup-alert-live-visibility-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --alertmanager-url "${ALERTMANAGER_URL}"; then
    visibility_ok=1
  fi

  # Status fields are read back from the KEY=VALUE lines in the scorecard log.
  recovery_state="$(kv_from_file "${scorecard_log}" RECOVERY_STATE)"
  next_step="$(kv_from_file "${scorecard_log}" NEXT_STEP)"
  escrow_missing="$(kv_from_file "${scorecard_log}" ESCROW_MISSING_COUNT)"
  full_marker="$(kv_from_file "${scorecard_log}" OFFSITE_FULL_MARKER_PRESENT)"
  offsite_configured="$(kv_from_file "${scorecard_log}" OFFSITE_CONFIGURED)"

  # Fields missing from the scorecard log are reported as "unknown".
  printf 'ATTEMPT=%s ELAPSED_SECONDS=%s SCORECARD_READY=%s PROMETHEUS_READY=%s BACKUP_VISIBILITY_READY=%s OFFSITE_CONFIGURED=%s FULL_MARKER=%s ESCROW_MISSING_COUNT=%s RECOVERY_STATE=%s NEXT_STEP=%s\n' \
    "${attempt}" \
    "${elapsed}" \
    "${scorecard_ok}" \
    "${prometheus_ok}" \
    "${visibility_ok}" \
    "${offsite_configured:-unknown}" \
    "${full_marker:-unknown}" \
    "${escrow_missing:-unknown}" \
    "${recovery_state:-unknown}" \
    "${next_step:-unknown}"

  # The final checklist is only attempted once every earlier gate has passed.
  if [ "${scorecard_ok}" -eq 1 ] && [ "${prometheus_ok}" -eq 1 ] && [ "${visibility_ok}" -eq 1 ]; then
    if bash "${ROOT_DIR}/scripts/reboot-recovery/dr-offsite-operator-checklist.sh" --require-dr --no-color >"${final_log}" 2>&1; then
      final_ok=1
    fi
  fi

  if [ "${final_ok}" -eq 1 ]; then
    ok "DR offsite final gate converged"
    echo "FINAL_CHECKLIST_LOG=${final_log}"
    exit 0
  fi

  # --once: report the result of this single pass instead of waiting.
  if [ "${ONCE}" = "1" ]; then
    block "DR offsite final gate is not ready yet"
    echo "LAST_ATTEMPT_DIR=${attempt_dir}"
    exit 1
  fi

  # Re-read the clock: the gates above may have taken a while to run.
  now="$(date +%s)"
  elapsed=$((now - started_at))
  if [ "${elapsed}" -ge "${TIMEOUT_SECONDS}" ]; then
    block "timed out waiting for DR offsite final gate"
    echo "LAST_ATTEMPT_DIR=${attempt_dir}"
    echo "下一步:如果 ESCROW_MISSING_COUNT 仍大於 0,先由人工在 110 寫入真實非 secret evidence-id;如果已為 0,檢查 Prometheus scrape/rule 與 Alertmanager 收斂。"
    exit 1
  fi

  pending "waiting ${INTERVAL_SECONDS}s for marker/textfile/Prometheus/Alertmanager convergence"
  sleep "${INTERVAL_SECONDS}"
done