Files
awoooi/scripts/reboot-recovery/wait-dr-offsite-ready.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

228 lines
6.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Wait for the post-escrow DR offsite gate to converge.
#
# 2026-05-20 ogt + Codex:
# - Read-only wait: once the human-written credential escrow markers are in
#   place, poll until the repo scorecard, the Prometheus recording rule,
#   Alertmanager visibility and the final checklist all agree.
# - Never reads, writes or prints any secret; never creates a marker; never
#   triggers an offsite sync.
set -o errexit -o nounset -o pipefail

# Repository root: two directory levels above the directory of this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
ROOT_DIR="$(cd "${script_dir}/../.." && pwd)"

# Service endpoints; a non-empty value from the environment wins over the
# built-in default.
: "${PROMETHEUS_URL:=http://192.168.0.110:9090}"
: "${ALERTMANAGER_URL:=http://192.168.0.110:9093}"

# Option defaults; the command-line parser below may overwrite them.
TIMEOUT_SECONDS=900
INTERVAL_SECONDS=30
NO_COLOR=0
ONCE=0
#######################################
# Print the help text (usage, purpose, safety rules, environment variables).
# Arguments: none
# Outputs:   the help text on stdout; callers redirect it when needed
#######################################
usage() {
  cat <<'HELP'
Usage:
bash scripts/reboot-recovery/wait-dr-offsite-ready.sh [--timeout-seconds N] [--interval-seconds N] [--no-color]
bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --once [--no-color]
Purpose:
After the human operator writes the five credential escrow markers on 110,
wait until all read-only DR gates converge:
1. full-stack-recovery-scorecard.sh --require-dr
2. Prometheus recovery recording rule with --expect-dr-ready
3. backup-alert-live-visibility-check.py
4. dr-offsite-operator-checklist.sh --require-dr
Rules:
- This script never writes escrow markers.
- This script never uploads or deletes backup data.
- This script never prints credential values.
- It only waits for scrape/rule/Alertmanager convergence after a real human escrow review.
Environment:
PROMETHEUS_URL, ALERTMANAGER_URL, REMOTE_110, SSH_BATCH_MODE,
SSH_STRICT_HOST_KEY_CHECKING.
HELP
}
#######################################
# Parse the command-line options into the script-level settings.
# Globals:   TIMEOUT_SECONDS, INTERVAL_SECONDS, ONCE, NO_COLOR (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for -h/--help; diagnostics on stderr
# Exits:     0 after printing help; 2 on an unknown option or a missing value
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --timeout-seconds | --interval-seconds)
        # Without this guard `shift 2` fails when the value is missing and
        # `set -e` ends the script silently with status 1, which is
        # indistinguishable from the "gate not ready" exit status.
        if [ "$#" -lt 2 ]; then
          echo "Missing value for $1" >&2
          usage >&2
          exit 2
        fi
        if [ "$1" = "--timeout-seconds" ]; then
          TIMEOUT_SECONDS="$2"
        else
          INTERVAL_SECONDS="$2"
        fi
        shift 2
        ;;
      --once)
        ONCE=1
        shift
        ;;
      --no-color)
        NO_COLOR=1
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
#######################################
# Abort the script unless an option value is a positive decimal integer.
# Arguments: $1 - option name used in the error message
#            $2 - value to validate
# Outputs:   an error message on stderr when the value is rejected
# Exits:     2 when the value is not a positive integer
#######################################
require_positive_int() {
  local option="$1"
  local value="$2"
  # Leading zeros are tolerated, but at most 18 significant digits are allowed.
  # A larger number makes `[ ... -le 0 ]` / `[ ... -ge ... ]` fail with
  # "integer expression expected"; that error used to be read as "valid" here
  # and later kept the timeout comparison from ever becoming true.
  local -r pattern='^0*[1-9][0-9]{0,17}$'
  if ! [[ "${value}" =~ ${pattern} ]]; then
    printf '%s\n' "${option} 必須是正整數" >&2
    exit 2
  fi
}
require_positive_int --timeout-seconds "${TIMEOUT_SECONDS}"
require_positive_int --interval-seconds "${INTERVAL_SECONDS}"
# ANSI colour escapes used by the status tags; all stay empty when colour
# output was disabled (NO_COLOR=1).
green=""
yellow=""
red=""
reset=""
if [ "${NO_COLOR}" != "1" ]; then
  green=$'\033[32m'
  yellow=$'\033[33m'
  red=$'\033[31m'
  reset=$'\033[0m'
fi
# Print an "OK" tag (wrapped in the green/reset escapes) followed by the
# message formed from all arguments joined by spaces.
ok() {
  printf '%s\n' "${green}OK${reset} $*"
}
# Print a "PENDING" tag (wrapped in the yellow/reset escapes) followed by the
# message formed from all arguments joined by spaces.
pending() {
  printf '%s\n' "${yellow}PENDING${reset} $*"
}
# Print a "BLOCKED" tag (wrapped in the red/reset escapes) followed by the
# message formed from all arguments joined by spaces.
block() {
  printf '%s\n' "${red}BLOCKED${reset} $*"
}
#######################################
# Look up the value of the first KEY=VALUE line in a file.
# Arguments: $1 - path of the file to scan
#            $2 - exact key to look for (the text before the first '=')
# Outputs:   the value on stdout: everything after the first '=', so values
#            that themselves contain '=' are returned whole. Prints an empty
#            line when the key is absent or the file cannot be read.
# Returns:   0, also when nothing was found
#######################################
kv_from_file() {
  local path="$1"
  local key="$2"
  # An unreadable log means "no value", not a fatal error: awk failing here
  # would abort the caller under `set -e`.
  if [ ! -r "${path}" ]; then
    echo ""
    return 0
  fi
  awk -v key="${key}" '
    {
      sep = index($0, "=")
      if (sep > 0 && substr($0, 1, sep - 1) == key) {
        print substr($0, sep + 1)
        found = 1
        exit
      }
    }
    END { if (!found) print "" }
  ' "${path}"
}
#######################################
# Run one gate command, capture its output, and report the result.
# Arguments: $1   - label used in the LABEL=0/1 result line
#            $2   - file receiving the command's stdout and stderr
#            $3.. - command to run, with its arguments
# Outputs:   "LABEL=1" on stdout when the command succeeds, else "LABEL=0"
# Returns:   0 when the command succeeds, 1 otherwise
#######################################
run_gate() {
  local gate_name="$1"
  local log_path="$2"
  shift 2
  local passed=1
  "$@" >"${log_path}" 2>&1 || passed=0
  printf '%s=%s\n' "${gate_name}" "${passed}"
  return "$((1 - passed))"
}
# Per-run log directory below ${TMPDIR:-/tmp}/awoooi-dr-offsite-wait.
log_root="${TMPDIR:-/tmp}/awoooi-dr-offsite-wait"
mkdir -p "${log_root}"
run_id="$(date +%Y%m%d-%H%M%S)"
# The timestamp only has one-second resolution, so mktemp adds a random suffix:
# two runs started within the same second must not share a directory and
# overwrite each other's attempt logs.
log_dir="$(mktemp -d "${log_root}/${run_id}.XXXXXX")" || {
  echo "Cannot create a log directory under ${log_root}" >&2
  exit 1
}

# Banner: what is running, when, against which endpoints, and where logs go.
echo "AWOOOI DR offsite convergence wait"
date '+%Y-%m-%d %H:%M:%S %Z'
echo "PROMETHEUS_URL=${PROMETHEUS_URL}"
echo "ALERTMANAGER_URL=${ALERTMANAGER_URL}"
echo "LOG_DIR=${log_dir}"
echo
# Poll the gates until the final checklist passes (exit 0), a single --once
# pass is not ready (exit 1), or the timeout has elapsed (exit 1).
started_at="$(date +%s)"
attempt=0
while true; do
  attempt=$((attempt + 1))
  elapsed=$(($(date +%s) - started_at))

  # Every attempt keeps its logs in its own sub-directory.
  attempt_dir="${log_dir}/attempt-${attempt}"
  mkdir -p "${attempt_dir}"
  scorecard_log="${attempt_dir}/scorecard-require-dr.log"
  prom_log="${attempt_dir}/prometheus-dr-ready.log"
  visibility_log="${attempt_dir}/backup-alert-visibility.log"
  final_log="${attempt_dir}/final-checklist-require-dr.log"

  # Gate 1: repository scorecard with DR required.
  if run_gate SCORECARD_READY "${scorecard_log}" \
    bash "${ROOT_DIR}/scripts/reboot-recovery/full-stack-recovery-scorecard.sh" --require-dr; then
    scorecard_ok=1
  else
    scorecard_ok=0
  fi

  # Gate 2: Prometheus recovery contract, expecting core and DR ready.
  if run_gate PROMETHEUS_READY "${prom_log}" \
    python3 "${ROOT_DIR}/scripts/ops/recovery-scorecard-contract-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --expect-core-ready \
    --expect-dr-ready; then
    prometheus_ok=1
  else
    prometheus_ok=0
  fi

  # Gate 3: backup alert visibility through Prometheus and Alertmanager.
  if run_gate BACKUP_VISIBILITY_READY "${visibility_log}" \
    python3 "${ROOT_DIR}/scripts/ops/backup-alert-live-visibility-check.py" \
    --prometheus-url "${PROMETHEUS_URL}" \
    --alertmanager-url "${ALERTMANAGER_URL}"; then
    visibility_ok=1
  else
    visibility_ok=0
  fi

  # Summary fields are taken from the KEY=VALUE lines of the scorecard log;
  # a field that is absent is reported as "unknown".
  state_field="$(kv_from_file "${scorecard_log}" RECOVERY_STATE)"
  step_field="$(kv_from_file "${scorecard_log}" NEXT_STEP)"
  escrow_field="$(kv_from_file "${scorecard_log}" ESCROW_MISSING_COUNT)"
  marker_field="$(kv_from_file "${scorecard_log}" OFFSITE_FULL_MARKER_PRESENT)"
  configured_field="$(kv_from_file "${scorecard_log}" OFFSITE_CONFIGURED)"
  printf 'ATTEMPT=%s ELAPSED_SECONDS=%s SCORECARD_READY=%s PROMETHEUS_READY=%s BACKUP_VISIBILITY_READY=%s OFFSITE_CONFIGURED=%s FULL_MARKER=%s ESCROW_MISSING_COUNT=%s RECOVERY_STATE=%s NEXT_STEP=%s\n' \
    "${attempt}" \
    "${elapsed}" \
    "${scorecard_ok}" \
    "${prometheus_ok}" \
    "${visibility_ok}" \
    "${configured_field:-unknown}" \
    "${marker_field:-unknown}" \
    "${escrow_field:-unknown}" \
    "${state_field:-unknown}" \
    "${step_field:-unknown}"

  # Gate 4 (final checklist) only runs once the first three gates all passed;
  # its success ends the wait.
  if [[ "${scorecard_ok}${prometheus_ok}${visibility_ok}" == "111" ]] \
    && bash "${ROOT_DIR}/scripts/reboot-recovery/dr-offsite-operator-checklist.sh" --require-dr --no-color >"${final_log}" 2>&1; then
    ok "DR offsite final gate converged"
    echo "FINAL_CHECKLIST_LOG=${final_log}"
    exit 0
  fi

  # Single-pass mode: report the current state instead of waiting.
  if [ "${ONCE}" = "1" ]; then
    block "DR offsite final gate is not ready yet"
    echo "LAST_ATTEMPT_DIR=${attempt_dir}"
    exit 1
  fi

  # Re-read the clock: the gates above may have taken a while to run.
  elapsed=$(($(date +%s) - started_at))
  if [ "${elapsed}" -ge "${TIMEOUT_SECONDS}" ]; then
    block "timed out waiting for DR offsite final gate"
    echo "LAST_ATTEMPT_DIR=${attempt_dir}"
    echo "下一步:如果 ESCROW_MISSING_COUNT 仍大於 0先由人工在 110 寫入真實非 secret evidence-id如果已為 0檢查 Prometheus scrape/rule 與 Alertmanager 收斂。"
    exit 1
  fi

  pending "waiting ${INTERVAL_SECONDS}s for marker/textfile/Prometheus/Alertmanager convergence"
  sleep "${INTERVAL_SECONDS}"
done