#!/usr/bin/env bash
# Read-only scorecard for reboot recovery and backup DR readiness.
#
# Prints KEY=VALUE status lines followed by a combined RECOVERY_STATE line.
# Every setting below except ROOT_DIR and the REQUIRE_* flags can be
# overridden from the environment.

set -euo pipefail

# Two directories above the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# SSH target (user@host) that the offsite/escrow evidence report is run on.
REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}"

# Base URLs of the Prometheus and Alertmanager instances that are queried.
PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}"
ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://192.168.0.110:9093}"

# Values passed to ssh as "-o BatchMode=..." and "-o StrictHostKeyChecking=...".
SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}"
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"

# Set to 1 by --require-core / --require-dr / --require-all.
REQUIRE_CORE=0
REQUIRE_DR=0

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (callers redirect to stderr when needed)
#######################################
usage() {
  # Quoted delimiter: the help text is emitted literally, with no expansion.
  cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh [--require-core] [--require-dr] [--require-all]

Read-only scorecard for reboot recovery and DR readiness.

Options:
  --require-core  Exit non-zero unless core cold-start recovery is ready.
  --require-dr    Exit non-zero unless rclone/offsite + escrow + full marker are ready.
  --require-all   Require both core and DR readiness.

Environment:
  REMOTE_110, PROMETHEUS_URL, ALERTMANAGER_URL, SSH_BATCH_MODE,
  SSH_STRICT_HOST_KEY_CHECKING.
USAGE
}

#######################################
# Parse command-line flags.
# Globals:   REQUIRE_CORE, REQUIRE_DR (written)
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for -h/--help; an error plus help text on
#            stderr for an unrecognised argument
# Returns:   does not return on -h/--help (exit 0) or on an unknown
#            argument (exit 2)
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --require-core)
        REQUIRE_CORE=1
        ;;
      --require-dr)
        REQUIRE_DR=1
        ;;
      --require-all)
        REQUIRE_CORE=1
        REQUIRE_DR=1
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
    # Every arm that falls through consumes exactly one argument.
    shift
  done
}

parse_args "$@"

# Options shared by every ssh invocation. Kept as an array so each option
# stays a separate word when expanded as "${ssh_opts[@]}".
ssh_opts=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING")

#######################################
# Emit one scorecard line in KEY=VALUE form.
# Arguments: $1 - key
#            $2 - value
# Outputs:   "<key>=<value>" followed by a newline on stdout
#######################################
status_value() {
  local key="$1"
  local value="$2"
  # printf with a fixed format prints the value literally, whatever it holds.
  printf '%s=%s\n' "$key" "$value"
}

#######################################
# Run a Prometheus instant query and print the value of the first series.
# Globals:   PROMETHEUS_URL (read)
# Arguments: $1 - PromQL expression
# Outputs:   the sample value string of the first result row on stdout,
#            or "0" when the query returns no rows
# Returns:   python3's exit status; non-zero when the request or JSON
#            decoding raises (callers supply their own fallback value)
#######################################
metric_value() {
  local expr="$1"
  # The expression is handed over through the environment rather than
  # interpolated into the Python source, so PromQL quotes and braces
  # need no escaping.
  PROMETHEUS_URL="$PROMETHEUS_URL" EXPR="$expr" python3 - <<'PY'
import json
import os
import urllib.parse
import urllib.request

base = os.environ["PROMETHEUS_URL"].rstrip("/")
expr = os.environ["EXPR"]
url = base + "/api/v1/query?" + urllib.parse.urlencode({"query": expr})
payload = json.load(urllib.request.urlopen(url, timeout=8))
rows = payload.get("data", {}).get("result") or []
if not rows:
    print("0")
else:
    # Each row's "value" is indexed as [_, value]; only the value is printed.
    value = rows[0].get("value") or [0, "0"]
    print(value[1])
PY
}

#######################################
# Reduce a Prometheus instant query to a 0/1 flag.
# Arguments: $1 - PromQL expression
# Outputs:   "1" on stdout when the metric value parses as a float greater
#            than zero; "0" otherwise (query failure, unparsable value,
#            zero or negative value)
#######################################
bool_metric() {
  local expr="$1"
  local value
  # A failed query is deliberately treated as 0 instead of aborting the
  # scorecard under `set -e`.
  value="$(metric_value "$expr" 2>/dev/null || echo 0)"
  python3 - "$value" <<'PY'
import sys

try:
    print(1 if float(sys.argv[1]) > 0 else 0)
except Exception:
    print(0)
PY
}

# Report header: title, local timestamp, blank separator line.
echo "AWOOOI full-stack recovery scorecard"
date '+%Y-%m-%d %H:%M:%S %Z'
echo

# Cold-start signals from Prometheus. cold_green is a 0/1 flag. The three
# counters fall back to 999 when the query itself fails, so an unreachable
# Prometheus can never read as "zero problems".
cold_green="$(bool_metric 'awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"}')"
cold_warn="$(metric_value 'awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"}' 2>/dev/null || echo 999)"
cold_blocked="$(metric_value 'awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"}' 2>/dev/null || echo 999)"
cold_alerts="$(metric_value 'count(ALERTS{alertname=~"ColdStart.*",alertstate="firing"})' 2>/dev/null || echo 999)"

status_value CORE_COLD_START_GREEN "$cold_green"
status_value CORE_COLD_START_WARN_GATES "$cold_warn"
status_value CORE_COLD_START_BLOCKED_GATES "$cold_blocked"
status_value CORE_COLD_START_FIRING_ALERTS "$cold_alerts"

# Each external check is reduced to 1 (exit status 0) or 0 (non-zero); its
# combined stdout/stderr is kept in a log file for later inspection.
# NOTE(review): the log paths are fixed names under /tmp because they are
# listed verbatim in the "Artifacts" section of the output; consider a
# private directory if this runs on a shared host.
if bash "$ROOT_DIR/scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh" >/tmp/awoooi-scorecard-cold-start-parity.log 2>&1; then
  status_value CORE_COLD_START_DEPLOY_PARITY 1
else
  status_value CORE_COLD_START_DEPLOY_PARITY 0
fi

if python3 "$ROOT_DIR/scripts/ops/backup-alert-live-visibility-check.py" \
  --prometheus-url "$PROMETHEUS_URL" \
  --alertmanager-url "$ALERTMANAGER_URL" \
  >/tmp/awoooi-scorecard-backup-alert-visibility.log 2>&1; then
  status_value BACKUP_GAP_ALERT_VISIBILITY 1
else
  status_value BACKUP_GAP_ALERT_VISIBILITY 0
fi

# Fetch the offsite/escrow evidence report from the remote host. stderr goes
# to a file, and `|| true` keeps an ssh failure from aborting the script
# under `set -e`; in that case the report is whatever was printed before the
# failure (possibly empty).
evidence_report="$(ssh "${ssh_opts[@]}" "$REMOTE_110" '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' 2>/tmp/awoooi-scorecard-offsite-evidence.err || true)"

#######################################
# Look up one KEY=VALUE field in the evidence report.
# Globals:   evidence_report (read)
# Arguments: $1 - key to look up (exact match on the text before the
#                 first "=")
# Outputs:   the value of the first matching line on stdout, i.e. everything
#            after the first "=" (later "=" characters are preserved);
#            an empty line when the key is absent
#######################################
extract_report_value() {
  local key="$1"
  # Split manually on the FIRST "=" only. Splitting with -F= and printing $2
  # would truncate values such as "NEXT_STEP=run x --remote=b2".
  awk -v key="$key" '
    {
      sep = index($0, "=")
      name = (sep > 0) ? substr($0, 1, sep - 1) : $0
      if (name == key) {
        value = ""
        if (sep > 0) {
          value = substr($0, sep + 1)
        }
        print value
        found = 1
        exit
      }
    }
    END { if (!found) print "" }
  ' <<<"$evidence_report"
}

# Pull the individual fields out of the evidence report. A field that is
# missing from the report comes back as an empty string.
offsite_configured="$(extract_report_value OFFSITE_CONFIGURED)"
rclone_configured="$(extract_report_value RCLONE_CONFIGURED)"
b2_configured="$(extract_report_value B2_CONFIGURED)"
escrow_missing="$(extract_report_value ESCROW_MISSING_COUNT)"
partial_marker="$(extract_report_value PARTIAL_MARKER_PRESENT)"
full_marker="$(extract_report_value FULL_MARKER_PRESENT)"
next_step="$(extract_report_value NEXT_STEP)"

# Print every field, substituting "unknown" for empty ones. OFFSITE_CONFIGURED
# falls back to the B2 field when the report has no OFFSITE_CONFIGURED value.
status_value OFFSITE_CONFIGURED "${offsite_configured:-${b2_configured:-unknown}}"
status_value OFFSITE_RCLONE_CONFIGURED "${rclone_configured:-unknown}"
status_value OFFSITE_B2_LEGACY_CONFIGURED "${b2_configured:-unknown}"
status_value OFFSITE_PARTIAL_MARKER_PRESENT "${partial_marker:-unknown}"
status_value OFFSITE_FULL_MARKER_PRESENT "${full_marker:-unknown}"
status_value ESCROW_MISSING_COUNT "${escrow_missing:-unknown}"
status_value NEXT_STEP "${next_step:-unknown}"

#######################################
# Succeed when a sample string denotes exactly zero: "0", "0." or "0.000".
# A non-zero fraction such as "0.5" is NOT zero; stripping the fractional
# part before comparing would wrongly accept it.
# Arguments: $1 - sample value string
# Returns:   0 when the value is exactly zero, 1 otherwise
#######################################
is_zero_count() {
  local zero_re='^0(\.0*)?$'
  [[ "$1" =~ $zero_re ]]
}

#######################################
# Derive the core cold-start state.
# Arguments: $1 - green flag (must be "1")
#            $2 - warn gate count
#            $3 - blocked gate count
#            $4 - firing alert count
# Outputs:   CORE_READY when the flag is 1 and all three counts are zero,
#            otherwise CORE_NOT_READY
#######################################
core_state_for() {
  local green="$1" warn="$2" blocked="$3" alerts="$4"
  if [ "$green" = "1" ] \
    && is_zero_count "$warn" \
    && is_zero_count "$blocked" \
    && is_zero_count "$alerts"; then
    echo "CORE_READY"
  else
    echo "CORE_NOT_READY"
  fi
}

#######################################
# Derive the offsite DR state.
# Arguments: $1 - offsite configured flag (must be "1")
#            $2 - count of missing escrow items (must be "0")
#            $3 - full marker present flag (must be "1")
# Outputs:   DR_OFFSITE_READY when all three conditions hold, otherwise
#            DR_OFFSITE_PENDING
#######################################
dr_state_for() {
  local offsite="$1" missing_count="$2" full="$3"
  if [ "$offsite" = "1" ] && [ "$missing_count" = "0" ] && [ "$full" = "1" ]; then
    echo "DR_OFFSITE_READY"
  else
    echo "DR_OFFSITE_PENDING"
  fi
}

core_state="$(core_state_for "$cold_green" "$cold_warn" "$cold_blocked" "$cold_alerts")"

# Empty report fields default to the "not ready" value for each condition.
dr_state="$(dr_state_for "${offsite_configured:-${b2_configured:-0}}" "${escrow_missing:-999}" "${full_marker:-0}")"

status_value RECOVERY_STATE "${core_state}_${dr_state}"

# List the log files written by the checks so a failing item can be
# inspected after the run.
echo
echo "Artifacts:"
echo "- /tmp/awoooi-scorecard-cold-start-parity.log"
echo "- /tmp/awoooi-scorecard-backup-alert-visibility.log"
echo "- /tmp/awoooi-scorecard-offsite-evidence.err"

# The scorecard is informational by default (exit 0). It fails only for the
# requirements that were explicitly requested on the command line, and both
# are evaluated so each unmet one is reported.
exit_code=0
if [ "$REQUIRE_CORE" = "1" ] && [ "$core_state" != "CORE_READY" ]; then
  echo "BLOCKED require-core failed: ${core_state}" >&2
  exit_code=1
fi

if [ "$REQUIRE_DR" = "1" ] && [ "$dr_state" != "DR_OFFSITE_READY" ]; then
  echo "BLOCKED require-dr failed: ${dr_state}; NEXT_STEP=${next_step:-unknown}" >&2
  exit_code=1
fi

exit "$exit_code"