Files
awoooi/scripts/reboot-recovery/full-stack-recovery-scorecard.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

190 lines
5.8 KiB
Bash
Executable File

#!/usr/bin/env bash
# Read-only scorecard for reboot recovery and backup DR readiness.
# Strict mode: abort on errors, on unset variables and on failed pipeline stages.
set -euo pipefail

# Repository root, resolved as two directories above this script's location.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Connection settings. A non-empty value from the environment wins; otherwise
# the default after ':=' is assigned.
: "${REMOTE_110:=wooo@192.168.0.110}"
: "${PROMETHEUS_URL:=http://192.168.0.110:9090}"
: "${ALERTMANAGER_URL:=http://192.168.0.110:9093}"
: "${SSH_BATCH_MODE:=yes}"
: "${SSH_STRICT_HOST_KEY_CHECKING:=accept-new}"

# Gate flags; the command-line parser switches them to 1.
REQUIRE_CORE=0
REQUIRE_DR=0
# Print the command-line help text (synopsis, options, environment variables)
# to stdout, one line per printf argument.
usage() {
  printf '%s\n' \
    'Usage: bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh [--require-core] [--require-dr] [--require-all]' \
    'Read-only scorecard for reboot recovery and DR readiness.' \
    'Options:' \
    '--require-core Exit non-zero unless core cold-start recovery is ready.' \
    '--require-dr Exit non-zero unless rclone/offsite + escrow + full marker are ready.' \
    '--require-all Require both core and DR readiness.' \
    'Environment:' \
    'REMOTE_110, PROMETHEUS_URL, ALERTMANAGER_URL, SSH_BATCH_MODE,' \
    'SSH_STRICT_HOST_KEY_CHECKING.'
}
# Consume command-line flags left to right. Every accepted flag falls through
# to the single shift below; --help and unknown arguments exit inside the case.
while (( $# > 0 )); do
  case "$1" in
    --require-core)
      REQUIRE_CORE=1
      ;;
    --require-dr)
      REQUIRE_DR=1
      ;;
    --require-all)
      REQUIRE_CORE=1
      REQUIRE_DR=1
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done

# Options shared by every ssh invocation in this script.
ssh_opts=(
  -o "BatchMode=$SSH_BATCH_MODE"
  -o ConnectTimeout=6
  -o "StrictHostKeyChecking=$SSH_STRICT_HOST_KEY_CHECKING"
)
# Emit one scorecard line on stdout in KEY=VALUE form.
#   $1 - key name
#   $2 - value to report
status_value() {
  local label="$1" reading="$2"
  printf '%s=%s\n' "${label}" "${reading}"
}
# Run an instant query against the Prometheus HTTP API and print one value.
#   Globals:   PROMETHEUS_URL (read; trailing slashes are stripped)
#   Arguments: $1 - PromQL expression
#   Outputs:   the value string of the first result row on stdout, or "0"
#              when the query returns no rows
#   Returns:   python3's exit status; non-zero when the request or the JSON
#              decoding raises, so callers can substitute their own fallback
metric_value() {
  local expr="$1"
  # The expression travels through the environment, so it needs no quoting
  # inside the Python source. The heredoc body must keep Python's own
  # indentation and start at column 0.
  PROMETHEUS_URL="$PROMETHEUS_URL" EXPR="$expr" python3 - <<'PY'
import json
import os
import urllib.parse
import urllib.request

base = os.environ["PROMETHEUS_URL"].rstrip("/")
expr = os.environ["EXPR"]
url = base + "/api/v1/query?" + urllib.parse.urlencode({"query": expr})
# Close the HTTP response explicitly rather than leaving it to the GC.
with urllib.request.urlopen(url, timeout=8) as response:
    payload = json.load(response)
rows = payload.get("data", {}).get("result") or []
if not rows:
    print("0")
else:
    # Only the first row is reported; its value is a [timestamp, "value"] pair.
    value = rows[0].get("value") or [0, "0"]
    print(value[1])
PY
}
# Reduce a PromQL expression to a 0/1 flag.
#   Arguments: $1 - PromQL expression, evaluated through metric_value
#   Outputs:   "1" when the sampled value parses as a float greater than
#              zero, otherwise "0"
bool_metric() {
  local expr="$1"
  local value
  # A failed query is deliberately treated as "0" (condition not met); its
  # stderr is discarded so the scorecard output stays clean.
  value="$(metric_value "$expr" 2>/dev/null || echo 0)"
  # The heredoc body must keep Python's own indentation and start at column 0.
  python3 - "$value" <<'PY'
import sys

try:
    print(1 if float(sys.argv[1]) > 0 else 0)
except Exception:
    # Anything float() cannot parse counts as "not met".
    print(0)
PY
}
# Report banner: title, local timestamp, blank separator line.
printf '%s\n' "AWOOOI full-stack recovery scorecard"
date '+%Y-%m-%d %H:%M:%S %Z'
printf '\n'

# Cold-start signals from Prometheus. Each reading is printed as soon as it is
# fetched. A failed numeric query reports 999, which can never compare equal
# to zero in the readiness check.
cold_green="$(bool_metric 'awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"}')"
status_value CORE_COLD_START_GREEN "$cold_green"

cold_warn="$(metric_value 'awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"}' 2>/dev/null || echo 999)"
status_value CORE_COLD_START_WARN_GATES "$cold_warn"

cold_blocked="$(metric_value 'awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"}' 2>/dev/null || echo 999)"
status_value CORE_COLD_START_BLOCKED_GATES "$cold_blocked"

cold_alerts="$(metric_value 'count(ALERTS{alertname=~"ColdStart.*",alertstate="firing"})' 2>/dev/null || echo 999)"
status_value CORE_COLD_START_FIRING_ALERTS "$cold_alerts"
# Deploy-parity check: the helper's exit status becomes a 0/1 flag and its
# combined output is kept in a log file.
# NOTE(review): the artifact paths under /tmp are fixed names that are also
# printed at the end of the script; confirm a predictable location is acceptable.
deploy_parity=0
if bash "$ROOT_DIR/scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh" \
  >/tmp/awoooi-scorecard-cold-start-parity.log 2>&1; then
  deploy_parity=1
fi
status_value CORE_COLD_START_DEPLOY_PARITY "$deploy_parity"

# Backup-gap alert visibility check, reported the same way.
alert_visibility=0
visibility_cmd=(
  python3 "$ROOT_DIR/scripts/ops/backup-alert-live-visibility-check.py"
  --prometheus-url "$PROMETHEUS_URL"
  --alertmanager-url "$ALERTMANAGER_URL"
)
if "${visibility_cmd[@]}" >/tmp/awoooi-scorecard-backup-alert-visibility.log 2>&1; then
  alert_visibility=1
fi
status_value BACKUP_GAP_ALERT_VISIBILITY "$alert_visibility"

# Fetch the offsite/escrow evidence report over ssh. A failure is tolerated
# (the report is then empty or partial); stderr is kept as an artifact.
evidence_report="$(
  ssh "${ssh_opts[@]}" "$REMOTE_110" \
    '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' \
    2>/tmp/awoooi-scorecard-offsite-evidence.err || true
)"
# Look up KEY in the KEY=VALUE lines of the evidence report.
#   Globals:   evidence_report (read)
#   Arguments: $1 - exact key name
#   Outputs:   the value of the first matching line, or an empty line when
#              the key is absent
# Everything after the FIRST '=' is returned, so values that contain '='
# themselves (for example a NEXT_STEP command line) are not truncated.
extract_report_value() {
  local key="$1"
  awk -v key="$key" '
    {
      sep = index($0, "=")
      # substr() yields a string, so the key is always compared as text.
      if (sep > 0 && substr($0, 1, sep - 1) == key) {
        print substr($0, sep + 1)
        found = 1
        exit
      }
    }
    END { if (!found) print "" }
  ' <<<"${evidence_report:-}"
}
# Pull the individual fields out of the evidence report.
offsite_configured="$(extract_report_value OFFSITE_CONFIGURED)"
rclone_configured="$(extract_report_value RCLONE_CONFIGURED)"
b2_configured="$(extract_report_value B2_CONFIGURED)"
partial_marker="$(extract_report_value PARTIAL_MARKER_PRESENT)"
full_marker="$(extract_report_value FULL_MARKER_PRESENT)"
escrow_missing="$(extract_report_value ESCROW_MISSING_COUNT)"
next_step="$(extract_report_value NEXT_STEP)"

# OFFSITE_CONFIGURED falls back to the B2 field when the report does not
# carry it; every field that is still empty is shown as "unknown".
offsite_display="${offsite_configured:-${b2_configured:-unknown}}"
status_value OFFSITE_CONFIGURED "$offsite_display"
status_value OFFSITE_RCLONE_CONFIGURED "${rclone_configured:-unknown}"
status_value OFFSITE_B2_LEGACY_CONFIGURED "${b2_configured:-unknown}"
status_value OFFSITE_PARTIAL_MARKER_PRESENT "${partial_marker:-unknown}"
status_value OFFSITE_FULL_MARKER_PRESENT "${full_marker:-unknown}"
status_value ESCROW_MISSING_COUNT "${escrow_missing:-unknown}"
status_value NEXT_STEP "${next_step:-unknown}"
# Core readiness: last cold start green, and the warn-gate, blocked-gate and
# firing-alert readings all equal to "0" once a trailing ".<suffix>" is
# stripped. Start from the pessimistic state and upgrade only when all hold.
core_state="CORE_NOT_READY"
if [[ "$cold_green" == "1" \
  && "${cold_warn%.*}" == "0" \
  && "${cold_blocked%.*}" == "0" \
  && "${cold_alerts%.*}" == "0" ]]; then
  core_state="CORE_READY"
fi

# DR readiness: offsite configured (B2 field as fallback), no missing escrow
# items, and the full marker present. Empty fields count as not ready.
dr_state="DR_OFFSITE_PENDING"
offsite_flag="${offsite_configured:-${b2_configured:-0}}"
if [[ "$offsite_flag" == "1" \
  && "${escrow_missing:-999}" == "0" \
  && "${full_marker:-0}" == "1" ]]; then
  dr_state="DR_OFFSITE_READY"
fi

status_value RECOVERY_STATE "${core_state}_${dr_state}"
# List the log files written earlier in the run.
printf '\n'
printf '%s\n' \
  "Artifacts:" \
  "- /tmp/awoooi-scorecard-cold-start-parity.log" \
  "- /tmp/awoooi-scorecard-backup-alert-visibility.log" \
  "- /tmp/awoooi-scorecard-offsite-evidence.err"

# Apply the requested gates. Both are evaluated so that each failing gate
# prints its own message before the script exits.
exit_code=0
if [[ "$REQUIRE_CORE" == "1" && "$core_state" != "CORE_READY" ]]; then
  echo "BLOCKED require-core failed: ${core_state}" >&2
  exit_code=1
fi
if [[ "$REQUIRE_DR" == "1" && "$dr_state" != "DR_OFFSITE_READY" ]]; then
  echo "BLOCKED require-dr failed: ${dr_state}; NEXT_STEP=${next_step:-unknown}" >&2
  exit_code=1
fi
exit "$exit_code"