Files
awoooi/scripts/ops/prometheus-rule-drift-guard.sh
2026-05-29 12:41:34 +08:00

150 lines
5.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Guard host 110's Prometheus alert rules against stale deploys.
#
# This script is intentionally narrow: it only restores the canonical alert
# rules file when required recovery/backup rules disappear from live Prometheus
# or when the active file differs from the canonical copy.
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit is not enabled; the functions below check the commands they care about.
set -u -o pipefail

# Settings: every value keeps whatever the environment supplies and falls back
# to the default shown here when it is unset or empty.
: "${HOST_LABEL:=110}"                      # value of the host="..." metric label
: "${PROMETHEUS_URL:=http://127.0.0.1:9090}" # base URL of the Prometheus server
: "${CURRENT_RULES:=/home/wooo/monitoring/alerts.yml}"
: "${CANONICAL_RULES:=/home/wooo/monitoring/alerts-unified.canonical.yml}"
: "${TEXTFILE:=/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}"
: "${LOG_FILE:=/home/wooo/logs/prometheus-rule-drift-guard.log}"

# Rule names that must be present in the rules reported by live Prometheus.
REQUIRED_RULES=(
  'BackupCredentialEscrowEvidenceMissing'
  'BackupExpectedJobMissing'
  'awoooi_recovery_core_ready'
  'awoooi_recovery_dr_offsite_ready'
  'ColdStartRecoveryBlocked'
)
# Append one timestamped line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: the message words; they are joined into a single line via "$*"
# Returns:   status of the append to LOG_FILE
log() {
  local log_dir stamp
  log_dir="$(dirname "$LOG_FILE")"
  # Creating the directory is best-effort; the append below reports real failures.
  mkdir -p "$log_dir" 2>/dev/null || true
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*" >>"$LOG_FILE"
}
# Publish the outcome of this run as Prometheus text-format metrics in TEXTFILE.
# Globals:   TEXTFILE (read), HOST_LABEL (read)
# Arguments: $1 - status label attached to the last-run timestamp metric
#            $2 - value for the "repaired" gauge
#            $3 - value for the "missing_required_count" gauge
#            $4 - value for the "current_matches_canonical" gauge
# Returns:   0 when the temp file cannot be created or written (best-effort,
#            TEXTFILE is left untouched); otherwise the status of the final mv/rm.
write_textfile() {
  local status="$1"
  local repaired="$2"
  local missing_count="$3"
  local matches_canonical="$4"
  local tmp now
  mkdir -p "$(dirname "$TEXTFILE")" 2>/dev/null || true
  # Stage the content beside the target so the final mv is a same-directory
  # rename and readers never observe a half-written TEXTFILE.
  tmp="$(mktemp "${TEXTFILE}.tmp.XXXXXX")" || return 0
  now="$(date +%s)"
  # Only publish a completely written file: if the write fails, drop the temp
  # file and keep whatever metrics were published previously.
  if ! cat >"$tmp" <<EOF
# HELP awoooi_prometheus_rule_drift_guard_last_run_timestamp Unix timestamp of the last Prometheus rule drift guard run.
# TYPE awoooi_prometheus_rule_drift_guard_last_run_timestamp gauge
awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="${HOST_LABEL}",status="${status}"} ${now}
# HELP awoooi_prometheus_rule_drift_guard_repaired Whether the guard restored canonical Prometheus rules on the last run.
# TYPE awoooi_prometheus_rule_drift_guard_repaired gauge
awoooi_prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired}
# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of required live rules missing after the last check.
# TYPE awoooi_prometheus_rule_drift_guard_missing_required_count gauge
awoooi_prometheus_rule_drift_guard_missing_required_count{host="${HOST_LABEL}"} ${missing_count}
# HELP awoooi_prometheus_rule_drift_guard_current_matches_canonical Whether active alerts.yml matches canonical copy.
# TYPE awoooi_prometheus_rule_drift_guard_current_matches_canonical gauge
awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="${HOST_LABEL}"} ${matches_canonical}
EOF
  then
    rm -f "$tmp"
    return 0
  fi
  # mktemp creates the file owner-only; widen it so another user can read it.
  chmod 0644 "$tmp" 2>/dev/null || true
  mv "$tmp" "$TEXTFILE" 2>/dev/null || rm -f "$tmp"
}
# Private helper: ask Prometheus for its loaded rules and print how many of the
# names given as arguments are absent, or "QUERY_FAILED:<error>" on any
# exception raised while querying or parsing.
# Arguments: $1 - Prometheus base URL; remaining args - required rule names
_rules_missing_query() {
  python3 - "$@" <<'PY'
import json
import sys
import urllib.request

base_url = sys.argv[1].rstrip("/")
required = set(sys.argv[2:])
try:
    with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response:
        payload = json.loads(response.read().decode("utf-8"))
    if payload.get("status") != "success":
        raise RuntimeError(payload)
    loaded = {
        str(rule.get("name") or rule.get("alert") or rule.get("record"))
        for group in payload.get("data", {}).get("groups") or []
        for rule in group.get("rules") or []
    }
    print(len(required - loaded))
except Exception as exc:
    print(f"QUERY_FAILED:{exc}")
PY
}

# Report how many REQUIRED_RULES are missing from the rules loaded in Prometheus.
# Globals:  PROMETHEUS_URL (read), REQUIRED_RULES (read)
# Outputs:  exactly one of
#             - a non-negative integer (number of required rules not loaded)
#             - "QUERY_FAILED:<reason>" when the count could not be determined
# Returns:  0 always; callers distinguish the cases by the printed value.
rules_missing_count() {
  local result
  # Without an interpreter the query would print nothing at all; say so instead.
  if ! command -v python3 >/dev/null 2>&1; then
    printf 'QUERY_FAILED:%s\n' "python3 not found"
    return 0
  fi
  result="$(_rules_missing_query "$PROMETHEUS_URL" "${REQUIRED_RULES[@]}")"
  # Callers compare the value numerically, so anything that is neither an
  # integer nor an explicit failure marker is reported as a failed query.
  if [[ "$result" =~ ^[0-9]+$ || "$result" == QUERY_FAILED:* ]]; then
    printf '%s\n' "$result"
  else
    printf 'QUERY_FAILED:unexpected output from rules query: %s\n' "${result:-<empty>}"
  fi
  return 0
}
# Print 1 when the active rules file is byte-identical to the canonical copy,
# otherwise 0 (including when either file does not exist).
# Globals:  CURRENT_RULES (read), CANONICAL_RULES (read)
# Outputs:  "0" or "1" on stdout
matches_canonical() {
  local verdict=0
  if [ -f "$CURRENT_RULES" ] && [ -f "$CANONICAL_RULES" ] \
    && cmp -s "$CURRENT_RULES" "$CANONICAL_RULES"; then
    verdict=1
  fi
  echo "$verdict"
}
# Install the canonical rules over the active rules file and ask Prometheus to
# reload its configuration.
# Globals:  CURRENT_RULES (read; file is overwritten), CANONICAL_RULES (read),
#           PROMETHEUS_URL (read)
# Returns:  0 only when the canonical copy was installed AND the reload request
#           succeeded; non-zero otherwise.
restore_rules() {
  local backup_path
  backup_path="${CURRENT_RULES}.guard.bak.$(date +%Y%m%d%H%M%S)"
  # Best-effort snapshot of the file being replaced; it may not exist at all.
  cp -- "$CURRENT_RULES" "$backup_path" 2>/dev/null || true
  # If the canonical copy cannot be installed there is nothing to reload, and
  # reporting success here would make the caller record a repair that never
  # happened.
  cp -- "$CANONICAL_RULES" "$CURRENT_RULES" || return 1
  # Bound the reload request so an unresponsive Prometheus cannot hang the guard.
  curl -fsS --max-time 30 -X POST "${PROMETHEUS_URL}/-/reload" >/dev/null
}
# Run one guard pass: check live rules and the active file, restore the
# canonical rules when drift is found, re-check, and publish the outcome.
# Globals:  CANONICAL_RULES (read)
# Returns:  0 when, after the pass, no required rule is missing and the active
#           file matches the canonical copy; 1 in every other case.
main() {
  local missing_before file_ok_before did_repair=0 missing_after file_ok_after

  # Without a canonical copy there is nothing to compare against or restore from.
  [ -f "$CANONICAL_RULES" ] || {
    log "canonical rules missing: ${CANONICAL_RULES}"
    write_textfile "canonical_missing" 0 999 0
    return 1
  }

  missing_before="$(rules_missing_count)"
  file_ok_before="$(matches_canonical)"

  case "$missing_before" in
    QUERY_FAILED:*)
      log "Prometheus query failed: ${missing_before}"
      write_textfile "query_failed" 0 999 "$file_ok_before"
      return 1
      ;;
  esac

  # Drift means either a required rule is not loaded or the file has diverged.
  if [ "$missing_before" -gt 0 ] || [ "$file_ok_before" -eq 0 ]; then
    log "rule drift detected: missing=${missing_before} current_matches_canonical=${file_ok_before}; restoring"
    if ! restore_rules; then
      log "restore failed"
      write_textfile "restore_failed" 0 "$missing_before" "$file_ok_before"
      return 1
    fi
    did_repair=1
    # Pause after the reload request before querying the live rules again.
    sleep 3
  fi

  missing_after="$(rules_missing_count)"
  file_ok_after="$(matches_canonical)"

  case "$missing_after" in
    QUERY_FAILED:*)
      log "post-restore Prometheus query failed: ${missing_after}"
      write_textfile "post_query_failed" "$did_repair" 999 "$file_ok_after"
      return 1
      ;;
  esac

  if [ "$missing_after" -eq 0 ] && [ "$file_ok_after" -eq 1 ]; then
    write_textfile "ok" "$did_repair" "$missing_after" "$file_ok_after"
    log "ok repaired=${did_repair}"
    return 0
  else
    log "still drifted after check: missing=${missing_after} current_matches_canonical=${file_ok_after}"
    write_textfile "drifted" "$did_repair" "$missing_after" "$file_ok_after"
    return 1
  fi
}
# Entry point: run the guard, forwarding any command-line arguments to main.
main "$@"