#!/usr/bin/env bash
# Guard the Prometheus alert rules on host 110 (see HOST_LABEL) against stale
# deploys.
#
# This script is intentionally narrow: it only restores the canonical alert
# rules file when required recovery/backup rules disappear from live Prometheus
# or when the active file differs from the canonical copy.

# Shell options: unset variables are errors and a pipeline fails if any stage
# fails. errexit (-e) is not enabled, so command failures must be checked
# explicitly where they matter.
set -uo pipefail

# Settings. Each one keeps a value already present in the environment and
# otherwise falls back to the default shown.
# Value of the host="..." label on every exported metric.
HOST_LABEL="${HOST_LABEL:-110}"
# Base URL of the Prometheus server (used for /api/v1/rules and /-/reload).
PROMETHEUS_URL="${PROMETHEUS_URL:-http://127.0.0.1:9090}"
# Active rules file, and the canonical copy it is compared against.
CURRENT_RULES="${CURRENT_RULES:-/home/wooo/monitoring/alerts.yml}"
CANONICAL_RULES="${CANONICAL_RULES:-/home/wooo/monitoring/alerts-unified.canonical.yml}"
# Metrics file written after each run (presumably picked up by the
# node_exporter textfile collector -- confirm against the exporter config).
TEXTFILE="${TEXTFILE:-/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}"
# File that log() appends to.
LOG_FILE="${LOG_FILE:-/home/wooo/logs/prometheus-rule-drift-guard.log}"

# Rule names that must be present in the live Prometheus rule list.
REQUIRED_RULES=(
  "BackupCredentialEscrowEvidenceMissing"
  "BackupExpectedJobMissing"
  "awoooi_recovery_core_ready"
  "awoooi_recovery_dr_offsite_ready"
  "ColdStartRecoveryBlocked"
)
#######################################
# Append one timestamped line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message words; they are joined with single spaces
# Outputs:   appends "[YYYY-MM-DD HH:MM:SS] message" to LOG_FILE; if that
#            file cannot be written, the same line goes to stderr instead
#######################################
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"

  # Best effort: the directory usually exists already.
  mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true

  # Do not lose the message when the log file is unwritable; the shell's own
  # redirection error is silenced because the fallback line replaces it.
  if ! { printf '[%s] %s\n' "$stamp" "$*" >>"$LOG_FILE"; } 2>/dev/null; then
    printf '[%s] %s\n' "$stamp" "$*" >&2
  fi
}
#######################################
# Publish the result of this run as a Prometheus text-format metrics file.
# The content is written to a temporary file next to TEXTFILE and then renamed
# over it, so a reader never observes a half-written file.
# Globals:   TEXTFILE (read), HOST_LABEL (read)
# Arguments: $1 - status label value for the last-run timestamp metric
#            $2 - repaired gauge value
#            $3 - missing-required-rules gauge value
#            $4 - current-matches-canonical gauge value
# Returns:   0 when the temp file cannot be created or written (best effort:
#            TEXTFILE is then left untouched); otherwise the status of the
#            final rename, or of the temp-file cleanup when the rename fails
#######################################
write_textfile() {
  local status="$1"
  local repaired="$2"
  local missing_count="$3"
  local matches_canonical="$4"
  local tmp now

  mkdir -p "$(dirname "$TEXTFILE")" 2>/dev/null || true
  tmp="$(mktemp "${TEXTFILE}.tmp.XXXXXX")" || return 0
  now="$(date +%s)"

  # A failed write (e.g. full disk) must not replace the previous metrics with
  # a truncated file, so the temp file is discarded instead of being renamed.
  if ! cat >"$tmp" <<EOF
# HELP awoooi_prometheus_rule_drift_guard_last_run_timestamp Unix timestamp of the last Prometheus rule drift guard run.
# TYPE awoooi_prometheus_rule_drift_guard_last_run_timestamp gauge
awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="${HOST_LABEL}",status="${status}"} ${now}
# HELP awoooi_prometheus_rule_drift_guard_repaired Whether the guard restored canonical Prometheus rules on the last run.
# TYPE awoooi_prometheus_rule_drift_guard_repaired gauge
awoooi_prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired}
# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of required live rules missing after the last check.
# TYPE awoooi_prometheus_rule_drift_guard_missing_required_count gauge
awoooi_prometheus_rule_drift_guard_missing_required_count{host="${HOST_LABEL}"} ${missing_count}
# HELP awoooi_prometheus_rule_drift_guard_current_matches_canonical Whether active alerts.yml matches canonical copy.
# TYPE awoooi_prometheus_rule_drift_guard_current_matches_canonical gauge
awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="${HOST_LABEL}"} ${matches_canonical}
EOF
  then
    rm -f "$tmp"
    return 0
  fi

  # mktemp creates the file owner-only; open it up for other readers.
  chmod 0644 "$tmp" 2>/dev/null || true
  mv "$tmp" "$TEXTFILE" 2>/dev/null || rm -f "$tmp"
}
# Helper: run the embedded Python checker and pass its raw stdout through.
# It fetches ${PROMETHEUS_URL}/api/v1/rules and prints either the number of
# REQUIRED_RULES names absent from the response or "QUERY_FAILED:<reason>".
_rules_missing_count_query() {
  python3 - "$PROMETHEUS_URL" "${REQUIRED_RULES[@]}" <<'PY'
import json
import sys
import urllib.request

base_url = sys.argv[1].rstrip("/")
required = set(sys.argv[2:])
try:
    with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response:
        payload = json.loads(response.read().decode("utf-8"))
    if payload.get("status") != "success":
        raise RuntimeError(payload)
    loaded = {
        str(rule.get("name") or rule.get("alert") or rule.get("record"))
        for group in payload.get("data", {}).get("groups") or []
        for rule in group.get("rules") or []
    }
    print(len(required - loaded))
except Exception as exc:
    # Collapse whitespace so the marker always stays on a single line.
    detail = " ".join(str(exc).split()) or type(exc).__name__
    print(f"QUERY_FAILED:{detail}")
PY
}

#######################################
# Count how many required rules are missing from live Prometheus.
# Globals:   PROMETHEUS_URL (read), REQUIRED_RULES (read)
# Arguments: none
# Outputs:   on stdout, either a non-negative integer or a string starting
#            with "QUERY_FAILED:". Anything else the checker might emit (no
#            output because python3 is missing or crashed, stray text) is
#            normalised to the QUERY_FAILED form so callers never receive a
#            value they cannot compare numerically.
# Returns:   0
#######################################
rules_missing_count() {
  local result

  if ! command -v python3 >/dev/null 2>&1; then
    printf 'QUERY_FAILED:%s\n' "python3 not found"
    return 0
  fi

  result="$(_rules_missing_count_query)"
  if [[ "$result" =~ ^[0-9]+$ || "$result" == QUERY_FAILED:* ]]; then
    printf '%s\n' "$result"
  else
    printf 'QUERY_FAILED:unexpected checker output: %s\n' "${result:-<empty>}"
  fi
  return 0
}
#######################################
# Report whether the active rules file is byte-identical to the canonical one.
# Globals:   CURRENT_RULES (read), CANONICAL_RULES (read)
# Arguments: none
# Outputs:   "1" on stdout when both are regular files with identical
#            content; "0" when either file is missing or the contents differ
#######################################
matches_canonical() {
  if [ -f "$CURRENT_RULES" ] && [ -f "$CANONICAL_RULES" ] &&
    cmp -s "$CURRENT_RULES" "$CANONICAL_RULES"; then
    echo 1
  else
    echo 0
  fi
}
#######################################
# Copy the canonical rules over the active rules file and ask Prometheus to
# reload its configuration.
# Globals:   CURRENT_RULES (read; file is overwritten), CANONICAL_RULES (read),
#            PROMETHEUS_URL (read)
# Arguments: none
# Returns:   1 if the canonical copy fails (no reload is attempted);
#            otherwise the exit status of the reload request
#######################################
restore_rules() {
  local backup_path
  backup_path="${CURRENT_RULES}.guard.bak.$(date +%Y%m%d%H%M%S)"

  # Best-effort backup of whatever is active now; failures (for example, no
  # active file yet) are deliberately ignored.
  cp "$CURRENT_RULES" "$backup_path" 2>/dev/null || true

  # Reloading after a failed copy would report a repair that never happened,
  # so stop here instead.
  cp "$CANONICAL_RULES" "$CURRENT_RULES" || return 1

  # Bounded so an unresponsive Prometheus cannot hang the guard indefinitely.
  curl -fsS --connect-timeout 5 --max-time 30 \
    -X POST "${PROMETHEUS_URL}/-/reload" >/dev/null
}
#######################################
# Run one guard pass: detect drift, restore the canonical rules if needed,
# re-check, and publish the outcome via write_textfile and log.
# Globals:   CANONICAL_RULES (read)
# Arguments: none used
# Returns:   0 when, after the pass, no required rule is missing and the
#            active file matches the canonical copy; 1 otherwise (canonical
#            file missing, query failure, restore failure, or drift remains)
#######################################
main() {
  if [ ! -f "$CANONICAL_RULES" ]; then
    log "canonical rules missing: ${CANONICAL_RULES}"
    write_textfile "canonical_missing" 0 999 0
    return 1
  fi

  local missing before_matches repaired after_missing after_matches
  missing="$(rules_missing_count)"
  before_matches="$(matches_canonical)"
  repaired=0

  # Anything that is not a plain non-negative integer is a failed query. That
  # covers the explicit "QUERY_FAILED:..." marker as well as empty or garbled
  # output, which would otherwise break the numeric comparisons below and end
  # up as an invalid value in the metrics file.
  if [[ ! "$missing" =~ ^[0-9]+$ ]]; then
    log "Prometheus query failed: ${missing}"
    write_textfile "query_failed" 0 999 "$before_matches"
    return 1
  fi

  if [ "$missing" -gt 0 ] || [ "$before_matches" -eq 0 ]; then
    log "rule drift detected: missing=${missing} current_matches_canonical=${before_matches}; restoring"
    if restore_rules; then
      repaired=1
      # Short pause before re-querying the live rule list.
      sleep 3
    else
      log "restore failed"
      write_textfile "restore_failed" 0 "$missing" "$before_matches"
      return 1
    fi
  fi

  # Re-check so the published metrics describe the state after any repair.
  after_missing="$(rules_missing_count)"
  after_matches="$(matches_canonical)"
  if [[ ! "$after_missing" =~ ^[0-9]+$ ]]; then
    log "post-restore Prometheus query failed: ${after_missing}"
    write_textfile "post_query_failed" "$repaired" 999 "$after_matches"
    return 1
  fi

  if [ "$after_missing" -eq 0 ] && [ "$after_matches" -eq 1 ]; then
    write_textfile "ok" "$repaired" "$after_missing" "$after_matches"
    log "ok repaired=${repaired}"
    return 0
  fi

  log "still drifted after check: missing=${after_missing} current_matches_canonical=${after_matches}"
  write_textfile "drifted" "$repaired" "$after_missing" "$after_matches"
  return 1
}
# Entry point: forward the command-line arguments to main. As the last command
# visible in the script, its return status is what the script exits with.
main "$@"