#!/usr/bin/env bash
# Guard 110 Prometheus alert rules against stale deploys.
#
# This script is intentionally narrow: it only restores the canonical alert
# rules file when required recovery/backup rules disappear from live Prometheus
# or when the active file differs from the canonical copy.
#
# Environment overrides (all optional):
#   HOST_LABEL             value of the host label on the emitted metrics
#   PROMETHEUS_URL         base URL of the Prometheus server to query/reload
#   CURRENT_RULES          rules file currently in use
#   CANONICAL_RULES        known-good copy that CURRENT_RULES must equal
#   TEXTFILE               metrics file written on every run
#   LOG_FILE               append-only log of guard runs
#   RELOAD_TIMEOUT         max seconds to wait for the reload request
#   RELOAD_SETTLE_SECONDS  pause between reload and the verification query
#
# Exit status: 0 when the rules are (or were made) healthy, 1 otherwise.

# No -e: every failure path below is handled explicitly so that the outcome
# is always logged and reported through write_textfile.
set -uo pipefail

HOST_LABEL="${HOST_LABEL:-110}"
PROMETHEUS_URL="${PROMETHEUS_URL:-http://127.0.0.1:9090}"
CURRENT_RULES="${CURRENT_RULES:-/home/wooo/monitoring/alerts.yml}"
CANONICAL_RULES="${CANONICAL_RULES:-/home/wooo/monitoring/alerts-unified.canonical.yml}"
TEXTFILE="${TEXTFILE:-/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}"
LOG_FILE="${LOG_FILE:-/home/wooo/logs/prometheus-rule-drift-guard.log}"
RELOAD_TIMEOUT="${RELOAD_TIMEOUT:-30}"
RELOAD_SETTLE_SECONDS="${RELOAD_SETTLE_SECONDS:-3}"

# Rule names that must be present in the live Prometheus rule set.
REQUIRED_RULES=(
  "BackupCredentialEscrowEvidenceMissing"
  "BackupExpectedJobMissing"
  "awoooi_recovery_core_ready"
  "awoooi_recovery_dr_offsite_ready"
  "ColdStartRecoveryBlocked"
)

#######################################
# Append a timestamped line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
# Outputs:   one line appended to LOG_FILE
#######################################
log() {
  # Best effort: the directory may already exist or be uncreatable.
  mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >>"$LOG_FILE"
}

#######################################
# Publish the outcome of this run as a metrics file, replacing TEXTFILE
# atomically (write to a temp file in the same directory, then rename).
# Best effort: a failure here never changes the guard's own result.
# Globals:   TEXTFILE, HOST_LABEL (read)
# Arguments: $1 - status word (e.g. ok, drifted, query_failed)
#            $2 - 1 if a restore was performed during this run, else 0
#            $3 - number of required rules missing (999 = unknown)
#            $4 - 1 if CURRENT_RULES equals CANONICAL_RULES, else 0
# NOTE(review): the metric payload was missing from the source under review
# (the function only created an empty file and ignored its arguments). The
# metric names below are reconstructed from the arguments and the TEXTFILE
# name -- confirm them against existing alert rules/dashboards before deploy.
#######################################
write_textfile() {
  local status="$1"
  local repaired="$2"
  local missing_count="$3"
  local matches="$4"
  local tmp

  mkdir -p "$(dirname "$TEXTFILE")" 2>/dev/null || true
  tmp="$(mktemp "${TEXTFILE}.tmp.XXXXXX")" || return 0

  cat >"$tmp" <<EOF || { rm -f -- "$tmp"; return 0; }
# HELP prometheus_rule_drift_guard_status Outcome of the last rule drift guard run.
# TYPE prometheus_rule_drift_guard_status gauge
prometheus_rule_drift_guard_status{host="${HOST_LABEL}",status="${status}"} 1
# HELP prometheus_rule_drift_guard_repaired Whether the last run restored the canonical rules file.
# TYPE prometheus_rule_drift_guard_repaired gauge
prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired}
# HELP prometheus_rule_drift_guard_missing_rules Required rules missing from live Prometheus (999 = unknown).
# TYPE prometheus_rule_drift_guard_missing_rules gauge
prometheus_rule_drift_guard_missing_rules{host="${HOST_LABEL}"} ${missing_count}
# HELP prometheus_rule_drift_guard_matches_canonical Whether the active rules file equals the canonical copy.
# TYPE prometheus_rule_drift_guard_matches_canonical gauge
prometheus_rule_drift_guard_matches_canonical{host="${HOST_LABEL}"} ${matches}
# HELP prometheus_rule_drift_guard_last_run_timestamp_seconds Unix time of the last guard run.
# TYPE prometheus_rule_drift_guard_last_run_timestamp_seconds gauge
prometheus_rule_drift_guard_last_run_timestamp_seconds{host="${HOST_LABEL}"} $(date +%s)
EOF

  # mktemp creates the file with mode 0600; widen it so a collector running
  # as another user can read the result.
  chmod 644 "$tmp" 2>/dev/null || true
  mv "$tmp" "$TEXTFILE" 2>/dev/null || rm -f "$tmp"
}

#######################################
# Ask Prometheus which rules are loaded and print how many REQUIRED_RULES
# are absent. Prints "QUERY_FAILED:<reason>" if the API call fails.
# Globals:   PROMETHEUS_URL, REQUIRED_RULES (read)
# Outputs:   raw result of the embedded Python program on stdout
#######################################
_query_missing_rules() {
  python3 - "$PROMETHEUS_URL" "${REQUIRED_RULES[@]}" <<'PY'
import json
import sys
import urllib.request

base_url = sys.argv[1].rstrip("/")
required = set(sys.argv[2:])
try:
    with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response:
        payload = json.loads(response.read().decode("utf-8"))
    if payload.get("status") != "success":
        raise RuntimeError(payload)
    loaded = {
        str(rule.get("name") or rule.get("alert") or rule.get("record"))
        for group in payload.get("data", {}).get("groups") or []
        for rule in group.get("rules") or []
    }
    print(len(required - loaded))
except Exception as exc:
    # Collapse whitespace so the reason always fits on a single log line.
    print("QUERY_FAILED:" + " ".join(str(exc).split()))
PY
}

#######################################
# Print the number of REQUIRED_RULES missing from live Prometheus.
# Globals:   PROMETHEUS_URL, REQUIRED_RULES (read, via _query_missing_rules)
# Outputs:   either a non-negative integer or "QUERY_FAILED:<reason>".
#            Any other helper output (python3 missing, interpreter crash,
#            empty output) is normalised to QUERY_FAILED so callers can
#            safely do integer comparisons on the result.
#######################################
rules_missing_count() {
  local out
  out="$(_query_missing_rules)"

  if [[ "$out" =~ ^[0-9]+$ || "$out" == QUERY_FAILED:* ]]; then
    printf '%s\n' "$out"
  else
    printf 'QUERY_FAILED:unexpected output from rules query: %s\n' "${out:-<empty>}"
  fi
}

#######################################
# Print 1 if CURRENT_RULES and CANONICAL_RULES both exist and are
# byte-identical, otherwise print 0.
# Globals:   CURRENT_RULES, CANONICAL_RULES (read)
#######################################
matches_canonical() {
  if [[ -f "$CURRENT_RULES" && -f "$CANONICAL_RULES" ]] \
    && cmp -s "$CURRENT_RULES" "$CANONICAL_RULES"; then
    printf '1\n'
  else
    printf '0\n'
  fi
}

#######################################
# Replace CURRENT_RULES with CANONICAL_RULES and ask Prometheus to reload.
# Globals:   CURRENT_RULES, CANONICAL_RULES, PROMETHEUS_URL,
#            RELOAD_TIMEOUT (read)
# Returns:   0 only if both the copy and the reload request succeeded.
#######################################
restore_rules() {
  local backup_path
  backup_path="${CURRENT_RULES}.guard.bak.$(date +%Y%m%d%H%M%S)"

  # Keep the drifted file for inspection; it may not exist, so best effort.
  cp "$CURRENT_RULES" "$backup_path" 2>/dev/null || true

  # Copy in place (cp overwrites the existing file's contents rather than
  # replacing the file). A failed copy must fail the restore: otherwise a
  # successful reload below would be reported as a repair.
  cp "$CANONICAL_RULES" "$CURRENT_RULES" || return 1

  # Bounded so a hung Prometheus cannot stall the guard indefinitely.
  curl -fsS --max-time "$RELOAD_TIMEOUT" -X POST \
    "${PROMETHEUS_URL}/-/reload" >/dev/null
}

#######################################
# Detect rule drift, restore the canonical rules when needed, verify the
# result, and record the outcome in LOG_FILE and TEXTFILE.
# Returns:   0 if all required rules are loaded and the active file equals
#            the canonical copy at the end of the run; 1 otherwise.
#######################################
main() {
  if [[ ! -f "$CANONICAL_RULES" ]]; then
    log "canonical rules missing: ${CANONICAL_RULES}"
    write_textfile "canonical_missing" 0 999 0
    return 1
  fi

  local missing before_matches repaired after_missing after_matches
  missing="$(rules_missing_count)"
  before_matches="$(matches_canonical)"
  repaired=0

  if [[ "$missing" == QUERY_FAILED:* ]]; then
    log "Prometheus query failed: ${missing}"
    write_textfile "query_failed" 0 999 "$before_matches"
    return 1
  fi

  if (( missing > 0 || before_matches == 0 )); then
    log "rule drift detected: missing=${missing} current_matches_canonical=${before_matches}; restoring"
    if restore_rules; then
      repaired=1
      # Give Prometheus a moment to finish loading before re-querying.
      sleep "$RELOAD_SETTLE_SECONDS"
    else
      log "restore failed"
      write_textfile "restore_failed" 0 "$missing" "$before_matches"
      return 1
    fi
  fi

  after_missing="$(rules_missing_count)"
  after_matches="$(matches_canonical)"

  if [[ "$after_missing" == QUERY_FAILED:* ]]; then
    log "post-restore Prometheus query failed: ${after_missing}"
    write_textfile "post_query_failed" "$repaired" 999 "$after_matches"
    return 1
  fi

  if (( after_missing == 0 && after_matches == 1 )); then
    write_textfile "ok" "$repaired" "$after_missing" "$after_matches"
    log "ok repaired=${repaired}"
    return 0
  fi

  log "still drifted after check: missing=${after_missing} current_matches_canonical=${after_matches}"
  write_textfile "drifted" "$repaired" "$after_missing" "$after_matches"
  return 1
}

main "$@"