#!/usr/bin/env python3
"""Validate recovery scorecard recording-rule contract."""

from __future__ import annotations

import argparse
import json
import sys
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Any

import yaml

DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml")
DEFAULT_BASELINE = Path("ops/reboot-recovery/full-stack-backup-baseline.yml")
EXPECTED_CORE = 'awoooi_recovery_core_ready{host="110",scope="110_120_121_188"}'
EXPECTED_DR = 'awoooi_recovery_dr_offsite_ready{host="110"}'

# Recording rules whose expressions static_check inspects, together with the
# metric names each expression must mention (checked as plain substrings).
_CORE_RECORD = "awoooi_recovery_core_ready"
_CORE_REQUIRED_METRICS = (
    "awoooi_cold_start_last_result",
    "awoooi_cold_start_warn_gates",
    "awoooi_cold_start_blocked_gates",
    "awoooi_cold_start_last_green_timestamp",
)
_DR_RECORD = "awoooi_recovery_dr_offsite_ready"
_DR_REQUIRED_METRICS = (
    "awoooi_backup_offsite_configured",
    "awoooi_backup_offsite_fresh",
    "awoooi_backup_credential_escrow_fresh",
)


class ContractError(RuntimeError):
    """Raised when the rules file, baseline, or live metrics break the contract."""


def _load_mapping(path: Path) -> dict[str, Any]:
    """Parse *path* as YAML and return its top-level mapping.

    An empty (or otherwise falsy) document is treated as an empty mapping.

    Raises:
        ContractError: if the file is not UTF-8 or its top level is not a mapping.
        OSError: if the file cannot be read.
        yaml.YAMLError: if the content is not valid YAML.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        # UnicodeDecodeError is a ValueError, which main() does not catch.
        raise ContractError(f"{path} is not valid UTF-8: {exc}") from exc
    data = yaml.safe_load(text) or {}
    if not isinstance(data, dict):
        raise ContractError(f"{path} must contain a YAML mapping at the top level")
    return data


def _rules(path: Path) -> list[dict[str, Any]]:
    """Return every rule from every entry under ``groups`` in the rules file.

    Missing or empty ``groups`` / ``rules`` keys yield no rules.

    Raises:
        ContractError: if ``groups``, a group, ``rules`` or a rule has the
            wrong YAML type (instead of failing later with AttributeError).
    """
    groups = _load_mapping(path).get("groups") or []
    if not isinstance(groups, list):
        raise ContractError(f"groups in {path} must be a list")
    collected: list[dict[str, Any]] = []
    for group in groups:
        if not isinstance(group, dict):
            raise ContractError(f"rule group in {path} must be a mapping: {group!r}")
        group_rules = group.get("rules") or []
        if not isinstance(group_rules, list):
            raise ContractError(f"rules in {path} must be a list: {group_rules!r}")
        for rule in group_rules:
            if not isinstance(rule, dict):
                raise ContractError(f"rule in {path} must be a mapping: {rule!r}")
            collected.append(rule)
    return collected


def _expected_recording_rules(path: Path) -> list[str]:
    """Return the names under ``monitoring_contract.prometheus_recording_rules``.

    Raises:
        ContractError: if the list is absent, empty, or not a list.
    """
    # ``or {}`` also covers a key that is present but null.
    contract = _load_mapping(path).get("monitoring_contract") or {}
    rules = contract.get("prometheus_recording_rules") if isinstance(contract, dict) else None
    if not rules:
        raise ContractError(f"missing monitoring_contract.prometheus_recording_rules in {path}")
    if not isinstance(rules, list):
        raise ContractError(
            f"monitoring_contract.prometheus_recording_rules in {path} must be a list"
        )
    return [str(rule) for rule in rules]


def _require_metrics(
    by_record: dict[str, dict[str, Any]],
    record: str,
    metrics: tuple[str, ...],
) -> None:
    """Ensure rule *record* exists and its ``expr`` mentions every metric.

    Raises:
        ContractError: if the rule is absent or a metric name does not occur
            in the expression text.
    """
    rule = by_record.get(record)
    if rule is None:
        # The baseline may not list this rule, so the set-difference check in
        # static_check does not guarantee it is present.
        raise ContractError(f"alerts-unified.yml missing recovery recording rules: {[record]}")
    expr = str(rule.get("expr", ""))
    for metric in metrics:
        if metric not in expr:
            raise ContractError(f"{record} expr missing {metric}")


def static_check(rules_path: Path, baseline_path: Path) -> list[str]:
    """Check the rules file against the baseline without contacting Prometheus.

    Verifies that every recording rule named by the baseline exists in the
    rules file, and that the core and DR rule expressions mention their
    required metrics.

    Returns:
        The "OK ..." report lines to print on success.

    Raises:
        ContractError: on any contract violation or malformed input.
    """
    by_record = {
        str(rule["record"]): rule for rule in _rules(rules_path) if rule.get("record")
    }
    expected = _expected_recording_rules(baseline_path)
    missing = sorted(set(expected) - set(by_record))
    if missing:
        raise ContractError(f"alerts-unified.yml missing recovery recording rules: {missing}")

    _require_metrics(by_record, _CORE_RECORD, _CORE_REQUIRED_METRICS)
    _require_metrics(by_record, _DR_RECORD, _DR_REQUIRED_METRICS)

    return [
        "OK alerts-unified.yml contains every recovery scorecard recording rule",
        "OK recovery core rule depends on cold-start green/warn/blocked/last-green metrics",
        "OK recovery DR rule depends on provider-neutral offsite freshness and credential escrow freshness",
    ]


def _prom_query(base_url: str, expr: str) -> list[dict[str, Any]]:
    """Run an instant query against ``<base_url>/api/v1/query``.

    Returns:
        The ``data.result`` list of the response, or ``[]`` when it is empty.

    Raises:
        ContractError: if the URL is rejected, the body is not UTF-8, the
            response is not a ``status == "success"`` object, or the result
            is not a list.
        OSError: on network or HTTP errors from urllib.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    url = f"{base_url.rstrip('/')}/api/v1/query?" + urllib.parse.urlencode({"query": expr})
    try:
        with urllib.request.urlopen(url, timeout=8) as response:
            raw = response.read()
    except ValueError as exc:
        # urlopen raises a plain ValueError for URLs without a usable scheme.
        raise ContractError(f"invalid Prometheus URL {base_url!r}: {exc}") from exc
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError as exc:
        raise ContractError(f"Prometheus returned a non-UTF-8 response for {expr}") from exc

    payload = json.loads(text)
    if not isinstance(payload, dict) or payload.get("status") != "success":
        raise ContractError(f"Prometheus query failed for {expr}: {payload}")
    data = payload.get("data") or {}
    result = (data.get("result") if isinstance(data, dict) else None) or []
    if not isinstance(result, list):
        raise ContractError(f"Prometheus query returned malformed result for {expr}: {payload}")
    return result


def _single_value(base_url: str, expr: str) -> float:
    """Query *expr* and return its single sample, which must be 0 or 1.

    Raises:
        ContractError: if the query does not return exactly one series, the
            sample is malformed or non-numeric, or the value is not 0 or 1.
    """
    rows = _prom_query(base_url, expr)
    if len(rows) != 1:
        raise ContractError(f"Prometheus query expected one series for {expr}, got {len(rows)}")
    row = rows[0]
    sample = (row.get("value") if isinstance(row, dict) else None) or []
    # The sample is read as a sequence whose second element is the value.
    if not isinstance(sample, (list, tuple)) or len(sample) < 2:
        raise ContractError(f"Prometheus query returned malformed value for {expr}: {row}")
    try:
        number = float(sample[1])
    except (TypeError, ValueError) as exc:
        raise ContractError(
            f"Prometheus query returned non-numeric value for {expr}: {row}"
        ) from exc
    if number not in {0.0, 1.0}:
        raise ContractError(f"Prometheus recovery scorecard metric must be 0 or 1: {expr}={number}")
    return number


def live_check(
    base_url: str,
    expect_core_ready: bool = False,
    expect_dr_ready: bool = False,
) -> list[str]:
    """Check the live core and DR scorecard metrics in Prometheus.

    Args:
        base_url: Prometheus base URL; a trailing slash is tolerated.
        expect_core_ready: when true, require the core metric to equal 1.
        expect_dr_ready: when true, require the DR metric to equal 1.

    Returns:
        The "OK live ..." report lines to print on success.

    Raises:
        ContractError: if a metric is malformed or an expectation is not met.
    """
    core = _single_value(base_url, EXPECTED_CORE)
    dr = _single_value(base_url, EXPECTED_DR)
    if expect_core_ready and core != 1.0:
        raise ContractError(f"expected core recovery ready, got {core}")
    if expect_dr_ready and dr != 1.0:
        raise ContractError(f"expected DR offsite ready, got {dr}")
    return [
        f"OK live {EXPECTED_CORE} value={int(core)}",
        f"OK live {EXPECTED_DR} value={int(dr)}",
    ]


def main() -> int:
    """Run the static check, then the live check when a URL is supplied.

    Returns:
        0 when every check passes, 1 otherwise (the reason goes to stderr).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--rules", type=Path, default=DEFAULT_RULES)
    parser.add_argument("--baseline", type=Path, default=DEFAULT_BASELINE)
    parser.add_argument("--prometheus-url", default="")
    parser.add_argument("--expect-core-ready", action="store_true")
    parser.add_argument("--expect-dr-ready", action="store_true")
    args = parser.parse_args()

    try:
        for line in static_check(args.rules, args.baseline):
            print(line)
        if args.prometheus_url:
            for line in live_check(
                args.prometheus_url,
                args.expect_core_ready,
                args.expect_dr_ready,
            ):
                print(line)
    except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as exc:
        print(f"RECOVERY_SCORECARD_CONTRACT_FAILED {exc}", file=sys.stderr)
        return 1
    print("RECOVERY_SCORECARD_CONTRACT_OK")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())