#!/usr/bin/env python3 """ Backup health textfile exporter for full-stack reboot readiness. 2026-05-06 ogt + Codex: backup coverage follow-up after the reboot incident. Why: a green service gate is not enough if the last restorable copy is stale. This exporter is read-only; it checks cron/script presence and the latest successful backup evidence, then writes node-exporter textfile metrics. """ from __future__ import annotations import json import os import re import shlex import subprocess import tempfile import time from datetime import datetime, timezone from pathlib import Path TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector")) OUTPUT_NAME = "backup_health.prom" HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename) LABEL_RE = re.compile(r'["\\\n]') BACKUP_COMMON_SH = Path(os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh")) BACKUP_OFFSITE_ENV = Path(os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env")) OFFSITE_STATUS_DIR = Path(os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite")) ESCROW_EVIDENCE_DIR = Path(os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence")) CONFIG_CAPTURE_STATUS_FILE = Path(os.environ.get("AIOPS_CONFIG_CAPTURE_STATUS_FILE", "/backup/status/backup-configs-last-status.json")) ESCROW_ITEMS = [ "restic_repository_password", "offsite_provider_credentials", "break_glass_admin_credentials", "dns_registrar_recovery", "oauth_ai_provider_recovery", ] def _escape_label(value: str) -> str: return LABEL_RE.sub(lambda m: {"\n": r"\n", "\\": r"\\", '"': r"\""}[m.group(0)], value) def _run(command: list[str], timeout: int = 30) -> tuple[int, str, str]: try: result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, check=False) except FileNotFoundError as exc: return 127, "", str(exc) except subprocess.TimeoutExpired as exc: stdout = exc.stdout if isinstance(exc.stdout, str) else "" stderr = 
exc.stderr if isinstance(exc.stderr, str) else "timeout" return 124, stdout, stderr return result.returncode, result.stdout, result.stderr def _parse_time(value: str) -> int: if not value: return 0 normalized = re.sub(r"\.(\d{6})\d+([+-]\d\d:\d\d|Z)$", r".\1\2", value) normalized = normalized.replace("Z", "+00:00") try: return int(datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp()) except ValueError: return 0 def _parse_marker_timestamp(text: str) -> int: match = re.search(r"\b(\d{10})\b", text) if match: return int(match.group(1)) for line in text.splitlines(): parsed = _parse_time(line.strip()) if parsed: return parsed return 0 def _marker_timestamp(paths: list[Path]) -> int: for path in paths: try: text = path.read_text(encoding="utf-8", errors="replace") parsed = _parse_marker_timestamp(text) return parsed or int(path.stat().st_mtime) except OSError: continue return 0 def _shell_export_value(path: Path, key: str) -> str: try: lines = path.read_text(encoding="utf-8", errors="replace").splitlines() except OSError: return "" for line in lines: try: tokens = shlex.split(line, comments=True, posix=True) except ValueError: continue if tokens and tokens[0] == "export": tokens = tokens[1:] for token in tokens: if not token.startswith(f"{key}="): continue return token.split("=", 1)[1].strip() return "" def _backup_config_value(key: str) -> str: for path in [BACKUP_OFFSITE_ENV, BACKUP_COMMON_SH]: value = _shell_export_value(path, key) if value: default_match = re.fullmatch(r"\$\{" + re.escape(key) + r":-([^}]+)\}", value) if default_match: return default_match.group(1) return value return "" def _configured_secret(value: str) -> bool: return value.strip() not in {"", "CHANGE_ME", "CHANGEME", "TODO", "REDACTED"} def _b2_configured() -> bool: return ( _configured_secret(_backup_config_value("B2_ACCOUNT_ID")) and _configured_secret(_backup_config_value("B2_APPLICATION_KEY")) and _configured_secret(_backup_config_value("B2_BUCKET")) ) def 
_rclone_configured() -> bool: remote = _backup_config_value("OFFSITE_RCLONE_REMOTE") or os.environ.get("OFFSITE_RCLONE_REMOTE", "gdrive") rc, stdout, _ = _run(["rclone", "listremotes"], timeout=10) if rc == 0 and remote: return f"{remote}:" in {line.strip() for line in stdout.splitlines()} for path in [ Path.home() / ".config/rclone/rclone.conf", Path("/home/wooo/.config/rclone/rclone.conf"), Path("/root/.config/rclone/rclone.conf"), Path("/etc/rclone.conf"), ]: try: if path.is_file() and path.stat().st_size > 0: return True except OSError: continue return False def _cron_text() -> str: rc, stdout, _ = _run(["crontab", "-l"], timeout=10) return stdout if rc == 0 else "" def _active_cron_lines(cron: str) -> list[str]: return [line.strip() for line in cron.splitlines() if line.strip() and not line.lstrip().startswith("#")] def _cron_duplicate_metric_lines(host: str, cron: str) -> list[str]: lines: list[str] = [] active_lines = _active_cron_lines(cron) duplicate_count = max(0, len(active_lines) - len(set(active_lines))) lines.append(f'awoooi_backup_cron_active_duplicate_count{{host="{_escape_label(host)}"}} {duplicate_count}') singular_patterns = { "backup_health_exporter": "/home/wooo/scripts/backup-health-textfile-exporter.py", "offsite_status": "/backup/scripts/sync-offsite-backups.sh --mode status", "offsite_escrow_evidence_report": "/backup/scripts/offsite-escrow-evidence-report.sh --no-color", "offsite_sync_gated": "/backup/scripts/sync-offsite-backups.sh --mode sync", "offsite_full_sync_verify": "/backup/scripts/verify-offsite-full-sync.sh --write-textfile", } for entry, pattern in singular_patterns.items(): count = sum(1 for line in active_lines if pattern in line) labels = f'host="{_escape_label(host)}",entry="{_escape_label(entry)}"' lines.append(f"awoooi_backup_cron_singular_entry_count{{{labels}}} {count}") lines.append(f"awoooi_backup_cron_singular_entry_ok{{{labels}}} {1 if count == 1 else 0}") return lines def _newest_file_timestamp(patterns: list[str]) 
-> int: newest = 0 for pattern in patterns: for path in Path("/").glob(pattern.lstrip("/")): try: if path.is_file(): newest = max(newest, int(path.stat().st_mtime)) except OSError: continue return newest def _read_backup_110_timestamp() -> int: candidates = [ Path("/home/ollama/node_exporter_textfiles/backup.prom"), Path("/home/ollama/backup/110/last_success"), ] for path in candidates: try: text = path.read_text(encoding="utf-8", errors="replace") except OSError: continue match = re.search(r"(?:backup_110_last_success_timestamp\s+)?(\d{10})", text) if match: return int(match.group(1)) return 0 def _latest_restic_snapshot(repo: str) -> tuple[int, int]: password_file = os.environ.get("RESTIC_PASSWORD_FILE", "/backup/scripts/.restic-password") if not Path(repo).exists() or not Path(password_file).exists(): return 0, 0 rc, stdout, _ = _run( ["restic", "-r", repo, "snapshots", "--json", "--password-file", password_file], timeout=45, ) if rc != 0: return 0, 0 try: rows = json.loads(stdout) except json.JSONDecodeError: return 0, 0 timestamps = [_parse_time(str(row.get("time", ""))) for row in rows] timestamps = [value for value in timestamps if value > 0] return (max(timestamps), len(timestamps)) if timestamps else (0, 0) def _backup_all_failed_count_from_log(path: Path) -> tuple[int, int]: try: lines = path.read_text(encoding="utf-8", errors="replace").splitlines() except OSError: return 0, -1 for line in reversed(lines): if "全服務備份完成" not in line: continue ts_match = re.match(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]", line) timestamp = 0 if ts_match: timestamp = int(datetime.strptime(ts_match.group(1), "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc).timestamp()) - 8 * 3600 failed_match = re.search(r"-\s+(\d+)\s+個失敗", line) if failed_match: return timestamp, int(failed_match.group(1)) if "全部成功" in line: return timestamp, 0 return 0, -1 def _latest_backup_all_failed_count() -> tuple[int, int]: candidates = [ 
_backup_all_failed_count_from_log(Path("/backup/logs/cron.log")), _backup_all_failed_count_from_log(Path("/backup/logs/backup.log")), ] candidates = [row for row in candidates if row[0] > 0 and row[1] >= 0] if not candidates: return 0, -1 return max(candidates, key=lambda row: row[0]) def _read_key_value_status(path: str) -> dict[str, int | str]: values: dict[str, int | str] = {} try: lines = Path(path).read_text(encoding="utf-8", errors="replace").splitlines() except OSError: return values for line in lines: if not line or line.startswith("#") or "=" not in line: continue key, value = line.split("=", 1) key = key.strip() value = value.strip() try: values[key] = int(float(value)) except ValueError: values[key] = value return values def _integrity_metric_lines(host: str) -> list[str]: now = int(time.time()) specs = [ ("restic_check", "/backup/integrity/check.status", 192), ("restore_drill", "/backup/integrity/restore-drill.status", 744), ] lines: list[str] = [] for scope, path, max_age_hours in specs: values = _read_key_value_status(path) timestamp = int(values.get("timestamp", 0)) if "timestamp" in values else 0 failed_count = int(values.get("failed_count", -1)) if "failed_count" in values else -1 checked_count = int(values.get("checked_repo_count", 0)) if "checked_repo_count" in values else 0 age = now - timestamp if timestamp else 0 fresh = 1 if timestamp and age <= max_age_hours * 3600 and failed_count == 0 else 0 labels = f'host="{_escape_label(host)}",scope="{scope}",max_age_hours="{max_age_hours}"' lines.extend( [ f"awoooi_backup_integrity_last_success_timestamp{{{labels}}} {timestamp if failed_count == 0 else 0}", f"awoooi_backup_integrity_age_seconds{{{labels}}} {age}", f"awoooi_backup_integrity_fresh{{{labels}}} {fresh}", f"awoooi_backup_integrity_failed_repo_count{{{labels}}} {failed_count}", f"awoooi_backup_integrity_checked_repo_count{{{labels}}} {checked_count}", ] ) return lines def _config_capture_metric_lines(host: str) -> list[str]: now = 
int(time.time()) labels = f'host="{_escape_label(host)}"' try: document = json.loads(CONFIG_CAPTURE_STATUS_FILE.read_text(encoding="utf-8", errors="replace")) except (OSError, json.JSONDecodeError): return [ f"awoooi_backup_config_capture_status_timestamp{{{labels}}} 0", f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} 0", f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} -1", ] timestamp = int(document.get("timestamp") or 0) critical_failed = int(document.get("critical_failed_count", -1)) failed_count = int(document.get("failed_count", -1)) snapshot_id = str(document.get("snapshot_id") or "unknown") duration = int(document.get("duration_seconds", 0) or 0) age = now - timestamp if timestamp else 0 lines = [ f"awoooi_backup_config_capture_status_timestamp{{{labels},snapshot_id=\"{_escape_label(snapshot_id)}\"}} {timestamp}", f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} {age}", f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} {critical_failed}", f"awoooi_backup_config_capture_failed_count{{{labels}}} {failed_count}", f"awoooi_backup_config_capture_duration_seconds{{{labels}}} {duration}", ] for item in document.get("items") or []: target = str(item.get("target") or "unknown") source = str(item.get("source") or "unknown") critical = "true" if item.get("critical") else "false" ok = 1 if item.get("ok") else 0 item_labels = ( f'host="{_escape_label(host)}",' f'target="{_escape_label(target)}",' f'source="{_escape_label(source)}",' f'critical="{critical}"' ) lines.append(f"awoooi_backup_config_capture_ok{{{item_labels}}} {ok}") return lines def _offsite_and_escrow_metric_lines(host: str) -> list[str]: now = int(time.time()) lines: list[str] = [] b2_configured = int(_b2_configured()) rclone_configured = int(_rclone_configured()) b2_full_timestamp = _marker_timestamp( [ OFFSITE_STATUS_DIR / "b2-last-success", OFFSITE_STATUS_DIR / "b2.last_success", OFFSITE_STATUS_DIR / "last_success", 
Path("/backup/logs/offsite-b2.status"), ] ) b2_partial_timestamp = _marker_timestamp( [ OFFSITE_STATUS_DIR / "b2-partial-last-success", OFFSITE_STATUS_DIR / "b2.partial_last_success", ] ) rclone_full_timestamp = _marker_timestamp( [ OFFSITE_STATUS_DIR / "rclone-last-success", OFFSITE_STATUS_DIR / "rclone.last_success", OFFSITE_STATUS_DIR / "last_success", Path("/backup/logs/rclone-sync.status"), ] ) rclone_partial_timestamp = _marker_timestamp( [ OFFSITE_STATUS_DIR / "rclone-partial-last-success", OFFSITE_STATUS_DIR / "rclone.partial_last_success", ] ) offsite_specs = [ ("b2", b2_configured, b2_full_timestamp), ("rclone", rclone_configured, rclone_full_timestamp), ] for provider, configured, timestamp in offsite_specs: age = now - timestamp if timestamp else 0 fresh = 1 if configured and timestamp and age <= 48 * 3600 else 0 labels = f'host="{_escape_label(host)}",provider="{provider}",max_age_hours="48"' lines.extend( [ f"awoooi_backup_offsite_configured{{{labels}}} {configured}", f"awoooi_backup_offsite_last_success_timestamp{{{labels}}} {timestamp}", f"awoooi_backup_offsite_age_seconds{{{labels}}} {age}", f"awoooi_backup_offsite_fresh{{{labels}}} {fresh}", ] ) partial_fresh_by_provider: dict[str, int] = {} for provider, configured, timestamp in [ ("b2", b2_configured, b2_partial_timestamp), ("rclone", rclone_configured, rclone_partial_timestamp), ]: partial_age = now - timestamp if timestamp else 0 partial_fresh = 1 if configured and timestamp and partial_age <= 48 * 3600 else 0 partial_fresh_by_provider[provider] = partial_fresh partial_labels = f'host="{_escape_label(host)}",provider="{provider}",scope="partial",max_age_hours="48"' lines.extend( [ f"awoooi_backup_offsite_partial_last_success_timestamp{{{partial_labels}}} {timestamp}", f"awoooi_backup_offsite_partial_age_seconds{{{partial_labels}}} {partial_age}", f"awoooi_backup_offsite_partial_fresh{{{partial_labels}}} {partial_fresh}", ] ) full_sync_enable_marker = OFFSITE_STATUS_DIR / "enable-rclone-sync" 
try: full_sync_enabled = 1 if full_sync_enable_marker.is_file() else 0 full_sync_enabled_timestamp = int(full_sync_enable_marker.stat().st_mtime) if full_sync_enabled else 0 except OSError: full_sync_enabled = 0 full_sync_enabled_timestamp = 0 full_sync_labels = f'host="{_escape_label(host)}",provider="rclone"' lines.extend( [ f"awoooi_backup_offsite_full_sync_enabled{{{full_sync_labels}}} {full_sync_enabled}", f"awoooi_backup_offsite_full_sync_enabled_timestamp{{{full_sync_labels}}} {full_sync_enabled_timestamp}", ] ) escrow_missing_count = 0 for item in ESCROW_ITEMS: timestamp = _marker_timestamp( [ ESCROW_EVIDENCE_DIR / f"{item}.last_verified", ESCROW_EVIDENCE_DIR / f"{item}.verified", ESCROW_EVIDENCE_DIR / item, ] ) age = now - timestamp if timestamp else 0 fresh = 1 if timestamp and age <= 744 * 3600 else 0 escrow_missing_count += 0 if fresh else 1 labels = f'host="{_escape_label(host)}",item="{item}",max_age_hours="744"' lines.extend( [ f"awoooi_backup_credential_escrow_expected_info{{{labels}}} 1", f"awoooi_backup_credential_escrow_last_verified_timestamp{{{labels}}} {timestamp}", f"awoooi_backup_credential_escrow_age_seconds{{{labels}}} {age}", f"awoooi_backup_credential_escrow_fresh{{{labels}}} {fresh}", ] ) offsite_configured = 1 if b2_configured or rclone_configured else 0 any_partial_fresh = 1 if any(partial_fresh_by_provider.values()) else 0 full_fresh = 1 if ( (b2_configured and b2_full_timestamp and now - b2_full_timestamp <= 48 * 3600) or (rclone_configured and rclone_full_timestamp and now - rclone_full_timestamp <= 48 * 3600) ) else 0 if not offsite_configured: next_step = "configure_google_drive_rclone_on_110_tty" phase = 1 elif not any_partial_fresh: next_step = "run_small_dry_run_then_partial_sync" phase = 2 elif escrow_missing_count > 0: next_step = "complete_credential_escrow_review" phase = 3 elif not full_fresh: next_step = "pre_full_sync_review" phase = 4 else: next_step = "offsite_and_escrow_ready" phase = 5 lines.extend( [ 
f'awoooi_backup_dr_credential_escrow_missing_count{{host="{_escape_label(host)}"}} {escrow_missing_count}', f'awoooi_backup_dr_phase{{host="{_escape_label(host)}",next_step="{_escape_label(next_step)}"}} {phase}', f'awoooi_backup_dr_next_step_info{{host="{_escape_label(host)}",next_step="{_escape_label(next_step)}"}} 1', ] ) return lines def _retention_metric_lines(host: str) -> list[str]: mode = (_backup_config_value("BACKUP_RETENTION_MODE") or os.environ.get("BACKUP_RETENTION_MODE", "")).strip() keep_last = (_backup_config_value("KEEP_LAST") or os.environ.get("KEEP_LAST", "")).strip() offsite_delete_old = ( _backup_config_value("OFFSITE_SYNC_DELETE_OLD") or os.environ.get("OFFSITE_SYNC_DELETE_OLD", "") ).strip() latest_only = 1 if mode == "latest" and keep_last == "1" else 0 offsite_mirror = 1 if offsite_delete_old == "1" else 0 labels = f'host="{_escape_label(host)}",scope="restic",mode="{_escape_label(mode or "unknown")}",keep_last="{_escape_label(keep_last or "unknown")}"' offsite_labels = ( f'host="{_escape_label(host)}",scope="offsite",provider="rclone",' f'delete_old="{_escape_label(offsite_delete_old or "unknown")}"' ) return [ f"awoooi_backup_retention_latest_only{{{labels}}} {latest_only}", f"awoooi_backup_retention_offsite_delete_old_enabled{{{offsite_labels}}} {offsite_mirror}", ] def _collect_velero_from_k8s() -> dict[str, int | str]: remote_script = r""" python3 - <<'PY' import datetime as dt import json import subprocess import time def kubectl(args): for prefix in (["sudo", "-n", "kubectl"], ["kubectl"]): result = subprocess.run(prefix + args, capture_output=True, text=True, timeout=20, check=False) if result.returncode == 0: return result.stdout return "" def load_json(args): text = kubectl(args + ["-o", "json"]) try: return json.loads(text) if text else {} except json.JSONDecodeError: return {} def parse_ts(value): if not value: return 0 try: return int(dt.datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()) except ValueError: return 
0 now = int(time.time()) schedules = load_json(["get", "schedules.velero.io", "-n", "velero"]).get("items") or [] backups = load_json(["get", "backups.velero.io", "-n", "velero"]).get("items") or [] cron = load_json(["get", "cronjob", "-n", "velero", "backup-restore-test"]) jobs = load_json(["get", "jobs", "-n", "velero", "-l", "component=backup-restore-test"]).get("items") or [] completed = [] for item in backups: if item.get("status", {}).get("phase") != "Completed": continue timestamp = parse_ts(item.get("status", {}).get("completionTimestamp") or item.get("metadata", {}).get("creationTimestamp")) if timestamp: completed.append(timestamp) failed_jobs = 0 for job in jobs: conditions = job.get("status", {}).get("conditions") or [] if any(row.get("type") == "Failed" and row.get("status") == "True" for row in conditions): failed_jobs += 1 last_success = parse_ts((cron.get("status") or {}).get("lastSuccessfulTime")) latest_backup = max(completed) if completed else 0 print("monitor_up=1") print(f"schedule_count={len(schedules)}") print(f"schedule_paused_count={sum(1 for item in schedules if item.get('spec', {}).get('paused'))}") print(f"latest_completed_backup_timestamp={latest_backup}") print(f"latest_completed_backup_age_seconds={now - latest_backup if latest_backup else 0}") print(f"latest_completed_backup_fresh={1 if latest_backup and now - latest_backup <= 90000 else 0}") print(f"restore_test_cron_present={1 if cron.get('metadata', {}).get('name') == 'backup-restore-test' else 0}") print(f"restore_test_last_success_timestamp={last_success}") print(f"restore_test_last_success_age_seconds={now - last_success if last_success else 0}") print(f"restore_test_last_success_fresh={1 if last_success and now - last_success <= 691200 else 0}") print(f"restore_test_failed_jobs={failed_jobs}") PY """ hosts = os.environ.get("AIOPS_K8S_QUERY_HOSTS", "192.168.0.120 192.168.0.121 192.168.0.125").split() values: dict[str, int | str] = {"monitor_up": 0, "source": "unreachable"} for 
host in hosts: rc, stdout, _ = _run( [ "ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=accept-new", "-o", "ConnectTimeout=8", f"wooo@{host}", remote_script, ], timeout=45, ) if rc != 0: continue parsed: dict[str, int | str] = {"source": f"{host}-kubectl"} for line in stdout.splitlines(): if "=" not in line: continue key, value = line.split("=", 1) try: parsed[key.strip()] = int(float(value.strip())) except ValueError: continue if int(parsed.get("monitor_up", 0)) == 1: return parsed return values def _velero_metric_lines(host: str) -> list[str]: values = _collect_velero_from_k8s() labels = f'host="{_escape_label(host)}",source="{_escape_label(str(values.get("source", "unreachable")))}",namespace="velero"' return [ f"awoooi_velero_monitor_up{{{labels}}} {values.get('monitor_up', 0)}", f"awoooi_velero_schedule_count{{{labels}}} {values.get('schedule_count', 0)}", f"awoooi_velero_schedule_paused_count{{{labels}}} {values.get('schedule_paused_count', 0)}", f"awoooi_velero_latest_completed_backup_timestamp{{{labels}}} {values.get('latest_completed_backup_timestamp', 0)}", f"awoooi_velero_latest_completed_backup_age_seconds{{{labels}}} {values.get('latest_completed_backup_age_seconds', 0)}", f"awoooi_velero_latest_completed_backup_fresh{{{labels},max_age_hours=\"25\"}} {values.get('latest_completed_backup_fresh', 0)}", f"awoooi_velero_restore_test_cron_present{{{labels},cronjob=\"backup-restore-test\"}} {values.get('restore_test_cron_present', 0)}", f"awoooi_velero_restore_test_last_success_timestamp{{{labels},cronjob=\"backup-restore-test\"}} {values.get('restore_test_last_success_timestamp', 0)}", f"awoooi_velero_restore_test_last_success_age_seconds{{{labels},cronjob=\"backup-restore-test\"}} {values.get('restore_test_last_success_age_seconds', 0)}", f"awoooi_velero_restore_test_last_success_fresh{{{labels},cronjob=\"backup-restore-test\",max_age_hours=\"192\"}} {values.get('restore_test_last_success_fresh', 0)}", 
f"awoooi_velero_restore_test_failed_jobs{{{labels},cronjob=\"backup-restore-test\"}} {values.get('restore_test_failed_jobs', 0)}", ] def _metric_lines_for_job( *, host: str, job: str, source: str, target: str, backup_type: str, last_success: int, max_age_hours: float, sample_count: int = 0, ) -> list[str]: now = int(time.time()) labels = ( f'host="{_escape_label(host)}",' f'job="{_escape_label(job)}",' f'type="{_escape_label(backup_type)}",' f'source="{_escape_label(source)}",' f'target="{_escape_label(target)}",' f'max_age_hours="{max_age_hours:g}"' ) age = now - last_success if last_success > 0 else 0 fresh = 1 if last_success > 0 and age <= int(max_age_hours * 3600) else 0 return [ f"awoooi_backup_expected_job_info{{{labels}}} 1", f"awoooi_backup_job_last_success_timestamp{{{labels}}} {last_success}", f"awoooi_backup_job_age_seconds{{{labels}}} {age}", f"awoooi_backup_job_fresh{{{labels}}} {fresh}", f"awoooi_backup_job_snapshot_count{{{labels}}} {sample_count}", ] def _base_lines(host: str) -> list[str]: now = int(time.time()) return [ "# HELP awoooi_backup_health_monitor_up Whether the backup health exporter completed.", "# TYPE awoooi_backup_health_monitor_up gauge", "# HELP awoooi_backup_health_last_run_timestamp Unix timestamp of the last backup health exporter run.", "# TYPE awoooi_backup_health_last_run_timestamp gauge", "# HELP awoooi_backup_expected_job_info Expected backup job inventory.", "# TYPE awoooi_backup_expected_job_info gauge", "# HELP awoooi_backup_job_configured Whether the expected backup cron/config is present.", "# TYPE awoooi_backup_job_configured gauge", "# HELP awoooi_backup_script_present Whether the backup script exists on this host.", "# TYPE awoooi_backup_script_present gauge", "# HELP awoooi_backup_job_last_success_timestamp Unix timestamp of the latest successful backup evidence.", "# TYPE awoooi_backup_job_last_success_timestamp gauge", "# HELP awoooi_backup_job_age_seconds Age of the latest successful backup evidence.", "# TYPE 
awoooi_backup_job_age_seconds gauge", "# HELP awoooi_backup_job_fresh Whether the latest successful backup evidence is within max_age_hours.", "# TYPE awoooi_backup_job_fresh gauge", "# HELP awoooi_backup_job_snapshot_count Number of snapshots or files considered for this job.", "# TYPE awoooi_backup_job_snapshot_count gauge", "# HELP awoooi_backup_last_run_failed_count Failed component count from the last aggregate backup run.", "# TYPE awoooi_backup_last_run_failed_count gauge", "# HELP awoooi_backup_integrity_last_success_timestamp Unix timestamp of latest successful backup integrity or restore drill run.", "# TYPE awoooi_backup_integrity_last_success_timestamp gauge", "# HELP awoooi_backup_integrity_age_seconds Age of backup integrity or restore drill status.", "# TYPE awoooi_backup_integrity_age_seconds gauge", "# HELP awoooi_backup_integrity_fresh Whether backup integrity or restore drill status is fresh and successful.", "# TYPE awoooi_backup_integrity_fresh gauge", "# HELP awoooi_backup_integrity_failed_repo_count Failed repository count from backup integrity or restore drill run.", "# TYPE awoooi_backup_integrity_failed_repo_count gauge", "# HELP awoooi_backup_integrity_checked_repo_count Checked repository count from backup integrity or restore drill run.", "# TYPE awoooi_backup_integrity_checked_repo_count gauge", "# HELP awoooi_backup_config_capture_status_timestamp Unix timestamp of the latest config-capture coverage status.", "# TYPE awoooi_backup_config_capture_status_timestamp gauge", "# HELP awoooi_backup_config_capture_status_age_seconds Age of the latest config-capture coverage status.", "# TYPE awoooi_backup_config_capture_status_age_seconds gauge", "# HELP awoooi_backup_config_capture_critical_failed_count Critical config-capture targets missing from the latest configs backup.", "# TYPE awoooi_backup_config_capture_critical_failed_count gauge", "# HELP awoooi_backup_config_capture_failed_count Total config-capture targets missing from the 
latest configs backup.", "# TYPE awoooi_backup_config_capture_failed_count gauge", "# HELP awoooi_backup_config_capture_duration_seconds Duration of the latest configs backup capture run.", "# TYPE awoooi_backup_config_capture_duration_seconds gauge", "# HELP awoooi_backup_config_capture_ok Whether the latest configs backup captured a specific target.", "# TYPE awoooi_backup_config_capture_ok gauge", "# HELP awoooi_backup_offsite_configured Whether an offsite backup provider appears configured without exposing credentials.", "# TYPE awoooi_backup_offsite_configured gauge", "# HELP awoooi_backup_offsite_last_success_timestamp Unix timestamp of latest offsite copy success marker.", "# TYPE awoooi_backup_offsite_last_success_timestamp gauge", "# HELP awoooi_backup_offsite_age_seconds Age of latest offsite copy success marker.", "# TYPE awoooi_backup_offsite_age_seconds gauge", "# HELP awoooi_backup_offsite_fresh Whether offsite copy success marker is fresh.", "# TYPE awoooi_backup_offsite_fresh gauge", "# HELP awoooi_backup_offsite_partial_last_success_timestamp Unix timestamp of latest partial offsite copy success marker.", "# TYPE awoooi_backup_offsite_partial_last_success_timestamp gauge", "# HELP awoooi_backup_offsite_partial_age_seconds Age of latest partial offsite copy success marker.", "# TYPE awoooi_backup_offsite_partial_age_seconds gauge", "# HELP awoooi_backup_offsite_partial_fresh Whether partial offsite copy success marker is fresh.", "# TYPE awoooi_backup_offsite_partial_fresh gauge", "# HELP awoooi_backup_offsite_full_sync_enabled Whether the gated full offsite sync enable marker exists.", "# TYPE awoooi_backup_offsite_full_sync_enabled gauge", "# HELP awoooi_backup_offsite_full_sync_enabled_timestamp Unix timestamp of the gated full offsite sync enable marker.", "# TYPE awoooi_backup_offsite_full_sync_enabled_timestamp gauge", "# HELP awoooi_backup_credential_escrow_expected_info Expected credential escrow evidence inventory.", "# TYPE 
awoooi_backup_credential_escrow_expected_info gauge", "# HELP awoooi_backup_credential_escrow_last_verified_timestamp Unix timestamp of credential escrow verification evidence.", "# TYPE awoooi_backup_credential_escrow_last_verified_timestamp gauge", "# HELP awoooi_backup_credential_escrow_age_seconds Age of credential escrow verification evidence.", "# TYPE awoooi_backup_credential_escrow_age_seconds gauge", "# HELP awoooi_backup_credential_escrow_fresh Whether credential escrow verification evidence is fresh.", "# TYPE awoooi_backup_credential_escrow_fresh gauge", "# HELP awoooi_backup_dr_credential_escrow_missing_count Number of credential escrow items that still need fresh human verification.", "# TYPE awoooi_backup_dr_credential_escrow_missing_count gauge", "# HELP awoooi_backup_dr_phase Numeric DR offsite completion phase for AI/operator triage.", "# TYPE awoooi_backup_dr_phase gauge", "# HELP awoooi_backup_dr_next_step_info Current human-safe next step for DR offsite completion.", "# TYPE awoooi_backup_dr_next_step_info gauge", "# HELP awoooi_backup_retention_latest_only Whether local restic backup retention is configured as latest-only keep-last=1.", "# TYPE awoooi_backup_retention_latest_only gauge", "# HELP awoooi_backup_retention_offsite_delete_old_enabled Whether offsite rclone sync is allowed to delete old remote backup files after successful mirror.", "# TYPE awoooi_backup_retention_offsite_delete_old_enabled gauge", "# HELP awoooi_backup_cron_active_duplicate_count Number of exact duplicate active crontab entries on the backup host.", "# TYPE awoooi_backup_cron_active_duplicate_count gauge", "# HELP awoooi_backup_cron_singular_entry_count Number of active crontab entries matching a backup/offsite singleton pattern.", "# TYPE awoooi_backup_cron_singular_entry_count gauge", "# HELP awoooi_backup_cron_singular_entry_ok Whether a backup/offsite singleton cron pattern has exactly one active entry.", "# TYPE awoooi_backup_cron_singular_entry_ok gauge", "# 
HELP awoooi_velero_monitor_up Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint.", "# TYPE awoooi_velero_monitor_up gauge", "# HELP awoooi_velero_schedule_count Number of Velero schedules in the velero namespace.", "# TYPE awoooi_velero_schedule_count gauge", "# HELP awoooi_velero_schedule_paused_count Number of paused Velero schedules.", "# TYPE awoooi_velero_schedule_paused_count gauge", "# HELP awoooi_velero_latest_completed_backup_timestamp Unix timestamp of latest Completed Velero backup.", "# TYPE awoooi_velero_latest_completed_backup_timestamp gauge", "# HELP awoooi_velero_latest_completed_backup_age_seconds Age of latest Completed Velero backup.", "# TYPE awoooi_velero_latest_completed_backup_age_seconds gauge", "# HELP awoooi_velero_latest_completed_backup_fresh Whether latest Completed Velero backup is within max_age_hours.", "# TYPE awoooi_velero_latest_completed_backup_fresh gauge", "# HELP awoooi_velero_restore_test_cron_present Whether backup-restore-test CronJob exists.", "# TYPE awoooi_velero_restore_test_cron_present gauge", "# HELP awoooi_velero_restore_test_last_success_timestamp Unix timestamp of backup-restore-test lastSuccessfulTime.", "# TYPE awoooi_velero_restore_test_last_success_timestamp gauge", "# HELP awoooi_velero_restore_test_last_success_age_seconds Age of backup-restore-test lastSuccessfulTime.", "# TYPE awoooi_velero_restore_test_last_success_age_seconds gauge", "# HELP awoooi_velero_restore_test_last_success_fresh Whether backup-restore-test lastSuccessfulTime is within max_age_hours.", "# TYPE awoooi_velero_restore_test_last_success_fresh gauge", "# HELP awoooi_velero_restore_test_failed_jobs Failed backup-restore-test jobs retained in velero namespace.", "# TYPE awoooi_velero_restore_test_failed_jobs gauge", f'awoooi_backup_health_monitor_up{{host="{_escape_label(host)}"}} 1', f'awoooi_backup_health_last_run_timestamp{{host="{_escape_label(host)}"}} {now}', ] def _collect_110(host: str) -> 
def _collect_110(host: str) -> list[str]:
    """Build the metric lines emitted when this exporter runs on host "110".

    Starts from ``_base_lines(host)`` and appends, in order: one
    ``awoooi_backup_job_configured`` sample per expected cron entry, one
    ``awoooi_backup_script_present`` sample per expected script under
    ``/backup/scripts``, the per-repository lines produced by
    ``_metric_lines_for_job`` for every restic repo, the ``backup_all``
    aggregate samples, and finally the integrity, config-capture,
    offsite/escrow, retention, cron-duplicate and Velero sections.

    Args:
        host: Value used for the ``host`` label on every sample.

    Returns:
        The metric lines, without trailing newlines.
    """
    cron = _cron_text()
    lines = _base_lines(host)
    host_label = _escape_label(host)  # loop-invariant, so escape once

    # Job name -> substring that must appear in the cron text for the job to
    # count as configured.
    expected_crons = {
        "backup_all": "/backup/scripts/backup-all.sh",
        "awoooi_frequent": "/backup/scripts/backup-awoooi-frequent.sh",
        "offsite_status": "/backup/scripts/sync-offsite-backups.sh --mode status",
        "offsite_sync_gated": "/backup/offsite/enable-rclone-sync",
        "offsite_escrow_evidence_report": "/backup/scripts/offsite-escrow-evidence-report.sh --no-color",
        "offsite_full_sync_verify": "/backup/scripts/verify-offsite-full-sync.sh --write-textfile",
        "backup_integrity_check": "/backup/scripts/check-backup-integrity.sh --mode check",
        "backup_restore_drill": "/backup/scripts/check-backup-integrity.sh --mode restore-drill",
    }
    for job, pattern in expected_crons.items():
        labels = f'host="{host_label}",job="{_escape_label(job)}"'
        lines.append(f"awoooi_backup_job_configured{{{labels}}} {int(pattern in cron)}")

    # Scripts are only checked for existence here (not for the executable bit).
    expected_scripts = [
        "backup-all.sh",
        "backup-awoooi.sh",
        "backup-awoooi-frequent.sh",
        "backup-configs.sh",
        "backup-sentry.sh",
        "backup-ai-artifacts.sh",
        "backup-public-routes.sh",
        "configure-offsite-rclone.sh",
        "configure-offsite-b2.sh",
        "sync-offsite-backups.sh",
        "backup-offsite-readiness-gate.sh",
        "offsite-escrow-evidence-report.sh",
        "verify-offsite-full-sync.sh",
        "mark-credential-escrow-verified.sh",
        "check-backup-integrity.sh",
        "backup-gitea.sh",
        "backup-harbor.sh",
        "backup-momo.sh",
        "backup-langfuse.sh",
        "backup-monitoring.sh",
        "backup-signoz.sh",
        "backup-open-webui.sh",
        "backup-clawbot.sh",
    ]
    for script in expected_scripts:
        labels = f'host="{host_label}",script="{_escape_label(script)}"'
        present = int(Path("/backup/scripts", script).exists())
        lines.append(f"awoooi_backup_script_present{{{labels}}} {present}")

    # (job label, restic repository path, max_age_hours passed to the job metrics)
    restic_repos = [
        ("awoooi_db", "/backup/awoooi", 7),
        ("configs", "/backup/configs", 48),
        ("sentry", "/backup/sentry", 48),
        ("gitea", "/backup/gitea", 48),
        ("harbor", "/backup/harbor", 48),
        ("momo", "/backup/momo", 48),
        ("langfuse", "/backup/langfuse", 48),
        ("monitoring", "/backup/monitoring", 48),
        ("signoz", "/backup/signoz", 48),
        ("open_webui", "/backup/open-webui", 48),
        ("clawbot", "/backup/clawbot", 48),
        ("ai_artifacts", "/backup/ai-artifacts", 48),
        ("public_routes", "/backup/public-routes", 168),
    ]
    for job, repo, max_age in restic_repos:
        timestamp, count = _latest_restic_snapshot(repo)
        lines.extend(
            _metric_lines_for_job(
                host=host,
                job=job,
                source="110-restic",
                target=repo,
                backup_type="restic",
                last_success=timestamp,
                max_age_hours=max_age,
                sample_count=count,
            )
        )

    backup_all_ts, failed_count = _latest_backup_all_failed_count()
    labels = f'host="{host_label}",job="backup_all"'
    lines.append(f"awoooi_backup_last_run_failed_count{{{labels}}} {failed_count}")
    # A run with a non-zero failed count is not reported as a success: the
    # aggregate last-success timestamp is forced to 0 in that case.
    aggregate_labels = (
        f'{labels},type="aggregate",source="110-cron-log",'
        'target="/backup/logs/cron.log",max_age_hours="48"'
    )
    aggregate_ts = backup_all_ts if failed_count == 0 else 0
    lines.append(f"awoooi_backup_job_last_success_timestamp{{{aggregate_labels}}} {aggregate_ts}")

    lines.extend(_integrity_metric_lines(host))
    lines.extend(_config_capture_metric_lines(host))
    lines.extend(_offsite_and_escrow_metric_lines(host))
    lines.extend(_retention_metric_lines(host))
    lines.extend(_cron_duplicate_metric_lines(host, cron))
    lines.extend(_velero_metric_lines(host))
    return lines


def _collect_188(host: str) -> list[str]:
    """Build the metric lines emitted when this exporter runs on host "188".

    Starts from ``_base_lines(host)`` and appends cron-entry presence for the
    two expected jobs, script presence (the script must exist *and* be
    executable), and the job lines produced by ``_metric_lines_for_job`` for
    the rsync copy from 110 and the momo pg_dump files.

    Args:
        host: Value used for the ``host`` label on every sample.

    Returns:
        The metric lines, without trailing newlines.
    """
    cron = _cron_text()
    lines = _base_lines(host)
    host_label = _escape_label(host)  # loop-invariant, so escape once

    # Job name -> substring that must appear in the cron text.
    expected_crons = {
        "backup_from_110": "/home/ollama/bin/backup-from-110.sh",
        "momo_pg_daily": "/home/ollama/bin/momo-pg-backup.sh",
    }
    for job, pattern in expected_crons.items():
        labels = f'host="{host_label}",job="{_escape_label(job)}"'
        lines.append(f"awoooi_backup_job_configured{{{labels}}} {int(pattern in cron)}")

    expected_scripts = [
        "/home/ollama/bin/backup-from-110.sh",
        "/home/ollama/bin/momo-pg-backup.sh",
        "/home/ollama/awoooi-ops/pg-backup.sh",
    ]
    for script in expected_scripts:
        # Only the file name is exposed as the label; the full path is checked.
        labels = f'host="{host_label}",script="{_escape_label(Path(script).name)}"'
        usable = int(Path(script).exists() and os.access(script, os.X_OK))
        lines.append(f"awoooi_backup_script_present{{{labels}}} {usable}")

    lines.extend(
        _metric_lines_for_job(
            host=host,
            job="backup_from_110",
            source="188-rsync",
            target="/home/ollama/backup/110",
            backup_type="rsync",
            last_success=_read_backup_110_timestamp(),
            max_age_hours=25,
            # NOTE(review): sample_count is a constant 1 here even when no
            # timestamp is found, unlike momo_pg_daily below - confirm intended.
            sample_count=1,
        )
    )

    momo_ts = _newest_file_timestamp([
        "/home/ollama/momo_backups/*.sql.gz",
        "/home/ollama/momo-pro/backups/*.sql.gz",
        "/home/ollama/backups/momo_analytics_*.sql.gz",
    ])
    lines.extend(
        _metric_lines_for_job(
            host=host,
            job="momo_pg_daily",
            source="188-pg-dump",
            target="/home/ollama/momo_backups",
            backup_type="pg_dump",
            last_success=momo_ts,
            max_age_hours=30,
            sample_count=1 if momo_ts else 0,
        )
    )
    return lines


def collect() -> str:
    """Return the full textfile payload for the host named by ``HOST_LABEL``.

    Hosts "110" and "188" get their dedicated collectors; any other host only
    gets ``_base_lines``. The payload is the lines joined by newlines with a
    single trailing newline.
    """
    host = HOST_LABEL
    collectors = {"110": _collect_110, "188": _collect_188}
    lines = collectors.get(host, _base_lines)(host)
    return "\n".join(lines) + "\n"


def main() -> None:
    """Collect the metrics and publish them atomically as ``OUTPUT_NAME``.

    The payload is written to a temporary file inside ``TEXTFILE_DIR`` and
    then renamed over the final name, so a reader never sees a partial file.

    Raises:
        OSError: If the directory, temporary file or rename cannot be
            completed. The temporary file is removed before the error
            propagates.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # NamedTemporaryFile creates the file with owner-only permissions.
        # Widen them *before* the rename so the published file is never
        # momentarily unreadable to a scraper running as another user.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
        tmp_path = None  # renamed into place; nothing left to clean up
    finally:
        if tmp_path is not None:
            # delete=False means nobody else removes the temp file; without
            # this every failed run would leave one behind in TEXTFILE_DIR.
            try:
                tmp_path.unlink()
            except OSError:
                pass  # best-effort cleanup; never mask the original error


if __name__ == "__main__":
    main()