Files
awoooi/scripts/ops/backup-health-textfile-exporter.py
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

927 lines
42 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Backup health textfile exporter for full-stack reboot readiness.
2026-05-06 ogt + Codex: backup coverage follow-up after the reboot incident.
Why: a green service gate is not enough if the last restorable copy is stale.
This exporter is read-only; it checks cron/script presence and the latest
successful backup evidence, then writes node-exporter textfile metrics.
"""
from __future__ import annotations
import json
import os
import re
import shlex
import subprocess
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path
# Directory for node-exporter textfile metrics; overridable through the environment.
TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
# Metrics file name (presumably created inside TEXTFILE_DIR; the writer is outside this excerpt).
OUTPUT_NAME = "backup_health.prom"
# Value used for the host="..." label; defaults to this machine's node name.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Characters that must be escaped inside a label value: double quote, backslash, newline.
LABEL_RE = re.compile(r'["\\\n]')
# Shell files that are parsed (never executed) to look up backup settings.
BACKUP_COMMON_SH = Path(os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh"))
BACKUP_OFFSITE_ENV = Path(os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env"))
# Directory holding offsite success markers and the full-sync enable marker.
OFFSITE_STATUS_DIR = Path(os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite"))
# Directory holding one verification marker per credential escrow item.
ESCROW_EVIDENCE_DIR = Path(os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence"))
# JSON status document describing the latest config-capture run.
CONFIG_CAPTURE_STATUS_FILE = Path(os.environ.get("AIOPS_CONFIG_CAPTURE_STATUS_FILE", "/backup/status/backup-configs-last-status.json"))
# Credential escrow items that each need their own fresh verification marker.
ESCROW_ITEMS = [
"restic_repository_password",
"offsite_provider_credentials",
"break_glass_admin_credentials",
"dns_registrar_recovery",
"oauth_ai_provider_recovery",
]
def _escape_label(value: str) -> str:
return LABEL_RE.sub(lambda m: {"\n": r"\n", "\\": r"\\", '"': r"\""}[m.group(0)], value)
def _run(command: list[str], timeout: int = 30) -> tuple[int, str, str]:
try:
result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, check=False)
except FileNotFoundError as exc:
return 127, "", str(exc)
except subprocess.TimeoutExpired as exc:
stdout = exc.stdout if isinstance(exc.stdout, str) else ""
stderr = exc.stderr if isinstance(exc.stderr, str) else "timeout"
return 124, stdout, stderr
return result.returncode, result.stdout, result.stderr
def _parse_time(value: str) -> int:
if not value:
return 0
normalized = re.sub(r"\.(\d{6})\d+([+-]\d\d:\d\d|Z)$", r".\1\2", value)
normalized = normalized.replace("Z", "+00:00")
try:
return int(datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp())
except ValueError:
return 0
def _parse_marker_timestamp(text: str) -> int:
match = re.search(r"\b(\d{10})\b", text)
if match:
return int(match.group(1))
for line in text.splitlines():
parsed = _parse_time(line.strip())
if parsed:
return parsed
return 0
def _marker_timestamp(paths: list[Path]) -> int:
for path in paths:
try:
text = path.read_text(encoding="utf-8", errors="replace")
parsed = _parse_marker_timestamp(text)
return parsed or int(path.stat().st_mtime)
except OSError:
continue
return 0
def _shell_export_value(path: Path, key: str) -> str:
try:
lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
except OSError:
return ""
for line in lines:
try:
tokens = shlex.split(line, comments=True, posix=True)
except ValueError:
continue
if tokens and tokens[0] == "export":
tokens = tokens[1:]
for token in tokens:
if not token.startswith(f"{key}="):
continue
return token.split("=", 1)[1].strip()
return ""
def _backup_config_value(key: str) -> str:
    """Return the configured value of *key* from the backup shell files.

    ``BACKUP_OFFSITE_ENV`` is consulted before ``BACKUP_COMMON_SH``. A value
    written as ``${KEY:-default}`` yields ``default``. A self-reference with
    an empty default (``${KEY:-}``) carries no literal value, so it is treated
    as "not set in this file" and the next file is tried; previously that
    placeholder text was returned verbatim and looked like a real setting.
    Returns ``""`` when no file provides a value.
    """
    default_re = re.compile(r"\$\{" + re.escape(key) + r":-([^}]*)\}")
    for path in [BACKUP_OFFSITE_ENV, BACKUP_COMMON_SH]:
        value = _shell_export_value(path, key)
        if not value:
            continue
        default_match = default_re.fullmatch(value)
        if default_match is None:
            return value
        if default_match.group(1):
            return default_match.group(1)
        # `${KEY:-}`: nothing literal here, keep looking in the next file.
    return ""
def _configured_secret(value: str) -> bool:
return value.strip() not in {"", "CHANGE_ME", "CHANGEME", "TODO", "REDACTED"}
def _b2_configured() -> bool:
    """Return True when every B2 setting holds a non-placeholder value.

    Only the presence of the account id, application key and bucket is
    checked; evaluation stops at the first missing setting.
    """
    required_keys = ("B2_ACCOUNT_ID", "B2_APPLICATION_KEY", "B2_BUCKET")
    return all(_configured_secret(_backup_config_value(name)) for name in required_keys)
def _rclone_configured() -> bool:
    """Return True when an rclone remote appears to be set up.

    The remote name comes from the backup config, then from the
    ``OFFSITE_RCLONE_REMOTE`` environment variable, then defaults to
    ``gdrive``. If ``rclone listremotes`` succeeds and a name is known, the
    answer is whether ``<name>:`` is listed. Otherwise any non-empty rclone
    config file in one of the well-known locations counts as configured.
    """
    remote_name = _backup_config_value("OFFSITE_RCLONE_REMOTE")
    if not remote_name:
        remote_name = os.environ.get("OFFSITE_RCLONE_REMOTE", "gdrive")
    exit_code, listing, _ = _run(["rclone", "listremotes"], timeout=10)
    if exit_code == 0 and remote_name:
        known_remotes = [entry.strip() for entry in listing.splitlines()]
        return f"{remote_name}:" in known_remotes
    config_candidates = (
        Path.home() / ".config/rclone/rclone.conf",
        Path("/home/wooo/.config/rclone/rclone.conf"),
        Path("/root/.config/rclone/rclone.conf"),
        Path("/etc/rclone.conf"),
    )
    for candidate in config_candidates:
        try:
            if candidate.is_file() and candidate.stat().st_size > 0:
                return True
        except OSError:
            pass
    return False
def _cron_text() -> str:
    """Return the output of ``crontab -l``, or ``""`` when the command fails."""
    exit_code, listing, _ = _run(["crontab", "-l"], timeout=10)
    if exit_code != 0:
        return ""
    return listing
def _active_cron_lines(cron: str) -> list[str]:
return [line.strip() for line in cron.splitlines() if line.strip() and not line.lstrip().startswith("#")]
def _cron_duplicate_metric_lines(host: str, cron: str) -> list[str]:
    """Build metrics about duplicated and must-be-unique crontab entries.

    The first line reports how many active entries are exact repeats of
    another entry. For each known singleton command, two further lines report
    how many active entries contain it and whether that count is exactly one.
    """
    entries = _active_cron_lines(cron)
    repeated = len(entries) - len(set(entries))
    if repeated < 0:
        repeated = 0
    output: list[str] = [
        f'awoooi_backup_cron_active_duplicate_count{{host="{_escape_label(host)}"}} {repeated}'
    ]
    singletons = (
        ("backup_health_exporter", "/home/wooo/scripts/backup-health-textfile-exporter.py"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_sync_gated", "/backup/scripts/sync-offsite-backups.sh --mode sync"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
    )
    for name, needle in singletons:
        matches = len([entry for entry in entries if needle in entry])
        label_text = f'host="{_escape_label(host)}",entry="{_escape_label(name)}"'
        output.append(f"awoooi_backup_cron_singular_entry_count{{{label_text}}} {matches}")
        output.append(f"awoooi_backup_cron_singular_entry_ok{{{label_text}}} {int(matches == 1)}")
    return output
def _newest_file_timestamp(patterns: list[str]) -> int:
newest = 0
for pattern in patterns:
for path in Path("/").glob(pattern.lstrip("/")):
try:
if path.is_file():
newest = max(newest, int(path.stat().st_mtime))
except OSError:
continue
return newest
def _read_backup_110_timestamp() -> int:
candidates = [
Path("/home/ollama/node_exporter_textfiles/backup.prom"),
Path("/home/ollama/backup/110/last_success"),
]
for path in candidates:
try:
text = path.read_text(encoding="utf-8", errors="replace")
except OSError:
continue
match = re.search(r"(?:backup_110_last_success_timestamp\s+)?(\d{10})", text)
if match:
return int(match.group(1))
return 0
def _latest_restic_snapshot(repo: str) -> tuple[int, int]:
password_file = os.environ.get("RESTIC_PASSWORD_FILE", "/backup/scripts/.restic-password")
if not Path(repo).exists() or not Path(password_file).exists():
return 0, 0
rc, stdout, _ = _run(
["restic", "-r", repo, "snapshots", "--json", "--password-file", password_file],
timeout=45,
)
if rc != 0:
return 0, 0
try:
rows = json.loads(stdout)
except json.JSONDecodeError:
return 0, 0
timestamps = [_parse_time(str(row.get("time", ""))) for row in rows]
timestamps = [value for value in timestamps if value > 0]
return (max(timestamps), len(timestamps)) if timestamps else (0, 0)
def _backup_all_failed_count_from_log(path: Path) -> tuple[int, int]:
try:
lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
except OSError:
return 0, -1
for line in reversed(lines):
if "全服務備份完成" not in line:
continue
ts_match = re.match(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]", line)
timestamp = 0
if ts_match:
timestamp = int(datetime.strptime(ts_match.group(1), "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc).timestamp()) - 8 * 3600
failed_match = re.search(r"-\s+(\d+)\s+個失敗", line)
if failed_match:
return timestamp, int(failed_match.group(1))
if "全部成功" in line:
return timestamp, 0
return 0, -1
def _latest_backup_all_failed_count() -> tuple[int, int]:
    """Return the most recent ``(timestamp, failed_count)`` across both logs.

    Only results with a positive timestamp and a non-negative count are
    considered; ``(0, -1)`` is returned when neither log provides one.
    """
    usable: list[tuple[int, int]] = []
    for log_path in (Path("/backup/logs/cron.log"), Path("/backup/logs/backup.log")):
        when, failed = _backup_all_failed_count_from_log(log_path)
        if when > 0 and failed >= 0:
            usable.append((when, failed))
    if not usable:
        return 0, -1
    return max(usable, key=lambda row: row[0])
def _read_key_value_status(path: str) -> dict[str, int | str]:
values: dict[str, int | str] = {}
try:
lines = Path(path).read_text(encoding="utf-8", errors="replace").splitlines()
except OSError:
return values
for line in lines:
if not line or line.startswith("#") or "=" not in line:
continue
key, value = line.split("=", 1)
key = key.strip()
value = value.strip()
try:
values[key] = int(float(value))
except ValueError:
values[key] = value
return values
def _integrity_metric_lines(host: str) -> list[str]:
    """Build metrics for the restic integrity check and the restore drill.

    Each scope reads its own status file. A scope is fresh only when it has a
    timestamp, that timestamp is within the scope's maximum age, and its
    failed count is exactly 0. Missing or non-numeric fields fall back to
    their defaults (timestamp 0, failed count -1, checked count 0) instead of
    raising, so a malformed status file cannot abort the exporter.
    """

    def _as_int(raw: int | str, default: int) -> int:
        # Status values that did not parse as numbers arrive as raw strings.
        try:
            return int(raw)
        except (TypeError, ValueError):
            return default

    now = int(time.time())
    specs = [
        ("restic_check", "/backup/integrity/check.status", 192),
        ("restore_drill", "/backup/integrity/restore-drill.status", 744),
    ]
    lines: list[str] = []
    for scope, path, max_age_hours in specs:
        values = _read_key_value_status(path)
        timestamp = _as_int(values.get("timestamp", 0), 0)
        failed_count = _as_int(values.get("failed_count", -1), -1)
        checked_count = _as_int(values.get("checked_repo_count", 0), 0)
        age = now - timestamp if timestamp else 0
        fresh = 1 if timestamp and age <= max_age_hours * 3600 and failed_count == 0 else 0
        labels = f'host="{_escape_label(host)}",scope="{scope}",max_age_hours="{max_age_hours}"'
        lines.extend(
            [
                # A run with failures is not reported as a success timestamp.
                f"awoooi_backup_integrity_last_success_timestamp{{{labels}}} {timestamp if failed_count == 0 else 0}",
                f"awoooi_backup_integrity_age_seconds{{{labels}}} {age}",
                f"awoooi_backup_integrity_fresh{{{labels}}} {fresh}",
                f"awoooi_backup_integrity_failed_repo_count{{{labels}}} {failed_count}",
                f"awoooi_backup_integrity_checked_repo_count{{{labels}}} {checked_count}",
            ]
        )
    return lines
def _config_capture_metric_lines(host: str) -> list[str]:
    """Build metrics from the config-capture JSON status document.

    When the document is missing, unreadable, not valid JSON, or not a JSON
    object, three placeholder lines are returned (timestamp 0, age 0,
    critical failed count -1). Otherwise summary metrics are emitted,
    followed by one ``..._capture_ok`` line per entry in ``items``. Fields
    that are null or non-numeric fall back to their defaults, and malformed
    item entries are skipped, rather than raising.
    """

    def _as_int(raw: object, default: int) -> int:
        try:
            return int(raw)  # type: ignore[call-overload]
        except (TypeError, ValueError, OverflowError):
            return default

    now = int(time.time())
    labels = f'host="{_escape_label(host)}"'
    try:
        document = json.loads(CONFIG_CAPTURE_STATUS_FILE.read_text(encoding="utf-8", errors="replace"))
    except (OSError, json.JSONDecodeError):
        document = None
    if not isinstance(document, dict):
        return [
            f"awoooi_backup_config_capture_status_timestamp{{{labels}}} 0",
            f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} 0",
            f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} -1",
        ]
    timestamp = _as_int(document.get("timestamp") or 0, 0)
    critical_failed = _as_int(document.get("critical_failed_count", -1), -1)
    failed_count = _as_int(document.get("failed_count", -1), -1)
    snapshot_id = str(document.get("snapshot_id") or "unknown")
    duration = _as_int(document.get("duration_seconds", 0) or 0, 0)
    age = now - timestamp if timestamp else 0
    lines = [
        f"awoooi_backup_config_capture_status_timestamp{{{labels},snapshot_id=\"{_escape_label(snapshot_id)}\"}} {timestamp}",
        f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} {age}",
        f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} {critical_failed}",
        f"awoooi_backup_config_capture_failed_count{{{labels}}} {failed_count}",
        f"awoooi_backup_config_capture_duration_seconds{{{labels}}} {duration}",
    ]
    items = document.get("items")
    for item in items if isinstance(items, list) else []:
        if not isinstance(item, dict):
            continue
        target = str(item.get("target") or "unknown")
        source = str(item.get("source") or "unknown")
        critical = "true" if item.get("critical") else "false"
        ok = 1 if item.get("ok") else 0
        item_labels = (
            f'host="{_escape_label(host)}",'
            f'target="{_escape_label(target)}",'
            f'source="{_escape_label(source)}",'
            f'critical="{critical}"'
        )
        lines.append(f"awoooi_backup_config_capture_ok{{{item_labels}}} {ok}")
    return lines
def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
    """Build offsite-copy, credential-escrow and DR-phase metrics.

    Output order: full-copy metrics per provider (b2, rclone), partial-copy
    metrics per provider, the rclone full-sync enable marker, four lines per
    escrow item, and finally the escrow-missing count plus the DR phase and
    next-step lines. Offsite copies are fresh within 48 hours, escrow
    evidence within 744 hours. The DR phase is the first unmet gate of:
    a provider is configured, a partial copy is fresh, all escrow items are
    fresh, a full copy is fresh.
    """
    now = int(time.time())
    offsite_window = 48 * 3600
    providers = ("b2", "rclone")
    configured = {"b2": int(_b2_configured()), "rclone": int(_rclone_configured())}

    marker_sources = (
        (
            "b2",
            [
                OFFSITE_STATUS_DIR / "b2-last-success",
                OFFSITE_STATUS_DIR / "b2.last_success",
                OFFSITE_STATUS_DIR / "last_success",
                Path("/backup/logs/offsite-b2.status"),
            ],
            [
                OFFSITE_STATUS_DIR / "b2-partial-last-success",
                OFFSITE_STATUS_DIR / "b2.partial_last_success",
            ],
        ),
        (
            "rclone",
            [
                OFFSITE_STATUS_DIR / "rclone-last-success",
                OFFSITE_STATUS_DIR / "rclone.last_success",
                OFFSITE_STATUS_DIR / "last_success",
                Path("/backup/logs/rclone-sync.status"),
            ],
            [
                OFFSITE_STATUS_DIR / "rclone-partial-last-success",
                OFFSITE_STATUS_DIR / "rclone.partial_last_success",
            ],
        ),
    )
    full_seen: dict[str, int] = {}
    partial_seen: dict[str, int] = {}
    for provider, full_paths, partial_paths in marker_sources:
        full_seen[provider] = _marker_timestamp(full_paths)
        partial_seen[provider] = _marker_timestamp(partial_paths)

    lines: list[str] = []

    # Full offsite copies.
    full_fresh_flags: list[int] = []
    for provider in providers:
        is_configured = configured[provider]
        seen = full_seen[provider]
        age = now - seen if seen else 0
        fresh = int(bool(is_configured and seen and age <= offsite_window))
        full_fresh_flags.append(fresh)
        labels = f'host="{_escape_label(host)}",provider="{provider}",max_age_hours="48"'
        lines.append(f"awoooi_backup_offsite_configured{{{labels}}} {is_configured}")
        lines.append(f"awoooi_backup_offsite_last_success_timestamp{{{labels}}} {seen}")
        lines.append(f"awoooi_backup_offsite_age_seconds{{{labels}}} {age}")
        lines.append(f"awoooi_backup_offsite_fresh{{{labels}}} {fresh}")

    # Partial offsite copies.
    partial_fresh_flags: list[int] = []
    for provider in providers:
        seen = partial_seen[provider]
        partial_age = now - seen if seen else 0
        partial_fresh = int(bool(configured[provider] and seen and partial_age <= offsite_window))
        partial_fresh_flags.append(partial_fresh)
        partial_labels = f'host="{_escape_label(host)}",provider="{provider}",scope="partial",max_age_hours="48"'
        lines.append(f"awoooi_backup_offsite_partial_last_success_timestamp{{{partial_labels}}} {seen}")
        lines.append(f"awoooi_backup_offsite_partial_age_seconds{{{partial_labels}}} {partial_age}")
        lines.append(f"awoooi_backup_offsite_partial_fresh{{{partial_labels}}} {partial_fresh}")

    # Marker file that gates the full rclone sync.
    enable_marker = OFFSITE_STATUS_DIR / "enable-rclone-sync"
    try:
        full_sync_enabled = int(enable_marker.is_file())
        full_sync_enabled_timestamp = int(enable_marker.stat().st_mtime) if full_sync_enabled else 0
    except OSError:
        full_sync_enabled = 0
        full_sync_enabled_timestamp = 0
    full_sync_labels = f'host="{_escape_label(host)}",provider="rclone"'
    lines.append(f"awoooi_backup_offsite_full_sync_enabled{{{full_sync_labels}}} {full_sync_enabled}")
    lines.append(
        f"awoooi_backup_offsite_full_sync_enabled_timestamp{{{full_sync_labels}}} {full_sync_enabled_timestamp}"
    )

    # Credential escrow evidence; anything not fresh counts as missing.
    escrow_window = 744 * 3600
    escrow_missing_count = 0
    for item in ESCROW_ITEMS:
        verified = _marker_timestamp(
            [
                ESCROW_EVIDENCE_DIR / f"{item}.last_verified",
                ESCROW_EVIDENCE_DIR / f"{item}.verified",
                ESCROW_EVIDENCE_DIR / item,
            ]
        )
        age = now - verified if verified else 0
        fresh = int(bool(verified and age <= escrow_window))
        if not fresh:
            escrow_missing_count += 1
        labels = f'host="{_escape_label(host)}",item="{item}",max_age_hours="744"'
        lines.append(f"awoooi_backup_credential_escrow_expected_info{{{labels}}} 1")
        lines.append(f"awoooi_backup_credential_escrow_last_verified_timestamp{{{labels}}} {verified}")
        lines.append(f"awoooi_backup_credential_escrow_age_seconds{{{labels}}} {age}")
        lines.append(f"awoooi_backup_credential_escrow_fresh{{{labels}}} {fresh}")

    # The first blocking gate decides the phase; no blocker means phase 5.
    gates = (
        (not any(configured.values()), "configure_google_drive_rclone_on_110_tty", 1),
        (not any(partial_fresh_flags), "run_small_dry_run_then_partial_sync", 2),
        (escrow_missing_count > 0, "complete_credential_escrow_review", 3),
        (not any(full_fresh_flags), "pre_full_sync_review", 4),
    )
    next_step, phase = "offsite_and_escrow_ready", 5
    for blocked, step, number in gates:
        if blocked:
            next_step, phase = step, number
            break

    lines.append(
        f'awoooi_backup_dr_credential_escrow_missing_count{{host="{_escape_label(host)}"}} {escrow_missing_count}'
    )
    lines.append(
        f'awoooi_backup_dr_phase{{host="{_escape_label(host)}",next_step="{_escape_label(next_step)}"}} {phase}'
    )
    lines.append(
        f'awoooi_backup_dr_next_step_info{{host="{_escape_label(host)}",next_step="{_escape_label(next_step)}"}} 1'
    )
    return lines
def _retention_metric_lines(host: str) -> list[str]:
    """Build the two retention-policy metrics.

    Each setting is read from the backup config, falling back to the
    environment variable of the same name. ``latest_only`` is 1 only when the
    mode is ``latest`` and keep-last is ``1``; the offsite flag is 1 only
    when delete-old is ``1``. Unset settings appear as ``unknown`` in labels.
    """

    def _setting(name: str) -> str:
        return (_backup_config_value(name) or os.environ.get(name, "")).strip()

    mode = _setting("BACKUP_RETENTION_MODE")
    keep_last = _setting("KEEP_LAST")
    delete_old = _setting("OFFSITE_SYNC_DELETE_OLD")

    latest_only = int(mode == "latest" and keep_last == "1")
    offsite_mirror = int(delete_old == "1")

    mode_label = _escape_label(mode or "unknown")
    keep_last_label = _escape_label(keep_last or "unknown")
    delete_old_label = _escape_label(delete_old or "unknown")
    restic_labels = (
        f'host="{_escape_label(host)}",scope="restic",mode="{mode_label}",keep_last="{keep_last_label}"'
    )
    offsite_labels = (
        f'host="{_escape_label(host)}",scope="offsite",provider="rclone",delete_old="{delete_old_label}"'
    )
    return [
        f"awoooi_backup_retention_latest_only{{{restic_labels}}} {latest_only}",
        f"awoooi_backup_retention_offsite_delete_old_enabled{{{offsite_labels}}} {offsite_mirror}",
    ]
def _collect_velero_from_k8s() -> dict[str, int | str]:
remote_script = r"""
python3 - <<'PY'
import datetime as dt
import json
import subprocess
import time
def kubectl(args):
for prefix in (["sudo", "-n", "kubectl"], ["kubectl"]):
result = subprocess.run(prefix + args, capture_output=True, text=True, timeout=20, check=False)
if result.returncode == 0:
return result.stdout
return ""
def load_json(args):
text = kubectl(args + ["-o", "json"])
try:
return json.loads(text) if text else {}
except json.JSONDecodeError:
return {}
def parse_ts(value):
if not value:
return 0
try:
return int(dt.datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp())
except ValueError:
return 0
now = int(time.time())
schedules = load_json(["get", "schedules.velero.io", "-n", "velero"]).get("items") or []
backups = load_json(["get", "backups.velero.io", "-n", "velero"]).get("items") or []
cron = load_json(["get", "cronjob", "-n", "velero", "backup-restore-test"])
jobs = load_json(["get", "jobs", "-n", "velero", "-l", "component=backup-restore-test"]).get("items") or []
completed = []
for item in backups:
if item.get("status", {}).get("phase") != "Completed":
continue
timestamp = parse_ts(item.get("status", {}).get("completionTimestamp") or item.get("metadata", {}).get("creationTimestamp"))
if timestamp:
completed.append(timestamp)
failed_jobs = 0
for job in jobs:
conditions = job.get("status", {}).get("conditions") or []
if any(row.get("type") == "Failed" and row.get("status") == "True" for row in conditions):
failed_jobs += 1
last_success = parse_ts((cron.get("status") or {}).get("lastSuccessfulTime"))
latest_backup = max(completed) if completed else 0
print("monitor_up=1")
print(f"schedule_count={len(schedules)}")
print(f"schedule_paused_count={sum(1 for item in schedules if item.get('spec', {}).get('paused'))}")
print(f"latest_completed_backup_timestamp={latest_backup}")
print(f"latest_completed_backup_age_seconds={now - latest_backup if latest_backup else 0}")
print(f"latest_completed_backup_fresh={1 if latest_backup and now - latest_backup <= 90000 else 0}")
print(f"restore_test_cron_present={1 if cron.get('metadata', {}).get('name') == 'backup-restore-test' else 0}")
print(f"restore_test_last_success_timestamp={last_success}")
print(f"restore_test_last_success_age_seconds={now - last_success if last_success else 0}")
print(f"restore_test_last_success_fresh={1 if last_success and now - last_success <= 691200 else 0}")
print(f"restore_test_failed_jobs={failed_jobs}")
PY
"""
hosts = os.environ.get("AIOPS_K8S_QUERY_HOSTS", "192.168.0.120 192.168.0.121 192.168.0.125").split()
values: dict[str, int | str] = {"monitor_up": 0, "source": "unreachable"}
for host in hosts:
rc, stdout, _ = _run(
[
"ssh",
"-o",
"BatchMode=yes",
"-o",
"StrictHostKeyChecking=accept-new",
"-o",
"ConnectTimeout=8",
f"wooo@{host}",
remote_script,
],
timeout=45,
)
if rc != 0:
continue
parsed: dict[str, int | str] = {"source": f"{host}-kubectl"}
for line in stdout.splitlines():
if "=" not in line:
continue
key, value = line.split("=", 1)
try:
parsed[key.strip()] = int(float(value.strip()))
except ValueError:
continue
if int(parsed.get("monitor_up", 0)) == 1:
return parsed
return values
def _velero_metric_lines(host: str) -> list[str]:
    """Render the collected Velero values as ``awoooi_velero_*`` metric lines.

    Every line carries the host, source and namespace labels. Some series add
    a cronjob and/or max_age_hours label. A value missing from the collected
    data is reported as 0.
    """
    values = _collect_velero_from_k8s()
    source = str(values.get("source", "unreachable"))
    labels = f'host="{_escape_label(host)}",source="{_escape_label(source)}",namespace="velero"'
    cron_label = ',cronjob="backup-restore-test"'
    # (value key == metric name suffix, extra labels appended after the common ones)
    series = (
        ("monitor_up", ""),
        ("schedule_count", ""),
        ("schedule_paused_count", ""),
        ("latest_completed_backup_timestamp", ""),
        ("latest_completed_backup_age_seconds", ""),
        ("latest_completed_backup_fresh", ',max_age_hours="25"'),
        ("restore_test_cron_present", cron_label),
        ("restore_test_last_success_timestamp", cron_label),
        ("restore_test_last_success_age_seconds", cron_label),
        ("restore_test_last_success_fresh", cron_label + ',max_age_hours="192"'),
        ("restore_test_failed_jobs", cron_label),
    )
    return [
        f"awoooi_velero_{key}{{{labels}{extra}}} {values.get(key, 0)}"
        for key, extra in series
    ]
def _metric_lines_for_job(
    *,
    host: str,
    job: str,
    source: str,
    target: str,
    backup_type: str,
    last_success: int,
    max_age_hours: float,
    sample_count: int = 0,
) -> list[str]:
    """Build the five per-job metric lines for one expected backup job.

    All lines share one label set (host, job, type, source, target,
    max_age_hours). The job is fresh when *last_success* is positive and no
    older than *max_age_hours*; with no success recorded the age is 0.
    """
    now = int(time.time())
    label_pairs = (
        ("host", host),
        ("job", job),
        ("type", backup_type),
        ("source", source),
        ("target", target),
    )
    labels = ",".join(f'{name}="{_escape_label(text)}"' for name, text in label_pairs)
    labels += f',max_age_hours="{max_age_hours:g}"'

    has_success = last_success > 0
    age = now - last_success if has_success else 0
    fresh = int(has_success and age <= int(max_age_hours * 3600))

    readings = (
        ("awoooi_backup_expected_job_info", 1),
        ("awoooi_backup_job_last_success_timestamp", last_success),
        ("awoooi_backup_job_age_seconds", age),
        ("awoooi_backup_job_fresh", fresh),
        ("awoooi_backup_job_snapshot_count", sample_count),
    )
    return [f"{metric}{{{labels}}} {reading}" for metric, reading in readings]
def _base_lines(host: str) -> list[str]:
    """Return the HELP/TYPE preamble plus the exporter's own two samples.

    Every metric is declared as a gauge. Declarations come in a fixed order,
    HELP before TYPE for each metric, followed by the ``monitor_up`` (always
    1) and ``last_run_timestamp`` (current time) samples for *host*.
    """
    now = int(time.time())
    descriptions = (
        ("awoooi_backup_health_monitor_up", "Whether the backup health exporter completed."),
        ("awoooi_backup_health_last_run_timestamp", "Unix timestamp of the last backup health exporter run."),
        ("awoooi_backup_expected_job_info", "Expected backup job inventory."),
        ("awoooi_backup_job_configured", "Whether the expected backup cron/config is present."),
        ("awoooi_backup_script_present", "Whether the backup script exists on this host."),
        ("awoooi_backup_job_last_success_timestamp", "Unix timestamp of the latest successful backup evidence."),
        ("awoooi_backup_job_age_seconds", "Age of the latest successful backup evidence."),
        ("awoooi_backup_job_fresh", "Whether the latest successful backup evidence is within max_age_hours."),
        ("awoooi_backup_job_snapshot_count", "Number of snapshots or files considered for this job."),
        ("awoooi_backup_last_run_failed_count", "Failed component count from the last aggregate backup run."),
        ("awoooi_backup_integrity_last_success_timestamp", "Unix timestamp of latest successful backup integrity or restore drill run."),
        ("awoooi_backup_integrity_age_seconds", "Age of backup integrity or restore drill status."),
        ("awoooi_backup_integrity_fresh", "Whether backup integrity or restore drill status is fresh and successful."),
        ("awoooi_backup_integrity_failed_repo_count", "Failed repository count from backup integrity or restore drill run."),
        ("awoooi_backup_integrity_checked_repo_count", "Checked repository count from backup integrity or restore drill run."),
        ("awoooi_backup_config_capture_status_timestamp", "Unix timestamp of the latest config-capture coverage status."),
        ("awoooi_backup_config_capture_status_age_seconds", "Age of the latest config-capture coverage status."),
        ("awoooi_backup_config_capture_critical_failed_count", "Critical config-capture targets missing from the latest configs backup."),
        ("awoooi_backup_config_capture_failed_count", "Total config-capture targets missing from the latest configs backup."),
        ("awoooi_backup_config_capture_duration_seconds", "Duration of the latest configs backup capture run."),
        ("awoooi_backup_config_capture_ok", "Whether the latest configs backup captured a specific target."),
        ("awoooi_backup_offsite_configured", "Whether an offsite backup provider appears configured without exposing credentials."),
        ("awoooi_backup_offsite_last_success_timestamp", "Unix timestamp of latest offsite copy success marker."),
        ("awoooi_backup_offsite_age_seconds", "Age of latest offsite copy success marker."),
        ("awoooi_backup_offsite_fresh", "Whether offsite copy success marker is fresh."),
        ("awoooi_backup_offsite_partial_last_success_timestamp", "Unix timestamp of latest partial offsite copy success marker."),
        ("awoooi_backup_offsite_partial_age_seconds", "Age of latest partial offsite copy success marker."),
        ("awoooi_backup_offsite_partial_fresh", "Whether partial offsite copy success marker is fresh."),
        ("awoooi_backup_offsite_full_sync_enabled", "Whether the gated full offsite sync enable marker exists."),
        ("awoooi_backup_offsite_full_sync_enabled_timestamp", "Unix timestamp of the gated full offsite sync enable marker."),
        ("awoooi_backup_credential_escrow_expected_info", "Expected credential escrow evidence inventory."),
        ("awoooi_backup_credential_escrow_last_verified_timestamp", "Unix timestamp of credential escrow verification evidence."),
        ("awoooi_backup_credential_escrow_age_seconds", "Age of credential escrow verification evidence."),
        ("awoooi_backup_credential_escrow_fresh", "Whether credential escrow verification evidence is fresh."),
        ("awoooi_backup_dr_credential_escrow_missing_count", "Number of credential escrow items that still need fresh human verification."),
        ("awoooi_backup_dr_phase", "Numeric DR offsite completion phase for AI/operator triage."),
        ("awoooi_backup_dr_next_step_info", "Current human-safe next step for DR offsite completion."),
        ("awoooi_backup_retention_latest_only", "Whether local restic backup retention is configured as latest-only keep-last=1."),
        ("awoooi_backup_retention_offsite_delete_old_enabled", "Whether offsite rclone sync is allowed to delete old remote backup files after successful mirror."),
        ("awoooi_backup_cron_active_duplicate_count", "Number of exact duplicate active crontab entries on the backup host."),
        ("awoooi_backup_cron_singular_entry_count", "Number of active crontab entries matching a backup/offsite singleton pattern."),
        ("awoooi_backup_cron_singular_entry_ok", "Whether a backup/offsite singleton cron pattern has exactly one active entry."),
        ("awoooi_velero_monitor_up", "Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint."),
        ("awoooi_velero_schedule_count", "Number of Velero schedules in the velero namespace."),
        ("awoooi_velero_schedule_paused_count", "Number of paused Velero schedules."),
        ("awoooi_velero_latest_completed_backup_timestamp", "Unix timestamp of latest Completed Velero backup."),
        ("awoooi_velero_latest_completed_backup_age_seconds", "Age of latest Completed Velero backup."),
        ("awoooi_velero_latest_completed_backup_fresh", "Whether latest Completed Velero backup is within max_age_hours."),
        ("awoooi_velero_restore_test_cron_present", "Whether backup-restore-test CronJob exists."),
        ("awoooi_velero_restore_test_last_success_timestamp", "Unix timestamp of backup-restore-test lastSuccessfulTime."),
        ("awoooi_velero_restore_test_last_success_age_seconds", "Age of backup-restore-test lastSuccessfulTime."),
        ("awoooi_velero_restore_test_last_success_fresh", "Whether backup-restore-test lastSuccessfulTime is within max_age_hours."),
        ("awoooi_velero_restore_test_failed_jobs", "Failed backup-restore-test jobs retained in velero namespace."),
    )
    preamble: list[str] = []
    for metric, summary in descriptions:
        preamble.append(f"# HELP {metric} {summary}")
        preamble.append(f"# TYPE {metric} gauge")
    preamble.append(f'awoooi_backup_health_monitor_up{{host="{_escape_label(host)}"}} 1')
    preamble.append(f'awoooi_backup_health_last_run_timestamp{{host="{_escape_label(host)}"}} {now}')
    return preamble
def _collect_110(host: str) -> list[str]:
    """Build the full list of metric lines for the host labelled ``110``.

    Starting from the shared base lines, this appends, in order:
    cron-entry presence, backup-script presence under ``/backup/scripts``,
    per-repository restic freshness, the ``backup_all`` aggregate result,
    and the integrity / config-capture / offsite+escrow / retention /
    cron-duplicate / Velero sections produced by their helpers.

    Args:
        host: Value used for the ``host`` label on every emitted sample.

    Returns:
        The metric lines, one exposition-format line per list element.
    """
    crontab = _cron_text()
    out = _base_lines(host)

    # Job label -> substring that must appear somewhere in the crontab text.
    cron_markers = (
        ("backup_all", "/backup/scripts/backup-all.sh"),
        ("awoooi_frequent", "/backup/scripts/backup-awoooi-frequent.sh"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_sync_gated", "/backup/offsite/enable-rclone-sync"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
        ("backup_integrity_check", "/backup/scripts/check-backup-integrity.sh --mode check"),
        ("backup_restore_drill", "/backup/scripts/check-backup-integrity.sh --mode restore-drill"),
    )
    for cron_job, marker in cron_markers:
        label_set = f'host="{_escape_label(host)}",job="{_escape_label(cron_job)}"'
        configured = int(marker in crontab)
        out.append("awoooi_backup_job_configured{" + label_set + "} " + str(configured))

    # File names expected to exist inside /backup/scripts.
    script_names = (
        "backup-all.sh",
        "backup-awoooi.sh",
        "backup-awoooi-frequent.sh",
        "backup-configs.sh",
        "backup-sentry.sh",
        "backup-ai-artifacts.sh",
        "backup-public-routes.sh",
        "configure-offsite-rclone.sh",
        "configure-offsite-b2.sh",
        "sync-offsite-backups.sh",
        "backup-offsite-readiness-gate.sh",
        "offsite-escrow-evidence-report.sh",
        "verify-offsite-full-sync.sh",
        "mark-credential-escrow-verified.sh",
        "check-backup-integrity.sh",
        "backup-gitea.sh",
        "backup-harbor.sh",
        "backup-momo.sh",
        "backup-langfuse.sh",
        "backup-monitoring.sh",
        "backup-signoz.sh",
        "backup-open-webui.sh",
        "backup-clawbot.sh",
    )
    scripts_root = Path("/backup/scripts")
    for script_name in script_names:
        label_set = f'host="{_escape_label(host)}",script="{_escape_label(script_name)}"'
        present = int((scripts_root / script_name).exists())
        out.append("awoooi_backup_script_present{" + label_set + "} " + str(present))

    # (job label, restic repository path, freshness budget in hours).
    restic_repos = (
        ("awoooi_db", "/backup/awoooi", 7),
        ("configs", "/backup/configs", 48),
        ("sentry", "/backup/sentry", 48),
        ("gitea", "/backup/gitea", 48),
        ("harbor", "/backup/harbor", 48),
        ("momo", "/backup/momo", 48),
        ("langfuse", "/backup/langfuse", 48),
        ("monitoring", "/backup/monitoring", 48),
        ("signoz", "/backup/signoz", 48),
        ("open_webui", "/backup/open-webui", 48),
        ("clawbot", "/backup/clawbot", 48),
        ("ai_artifacts", "/backup/ai-artifacts", 48),
        ("public_routes", "/backup/public-routes", 168),
    )
    for repo_job, repo_path, budget_hours in restic_repos:
        snapshot_ts, snapshot_count = _latest_restic_snapshot(repo_path)
        out += _metric_lines_for_job(
            host=host,
            job=repo_job,
            source="110-restic",
            target=repo_path,
            backup_type="restic",
            last_success=snapshot_ts,
            max_age_hours=budget_hours,
            sample_count=snapshot_count,
        )

    # Aggregate backup-all result: the success timestamp is reported as 0
    # whenever the latest run recorded at least one failure.
    aggregate_ts, failures = _latest_backup_all_failed_count()
    aggregate_labels = f'host="{_escape_label(host)}",job="backup_all"'
    out.append(f"awoooi_backup_last_run_failed_count{{{aggregate_labels}}} {failures}")
    reported_ts = aggregate_ts if failures == 0 else 0
    out.append(
        "awoooi_backup_job_last_success_timestamp{"
        f"{aggregate_labels}"
        ',type="aggregate",source="110-cron-log",target="/backup/logs/cron.log",max_age_hours="48"} '
        f"{reported_ts}"
    )

    # Remaining sections are delegated to their dedicated helpers.
    out += _integrity_metric_lines(host)
    out += _config_capture_metric_lines(host)
    out += _offsite_and_escrow_metric_lines(host)
    out += _retention_metric_lines(host)
    out += _cron_duplicate_metric_lines(host, crontab)
    out += _velero_metric_lines(host)
    return out
def _collect_188(host: str) -> list[str]:
    """Build the full list of metric lines for the host labelled ``188``.

    Starting from the shared base lines, this appends cron-entry presence,
    presence (existing and executable) of the backup scripts, freshness of
    the rsync copy pulled from 110, and freshness of the newest momo
    ``*.sql.gz`` dump found under the known dump locations.

    Args:
        host: Value used for the ``host`` label on every emitted sample.

    Returns:
        The metric lines, one exposition-format line per list element.
    """
    crontab = _cron_text()
    out = _base_lines(host)

    # Job label -> substring that must appear somewhere in the crontab text.
    cron_markers = (
        ("backup_from_110", "/home/ollama/bin/backup-from-110.sh"),
        ("momo_pg_daily", "/home/ollama/bin/momo-pg-backup.sh"),
    )
    for cron_job, marker in cron_markers:
        label_set = f'host="{_escape_label(host)}",job="{_escape_label(cron_job)}"'
        configured = int(marker in crontab)
        out.append("awoooi_backup_job_configured{" + label_set + "} " + str(configured))

    # A script counts as present only when it exists and is executable;
    # the metric label carries the bare file name, not the full path.
    script_paths = (
        "/home/ollama/bin/backup-from-110.sh",
        "/home/ollama/bin/momo-pg-backup.sh",
        "/home/ollama/awoooi-ops/pg-backup.sh",
    )
    for script_path in script_paths:
        candidate = Path(script_path)
        label_set = f'host="{_escape_label(host)}",script="{_escape_label(candidate.name)}"'
        usable = candidate.exists() and os.access(script_path, os.X_OK)
        out.append("awoooi_backup_script_present{" + label_set + "} " + str(int(usable)))

    rsync_ts = _read_backup_110_timestamp()
    out += _metric_lines_for_job(
        host=host,
        job="backup_from_110",
        source="188-rsync",
        target="/home/ollama/backup/110",
        backup_type="rsync",
        last_success=rsync_ts,
        max_age_hours=25,
        sample_count=1,
    )

    dump_globs = [
        "/home/ollama/momo_backups/*.sql.gz",
        "/home/ollama/momo-pro/backups/*.sql.gz",
        "/home/ollama/backups/momo_analytics_*.sql.gz",
    ]
    momo_ts = _newest_file_timestamp(dump_globs)
    # The sample count is 1 when a dump timestamp was found, otherwise 0.
    dump_samples = 1 if momo_ts else 0
    out += _metric_lines_for_job(
        host=host,
        job="momo_pg_daily",
        source="188-pg-dump",
        target="/home/ollama/momo_backups",
        backup_type="pg_dump",
        last_success=momo_ts,
        max_age_hours=30,
        sample_count=dump_samples,
    )
    return out
def collect() -> str:
    """Render the metrics payload for the configured host label.

    Hosts labelled ``110`` and ``188`` get their dedicated collectors; any
    other label receives only the base lines.

    Returns:
        All metric lines joined by newlines, with a trailing newline.
    """
    host = HOST_LABEL
    per_host_collectors = {
        "110": _collect_110,
        "188": _collect_188,
    }
    build_lines = per_host_collectors.get(host, _base_lines)
    return "\n".join(build_lines(host)) + "\n"
def main() -> None:
    """Collect the metrics and atomically publish them as a textfile.

    The payload is written to a temporary file inside ``TEXTFILE_DIR`` and
    then renamed over ``OUTPUT_NAME``, so a reader never sees a partially
    written file.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written, chmod-ed, or renamed. The temporary file is removed
            before the error propagates.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # The temp file is created owner-only; widen the mode BEFORE the
        # rename so the published file is never momentarily unreadable to
        # a scraper running as a different user.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # delete=False means nothing cleans up for us: remove the orphan so
        # repeated failures do not pile up files in the collector directory.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise


if __name__ == "__main__":
    main()