# awoooi/scripts/ops/backup-health-textfile-exporter.py

#!/usr/bin/env python3
"""
Backup health textfile exporter for full-stack reboot readiness.

2026-05-06 ogt + Codex: backup coverage follow-up after the reboot incident.
Why: a green service gate is not enough if the last restorable copy is stale.
This exporter is read-only; it checks cron/script presence and the latest
successful backup evidence, then writes node-exporter textfile metrics.
"""

from __future__ import annotations

import json
import os
import re
import shlex
import subprocess
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path


# Directory for node-exporter textfile metrics (env-overridable).
TEXTFILE_DIR = Path(
    os.environ.get(
        "NODE_EXPORTER_TEXTFILE_DIR",
        "/var/lib/node_exporter/textfile_collector",
    )
)
# File name of the metrics file for this exporter.
OUTPUT_NAME = "backup_health.prom"
# Value for the ``host`` label; defaults to the kernel node name.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Characters that need escaping inside a label value: quote, backslash, newline.
LABEL_RE = re.compile(r'["\\\n]')
# Shell files that are scanned for KEY=value backup settings.
BACKUP_COMMON_SH = Path(
    os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh")
)
BACKUP_OFFSITE_ENV = Path(
    os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env")
)
# Directories holding offsite success markers and escrow verification evidence.
OFFSITE_STATUS_DIR = Path(
    os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite")
)
ESCROW_EVIDENCE_DIR = Path(
    os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence")
)
# JSON status document describing the latest config-capture run.
CONFIG_CAPTURE_STATUS_FILE = Path(
    os.environ.get(
        "AIOPS_CONFIG_CAPTURE_STATUS_FILE",
        "/backup/status/backup-configs-last-status.json",
    )
)
# Credential escrow items for which verification evidence is expected.
ESCROW_ITEMS = [
    "restic_repository_password",
    "offsite_provider_credentials",
    "break_glass_admin_credentials",
    "dns_registrar_recovery",
    "oauth_ai_provider_recovery",
]


def _escape_label(value: str) -> str:
    """Escape *value* for use inside a double-quoted metric label.

    Backslash, double quote and newline are each replaced by their
    two-character backslash form; every other character is kept as is.
    """
    replacements = {"\\": r"\\", '"': r"\"", "\n": r"\n"}

    def _substitute(match):
        return replacements[match.group(0)]

    return LABEL_RE.sub(_substitute, value)


def _run(command: list[str], timeout: int = 30) -> tuple[int, str, str]:
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, check=False)
    except FileNotFoundError as exc:
        return 127, "", str(exc)
    except subprocess.TimeoutExpired as exc:
        stdout = exc.stdout if isinstance(exc.stdout, str) else ""
        stderr = exc.stderr if isinstance(exc.stderr, str) else "timeout"
        return 124, stdout, stderr
    return result.returncode, result.stdout, result.stderr


def _parse_time(value: str) -> int:
    if not value:
        return 0
    normalized = re.sub(r"\.(\d{6})\d+([+-]\d\d:\d\d|Z)$", r".\1\2", value)
    normalized = normalized.replace("Z", "+00:00")
    try:
        return int(datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp())
    except ValueError:
        return 0


def _parse_marker_timestamp(text: str) -> int:
    match = re.search(r"\b(\d{10})\b", text)
    if match:
        return int(match.group(1))
    for line in text.splitlines():
        parsed = _parse_time(line.strip())
        if parsed:
            return parsed
    return 0


def _marker_timestamp(paths: list[Path]) -> int:
    for path in paths:
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
            parsed = _parse_marker_timestamp(text)
            return parsed or int(path.stat().st_mtime)
        except OSError:
            continue
    return 0


def _shell_export_value(path: Path, key: str) -> str:
    try:
        lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return ""
    for line in lines:
        try:
            tokens = shlex.split(line, comments=True, posix=True)
        except ValueError:
            continue
        if tokens and tokens[0] == "export":
            tokens = tokens[1:]
        for token in tokens:
            if not token.startswith(f"{key}="):
                continue
            return token.split("=", 1)[1].strip()
    return ""


def _backup_config_value(key: str) -> str:
    """Return the configured value of *key* from the backup shell files.

    ``BACKUP_OFFSITE_ENV`` is consulted before ``BACKUP_COMMON_SH``; the
    first non-empty value wins. A value written exactly as
    ``${KEY:-default}`` is reduced to ``default``. Returns ``""`` when
    neither file assigns the key.
    """
    default_form = re.compile(r"\$\{" + re.escape(key) + r":-([^}]+)\}")
    for source in (BACKUP_OFFSITE_ENV, BACKUP_COMMON_SH):
        raw = _shell_export_value(source, key)
        if not raw:
            continue
        unwrapped = default_form.fullmatch(raw)
        return unwrapped.group(1) if unwrapped else raw
    return ""


def _configured_secret(value: str) -> bool:
    return value.strip() not in {"", "CHANGE_ME", "CHANGEME", "TODO", "REDACTED"}


def _b2_configured() -> bool:
    """Return True when all three B2 settings hold non-placeholder values.

    The settings are looked up in order and evaluation stops at the first
    one that is missing or a placeholder.
    """
    required = ("B2_ACCOUNT_ID", "B2_APPLICATION_KEY", "B2_BUCKET")
    return all(_configured_secret(_backup_config_value(name)) for name in required)


def _rclone_configured() -> bool:
    """Return True when an rclone remote appears to be configured.

    The remote name comes from the backup config files, then the
    ``OFFSITE_RCLONE_REMOTE`` environment variable, then ``gdrive``. When
    ``rclone listremotes`` succeeds and a remote name is known, the answer
    is whether that remote is listed. Otherwise any non-empty rclone config
    file in a known location counts as configured.
    """
    remote = _backup_config_value("OFFSITE_RCLONE_REMOTE") or os.environ.get("OFFSITE_RCLONE_REMOTE", "gdrive")
    status, listing, _ = _run(["rclone", "listremotes"], timeout=10)
    if status == 0 and remote:
        listed = {row.strip() for row in listing.splitlines()}
        return f"{remote}:" in listed

    fallback_configs = (
        Path.home() / ".config/rclone/rclone.conf",
        Path("/home/wooo/.config/rclone/rclone.conf"),
        Path("/root/.config/rclone/rclone.conf"),
        Path("/etc/rclone.conf"),
    )
    for config in fallback_configs:
        try:
            has_content = config.is_file() and config.stat().st_size > 0
        except OSError:
            continue
        if has_content:
            return True
    return False


def _cron_text() -> str:
    """Return the output of ``crontab -l``, or ``""`` when it fails."""
    status, listing, _ = _run(["crontab", "-l"], timeout=10)
    if status != 0:
        return ""
    return listing


def _active_cron_lines(cron: str) -> list[str]:
    return [line.strip() for line in cron.splitlines() if line.strip() and not line.lstrip().startswith("#")]


def _cron_duplicate_metric_lines(host: str, cron: str) -> list[str]:
    """Build metric lines about duplicated or missing crontab entries.

    Emits the number of exact duplicate active lines, then for each
    expected singleton command a match count and an ``ok`` flag that is 1
    only when exactly one active line contains the command.
    """
    entries = _active_cron_lines(cron)
    host_label = _escape_label(host)
    repeated = max(0, len(entries) - len(set(entries)))
    metrics = [f'awoooi_backup_cron_active_duplicate_count{{host="{host_label}"}} {repeated}']

    # (entry label, substring that identifies the cron command)
    singletons = (
        ("backup_health_exporter", "/home/wooo/scripts/backup-health-textfile-exporter.py"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_sync_gated", "/backup/scripts/sync-offsite-backups.sh --mode sync"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
    )
    for name, needle in singletons:
        hits = len([entry for entry in entries if needle in entry])
        selector = f'host="{host_label}",entry="{_escape_label(name)}"'
        metrics.append(f"awoooi_backup_cron_singular_entry_count{{{selector}}} {hits}")
        metrics.append(f"awoooi_backup_cron_singular_entry_ok{{{selector}}} {int(hits == 1)}")
    return metrics


def _newest_file_timestamp(patterns: list[str]) -> int:
    newest = 0
    for pattern in patterns:
        for path in Path("/").glob(pattern.lstrip("/")):
            try:
                if path.is_file():
                    newest = max(newest, int(path.stat().st_mtime))
            except OSError:
                continue
    return newest


def _read_backup_110_timestamp() -> int:
    candidates = [
        Path("/home/ollama/node_exporter_textfiles/backup.prom"),
        Path("/home/ollama/backup/110/last_success"),
    ]
    for path in candidates:
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        match = re.search(r"(?:backup_110_last_success_timestamp\s+)?(\d{10})", text)
        if match:
            return int(match.group(1))
    return 0


def _latest_restic_snapshot(repo: str) -> tuple[int, int]:
    password_file = os.environ.get("RESTIC_PASSWORD_FILE", "/backup/scripts/.restic-password")
    if not Path(repo).exists() or not Path(password_file).exists():
        return 0, 0
    rc, stdout, _ = _run(
        ["restic", "-r", repo, "snapshots", "--json", "--password-file", password_file],
        timeout=45,
    )
    if rc != 0:
        return 0, 0
    try:
        rows = json.loads(stdout)
    except json.JSONDecodeError:
        return 0, 0
    timestamps = [_parse_time(str(row.get("time", ""))) for row in rows]
    timestamps = [value for value in timestamps if value > 0]
    return (max(timestamps), len(timestamps)) if timestamps else (0, 0)


def _backup_all_failed_count_from_log(path: Path) -> tuple[int, int]:
    try:
        lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return 0, -1
    for line in reversed(lines):
        if "全服務備份完成" not in line:
            continue
        ts_match = re.match(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]", line)
        timestamp = 0
        if ts_match:
            timestamp = int(datetime.strptime(ts_match.group(1), "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc).timestamp()) - 8 * 3600
        failed_match = re.search(r"-\s+(\d+)\s+個失敗", line)
        if failed_match:
            return timestamp, int(failed_match.group(1))
        if "全部成功" in line:
            return timestamp, 0
    return 0, -1


def _latest_backup_all_failed_count() -> tuple[int, int]:
    """Return the newest aggregate-backup summary across the known logs.

    Both ``/backup/logs/cron.log`` and ``/backup/logs/backup.log`` are
    read; results without a timestamp or failure count are dropped and the
    one with the larger timestamp is returned (the cron log wins a tie).
    Returns ``(0, -1)`` when neither log yields a usable summary.
    """
    log_paths = (Path("/backup/logs/cron.log"), Path("/backup/logs/backup.log"))
    usable = []
    for log_path in log_paths:
        finished_at, failed = _backup_all_failed_count_from_log(log_path)
        if finished_at > 0 and failed >= 0:
            usable.append((finished_at, failed))
    if usable:
        return max(usable, key=lambda row: row[0])
    return 0, -1


def _read_key_value_status(path: str) -> dict[str, int | str]:
    values: dict[str, int | str] = {}
    try:
        lines = Path(path).read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return values
    for line in lines:
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()
        try:
            values[key] = int(float(value))
        except ValueError:
            values[key] = value
    return values


def _integrity_metric_lines(host: str) -> list[str]:
    """Build metric lines for the restic check and restore drill status.

    For each scope the status file is read and five samples are emitted:
    last-success timestamp (0 unless ``failed_count`` is 0), age, freshness
    (1 only when a timestamp exists, is within ``max_age_hours`` and
    ``failed_count`` is 0), failed repository count and checked repository
    count. Missing or non-numeric fields fall back to their defaults
    (timestamp 0, failed_count -1, checked_repo_count 0) instead of raising.
    """

    def _int_field(values, key, default):
        # Status values may be free-form strings; never let them raise.
        try:
            return int(values.get(key, default))
        except (TypeError, ValueError):
            return default

    now = int(time.time())
    specs = [
        ("restic_check", "/backup/integrity/check.status", 192),
        ("restore_drill", "/backup/integrity/restore-drill.status", 744),
    ]
    lines: list[str] = []
    for scope, path, max_age_hours in specs:
        values = _read_key_value_status(path)
        timestamp = _int_field(values, "timestamp", 0)
        failed_count = _int_field(values, "failed_count", -1)
        checked_count = _int_field(values, "checked_repo_count", 0)
        age = now - timestamp if timestamp else 0
        fresh = 1 if timestamp and age <= max_age_hours * 3600 and failed_count == 0 else 0
        labels = f'host="{_escape_label(host)}",scope="{scope}",max_age_hours="{max_age_hours}"'
        lines.extend(
            [
                f"awoooi_backup_integrity_last_success_timestamp{{{labels}}} {timestamp if failed_count == 0 else 0}",
                f"awoooi_backup_integrity_age_seconds{{{labels}}} {age}",
                f"awoooi_backup_integrity_fresh{{{labels}}} {fresh}",
                f"awoooi_backup_integrity_failed_repo_count{{{labels}}} {failed_count}",
                f"awoooi_backup_integrity_checked_repo_count{{{labels}}} {checked_count}",
            ]
        )
    return lines


def _config_capture_metric_lines(host: str) -> list[str]:
    """Build metric lines from the config-capture status JSON document.

    When ``CONFIG_CAPTURE_STATUS_FILE`` is unreadable, is not valid JSON or
    is not a JSON object, three placeholder samples are emitted (timestamp
    0, age 0, critical failed count -1). Otherwise the summary counters are
    emitted followed by one ``awoooi_backup_config_capture_ok`` sample per
    entry of ``items``. Malformed numeric fields fall back to their
    defaults and non-object items are skipped instead of raising.
    """

    def _to_int(raw, default):
        # Accept ints, floats and numeric strings; anything else -> default.
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            return int(float(raw))
        except (TypeError, ValueError, OverflowError):
            return default

    now = int(time.time())
    labels = f'host="{_escape_label(host)}"'
    unavailable = [
        f"awoooi_backup_config_capture_status_timestamp{{{labels}}} 0",
        f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} 0",
        f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} -1",
    ]
    try:
        document = json.loads(CONFIG_CAPTURE_STATUS_FILE.read_text(encoding="utf-8", errors="replace"))
    except (OSError, json.JSONDecodeError):
        return unavailable
    if not isinstance(document, dict):
        return unavailable

    timestamp = _to_int(document.get("timestamp") or 0, 0)
    critical_failed = _to_int(document.get("critical_failed_count", -1), -1)
    failed_count = _to_int(document.get("failed_count", -1), -1)
    snapshot_label = _escape_label(str(document.get("snapshot_id") or "unknown"))
    duration = _to_int(document.get("duration_seconds", 0) or 0, 0)
    age = now - timestamp if timestamp else 0
    lines = [
        f'awoooi_backup_config_capture_status_timestamp{{{labels},snapshot_id="{snapshot_label}"}} {timestamp}',
        f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} {age}",
        f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} {critical_failed}",
        f"awoooi_backup_config_capture_failed_count{{{labels}}} {failed_count}",
        f"awoooi_backup_config_capture_duration_seconds{{{labels}}} {duration}",
    ]
    items = document.get("items")
    if not isinstance(items, list):
        items = []
    for item in items:
        if not isinstance(item, dict):
            continue
        target = str(item.get("target") or "unknown")
        source = str(item.get("source") or "unknown")
        critical = "true" if item.get("critical") else "false"
        ok = 1 if item.get("ok") else 0
        item_labels = (
            f'host="{_escape_label(host)}",'
            f'target="{_escape_label(target)}",'
            f'source="{_escape_label(source)}",'
            f'critical="{critical}"'
        )
        lines.append(f"awoooi_backup_config_capture_ok{{{item_labels}}} {ok}")
    return lines


def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
    """Build offsite-copy, credential-escrow and DR-phase metric lines.

    Sections are emitted in this order: full offsite copy status per
    provider (b2, rclone), partial copy status per provider, the gated
    full-sync enable marker, one group per credential escrow item, and
    finally the escrow missing count plus the current DR phase and next
    step. Offsite markers are fresh for 48 hours and only count when the
    provider is configured; escrow evidence is fresh for 744 hours.
    """
    now = int(time.time())
    host_label = _escape_label(host)
    providers = ("b2", "rclone")

    def _age_and_fresh(stamp, max_hours, enabled=1):
        age_seconds = now - stamp if stamp else 0
        is_fresh = 1 if enabled and stamp and age_seconds <= max_hours * 3600 else 0
        return age_seconds, is_fresh

    configured = {"b2": int(_b2_configured()), "rclone": int(_rclone_configured())}
    full_markers = {
        "b2": [
            OFFSITE_STATUS_DIR / "b2-last-success",
            OFFSITE_STATUS_DIR / "b2.last_success",
            OFFSITE_STATUS_DIR / "last_success",
            Path("/backup/logs/offsite-b2.status"),
        ],
        "rclone": [
            OFFSITE_STATUS_DIR / "rclone-last-success",
            OFFSITE_STATUS_DIR / "rclone.last_success",
            OFFSITE_STATUS_DIR / "last_success",
            Path("/backup/logs/rclone-sync.status"),
        ],
    }
    partial_markers = {
        "b2": [
            OFFSITE_STATUS_DIR / "b2-partial-last-success",
            OFFSITE_STATUS_DIR / "b2.partial_last_success",
        ],
        "rclone": [
            OFFSITE_STATUS_DIR / "rclone-partial-last-success",
            OFFSITE_STATUS_DIR / "rclone.partial_last_success",
        ],
    }
    full_stamp: dict[str, int] = {}
    partial_stamp: dict[str, int] = {}
    for provider in providers:
        full_stamp[provider] = _marker_timestamp(full_markers[provider])
        partial_stamp[provider] = _marker_timestamp(partial_markers[provider])

    lines: list[str] = []

    # Full offsite copies.
    full_fresh_flags = []
    for provider in providers:
        stamp = full_stamp[provider]
        age, fresh = _age_and_fresh(stamp, 48, configured[provider])
        full_fresh_flags.append(fresh)
        labels = f'host="{host_label}",provider="{provider}",max_age_hours="48"'
        lines += [
            f"awoooi_backup_offsite_configured{{{labels}}} {configured[provider]}",
            f"awoooi_backup_offsite_last_success_timestamp{{{labels}}} {stamp}",
            f"awoooi_backup_offsite_age_seconds{{{labels}}} {age}",
            f"awoooi_backup_offsite_fresh{{{labels}}} {fresh}",
        ]

    # Partial offsite copies.
    partial_fresh_flags = []
    for provider in providers:
        stamp = partial_stamp[provider]
        partial_age, partial_fresh = _age_and_fresh(stamp, 48, configured[provider])
        partial_fresh_flags.append(partial_fresh)
        partial_labels = f'host="{host_label}",provider="{provider}",scope="partial",max_age_hours="48"'
        lines += [
            f"awoooi_backup_offsite_partial_last_success_timestamp{{{partial_labels}}} {stamp}",
            f"awoooi_backup_offsite_partial_age_seconds{{{partial_labels}}} {partial_age}",
            f"awoooi_backup_offsite_partial_fresh{{{partial_labels}}} {partial_fresh}",
        ]

    # Gate marker for the full rclone sync.
    enable_marker = OFFSITE_STATUS_DIR / "enable-rclone-sync"
    try:
        full_sync_enabled = 1 if enable_marker.is_file() else 0
        full_sync_enabled_timestamp = int(enable_marker.stat().st_mtime) if full_sync_enabled else 0
    except OSError:
        full_sync_enabled, full_sync_enabled_timestamp = 0, 0
    full_sync_labels = f'host="{host_label}",provider="rclone"'
    lines += [
        f"awoooi_backup_offsite_full_sync_enabled{{{full_sync_labels}}} {full_sync_enabled}",
        f"awoooi_backup_offsite_full_sync_enabled_timestamp{{{full_sync_labels}}} {full_sync_enabled_timestamp}",
    ]

    # Credential escrow evidence.
    escrow_missing_count = 0
    for item in ESCROW_ITEMS:
        verified_at = _marker_timestamp(
            [
                ESCROW_EVIDENCE_DIR / f"{item}.last_verified",
                ESCROW_EVIDENCE_DIR / f"{item}.verified",
                ESCROW_EVIDENCE_DIR / item,
            ]
        )
        age, fresh = _age_and_fresh(verified_at, 744)
        if not fresh:
            escrow_missing_count += 1
        labels = f'host="{host_label}",item="{item}",max_age_hours="744"'
        lines += [
            f"awoooi_backup_credential_escrow_expected_info{{{labels}}} 1",
            f"awoooi_backup_credential_escrow_last_verified_timestamp{{{labels}}} {verified_at}",
            f"awoooi_backup_credential_escrow_age_seconds{{{labels}}} {age}",
            f"awoooi_backup_credential_escrow_fresh{{{labels}}} {fresh}",
        ]

    # DR phase: the first unmet checkpoint decides phase number and next step.
    checkpoints = (
        (not any(configured.values()), "configure_google_drive_rclone_on_110_tty"),
        (not any(partial_fresh_flags), "run_small_dry_run_then_partial_sync"),
        (escrow_missing_count > 0, "complete_credential_escrow_review"),
        (not any(full_fresh_flags), "pre_full_sync_review"),
        (True, "offsite_and_escrow_ready"),
    )
    phase, next_step = next(
        (index, step) for index, (unmet, step) in enumerate(checkpoints, start=1) if unmet
    )
    step_label = _escape_label(next_step)
    lines += [
        f'awoooi_backup_dr_credential_escrow_missing_count{{host="{host_label}"}} {escrow_missing_count}',
        f'awoooi_backup_dr_phase{{host="{host_label}",next_step="{step_label}"}} {phase}',
        f'awoooi_backup_dr_next_step_info{{host="{host_label}",next_step="{step_label}"}} 1',
    ]
    return lines


def _retention_metric_lines(host: str) -> list[str]:
    """Build the two retention-policy metric lines.

    Each setting is read from the backup config files first and from the
    environment second. ``latest_only`` is 1 when the retention mode is
    ``latest`` with ``KEEP_LAST`` of ``1``; the offsite flag is 1 when
    ``OFFSITE_SYNC_DELETE_OLD`` is ``1``. Unset settings are labelled
    ``unknown``.
    """

    def _setting(name):
        return (_backup_config_value(name) or os.environ.get(name, "")).strip()

    mode = _setting("BACKUP_RETENTION_MODE")
    keep_last = _setting("KEEP_LAST")
    offsite_delete_old = _setting("OFFSITE_SYNC_DELETE_OLD")

    host_label = _escape_label(host)
    latest_only = int(mode == "latest" and keep_last == "1")
    offsite_mirror = int(offsite_delete_old == "1")

    restic_labels = (
        f'host="{host_label}",scope="restic",'
        f'mode="{_escape_label(mode or "unknown")}",'
        f'keep_last="{_escape_label(keep_last or "unknown")}"'
    )
    offsite_labels = (
        f'host="{host_label}",scope="offsite",provider="rclone",'
        f'delete_old="{_escape_label(offsite_delete_old or "unknown")}"'
    )
    return [
        f"awoooi_backup_retention_latest_only{{{restic_labels}}} {latest_only}",
        f"awoooi_backup_retention_offsite_delete_old_enabled{{{offsite_labels}}} {offsite_mirror}",
    ]


def _collect_velero_from_k8s() -> dict[str, int | str]:
    remote_script = r"""
python3 - <<'PY'
import datetime as dt
import json
import subprocess
import time


def kubectl(args):
    for prefix in (["sudo", "-n", "kubectl"], ["kubectl"]):
        result = subprocess.run(prefix + args, capture_output=True, text=True, timeout=20, check=False)
        if result.returncode == 0:
            return result.stdout
    return ""


def load_json(args):
    text = kubectl(args + ["-o", "json"])
    try:
        return json.loads(text) if text else {}
    except json.JSONDecodeError:
        return {}


def parse_ts(value):
    if not value:
        return 0
    try:
        return int(dt.datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp())
    except ValueError:
        return 0


now = int(time.time())
schedules = load_json(["get", "schedules.velero.io", "-n", "velero"]).get("items") or []
backups = load_json(["get", "backups.velero.io", "-n", "velero"]).get("items") or []
cron = load_json(["get", "cronjob", "-n", "velero", "backup-restore-test"])
jobs = load_json(["get", "jobs", "-n", "velero", "-l", "component=backup-restore-test"]).get("items") or []

completed = []
for item in backups:
    if item.get("status", {}).get("phase") != "Completed":
        continue
    timestamp = parse_ts(item.get("status", {}).get("completionTimestamp") or item.get("metadata", {}).get("creationTimestamp"))
    if timestamp:
        completed.append(timestamp)

failed_jobs = 0
for job in jobs:
    conditions = job.get("status", {}).get("conditions") or []
    if any(row.get("type") == "Failed" and row.get("status") == "True" for row in conditions):
        failed_jobs += 1

last_success = parse_ts((cron.get("status") or {}).get("lastSuccessfulTime"))
latest_backup = max(completed) if completed else 0

print("monitor_up=1")
print(f"schedule_count={len(schedules)}")
print(f"schedule_paused_count={sum(1 for item in schedules if item.get('spec', {}).get('paused'))}")
print(f"latest_completed_backup_timestamp={latest_backup}")
print(f"latest_completed_backup_age_seconds={now - latest_backup if latest_backup else 0}")
print(f"latest_completed_backup_fresh={1 if latest_backup and now - latest_backup <= 90000 else 0}")
print(f"restore_test_cron_present={1 if cron.get('metadata', {}).get('name') == 'backup-restore-test' else 0}")
print(f"restore_test_last_success_timestamp={last_success}")
print(f"restore_test_last_success_age_seconds={now - last_success if last_success else 0}")
print(f"restore_test_last_success_fresh={1 if last_success and now - last_success <= 691200 else 0}")
print(f"restore_test_failed_jobs={failed_jobs}")
PY
"""
    hosts = os.environ.get("AIOPS_K8S_QUERY_HOSTS", "192.168.0.120 192.168.0.121 192.168.0.125").split()
    values: dict[str, int | str] = {"monitor_up": 0, "source": "unreachable"}
    for host in hosts:
        rc, stdout, _ = _run(
            [
                "ssh",
                "-o",
                "BatchMode=yes",
                "-o",
                "StrictHostKeyChecking=accept-new",
                "-o",
                "ConnectTimeout=8",
                f"wooo@{host}",
                remote_script,
            ],
            timeout=45,
        )
        if rc != 0:
            continue
        parsed: dict[str, int | str] = {"source": f"{host}-kubectl"}
        for line in stdout.splitlines():
            if "=" not in line:
                continue
            key, value = line.split("=", 1)
            try:
                parsed[key.strip()] = int(float(value.strip()))
            except ValueError:
                continue
        if int(parsed.get("monitor_up", 0)) == 1:
            return parsed
    return values


def _velero_metric_lines(host: str) -> list[str]:
    """Render the collected Velero values as ``awoooi_velero_*`` samples.

    Every sample carries ``host``, ``source`` and ``namespace`` labels;
    some add a ``cronjob`` and/or ``max_age_hours`` label. Values missing
    from the collection result are reported as 0.
    """
    values = _collect_velero_from_k8s()
    source = str(values.get("source", "unreachable"))
    labels = f'host="{_escape_label(host)}",source="{_escape_label(source)}",namespace="velero"'
    cronjob = ',cronjob="backup-restore-test"'
    # (value key == metric name without the awoooi_velero_ prefix, extra labels)
    series = (
        ("monitor_up", ""),
        ("schedule_count", ""),
        ("schedule_paused_count", ""),
        ("latest_completed_backup_timestamp", ""),
        ("latest_completed_backup_age_seconds", ""),
        ("latest_completed_backup_fresh", ',max_age_hours="25"'),
        ("restore_test_cron_present", cronjob),
        ("restore_test_last_success_timestamp", cronjob),
        ("restore_test_last_success_age_seconds", cronjob),
        ("restore_test_last_success_fresh", cronjob + ',max_age_hours="192"'),
        ("restore_test_failed_jobs", cronjob),
    )
    return [f"awoooi_velero_{key}{{{labels}{extra}}} {values.get(key, 0)}" for key, extra in series]


def _metric_lines_for_job(
    *,
    host: str,
    job: str,
    source: str,
    target: str,
    backup_type: str,
    last_success: int,
    max_age_hours: float,
    sample_count: int = 0,
) -> list[str]:
    """Build the five per-job backup metric lines.

    Args:
        host, job, source, target, backup_type: Label values (escaped here;
            ``backup_type`` is emitted as the ``type`` label).
        last_success: Unix timestamp of the latest success evidence, or 0.
        max_age_hours: Freshness window, also emitted as a label.
        sample_count: Value reported as the snapshot count.

    Returns:
        Info, last-success timestamp, age, fresh flag and snapshot count
        samples, all sharing one label set. Age is 0 and fresh is 0 when
        ``last_success`` is not positive.
    """
    now = int(time.time())
    label_pairs = (
        ("host", host),
        ("job", job),
        ("type", backup_type),
        ("source", source),
        ("target", target),
    )
    labels = ",".join(f'{name}="{_escape_label(text)}"' for name, text in label_pairs)
    labels += f',max_age_hours="{max_age_hours:g}"'

    has_success = last_success > 0
    age = now - last_success if has_success else 0
    fresh = int(has_success and age <= int(max_age_hours * 3600))

    samples = (
        ("awoooi_backup_expected_job_info", 1),
        ("awoooi_backup_job_last_success_timestamp", last_success),
        ("awoooi_backup_job_age_seconds", age),
        ("awoooi_backup_job_fresh", fresh),
        ("awoooi_backup_job_snapshot_count", sample_count),
    )
    return [f"{metric}{{{labels}}} {reading}" for metric, reading in samples]


def _base_lines(host: str) -> list[str]:
    now = int(time.time())
    return [
        "# HELP awoooi_backup_health_monitor_up Whether the backup health exporter completed.",
        "# TYPE awoooi_backup_health_monitor_up gauge",
        "# HELP awoooi_backup_health_last_run_timestamp Unix timestamp of the last backup health exporter run.",
        "# TYPE awoooi_backup_health_last_run_timestamp gauge",
        "# HELP awoooi_backup_expected_job_info Expected backup job inventory.",
        "# TYPE awoooi_backup_expected_job_info gauge",
        "# HELP awoooi_backup_job_configured Whether the expected backup cron/config is present.",
        "# TYPE awoooi_backup_job_configured gauge",
        "# HELP awoooi_backup_script_present Whether the backup script exists on this host.",
        "# TYPE awoooi_backup_script_present gauge",
        "# HELP awoooi_backup_job_last_success_timestamp Unix timestamp of the latest successful backup evidence.",
        "# TYPE awoooi_backup_job_last_success_timestamp gauge",
        "# HELP awoooi_backup_job_age_seconds Age of the latest successful backup evidence.",
        "# TYPE awoooi_backup_job_age_seconds gauge",
        "# HELP awoooi_backup_job_fresh Whether the latest successful backup evidence is within max_age_hours.",
        "# TYPE awoooi_backup_job_fresh gauge",
        "# HELP awoooi_backup_job_snapshot_count Number of snapshots or files considered for this job.",
        "# TYPE awoooi_backup_job_snapshot_count gauge",
        "# HELP awoooi_backup_last_run_failed_count Failed component count from the last aggregate backup run.",
        "# TYPE awoooi_backup_last_run_failed_count gauge",
        "# HELP awoooi_backup_integrity_last_success_timestamp Unix timestamp of latest successful backup integrity or restore drill run.",
        "# TYPE awoooi_backup_integrity_last_success_timestamp gauge",
        "# HELP awoooi_backup_integrity_age_seconds Age of backup integrity or restore drill status.",
        "# TYPE awoooi_backup_integrity_age_seconds gauge",
        "# HELP awoooi_backup_integrity_fresh Whether backup integrity or restore drill status is fresh and successful.",
        "# TYPE awoooi_backup_integrity_fresh gauge",
        "# HELP awoooi_backup_integrity_failed_repo_count Failed repository count from backup integrity or restore drill run.",
        "# TYPE awoooi_backup_integrity_failed_repo_count gauge",
        "# HELP awoooi_backup_integrity_checked_repo_count Checked repository count from backup integrity or restore drill run.",
        "# TYPE awoooi_backup_integrity_checked_repo_count gauge",
        "# HELP awoooi_backup_config_capture_status_timestamp Unix timestamp of the latest config-capture coverage status.",
        "# TYPE awoooi_backup_config_capture_status_timestamp gauge",
        "# HELP awoooi_backup_config_capture_status_age_seconds Age of the latest config-capture coverage status.",
        "# TYPE awoooi_backup_config_capture_status_age_seconds gauge",
        "# HELP awoooi_backup_config_capture_critical_failed_count Critical config-capture targets missing from the latest configs backup.",
        "# TYPE awoooi_backup_config_capture_critical_failed_count gauge",
        "# HELP awoooi_backup_config_capture_failed_count Total config-capture targets missing from the latest configs backup.",
        "# TYPE awoooi_backup_config_capture_failed_count gauge",
        "# HELP awoooi_backup_config_capture_duration_seconds Duration of the latest configs backup capture run.",
        "# TYPE awoooi_backup_config_capture_duration_seconds gauge",
        "# HELP awoooi_backup_config_capture_ok Whether the latest configs backup captured a specific target.",
        "# TYPE awoooi_backup_config_capture_ok gauge",
        "# HELP awoooi_backup_offsite_configured Whether an offsite backup provider appears configured without exposing credentials.",
        "# TYPE awoooi_backup_offsite_configured gauge",
        "# HELP awoooi_backup_offsite_last_success_timestamp Unix timestamp of latest offsite copy success marker.",
        "# TYPE awoooi_backup_offsite_last_success_timestamp gauge",
        "# HELP awoooi_backup_offsite_age_seconds Age of latest offsite copy success marker.",
        "# TYPE awoooi_backup_offsite_age_seconds gauge",
        "# HELP awoooi_backup_offsite_fresh Whether offsite copy success marker is fresh.",
        "# TYPE awoooi_backup_offsite_fresh gauge",
        "# HELP awoooi_backup_offsite_partial_last_success_timestamp Unix timestamp of latest partial offsite copy success marker.",
        "# TYPE awoooi_backup_offsite_partial_last_success_timestamp gauge",
        "# HELP awoooi_backup_offsite_partial_age_seconds Age of latest partial offsite copy success marker.",
        "# TYPE awoooi_backup_offsite_partial_age_seconds gauge",
        "# HELP awoooi_backup_offsite_partial_fresh Whether partial offsite copy success marker is fresh.",
        "# TYPE awoooi_backup_offsite_partial_fresh gauge",
        "# HELP awoooi_backup_offsite_full_sync_enabled Whether the gated full offsite sync enable marker exists.",
        "# TYPE awoooi_backup_offsite_full_sync_enabled gauge",
        "# HELP awoooi_backup_offsite_full_sync_enabled_timestamp Unix timestamp of the gated full offsite sync enable marker.",
        "# TYPE awoooi_backup_offsite_full_sync_enabled_timestamp gauge",
        "# HELP awoooi_backup_credential_escrow_expected_info Expected credential escrow evidence inventory.",
        "# TYPE awoooi_backup_credential_escrow_expected_info gauge",
        "# HELP awoooi_backup_credential_escrow_last_verified_timestamp Unix timestamp of credential escrow verification evidence.",
        "# TYPE awoooi_backup_credential_escrow_last_verified_timestamp gauge",
        "# HELP awoooi_backup_credential_escrow_age_seconds Age of credential escrow verification evidence.",
        "# TYPE awoooi_backup_credential_escrow_age_seconds gauge",
        "# HELP awoooi_backup_credential_escrow_fresh Whether credential escrow verification evidence is fresh.",
        "# TYPE awoooi_backup_credential_escrow_fresh gauge",
        "# HELP awoooi_backup_dr_credential_escrow_missing_count Number of credential escrow items that still need fresh human verification.",
        "# TYPE awoooi_backup_dr_credential_escrow_missing_count gauge",
        "# HELP awoooi_backup_dr_phase Numeric DR offsite completion phase for AI/operator triage.",
        "# TYPE awoooi_backup_dr_phase gauge",
        "# HELP awoooi_backup_dr_next_step_info Current human-safe next step for DR offsite completion.",
        "# TYPE awoooi_backup_dr_next_step_info gauge",
        "# HELP awoooi_backup_retention_latest_only Whether local restic backup retention is configured as latest-only keep-last=1.",
        "# TYPE awoooi_backup_retention_latest_only gauge",
        "# HELP awoooi_backup_retention_offsite_delete_old_enabled Whether offsite rclone sync is allowed to delete old remote backup files after successful mirror.",
        "# TYPE awoooi_backup_retention_offsite_delete_old_enabled gauge",
        "# HELP awoooi_backup_cron_active_duplicate_count Number of exact duplicate active crontab entries on the backup host.",
        "# TYPE awoooi_backup_cron_active_duplicate_count gauge",
        "# HELP awoooi_backup_cron_singular_entry_count Number of active crontab entries matching a backup/offsite singleton pattern.",
        "# TYPE awoooi_backup_cron_singular_entry_count gauge",
        "# HELP awoooi_backup_cron_singular_entry_ok Whether a backup/offsite singleton cron pattern has exactly one active entry.",
        "# TYPE awoooi_backup_cron_singular_entry_ok gauge",
        "# HELP awoooi_velero_monitor_up Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint.",
        "# TYPE awoooi_velero_monitor_up gauge",
        "# HELP awoooi_velero_schedule_count Number of Velero schedules in the velero namespace.",
        "# TYPE awoooi_velero_schedule_count gauge",
        "# HELP awoooi_velero_schedule_paused_count Number of paused Velero schedules.",
        "# TYPE awoooi_velero_schedule_paused_count gauge",
        "# HELP awoooi_velero_latest_completed_backup_timestamp Unix timestamp of latest Completed Velero backup.",
        "# TYPE awoooi_velero_latest_completed_backup_timestamp gauge",
        "# HELP awoooi_velero_latest_completed_backup_age_seconds Age of latest Completed Velero backup.",
        "# TYPE awoooi_velero_latest_completed_backup_age_seconds gauge",
        "# HELP awoooi_velero_latest_completed_backup_fresh Whether latest Completed Velero backup is within max_age_hours.",
        "# TYPE awoooi_velero_latest_completed_backup_fresh gauge",
        "# HELP awoooi_velero_restore_test_cron_present Whether backup-restore-test CronJob exists.",
        "# TYPE awoooi_velero_restore_test_cron_present gauge",
        "# HELP awoooi_velero_restore_test_last_success_timestamp Unix timestamp of backup-restore-test lastSuccessfulTime.",
        "# TYPE awoooi_velero_restore_test_last_success_timestamp gauge",
        "# HELP awoooi_velero_restore_test_last_success_age_seconds Age of backup-restore-test lastSuccessfulTime.",
        "# TYPE awoooi_velero_restore_test_last_success_age_seconds gauge",
        "# HELP awoooi_velero_restore_test_last_success_fresh Whether backup-restore-test lastSuccessfulTime is within max_age_hours.",
        "# TYPE awoooi_velero_restore_test_last_success_fresh gauge",
        "# HELP awoooi_velero_restore_test_failed_jobs Failed backup-restore-test jobs retained in velero namespace.",
        "# TYPE awoooi_velero_restore_test_failed_jobs gauge",
        f'awoooi_backup_health_monitor_up{{host="{_escape_label(host)}"}} 1',
        f'awoooi_backup_health_last_run_timestamp{{host="{_escape_label(host)}"}} {now}',
    ]


def _collect_110(host: str) -> list[str]:
    """Build every metric line emitted on the "110" backup host.

    Starts from the shared base lines, then appends, in order: expected cron
    entries, expected script files under /backup/scripts, per-repository
    restic freshness, the backup-all aggregate, and the integrity,
    config-capture, offsite/escrow, retention, cron-duplicate and Velero
    metric groups.
    """
    crontab = _cron_text()
    out = _base_lines(host)
    # The host label is identical on every line below, so escape it once.
    host_label = f'host="{_escape_label(host)}"'

    # A cron job counts as configured when its command substring appears in the crontab text.
    cron_patterns = (
        ("backup_all", "/backup/scripts/backup-all.sh"),
        ("awoooi_frequent", "/backup/scripts/backup-awoooi-frequent.sh"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_sync_gated", "/backup/offsite/enable-rclone-sync"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
        ("backup_integrity_check", "/backup/scripts/check-backup-integrity.sh --mode check"),
        ("backup_restore_drill", "/backup/scripts/check-backup-integrity.sh --mode restore-drill"),
    )
    out.extend(
        f'awoooi_backup_job_configured{{{host_label},job="{_escape_label(name)}"}} {int(needle in crontab)}'
        for name, needle in cron_patterns
    )

    # Script presence here is a plain existence check (no executable-bit test).
    scripts_dir = Path("/backup/scripts")
    script_names = (
        "backup-all.sh",
        "backup-awoooi.sh",
        "backup-awoooi-frequent.sh",
        "backup-configs.sh",
        "backup-sentry.sh",
        "backup-ai-artifacts.sh",
        "backup-public-routes.sh",
        "configure-offsite-rclone.sh",
        "configure-offsite-b2.sh",
        "sync-offsite-backups.sh",
        "backup-offsite-readiness-gate.sh",
        "offsite-escrow-evidence-report.sh",
        "verify-offsite-full-sync.sh",
        "mark-credential-escrow-verified.sh",
        "check-backup-integrity.sh",
        "backup-gitea.sh",
        "backup-harbor.sh",
        "backup-momo.sh",
        "backup-langfuse.sh",
        "backup-monitoring.sh",
        "backup-signoz.sh",
        "backup-open-webui.sh",
        "backup-clawbot.sh",
    )
    out.extend(
        f'awoooi_backup_script_present{{{host_label},script="{_escape_label(name)}"}} '
        f"{int((scripts_dir / name).exists())}"
        for name in script_names
    )

    # (job name, restic repository path, max acceptable snapshot age in hours)
    restic_targets = (
        ("awoooi_db", "/backup/awoooi", 7),
        ("configs", "/backup/configs", 48),
        ("sentry", "/backup/sentry", 48),
        ("gitea", "/backup/gitea", 48),
        ("harbor", "/backup/harbor", 48),
        ("momo", "/backup/momo", 48),
        ("langfuse", "/backup/langfuse", 48),
        ("monitoring", "/backup/monitoring", 48),
        ("signoz", "/backup/signoz", 48),
        ("open_webui", "/backup/open-webui", 48),
        ("clawbot", "/backup/clawbot", 48),
        ("ai_artifacts", "/backup/ai-artifacts", 48),
        ("public_routes", "/backup/public-routes", 168),
    )
    for job_name, repo_path, age_limit_hours in restic_targets:
        newest_ts, snapshot_count = _latest_restic_snapshot(repo_path)
        job_lines = _metric_lines_for_job(
            host=host,
            job=job_name,
            source="110-restic",
            target=repo_path,
            backup_type="restic",
            last_success=newest_ts,
            max_age_hours=age_limit_hours,
            sample_count=snapshot_count,
        )
        out.extend(job_lines)

    # The aggregate run only counts as a success when it reported zero failures;
    # otherwise its success timestamp is published as 0.
    run_ts, failures = _latest_backup_all_failed_count()
    aggregate_labels = f'{host_label},job="backup_all"'
    if failures == 0:
        reported_ts = run_ts
    else:
        reported_ts = 0
    out.append(f"awoooi_backup_last_run_failed_count{{{aggregate_labels}}} {failures}")
    out.append(
        f'awoooi_backup_job_last_success_timestamp{{{aggregate_labels},type="aggregate",source="110-cron-log",target="/backup/logs/cron.log",max_age_hours="48"}} {reported_ts}'
    )

    out.extend(_integrity_metric_lines(host))
    out.extend(_config_capture_metric_lines(host))
    out.extend(_offsite_and_escrow_metric_lines(host))
    out.extend(_retention_metric_lines(host))
    out.extend(_cron_duplicate_metric_lines(host, crontab))
    out.extend(_velero_metric_lines(host))
    return out


def _collect_188(host: str) -> list[str]:
    """Build every metric line emitted on the "188" host.

    Starts from the shared base lines, then appends cron and script presence
    for the two pull/dump jobs, followed by freshness metrics for the rsync
    copy pulled from 110 and for the newest momo PostgreSQL dump.
    """
    crontab = _cron_text()
    out = _base_lines(host)
    # The host label is identical on every line below, so escape it once.
    host_label = f'host="{_escape_label(host)}"'

    cron_patterns = (
        ("backup_from_110", "/home/ollama/bin/backup-from-110.sh"),
        ("momo_pg_daily", "/home/ollama/bin/momo-pg-backup.sh"),
    )
    out.extend(
        f'awoooi_backup_job_configured{{{host_label},job="{_escape_label(name)}"}} {int(needle in crontab)}'
        for name, needle in cron_patterns
    )

    # On this host a script only counts as present when it exists AND is executable.
    script_paths = (
        "/home/ollama/bin/backup-from-110.sh",
        "/home/ollama/bin/momo-pg-backup.sh",
        "/home/ollama/awoooi-ops/pg-backup.sh",
    )
    for script_path in script_paths:
        as_path = Path(script_path)
        runnable = as_path.exists() and os.access(script_path, os.X_OK)
        script_label = f'script="{_escape_label(as_path.name)}"'
        out.append(f"awoooi_backup_script_present{{{host_label},{script_label}}} {int(runnable)}")

    rsync_lines = _metric_lines_for_job(
        host=host,
        job="backup_from_110",
        source="188-rsync",
        target="/home/ollama/backup/110",
        backup_type="rsync",
        last_success=_read_backup_110_timestamp(),
        max_age_hours=25,
        sample_count=1,
    )
    out.extend(rsync_lines)

    # The momo dump may land in any of these locations; the newest match wins.
    dump_globs = [
        "/home/ollama/momo_backups/*.sql.gz",
        "/home/ollama/momo-pro/backups/*.sql.gz",
        "/home/ollama/backups/momo_analytics_*.sql.gz",
    ]
    newest_dump_ts = _newest_file_timestamp(dump_globs)
    dump_lines = _metric_lines_for_job(
        host=host,
        job="momo_pg_daily",
        source="188-pg-dump",
        target="/home/ollama/momo_backups",
        backup_type="pg_dump",
        last_success=newest_dump_ts,
        max_age_hours=30,
        # One sample when a dump timestamp was found, zero otherwise.
        sample_count=int(bool(newest_dump_ts)),
    )
    out.extend(dump_lines)
    return out


def collect() -> str:
    """Render the metrics payload for the configured host label.

    Host labels "110" and "188" use their dedicated collectors; any other
    label emits only the base lines. The returned text is the metric lines
    joined by newlines, with one trailing newline.
    """
    collectors = {
        "110": _collect_110,
        "188": _collect_188,
    }
    gather = collectors.get(HOST_LABEL, _base_lines)
    return "\n".join(gather(HOST_LABEL)) + "\n"


def main() -> None:
    """Collect the metrics and publish them atomically as a textfile.

    The payload is written to a temporary file inside TEXTFILE_DIR, made
    world-readable, and only then renamed over OUTPUT_NAME, so a reader never
    sees a partially written or unreadable file at the published path.

    Raises:
        OSError: if the directory cannot be created or the file cannot be
            written, chmod-ed or renamed. The temporary file is removed
            before the error propagates.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # NamedTemporaryFile creates the file owner-only; widen the mode BEFORE
        # the rename so the published path is never briefly unreadable to a
        # scraper running as a different user.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave stray temp files behind in the collector directory.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise


# Script entry point: run the exporter only when executed directly, not on import.
if __name__ == "__main__":
    main()