927 lines
42 KiB
Python
Executable File
927 lines
42 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Backup health textfile exporter for full-stack reboot readiness.
|
|
|
|
2026-05-06 ogt + Codex: backup coverage follow-up after the reboot incident.
|
|
Why: a green service gate is not enough if the last restorable copy is stale.
|
|
This exporter is read-only; it checks cron/script presence and the latest
|
|
successful backup evidence, then writes node-exporter textfile metrics.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import shlex
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
# Output directory for the generated .prom file; the name of the environment
# variable suggests it is the node-exporter textfile collector directory.
TEXTFILE_DIR = Path(
    os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector")
)
OUTPUT_NAME = "backup_health.prom"

# Value used for the host="..." label; defaults to this machine's node name.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)

# Characters that need escaping inside a quoted label value:
# double quote, backslash and newline.
LABEL_RE = re.compile(r'["\\\n]')

# Shell files that are scanned (never executed) for KEY=value settings.
BACKUP_COMMON_SH = Path(
    os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh")
)
BACKUP_OFFSITE_ENV = Path(
    os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env")
)

# Locations of marker / status files read by the metric builders.
OFFSITE_STATUS_DIR = Path(
    os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite")
)
ESCROW_EVIDENCE_DIR = Path(
    os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence")
)
CONFIG_CAPTURE_STATUS_FILE = Path(
    os.environ.get(
        "AIOPS_CONFIG_CAPTURE_STATUS_FILE",
        "/backup/status/backup-configs-last-status.json",
    )
)

# Credential-escrow items for which a verification marker is looked up in
# ESCROW_EVIDENCE_DIR; each one yields its own set of escrow metrics.
ESCROW_ITEMS = [
    "restic_repository_password",
    "offsite_provider_credentials",
    "break_glass_admin_credentials",
    "dns_registrar_recovery",
    "oauth_ai_provider_recovery",
]
|
|
|
|
|
|
def _escape_label(value: str) -> str:
    """Escape *value* so it can be placed inside a quoted label value.

    Backslash, double quote and newline are replaced by their two-character
    escape sequences; every other character is passed through unchanged.
    """
    escapes = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": "\\n"})
    return value.translate(escapes)
|
|
|
|
|
|
def _run(command: list[str], timeout: int = 30) -> tuple[int, str, str]:
    """Run *command* without a shell and return ``(returncode, stdout, stderr)``.

    The function never raises for the usual launch failures; they are mapped
    to shell-like exit codes instead so callers can treat them as "no data":

    * 127 - the executable does not exist,
    * 126 - the executable could not be started for another OS-level reason
      (for example it is not executable),
    * 124 - the command did not finish within *timeout* seconds.
    """

    def _as_text(data: object, default: str) -> str:
        # On timeout the captured output is bytes even in text mode.
        if isinstance(data, bytes):
            return data.decode("utf-8", errors="replace")
        if isinstance(data, str):
            return data
        return default

    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            errors="replace",  # undecodable output must not abort the exporter
            timeout=timeout,
            check=False,
        )
    except FileNotFoundError as exc:
        return 127, "", str(exc)
    except subprocess.TimeoutExpired as exc:
        return 124, _as_text(exc.stdout, ""), _as_text(exc.stderr, "timeout")
    except OSError as exc:
        # PermissionError and friends: report instead of crashing the run.
        return 126, "", str(exc)
    return result.returncode, result.stdout, result.stderr
|
|
|
|
|
|
def _parse_time(value: str) -> int:
    """Convert an ISO-8601 timestamp string to Unix seconds, or 0 on failure.

    Fractional seconds are normalised to exactly six digits (truncated or
    zero-padded) before parsing, because ``datetime.fromisoformat`` on older
    Python versions only accepts three or six fractional digits. A trailing
    ``Z`` is treated as UTC. Values without any UTC offset are interpreted in
    the local time zone by ``astimezone``.

    Returns 0 for empty or unparseable input.
    """
    if not value:
        return 0
    normalized = re.sub(
        r"(\d{2}:\d{2}:\d{2})\.(\d+)",
        lambda m: f"{m.group(1)}.{m.group(2)[:6]:0<6}",
        value.strip(),
        count=1,
    )
    normalized = normalized.replace("Z", "+00:00")
    try:
        moment = datetime.fromisoformat(normalized)
        return int(moment.astimezone(timezone.utc).timestamp())
    except (ValueError, OverflowError, OSError):
        # Out-of-range dates can raise OverflowError/OSError during conversion.
        return 0
|
|
|
|
|
|
def _parse_marker_timestamp(text: str) -> int:
    """Extract a Unix timestamp from the contents of a marker file.

    A standalone 10-digit number anywhere in *text* wins. Otherwise each line
    is tried as an ISO-8601 timestamp via ``_parse_time`` and the first one
    that parses is returned. Returns 0 when nothing usable is found.
    """
    epoch = re.search(r"\b(\d{10})\b", text)
    if epoch is not None:
        return int(epoch.group(1))
    parsed_lines = (_parse_time(raw.strip()) for raw in text.splitlines())
    return next((stamp for stamp in parsed_lines if stamp), 0)
|
|
|
|
|
|
def _marker_timestamp(paths: list[Path]) -> int:
    """Return the timestamp of the first readable marker file in *paths*.

    The timestamp embedded in the file content is preferred; when the content
    holds none, the file's modification time is used. Unreadable candidates
    are skipped. Returns 0 when no candidate could be read.
    """
    for marker in paths:
        try:
            content = marker.read_text(encoding="utf-8", errors="replace")
            embedded = _parse_marker_timestamp(content)
            if embedded:
                return embedded
            return int(marker.stat().st_mtime)
        except OSError:
            continue
    return 0
|
|
|
|
|
|
def _shell_export_value(path: Path, key: str) -> str:
    """Look up ``KEY=value`` in a shell file without executing it.

    Each line is tokenised with ``shlex`` (comments ignored, POSIX quoting),
    an optional leading ``export`` word is dropped, and the value of the first
    ``key=...`` token found in the file is returned, stripped of surrounding
    whitespace. Lines that cannot be tokenised are skipped. Returns "" when
    the file cannot be read or the key is absent.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
    prefix = f"{key}="
    for raw_line in content.splitlines():
        try:
            words = shlex.split(raw_line, comments=True, posix=True)
        except ValueError:
            # e.g. an unterminated quote; ignore the line
            continue
        if words[:1] == ["export"]:
            words = words[1:]
        for word in words:
            if word.startswith(prefix):
                return word.partition("=")[2].strip()
    return ""
|
|
|
|
|
|
def _backup_config_value(key: str) -> str:
    """Return the configured value of *key* from the backup shell files.

    ``BACKUP_OFFSITE_ENV`` is consulted before ``BACKUP_COMMON_SH``; the first
    file that yields a non-empty value decides. When that value has the exact
    form ``${KEY:-default}``, only ``default`` is returned. Returns "" when
    neither file defines the key.
    """
    default_form = re.compile(r"\$\{" + re.escape(key) + r":-([^}]+)\}")
    for config_file in (BACKUP_OFFSITE_ENV, BACKUP_COMMON_SH):
        raw = _shell_export_value(config_file, key)
        if not raw:
            continue
        fallback = default_form.fullmatch(raw)
        return fallback.group(1) if fallback else raw
    return ""
|
|
|
|
|
|
def _configured_secret(value: str) -> bool:
    """Tell whether *value* looks like a real setting rather than a placeholder.

    Empty strings and the literal placeholders CHANGE_ME, CHANGEME, TODO and
    REDACTED (compared after stripping whitespace, case-sensitively) count as
    "not configured".
    """
    placeholders = ("", "CHANGE_ME", "CHANGEME", "TODO", "REDACTED")
    candidate = value.strip()
    return candidate not in placeholders
|
|
|
|
|
|
def _b2_configured() -> bool:
    """Return True when all three B2 settings hold non-placeholder values.

    Only the presence of the settings is checked; their values are never
    emitted.
    """
    required = ("B2_ACCOUNT_ID", "B2_APPLICATION_KEY", "B2_BUCKET")
    return all(_configured_secret(_backup_config_value(name)) for name in required)
|
|
|
|
|
|
def _rclone_configured() -> bool:
    """Return True when an rclone remote appears to be configured.

    The remote name comes from the backup config files, then from the
    ``OFFSITE_RCLONE_REMOTE`` environment variable, then defaults to
    ``gdrive``. When ``rclone listremotes`` succeeds, the answer is whether
    that remote is listed. When it fails, any non-empty rclone config file in
    one of the known locations counts as configured.
    """
    remote = _backup_config_value("OFFSITE_RCLONE_REMOTE") or os.environ.get("OFFSITE_RCLONE_REMOTE", "gdrive")
    rc, stdout, _ = _run(["rclone", "listremotes"], timeout=10)
    if rc == 0 and remote:
        listed = {entry.strip() for entry in stdout.splitlines()}
        return f"{remote}:" in listed

    # rclone itself could not be queried: fall back to config-file presence.
    config_candidates = (
        Path.home() / ".config/rclone/rclone.conf",
        Path("/home/wooo/.config/rclone/rclone.conf"),
        Path("/root/.config/rclone/rclone.conf"),
        Path("/etc/rclone.conf"),
    )
    for candidate in config_candidates:
        try:
            if candidate.is_file() and candidate.stat().st_size > 0:
                return True
        except OSError:
            continue
    return False
|
|
|
|
|
|
def _cron_text() -> str:
    """Return the output of ``crontab -l``, or "" when the command fails."""
    rc, stdout, _ = _run(["crontab", "-l"], timeout=10)
    if rc != 0:
        return ""
    return stdout
|
|
|
|
|
|
def _active_cron_lines(cron: str) -> list[str]:
    """Return the stripped, non-blank, non-comment lines of a crontab text."""
    active: list[str] = []
    for raw in cron.splitlines():
        entry = raw.strip()
        if entry and not entry.startswith("#"):
            active.append(entry)
    return active
|
|
|
|
|
|
def _cron_duplicate_metric_lines(host: str, cron: str) -> list[str]:
    """Build metrics about duplicated crontab entries.

    Emits one ``awoooi_backup_cron_active_duplicate_count`` sample (number of
    active lines minus number of distinct active lines) and, for each
    singleton pattern, the number of active lines containing the pattern plus
    a 0/1 flag that is 1 only when exactly one line matches.
    """
    entries = _active_cron_lines(cron)
    host_label = _escape_label(host)
    duplicates = max(0, len(entries) - len(set(entries)))
    metrics: list[str] = [
        f'awoooi_backup_cron_active_duplicate_count{{host="{host_label}"}} {duplicates}'
    ]

    # (entry label, substring that must occur in exactly one active cron line)
    singletons = (
        ("backup_health_exporter", "/home/wooo/scripts/backup-health-textfile-exporter.py"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_sync_gated", "/backup/scripts/sync-offsite-backups.sh --mode sync"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
    )
    for name, needle in singletons:
        hits = len([entry for entry in entries if needle in entry])
        labels = f'host="{host_label}",entry="{_escape_label(name)}"'
        metrics.append(f"awoooi_backup_cron_singular_entry_count{{{labels}}} {hits}")
        metrics.append(f"awoooi_backup_cron_singular_entry_ok{{{labels}}} {int(hits == 1)}")
    return metrics
|
|
|
|
|
|
def _newest_file_timestamp(patterns: list[str]) -> int:
    """Return the newest modification time among files matching *patterns*.

    Each pattern is a glob evaluated relative to the filesystem root (leading
    slashes are stripped first). Only regular files count; entries that cannot
    be inspected are ignored. Returns 0 when nothing matches.
    """
    root = Path("/")
    latest = 0
    for pattern in patterns:
        for candidate in root.glob(pattern.lstrip("/")):
            try:
                if not candidate.is_file():
                    continue
                mtime = int(candidate.stat().st_mtime)
            except OSError:
                continue
            if mtime > latest:
                latest = mtime
    return latest
|
|
|
|
|
|
def _read_backup_110_timestamp() -> int:
    """Return the last-success Unix timestamp recorded for the "110" backup.

    Two candidate files are tried in order and the first one that yields a
    timestamp wins. Within a file, a ``backup_110_last_success_timestamp``
    sample line (with or without a label set) is preferred, so that an
    unrelated 10-digit number earlier in the file is not mistaken for it.
    Only when no such sample exists is the first 10-digit run in the file
    used. Returns 0 when no candidate provides a timestamp.
    """
    named_sample = re.compile(
        r"^backup_110_last_success_timestamp(?:\{[^}]*\})?\s+(\d{10})",
        re.MULTILINE,
    )
    any_epoch = re.compile(r"(\d{10})")
    candidates = [
        Path("/home/ollama/node_exporter_textfiles/backup.prom"),
        Path("/home/ollama/backup/110/last_success"),
    ]
    for path in candidates:
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        match = named_sample.search(text) or any_epoch.search(text)
        if match:
            return int(match.group(1))
    return 0
|
|
|
|
|
|
def _latest_restic_snapshot(repo: str) -> tuple[int, int]:
    """Return ``(newest_snapshot_timestamp, snapshot_count)`` for a restic repo.

    The password file path comes from ``RESTIC_PASSWORD_FILE`` (default
    ``/backup/scripts/.restic-password``). ``(0, 0)`` is returned when the
    repository or password file is missing, when ``restic snapshots --json``
    fails, or when its output is not a JSON list. Only snapshots whose
    ``time`` field parses to a positive timestamp are counted.
    """
    password_file = os.environ.get("RESTIC_PASSWORD_FILE", "/backup/scripts/.restic-password")
    if not Path(repo).exists() or not Path(password_file).exists():
        return 0, 0
    rc, stdout, _ = _run(
        ["restic", "-r", repo, "snapshots", "--json", "--password-file", password_file],
        timeout=45,
    )
    if rc != 0:
        return 0, 0
    try:
        rows = json.loads(stdout)
    except json.JSONDecodeError:
        return 0, 0
    if not isinstance(rows, list):
        # Valid JSON that is not a list (e.g. null) means "no usable data".
        return 0, 0
    timestamps = [
        _parse_time(str(row.get("time", "")))
        for row in rows
        if isinstance(row, dict)
    ]
    timestamps = [value for value in timestamps if value > 0]
    return (max(timestamps), len(timestamps)) if timestamps else (0, 0)
|
|
|
|
|
|
def _backup_all_failed_count_from_log(path: Path) -> tuple[int, int]:
    """Read the most recent aggregate-backup summary from a log file.

    The log is scanned from the end for lines containing the completion
    marker. For the newest such line that also states a result, the function
    returns ``(timestamp, failed_count)``, where ``failed_count`` is the
    number in front of the failure marker, or 0 for the all-succeeded marker.
    The timestamp is taken from a leading ``[YYYY-mm-dd HH:MM:SS]`` prefix
    (0 when absent). Returns ``(0, -1)`` when the file is unreadable or has
    no usable summary line.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return 0, -1

    summaries = [entry for entry in content.splitlines() if "全服務備份完成" in entry]
    for entry in reversed(summaries):
        stamp = re.match(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]", entry)
        if stamp is None:
            epoch = 0
        else:
            as_utc = datetime.strptime(stamp.group(1), "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
            # NOTE(review): the fixed 8-hour shift presumably converts UTC+8
            # log timestamps to UTC - confirm against the log producer.
            epoch = int(as_utc.timestamp()) - 8 * 3600
        failures = re.search(r"-\s+(\d+)\s+個失敗", entry)
        if failures is not None:
            return epoch, int(failures.group(1))
        if "全部成功" in entry:
            return epoch, 0
    return 0, -1
|
|
|
|
|
|
def _latest_backup_all_failed_count() -> tuple[int, int]:
    """Return the newest ``(timestamp, failed_count)`` across the backup logs.

    Both ``/backup/logs/cron.log`` and ``/backup/logs/backup.log`` are
    examined; results without a positive timestamp or with a negative count
    are discarded. Returns ``(0, -1)`` when neither log has a usable summary.
    """
    log_paths = (Path("/backup/logs/cron.log"), Path("/backup/logs/backup.log"))
    usable: list[tuple[int, int]] = []
    for log_path in log_paths:
        summary = _backup_all_failed_count_from_log(log_path)
        if summary[0] > 0 and summary[1] >= 0:
            usable.append(summary)
    if not usable:
        return 0, -1
    return max(usable, key=lambda summary: summary[0])
|
|
|
|
|
|
def _read_key_value_status(path: str) -> dict[str, int | str]:
    """Parse a ``key=value`` status file into a dictionary.

    Blank lines, comment lines (``#`` as the first non-blank character) and
    lines without ``=`` are ignored. Keys and values are stripped. Values
    that parse as a finite number are stored as ``int`` (fractions are
    truncated); anything else, including ``inf`` and ``nan``, is kept as the
    original string. An unreadable file yields an empty dictionary.
    """
    values: dict[str, int | str] = {}
    try:
        lines = Path(path).read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return values
    for line in lines:
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, _, value = stripped.partition("=")
        key = key.strip()
        value = value.strip()
        try:
            values[key] = int(float(value))
        except (ValueError, OverflowError):
            # ValueError: not numeric / NaN; OverflowError: infinity.
            values[key] = value
    return values
|
|
|
|
|
|
def _integrity_metric_lines(host: str) -> list[str]:
    """Build integrity-check and restore-drill metrics from their status files.

    For each scope the status file is read as ``key=value`` pairs and five
    samples are emitted: last-success timestamp (0 unless ``failed_count`` is
    0), age, a fresh flag (timestamp present, within the scope's maximum age
    and no failures), failed repository count (-1 when unknown) and checked
    repository count. Non-numeric values in the status file are treated as
    missing rather than aborting the exporter.
    """
    now = int(time.time())
    # (scope label, status file, maximum accepted age in hours)
    specs = [
        ("restic_check", "/backup/integrity/check.status", 192),
        ("restore_drill", "/backup/integrity/restore-drill.status", 744),
    ]

    def _int_field(values: dict[str, int | str], key: str, default: int) -> int:
        # The status reader stores numbers as int and anything else as str,
        # so a str here means the field is malformed: use the default.
        value = values.get(key, default)
        return value if isinstance(value, int) else default

    lines: list[str] = []
    for scope, path, max_age_hours in specs:
        values = _read_key_value_status(path)
        timestamp = _int_field(values, "timestamp", 0)
        failed_count = _int_field(values, "failed_count", -1)
        checked_count = _int_field(values, "checked_repo_count", 0)
        age = now - timestamp if timestamp else 0
        fresh = 1 if timestamp and age <= max_age_hours * 3600 and failed_count == 0 else 0
        labels = f'host="{_escape_label(host)}",scope="{scope}",max_age_hours="{max_age_hours}"'
        lines.extend(
            [
                f"awoooi_backup_integrity_last_success_timestamp{{{labels}}} {timestamp if failed_count == 0 else 0}",
                f"awoooi_backup_integrity_age_seconds{{{labels}}} {age}",
                f"awoooi_backup_integrity_fresh{{{labels}}} {fresh}",
                f"awoooi_backup_integrity_failed_repo_count{{{labels}}} {failed_count}",
                f"awoooi_backup_integrity_checked_repo_count{{{labels}}} {checked_count}",
            ]
        )
    return lines
|
|
|
|
|
|
def _config_capture_metric_lines(host: str) -> list[str]:
    """Build config-capture coverage metrics from the JSON status document.

    When ``CONFIG_CAPTURE_STATUS_FILE`` is unreadable, is not valid JSON, or
    does not contain a JSON object, three placeholder samples are returned
    (timestamp 0, age 0, critical failed count -1). Otherwise the summary
    counters are emitted, followed by one ``awoooi_backup_config_capture_ok``
    sample per entry of ``items``. Malformed counters fall back to their
    defaults and malformed items are skipped, so a damaged status file cannot
    abort the whole exporter run.
    """
    now = int(time.time())
    labels = f'host="{_escape_label(host)}"'

    def _as_int(value: object, default: int) -> int:
        try:
            return int(value)  # type: ignore[call-overload]
        except (TypeError, ValueError, OverflowError):
            return default

    try:
        document = json.loads(CONFIG_CAPTURE_STATUS_FILE.read_text(encoding="utf-8", errors="replace"))
    except (OSError, json.JSONDecodeError):
        document = None
    if not isinstance(document, dict):
        return [
            f"awoooi_backup_config_capture_status_timestamp{{{labels}}} 0",
            f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} 0",
            f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} -1",
        ]

    timestamp = _as_int(document.get("timestamp") or 0, 0)
    critical_failed = _as_int(document.get("critical_failed_count", -1), -1)
    failed_count = _as_int(document.get("failed_count", -1), -1)
    snapshot_id = str(document.get("snapshot_id") or "unknown")
    duration = _as_int(document.get("duration_seconds", 0) or 0, 0)
    age = now - timestamp if timestamp else 0

    timestamp_labels = f'{labels},snapshot_id="{_escape_label(snapshot_id)}"'
    lines = [
        f"awoooi_backup_config_capture_status_timestamp{{{timestamp_labels}}} {timestamp}",
        f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} {age}",
        f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} {critical_failed}",
        f"awoooi_backup_config_capture_failed_count{{{labels}}} {failed_count}",
        f"awoooi_backup_config_capture_duration_seconds{{{labels}}} {duration}",
    ]

    items = document.get("items") or []
    if not isinstance(items, list):
        items = []
    for item in items:
        if not isinstance(item, dict):
            continue
        target = str(item.get("target") or "unknown")
        source = str(item.get("source") or "unknown")
        critical = "true" if item.get("critical") else "false"
        ok = 1 if item.get("ok") else 0
        item_labels = (
            f'host="{_escape_label(host)}",'
            f'target="{_escape_label(target)}",'
            f'source="{_escape_label(source)}",'
            f'critical="{critical}"'
        )
        lines.append(f"awoooi_backup_config_capture_ok{{{item_labels}}} {ok}")
    return lines
|
|
|
|
|
|
def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
    """Build offsite-copy, credential-escrow and DR-phase metrics.

    Samples are emitted in this order: full offsite status per provider (b2,
    rclone), partial offsite status per provider, the full-sync enable marker,
    one group per escrow item, and finally the escrow-missing count together
    with the DR phase and next-step samples.

    A marker counts as fresh when it has a timestamp no older than 48 hours
    (offsite; the provider must also be configured) or 744 hours (escrow).
    """
    now = int(time.time())
    host_label = _escape_label(host)
    offsite_max_age = 48 * 3600
    escrow_max_age = 744 * 3600

    def _age(timestamp: int) -> int:
        # A timestamp of 0 means "no evidence"; its age is reported as 0.
        return now - timestamp if timestamp else 0

    b2_configured = int(_b2_configured())
    rclone_configured = int(_rclone_configured())

    b2_full_timestamp = _marker_timestamp(
        [
            OFFSITE_STATUS_DIR / "b2-last-success",
            OFFSITE_STATUS_DIR / "b2.last_success",
            OFFSITE_STATUS_DIR / "last_success",
            Path("/backup/logs/offsite-b2.status"),
        ]
    )
    b2_partial_timestamp = _marker_timestamp(
        [
            OFFSITE_STATUS_DIR / "b2-partial-last-success",
            OFFSITE_STATUS_DIR / "b2.partial_last_success",
        ]
    )
    rclone_full_timestamp = _marker_timestamp(
        [
            OFFSITE_STATUS_DIR / "rclone-last-success",
            OFFSITE_STATUS_DIR / "rclone.last_success",
            OFFSITE_STATUS_DIR / "last_success",
            Path("/backup/logs/rclone-sync.status"),
        ]
    )
    rclone_partial_timestamp = _marker_timestamp(
        [
            OFFSITE_STATUS_DIR / "rclone-partial-last-success",
            OFFSITE_STATUS_DIR / "rclone.partial_last_success",
        ]
    )

    metrics: list[str] = []

    # Full offsite copies, one group per provider.
    full_fresh_flags: list[int] = []
    for provider, configured, stamp in (
        ("b2", b2_configured, b2_full_timestamp),
        ("rclone", rclone_configured, rclone_full_timestamp),
    ):
        full_age = _age(stamp)
        is_fresh = int(bool(configured and stamp and full_age <= offsite_max_age))
        full_fresh_flags.append(is_fresh)
        full_labels = f'host="{host_label}",provider="{provider}",max_age_hours="48"'
        metrics += [
            f"awoooi_backup_offsite_configured{{{full_labels}}} {configured}",
            f"awoooi_backup_offsite_last_success_timestamp{{{full_labels}}} {stamp}",
            f"awoooi_backup_offsite_age_seconds{{{full_labels}}} {full_age}",
            f"awoooi_backup_offsite_fresh{{{full_labels}}} {is_fresh}",
        ]

    # Partial offsite copies, one group per provider.
    partial_fresh_flags: list[int] = []
    for provider, configured, stamp in (
        ("b2", b2_configured, b2_partial_timestamp),
        ("rclone", rclone_configured, rclone_partial_timestamp),
    ):
        partial_age = _age(stamp)
        partial_fresh = int(bool(configured and stamp and partial_age <= offsite_max_age))
        partial_fresh_flags.append(partial_fresh)
        partial_labels = f'host="{host_label}",provider="{provider}",scope="partial",max_age_hours="48"'
        metrics += [
            f"awoooi_backup_offsite_partial_last_success_timestamp{{{partial_labels}}} {stamp}",
            f"awoooi_backup_offsite_partial_age_seconds{{{partial_labels}}} {partial_age}",
            f"awoooi_backup_offsite_partial_fresh{{{partial_labels}}} {partial_fresh}",
        ]

    # Marker file whose presence is reported as "full sync enabled".
    enable_marker = OFFSITE_STATUS_DIR / "enable-rclone-sync"
    try:
        full_sync_enabled = int(enable_marker.is_file())
        full_sync_enabled_timestamp = int(enable_marker.stat().st_mtime) if full_sync_enabled else 0
    except OSError:
        full_sync_enabled = 0
        full_sync_enabled_timestamp = 0
    full_sync_labels = f'host="{host_label}",provider="rclone"'
    metrics += [
        f"awoooi_backup_offsite_full_sync_enabled{{{full_sync_labels}}} {full_sync_enabled}",
        f"awoooi_backup_offsite_full_sync_enabled_timestamp{{{full_sync_labels}}} {full_sync_enabled_timestamp}",
    ]

    # Credential escrow evidence, one group per expected item.
    escrow_missing_count = 0
    for item in ESCROW_ITEMS:
        verified_at = _marker_timestamp(
            [
                ESCROW_EVIDENCE_DIR / f"{item}.last_verified",
                ESCROW_EVIDENCE_DIR / f"{item}.verified",
                ESCROW_EVIDENCE_DIR / item,
            ]
        )
        escrow_age = _age(verified_at)
        escrow_fresh = int(bool(verified_at and escrow_age <= escrow_max_age))
        if not escrow_fresh:
            escrow_missing_count += 1
        escrow_labels = f'host="{host_label}",item="{item}",max_age_hours="744"'
        metrics += [
            f"awoooi_backup_credential_escrow_expected_info{{{escrow_labels}}} 1",
            f"awoooi_backup_credential_escrow_last_verified_timestamp{{{escrow_labels}}} {verified_at}",
            f"awoooi_backup_credential_escrow_age_seconds{{{escrow_labels}}} {escrow_age}",
            f"awoooi_backup_credential_escrow_fresh{{{escrow_labels}}} {escrow_fresh}",
        ]

    # DR phase: the first unmet requirement decides phase and next step.
    if not (b2_configured or rclone_configured):
        phase, next_step = 1, "configure_google_drive_rclone_on_110_tty"
    elif not any(partial_fresh_flags):
        phase, next_step = 2, "run_small_dry_run_then_partial_sync"
    elif escrow_missing_count > 0:
        phase, next_step = 3, "complete_credential_escrow_review"
    elif not any(full_fresh_flags):
        phase, next_step = 4, "pre_full_sync_review"
    else:
        phase, next_step = 5, "offsite_and_escrow_ready"

    step_label = _escape_label(next_step)
    metrics += [
        f'awoooi_backup_dr_credential_escrow_missing_count{{host="{host_label}"}} {escrow_missing_count}',
        f'awoooi_backup_dr_phase{{host="{host_label}",next_step="{step_label}"}} {phase}',
        f'awoooi_backup_dr_next_step_info{{host="{host_label}",next_step="{step_label}"}} 1',
    ]
    return metrics
|
|
|
|
|
|
def _retention_metric_lines(host: str) -> list[str]:
    """Build the two retention-policy metrics.

    Each setting is read from the backup config files first and from the
    process environment second. ``latest_only`` is 1 only when the mode is
    ``latest`` and ``KEEP_LAST`` is ``1``; the offsite flag is 1 only when
    ``OFFSITE_SYNC_DELETE_OLD`` is ``1``. Unset settings appear as
    ``unknown`` in the labels.
    """

    def _setting(name: str) -> str:
        return (_backup_config_value(name) or os.environ.get(name, "")).strip()

    mode = _setting("BACKUP_RETENTION_MODE")
    keep_last = _setting("KEEP_LAST")
    offsite_delete_old = _setting("OFFSITE_SYNC_DELETE_OLD")

    latest_only = int(mode == "latest" and keep_last == "1")
    offsite_mirror = int(offsite_delete_old == "1")

    host_label = _escape_label(host)
    mode_label = _escape_label(mode or "unknown")
    keep_last_label = _escape_label(keep_last or "unknown")
    delete_old_label = _escape_label(offsite_delete_old or "unknown")

    restic_labels = f'host="{host_label}",scope="restic",mode="{mode_label}",keep_last="{keep_last_label}"'
    offsite_labels = f'host="{host_label}",scope="offsite",provider="rclone",delete_old="{delete_old_label}"'
    return [
        f"awoooi_backup_retention_latest_only{{{restic_labels}}} {latest_only}",
        f"awoooi_backup_retention_offsite_delete_old_enabled{{{offsite_labels}}} {offsite_mirror}",
    ]
|
|
|
|
|
|
def _collect_velero_from_k8s() -> dict[str, int | str]:
    """Query Velero state through kubectl on a remote host over SSH.

    A small Python program is sent as the SSH command to each host listed in
    ``AIOPS_K8S_QUERY_HOSTS`` (space separated) in turn. The program prints
    ``key=value`` lines describing Velero schedules, completed backups and the
    ``backup-restore-test`` CronJob. The first host whose output contains
    ``monitor_up=1`` wins and its values are returned with a ``source`` entry
    naming the host. Hosts whose SSH command fails are skipped.

    Returns ``{"monitor_up": 0, "source": "unreachable"}`` when no host
    answers. Output lines whose value is not a finite number are ignored.
    """
    # Executed remotely; it must stay at column 0 because of the here-document.
    remote_script = r"""
python3 - <<'PY'
import datetime as dt
import json
import subprocess
import time


def kubectl(args):
    for prefix in (["sudo", "-n", "kubectl"], ["kubectl"]):
        result = subprocess.run(prefix + args, capture_output=True, text=True, timeout=20, check=False)
        if result.returncode == 0:
            return result.stdout
    return ""


def load_json(args):
    text = kubectl(args + ["-o", "json"])
    try:
        return json.loads(text) if text else {}
    except json.JSONDecodeError:
        return {}


def parse_ts(value):
    if not value:
        return 0
    try:
        return int(dt.datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp())
    except ValueError:
        return 0


now = int(time.time())
schedules = load_json(["get", "schedules.velero.io", "-n", "velero"]).get("items") or []
backups = load_json(["get", "backups.velero.io", "-n", "velero"]).get("items") or []
cron = load_json(["get", "cronjob", "-n", "velero", "backup-restore-test"])
jobs = load_json(["get", "jobs", "-n", "velero", "-l", "component=backup-restore-test"]).get("items") or []

completed = []
for item in backups:
    if item.get("status", {}).get("phase") != "Completed":
        continue
    timestamp = parse_ts(item.get("status", {}).get("completionTimestamp") or item.get("metadata", {}).get("creationTimestamp"))
    if timestamp:
        completed.append(timestamp)

failed_jobs = 0
for job in jobs:
    conditions = job.get("status", {}).get("conditions") or []
    if any(row.get("type") == "Failed" and row.get("status") == "True" for row in conditions):
        failed_jobs += 1

last_success = parse_ts((cron.get("status") or {}).get("lastSuccessfulTime"))
latest_backup = max(completed) if completed else 0

print("monitor_up=1")
print(f"schedule_count={len(schedules)}")
print(f"schedule_paused_count={sum(1 for item in schedules if item.get('spec', {}).get('paused'))}")
print(f"latest_completed_backup_timestamp={latest_backup}")
print(f"latest_completed_backup_age_seconds={now - latest_backup if latest_backup else 0}")
print(f"latest_completed_backup_fresh={1 if latest_backup and now - latest_backup <= 90000 else 0}")
print(f"restore_test_cron_present={1 if cron.get('metadata', {}).get('name') == 'backup-restore-test' else 0}")
print(f"restore_test_last_success_timestamp={last_success}")
print(f"restore_test_last_success_age_seconds={now - last_success if last_success else 0}")
print(f"restore_test_last_success_fresh={1 if last_success and now - last_success <= 691200 else 0}")
print(f"restore_test_failed_jobs={failed_jobs}")
PY
"""
    hosts = os.environ.get("AIOPS_K8S_QUERY_HOSTS", "192.168.0.120 192.168.0.121 192.168.0.125").split()
    values: dict[str, int | str] = {"monitor_up": 0, "source": "unreachable"}
    for host in hosts:
        rc, stdout, _ = _run(
            [
                "ssh",
                "-o",
                "BatchMode=yes",
                "-o",
                "StrictHostKeyChecking=accept-new",
                "-o",
                "ConnectTimeout=8",
                f"wooo@{host}",
                remote_script,
            ],
            timeout=45,
        )
        if rc != 0:
            continue
        parsed: dict[str, int | str] = {"source": f"{host}-kubectl"}
        for line in stdout.splitlines():
            if "=" not in line:
                continue
            key, value = line.split("=", 1)
            try:
                parsed[key.strip()] = int(float(value.strip()))
            except (ValueError, OverflowError):
                # Not a number, NaN, or infinity: ignore the line.
                continue
        if int(parsed.get("monitor_up", 0)) == 1:
            return parsed
    return values
|
|
|
|
|
|
def _velero_metric_lines(host: str) -> list[str]:
    """Render the values collected from Velero as ``awoooi_velero_*`` samples.

    Every sample carries the host, source and namespace labels; some add a
    ``cronjob`` and/or ``max_age_hours`` label. Values that were not collected
    are reported as 0.
    """
    values = _collect_velero_from_k8s()
    source = str(values.get("source", "unreachable"))
    labels = f'host="{_escape_label(host)}",source="{_escape_label(source)}",namespace="velero"'
    cronjob = ',cronjob="backup-restore-test"'

    # (collected key == metric name suffix, extra labels appended to the base set)
    series = (
        ("monitor_up", ""),
        ("schedule_count", ""),
        ("schedule_paused_count", ""),
        ("latest_completed_backup_timestamp", ""),
        ("latest_completed_backup_age_seconds", ""),
        ("latest_completed_backup_fresh", ',max_age_hours="25"'),
        ("restore_test_cron_present", cronjob),
        ("restore_test_last_success_timestamp", cronjob),
        ("restore_test_last_success_age_seconds", cronjob),
        ("restore_test_last_success_fresh", cronjob + ',max_age_hours="192"'),
        ("restore_test_failed_jobs", cronjob),
    )
    return [
        f"awoooi_velero_{key}{{{labels}{extra}}} {values.get(key, 0)}"
        for key, extra in series
    ]
|
|
|
|
|
|
def _metric_lines_for_job(
    *,
    host: str,
    job: str,
    source: str,
    target: str,
    backup_type: str,
    last_success: int,
    max_age_hours: float,
    sample_count: int = 0,
) -> list[str]:
    """Build the five per-job backup samples sharing one label set.

    Args:
        host, job, source, target, backup_type: label values (escaped here).
        last_success: Unix timestamp of the latest success evidence; a value
            of 0 or less means no evidence, giving age 0 and fresh 0.
        max_age_hours: freshness limit, also exposed as a label.
        sample_count: value of the snapshot-count sample.
    """
    now = int(time.time())
    label_pairs = (
        ("host", host),
        ("job", job),
        ("type", backup_type),
        ("source", source),
        ("target", target),
    )
    labels = ",".join(f'{name}="{_escape_label(value)}"' for name, value in label_pairs)
    labels += f',max_age_hours="{max_age_hours:g}"'

    has_success = last_success > 0
    age = now - last_success if has_success else 0
    fresh = int(has_success and age <= int(max_age_hours * 3600))

    samples = (
        ("awoooi_backup_expected_job_info", 1),
        ("awoooi_backup_job_last_success_timestamp", last_success),
        ("awoooi_backup_job_age_seconds", age),
        ("awoooi_backup_job_fresh", fresh),
        ("awoooi_backup_job_snapshot_count", sample_count),
    )
    return [f"{name}{{{labels}}} {value}" for name, value in samples]
|
|
|
|
|
|
def _base_lines(host: str) -> list[str]:
    """Return the exposition header plus the exporter's own liveness samples.

    For every metric of this exporter a ``# HELP`` line and a
    ``# TYPE ... gauge`` line are produced, in a fixed order, followed by the
    ``awoooi_backup_health_monitor_up`` (always 1) and
    ``awoooi_backup_health_last_run_timestamp`` samples for *host*.
    """
    now = int(time.time())

    # (metric name, help text); every metric is declared as a gauge.
    gauges = (
        ("awoooi_backup_health_monitor_up", "Whether the backup health exporter completed."),
        ("awoooi_backup_health_last_run_timestamp", "Unix timestamp of the last backup health exporter run."),
        ("awoooi_backup_expected_job_info", "Expected backup job inventory."),
        ("awoooi_backup_job_configured", "Whether the expected backup cron/config is present."),
        ("awoooi_backup_script_present", "Whether the backup script exists on this host."),
        ("awoooi_backup_job_last_success_timestamp", "Unix timestamp of the latest successful backup evidence."),
        ("awoooi_backup_job_age_seconds", "Age of the latest successful backup evidence."),
        ("awoooi_backup_job_fresh", "Whether the latest successful backup evidence is within max_age_hours."),
        ("awoooi_backup_job_snapshot_count", "Number of snapshots or files considered for this job."),
        ("awoooi_backup_last_run_failed_count", "Failed component count from the last aggregate backup run."),
        (
            "awoooi_backup_integrity_last_success_timestamp",
            "Unix timestamp of latest successful backup integrity or restore drill run.",
        ),
        ("awoooi_backup_integrity_age_seconds", "Age of backup integrity or restore drill status."),
        (
            "awoooi_backup_integrity_fresh",
            "Whether backup integrity or restore drill status is fresh and successful.",
        ),
        (
            "awoooi_backup_integrity_failed_repo_count",
            "Failed repository count from backup integrity or restore drill run.",
        ),
        (
            "awoooi_backup_integrity_checked_repo_count",
            "Checked repository count from backup integrity or restore drill run.",
        ),
        (
            "awoooi_backup_config_capture_status_timestamp",
            "Unix timestamp of the latest config-capture coverage status.",
        ),
        ("awoooi_backup_config_capture_status_age_seconds", "Age of the latest config-capture coverage status."),
        (
            "awoooi_backup_config_capture_critical_failed_count",
            "Critical config-capture targets missing from the latest configs backup.",
        ),
        (
            "awoooi_backup_config_capture_failed_count",
            "Total config-capture targets missing from the latest configs backup.",
        ),
        ("awoooi_backup_config_capture_duration_seconds", "Duration of the latest configs backup capture run."),
        ("awoooi_backup_config_capture_ok", "Whether the latest configs backup captured a specific target."),
        (
            "awoooi_backup_offsite_configured",
            "Whether an offsite backup provider appears configured without exposing credentials.",
        ),
        ("awoooi_backup_offsite_last_success_timestamp", "Unix timestamp of latest offsite copy success marker."),
        ("awoooi_backup_offsite_age_seconds", "Age of latest offsite copy success marker."),
        ("awoooi_backup_offsite_fresh", "Whether offsite copy success marker is fresh."),
        (
            "awoooi_backup_offsite_partial_last_success_timestamp",
            "Unix timestamp of latest partial offsite copy success marker.",
        ),
        ("awoooi_backup_offsite_partial_age_seconds", "Age of latest partial offsite copy success marker."),
        ("awoooi_backup_offsite_partial_fresh", "Whether partial offsite copy success marker is fresh."),
        (
            "awoooi_backup_offsite_full_sync_enabled",
            "Whether the gated full offsite sync enable marker exists.",
        ),
        (
            "awoooi_backup_offsite_full_sync_enabled_timestamp",
            "Unix timestamp of the gated full offsite sync enable marker.",
        ),
        ("awoooi_backup_credential_escrow_expected_info", "Expected credential escrow evidence inventory."),
        (
            "awoooi_backup_credential_escrow_last_verified_timestamp",
            "Unix timestamp of credential escrow verification evidence.",
        ),
        ("awoooi_backup_credential_escrow_age_seconds", "Age of credential escrow verification evidence."),
        ("awoooi_backup_credential_escrow_fresh", "Whether credential escrow verification evidence is fresh."),
        (
            "awoooi_backup_dr_credential_escrow_missing_count",
            "Number of credential escrow items that still need fresh human verification.",
        ),
        ("awoooi_backup_dr_phase", "Numeric DR offsite completion phase for AI/operator triage."),
        ("awoooi_backup_dr_next_step_info", "Current human-safe next step for DR offsite completion."),
        (
            "awoooi_backup_retention_latest_only",
            "Whether local restic backup retention is configured as latest-only keep-last=1.",
        ),
        (
            "awoooi_backup_retention_offsite_delete_old_enabled",
            "Whether offsite rclone sync is allowed to delete old remote backup files after successful mirror.",
        ),
        (
            "awoooi_backup_cron_active_duplicate_count",
            "Number of exact duplicate active crontab entries on the backup host.",
        ),
        (
            "awoooi_backup_cron_singular_entry_count",
            "Number of active crontab entries matching a backup/offsite singleton pattern.",
        ),
        (
            "awoooi_backup_cron_singular_entry_ok",
            "Whether a backup/offsite singleton cron pattern has exactly one active entry.",
        ),
        (
            "awoooi_velero_monitor_up",
            "Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint.",
        ),
        ("awoooi_velero_schedule_count", "Number of Velero schedules in the velero namespace."),
        ("awoooi_velero_schedule_paused_count", "Number of paused Velero schedules."),
        (
            "awoooi_velero_latest_completed_backup_timestamp",
            "Unix timestamp of latest Completed Velero backup.",
        ),
        ("awoooi_velero_latest_completed_backup_age_seconds", "Age of latest Completed Velero backup."),
        (
            "awoooi_velero_latest_completed_backup_fresh",
            "Whether latest Completed Velero backup is within max_age_hours.",
        ),
        ("awoooi_velero_restore_test_cron_present", "Whether backup-restore-test CronJob exists."),
        (
            "awoooi_velero_restore_test_last_success_timestamp",
            "Unix timestamp of backup-restore-test lastSuccessfulTime.",
        ),
        (
            "awoooi_velero_restore_test_last_success_age_seconds",
            "Age of backup-restore-test lastSuccessfulTime.",
        ),
        (
            "awoooi_velero_restore_test_last_success_fresh",
            "Whether backup-restore-test lastSuccessfulTime is within max_age_hours.",
        ),
        (
            "awoooi_velero_restore_test_failed_jobs",
            "Failed backup-restore-test jobs retained in velero namespace.",
        ),
    )

    header: list[str] = []
    for name, help_text in gauges:
        header.append(f"# HELP {name} {help_text}")
        header.append(f"# TYPE {name} gauge")

    host_label = _escape_label(host)
    header.append(f'awoooi_backup_health_monitor_up{{host="{host_label}"}} 1')
    header.append(f'awoooi_backup_health_last_run_timestamp{{host="{host_label}"}} {now}')
    return header
|
|
|
|
|
|
def _collect_110(host: str) -> list[str]:
    """Build the full list of backup-health metric lines for host 110.

    Starting from the shared base lines, this appends, in order:
    cron-entry presence per expected job, presence of each backup script
    under /backup/scripts, per-repository restic freshness metrics, the
    aggregate backup-all result, and finally the integrity, config-capture,
    offsite/escrow, retention, cron-duplicate and Velero sections.

    Args:
        host: Host label value attached to every emitted sample.

    Returns:
        Metric lines in Prometheus text exposition format, one per entry.
    """
    crontab = _cron_text()
    out = _base_lines(host)
    # The escaped host label is identical for every sample, so build it once.
    host_label = _escape_label(host)

    # (job label, substring that must appear somewhere in the cron text)
    cron_markers = (
        ("backup_all", "/backup/scripts/backup-all.sh"),
        ("awoooi_frequent", "/backup/scripts/backup-awoooi-frequent.sh"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_sync_gated", "/backup/offsite/enable-rclone-sync"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
        ("backup_integrity_check", "/backup/scripts/check-backup-integrity.sh --mode check"),
        ("backup_restore_drill", "/backup/scripts/check-backup-integrity.sh --mode restore-drill"),
    )
    for job_name, marker in cron_markers:
        configured = int(marker in crontab)
        out.append(
            f'awoooi_backup_job_configured{{host="{host_label}",job="{_escape_label(job_name)}"}} {configured}'
        )

    scripts_dir = Path("/backup/scripts")
    script_names = (
        "backup-all.sh",
        "backup-awoooi.sh",
        "backup-awoooi-frequent.sh",
        "backup-configs.sh",
        "backup-sentry.sh",
        "backup-ai-artifacts.sh",
        "backup-public-routes.sh",
        "configure-offsite-rclone.sh",
        "configure-offsite-b2.sh",
        "sync-offsite-backups.sh",
        "backup-offsite-readiness-gate.sh",
        "offsite-escrow-evidence-report.sh",
        "verify-offsite-full-sync.sh",
        "mark-credential-escrow-verified.sh",
        "check-backup-integrity.sh",
        "backup-gitea.sh",
        "backup-harbor.sh",
        "backup-momo.sh",
        "backup-langfuse.sh",
        "backup-monitoring.sh",
        "backup-signoz.sh",
        "backup-open-webui.sh",
        "backup-clawbot.sh",
    )
    for script_name in script_names:
        present = int((scripts_dir / script_name).exists())
        out.append(
            f'awoooi_backup_script_present{{host="{host_label}",script="{_escape_label(script_name)}"}} {present}'
        )

    # (job label, restic repository path, maximum acceptable age in hours)
    restic_repos = (
        ("awoooi_db", "/backup/awoooi", 7),
        ("configs", "/backup/configs", 48),
        ("sentry", "/backup/sentry", 48),
        ("gitea", "/backup/gitea", 48),
        ("harbor", "/backup/harbor", 48),
        ("momo", "/backup/momo", 48),
        ("langfuse", "/backup/langfuse", 48),
        ("monitoring", "/backup/monitoring", 48),
        ("signoz", "/backup/signoz", 48),
        ("open_webui", "/backup/open-webui", 48),
        ("clawbot", "/backup/clawbot", 48),
        ("ai_artifacts", "/backup/ai-artifacts", 48),
        ("public_routes", "/backup/public-routes", 168),
    )
    for job_name, repo_path, age_limit in restic_repos:
        newest_snapshot, snapshot_count = _latest_restic_snapshot(repo_path)
        repo_lines = _metric_lines_for_job(
            host=host,
            job=job_name,
            source="110-restic",
            target=repo_path,
            backup_type="restic",
            last_success=newest_snapshot,
            max_age_hours=age_limit,
            sample_count=snapshot_count,
        )
        out.extend(repo_lines)

    backup_all_ts, failed_count = _latest_backup_all_failed_count()
    aggregate_labels = f'host="{host_label}",job="backup_all"'
    # A backup-all run that reported any failure is not a success: publish 0
    # instead of its timestamp.
    reported_ts = backup_all_ts if failed_count == 0 else 0
    out.append(f"awoooi_backup_last_run_failed_count{{{aggregate_labels}}} {failed_count}")
    out.append(
        f'awoooi_backup_job_last_success_timestamp{{{aggregate_labels},type="aggregate",source="110-cron-log",target="/backup/logs/cron.log",max_age_hours="48"}} {reported_ts}'
    )

    # Remaining sections that need only the host label, in emission order.
    for section in (
        _integrity_metric_lines,
        _config_capture_metric_lines,
        _offsite_and_escrow_metric_lines,
        _retention_metric_lines,
    ):
        out.extend(section(host))
    out.extend(_cron_duplicate_metric_lines(host, crontab))
    out.extend(_velero_metric_lines(host))
    return out
|
|
|
|
|
|
def _collect_188(host: str) -> list[str]:
    """Build the full list of backup-health metric lines for host 188.

    Starting from the shared base lines, this appends cron-entry presence
    for the two expected jobs, presence (existing and executable) of the
    backup scripts, freshness metrics for the rsync copy pulled from 110,
    and freshness metrics for the newest momo PostgreSQL dump.

    Args:
        host: Host label value attached to every emitted sample.

    Returns:
        Metric lines in Prometheus text exposition format, one per entry.
    """
    crontab = _cron_text()
    out = _base_lines(host)
    # The escaped host label is identical for every sample, so build it once.
    host_label = _escape_label(host)

    # (job label, substring that must appear somewhere in the cron text)
    cron_markers = (
        ("backup_from_110", "/home/ollama/bin/backup-from-110.sh"),
        ("momo_pg_daily", "/home/ollama/bin/momo-pg-backup.sh"),
    )
    for job_name, marker in cron_markers:
        configured = int(marker in crontab)
        out.append(
            f'awoooi_backup_job_configured{{host="{host_label}",job="{_escape_label(job_name)}"}} {configured}'
        )

    script_paths = (
        "/home/ollama/bin/backup-from-110.sh",
        "/home/ollama/bin/momo-pg-backup.sh",
        "/home/ollama/awoooi-ops/pg-backup.sh",
    )
    for script_path in script_paths:
        candidate = Path(script_path)
        # A script counts as present only if it exists and is executable.
        usable = candidate.exists() and os.access(script_path, os.X_OK)
        out.append(
            f'awoooi_backup_script_present{{host="{host_label}",script="{_escape_label(candidate.name)}"}} {int(usable)}'
        )

    rsync_ts = _read_backup_110_timestamp()
    rsync_lines = _metric_lines_for_job(
        host=host,
        job="backup_from_110",
        source="188-rsync",
        target="/home/ollama/backup/110",
        backup_type="rsync",
        last_success=rsync_ts,
        max_age_hours=25,
        sample_count=1,
    )
    out.extend(rsync_lines)

    dump_globs = [
        "/home/ollama/momo_backups/*.sql.gz",
        "/home/ollama/momo-pro/backups/*.sql.gz",
        "/home/ollama/backups/momo_analytics_*.sql.gz",
    ]
    momo_ts = _newest_file_timestamp(dump_globs)
    momo_lines = _metric_lines_for_job(
        host=host,
        job="momo_pg_daily",
        source="188-pg-dump",
        target="/home/ollama/momo_backups",
        backup_type="pg_dump",
        last_success=momo_ts,
        max_age_hours=30,
        # Report one sample when a dump timestamp was found, otherwise zero.
        sample_count=int(bool(momo_ts)),
    )
    out.extend(momo_lines)
    return out
|
|
|
|
|
|
def collect() -> str:
    """Render the textfile payload for the host named by HOST_LABEL.

    Hosts "110" and "188" use their dedicated collectors; any other host
    label yields only the shared base lines.

    Returns:
        All metric lines joined with newlines, ending in a trailing newline.
    """
    collectors = {
        "110": _collect_110,
        "188": _collect_188,
    }
    # Unknown hosts fall back to the base lines only.
    gather = collectors.get(HOST_LABEL, _base_lines)
    metric_lines = gather(HOST_LABEL)
    return "\n".join(metric_lines) + "\n"
|
|
|
|
|
|
def main() -> None:
    """Collect metrics and publish them atomically to the textfile directory.

    The payload is written to a temporary file inside TEXTFILE_DIR (same
    filesystem, so the rename is atomic), made world-readable, and only then
    renamed over TEXTFILE_DIR / OUTPUT_NAME.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written, re-moded, or renamed. The temporary file is removed
            before the error propagates.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # NamedTemporaryFile creates the file owner-only. Fix the mode BEFORE
        # the rename so the published file is never momentarily unreadable to
        # a scraper running as a different user.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # delete=False means nothing cleans up for us; do not leave stray
        # temp files in the collector directory when publishing fails.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise
|
|
|
|
|
|
# Script entry point: run the exporter only when executed directly, not on import.
if __name__ == "__main__":
    main()
|