#!/usr/bin/env python3
"""
Systemd unit textfile exporter for host-level AIOps baselines.

2026-05-05 ogt + Codex: 110 runner overload follow-up.
Why: GitHub/Gitea runner services are outside Docker metrics. A bad
WatchdogSec drop-in caused repeated runner restarts on 110, so the AI monitor
needs direct visibility into systemd restarts, watchdog settings, and limits.

Configuration (environment variables):
    AIOPS_SYSTEMD_UNITS         Comma-separated unit names to export (required).
    AIOPS_HOST_LABEL            Value of the ``host`` label (default: nodename).
    NODE_EXPORTER_TEXTFILE_DIR  Directory the ``.prom`` file is written to.
"""

from __future__ import annotations

import os
import re
import subprocess
import tempfile
from pathlib import Path

TEXTFILE_DIR = Path(
    os.environ.get(
        "NODE_EXPORTER_TEXTFILE_DIR",
        "/var/lib/node_exporter/textfile_collector",
    )
)
OUTPUT_NAME = "systemd_units.prom"
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# De-duplicated (order preserved): a unit listed twice would otherwise emit
# duplicate series, which makes the whole textfile invalid for the scraper.
UNIT_NAMES = list(
    dict.fromkeys(
        unit.strip()
        for unit in os.environ.get("AIOPS_SYSTEMD_UNITS", "").split(",")
        if unit.strip()
    )
)
LABEL_RE = re.compile(r'["\\\n]')

# Replacement text for the three characters that must be escaped inside a
# Prometheus label value.
_LABEL_ESCAPES = {"\n": r"\n", "\\": r"\\", '"': r"\""}

# Unit suffixes systemd uses when it formats a time span, in seconds.
_TIMESPAN_UNIT_SECONDS = {
    "us": 0.000001,
    "ms": 0.001,
    "s": 1.0,
    "min": 60.0,
    "h": 3600.0,
    "d": 86400.0,
    "w": 604800.0,
    "month": 2629800.0,
    "y": 31557600.0,
}
_NUMBER_PATTERN = r"(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)"
_BARE_NUMBER_RE = re.compile(_NUMBER_PATTERN)
_TIMESPAN_TOKEN_RE = re.compile(rf"\s*({_NUMBER_PATTERN})\s*([A-Za-z]+)")

# Properties requested from ``systemctl show`` for every unit.
_UNIT_PROPERTIES = (
    "ActiveState",
    "SubState",
    "NRestarts",
    "WatchdogUSec",
    "CPUQuotaPerSecUSec",
    "MemoryMax",
)

# (name, type, help) of every exported metric family, in output order.
_METRIC_FAMILIES = (
    (
        "systemd_unit_info",
        "gauge",
        "Systemd unit inventory exposed by the textfile exporter.",
    ),
    (
        "systemd_unit_restarts_total",
        "counter",
        "Systemd unit restart counter from systemctl show NRestarts.",
    ),
    (
        "systemd_unit_watchdog_seconds",
        "gauge",
        "Systemd unit WatchdogSec setting in seconds, 0 when disabled.",
    ),
    (
        "systemd_unit_cpu_quota_cores",
        "gauge",
        "Systemd unit CPUQuota converted to CPU cores, 0 when unlimited.",
    ),
    (
        "systemd_unit_memory_max_bytes",
        "gauge",
        "Systemd unit MemoryMax in bytes, 0 when unlimited.",
    ),
)


def _escape_label(value: str) -> str:
    """Escape backslash, double quote and newline for use in a label value."""
    return LABEL_RE.sub(lambda m: _LABEL_ESCAPES[m.group(0)], value)


def _parse_usec(value: str) -> float:
    """Convert a ``systemctl show`` time-span value to seconds.

    Accepted forms:
      * ``""``, ``"0"`` or ``"infinity"``: 0.0 (disabled / unlimited).
      * A bare number: interpreted as microseconds.
      * One or more ``<number><unit>`` tokens, e.g. ``"30s"``, ``"500ms"``
        or the compound ``"1min 30s"``; the tokens are summed.

    Anything that cannot be parsed, including an unknown unit suffix, yields
    0.0 rather than raising, so one odd value cannot abort the export.
    """
    value = value.strip()
    if value in {"", "0", "infinity"}:
        return 0.0
    if _BARE_NUMBER_RE.fullmatch(value):
        return float(value) / 1_000_000

    total = 0.0
    position = 0
    while position < len(value):
        token = _TIMESPAN_TOKEN_RE.match(value, position)
        if token is None:
            return 0.0
        number, unit = token.groups()
        factor = _TIMESPAN_UNIT_SECONDS.get(unit)
        if factor is None:
            return 0.0
        total += float(number) * factor
        position = token.end()
    return total


def _parse_bytes(value: str) -> float:
    """Convert a byte-count property to a float; 0.0 if unlimited or invalid."""
    value = value.strip()
    if value in {"", "infinity"}:
        return 0.0
    try:
        return float(value)
    except ValueError:
        return 0.0


def _parse_count(value: str) -> int:
    """Convert an integer property to ``int``; 0 if empty or invalid."""
    try:
        return int(value.strip() or 0)
    except ValueError:
        return 0


def _show_unit(unit: str) -> dict[str, str]:
    """Return the properties in ``_UNIT_PROPERTIES`` for *unit* as a dict.

    Raises:
        subprocess.CalledProcessError: ``systemctl`` exited non-zero.
        subprocess.TimeoutExpired: ``systemctl`` ran longer than 10 seconds.
        OSError: ``systemctl`` could not be executed.
    """
    command = ["systemctl", "show"]
    for name in _UNIT_PROPERTIES:
        command += ["-p", name]
    # "--" keeps a unit name that starts with "-" from being read as an option.
    command += ["--", unit]
    result = subprocess.run(
        command,
        check=True,
        capture_output=True,
        text=True,
        timeout=10,
    )
    values: dict[str, str] = {}
    for line in result.stdout.splitlines():
        key, separator, raw = line.partition("=")
        if separator:
            values[key] = raw
    return values


def collect() -> str:
    """Build the Prometheus text-format payload for every unit in UNIT_NAMES.

    A unit whose properties cannot be read contributes a single
    ``systemd_unit_info`` sample with ``active_state="scrape_error"`` and
    value 0; the remaining units are still exported.

    Returns:
        The payload, grouped by metric family and terminated by a newline.
    """
    samples: dict[str, list[str]] = {name: [] for name, _, _ in _METRIC_FAMILIES}

    for unit in UNIT_NAMES:
        labels = f'host="{_escape_label(HOST_LABEL)}",unit="{_escape_label(unit)}"'
        try:
            values = _show_unit(unit)
        except Exception as exc:  # noqa: BLE001 - deliberate per-unit best effort
            samples["systemd_unit_info"].append(
                f'systemd_unit_info{{{labels},active_state="scrape_error",sub_state="{_escape_label(str(exc))}"}} 0'
            )
            continue

        active_state = values.get("ActiveState", "")
        sub_state = values.get("SubState", "")
        restarts = _parse_count(values.get("NRestarts", ""))
        watchdog_seconds = _parse_usec(values.get("WatchdogUSec", "0"))
        # CPU time allowed per second of wall time, i.e. a number of cores.
        cpu_quota = _parse_usec(values.get("CPUQuotaPerSecUSec", "0"))
        memory_max = _parse_bytes(values.get("MemoryMax", "0"))

        samples["systemd_unit_info"].append(
            f'systemd_unit_info{{{labels},active_state="{_escape_label(active_state)}",sub_state="{_escape_label(sub_state)}"}} 1'
        )
        samples["systemd_unit_restarts_total"].append(
            f"systemd_unit_restarts_total{{{labels}}} {restarts}"
        )
        samples["systemd_unit_watchdog_seconds"].append(
            f"systemd_unit_watchdog_seconds{{{labels}}} {watchdog_seconds:.6f}"
        )
        samples["systemd_unit_cpu_quota_cores"].append(
            f"systemd_unit_cpu_quota_cores{{{labels}}} {cpu_quota:.6f}"
        )
        samples["systemd_unit_memory_max_bytes"].append(
            f"systemd_unit_memory_max_bytes{{{labels}}} {memory_max:.0f}"
        )

    # The text format wants all lines of one metric family in a single group,
    # so emit family by family rather than unit by unit.
    lines: list[str] = []
    for name, metric_type, help_text in _METRIC_FAMILIES:
        lines.append(f"# HELP {name} {help_text}")
        lines.append(f"# TYPE {name} {metric_type}")
        lines.extend(samples[name])
    return "\n".join(lines) + "\n"


def main() -> None:
    """Collect the metrics and atomically publish them to the textfile dir.

    Raises:
        SystemExit: AIOPS_SYSTEMD_UNITS is empty or unset.
    """
    if not UNIT_NAMES:
        raise SystemExit("AIOPS_SYSTEMD_UNITS is required")

    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "w",
            dir=TEXTFILE_DIR,
            prefix=f".{OUTPUT_NAME}.",
            suffix=".tmp",
            delete=False,
            encoding="utf-8",
        ) as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # Set the final mode before the rename so the published file is never
        # visible with the temp file's restrictive permissions.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave a stray temp file behind when the write or rename fails.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise


if __name__ == "__main__":
    main()