Files
awoooi/scripts/ops/systemd-units-textfile-exporter.py
Your Name fe618960a8
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 39s
fix(ops): monitor systemd runners in host baseline
2026-05-05 14:08:43 +08:00

148 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""
Systemd unit textfile exporter for host-level AIOps baselines.
2026-05-05 ogt + Codex: 110 runner overload follow-up.
Why: GitHub/Gitea runner services are outside Docker metrics. A bad
WatchdogSec drop-in caused repeated runner restarts on 110, so the AI monitor
needs direct visibility into systemd restarts, watchdog settings, and limits.
"""
from __future__ import annotations
import os
import re
import subprocess
import tempfile
from pathlib import Path
# Directory the metrics file is written to; overridable through
# NODE_EXPORTER_TEXTFILE_DIR.
TEXTFILE_DIR = Path(
    os.environ.get(
        "NODE_EXPORTER_TEXTFILE_DIR",
        "/var/lib/node_exporter/textfile_collector",
    )
)

# File name of the generated metrics file inside TEXTFILE_DIR.
OUTPUT_NAME = "systemd_units.prom"

# Value of the ``host`` label; defaults to this machine's node name.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)

# Units to inspect: comma-separated AIOPS_SYSTEMD_UNITS with surrounding
# whitespace removed and empty entries dropped.
UNIT_NAMES = [
    name
    for name in map(str.strip, os.environ.get("AIOPS_SYSTEMD_UNITS", "").split(","))
    if name
]
LABEL_RE = re.compile(r'["\\\n]')
def _escape_label(value: str) -> str:
return LABEL_RE.sub(lambda m: {"\n": r"\n", "\\": r"\\", '"': r"\""}[m.group(0)], value)
def _parse_usec(value: str) -> float:
value = value.strip()
if value in {"", "0", "infinity"}:
return 0.0
multipliers = {
"us": 0.000001,
"ms": 0.001,
"s": 1.0,
"min": 60.0,
"h": 3600.0,
}
match = re.fullmatch(r"([0-9.]+)\s*([A-Za-z]+)?", value)
if not match:
return 0.0
number, unit = match.groups()
if unit is None:
return float(number) / 1_000_000
return float(number) * multipliers.get(unit, 0.0)
def _parse_bytes(value: str) -> float:
if value in {"", "infinity"}:
return 0.0
try:
return float(value)
except ValueError:
return 0.0
def _show_unit(unit: str) -> dict[str, str]:
    """Query selected properties of *unit* through ``systemctl show``.

    Args:
        unit: Name of the systemd unit to inspect.

    Returns:
        A mapping of property name to the raw text after the first ``=`` on
        each output line.

    Raises:
        subprocess.CalledProcessError: ``systemctl`` exited non-zero.
        subprocess.TimeoutExpired: the command ran longer than 10 seconds.
    """
    wanted_properties = (
        "ActiveState",
        "SubState",
        "NRestarts",
        "WatchdogUSec",
        "CPUQuotaPerSecUSec",
        "MemoryMax",
    )
    command = ["systemctl", "show", unit]
    for prop in wanted_properties:
        command += ["-p", prop]

    completed = subprocess.run(
        command,
        check=True,
        capture_output=True,
        text=True,
        timeout=10,
    )

    # Each output line has the form "Name=value"; split on the first "=" only
    # so that values containing "=" stay intact.
    split_lines = (line.partition("=") for line in completed.stdout.splitlines())
    return {name: raw for name, _, raw in split_lines}
def collect() -> str:
    """Render metrics for every configured unit in Prometheus text format.

    For each distinct name in ``UNIT_NAMES`` the unit is queried with
    ``_show_unit`` and five samples are produced (info, restarts, watchdog,
    CPU quota, memory limit), all labelled with ``host`` and ``unit``.

    The output keeps all samples of one metric together, directly after that
    metric's ``# HELP`` / ``# TYPE`` lines, as the text exposition format
    requires.

    A unit whose query or value parsing fails does not abort the run: it is
    reported as a single ``systemd_unit_info`` sample with
    ``active_state="scrape_error"``, the exception text in ``sub_state`` and
    value ``0``; its other metrics are omitted.

    Returns:
        The complete metrics payload, terminated by a newline.
    """
    # (metric name, metric type, help text), in output order.
    families = (
        (
            "systemd_unit_info",
            "gauge",
            "Systemd unit inventory exposed by the textfile exporter.",
        ),
        (
            "systemd_unit_restarts_total",
            "counter",
            "Systemd unit restart counter from systemctl show NRestarts.",
        ),
        (
            "systemd_unit_watchdog_seconds",
            "gauge",
            "Systemd unit WatchdogSec setting in seconds, 0 when disabled.",
        ),
        (
            "systemd_unit_cpu_quota_cores",
            "gauge",
            "Systemd unit CPUQuota converted to CPU cores, 0 when unlimited.",
        ),
        (
            "systemd_unit_memory_max_bytes",
            "gauge",
            "Systemd unit MemoryMax in bytes, 0 when unlimited.",
        ),
    )
    samples = {name: [] for name, _kind, _help in families}
    host = _escape_label(HOST_LABEL)

    # dict.fromkeys drops repeated unit names (keeping first-seen order) so
    # the same series is never emitted twice.
    for unit in dict.fromkeys(UNIT_NAMES):
        labels = f'host="{host}",unit="{_escape_label(unit)}"'
        try:
            # Value parsing stays inside the try on purpose: one unit with an
            # unexpected property value must not take down the whole export.
            values = _show_unit(unit)
            active_state = values.get("ActiveState", "")
            sub_state = values.get("SubState", "")
            restarts = int(values.get("NRestarts") or 0)
            watchdog_seconds = _parse_usec(values.get("WatchdogUSec", "0"))
            cpu_quota = _parse_usec(values.get("CPUQuotaPerSecUSec", "0"))
            memory_max = _parse_bytes(values.get("MemoryMax", "0"))
        except Exception as exc:
            samples["systemd_unit_info"].append(
                f'systemd_unit_info{{{labels},active_state="scrape_error",'
                f'sub_state="{_escape_label(str(exc))}"}} 0'
            )
            continue

        samples["systemd_unit_info"].append(
            f'systemd_unit_info{{{labels},active_state="{_escape_label(active_state)}",'
            f'sub_state="{_escape_label(sub_state)}"}} 1'
        )
        samples["systemd_unit_restarts_total"].append(
            f"systemd_unit_restarts_total{{{labels}}} {restarts}"
        )
        samples["systemd_unit_watchdog_seconds"].append(
            f"systemd_unit_watchdog_seconds{{{labels}}} {watchdog_seconds:.6f}"
        )
        samples["systemd_unit_cpu_quota_cores"].append(
            f"systemd_unit_cpu_quota_cores{{{labels}}} {cpu_quota:.6f}"
        )
        samples["systemd_unit_memory_max_bytes"].append(
            f"systemd_unit_memory_max_bytes{{{labels}}} {memory_max:.0f}"
        )

    lines = []
    for name, kind, help_text in families:
        lines.append(f"# HELP {name} {help_text}")
        lines.append(f"# TYPE {name} {kind}")
        lines.extend(samples[name])
    return "\n".join(lines) + "\n"
def main() -> None:
    """Collect the unit metrics and publish them atomically.

    The payload is written to a temporary file inside ``TEXTFILE_DIR``, made
    world-readable, and only then renamed over ``OUTPUT_NAME``.  Readers
    therefore see either the previous complete file or the new complete file,
    and never a file they lack permission to open.

    Raises:
        SystemExit: ``AIOPS_SYSTEMD_UNITS`` did not name any unit.
    """
    if not UNIT_NAMES:
        raise SystemExit("AIOPS_SYSTEMD_UNITS is required")
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    published = False
    try:
        # The temp file lives in the target directory so the final rename
        # stays on one filesystem.
        with tempfile.NamedTemporaryFile(
            "w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8"
        ) as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # NamedTemporaryFile creates the file owner-only (0600).  Widen the
        # mode BEFORE the rename so the published file is never unreadable
        # to a reader running as another user.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
        published = True
    finally:
        # delete=False means nothing else removes the temp file; clean it up
        # when writing or renaming failed so stale files do not accumulate.
        if not published and tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass


if __name__ == "__main__":
    main()