Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 39s
148 lines
4.9 KiB
Python
148 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Systemd unit textfile exporter for host-level AIOps baselines.
|
|
|
|
2026-05-05 ogt + Codex: 110 runner overload follow-up.
|
|
Why: GitHub/Gitea runner services are outside Docker metrics. A bad
|
|
WatchdogSec drop-in caused repeated runner restarts on 110, so the AI monitor
|
|
needs direct visibility into systemd restarts, watchdog settings, and limits.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory the metrics file is written into; NODE_EXPORTER_TEXTFILE_DIR
# overrides the built-in default path.
TEXTFILE_DIR = Path(
    os.environ.get(
        "NODE_EXPORTER_TEXTFILE_DIR",
        "/var/lib/node_exporter/textfile_collector",
    )
)

# Name of the metrics file created inside TEXTFILE_DIR.
OUTPUT_NAME = "systemd_units.prom"

# Value of the `host` label on every sample; falls back to this machine's
# node name when AIOPS_HOST_LABEL is not set.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)

# Units to inspect, taken from the comma-separated AIOPS_SYSTEMD_UNITS
# variable. Surrounding whitespace is trimmed and empty entries are dropped.
UNIT_NAMES = [
    name
    for name in (
        piece.strip()
        for piece in os.environ.get("AIOPS_SYSTEMD_UNITS", "").split(",")
    )
    if name
]

# Matches each character that has to be backslash-escaped in a label value:
# double quote, backslash, and newline.
LABEL_RE = re.compile(r'["\\\n]')
|
|
|
|
|
|
def _escape_label(value: str) -> str:
    """Return *value* escaped for use inside a double-quoted label value.

    Each character matched by ``LABEL_RE`` (double quote, backslash, newline)
    is replaced by its two-character backslash escape; everything else is
    passed through unchanged.
    """
    escapes = {
        "\\": r"\\",
        '"': r"\"",
        "\n": r"\n",
    }

    def _replacement(match: "re.Match[str]") -> str:
        # LABEL_RE only ever matches one of the keys above.
        return escapes[match.group(0)]

    return LABEL_RE.sub(_replacement, value)
|
|
|
|
|
|
def _parse_usec(value: str) -> float:
|
|
value = value.strip()
|
|
if value in {"", "0", "infinity"}:
|
|
return 0.0
|
|
multipliers = {
|
|
"us": 0.000001,
|
|
"ms": 0.001,
|
|
"s": 1.0,
|
|
"min": 60.0,
|
|
"h": 3600.0,
|
|
}
|
|
match = re.fullmatch(r"([0-9.]+)\s*([A-Za-z]+)?", value)
|
|
if not match:
|
|
return 0.0
|
|
number, unit = match.groups()
|
|
if unit is None:
|
|
return float(number) / 1_000_000
|
|
return float(number) * multipliers.get(unit, 0.0)
|
|
|
|
|
|
def _parse_bytes(value: str) -> float:
|
|
if value in {"", "infinity"}:
|
|
return 0.0
|
|
try:
|
|
return float(value)
|
|
except ValueError:
|
|
return 0.0
|
|
|
|
|
|
def _show_unit(unit: str) -> dict[str, str]:
    """Query selected properties of *unit* through ``systemctl show``.

    Args:
        unit: Name of the systemd unit to inspect.

    Returns:
        Mapping of property name to raw string value, one entry per
        ``KEY=VALUE`` line of the command's standard output.

    Raises:
        subprocess.CalledProcessError: ``systemctl`` exited non-zero.
        subprocess.TimeoutExpired: ``systemctl`` ran longer than 10 seconds.
    """
    properties = (
        "ActiveState",
        "SubState",
        "NRestarts",
        "WatchdogUSec",
        "CPUQuotaPerSecUSec",
        "MemoryMax",
    )
    # Build: systemctl show <unit> -p Prop1 -p Prop2 ...
    command = ["systemctl", "show", unit]
    for prop in properties:
        command += ["-p", prop]

    completed = subprocess.run(
        command,
        check=True,
        capture_output=True,
        text=True,
        timeout=10,
    )

    # Split each output line on the first "=" only; values may contain "=".
    pairs = (row.partition("=") for row in completed.stdout.splitlines())
    return {key: raw for key, _sep, raw in pairs}
|
|
|
|
|
|
def collect() -> str:
    """Render the metrics for every unit in ``UNIT_NAMES`` as exposition text.

    The output starts with the HELP/TYPE header for each metric family,
    followed by the samples of each unit in turn. When querying a unit raises,
    that unit contributes a single ``systemd_unit_info`` sample with value 0,
    ``active_state="scrape_error"`` and the exception text in ``sub_state``;
    the remaining units are still processed.

    Returns:
        The complete file content, terminated by a newline.
    """
    out = [
        "# HELP systemd_unit_info Systemd unit inventory exposed by the textfile exporter.",
        "# TYPE systemd_unit_info gauge",
        "# HELP systemd_unit_restarts_total Systemd unit restart counter from systemctl show NRestarts.",
        "# TYPE systemd_unit_restarts_total counter",
        "# HELP systemd_unit_watchdog_seconds Systemd unit WatchdogSec setting in seconds, 0 when disabled.",
        "# TYPE systemd_unit_watchdog_seconds gauge",
        "# HELP systemd_unit_cpu_quota_cores Systemd unit CPUQuota converted to CPU cores, 0 when unlimited.",
        "# TYPE systemd_unit_cpu_quota_cores gauge",
        "# HELP systemd_unit_memory_max_bytes Systemd unit MemoryMax in bytes, 0 when unlimited.",
        "# TYPE systemd_unit_memory_max_bytes gauge",
    ]

    for unit in UNIT_NAMES:
        # The host/unit label pair is shared by every sample of this unit.
        labels = f'host="{_escape_label(HOST_LABEL)}",unit="{_escape_label(unit)}"'

        try:
            props = _show_unit(unit)
        except Exception as exc:
            out.append(f'systemd_unit_info{{{labels},active_state="scrape_error",sub_state="{_escape_label(str(exc))}"}} 0')
            continue

        state = props.get("ActiveState", "")
        detail = props.get("SubState", "")
        # "or 0" also covers an empty NRestarts value.
        restart_count = int(props.get("NRestarts") or 0)
        watchdog = _parse_usec(props.get("WatchdogUSec", "0"))
        # CPU time allowed per wall-clock second, i.e. a number of cores.
        quota_cores = _parse_usec(props.get("CPUQuotaPerSecUSec", "0"))
        mem_limit = _parse_bytes(props.get("MemoryMax", "0"))

        out.extend(
            [
                f'systemd_unit_info{{{labels},active_state="{_escape_label(state)}",sub_state="{_escape_label(detail)}"}} 1',
                f"systemd_unit_restarts_total{{{labels}}} {restart_count}",
                f"systemd_unit_watchdog_seconds{{{labels}}} {watchdog:.6f}",
                f"systemd_unit_cpu_quota_cores{{{labels}}} {quota_cores:.6f}",
                f"systemd_unit_memory_max_bytes{{{labels}}} {mem_limit:.0f}",
            ]
        )

    return "\n".join(out) + "\n"
|
|
|
|
|
|
def main() -> None:
    """Collect the unit metrics and publish them atomically.

    The payload is written to a temporary file inside ``TEXTFILE_DIR`` and
    then renamed over ``OUTPUT_NAME``, so a reader never sees a half-written
    file. The temporary file is made world-readable *before* the rename;
    ``NamedTemporaryFile`` creates it owner-only, and changing the mode after
    the rename would leave a window in which a scraper running as another user
    cannot read the published file. If anything fails before the rename, the
    temporary file is removed instead of being left behind.

    Raises:
        SystemExit: ``AIOPS_SYSTEMD_UNITS`` named no units.
        OSError: The directory or file could not be created, written or
            renamed.
    """
    if not UNIT_NAMES:
        raise SystemExit("AIOPS_SYSTEMD_UNITS is required")

    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    published = False
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # Set the final mode first so the file is readable the moment it
        # appears under its published name.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
        published = True
    finally:
        if not published and tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass
|
|
|
|
|
|
# Run the exporter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|