Some checks failed
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Has been cancelled
162 lines
6.2 KiB
Python
Executable File
162 lines
6.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Docker stats textfile exporter for host-level AIOps baselines.

2026-05-05 ogt + Codex: 110/188 CPU overload follow-up.

Why: cAdvisor v0.47 may not expose per-container restart count and the live
110 scrape currently only exposes the root cgroup. This exporter writes a small
node-exporter textfile so Prometheus can alert on Docker Compose CPU/restarts.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import tempfile
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory scanned by node_exporter's textfile collector; overridable through
# the NODE_EXPORTER_TEXTFILE_DIR environment variable.
TEXTFILE_DIR = Path(
    os.getenv("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector")
)

# File name of the metrics file written inside TEXTFILE_DIR.
OUTPUT_NAME = "docker_stats.prom"

# Value of the `host` label on every sample; defaults to this machine's node
# name unless AIOPS_HOST_LABEL is set.
HOST_LABEL = os.getenv("AIOPS_HOST_LABEL", os.uname().nodename)
|
|
LABEL_RE = re.compile(r'["\\\n]')
|
|
|
|
|
|
def _escape_label(value: str) -> str:
|
|
return LABEL_RE.sub(lambda m: {"\n": r"\n", "\\": r"\\", '"': r"\""}[m.group(0)], value)
|
|
|
|
|
|
def _run_json_lines(command: list[str]) -> list[dict]:
|
|
result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30)
|
|
rows: list[dict] = []
|
|
for line in result.stdout.splitlines():
|
|
if not line.strip():
|
|
continue
|
|
rows.append(json.loads(line))
|
|
return rows
|
|
|
|
|
|
def _run_text_lines(command: list[str]) -> list[str]:
|
|
result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30)
|
|
return [line.strip() for line in result.stdout.splitlines() if line.strip()]
|
|
|
|
|
|
def _cpu_cores(cpu_perc: str) -> float:
|
|
return float(cpu_perc.strip().rstrip("%")) / 100.0
|
|
|
|
|
|
def _memory_bytes(value: str) -> float:
|
|
raw = value.strip()
|
|
match = re.fullmatch(r"([0-9.]+)\s*([A-Za-z]+)", raw)
|
|
if not match:
|
|
return 0.0
|
|
number, unit = match.groups()
|
|
scale = {
|
|
"B": 1,
|
|
"KiB": 1024,
|
|
"MiB": 1024**2,
|
|
"GiB": 1024**3,
|
|
"TiB": 1024**4,
|
|
"KB": 1000,
|
|
"MB": 1000**2,
|
|
"GB": 1000**3,
|
|
"TB": 1000**4,
|
|
}.get(unit, 1)
|
|
return float(number) * scale
|
|
|
|
|
|
def _started_at_seconds(value: str) -> float:
|
|
if not value:
|
|
return 0.0
|
|
try:
|
|
normalized = re.sub(r"\.(\d{6})\d+(Z|[+-]\d\d:\d\d)$", r".\1\2", value)
|
|
normalized = normalized.replace("Z", "+00:00")
|
|
return datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp()
|
|
except ValueError:
|
|
return 0.0
|
|
|
|
|
|
def _inspect_containers(names: list[str]) -> dict[str, dict]:
    """Return ``docker inspect`` records keyed by container name without the leading "/".

    A container may be removed between the ``docker ps`` listing and this call.
    In that case ``docker inspect`` is expected to exit non-zero while still
    printing the objects it did find, so a non-zero exit is tolerated as long
    as stdout is parseable JSON.

    Raises:
        subprocess.CalledProcessError: Non-zero exit and no usable JSON output.
        json.JSONDecodeError: Zero exit but stdout was not valid JSON.
        subprocess.TimeoutExpired: The command ran longer than 30 seconds.
    """
    if not names:
        return {}
    result = subprocess.run(
        ["docker", "inspect", *names],
        check=False,
        capture_output=True,
        text=True,
        timeout=30,
    )
    try:
        inspected = json.loads(result.stdout)
    except json.JSONDecodeError:
        result.check_returncode()  # surfaces the docker failure when there was one
        raise
    return {
        row.get("Name", "").lstrip("/"): row
        for row in inspected
        if isinstance(row, dict)
    }


def _pid_count(value) -> int:
    """Parse a PIDs cell, returning 0 for placeholders such as ``"--"``."""
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return 0


def _cpu_limit_cores(host_config: dict) -> float:
    """Return the configured CPU limit in cores, or 0.0 when unlimited.

    ``NanoCpus`` wins when set; otherwise the limit is derived from
    ``CpuQuota / CpuPeriod``, using Docker's documented default period of
    100000 microseconds when ``CpuPeriod`` is unset.
    """
    nano_cpus = float(host_config.get("NanoCpus") or 0)
    if nano_cpus > 0:
        return nano_cpus / 1_000_000_000
    quota = float(host_config.get("CpuQuota") or 0)
    if quota > 0:
        period = float(host_config.get("CpuPeriod") or 0) or 100_000.0
        return quota / period
    return 0.0


def collect() -> str:
    """Build the textfile payload from ``docker stats`` and ``docker inspect``.

    Returns:
        Metric text ending in a newline. Each metric's HELP and TYPE lines are
        immediately followed by all of its samples, so every metric forms one
        contiguous group as the text exposition format requires.

    Raises:
        subprocess.CalledProcessError, subprocess.TimeoutExpired,
        json.JSONDecodeError: Propagated from the underlying docker calls.
    """
    stats = _run_json_lines(["docker", "stats", "--no-stream", "--format", "{{json .}}"])
    names = _run_text_lines(["docker", "ps", "-a", "--format", "{{.Names}}"])
    inspect_by_name = _inspect_containers(names)

    # (metric name, HELP text) in emission order; every metric is a gauge.
    families = (
        ("docker_container_cpu_cores", "Current Docker container CPU usage in cores from docker stats."),
        ("docker_container_cpu_limit_cores", "Docker container CPU quota in cores, 0 when unlimited."),
        ("docker_container_memory_usage_bytes", "Current Docker container memory usage in bytes from docker stats."),
        ("docker_container_memory_limit_bytes", "Docker container memory limit in bytes, 0 when unlimited."),
        ("docker_container_pids", "Current Docker container process/thread count from docker stats."),
        ("docker_container_inspect_restart_count", "Docker container restart count from Docker inspect."),
        ("docker_container_info", "Docker container inventory exposed by the textfile exporter."),
        ("docker_container_started_seconds", "Docker container start timestamp from Docker inspect."),
    )
    samples = {metric: [] for metric, _ in families}
    host_label = _escape_label(HOST_LABEL)

    # NOTE(review): only rows returned by `docker stats` are emitted. Without
    # --all that is running containers only, so inspect data gathered for
    # stopped containers via `docker ps -a` is never exported. Confirm intended.
    for row in stats:
        name = row.get("Name", "")
        if not name:
            continue
        # Containers missing from the inspect result fall back to zero/empty values.
        inspected = inspect_by_name.get(name) or {}
        host_config = inspected.get("HostConfig") or {}
        state = inspected.get("State") or {}
        memory_limit = float(host_config.get("Memory") or 0)
        restart_count = int(inspected.get("RestartCount") or 0)
        started_seconds = _started_at_seconds(state.get("StartedAt") or "")
        status = _escape_label(state.get("Status") or "")
        labels = f'host="{host_label}",container_name="{_escape_label(name)}"'
        # MemUsage looks like "<used> / <limit>"; only the used part is needed here.
        mem_current = (row.get("MemUsage") or "0 B / 0 B").split("/", 1)[0].strip()

        rendered = {
            "docker_container_cpu_cores": f"{_cpu_cores(row.get('CPUPerc') or '0%'):.6f}",
            "docker_container_cpu_limit_cores": f"{_cpu_limit_cores(host_config):.6f}",
            "docker_container_memory_usage_bytes": f"{_memory_bytes(mem_current):.0f}",
            "docker_container_memory_limit_bytes": f"{memory_limit:.0f}",
            "docker_container_pids": f"{_pid_count(row.get('PIDs') or '0')}",
            "docker_container_inspect_restart_count": f"{restart_count}",
            "docker_container_started_seconds": f"{started_seconds:.6f}",
        }
        for metric, text in rendered.items():
            samples[metric].append(f"{metric}{{{labels}}} {text}")
        samples["docker_container_info"].append(
            f'docker_container_info{{{labels},status="{status}"}} 1'
        )

    lines = []
    for metric, help_text in families:
        lines.append(f"# HELP {metric} {help_text}")
        lines.append(f"# TYPE {metric} gauge")
        lines.extend(samples[metric])
    return "\n".join(lines) + "\n"
|
|
|
|
|
|
def main() -> None:
    """Collect the metrics and atomically publish them as TEXTFILE_DIR/OUTPUT_NAME.

    The payload is written to a temporary file in the same directory and then
    renamed over the target, so a reader never observes a half-written file.

    Raises:
        OSError: The directory, temporary file or rename could not be handled.
        Exception: Anything raised by ``collect()`` propagates unchanged.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # Temporary files are created readable by their owner only. Widen the
        # mode before the rename so the published file is never briefly
        # unreadable to a collector running as a different user.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave stray temp files behind in the collector directory.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise
|
|
|
|
|
|
# Run one export cycle when executed as a script; importing the module has no
# side effects beyond evaluating the constants above.
if __name__ == "__main__":
    main()
|