#!/usr/bin/env python3 """ Docker stats textfile exporter for host-level AIOps baselines. 2026-05-05 ogt + Codex: 110/188 CPU overload follow-up. Why: cAdvisor v0.47 may not expose per-container restart count and the live 110 scrape currently only exposes the root cgroup. This exporter writes a small node-exporter textfile so Prometheus can alert on Docker Compose CPU/restarts. """ from __future__ import annotations import json import os import re import subprocess import tempfile from datetime import datetime, timezone from pathlib import Path TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector")) OUTPUT_NAME = "docker_stats.prom" HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename) LABEL_RE = re.compile(r'["\\\n]') def _escape_label(value: str) -> str: return LABEL_RE.sub(lambda m: {"\n": r"\n", "\\": r"\\", '"': r"\""}[m.group(0)], value) def _run_json_lines(command: list[str]) -> list[dict]: result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30) rows: list[dict] = [] for line in result.stdout.splitlines(): if not line.strip(): continue rows.append(json.loads(line)) return rows def _run_text_lines(command: list[str]) -> list[str]: result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30) return [line.strip() for line in result.stdout.splitlines() if line.strip()] def _cpu_cores(cpu_perc: str) -> float: return float(cpu_perc.strip().rstrip("%")) / 100.0 def _memory_bytes(value: str) -> float: raw = value.strip() match = re.fullmatch(r"([0-9.]+)\s*([A-Za-z]+)", raw) if not match: return 0.0 number, unit = match.groups() scale = { "B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3, "TiB": 1024**4, "KB": 1000, "MB": 1000**2, "GB": 1000**3, "TB": 1000**4, }.get(unit, 1) return float(number) * scale def _started_at_seconds(value: str) -> float: if not value: return 0.0 try: normalized = re.sub(r"\.(\d{6})\d+(Z|[+-]\d\d:\d\d)$", r".\1\2", value) normalized = normalized.replace("Z", "+00:00") return datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp() except ValueError: return 0.0 def collect() -> str: stats = _run_json_lines([ "docker", "stats", "--no-stream", "--format", "{{json .}}", ]) names = _run_text_lines(["docker", "ps", "-a", "--format", "{{.Names}}"]) inspect_by_name = {} if names: inspected = json.loads(subprocess.run( ["docker", "inspect", *names], check=True, capture_output=True, text=True, timeout=30, ).stdout) inspect_by_name = {row.get("Name", "").lstrip("/"): row for row in inspected} lines = [ "# HELP docker_container_cpu_cores Current Docker container CPU usage in cores from docker stats.", "# TYPE docker_container_cpu_cores gauge", "# HELP docker_container_cpu_limit_cores Docker container CPU quota in cores, 0 when unlimited.", "# TYPE docker_container_cpu_limit_cores gauge", "# HELP docker_container_memory_usage_bytes Current Docker container memory usage in bytes from docker stats.", "# TYPE docker_container_memory_usage_bytes gauge", "# HELP docker_container_memory_limit_bytes Docker container memory limit in bytes, 0 when unlimited.", "# TYPE docker_container_memory_limit_bytes gauge", "# HELP docker_container_pids Current Docker container process/thread count from docker stats.", "# TYPE docker_container_pids gauge", "# HELP docker_container_inspect_restart_count Docker container restart count from Docker inspect.", "# TYPE docker_container_inspect_restart_count gauge", "# HELP docker_container_info Docker container inventory exposed by the textfile exporter.", "# TYPE docker_container_info gauge", "# HELP docker_container_started_seconds Docker container start timestamp from Docker inspect.", "# TYPE docker_container_started_seconds gauge", ] for row in stats: name = row.get("Name", "") if not name: continue inspected = inspect_by_name.get(name, {}) host_config = inspected.get("HostConfig", {}) if isinstance(inspected, dict) else {} state = inspected.get("State", {}) if isinstance(inspected, dict) else {} nano_cpus = float(host_config.get("NanoCpus") or 0) memory_limit = float(host_config.get("Memory") or 0) restart_count = int(inspected.get("RestartCount") or 0) started_seconds = _started_at_seconds(state.get("StartedAt", "")) labels = f'host="{_escape_label(HOST_LABEL)}",container_name="{_escape_label(name)}"' mem_current = (row.get("MemUsage") or "0 B / 0 B").split("/", 1)[0].strip() pids = row.get("PIDs") or "0" lines.append(f"docker_container_cpu_cores{{{labels}}} {_cpu_cores(row.get('CPUPerc', '0%')):.6f}") lines.append(f"docker_container_cpu_limit_cores{{{labels}}} {nano_cpus / 1_000_000_000:.6f}") lines.append(f"docker_container_memory_usage_bytes{{{labels}}} {_memory_bytes(mem_current):.0f}") lines.append(f"docker_container_memory_limit_bytes{{{labels}}} {memory_limit:.0f}") lines.append(f"docker_container_pids{{{labels}}} {int(float(pids))}") lines.append(f"docker_container_inspect_restart_count{{{labels}}} {restart_count}") lines.append(f"docker_container_started_seconds{{{labels}}} {started_seconds:.6f}") lines.append( f'docker_container_info{{{labels},status="{_escape_label(state.get("Status", ""))}"}} 1' ) return "\n".join(lines) + "\n" def main() -> None: TEXTFILE_DIR.mkdir(parents=True, exist_ok=True) payload = collect() with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp: tmp.write(payload) tmp_path = Path(tmp.name) output_path = TEXTFILE_DIR / OUTPUT_NAME tmp_path.replace(output_path) output_path.chmod(0o644) if __name__ == "__main__": main()