Files
awoooi/scripts/ops/docker-stats-textfile-exporter.py
Your Name 7d45f0cb58
Some checks failed
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Has been cancelled
fix(ops): alert on stale gitea actions jobs
2026-05-05 14:42:09 +08:00

162 lines
6.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Docker stats textfile exporter for host-level AIOps baselines.
2026-05-05 ogt + Codex: 110/188 CPU overload follow-up.
Why: cAdvisor v0.47 may not expose per-container restart count and the live
110 scrape currently only exposes the root cgroup. This exporter writes a small
node-exporter textfile so Prometheus can alert on Docker Compose CPU/restarts.
"""
from __future__ import annotations
import json
import os
import re
import subprocess
import tempfile
from datetime import datetime, timezone
from pathlib import Path
# Directory the .prom file is written into; override with the
# NODE_EXPORTER_TEXTFILE_DIR environment variable.
TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
# File name of the published metrics file inside TEXTFILE_DIR.
OUTPUT_NAME = "docker_stats.prom"
# Value of the `host` label attached to every sample; defaults to the kernel
# node name and can be overridden with AIOPS_HOST_LABEL.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Matches a double quote, a backslash or a newline, i.e. the characters that
# get a backslash escape when written into a label value.
LABEL_RE = re.compile(r'["\\\n]')
def _escape_label(value: str) -> str:
    """Return *value* escaped for use inside a double-quoted label value.

    Backslash becomes ``\\\\``, double quote becomes ``\\"`` and a newline
    becomes the two characters ``\\n``; every other character is kept as is.
    """
    # One pass over the string: each code point is looked up in the table and
    # replaced by its escaped form, so already-inserted backslashes are never
    # escaped a second time.
    escapes = {
        ord("\\"): "\\\\",
        ord('"'): '\\"',
        ord("\n"): "\\n",
    }
    return value.translate(escapes)
def _run_json_lines(command: list[str]) -> list[dict]:
    """Run *command* and decode its stdout as newline-delimited JSON.

    Blank and whitespace-only lines are skipped; every other line is parsed
    with ``json.loads`` and the results are returned in output order.

    Raises:
        subprocess.CalledProcessError: the command exited non-zero.
        subprocess.TimeoutExpired: the command ran longer than 30 seconds.
        json.JSONDecodeError: a non-blank line is not valid JSON.
    """
    completed = subprocess.run(
        command,
        check=True,
        capture_output=True,
        text=True,
        timeout=30,
    )
    return [json.loads(raw) for raw in completed.stdout.splitlines() if raw.strip()]
def _run_text_lines(command: list[str]) -> list[str]:
    """Run *command* and return its non-empty stdout lines, whitespace-stripped.

    Raises:
        subprocess.CalledProcessError: the command exited non-zero.
        subprocess.TimeoutExpired: the command ran longer than 30 seconds.
    """
    completed = subprocess.run(
        command,
        check=True,
        capture_output=True,
        text=True,
        timeout=30,
    )
    trimmed = (raw.strip() for raw in completed.stdout.splitlines())
    return [text for text in trimmed if text]
def _cpu_cores(cpu_perc: str) -> float:
    """Convert a CPU percentage string into a number of cores.

    The percentage is divided by 100, so "150.00%" yields 1.5.

    Args:
        cpu_perc: Percentage text such as "12.34%". Surrounding whitespace
            and trailing percent signs are ignored.

    Returns:
        The usage in cores, or 0.0 when the text is empty or not numeric
        (for example a "--" placeholder). Falling back to 0.0 matches the
        other parsers in this file and keeps one unparseable row from
        aborting the whole export.
    """
    text = (cpu_perc or "").strip().rstrip("%")
    try:
        return float(text) / 100.0
    except ValueError:
        return 0.0
def _memory_bytes(value: str) -> float:
    """Parse a human-readable size such as "12.5MiB" into a byte count.

    Args:
        value: A number followed by a unit, with optional whitespace between
            them. Binary units (KiB..PiB) scale by powers of 1024, decimal
            units (kB/KB..PB) by powers of 1000.

    Returns:
        The size in bytes. 0.0 is returned when the text does not look like
        "<number><unit>" or the number is malformed. An unrecognised unit is
        treated as plain bytes.
    """
    match = re.fullmatch(r"([0-9.]+)\s*([A-Za-z]+)", (value or "").strip())
    if not match:
        return 0.0
    number, unit = match.groups()
    scale = {
        "B": 1,
        "KiB": 1024,
        "MiB": 1024**2,
        "GiB": 1024**3,
        "TiB": 1024**4,
        "PiB": 1024**5,
        # Decimal kilobytes are commonly spelled with a lower-case "k".
        "kB": 1000,
        "KB": 1000,
        "MB": 1000**2,
        "GB": 1000**3,
        "TB": 1000**4,
        "PB": 1000**5,
    }.get(unit, 1)
    try:
        return float(number) * scale
    except ValueError:
        # Text like "1.2.3" or "." satisfies the pattern but is not a number.
        return 0.0
def _started_at_seconds(value: str) -> float:
    """Convert an ISO-8601 / RFC 3339 timestamp into Unix seconds (UTC).

    Args:
        value: Timestamp text such as "2026-05-05T06:42:09.123456789Z". The
            fractional part may have any number of digits and the offset may
            be "Z" or "+HH:MM" / "-HH:MM".

    Returns:
        Seconds since the Unix epoch, truncated to microsecond precision.
        0.0 is returned for empty or unparseable input and for timestamps
        before the epoch (for example an all-zero "0001-01-01T00:00:00Z"),
        so 0 consistently means "unknown / never started".
    """
    if not value:
        return 0.0

    def _six_digits(match):
        # datetime.fromisoformat() before Python 3.11 only accepts 3 or 6
        # fractional digits, so truncate or zero-pad to exactly microseconds.
        return "." + match.group(1)[:6].ljust(6, "0")

    normalized = re.sub(
        r"\.(\d+)(?=(?:Z|[+-]\d\d:\d\d)?$)",
        _six_digits,
        value.strip(),
    )
    if normalized.endswith("Z"):
        # A literal "Z" suffix is only understood from Python 3.11 on.
        normalized = normalized[:-1] + "+00:00"
    try:
        seconds = datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp()
    except (ValueError, OverflowError):
        # OverflowError: year-1 timestamps with a positive offset cannot be
        # shifted to UTC.
        return 0.0
    return max(seconds, 0.0)
def collect() -> str:
    """Collect Docker container metrics and render them as Prometheus text.

    Runs ``docker stats --no-stream`` for live usage and ``docker inspect``
    on every name from ``docker ps -a`` for limits, restart count and state.
    Only containers present in the ``docker stats`` output produce samples;
    inspect data for other containers is read but not emitted.

    Returns:
        The payload: for each metric family its ``# HELP`` and ``# TYPE``
        lines followed by all of its samples, terminated by a newline.

    Raises:
        subprocess.CalledProcessError: ``docker stats`` or ``docker ps``
            failed, or ``docker inspect`` failed without printing anything.
        subprocess.TimeoutExpired: a docker command exceeded 30 seconds.
        json.JSONDecodeError: docker printed output that is not valid JSON.
    """
    stats = _run_json_lines(["docker", "stats", "--no-stream", "--format", "{{json .}}"])
    names = _run_text_lines(["docker", "ps", "-a", "--format", "{{.Names}}"])

    inspect_by_name = {}
    if names:
        # A container can be removed between `docker ps` and `docker inspect`
        # (short-lived CI job containers make this likely). In that case
        # inspect exits non-zero but still prints the objects it did find, so
        # a non-zero exit is fatal only when there is no output at all.
        completed = subprocess.run(
            ["docker", "inspect", *names],
            check=False,
            capture_output=True,
            text=True,
            timeout=30,
        )
        if completed.returncode != 0 and not completed.stdout.strip():
            completed.check_returncode()
        for entry in json.loads(completed.stdout):
            if isinstance(entry, dict):
                inspect_by_name[(entry.get("Name") or "").lstrip("/")] = entry

    metrics = (
        ("docker_container_cpu_cores",
         "Current Docker container CPU usage in cores from docker stats."),
        ("docker_container_cpu_limit_cores",
         "Docker container CPU quota in cores, 0 when unlimited."),
        ("docker_container_memory_usage_bytes",
         "Current Docker container memory usage in bytes from docker stats."),
        ("docker_container_memory_limit_bytes",
         "Docker container memory limit in bytes, 0 when unlimited."),
        ("docker_container_pids",
         "Current Docker container process/thread count from docker stats."),
        ("docker_container_inspect_restart_count",
         "Docker container restart count from Docker inspect."),
        ("docker_container_info",
         "Docker container inventory exposed by the textfile exporter."),
        ("docker_container_started_seconds",
         "Docker container start timestamp from Docker inspect."),
    )
    # Samples are buffered per metric so each family is written as one
    # contiguous group, as the Prometheus text format requires.
    samples = {metric: [] for metric, _help in metrics}

    def add(metric, labels, rendered_value):
        samples[metric].append(f"{metric}{{{labels}}} {rendered_value}")

    host = _escape_label(HOST_LABEL)
    for row in stats:
        name = row.get("Name", "")
        if not name:
            continue
        details = inspect_by_name.get(name, {})
        # `or {}` also covers keys that are present but null.
        host_config = details.get("HostConfig") or {}
        state = details.get("State") or {}

        # Placeholder values such as "--" are reported as 0 rather than
        # aborting the whole export.
        try:
            cpu_cores = _cpu_cores(row.get("CPUPerc") or "0%")
        except ValueError:
            cpu_cores = 0.0
        try:
            pids = int(float(row.get("PIDs") or "0"))
        except (TypeError, ValueError, OverflowError):
            pids = 0

        # CPU limit: NanoCpus when set, otherwise CpuQuota / CpuPeriod.
        nano_cpus = float(host_config.get("NanoCpus") or 0)
        if nano_cpus > 0:
            cpu_limit = nano_cpus / 1_000_000_000
        else:
            quota = float(host_config.get("CpuQuota") or 0)
            # A period of 0 means the default CFS period of 100000 microseconds.
            period = float(host_config.get("CpuPeriod") or 0) or 100_000.0
            cpu_limit = quota / period if quota > 0 else 0.0

        memory_limit = float(host_config.get("Memory") or 0)
        restart_count = int(details.get("RestartCount") or 0)
        started_seconds = _started_at_seconds(state.get("StartedAt") or "")
        # MemUsage looks like "<used> / <limit>"; only the used part is needed.
        mem_current = (row.get("MemUsage") or "0 B / 0 B").split("/", 1)[0].strip()

        labels = f'host="{host}",container_name="{_escape_label(name)}"'
        add("docker_container_cpu_cores", labels, f"{cpu_cores:.6f}")
        add("docker_container_cpu_limit_cores", labels, f"{cpu_limit:.6f}")
        add("docker_container_memory_usage_bytes", labels, f"{_memory_bytes(mem_current):.0f}")
        add("docker_container_memory_limit_bytes", labels, f"{memory_limit:.0f}")
        add("docker_container_pids", labels, f"{pids}")
        add("docker_container_inspect_restart_count", labels, f"{restart_count}")
        add("docker_container_started_seconds", labels, f"{started_seconds:.6f}")
        status = _escape_label(state.get("Status") or "")
        add("docker_container_info", f'{labels},status="{status}"', "1")

    lines = []
    for metric, help_text in metrics:
        lines.append(f"# HELP {metric} {help_text}")
        lines.append(f"# TYPE {metric} gauge")
        lines.extend(samples[metric])
    return "\n".join(lines) + "\n"
def main() -> None:
    """Collect the metrics once and atomically publish them as a .prom file.

    The payload is written to a temporary file inside TEXTFILE_DIR and then
    renamed over TEXTFILE_DIR / OUTPUT_NAME, so a reader never sees a
    partially written file. Any exception from collect() propagates before
    anything is written, leaving the previous output untouched.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    published = False
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # The temporary file is created owner-only (0600). Widen the mode
        # BEFORE the rename so the published file is never briefly
        # unreadable for a reader running as another user.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
        published = True
    finally:
        # Do not leave a stray temporary file behind when writing or
        # renaming failed.
        if not published and tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
# Run one collect-and-write pass only when the file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()