awoooi/scripts/ops/host-sustained-load-evidence.py

#!/usr/bin/env python3
"""Build sanitized evidence for unknown sustained host load.

This collector is read-only. It intentionally emits process families and
container names instead of raw command lines so CPU-pressure alerts can proceed
to a source-specific PlayBook without leaking workspace paths, URLs, JSON
payloads, or secrets.
"""

from __future__ import annotations

import argparse
import json
import os
import re
import subprocess
import time
from pathlib import Path
from typing import Any


DEFAULT_HOST_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS = 300
SCHEMA_VERSION = "host_sustained_load_sanitized_evidence_v1"
LABEL_RE = re.compile(r"(?P<key>[A-Za-z_][A-Za-z0-9_]*)=\"(?P<value>(?:[^\"\\\\]|\\\\.)*)\"")
METRIC_RE = re.compile(
    r"^(?P<name>[A-Za-z_:][A-Za-z0-9_:]*)(?:\{(?P<labels>[^}]*)\})?\s+"
    r"(?P<value>[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)$"
)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Collect sanitized sustained-load evidence.")
    parser.add_argument("--host", default=os.environ.get("AIOPS_HOST_LABEL", "110"))
    parser.add_argument("--metrics-file", type=Path, default=DEFAULT_HOST_METRICS_FILE)
    parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE)
    parser.add_argument(
        "--docker-stats-max-age-seconds",
        type=int,
        default=DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS,
    )
    parser.add_argument("--ps-file", type=Path)
    parser.add_argument("--top-n", type=int, default=8)
    parser.add_argument("--json", action="store_true")
    return parser.parse_args()


def _unescape_label(value: str) -> str:
    return value.replace(r"\"", '"').replace(r"\\", "\\").replace(r"\n", "\n")


def parse_prometheus_text(text: str) -> list[dict[str, Any]]:
    samples: list[dict[str, Any]] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        match = METRIC_RE.match(line)
        if not match:
            continue
        labels = {
            item.group("key"): _unescape_label(item.group("value"))
            for item in LABEL_RE.finditer(match.group("labels") or "")
        }
        samples.append(
            {
                "name": match.group("name"),
                "labels": labels,
                "value": float(match.group("value")),
            }
        )
    return samples


def _sample_value_any(samples: list[dict[str, Any]], name: str) -> float | None:
    for sample in samples:
        if sample["name"] == name:
            return float(sample["value"])
    return None


def _textfile_mtime_seconds(samples: list[dict[str, Any]], suffix: str) -> float | None:
    for sample in samples:
        if sample["name"] != "node_textfile_mtime_seconds":
            continue
        file_label = str(sample["labels"].get("file") or "")
        if file_label.endswith(suffix):
            return float(sample["value"])
    return None


def docker_stats_freshness(
    *,
    samples: list[dict[str, Any]],
    docker_stats_file: Path,
    max_age_seconds: int,
) -> dict[str, Any]:
    mtime = _textfile_mtime_seconds(samples, "docker_stats.prom")
    now = _sample_value_any(samples, "node_time_seconds")
    source = "node_textfile_mtime_seconds"
    if mtime is None:
        try:
            mtime = docker_stats_file.stat().st_mtime
            now = time.time()
            source = "file_stat_mtime"
        except FileNotFoundError:
            return {
                "fresh": False,
                "age_seconds": None,
                "max_age_seconds": max_age_seconds,
                "source": "missing",
            }
    if now is None:
        now = time.time()
    age_seconds = max(0, int(now - mtime))
    return {
        "fresh": age_seconds <= max_age_seconds,
        "age_seconds": age_seconds,
        "max_age_seconds": max_age_seconds,
        "source": source,
    }


def read_text(path: Path | None) -> str:
    if path is None:
        return ""
    try:
        return path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return ""


def collect_ps_text(ps_file: Path | None) -> str:
    if ps_file is not None:
        return read_text(ps_file)
    result = subprocess.run(
        ["ps", "-eo", "pid=,ppid=,pgid=,etimes=,pcpu=,pmem=,comm=,args="],
        check=True,
        capture_output=True,
        text=True,
        timeout=10,
    )
    return result.stdout


def parse_ps_text(text: str) -> list[dict[str, Any]]:
    rows: list[dict[str, Any]] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        parts = line.split(None, 7)
        if len(parts) < 7:
            continue
        pid, ppid, pgid, etimes, pcpu, pmem, comm = parts[:7]
        args = parts[7] if len(parts) > 7 else comm
        try:
            rows.append(
                {
                    "pid": int(pid),
                    "ppid": int(ppid),
                    "pgid": int(pgid),
                    "etimes": int(float(etimes)),
                    "cpu_percent": float(pcpu),
                    "mem_percent": float(pmem),
                    "comm": Path(comm).name[:48],
                    "family": classify_process_family(comm, args),
                }
            )
        except ValueError:
            continue
    return rows


def classify_process_family(comm: str, args: str) -> str:
    text = f"{comm} {args}".lower()
    if "act_runner" in text or "gitea-actions-task" in text or "/.cache/act/" in text:
        return "gitea_actions_runner"
    if "docker build" in text or "buildx" in text or "buildkit" in text:
        return "docker_build"
    if "next build" in text or "turbo build" in text or "pnpm" in text and " build" in text:
        return "web_build"
    if "chrome" in text or "chromium" in text or "playwright" in text:
        return "headless_browser"
    if "gitea" in text:
        return "gitea_service"
    if "postgres" in text or "postmaster" in text:
        return "postgres"
    if "clickhouse" in text:
        return "clickhouse"
    if "kafka" in text:
        return "kafka"
    if "sentry" in text:
        return "sentry"
    if "systemctl" in text or "systemd" in text or "dbus" in text:
        return "systemd_control_plane"
    if "sshd" in text:
        return "ssh_control_plane"
    if "python" in text:
        return "python_job"
    if "node" in text:
        return "node_service"
    return "unknown"


def summarize_processes(rows: list[dict[str, Any]], *, top_n: int) -> dict[str, Any]:
    top_rows = sorted(rows, key=lambda item: (-item["cpu_percent"], item["comm"], item["pid"]))[:top_n]
    families: dict[str, dict[str, Any]] = {}
    for row in rows:
        family = row["family"]
        current = families.setdefault(
            family,
            {
                "family": family,
                "process_count": 0,
                "cpu_percent": 0.0,
                "max_age_seconds": 0,
                "sample_comm": "",
            },
        )
        current["process_count"] += 1
        current["cpu_percent"] += row["cpu_percent"]
        current["max_age_seconds"] = max(current["max_age_seconds"], row["etimes"])
        if not current["sample_comm"] or row["cpu_percent"] > current.get("_sample_cpu", -1):
            current["sample_comm"] = row["comm"]
            current["_sample_cpu"] = row["cpu_percent"]

    family_rows = []
    for item in families.values():
        item.pop("_sample_cpu", None)
        item["cpu_percent"] = round(float(item["cpu_percent"]), 3)
        family_rows.append(item)

    return {
        "top_processes": [
            {
                "pid": row["pid"],
                "ppid": row["ppid"],
                "pgid": row["pgid"],
                "cpu_percent": round(row["cpu_percent"], 3),
                "mem_percent": round(row["mem_percent"], 3),
                "age_seconds": row["etimes"],
                "comm": row["comm"],
                "family": row["family"],
            }
            for row in top_rows
        ],
        "families": sorted(family_rows, key=lambda item: (-item["cpu_percent"], item["family"]))[:top_n],
    }


def top_docker_containers(samples: list[dict[str, Any]], *, host: str, top_n: int) -> list[dict[str, Any]]:
    rows = []
    for sample in samples:
        if sample["name"] != "docker_container_cpu_cores":
            continue
        labels = sample["labels"]
        if labels.get("host", host) != host:
            continue
        rows.append(
            {
                "container_name": labels.get("container_name") or labels.get("name") or "unknown",
                "cpu_cores": round(float(sample["value"]), 6),
            }
        )
    return sorted(rows, key=lambda item: (-item["cpu_cores"], item["container_name"]))[:top_n]


def recommend_playbook(process_families: list[dict[str, Any]], containers: list[dict[str, Any]]) -> str:
    top_container = containers[0] if containers else {}
    top_container_name = str(top_container.get("container_name") or "").lower()
    top_container_cpu = float(top_container.get("cpu_cores") or 0.0)
    top_family = process_families[0] if process_families else {}
    family = str(top_family.get("family") or "")

    if "gitea" in top_container_name and top_container_cpu >= 2.0:
        return "gitea_queue_or_hook_backlog_playbook"
    if "postgres" in top_container_name or "postgres" in family:
        return "postgres_hot_query_or_backup_export_playbook"
    if family in {"docker_build", "web_build", "gitea_actions_runner"}:
        return "build_or_runner_pressure_playbook"
    if family in {"systemd_control_plane", "ssh_control_plane"}:
        return "control_plane_saturation_playbook"
    if family == "headless_browser":
        return "orphan_browser_classification_refresh_playbook"
    return "source_specific_playbook_required"


def build_payload(args: argparse.Namespace) -> dict[str, Any]:
    host_samples = parse_prometheus_text(read_text(args.metrics_file))
    docker_samples = parse_prometheus_text(read_text(args.docker_stats_file))
    docker_stats_status = docker_stats_freshness(
        samples=host_samples,
        docker_stats_file=args.docker_stats_file,
        max_age_seconds=args.docker_stats_max_age_seconds,
    )
    process_summary = summarize_processes(parse_ps_text(collect_ps_text(args.ps_file)), top_n=args.top_n)
    untrusted_containers = top_docker_containers(docker_samples, host=args.host, top_n=args.top_n)
    containers = untrusted_containers if docker_stats_status.get("fresh") is True else []
    recommendation = recommend_playbook(process_summary["families"], containers)

    return {
        "schema_version": SCHEMA_VERSION,
        "host": args.host,
        "mode": "read_only_sanitized_evidence",
        "recommendation": recommendation,
        "controlled_apply_allowed": False,
        "next_action": "select_or_generate_source_specific_playbook_then_run_check_mode",
        "readback": {
            "host_metric_sample_count": len(host_samples),
            "docker_metric_sample_count": len(docker_samples),
            "docker_stats": docker_stats_status,
            "top_container_count": len(containers),
            "top_process_family_count": len(process_summary["families"]),
        },
        "top_containers": containers,
        "top_containers_untrusted": untrusted_containers,
        "top_process_families": process_summary["families"],
        "top_processes_sanitized": process_summary["top_processes"],
        "redaction": {
            "raw_command_lines_emitted": False,
            "workspace_paths_emitted": False,
            "urls_emitted": False,
            "secret_values_read": False,
        },
        "operation_boundaries": {
            "host_write_performed": False,
            "process_signal_performed": False,
            "docker_restart_performed": False,
            "systemd_restart_performed": False,
            "raw_session_read": False,
            "raw_runner_registration_read": False,
        },
    }


def main() -> int:
    args = parse_args()
    payload = build_payload(args)
    if args.json:
        print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))
    else:
        print(f"recommendation={payload['recommendation']}")
        print(f"controlled_apply_allowed={str(payload['controlled_apply_allowed']).lower()}")
        print(f"next_action={payload['next_action']}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())