#!/usr/bin/env python3
"""
Host runaway process textfile exporter for AWOOOI AIOps.

This exporter is read-only. It classifies orphaned headless browser/smoke
process groups separately from legitimate Gitea Actions load so host CPU alerts
can point to a concrete PlayBook instead of a generic "high CPU" symptom.
"""

from __future__ import annotations

import argparse
import os
import re
import subprocess
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
OUTPUT_NAME = "host_runaway_process.prom"
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Characters that must be backslash-escaped inside a metric label value.
LABEL_RE = re.compile(r'["\\\n]')
GITEA_ACTION_CONTAINER_RE = re.compile(
    r"^(?:GITEA-ACTIONS-|[A-Za-z0-9][A-Za-z0-9_.-]*-(?:cd|code-review)-[0-9]+-)"
)

# Replacement text for each character matched by LABEL_RE.
_LABEL_ESCAPES = {"\n": r"\n", "\\": r"\\", '"': r"\""}


@dataclass(frozen=True)
class ProcessRow:
    """One parsed line of ``ps`` output."""

    pid: int
    ppid: int
    pgid: int
    sid: int
    etimes: int  # elapsed time in seconds (normalised by elapsed_to_seconds)
    pcpu: float
    stat: str
    comm: str
    args: str


@dataclass(frozen=True)
class RunawayRule:
    """A rule matches a process when BOTH patterns hit ``"<comm> <args>"``."""

    rule_id: str
    command_pattern: re.Pattern[str]
    context_pattern: re.Pattern[str]


@dataclass(frozen=True)
class ProcessGroup:
    """An orphaned process group that matched a rule and passed the thresholds."""

    rule_id: str
    pgid: int
    rows: tuple[ProcessRow, ...]
    cpu_percent: float  # sum of pcpu over ``rows``
    oldest_age_seconds: int  # max etimes over ``rows``
    orphan_reason: str
    sample_comm: str  # alphabetically first comm in the group, cut to 48 chars


@dataclass(frozen=True)
class ActiveCiLoad:
    """Aggregate of processes recognised as active Gitea Actions / build work."""

    group_count: int
    process_count: int
    cpu_percent: float
    oldest_age_seconds: int


# Order matters: matching_rule returns the first rule that matches.
DEFAULT_RULES = (
    RunawayRule(
        "stockplatform_headless_smoke",
        re.compile(r"(chrome|chromium|playwright)", re.IGNORECASE),
        re.compile(r"stockplatform-review-bulk-ux|/tmp/stockplatform", re.IGNORECASE),
    ),
    RunawayRule(
        "headless_browser_smoke",
        re.compile(r"(chrome|chromium|playwright)", re.IGNORECASE),
        re.compile(r"--headless|--user-data-dir=/tmp|/tmp/.*(smoke|ux|playwright)", re.IGNORECASE),
    ),
)

GITEA_ACTION_PROCESS_RE = re.compile(
    r"(/\.cache/act/|/home/wooo/\.cache/act/|\bdocker build\b|\bdocker-buildx\b|"
    r"\bbuildx build\b|\bpnpm turbo build\b|\bturbo build\b|\bnext build\b)"
)
HOST_PRESSURE_GATE_RE = re.compile(r"wait-host-web-build-pressure\.sh|awoooi-wait-host-web-build-pressure\.sh")

# (metric name, help text) for every gauge the exporter emits, in output order.
_GAUGE_HELP = (
    ("awoooi_host_runaway_process_monitor_up", "Whether the host runaway process exporter completed."),
    ("awoooi_host_runaway_process_last_run_timestamp", "Unix timestamp of the last exporter run."),
    (
        "awoooi_host_runaway_browser_orphan_group_count",
        "Count of orphaned browser/smoke process groups above thresholds.",
    ),
    (
        "awoooi_host_runaway_browser_orphan_process_count",
        "Count of orphaned browser/smoke processes above thresholds.",
    ),
    (
        "awoooi_host_runaway_browser_orphan_cpu_percent",
        "Sum CPU percent for orphaned browser/smoke process groups above thresholds.",
    ),
    (
        "awoooi_host_runaway_browser_orphan_oldest_age_seconds",
        "Oldest age of matching orphaned process groups.",
    ),
    (
        "awoooi_host_runaway_browser_orphan_group_cpu_percent",
        "CPU percent for an individual orphaned browser/smoke process group.",
    ),
    (
        "awoooi_host_runaway_browser_orphan_group_info",
        "Metadata for an individual orphaned browser/smoke process group.",
    ),
    (
        "awoooi_host_gitea_actions_active_container_count",
        "Active Gitea Actions task containers visible on the host, -1 when Docker is unavailable.",
    ),
    (
        "awoooi_host_gitea_actions_active_process_group_count",
        "Active Gitea Actions or BuildKit process groups visible on the host.",
    ),
    (
        "awoooi_host_gitea_actions_active_process_count",
        "Active Gitea Actions or BuildKit processes visible on the host.",
    ),
    (
        "awoooi_host_gitea_actions_active_process_cpu_percent",
        "CPU percent used by active Gitea Actions or BuildKit processes.",
    ),
    (
        "awoooi_host_gitea_actions_active_process_oldest_age_seconds",
        "Oldest active Gitea Actions or BuildKit process age.",
    ),
    ("awoooi_host_load5_per_core", "Host load5 divided by CPU core count."),
    ("awoooi_host_swap_used_ratio", "Host swap used ratio from /proc/meminfo."),
    (
        "awoooi_host_runaway_process_remediation_authorized",
        "Static guardrail: remediation is not authorized by this exporter.",
    ),
)


def escape_label(value: str) -> str:
    """Backslash-escape newline, backslash and double quote in a label value."""
    return LABEL_RE.sub(lambda m: _LABEL_ESCAPES[m.group(0)], value)


def run_text(command: list[str], timeout: int = 20) -> str:
    """Run *command* and return its stdout as text.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status and
    ``subprocess.TimeoutExpired`` when *timeout* seconds elapse.
    """
    return subprocess.run(command, check=True, capture_output=True, text=True, timeout=timeout).stdout


def read_ps_text(ps_file: Path | None = None) -> str:
    """Return raw ``ps`` output, from *ps_file* when given, else from ``ps``.

    The ``-eo`` form with ``etimes`` is tried first. If it fails or times out,
    a ``-axo`` form using ``sess``/``etime``/``command`` is run instead
    (presumably for BSD-style ``ps`` -- confirm against target hosts). Errors
    from the fallback propagate to the caller.
    """
    if ps_file:
        return ps_file.read_text(encoding="utf-8")
    linux_command = [
        "ps",
        "-eo",
        "pid=,ppid=,pgid=,sid=,etimes=,pcpu=,stat=,comm=,args=",
    ]
    try:
        return run_text(linux_command)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
        return run_text(
            [
                "ps",
                "-axo",
                "pid=,ppid=,pgid=,sess=,etime=,pcpu=,stat=,comm=,command=",
            ]
        )


def elapsed_to_seconds(value: str) -> int:
    """Convert an elapsed-time field to whole seconds.

    Accepts a plain number of seconds, or a clock value of the form
    ``[[DD-]HH:]MM:SS``. Raises ``ValueError`` when a component is not numeric.
    """
    try:
        return int(float(value))
    except ValueError:
        pass  # not a plain number; fall through to the clock format
    days = 0
    clock = value
    if "-" in value:
        raw_days, clock = value.split("-", 1)
        days = int(raw_days)
    parts = [int(part) for part in clock.split(":")]
    if len(parts) == 3:
        hours, minutes, seconds = parts
    elif len(parts) == 2:
        hours = 0
        minutes, seconds = parts
    else:
        # Any other shape: only the first component is used, as seconds.
        hours = 0
        minutes = 0
        seconds = parts[0]
    return days * 86400 + hours * 3600 + minutes * 60 + seconds


def parse_ps_rows(text: str) -> list[ProcessRow]:
    """Parse ``ps`` output into rows, silently skipping unparseable lines.

    NOTE(review): columns are split on whitespace, so a ``comm`` value that
    itself contains a space shifts part of it into ``args``.
    """
    rows: list[ProcessRow] = []
    for line in text.splitlines():
        raw = line.strip()
        if not raw:
            continue
        # Eight leading columns, then everything else is the command line.
        parts = raw.split(None, 8)
        if len(parts) < 9:
            continue
        try:
            rows.append(
                ProcessRow(
                    pid=int(parts[0]),
                    ppid=int(parts[1]),
                    pgid=int(parts[2]),
                    sid=int(parts[3]),
                    etimes=elapsed_to_seconds(parts[4]),
                    pcpu=float(parts[5]),
                    stat=parts[6],
                    comm=parts[7],
                    args=parts[8],
                )
            )
        except ValueError:
            # Header lines or malformed numeric fields are not fatal.
            continue
    return rows


def matching_rule(row: ProcessRow, rules: Iterable[RunawayRule] = DEFAULT_RULES) -> str | None:
    """Return the id of the first rule whose two patterns both match *row*."""
    haystack = f"{row.comm} {row.args}"
    for rule in rules:
        if rule.command_pattern.search(haystack) and rule.context_pattern.search(haystack):
            return rule.rule_id
    return None


def orphan_reason(rows: list[ProcessRow], all_pids: set[int]) -> str | None:
    """Explain why a group counts as orphaned, or return ``None`` if it does not.

    ``"ppid_1"`` when any member has parent pid 1; ``"missing_group_leader"``
    when no process in the snapshot has a pid equal to the group's pgid.
    *rows* must be non-empty and share one pgid.
    """
    if any(row.ppid == 1 for row in rows):
        return "ppid_1"
    pgid = rows[0].pgid
    if pgid not in all_pids:
        return "missing_group_leader"
    return None


def classify_groups(
    rows: list[ProcessRow],
    *,
    min_age_seconds: int,
    min_cpu_percent: float,
) -> list[ProcessGroup]:
    """Group rule-matching rows by (rule, pgid) and keep orphaned, hot, old groups.

    A group is kept only when it has an orphan reason, its oldest member is at
    least *min_age_seconds* old, and its summed CPU is at least
    *min_cpu_percent*. The result is sorted by descending CPU, then rule id,
    then pgid.
    """
    all_pids = {row.pid for row in rows}
    grouped: dict[tuple[str, int], list[ProcessRow]] = {}
    for row in rows:
        rule_id = matching_rule(row)
        if rule_id is None:
            continue
        grouped.setdefault((rule_id, row.pgid), []).append(row)

    groups: list[ProcessGroup] = []
    for (rule_id, pgid), members in grouped.items():
        reason = orphan_reason(members, all_pids)
        if reason is None:
            continue
        oldest = max(row.etimes for row in members)
        cpu_percent = sum(row.pcpu for row in members)
        if oldest < min_age_seconds or cpu_percent < min_cpu_percent:
            continue
        # Truncated so the value stays short when used as a metric label.
        sample_comm = sorted({row.comm for row in members})[0][:48]
        groups.append(
            ProcessGroup(
                rule_id=rule_id,
                pgid=pgid,
                rows=tuple(sorted(members, key=lambda row: row.pid)),
                cpu_percent=cpu_percent,
                oldest_age_seconds=oldest,
                orphan_reason=reason,
                sample_comm=sample_comm,
            )
        )
    return sorted(groups, key=lambda group: (-group.cpu_percent, group.rule_id, group.pgid))


def active_gitea_action_containers(docker_file: Path | None = None) -> int:
    """Count container names matching ``GITEA_ACTION_CONTAINER_RE``.

    Names come from *docker_file* (one per line) when given, otherwise from
    ``docker ps``. Returns ``-1`` when the names cannot be obtained.
    """
    try:
        if docker_file:
            names = docker_file.read_text(encoding="utf-8").splitlines()
        else:
            names = run_text(["docker", "ps", "--format", "{{.Names}}"], timeout=10).splitlines()
    except Exception:
        # Deliberate best-effort: -1 is the documented "Docker unavailable" value.
        return -1
    return sum(1 for name in names if GITEA_ACTION_CONTAINER_RE.search(name))


def active_gitea_action_process_load(rows: list[ProcessRow]) -> ActiveCiLoad:
    """Aggregate processes matching ``GITEA_ACTION_PROCESS_RE`` by process group.

    Every process group containing a ``HOST_PRESSURE_GATE_RE`` match is
    excluded entirely, as is any process whose command line contains
    ``act_runner daemon``.
    """
    grouped: dict[int, list[ProcessRow]] = {}
    gate_pgids = {
        row.pgid
        for row in rows
        if HOST_PRESSURE_GATE_RE.search(f"{row.comm} {row.args}")
    }
    for row in rows:
        if row.pgid in gate_pgids:
            continue
        haystack = f"{row.comm} {row.args}"
        if not GITEA_ACTION_PROCESS_RE.search(haystack):
            continue
        if "act_runner daemon" in haystack:
            continue
        grouped.setdefault(row.pgid, []).append(row)
    members = [row for group in grouped.values() for row in group]
    return ActiveCiLoad(
        group_count=len(grouped),
        process_count=len(members),
        cpu_percent=sum(row.pcpu for row in members),
        oldest_age_seconds=max((row.etimes for row in members), default=0),
    )


def load5_per_core() -> float:
    """Return the 5-minute load average divided by the CPU count.

    Reads ``/proc/loadavg`` first and falls back to ``os.getloadavg()``;
    returns ``0.0`` when neither source is available.
    """
    try:
        load5 = float(Path("/proc/loadavg").read_text(encoding="utf-8").split()[1])
    except Exception:
        try:
            load5 = os.getloadavg()[1]
        except OSError:
            return 0.0
    cores = os.cpu_count() or 1
    return load5 / cores


def swap_used_ratio(meminfo_file: Path | None = None) -> float:
    """Return used swap as a ratio clamped to ``[0.0, 1.0]``.

    Parses ``SwapTotal`` and ``SwapFree`` from *meminfo_file* (default
    ``/proc/meminfo``). Returns ``0.0`` when there is no swap or the file
    cannot be read or parsed.
    """
    path = meminfo_file or Path("/proc/meminfo")
    try:
        values: dict[str, float] = {}
        for line in path.read_text(encoding="utf-8").splitlines():
            key, _, raw = line.partition(":")
            if key in {"SwapTotal", "SwapFree"}:
                values[key] = float(raw.strip().split()[0]) * 1024
        total = values.get("SwapTotal", 0.0)
        free = values.get("SwapFree", 0.0)
        if total <= 0:
            return 0.0
        return max(0.0, min(1.0, (total - free) / total))
    except Exception:
        # Best-effort metric: an unreadable meminfo must not fail the run.
        return 0.0


def render_metrics(
    *,
    host: str,
    groups: list[ProcessGroup],
    active_action_containers: int,
    active_action_process_load: ActiveCiLoad,
    min_age_seconds: int,
    min_cpu_percent: float,
    now: int,
    load_ratio: float,
    swap_ratio: float,
) -> str:
    """Render every metric as exposition text ending in a newline.

    Output order: all ``# HELP``/``# TYPE`` pairs, the host-level samples, four
    aggregate samples per rule in ``DEFAULT_RULES`` (emitted even when the rule
    has no groups), then per-group samples for at most the first 20 *groups*.
    """
    labels_host = f'host="{escape_label(host)}"'
    rule_ids = sorted({rule.rule_id for rule in DEFAULT_RULES})
    by_rule = {rule_id: [group for group in groups if group.rule_id == rule_id] for rule_id in rule_ids}

    lines: list[str] = []
    for metric_name, help_text in _GAUGE_HELP:
        lines.append(f"# HELP {metric_name} {help_text}")
        lines.append(f"# TYPE {metric_name} gauge")

    lines.extend(
        [
            f"awoooi_host_runaway_process_monitor_up{{{labels_host},mode=\"read_only\"}} 1",
            f"awoooi_host_runaway_process_last_run_timestamp{{{labels_host}}} {now}",
            f"awoooi_host_gitea_actions_active_container_count{{{labels_host}}} {active_action_containers}",
            f"awoooi_host_gitea_actions_active_process_group_count{{{labels_host}}} {active_action_process_load.group_count}",
            f"awoooi_host_gitea_actions_active_process_count{{{labels_host}}} {active_action_process_load.process_count}",
            f"awoooi_host_gitea_actions_active_process_cpu_percent{{{labels_host}}} {active_action_process_load.cpu_percent:.6f}",
            f"awoooi_host_gitea_actions_active_process_oldest_age_seconds{{{labels_host}}} {active_action_process_load.oldest_age_seconds}",
            f"awoooi_host_load5_per_core{{{labels_host}}} {load_ratio:.6f}",
            f"awoooi_host_swap_used_ratio{{{labels_host}}} {swap_ratio:.6f}",
            f"awoooi_host_runaway_process_remediation_authorized{{{labels_host}}} 0",
        ]
    )

    for rule_id in rule_ids:
        rule_labels = (
            f'{labels_host},rule="{escape_label(rule_id)}",'
            f'min_age_seconds="{min_age_seconds}",min_cpu_percent="{min_cpu_percent:g}"'
        )
        rule_groups = by_rule[rule_id]
        lines.append(f"awoooi_host_runaway_browser_orphan_group_count{{{rule_labels}}} {len(rule_groups)}")
        lines.append(
            f"awoooi_host_runaway_browser_orphan_process_count{{{rule_labels}}} "
            f"{sum(len(group.rows) for group in rule_groups)}"
        )
        lines.append(
            f"awoooi_host_runaway_browser_orphan_cpu_percent{{{rule_labels}}} "
            f"{sum(group.cpu_percent for group in rule_groups):.6f}"
        )
        lines.append(
            f"awoooi_host_runaway_browser_orphan_oldest_age_seconds{{{rule_labels}}} "
            f"{max((group.oldest_age_seconds for group in rule_groups), default=0)}"
        )

    # Cap the per-group series so the number of label sets stays bounded.
    for group in groups[:20]:
        group_labels = (
            f'{labels_host},rule="{escape_label(group.rule_id)}",pgid="{group.pgid}",'
            f'orphan_reason="{escape_label(group.orphan_reason)}",comm="{escape_label(group.sample_comm)}"'
        )
        lines.append(f"awoooi_host_runaway_browser_orphan_group_cpu_percent{{{group_labels}}} {group.cpu_percent:.6f}")
        lines.append(f"awoooi_host_runaway_browser_orphan_group_info{{{group_labels}}} 1")
    return "\n".join(lines) + "\n"


def collect(args: argparse.Namespace) -> str:
    """Take one process snapshot and return the rendered metrics payload."""
    rows = parse_ps_rows(read_ps_text(args.ps_file))
    groups = classify_groups(
        rows,
        min_age_seconds=args.min_age_seconds,
        min_cpu_percent=args.min_cpu_percent,
    )
    return render_metrics(
        host=args.host,
        groups=groups,
        active_action_containers=active_gitea_action_containers(args.docker_ps_file),
        active_action_process_load=active_gitea_action_process_load(rows),
        min_age_seconds=args.min_age_seconds,
        min_cpu_percent=args.min_cpu_percent,
        now=int(time.time()),
        load_ratio=load5_per_core(),
        swap_ratio=swap_used_ratio(args.meminfo_file),
    )


def write_textfile(payload: str, textfile_dir: Path, output_name: str) -> Path:
    """Atomically write *payload* to ``textfile_dir / output_name``.

    The payload goes to a temporary file in the same directory, which is made
    world-readable (0o644) and then renamed over the target. Setting the mode
    before the rename means a reader never sees the target with the temporary
    file's restrictive permissions. On any failure the temporary file is
    removed and the original exception is re-raised.

    Returns the path of the written file.
    """
    textfile_dir.mkdir(parents=True, exist_ok=True)
    output_path = textfile_dir / output_name
    tmp_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=textfile_dir, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave stray temp files behind in the collector directory.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise
    return output_path


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    *argv* defaults to ``None``, in which case argparse reads ``sys.argv``.
    Threshold defaults come from the ``AIOPS_RUNAWAY_PROCESS_MIN_*``
    environment variables.
    """
    parser = argparse.ArgumentParser(description="Export AWOOOI host runaway process metrics.")
    parser.add_argument("--host", default=HOST_LABEL)
    parser.add_argument("--textfile-dir", type=Path, default=TEXTFILE_DIR)
    parser.add_argument("--output-name", default=OUTPUT_NAME)
    parser.add_argument("--stdout", action="store_true", help="Print metrics instead of writing the textfile.")
    parser.add_argument("--ps-file", type=Path, help="Use a fixture file instead of running ps.")
    parser.add_argument("--docker-ps-file", type=Path, help="Use a fixture file instead of docker ps.")
    parser.add_argument("--meminfo-file", type=Path, help="Use a fixture file instead of /proc/meminfo.")
    parser.add_argument(
        "--min-age-seconds",
        type=int,
        default=int(os.environ.get("AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS", "1800")),
    )
    parser.add_argument(
        "--min-cpu-percent",
        type=float,
        default=float(os.environ.get("AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT", "50")),
    )
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> None:
    """Collect metrics, then print them or write the textfile."""
    args = parse_args(argv)
    payload = collect(args)
    if args.stdout:
        print(payload, end="")
        return
    output_path = write_textfile(payload, args.textfile_dir, args.output_name)
    print(f"HOST_RUNAWAY_PROCESS_EXPORTER_OK output={output_path}")


if __name__ == "__main__":
    main()