fix(ops): expose host runner build load in exporter [skip ci]
This commit is contained in:
@@ -60,6 +60,14 @@ class ProcessGroup:
|
||||
sample_comm: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ActiveCiLoad:
    """Immutable summary of CI/build processes seen in one process snapshot.

    The field notes below describe how ``active_gitea_action_process_load``
    populates an instance; other callers may construct one with any values.
    """

    # Number of distinct process-group ids among the matching processes.
    group_count: int
    # Total number of matching processes across all of those groups.
    process_count: int
    # Sum of the per-process CPU percentages; may exceed 100.
    cpu_percent: float
    # Largest elapsed time among matching processes, 0 when there are none.
    oldest_age_seconds: int
|
||||
|
||||
|
||||
DEFAULT_RULES = (
|
||||
RunawayRule(
|
||||
"stockplatform_headless_smoke",
|
||||
@@ -73,6 +81,11 @@ DEFAULT_RULES = (
|
||||
),
|
||||
)
|
||||
|
||||
# Matches a "comm args" string that looks like CI/build work running on the
# host: a script under an ``act`` cache directory, a docker/buildx build, or a
# turbo/next build.
#
# Only whether ``.search()`` finds a match is meaningful. Two alternatives of
# the earlier pattern were dropped because they could never change that result:
#   * ``/home/wooo/\.cache/act/`` - any such path already contains
#     ``/.cache/act/`` (and it hard-coded one user's home directory);
#   * ``\bpnpm turbo build\b`` - any such command already contains
#     ``turbo build`` on a word boundary.
GITEA_ACTION_PROCESS_RE = re.compile(
    r"(/\.cache/act/|\bdocker build\b|\bdocker-buildx\b|"
    r"\bbuildx build\b|\bturbo build\b|\bnext build\b)"
)
|
||||
|
||||
|
||||
def escape_label(value: str) -> str:
    r"""Return *value* with every ``LABEL_RE`` match backslash-escaped.

    A newline becomes the two characters ``\n``, a backslash becomes ``\\``
    and a double quote becomes ``\"``.

    NOTE(review): ``LABEL_RE`` is defined elsewhere in this module and is
    presumably limited to those three characters; any other match would raise
    ``KeyError`` from the lookup below.
    """
    escaped_forms = {"\n": r"\n", "\\": r"\\", '"': r"\""}

    def _replacement(match):
        # Look up the escaped spelling of exactly the text that matched.
        return escaped_forms[match.group(0)]

    return LABEL_RE.sub(_replacement, value)
|
||||
@@ -220,6 +233,25 @@ def active_gitea_action_containers(docker_file: Path | None = None) -> int:
|
||||
return sum(1 for name in names if GITEA_ACTION_CONTAINER_RE.search(name))
|
||||
|
||||
|
||||
def active_gitea_action_process_load(rows: list[ProcessRow]) -> ActiveCiLoad:
    """Summarise the CI/build processes present in *rows*.

    A row counts when the text ``"<comm> <args>"`` matches
    ``GITEA_ACTION_PROCESS_RE`` and does not contain ``"act_runner daemon"``.
    Counted rows are bucketed by their ``pgid``.

    Returns:
        An ``ActiveCiLoad`` holding the number of distinct process groups, the
        number of counted processes, the sum of their ``pcpu`` values and the
        largest ``etimes`` among them (0 when nothing matched).
    """
    by_pgid: dict[int, list[ProcessRow]] = {}
    for proc in rows:
        command_line = f"{proc.comm} {proc.args}"
        looks_like_ci = GITEA_ACTION_PROCESS_RE.search(command_line) is not None
        # The runner daemon itself is skipped even if its command line
        # happens to mention one of the matched paths or commands.
        if looks_like_ci and "act_runner daemon" not in command_line:
            by_pgid.setdefault(proc.pgid, []).append(proc)

    # Flatten group by group (first-seen group first) so the CPU total is
    # accumulated in a stable order.
    matched: list[ProcessRow] = []
    for bucket in by_pgid.values():
        matched.extend(bucket)

    total_cpu = sum(proc.pcpu for proc in matched)
    oldest = max((proc.etimes for proc in matched), default=0)
    return ActiveCiLoad(
        group_count=len(by_pgid),
        process_count=len(matched),
        cpu_percent=total_cpu,
        oldest_age_seconds=oldest,
    )
|
||||
|
||||
|
||||
def load5_per_core() -> float:
|
||||
try:
|
||||
load5 = float(Path("/proc/loadavg").read_text(encoding="utf-8").split()[1])
|
||||
@@ -254,6 +286,7 @@ def render_metrics(
|
||||
host: str,
|
||||
groups: list[ProcessGroup],
|
||||
active_action_containers: int,
|
||||
active_action_process_load: ActiveCiLoad,
|
||||
min_age_seconds: int,
|
||||
min_cpu_percent: float,
|
||||
now: int,
|
||||
@@ -282,6 +315,14 @@ def render_metrics(
|
||||
"# TYPE awoooi_host_runaway_browser_orphan_group_info gauge",
|
||||
"# HELP awoooi_host_gitea_actions_active_container_count Active Gitea Actions task containers visible on the host, -1 when Docker is unavailable.",
|
||||
"# TYPE awoooi_host_gitea_actions_active_container_count gauge",
|
||||
"# HELP awoooi_host_gitea_actions_active_process_group_count Active Gitea Actions or BuildKit process groups visible on the host.",
|
||||
"# TYPE awoooi_host_gitea_actions_active_process_group_count gauge",
|
||||
"# HELP awoooi_host_gitea_actions_active_process_count Active Gitea Actions or BuildKit processes visible on the host.",
|
||||
"# TYPE awoooi_host_gitea_actions_active_process_count gauge",
|
||||
"# HELP awoooi_host_gitea_actions_active_process_cpu_percent CPU percent used by active Gitea Actions or BuildKit processes.",
|
||||
"# TYPE awoooi_host_gitea_actions_active_process_cpu_percent gauge",
|
||||
"# HELP awoooi_host_gitea_actions_active_process_oldest_age_seconds Oldest active Gitea Actions or BuildKit process age.",
|
||||
"# TYPE awoooi_host_gitea_actions_active_process_oldest_age_seconds gauge",
|
||||
"# HELP awoooi_host_load5_per_core Host load5 divided by CPU core count.",
|
||||
"# TYPE awoooi_host_load5_per_core gauge",
|
||||
"# HELP awoooi_host_swap_used_ratio Host swap used ratio from /proc/meminfo.",
|
||||
@@ -291,6 +332,10 @@ def render_metrics(
|
||||
f"awoooi_host_runaway_process_monitor_up{{{labels_host},mode=\"read_only\"}} 1",
|
||||
f"awoooi_host_runaway_process_last_run_timestamp{{{labels_host}}} {now}",
|
||||
f"awoooi_host_gitea_actions_active_container_count{{{labels_host}}} {active_action_containers}",
|
||||
f"awoooi_host_gitea_actions_active_process_group_count{{{labels_host}}} {active_action_process_load.group_count}",
|
||||
f"awoooi_host_gitea_actions_active_process_count{{{labels_host}}} {active_action_process_load.process_count}",
|
||||
f"awoooi_host_gitea_actions_active_process_cpu_percent{{{labels_host}}} {active_action_process_load.cpu_percent:.6f}",
|
||||
f"awoooi_host_gitea_actions_active_process_oldest_age_seconds{{{labels_host}}} {active_action_process_load.oldest_age_seconds}",
|
||||
f"awoooi_host_load5_per_core{{{labels_host}}} {load_ratio:.6f}",
|
||||
f"awoooi_host_swap_used_ratio{{{labels_host}}} {swap_ratio:.6f}",
|
||||
f"awoooi_host_runaway_process_remediation_authorized{{{labels_host}}} 0",
|
||||
@@ -338,6 +383,7 @@ def collect(args: argparse.Namespace) -> str:
|
||||
host=args.host,
|
||||
groups=groups,
|
||||
active_action_containers=active_gitea_action_containers(args.docker_ps_file),
|
||||
active_action_process_load=active_gitea_action_process_load(rows),
|
||||
min_age_seconds=args.min_age_seconds,
|
||||
min_cpu_percent=args.min_cpu_percent,
|
||||
now=int(time.time()),
|
||||
|
||||
@@ -79,6 +79,12 @@ def test_renders_ci_load_and_swap_without_authorizing_repair(tmp_path: Path) ->
|
||||
host="110",
|
||||
groups=groups,
|
||||
active_action_containers=3,
|
||||
active_action_process_load=exporter.ActiveCiLoad(
|
||||
group_count=2,
|
||||
process_count=4,
|
||||
cpu_percent=188.5,
|
||||
oldest_age_seconds=240,
|
||||
),
|
||||
min_age_seconds=1800,
|
||||
min_cpu_percent=50,
|
||||
now=123,
|
||||
@@ -88,6 +94,10 @@ def test_renders_ci_load_and_swap_without_authorizing_repair(tmp_path: Path) ->
|
||||
|
||||
assert 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1' in metrics
|
||||
assert 'awoooi_host_gitea_actions_active_container_count{host="110"} 3' in metrics
|
||||
assert 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 2' in metrics
|
||||
assert 'awoooi_host_gitea_actions_active_process_count{host="110"} 4' in metrics
|
||||
assert 'awoooi_host_gitea_actions_active_process_cpu_percent{host="110"} 188.500000' in metrics
|
||||
assert 'awoooi_host_gitea_actions_active_process_oldest_age_seconds{host="110"} 240' in metrics
|
||||
assert 'awoooi_host_swap_used_ratio{host="110"} 1.000000' in metrics
|
||||
assert 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0' in metrics
|
||||
assert 'rule="stockplatform_headless_smoke"' in metrics
|
||||
@@ -113,6 +123,28 @@ def test_counts_modern_gitea_action_container_names(tmp_path: Path) -> None:
|
||||
assert exporter.active_gitea_action_containers(docker_file) == 4
|
||||
|
||||
|
||||
def test_counts_buildkit_runner_process_load() -> None:
    """Check the CI load summary computed from a canned process listing.

    The listing holds five build-related processes in two process groups
    (100 and 200), plus an ``act_runner daemon`` row and an unrelated node
    server row that must both be left out of the totals.
    """
    module = load_exporter()
    ps_snapshot = """
100 10 100 100 240 0.0 S bash bash --noprofile --norc -e -o pipefail /home/wooo/.cache/act/14cc/act/workflow/8.sh
101 100 100 100 239 1.0 S docker docker build -f apps/web/Dockerfile .
102 101 100 100 239 2.0 S docker-buildx /home/wooo/.docker/cli-plugins/docker-buildx buildx build -f apps/web/Dockerfile .
200 150 200 200 210 12.5 S turbo turbo build --filter=@awoooi/web --concurrency=1
201 200 200 200 200 145.0 S node node /app/apps/web/node_modules/.bin/../next/dist/bin/next build
300 1 300 300 9999 0.1 S act_runner act_runner daemon --config /config.yaml
400 1 400 400 120 30.0 S node node apps/web/server.js
"""
    parsed_rows = module.parse_ps_rows(ps_snapshot)

    summary = module.active_gitea_action_process_load(parsed_rows)

    # Two groups, five processes: rows 100-102 and rows 200-201.
    assert summary.group_count == 2
    assert summary.process_count == 5
    # 0.0 + 1.0 + 2.0 + 12.5 + 145.0
    assert summary.cpu_percent == 160.5
    # The oldest counted row is the workflow shell; the daemon's 9999 is excluded.
    assert summary.oldest_age_seconds == 240
|
||||
|
||||
|
||||
def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None:
|
||||
ps_file = tmp_path / "ps.txt"
|
||||
ps_file.write_text(
|
||||
|
||||
Reference in New Issue
Block a user