fix(ops): expose host runner build load in exporter [skip ci]

This commit is contained in:
ogt
2026-06-27 12:57:53 +08:00
parent b623dc6011
commit fcd4337b3a
4 changed files with 110 additions and 4 deletions

View File

@@ -60,6 +60,14 @@ class ProcessGroup:
sample_comm: str
@dataclass(frozen=True)
class ActiveCiLoad:
    """Immutable summary of the CI/build processes found in one process scan.

    The four values are rendered as the
    ``awoooi_host_gitea_actions_active_process_*`` gauges.
    """

    # Number of distinct process groups that contain a matching process.
    group_count: int
    # Number of matching processes across all of those groups.
    process_count: int
    # Sum of the per-process CPU percentages of the matching processes.
    cpu_percent: float
    # Age in seconds of the oldest matching process (0 when none matched).
    oldest_age_seconds: int
DEFAULT_RULES = (
RunawayRule(
"stockplatform_headless_smoke",
@@ -73,6 +81,11 @@ DEFAULT_RULES = (
),
)
# Matches command lines that look like CI/build work.  The alternatives are
# joined into a single capturing group; their order is significant for which
# text a match reports, so keep it stable.
GITEA_ACTION_PROCESS_RE = re.compile(
    "("
    + "|".join(
        (
            # Anything executed from a ".cache/act/" directory.
            r"/\.cache/act/",
            r"/home/wooo/\.cache/act/",
            # Docker / BuildKit image builds.
            r"\bdocker build\b",
            r"\bdocker-buildx\b",
            r"\bbuildx build\b",
            # JavaScript build tooling.
            r"\bpnpm turbo build\b",
            r"\bturbo build\b",
            r"\bnext build\b",
        )
    )
    + ")"
)
def escape_label(value: str) -> str:
    """Return *value* with every ``LABEL_RE`` match replaced by its escape.

    Newline, backslash and double quote are mapped to their backslash-escaped
    spellings.  A match on any other text raises ``KeyError``, so ``LABEL_RE``
    is expected to match only those three characters.
    """
    escapes = {"\n": r"\n", "\\": r"\\", '"': r"\""}

    def _escaped(match):
        # Look up the replacement for exactly the text that matched.
        return escapes[match.group(0)]

    return LABEL_RE.sub(_escaped, value)
@@ -220,6 +233,25 @@ def active_gitea_action_containers(docker_file: Path | None = None) -> int:
return sum(1 for name in names if GITEA_ACTION_CONTAINER_RE.search(name))
def active_gitea_action_process_load(rows: list[ProcessRow]) -> ActiveCiLoad:
    """Summarise the process rows that look like CI build work.

    A row is counted when ``GITEA_ACTION_PROCESS_RE`` finds a match in its
    ``"<comm> <args>"`` text and that text does not contain
    ``"act_runner daemon"``.  Counted rows are bucketed by ``pgid``.

    Returns an ``ActiveCiLoad`` holding the number of distinct process groups,
    the number of counted rows, the sum of their ``pcpu`` values and the
    largest ``etimes`` among them (0 when nothing was counted).
    """
    by_pgid: dict[int, list[ProcessRow]] = {}
    for candidate in rows:
        command_line = f"{candidate.comm} {candidate.args}"
        is_build_work = (
            GITEA_ACTION_PROCESS_RE.search(command_line)
            and "act_runner daemon" not in command_line
        )
        if is_build_work:
            by_pgid.setdefault(candidate.pgid, []).append(candidate)

    # Flatten group by group so the CPU values are summed in bucket order.
    matched: list[ProcessRow] = []
    for bucket in by_pgid.values():
        matched.extend(bucket)

    total_cpu = sum(member.pcpu for member in matched)
    oldest = max((member.etimes for member in matched), default=0)
    return ActiveCiLoad(
        group_count=len(by_pgid),
        process_count=len(matched),
        cpu_percent=total_cpu,
        oldest_age_seconds=oldest,
    )
def load5_per_core() -> float:
try:
load5 = float(Path("/proc/loadavg").read_text(encoding="utf-8").split()[1])
@@ -254,6 +286,7 @@ def render_metrics(
host: str,
groups: list[ProcessGroup],
active_action_containers: int,
active_action_process_load: ActiveCiLoad,
min_age_seconds: int,
min_cpu_percent: float,
now: int,
@@ -282,6 +315,14 @@ def render_metrics(
"# TYPE awoooi_host_runaway_browser_orphan_group_info gauge",
"# HELP awoooi_host_gitea_actions_active_container_count Active Gitea Actions task containers visible on the host, -1 when Docker is unavailable.",
"# TYPE awoooi_host_gitea_actions_active_container_count gauge",
"# HELP awoooi_host_gitea_actions_active_process_group_count Active Gitea Actions or BuildKit process groups visible on the host.",
"# TYPE awoooi_host_gitea_actions_active_process_group_count gauge",
"# HELP awoooi_host_gitea_actions_active_process_count Active Gitea Actions or BuildKit processes visible on the host.",
"# TYPE awoooi_host_gitea_actions_active_process_count gauge",
"# HELP awoooi_host_gitea_actions_active_process_cpu_percent CPU percent used by active Gitea Actions or BuildKit processes.",
"# TYPE awoooi_host_gitea_actions_active_process_cpu_percent gauge",
"# HELP awoooi_host_gitea_actions_active_process_oldest_age_seconds Oldest active Gitea Actions or BuildKit process age.",
"# TYPE awoooi_host_gitea_actions_active_process_oldest_age_seconds gauge",
"# HELP awoooi_host_load5_per_core Host load5 divided by CPU core count.",
"# TYPE awoooi_host_load5_per_core gauge",
"# HELP awoooi_host_swap_used_ratio Host swap used ratio from /proc/meminfo.",
@@ -291,6 +332,10 @@ def render_metrics(
f"awoooi_host_runaway_process_monitor_up{{{labels_host},mode=\"read_only\"}} 1",
f"awoooi_host_runaway_process_last_run_timestamp{{{labels_host}}} {now}",
f"awoooi_host_gitea_actions_active_container_count{{{labels_host}}} {active_action_containers}",
f"awoooi_host_gitea_actions_active_process_group_count{{{labels_host}}} {active_action_process_load.group_count}",
f"awoooi_host_gitea_actions_active_process_count{{{labels_host}}} {active_action_process_load.process_count}",
f"awoooi_host_gitea_actions_active_process_cpu_percent{{{labels_host}}} {active_action_process_load.cpu_percent:.6f}",
f"awoooi_host_gitea_actions_active_process_oldest_age_seconds{{{labels_host}}} {active_action_process_load.oldest_age_seconds}",
f"awoooi_host_load5_per_core{{{labels_host}}} {load_ratio:.6f}",
f"awoooi_host_swap_used_ratio{{{labels_host}}} {swap_ratio:.6f}",
f"awoooi_host_runaway_process_remediation_authorized{{{labels_host}}} 0",
@@ -338,6 +383,7 @@ def collect(args: argparse.Namespace) -> str:
host=args.host,
groups=groups,
active_action_containers=active_gitea_action_containers(args.docker_ps_file),
active_action_process_load=active_gitea_action_process_load(rows),
min_age_seconds=args.min_age_seconds,
min_cpu_percent=args.min_cpu_percent,
now=int(time.time()),

View File

@@ -79,6 +79,12 @@ def test_renders_ci_load_and_swap_without_authorizing_repair(tmp_path: Path) ->
host="110",
groups=groups,
active_action_containers=3,
active_action_process_load=exporter.ActiveCiLoad(
group_count=2,
process_count=4,
cpu_percent=188.5,
oldest_age_seconds=240,
),
min_age_seconds=1800,
min_cpu_percent=50,
now=123,
@@ -88,6 +94,10 @@ def test_renders_ci_load_and_swap_without_authorizing_repair(tmp_path: Path) ->
assert 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1' in metrics
assert 'awoooi_host_gitea_actions_active_container_count{host="110"} 3' in metrics
assert 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 2' in metrics
assert 'awoooi_host_gitea_actions_active_process_count{host="110"} 4' in metrics
assert 'awoooi_host_gitea_actions_active_process_cpu_percent{host="110"} 188.500000' in metrics
assert 'awoooi_host_gitea_actions_active_process_oldest_age_seconds{host="110"} 240' in metrics
assert 'awoooi_host_swap_used_ratio{host="110"} 1.000000' in metrics
assert 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0' in metrics
assert 'rule="stockplatform_headless_smoke"' in metrics
@@ -113,6 +123,28 @@ def test_counts_modern_gitea_action_container_names(tmp_path: Path) -> None:
assert exporter.active_gitea_action_containers(docker_file) == 4
def test_counts_buildkit_runner_process_load() -> None:
    """Build-related rows are summarised; the runner daemon and app server are not.

    The fixture holds seven process rows.  Five of them (the act workflow
    shell, the two docker build processes, ``turbo build`` and ``next build``)
    are expected to be counted, spread over two process groups.
    """
    exporter = load_exporter()
    # Column layout is whatever exporter.parse_ps_rows expects; the fifth and
    # sixth fields appear to be age in seconds and CPU percent (confirm
    # against the parser).
    ps_snapshot = """
100 10 100 100 240 0.0 S bash bash --noprofile --norc -e -o pipefail /home/wooo/.cache/act/14cc/act/workflow/8.sh
101 100 100 100 239 1.0 S docker docker build -f apps/web/Dockerfile .
102 101 100 100 239 2.0 S docker-buildx /home/wooo/.docker/cli-plugins/docker-buildx buildx build -f apps/web/Dockerfile .
200 150 200 200 210 12.5 S turbo turbo build --filter=@awoooi/web --concurrency=1
201 200 200 200 200 145.0 S node node /app/apps/web/node_modules/.bin/../next/dist/bin/next build
300 1 300 300 9999 0.1 S act_runner act_runner daemon --config /config.yaml
400 1 400 400 120 30.0 S node node apps/web/server.js
"""
    parsed_rows = exporter.parse_ps_rows(ps_snapshot)

    load = exporter.active_gitea_action_process_load(parsed_rows)

    observed = (
        load.group_count,
        load.process_count,
        load.cpu_percent,
        load.oldest_age_seconds,
    )
    # 0.0 + 1.0 + 2.0 + 12.5 + 145.0 == 160.5; the oldest counted row is 240s.
    assert observed == (2, 5, 160.5, 240)
def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None:
ps_file = tmp_path / "ps.txt"
ps_file.write_text(