From fcd4337b3acfe0e83ec0d8f5313c65206bf9676d Mon Sep 17 00:00:00 2001 From: ogt Date: Sat, 27 Jun 2026 12:57:53 +0800 Subject: [PATCH] fix(ops): expose host runner build load in exporter [skip ci] --- docs/LOGBOOK.md | 27 +++++++++++ .../HOST-RESOURCE-BASELINE-110-188.md | 9 ++-- scripts/ops/host-runaway-process-exporter.py | 46 +++++++++++++++++++ .../test_host_runaway_process_exporter.py | 32 +++++++++++++ 4 files changed, 110 insertions(+), 4 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index ad8bec15..35470543 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -35,6 +35,33 @@ - owner role / team、decision、decision reason、affected scope、redacted evidence refs、followup owner 尚未形成 accepted owner response。 - Wazuh live query、active response、Kali active scan、SSH、主機更新、Nginx / firewall / Docker / K8s 寫入、secret value collection、auto deploy / auto repair runtime action 全部仍 `0 / false`。 +## 2026-06-27|12:55 110 CPU 高負載二次根因:Gitea runner / BuildKit process 盲區補強 + +**時間與來源**: +- 2026-06-27 12:52-12:55 Asia/Taipei。 +- 來源:110 `uptime`、`ps`、`pstree`、Docker stats、`/home/wooo/node_exporter_textfiles/host_runaway_process.prom`。 + +**真實根因**: +- 110 load 約 `9.80 / 10.07 / 23.83` 時,最高 CPU 不是 Sentry、不是 MOMO、不是一般網站服務,而是 Gitea host `act_runner` 正在執行 AWOOOI `docker build`:`/home/wooo/.cache/act/.../workflow/8.sh` → `docker build` → `docker-buildx` → BuildKit `runc` → `pnpm turbo build --filter=@awoooi/web --concurrency=1` → `next build`,其中 `next build` 約 `143-180%` CPU。 +- 同一時間又短暫出現 `stockplatform-product-ux` headless Chrome,但該 group 未達 30 分鐘 stale 門檻,且後續已自行結束;不應誤殺活躍 smoke。 +- 既有 exporter 已能抓 `GITEA-ACTIONS-*` 與 repo-scoped action container,但這次 BuildKit executor 不出現在 `docker ps` action container 名稱內,因此 `active_container_count=0` 仍可能同時存在 active runner build process。 + +**修復內容**: +- `scripts/ops/host-runaway-process-exporter.py` 新增 host-side Gitea Actions / BuildKit process 指標: + - `awoooi_host_gitea_actions_active_process_group_count` + - `awoooi_host_gitea_actions_active_process_count` + - `awoooi_host_gitea_actions_active_process_cpu_percent` + - `awoooi_host_gitea_actions_active_process_oldest_age_seconds` +- 新增測試覆蓋 `/home/wooo/.cache/act`、`docker build`、`docker-buildx`、`turbo build`、`next build`,並排除 idle `act_runner daemon`。 + +**部署與讀回**: +- 已部署到 110 `/home/wooo/scripts/host-runaway-process-exporter.py`,sha256 `f4f3c69b35c0d0ddacbe9372501095abf79ea9d1d1be205ce9edf5d15d8668e1`。 +- 手動刷新 textfile 後讀回:`active_container_count=2`、`active_process_group_count=5`、`active_process_count=7`、`active_process_cpu_percent=430.000000`、`oldest_age_seconds=26`、`load5_per_core=0.944167`、`swap_used_ratio=0.327678`、orphan browser groups `0`。 + +**邊界**: +- 本段沒有取消 Gitea Actions、沒有 kill build、沒有 restart Docker / systemd / Nginx / firewall / K3s / DB。 +- 110 CPU 高時的第一判讀順序改為:active runner process / BuildKit → active action container → stale orphan browser → Sentry / DB / stock workload;不得用 generic restart 處理。 + ## 2026-06-27|D1K 正式站驗證:主導航收斂、Governance 同頁化與 mobile overflow 歸零 **背景**:使用者再次指出 AWOOOI / IwoooS 目前不是「缺更多文件」,而是產品結果沒有呈現專業 AI 自動化控制台應有的資訊架構、快速判讀與流程閉環。本段只補記已完成的 IA D0 正式站真相,避免把本地修補誤當正式完成,也避免把 IA D0 誤宣告為全站 UI/UX 完工。 diff --git a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md index b8171af0..06cd6fd1 100644 --- a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md +++ b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md @@ -52,7 +52,7 @@ Use these thresholds for alerting and AI triage: | Systemd runner restarts | > 2 in 15m | Critical; inspect watchdog/drop-ins and active CI jobs. | | Systemd runner WatchdogSec | > 0 for 10m | Warning; GitHub Actions runner should not be killed by systemd watchdog. | | Systemd runner quota | CPU or memory unlimited for 30m | Warning; apply CPUQuota/MemoryMax or move CI away from Sentry host. | -| Gitea Actions job runtime | > 20m for 5m | Warning; inspect logs and run `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh` dry-run before stopping stale job containers. The stale-job detector must include both legacy `GITEA-ACTIONS-*` containers and repo-scoped names such as `awoooi-cd--` / `awoooi-code-review--`. | +| Gitea Actions job runtime | > 20m for 5m | Warning; inspect logs and run `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh` dry-run before stopping stale job containers. The stale-job detector must include both legacy `GITEA-ACTIONS-*` containers and repo-scoped names such as `awoooi-cd--` / `awoooi-code-review--`。textfile exporter 也必須輸出 host-side `act_runner` / BuildKit / `next build` process load,因為活躍 CD build 可能不會出現在具名 action container 裡。 | ## Rules @@ -67,8 +67,9 @@ Use these thresholds for alerting and AI triage: 9. Disable node-exporter collectors that are slow or failing on each host; exporter scrape time is part of the resource baseline. 10. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services. 11. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes. -12. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change. -13. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than its workflow/job policy threshold and has no recent useful logs. +12. 當 `docker ps` 沒有 action container 時,優先用 `awoooi_host_gitea_actions_active_process_*` 判讀 110 高 CPU;BuildKit / `runc` / `next build` 可能仍在跑,即使 `awoooi_host_gitea_actions_active_container_count=0`。 +13. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change. +14. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than its workflow/job policy threshold and has no recent useful logs. ## Next Safe Rollout Order @@ -79,7 +80,7 @@ Use these thresholds for alerting and AI triage: 5. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low. 6. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis. 7. Add modest caps to currently unlimited low-risk services in small batches. Do not alert every unlimited auxiliary container at once; promote candidates only after 24h usage data. -8. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode. As of 2026-06-27, it must recognize both legacy `GITEA-ACTIONS-*` and current repo-scoped `*-cd--*` / `*-code-review--*` container names. +8. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode. As of 2026-06-27, it must recognize both legacy `GITEA-ACTIONS-*` and current repo-scoped `*-cd--*` / `*-code-review--*` container names;同時 `host-runaway-process-exporter.py` 必須在沒有 action container 可見時仍輸出 BuildKit / `next build` process load。 9. Fix 110 runner services with sudo-capable host maintenance: ```bash diff --git a/scripts/ops/host-runaway-process-exporter.py b/scripts/ops/host-runaway-process-exporter.py index ac758868..d1cc6068 100755 --- a/scripts/ops/host-runaway-process-exporter.py +++ b/scripts/ops/host-runaway-process-exporter.py @@ -60,6 +60,14 @@ class ProcessGroup: sample_comm: str +@dataclass(frozen=True) +class ActiveCiLoad: + group_count: int + process_count: int + cpu_percent: float + oldest_age_seconds: int + + DEFAULT_RULES = ( RunawayRule( "stockplatform_headless_smoke", @@ -73,6 +81,11 @@ DEFAULT_RULES = ( ), ) +GITEA_ACTION_PROCESS_RE = re.compile( + r"(/\.cache/act/|/home/wooo/\.cache/act/|\bdocker build\b|\bdocker-buildx\b|" + r"\bbuildx build\b|\bpnpm turbo build\b|\bturbo build\b|\bnext build\b)" +) + def escape_label(value: str) -> str: return LABEL_RE.sub(lambda m: {"\n": r"\n", "\\": r"\\", '"': r"\""}[m.group(0)], value) @@ -220,6 +233,25 @@ def active_gitea_action_containers(docker_file: Path | None = None) -> int: return sum(1 for name in names if GITEA_ACTION_CONTAINER_RE.search(name)) +def active_gitea_action_process_load(rows: list[ProcessRow]) -> ActiveCiLoad: + grouped: dict[int, list[ProcessRow]] = {} + for row in rows: + haystack = f"{row.comm} {row.args}" + if not GITEA_ACTION_PROCESS_RE.search(haystack): + continue + if "act_runner daemon" in haystack: + continue + grouped.setdefault(row.pgid, []).append(row) + + members = [row for group in grouped.values() for row in group] + return ActiveCiLoad( + group_count=len(grouped), + process_count=len(members), + cpu_percent=sum(row.pcpu for row in members), + oldest_age_seconds=max((row.etimes for row in members), default=0), + ) + + def load5_per_core() -> float: try: load5 = float(Path("/proc/loadavg").read_text(encoding="utf-8").split()[1]) @@ -254,6 +286,7 @@ def render_metrics( host: str, groups: list[ProcessGroup], active_action_containers: int, + active_action_process_load: ActiveCiLoad, min_age_seconds: int, min_cpu_percent: float, now: int, @@ -282,6 +315,14 @@ def render_metrics( "# TYPE awoooi_host_runaway_browser_orphan_group_info gauge", "# HELP awoooi_host_gitea_actions_active_container_count Active Gitea Actions task containers visible on the host, -1 when Docker is unavailable.", "# TYPE awoooi_host_gitea_actions_active_container_count gauge", + "# HELP awoooi_host_gitea_actions_active_process_group_count Active Gitea Actions or BuildKit process groups visible on the host.", + "# TYPE awoooi_host_gitea_actions_active_process_group_count gauge", + "# HELP awoooi_host_gitea_actions_active_process_count Active Gitea Actions or BuildKit processes visible on the host.", + "# TYPE awoooi_host_gitea_actions_active_process_count gauge", + "# HELP awoooi_host_gitea_actions_active_process_cpu_percent CPU percent used by active Gitea Actions or BuildKit processes.", + "# TYPE awoooi_host_gitea_actions_active_process_cpu_percent gauge", + "# HELP awoooi_host_gitea_actions_active_process_oldest_age_seconds Oldest active Gitea Actions or BuildKit process age.", + "# TYPE awoooi_host_gitea_actions_active_process_oldest_age_seconds gauge", "# HELP awoooi_host_load5_per_core Host load5 divided by CPU core count.", "# TYPE awoooi_host_load5_per_core gauge", "# HELP awoooi_host_swap_used_ratio Host swap used ratio from /proc/meminfo.", @@ -291,6 +332,10 @@ def render_metrics( f"awoooi_host_runaway_process_monitor_up{{{labels_host},mode=\"read_only\"}} 1", f"awoooi_host_runaway_process_last_run_timestamp{{{labels_host}}} {now}", f"awoooi_host_gitea_actions_active_container_count{{{labels_host}}} {active_action_containers}", + f"awoooi_host_gitea_actions_active_process_group_count{{{labels_host}}} {active_action_process_load.group_count}", + f"awoooi_host_gitea_actions_active_process_count{{{labels_host}}} {active_action_process_load.process_count}", + f"awoooi_host_gitea_actions_active_process_cpu_percent{{{labels_host}}} {active_action_process_load.cpu_percent:.6f}", + f"awoooi_host_gitea_actions_active_process_oldest_age_seconds{{{labels_host}}} {active_action_process_load.oldest_age_seconds}", f"awoooi_host_load5_per_core{{{labels_host}}} {load_ratio:.6f}", f"awoooi_host_swap_used_ratio{{{labels_host}}} {swap_ratio:.6f}", f"awoooi_host_runaway_process_remediation_authorized{{{labels_host}}} 0", @@ -338,6 +383,7 @@ def collect(args: argparse.Namespace) -> str: host=args.host, groups=groups, active_action_containers=active_gitea_action_containers(args.docker_ps_file), + active_action_process_load=active_gitea_action_process_load(rows), min_age_seconds=args.min_age_seconds, min_cpu_percent=args.min_cpu_percent, now=int(time.time()), diff --git a/scripts/ops/tests/test_host_runaway_process_exporter.py b/scripts/ops/tests/test_host_runaway_process_exporter.py index 343ab0e1..2843200f 100644 --- a/scripts/ops/tests/test_host_runaway_process_exporter.py +++ b/scripts/ops/tests/test_host_runaway_process_exporter.py @@ -79,6 +79,12 @@ def test_renders_ci_load_and_swap_without_authorizing_repair(tmp_path: Path) -> host="110", groups=groups, active_action_containers=3, + active_action_process_load=exporter.ActiveCiLoad( + group_count=2, + process_count=4, + cpu_percent=188.5, + oldest_age_seconds=240, + ), min_age_seconds=1800, min_cpu_percent=50, now=123, @@ -88,6 +94,10 @@ def test_renders_ci_load_and_swap_without_authorizing_repair(tmp_path: Path) -> assert 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1' in metrics assert 'awoooi_host_gitea_actions_active_container_count{host="110"} 3' in metrics + assert 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 2' in metrics + assert 'awoooi_host_gitea_actions_active_process_count{host="110"} 4' in metrics + assert 'awoooi_host_gitea_actions_active_process_cpu_percent{host="110"} 188.500000' in metrics + assert 'awoooi_host_gitea_actions_active_process_oldest_age_seconds{host="110"} 240' in metrics assert 'awoooi_host_swap_used_ratio{host="110"} 1.000000' in metrics assert 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0' in metrics assert 'rule="stockplatform_headless_smoke"' in metrics @@ -113,6 +123,28 @@ def test_counts_modern_gitea_action_container_names(tmp_path: Path) -> None: assert exporter.active_gitea_action_containers(docker_file) == 4 +def test_counts_buildkit_runner_process_load() -> None: + exporter = load_exporter() + rows = exporter.parse_ps_rows( + """ + 100 10 100 100 240 0.0 S bash bash --noprofile --norc -e -o pipefail /home/wooo/.cache/act/14cc/act/workflow/8.sh + 101 100 100 100 239 1.0 S docker docker build -f apps/web/Dockerfile . + 102 101 100 100 239 2.0 S docker-buildx /home/wooo/.docker/cli-plugins/docker-buildx buildx build -f apps/web/Dockerfile . + 200 150 200 200 210 12.5 S turbo turbo build --filter=@awoooi/web --concurrency=1 + 201 200 200 200 200 145.0 S node node /app/apps/web/node_modules/.bin/../next/dist/bin/next build + 300 1 300 300 9999 0.1 S act_runner act_runner daemon --config /config.yaml + 400 1 400 400 120 30.0 S node node apps/web/server.js + """ + ) + + load = exporter.active_gitea_action_process_load(rows) + + assert load.group_count == 2 + assert load.process_count == 5 + assert load.cpu_percent == 160.5 + assert load.oldest_age_seconds == 240 + + def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None: ps_file = tmp_path / "ps.txt" ps_file.write_text(