From b3ab4da03b4c1ce016b3f00890159086b259d748 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 21 May 2026 15:51:36 +0800 Subject: [PATCH] ci(cd): wait for host web build pressure --- .gitea/workflows/cd.yaml | 8 ++++ ops/runner/README.md | 26 ++++++++++++ scripts/ci/wait-host-web-build-pressure.sh | 47 ++++++++++++++++++++++ 3 files changed, 81 insertions(+) create mode 100755 scripts/ci/wait-host-web-build-pressure.sh diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index a63806d4..4226933a 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -329,6 +329,14 @@ jobs: -u "${{ secrets.HARBOR_USERNAME }}" \ --password-stdin + # 2026-05-21 Codex: AWOOOI workflow concurrency and the Docker network + # lock only protect AWOOOI/Docker work. Other repos can still run + # host-side Next/Turbo builds on the same 110 runner and starve this + # deploy. Wait for those foreign web builds before starting our image + # build; the gate is read-only and never kills another process. + - name: Wait for Host Web Build Pressure + run: bash scripts/ci/wait-host-web-build-pressure.sh + # 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos. # When another repo starts a heavy docker build while AWOOOI Web is still # building, the job container can disappear and Docker reports RWLayer=nil. 
diff --git a/ops/runner/README.md b/ops/runner/README.md index 597f5012..380476c7 100644 --- a/ops/runner/README.md +++ b/ops/runner/README.md @@ -191,6 +191,32 @@ loginctl show-user wooo -p Linger 若沒有登入 session,user service 不一定會自動啟動。需要 root 執行 `loginctl enable-linger wooo`,或改安裝 system-level service。 +### 第四層修復: host Web build pressure gate + +2026-05-21 追加一層 CD preflight:`.gitea/workflows/cd.yaml` 在 Harbor login +之後、Docker build lock 之前呼叫 `scripts/ci/wait-host-web-build-pressure.sh`。 + +背景是 AWOOOI workflow concurrency 與 Docker network lock 只能保護 AWOOOI 自己 +與 Docker build/push;其他 repo 仍可能在同一台 110 host runner 直接執行 +`next build` / `turbo build` / `vite build`。這類 host-side build 不會拿 +AWOOOI 的 Docker lock,會和 AWOOOI Web image 內的 Next production build 疊加, +造成 110 load、Gitea API timeout、Actions `context canceled` 或 post-deploy +觀測失真。 + +此 gate 的行為: + +- 只讀取 `ps`,不 kill / renice / reset 任何外部 process。 +- 排除 AWOOOI 自身 checkout、local worktree 與 Web Docker build 內的 + `/app/apps/web` process,避免誤判自己的部署。 +- 預設最多等待 60 次、每次 10 秒;若仍有外部 build,先以 warning 放行, + 避免 CD 永久卡住。 +- 可用 `HOST_WEB_BUILD_PRESSURE_WARN_ONLY=0` 改成 hard fail,但必須先確認 + runner 隔離與其他 repo build 排程已收斂,避免把 shared runner 壓力轉成 + 部署中斷。 + +長期方向仍是 runner 隔離或 build offload;此 gate 是在 shared runner 尚未 +拆分前,降低重型前端 build 互相踩踏的保守保護層。 + --- 版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code 變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups diff --git a/scripts/ci/wait-host-web-build-pressure.sh b/scripts/ci/wait-host-web-build-pressure.sh new file mode 100755 index 00000000..820988e3 --- /dev/null +++ b/scripts/ci/wait-host-web-build-pressure.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 2026-05-21 Codex: protect the shared 110 host runner from overlapping +# host-side frontend production builds launched by other repositories. +# This is intentionally a wait gate, not an auto-repair step: it never kills, +# renices, or rewrites another repo's process tree. 
+
+ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-60}"  # max number of polls
+SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-10}"  # pause between polls
+WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}"  # "1": exit 0 after the last poll, else exit 1
+PS_COMMAND="${HOST_WEB_BUILD_PRESSURE_PS_COMMAND:-ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu}"
+
+# Print each process line that looks like a next/turbo/vite build, skipping our
+# own checkout paths, the in-image /app/apps/web build, and this script itself.
+list_foreign_web_builds() {
+  bash -c "$PS_COMMAND" | awk '
+    { line = tolower($0) }  # portable case folding; IGNORECASE only works in gawk
+    line ~ /[n]ext[\/[:alnum:]._-]*[[:space:]]+build|[t]urbo[[:space:]]+build|[v]ite[[:space:]]+build/ {
+      if (line ~ /\/workspace\/wooo\/awoooi/) next
+      if (line ~ /\/users\/ogt\/awoooi/) next
+      if (line ~ /\/private\/tmp\/awoooi/) next
+      if (line ~ /\/app\/apps\/web/) next
+      if (line ~ /scripts\/ci\/wait-host-web-build-pressure\.sh/) next
+      print
+    }
+  '
+}
+
+for attempt in $(seq 1 "$ATTEMPTS"); do
+  # A failed probe is reported on stderr instead of hidden; the gate still fails open.
+  active_builds="$(list_foreign_web_builds)" || echo "⚠️ host process probe failed; cannot confirm the host is idle" >&2
+  if [ -z "$active_builds" ]; then
+    echo "✅ no foreign host web build pressure detected"
+    exit 0
+  fi
+  echo "⏳ foreign host web build pressure detected (attempt ${attempt}/${ATTEMPTS}); waiting ${SLEEP_SECONDS}s"
+  head -n 8 <<<"$active_builds"  # here-string: no pipe writer for head to SIGPIPE under pipefail
+  sleep "$SLEEP_SECONDS"
+done
+echo "⚠️ foreign host web build pressure still active after ${ATTEMPTS} checks"
+if [ "$WARN_ONLY" = "1" ]; then
+  echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
+  exit 0
+fi
+echo "❌ refusing to start AWOOI image build while foreign host web builds are still active"
+exit 1