ci(cd): wait for host web build pressure
All checks were successful
Code Review / ai-code-review (push) Successful in 17s

This commit is contained in:
Your Name
2026-05-21 15:51:36 +08:00
parent 8164121870
commit b3ab4da03b
3 changed files with 81 additions and 0 deletions

View File

@@ -329,6 +329,14 @@ jobs:
-u "${{ secrets.HARBOR_USERNAME }}" \
--password-stdin
# 2026-05-21 Codex: AWOOOI workflow concurrency and the Docker network
# lock only protect AWOOOI/Docker work. Other repos can still run
# host-side Next/Turbo builds on the same 110 runner and starve this
# deploy. Wait for those foreign web builds before starting our image
# build; the gate is read-only and never kills another process.
- name: Wait for Host Web Build Pressure
run: bash scripts/ci/wait-host-web-build-pressure.sh
# 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
# When another repo starts a heavy docker build while AWOOOI Web is still
# building, the job container can disappear and Docker reports RWLayer=nil.

View File

@@ -191,6 +191,32 @@ loginctl show-user wooo -p Linger
若沒有登入 session,user service 不一定會自動啟動。需要 root 執行
`loginctl enable-linger wooo`,或改安裝 system-level service。
### 第四層修復: host Web build pressure gate
2026-05-21 追加一層 CD preflight:`.gitea/workflows/cd.yaml` 在 Harbor login
之後、Docker build lock 之前呼叫 `scripts/ci/wait-host-web-build-pressure.sh`。
背景是 AWOOI workflow concurrency 與 Docker network lock 只能保護 AWOOI 自己
與 Docker build/push;其他 repo 仍可能在同一台 110 host runner 直接執行
`next build` / `turbo build` / `vite build`。這類 host-side build 不會拿
AWOOI 的 Docker lock,會和 AWOOI Web image 內的 Next production build 疊加,
造成 110 load、Gitea API timeout、Actions `context canceled` 或 post-deploy
觀測失真。
此 gate 的行為:
- 只讀取 `ps`,不 kill / renice / reset 任何外部 process。
- 排除 AWOOI 自身 checkout、local worktree 與 Web Docker build 內的
`/app/apps/web` process,避免誤判自己的部署。
- 預設最多等待 60 次、每次 10 秒;若仍有外部 build,先以 warning 放行,
避免 CD 永久卡住。
- 可用 `HOST_WEB_BUILD_PRESSURE_WARN_ONLY=0` 改成 hard fail,但必須先確認
runner 隔離與其他 repo build 排程已收斂,避免把 shared runner 壓力轉成
部署中斷。
長期方向仍是 runner 隔離或 build offload;此 gate 是在 shared runner 尚未
拆分前,降低重型前端 build 互相踩踏的保守保護層。
---
版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code
變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env bash
set -euo pipefail

# Shared-runner guard (2026-05-21, Codex).
#
# Other repositories can launch host-side frontend production builds on the
# shared 110 host runner while this deploy is running. This script only waits
# for them; it is deliberately not an auto-repair step and never kills,
# renices, or rewrites another repository's process tree.
#
# Tunables (all optional, read from the environment):
#   HOST_WEB_BUILD_PRESSURE_ATTEMPTS       maximum number of checks (default 60)
#   HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS  value handed to `sleep` while
#                                          waiting (default 10)
#   HOST_WEB_BUILD_PRESSURE_WARN_ONLY      "1" = warn and continue once the
#                                          checks are used up; any other value
#                                          = exit 1 instead (default 1)
#   HOST_WEB_BUILD_PRESSURE_PS_COMMAND     command line, run via `bash -c`,
#                                          that prints one process per line
default_ps_command='ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu'

PS_COMMAND="${HOST_WEB_BUILD_PRESSURE_PS_COMMAND:-$default_ps_command}"
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}"
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-10}"
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-60}"
#######################################
# Print the process lines that look like a foreign host-side frontend
# production build (Next / Turbo / Vite).
#
# A line is reported when it matches one of the build patterns and does not
# contain any of the excluded paths: this project's checkouts, the in-image
# /app/apps/web build path, or this script itself.
#
# Globals:   PS_COMMAND (read) - command line executed with `bash -c`; its
#            output is expected to be one process per line.
# Outputs:   matching lines, unmodified, on stdout; nothing when none match.
# Returns:   status of the `bash -c ... | awk` pipeline.
#
# Matching is case-insensitive. Each line is lower-cased with tolower() and
# compared against lower-case patterns instead of setting IGNORECASE, because
# IGNORECASE is a gawk extension that other awk implementations silently
# ignore.
#
# NOTE: do not spell the build-tool names out in comments inside the awk
# program below. The program text is part of the awk process's own command
# line, so a literal occurrence there would show up in the process listing and
# make the gate detect itself. The [x]yz bracket form in the pattern exists for
# the same reason.
#######################################
list_foreign_web_builds() {
  bash -c "$PS_COMMAND" | awk '
    {
      line = tolower($0)
      if (line !~ /[n]ext[\/[:alnum:]._-]*[[:space:]]+build|[t]urbo[[:space:]]+build|[v]ite[[:space:]]+build/) next
      if (line ~ /\/workspace\/wooo\/awoooi/) next
      if (line ~ /\/users\/ogt\/awoooi/) next
      if (line ~ /\/private\/tmp\/awoooi/) next
      if (line ~ /\/app\/apps\/web/) next
      if (line ~ /scripts\/ci\/wait-host-web-build-pressure\.sh/) next
      print
    }
  '
}
# Gate loop: poll for foreign host-side web builds until none remain or the
# check budget is used up.
#
# Reads:  ATTEMPTS, SLEEP_SECONDS, WARN_ONLY, list_foreign_web_builds.
# Exits:  0 as soon as a check finds nothing;
#         0 when the budget is used up and WARN_ONLY is "1";
#         1 when the budget is used up and WARN_ONLY is anything else.

# A non-numeric budget would make the loop run zero times and the script would
# then report "still active" without ever having looked. Fall back to the
# documented default instead.
max_attempts="$ATTEMPTS"
if ! [[ "$max_attempts" =~ ^[0-9]+$ ]]; then
  echo "⚠️ ignoring invalid HOST_WEB_BUILD_PRESSURE_ATTEMPTS='${ATTEMPTS}'; using 60" >&2
  max_attempts=60
fi
# Force base 10 so a zero-padded value such as "08" is not read as octal.
max_attempts=$((10#$max_attempts))

active_builds=""
for ((attempt = 1; attempt <= max_attempts; attempt++)); do
  # `|| true`: a failing process listing counts as "nothing detected", so this
  # read-only gate never aborts the deploy by itself.
  active_builds="$(list_foreign_web_builds || true)"
  if [ -z "$active_builds" ]; then
    echo "✅ no foreign host web build pressure detected"
    exit 0
  fi
  # Sleep only between checks. Sleeping after the final check would delay the
  # deploy without any further check to act on the result.
  if ((attempt < max_attempts)); then
    echo "⏳ foreign host web build pressure detected (attempt ${attempt}/${max_attempts}); waiting ${SLEEP_SECONDS}s"
    # Here-string instead of `printf | head`: with pipefail and errexit, head
    # closing the pipe early on a large listing could abort the script.
    head -n 8 <<<"$active_builds"
    sleep "$SLEEP_SECONDS"
  fi
done

echo "⚠️ foreign host web build pressure still active after ${max_attempts} checks"
# Show what the last check saw (empty only when the budget was 0).
if [ -n "$active_builds" ]; then
  head -n 8 <<<"$active_builds"
fi

if [ "$WARN_ONLY" = "1" ]; then
  echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
  exit 0
fi

echo "❌ refusing to start AWOOI image build while foreign host web builds are still active"
exit 1