From 7cc10b2599047674507dc97432ae9d35c4f04dcb Mon Sep 17 00:00:00 2001
From: Your Name
Date: Thu, 30 Apr 2026 10:11:50 +0800
Subject: [PATCH] fix(cd): serialize gitea docker builds

---
Review note: the lock steps were revised (epoch label + ID-based removal), so
hunk line counts and the diffstat below were recomputed by hand. The cd.yaml
"index" post-image hash is still the one from the original commit.

 .gitea/workflows/cd.yaml | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 docs/LOGBOOK.md          | 13 +++++++++++++
 ops/runner/README.md     | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+)

diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index ff3893bd..b93e9d5a 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -204,6 +204,65 @@ jobs:
           username: ${{ secrets.HARBOR_USERNAME }}
           password: ${{ secrets.HARBOR_PASSWORD }}

+      # 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
+      # When another repo starts a heavy docker build while AWOOOI Web is still
+      # building, the job container can disappear and Docker reports RWLayer=nil.
+      # A Docker-network lock is global to the host daemon and survives container
+      # namespaces, unlike /tmp/flock inside the transient job container.
+      # The lock's age is read from an epoch label written at creation, and stale
+      # cleanup/release target the network ID, so a lock re-created under the
+      # same name by another job is never removed by mistake.
+      # NOTE(review): this relies on the daemon rejecting duplicate network
+      # names; confirm that holds for the Docker version on the runner host.
+      - name: Acquire Docker Build Lock
+        run: |
+          LOCK_NAME="awoooi-cd-docker-build-lock"
+          STALE_SECONDS=7200
+          WAIT_ATTEMPTS=180
+
+          for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
+            # A successful create means this job holds the lock; stdout is the
+            # new network's ID, which is exported for the release step.
+            if LOCK_ID=$(docker network create \
+              --label awoooi.ci-lock=docker-build \
+              --label awoooi.owner=cd-pipeline \
+              --label "awoooi.acquired-epoch=$(date +%s)" \
+              "$LOCK_NAME" 2>/dev/null); then
+              echo "DOCKER_BUILD_LOCK=${LOCK_NAME}" >> "$GITHUB_ENV"
+              echo "DOCKER_BUILD_LOCK_ID=${LOCK_ID}" >> "$GITHUB_ENV"
+              echo "✅ Docker build lock acquired: ${LOCK_NAME}"
+              exit 0
+            fi
+
+            # One inspect call yields "<id> <epoch>" so both values describe the
+            # same lock instance; empty output means it was just released.
+            HOLDER=$(docker network inspect "$LOCK_NAME" \
+              --format '{{.Id}} {{index .Labels "awoooi.acquired-epoch"}}' \
+              2>/dev/null || true)
+            HOLDER_ID=${HOLDER%% *}
+            HOLDER_EPOCH=${HOLDER#* }
+            case "$HOLDER_EPOCH" in
+              ''|*[!0-9]*) HOLDER_EPOCH=0 ;;
+            esac
+
+            if [ -n "$HOLDER_ID" ] && [ "$HOLDER_EPOCH" -gt 0 ]; then
+              NOW_EPOCH=$(date +%s)
+              if [ $((NOW_EPOCH - HOLDER_EPOCH)) -gt "$STALE_SECONDS" ]; then
+                echo "⚠️ stale Docker build lock detected, removing ${LOCK_NAME}"
+                # Remove by ID: if another waiter already replaced the stale
+                # lock, its network has a different ID and is left untouched.
+                docker network rm "$HOLDER_ID" >/dev/null 2>&1 || true
+                continue
+              fi
+            fi
+
+            echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting..."
+            sleep 10
+          done
+
+          echo "❌ timed out waiting for Docker build lock"
+          exit 1
+
       # ── API 鏡像建置(含 Layer Cache 加速)──────────────────────────────
       # 2026-04-01 ogt: CACHE_BUST=git_sha 確保 src/ 和 models.json 層每次重建
       # deps 層 (pip install) 仍可 cache → 加速;代碼/配置層強制失效
@@ -246,6 +305,20 @@ jobs:
           docker push ${{ env.HARBOR }}/awoooi/web:${{ github.sha }}
           docker push ${{ env.HARBOR }}/awoooi/web:latest

+      - name: Release Docker Build Lock
+        if: always()
+        run: |
+          # Prefer the network ID recorded at acquisition: if this job overran
+          # the stale threshold and another job re-created the lock under the
+          # same name, removing by name would release that job's lock instead.
+          LOCK_REF="${DOCKER_BUILD_LOCK_ID:-${DOCKER_BUILD_LOCK:-}}"
+          if [ -n "$LOCK_REF" ]; then
+            docker network rm "$LOCK_REF" >/dev/null 2>&1 || true
+            echo "✅ Docker build lock released: ${DOCKER_BUILD_LOCK:-$LOCK_REF}"
+          else
+            echo "⚡ no Docker build lock to release"
+          fi
+
       # 2026-03-31 ogt: 移除中間通知

       # 2026-03-31 ogt: P0-1 Secrets 自動注入 (ADR-035 強制)
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index cb076d16..779707ee 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,6 +6,19 @@

 ---

+## 2026-04-30 | CD Runner 並行 Build 修復 — RWLayer nil
+
+AWOOOI CD `Build and Push Web` 在 Gitea act-runner 內失敗:`RWLayer of container ... unexpectedly nil`。Web image 在 110 host 直接 build 成功,排除 Web 程式碼 build error。
+
+### 根因
+- 110 `gitea-runner` 實際使用 `/home/wooo/act-runner/config.yaml`,`runner.capacity: 2`。
+- AWOOOI Web build 還在跑時,runner 於 `2026-04-30T01:26:02Z` 接了另一個 repo task,兩個 task 共用同一個 Docker daemon。
+- AWOOOI job container 隨後消失,BuildKit 回報 `RWLayer ... unexpectedly nil`,後續 notify/post steps 也因 `No such container` 失敗。
+
+### 修法
+- `.gitea/workflows/cd.yaml` 新增 host-global Docker build lock,以 Docker network `awoooi-cd-docker-build-lock` 序列化 API/Web image build。
+- `ops/runner/README.md` 記錄 110 act-runner 必須 `capacity: 1`,並說明 stale lock 清理策略。
+
 ## 2026-04-30 | Prod 部署補救 — AI Telegram / Code Review 落地

 Gitea CD runner 在 Docker/act job 容器層反覆出現 `RWLayer ... unexpectedly nil`,導致 `639bb64` 功能 commit 未能進 prod,Telegram 仍顯示舊 ACTION REQUIRED 卡片。
diff --git a/ops/runner/README.md b/ops/runner/README.md
index 19cd6861..6b7e0f33 100644
--- a/ops/runner/README.md
+++ b/ops/runner/README.md
@@ -87,6 +87,43 @@ sudo systemctl start runner-diag-cleanup.service
 - Memory: `feedback_runner_zombie_process.md`
 - ADR: 待建立 (如果問題持續)

+## 問題: Gitea act-runner 並行 Docker Build 讓 Job Container 消失
+
+### 症狀
+
+```
+Error response from daemon: RWLayer of container is unexpectedly nil
+Error response from daemon: No such container:
+```
+
+### 根因分析 (2026-04-30)
+
+1. AWOOOI CD 在 `Build and Push Web` 仍執行 Next.js production build 時,110 的 `gitea-runner` 又接了另一個 repo 的 Actions task。
+2. 兩個 task 共用同一個 Docker daemon 與同一個 act-runner 容器;act-runner `capacity: 2` 允許跨 repo 並行。
+3. 第二個 task 啟動後,第一個 AWOOOI job container 被 Docker/act 清掉,BuildKit 後續只看到 `RWLayer ... unexpectedly nil`。
+4. Web image 在 110 host 直接 `docker build` 可成功,證明不是 Web 程式 build error。
+
+### 修復
+
+1. 110 act-runner 必須單工:
+
+```yaml
+# /home/wooo/act-runner/config.yaml
+runner:
+  capacity: 1
+```
+
+2. AWOOOI CD workflow 需要 Docker daemon 全域 lock:
+
+```yaml
+- name: Acquire Docker Build Lock
+  run: docker network create awoooi-cd-docker-build-lock
+```
+
+實作使用 Docker network 當 host-global lock,因為 `/tmp/flock` 只存在 transient job container 內,無法跨 repo/跨 container 生效。
+
+3. 若 job 非正常中止留下 lock,下一次 CD 會在 lock 超過 2 小時後移除 stale network。
+
 ---
 版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code
 變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups