diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index b93e9d5a..dec4b3ac 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -41,11 +41,10 @@ env:
OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=production
jobs:
- build-and-deploy:
+ tests:
# 2026-04-02 ogt: Gitea runner label 是 ubuntu-latest (非 GitHub 的 self-hosted)
# ADR-039 鐵律: 使用自建 runner,但 Gitea label matching 不同於 GitHub
- # 2026-04-02 Claude Code: 加入 timeout 防止 docker build/push 卡住超過 45 分鐘
- timeout-minutes: 45
+ timeout-minutes: 30
runs-on: ubuntu-latest
# 2026-04-10 ogt: B5 改用 docker run 本地啟動,移除 services: 宣告
# Gitea act runner 的 services: container name 為空,導致 CI 失敗
@@ -197,12 +196,42 @@ jobs:
# 清理
docker rm -f pg-test-b5 || true
+      - name: Notify Pipeline Failure
+        # 2026-04-30 Codex: tests job failure notifier; no jq dependency for host parity.
+        # Posts a failure summary for the tests stage to Telegram.
+        if: failure()
+        env:
+          # Values reach the script through the environment instead of being
+          # interpolated into its text, so quotes, backticks or $(...) inside a
+          # commit subject are handled as data and never evaluated by the shell.
+          COMMIT_MSG: ${{ steps.commit.outputs.message }}
+          SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
+          ACTOR: ${{ github.actor }}
+          TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
+          TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
+        run: |
+          # parse_mode=HTML requires &, < and > in the subject to be sent as
+          # entities; & is replaced first so the entities added next stay intact.
+          COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
+          MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🧪 Stage: tests\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
+          # -f turns an HTTP error from the Telegram API into a failed step.
+          curl -fS -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+            -d "chat_id=${TELEGRAM_CHAT_ID}" \
+            -d "parse_mode=HTML" \
+            --data-urlencode "text=${MSG}"
+
+ build-and-deploy:
+ # 2026-04-30 Codex: Docker builds run on the host runner. Long docker build
+ # steps were killing the transient act job container with RWLayer=nil.
+ needs: tests
+ timeout-minutes: 60
+ runs-on: awoooi-host
+ steps:
+ - uses: actions/checkout@v4
+
+      - name: Get Commit Info
+        id: commit
+        # Publishes short_sha, message and start_time as outputs of step "commit".
+        run: |
+          echo "short_sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
+          # Keep the first 50 characters of the subject, not the first 50 bytes:
+          # a byte cut (head -c) can split a multi-byte UTF-8 character and leave
+          # an invalid sequence in the output. Bash counts characters in
+          # ${var:offset:length} when a UTF-8 locale is active.
+          # NOTE(review): assumes the C.UTF-8 locale exists on the runner — confirm.
+          SUBJECT="$(git log -1 --pretty=%s)"
+          LC_ALL=C.UTF-8
+          echo "message=${SUBJECT:0:50}" >> "$GITHUB_OUTPUT"
+          echo "start_time=$(date +%s)" >> "$GITHUB_OUTPUT"
+
- name: Login to Harbor
- uses: docker/login-action@v3
- with:
- registry: ${{ env.HARBOR }}
- username: ${{ secrets.HARBOR_USERNAME }}
- password: ${{ secrets.HARBOR_PASSWORD }}
+        env:
+          # Credentials are handed to the shell as environment variables so that
+          # quotes, `$` or backticks inside a secret cannot break or alter the command.
+          HARBOR_USERNAME: ${{ secrets.HARBOR_USERNAME }}
+          HARBOR_PASSWORD: ${{ secrets.HARBOR_PASSWORD }}
+        run: |
+          # The password is piped on stdin (--password-stdin) rather than passed
+          # as a command-line argument; printf '%s' writes it without a trailing newline.
+          printf '%s' "$HARBOR_PASSWORD" | \
+            docker login "${{ env.HARBOR }}" \
+              -u "$HARBOR_USERNAME" \
+              --password-stdin
# 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
# When another repo starts a heavy docker build while AWOOOI Web is still
@@ -562,9 +591,13 @@ jobs:
echo "✅ Service Registry ConfigMap 已更新"
# ─── Step 2: 更新 kustomization.yaml image tag ───
- # 安裝 kustomize(若未安裝)
+ # The host runner is not guaranteed to have root access, so kustomize is
+ # installed under the user's home directory instead of /usr/local/bin.
+ export PATH="${HOME}/.local/bin:${PATH}"
if ! command -v kustomize &>/dev/null; then
- curl -sL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz | tar xz -C /usr/local/bin
+ mkdir -p "${HOME}/.local/bin"
+ # -f makes curl exit non-zero on an HTTP error instead of piping the error
+ # page into tar; -S still prints the reason while -s hides the progress meter.
+ curl -fsSL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz \
+ | tar xz -C "${HOME}/.local/bin"
+ chmod +x "${HOME}/.local/bin/kustomize"
fi
cd k8s/awoooi-prod
@@ -668,6 +701,33 @@ jobs:
"chmod +x ~/awoooi-ops/docker-health-monitor.sh ~/awoooi-ops/pg-backup.sh && echo '✅ 權限設定完成'" \
|| echo "⚠️ 權限設定失敗"
+      - name: Notify Pipeline Failure
+        # Posts a failure summary for the build-and-deploy stage to Telegram.
+        if: failure()
+        env:
+          # Values reach the script through the environment instead of being
+          # interpolated into its text, so quotes, backticks or $(...) inside a
+          # commit subject are handled as data and never evaluated by the shell.
+          COMMIT_MSG: ${{ steps.commit.outputs.message }}
+          SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
+          ACTOR: ${{ github.actor }}
+          TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
+          TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
+        run: |
+          # parse_mode=HTML requires &, < and > in the subject to be sent as
+          # entities; & is replaced first so the entities added next stay intact.
+          COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
+          MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
+          # -f turns an HTTP error from the Telegram API into a failed step.
+          curl -fS -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+            -d "chat_id=${TELEGRAM_CHAT_ID}" \
+            -d "parse_mode=HTML" \
+            --data-urlencode "text=${MSG}"
+
+ post-deploy-checks:
+ needs: build-and-deploy
+ timeout-minutes: 30
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+      - name: Get Commit Info
+        id: commit
+        # Publishes short_sha, message and start_time as outputs of step "commit".
+        run: |
+          echo "short_sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
+          # Keep the first 50 characters of the subject, not the first 50 bytes:
+          # a byte cut (head -c) can split a multi-byte UTF-8 character and leave
+          # an invalid sequence in the output. Bash counts characters in
+          # ${var:offset:length} when a UTF-8 locale is active.
+          # NOTE(review): assumes the C.UTF-8 locale exists on the runner — confirm.
+          SUBJECT="$(git log -1 --pretty=%s)"
+          LC_ALL=C.UTF-8
+          echo "message=${SUBJECT:0:50}" >> "$GITHUB_OUTPUT"
+          echo "start_time=$(date +%s)" >> "$GITHUB_OUTPUT"
+
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
# 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter
# 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step
@@ -766,7 +826,8 @@ jobs:
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
ACTOR="${{ github.actor }}"
+ # parse_mode=HTML requires &, < and > in the subject to be sent as entities;
+ # & is replaced first so the entities added next stay intact.
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
- MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
+ MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🩺 Stage: post-deploy-checks\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
- -H "Content-Type: application/json" \
- -d "$(jq -n --arg c "${{ secrets.TELEGRAM_CHAT_ID }}" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML"}')"
+ -d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \
+ -d "parse_mode=HTML" \
+ --data-urlencode "text=${MSG}"
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 779707ee..bf68b68a 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,6 +6,21 @@
---
+## 2026-04-30 | CD Runner 拆段 — host build/deploy
+
+承接 `RWLayer ... unexpectedly nil` 持續打斷 Gitea CD 的問題。第一層 `capacity: 1` + Docker lock 可阻止跨 repo 並行,但長時間 Web build 仍會讓 transient act job container 在 build 收尾消失。
+
+### 完成
+- 110 停用 Docker-wrapped `gitea-runner` container,改保留 host-level `act_runner` daemon。
+- `/home/wooo/act-runner/config.yaml` 新增 `awoooi-host:host` label,並保留 `ubuntu-latest` Docker label 給測試 job。
+- `.gitea/workflows/cd.yaml` 拆為 `tests`、`build-and-deploy`、`post-deploy-checks` 三段;API/Web Docker build 與 GitOps deploy 改跑 `awoooi-host`,不再在 transient act job container 內長時間 build。
+- host deploy step 的 `kustomize` 改安裝到 `${HOME}/.local/bin`,避免 host runner 沒有 root 權限時寫 `/usr/local/bin` 失敗。
+
+### 驗證
+- 110 act_runner 已宣告 labels: `ubuntu-latest ubuntu-22.04 ubuntu-24.04 awoooi-host`。
+- Docker-wrapped `gitea-runner` restart policy 已改 `no` 且狀態為 exited。
+- `.gitea/workflows/cd.yaml` YAML parse 通過,所有 `run:` block `bash -n` 通過。
+
## 2026-04-30 | CD Runner 並行 Build 修復 — RWLayer nil
AWOOOI CD `Build and Push Web` 在 Gitea act-runner 內失敗:`RWLayer of container ... unexpectedly nil`。Web image 在 110 host 直接 build 成功,排除 Web 程式碼 build error。
diff --git a/ops/runner/README.md b/ops/runner/README.md
index 6b7e0f33..93d66ed3 100644
--- a/ops/runner/README.md
+++ b/ops/runner/README.md
@@ -103,7 +103,7 @@ Error response from daemon: No such container:
3. 第二個 task 啟動後,第一個 AWOOOI job container 被 Docker/act 清掉,BuildKit 後續只看到 `RWLayer ... unexpectedly nil`。
4. Web image 在 110 host 直接 `docker build` 可成功,證明不是 Web 程式 build error。
-### 修復
+### 第一層修復
1. 110 act-runner 必須單工:
@@ -124,6 +124,33 @@ runner:
3. 若 job 非正常中止留下 lock,下一次 CD 會在 lock 超過 2 小時後移除 stale network。
+### 第二層修復: host label build/deploy
+
+`capacity: 1` 與 Docker network lock 可避免跨 repo 並行,但長時間
+`docker build` 仍可能讓 transient act job container 在 build 收尾時消失。
+2026-04-30 起,AWOOOI CD 拆成三段:
+
+| Job | runner label | 用途 |
+|-----|--------------|------|
+| `tests` | `ubuntu-latest` | API unit + B5 integration tests,仍跑在 ci-runner container |
+| `build-and-deploy` | `awoooi-host` | Harbor login、API/Web image build/push、GitOps deploy,直接跑在 110 host |
+| `post-deploy-checks` | `ubuntu-latest` | Alert chain、monitoring coverage、Playwright smoke |
+
+110 只保留 host-level `act_runner` daemon,並在同一份 config 宣告兩類 label:
+
+```yaml
+runner:
+ capacity: 1
+ labels:
+ - "ubuntu-latest:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"
+ - "ubuntu-22.04:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"
+ - "ubuntu-24.04:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"
+ - "awoooi-host:host"
+```
+
+Docker-wrapped `gitea-runner` container 必須停用,避免它用同一份 config
+搶走 `awoooi-host` job,導致 host job 其實跑在 runner container 裡。
+
---
版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code
變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups