fix(cd): run docker builds on host runner
This commit is contained in:
@@ -41,11 +41,10 @@ env:
|
||||
OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=production
|
||||
|
||||
jobs:
|
||||
build-and-deploy:
|
||||
tests:
|
||||
# 2026-04-02 ogt: Gitea runner label 是 ubuntu-latest (非 GitHub 的 self-hosted)
|
||||
# ADR-039 鐵律: 使用自建 runner,但 Gitea label matching 不同於 GitHub
|
||||
# 2026-04-02 Claude Code: job timeout so a hung step cannot block the runner (tests: 30 min; docker build/push now lives in build-and-deploy with its own 60 min limit)
|
||||
timeout-minutes: 45
|
||||
timeout-minutes: 30
|
||||
runs-on: ubuntu-latest
|
||||
# 2026-04-10 ogt: B5 改用 docker run 本地啟動,移除 services: 宣告
|
||||
# Gitea act runner 的 services: container name 為空,導致 CI 失敗
|
||||
@@ -197,12 +196,42 @@ jobs:
|
||||
# 清理
|
||||
docker rm -f pg-test-b5 || true
|
||||
|
||||
- name: Notify Pipeline Failure
  # 2026-04-30 Codex: tests job failure notifier; no jq dependency for host parity.
  # Posts an HTML-formatted Telegram message when a previous step of this job failed.
  if: failure()
  env:
    # Values are handed to the script through the environment instead of being
    # interpolated into the shell source, so quotes, backticks or $(...) inside
    # a commit subject cannot change the commands that run.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    ACTOR: ${{ github.actor }}
  run: |
    # Telegram parse_mode=HTML requires &, < and > in plain text to be sent as
    # entities. '&' must be escaped first; '\&' is a literal ampersand in a sed
    # replacement (a bare '&' would mean "the matched text").
    COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🧪 Stage: tests\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    # Form-encoded POST; --data-urlencode keeps newlines and non-ASCII text intact.
    curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
      -d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \
      -d "parse_mode=HTML" \
      --data-urlencode "text=${MSG}"
|
||||
|
||||
build-and-deploy:
|
||||
# 2026-04-30 Codex: Docker builds run on the host runner. Long docker build
|
||||
# steps were killing the transient act job container with RWLayer=nil.
|
||||
needs: tests
|
||||
timeout-minutes: 60
|
||||
runs-on: awoooi-host
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Get Commit Info
  # Publishes commit metadata as step outputs (short_sha, message, start_time)
  # for later steps, e.g. the failure notifier reads steps.commit.outputs.*.
  id: commit
  run: |
    # One grouped, quoted redirect: the output file path is expanded once and
    # survives spaces in the runner's working directory.
    {
      echo "short_sha=${GITHUB_SHA::7}"
      # Subject line only, capped at 50 bytes. head -c counts bytes, so a
      # multi-byte (e.g. CJK) subject can be cut in the middle of a character.
      echo "message=$(git log -1 --pretty=%s | head -c 50)"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Login to Harbor
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.HARBOR }}
|
||||
username: ${{ secrets.HARBOR_USERNAME }}
|
||||
password: ${{ secrets.HARBOR_PASSWORD }}
|
||||
run: |
|
||||
echo "${{ secrets.HARBOR_PASSWORD }}" | \
|
||||
docker login "${{ env.HARBOR }}" \
|
||||
-u "${{ secrets.HARBOR_USERNAME }}" \
|
||||
--password-stdin
|
||||
|
||||
# 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
|
||||
# When another repo starts a heavy docker build while AWOOOI Web is still
|
||||
@@ -562,9 +591,13 @@ jobs:
|
||||
echo "✅ Service Registry ConfigMap 已更新"
|
||||
|
||||
# ─── Step 2: 更新 kustomization.yaml image tag ───
|
||||
# 安裝 kustomize(若未安裝)
|
||||
# host runner 不保證有 root 權限,kustomize 安裝在使用者目錄。
|
||||
export PATH="${HOME}/.local/bin:${PATH}"
|
||||
if ! command -v kustomize &>/dev/null; then
|
||||
curl -sL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz | tar xz -C /usr/local/bin
|
||||
mkdir -p "${HOME}/.local/bin"
|
||||
curl -sL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz \
|
||||
| tar xz -C "${HOME}/.local/bin"
|
||||
chmod +x "${HOME}/.local/bin/kustomize"
|
||||
fi
|
||||
|
||||
cd k8s/awoooi-prod
|
||||
@@ -668,6 +701,33 @@ jobs:
|
||||
"chmod +x ~/awoooi-ops/docker-health-monitor.sh ~/awoooi-ops/pg-backup.sh && echo '✅ 權限設定完成'" \
|
||||
|| echo "⚠️ 權限設定失敗"
|
||||
|
||||
- name: Notify Pipeline Failure
  # Posts an HTML-formatted Telegram message when a previous step of this job failed.
  if: failure()
  env:
    # Values are handed to the script through the environment instead of being
    # interpolated into the shell source, so quotes, backticks or $(...) inside
    # a commit subject cannot change the commands that run.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    ACTOR: ${{ github.actor }}
  run: |
    # Telegram parse_mode=HTML requires &, < and > in plain text to be sent as
    # entities. '&' must be escaped first; '\&' is a literal ampersand in a sed
    # replacement (a bare '&' would mean "the matched text").
    COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    # Form-encoded POST; --data-urlencode keeps newlines and non-ASCII text intact.
    curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
      -d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \
      -d "parse_mode=HTML" \
      --data-urlencode "text=${MSG}"
|
||||
|
||||
post-deploy-checks:
|
||||
needs: build-and-deploy
|
||||
timeout-minutes: 30
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Get Commit Info
  # Publishes commit metadata as step outputs (short_sha, message, start_time)
  # for later steps, e.g. the failure notifier reads steps.commit.outputs.*.
  id: commit
  run: |
    # One grouped, quoted redirect: the output file path is expanded once and
    # survives spaces in the runner's working directory.
    {
      echo "short_sha=${GITHUB_SHA::7}"
      # Subject line only, capped at 50 bytes. head -c counts bytes, so a
      # multi-byte (e.g. CJK) subject can be cut in the middle of a character.
      echo "message=$(git log -1 --pretty=%s | head -c 50)"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
|
||||
# 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter
|
||||
# 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step
|
||||
@@ -766,7 +826,8 @@ jobs:
|
||||
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
|
||||
ACTOR="${{ github.actor }}"
|
||||
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&/g; s/</\</g; s/>/\>/g')
|
||||
MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
|
||||
MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🩺 Stage: post-deploy-checks\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
|
||||
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$(jq -n --arg c "${{ secrets.TELEGRAM_CHAT_ID }}" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML"}')"
|
||||
-d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \
|
||||
-d "parse_mode=HTML" \
|
||||
--data-urlencode "text=${MSG}"
|
||||
|
||||
@@ -6,6 +6,21 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-04-30 | CD Runner 拆段 — host build/deploy
|
||||
|
||||
承接 `RWLayer ... unexpectedly nil` 持續打斷 Gitea CD 的問題。第一層 `capacity: 1` + Docker lock 可阻止跨 repo 並行,但長時間 Web build 仍會讓 transient act job container 在 build 收尾消失。
|
||||
|
||||
### 完成
|
||||
- 110 停用 Docker-wrapped `gitea-runner` container,改保留 host-level `act_runner` daemon。
|
||||
- `/home/wooo/act-runner/config.yaml` 新增 `awoooi-host:host` label,並保留 `ubuntu-latest` Docker label 給測試 job。
|
||||
- `.gitea/workflows/cd.yaml` 拆為 `tests`、`build-and-deploy`、`post-deploy-checks` 三段;API/Web Docker build 與 GitOps deploy 改跑 `awoooi-host`,不再在 transient act job container 內長時間 build。
|
||||
- host deploy step 的 `kustomize` 改安裝到 `${HOME}/.local/bin`,避免 host runner 沒有 root 權限時寫 `/usr/local/bin` 失敗。
|
||||
|
||||
### 驗證
|
||||
- 110 act_runner 已宣告 labels: `ubuntu-latest ubuntu-22.04 ubuntu-24.04 awoooi-host`。
|
||||
- Docker-wrapped `gitea-runner` restart policy 已改 `no` 且狀態為 exited。
|
||||
- `.gitea/workflows/cd.yaml` YAML parse 通過,所有 `run:` block `bash -n` 通過。
|
||||
|
||||
## 2026-04-30 | CD Runner 並行 Build 修復 — RWLayer nil
|
||||
|
||||
AWOOOI CD `Build and Push Web` 在 Gitea act-runner 內失敗:`RWLayer of container ... unexpectedly nil`。Web image 在 110 host 直接 build 成功,排除 Web 程式碼 build error。
|
||||
|
||||
@@ -103,7 +103,7 @@ Error response from daemon: No such container: <id>
|
||||
3. 第二個 task 啟動後,第一個 AWOOOI job container 被 Docker/act 清掉,BuildKit 後續只看到 `RWLayer ... unexpectedly nil`。
|
||||
4. Web image 在 110 host 直接 `docker build` 可成功,證明不是 Web 程式 build error。
|
||||
|
||||
### 修復
|
||||
### 第一層修復
|
||||
|
||||
1. 110 act-runner 必須單工:
|
||||
|
||||
@@ -124,6 +124,33 @@ runner:
|
||||
|
||||
3. 若 job 非正常中止留下 lock,下一次 CD 會在 lock 超過 2 小時後移除 stale network。
|
||||
|
||||
### 第二層修復: host label build/deploy
|
||||
|
||||
`capacity: 1` 與 Docker network lock 可避免跨 repo 並行,但長時間
|
||||
`docker build` 仍可能讓 transient act job container 在 build 收尾時消失。
|
||||
2026-04-30 起,AWOOOI CD 拆成三段:
|
||||
|
||||
| Job | runner label | 用途 |
|
||||
|-----|--------------|------|
|
||||
| `tests` | `ubuntu-latest` | API unit + B5 integration tests,仍跑在 ci-runner container |
|
||||
| `build-and-deploy` | `awoooi-host` | Harbor login、API/Web image build/push、GitOps deploy,直接跑在 110 host |
|
||||
| `post-deploy-checks` | `ubuntu-latest` | Alert chain、monitoring coverage、Playwright smoke |
|
||||
|
||||
110 只保留 host-level `act_runner` daemon,並在同一份 config 宣告兩類 label:
|
||||
|
||||
```yaml
|
||||
runner:
|
||||
capacity: 1
|
||||
labels:
|
||||
- "ubuntu-latest:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"
|
||||
- "ubuntu-22.04:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"
|
||||
- "ubuntu-24.04:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"
|
||||
- "awoooi-host:host"
|
||||
```
|
||||
|
||||
Docker-wrapped `gitea-runner` container 必須停用,避免它用同一份 config
|
||||
搶走 `awoooi-host` job,導致 host job 其實跑在 runner container 裡。
|
||||
|
||||
---
|
||||
版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code
|
||||
變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups
|
||||
|
||||
Reference in New Issue
Block a user