diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index b93e9d5a..dec4b3ac 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -41,11 +41,10 @@ env: OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=production jobs: - build-and-deploy: + tests: # 2026-04-02 ogt: Gitea runner label 是 ubuntu-latest (非 GitHub 的 self-hosted) # ADR-039 鐵律: 使用自建 runner,但 Gitea label matching 不同於 GitHub - # 2026-04-02 Claude Code: 加入 timeout 防止 docker build/push 卡住超過 45 分鐘 - timeout-minutes: 45 + timeout-minutes: 30 runs-on: ubuntu-latest # 2026-04-10 ogt: B5 改用 docker run 本地啟動,移除 services: 宣告 # Gitea act runner 的 services: container name 為空,導致 CI 失敗 @@ -197,12 +196,42 @@ jobs: # 清理 docker rm -f pg-test-b5 || true + - name: Notify Pipeline Failure + # 2026-04-30 Codex: tests job failure notifier; no jq dependency for host parity. + if: failure() + run: | + COMMIT_MSG="${{ steps.commit.outputs.message }}" + SHORT_SHA="${{ steps.commit.outputs.short_sha }}" + ACTOR="${{ github.actor }}" + COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g') + MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🧪 Stage: tests\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}") + curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ + -d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \ + -d "parse_mode=HTML" \ + --data-urlencode "text=${MSG}" + + build-and-deploy: + # 2026-04-30 Codex: Docker builds run on the host runner. Long docker build + # steps were killing the transient act job container with RWLayer=nil. 
+ needs: tests + timeout-minutes: 60 + runs-on: awoooi-host + steps: + - uses: actions/checkout@v4 + + - name: Get Commit Info + id: commit + run: | + echo "short_sha=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + echo "message=$(git log -1 --pretty=%s | head -c 50)" >> $GITHUB_OUTPUT + echo "start_time=$(date +%s)" >> $GITHUB_OUTPUT + - name: Login to Harbor - uses: docker/login-action@v3 - with: - registry: ${{ env.HARBOR }} - username: ${{ secrets.HARBOR_USERNAME }} - password: ${{ secrets.HARBOR_PASSWORD }} + run: | + echo "${{ secrets.HARBOR_PASSWORD }}" | \ + docker login "${{ env.HARBOR }}" \ + -u "${{ secrets.HARBOR_USERNAME }}" \ + --password-stdin # 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos. # When another repo starts a heavy docker build while AWOOOI Web is still @@ -562,9 +591,13 @@ jobs: echo "✅ Service Registry ConfigMap 已更新" # ─── Step 2: 更新 kustomization.yaml image tag ─── - # 安裝 kustomize(若未安裝) + # host runner 不保證有 root 權限,kustomize 安裝在使用者目錄。 + export PATH="${HOME}/.local/bin:${PATH}" if ! 
command -v kustomize &>/dev/null; then - curl -sL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz | tar xz -C /usr/local/bin + mkdir -p "${HOME}/.local/bin" + curl -sL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz \ + | tar xz -C "${HOME}/.local/bin" + chmod +x "${HOME}/.local/bin/kustomize" fi cd k8s/awoooi-prod @@ -668,6 +701,33 @@ jobs: "chmod +x ~/awoooi-ops/docker-health-monitor.sh ~/awoooi-ops/pg-backup.sh && echo '✅ 權限設定完成'" \ || echo "⚠️ 權限設定失敗" + - name: Notify Pipeline Failure + if: failure() + run: | + COMMIT_MSG="${{ steps.commit.outputs.message }}" + SHORT_SHA="${{ steps.commit.outputs.short_sha }}" + ACTOR="${{ github.actor }}" + COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g') + MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}") + curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ + -d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \ + -d "parse_mode=HTML" \ + --data-urlencode "text=${MSG}" + + post-deploy-checks: + needs: build-and-deploy + timeout-minutes: 30 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Get Commit Info + id: commit + run: | + echo "short_sha=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + echo "message=$(git log -1 --pretty=%s | head -c 50)" >> $GITHUB_OUTPUT + echo "start_time=$(date +%s)" >> $GITHUB_OUTPUT + # Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037) # 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter # 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step @@ -766,7 +826,8 @@ jobs: SHORT_SHA="${{ steps.commit.outputs.short_sha }}" ACTOR="${{ github.actor }}" COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g') - 
MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}") + MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🩺 Stage: post-deploy-checks\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}") curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -H "Content-Type: application/json" \ - -d "$(jq -n --arg c "${{ secrets.TELEGRAM_CHAT_ID }}" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML"}')" + -d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \ + -d "parse_mode=HTML" \ + --data-urlencode "text=${MSG}" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 779707ee..bf68b68a 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,21 @@ --- +## 2026-04-30 | CD Runner 拆段 — host build/deploy + +承接 `RWLayer ... unexpectedly nil` 持續打斷 Gitea CD 的問題。第一層 `capacity: 1` + Docker lock 可阻止跨 repo 並行,但長時間 Web build 仍會讓 transient act job container 在 build 收尾消失。 + +### 完成 +- 110 停用 Docker-wrapped `gitea-runner` container,改保留 host-level `act_runner` daemon。 +- `/home/wooo/act-runner/config.yaml` 新增 `awoooi-host:host` label,並保留 `ubuntu-latest` Docker label 給測試 job。 +- `.gitea/workflows/cd.yaml` 拆為 `tests`、`build-and-deploy`、`post-deploy-checks` 三段;API/Web Docker build 與 GitOps deploy 改跑 `awoooi-host`,不再在 transient act job container 內長時間 build。 +- host deploy step 的 `kustomize` 改安裝到 `${HOME}/.local/bin`,避免 host runner 沒有 root 權限時寫 `/usr/local/bin` 失敗。 + +### 驗證 +- 110 act_runner 已宣告 labels: `ubuntu-latest ubuntu-22.04 ubuntu-24.04 awoooi-host`。 +- Docker-wrapped `gitea-runner` restart policy 已改 `no` 且狀態為 exited。 +- `.gitea/workflows/cd.yaml` YAML parse 通過,所有 `run:` block `bash -n` 通過。 + ## 2026-04-30 | CD Runner 並行 Build 修復 — RWLayer nil AWOOOI CD `Build and Push Web` 在 Gitea act-runner 內失敗:`RWLayer of container ... 
unexpectedly nil`。Web image 在 110 host 直接 build 成功,排除 Web 程式碼 build error。 diff --git a/ops/runner/README.md b/ops/runner/README.md index 6b7e0f33..93d66ed3 100644 --- a/ops/runner/README.md +++ b/ops/runner/README.md @@ -103,7 +103,7 @@ Error response from daemon: No such container: 3. 第二個 task 啟動後,第一個 AWOOOI job container 被 Docker/act 清掉,BuildKit 後續只看到 `RWLayer ... unexpectedly nil`。 4. Web image 在 110 host 直接 `docker build` 可成功,證明不是 Web 程式 build error。 -### 修復 +### 第一層修復 1. 110 act-runner 必須單工: @@ -124,6 +124,33 @@ runner: 3. 若 job 非正常中止留下 lock,下一次 CD 會在 lock 超過 2 小時後移除 stale network。 +### 第二層修復: host label build/deploy + +`capacity: 1` 與 Docker network lock 可避免跨 repo 並行,但長時間 +`docker build` 仍可能讓 transient act job container 在 build 收尾時消失。 +2026-04-30 起,AWOOOI CD 拆成三段: + +| Job | runner label | 用途 | +|-----|--------------|------| +| `tests` | `ubuntu-latest` | API unit + B5 integration tests,仍跑在 ci-runner container | +| `build-and-deploy` | `awoooi-host` | Harbor login、API/Web image build/push、GitOps deploy,直接跑在 110 host | +| `post-deploy-checks` | `ubuntu-latest` | Alert chain、monitoring coverage、Playwright smoke | + +110 只保留 host-level `act_runner` daemon,並在同一份 config 宣告兩類 label: + +```yaml +runner: + capacity: 1 + labels: + - "ubuntu-latest:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" + - "ubuntu-22.04:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" + - "ubuntu-24.04:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" + - "awoooi-host:host" +``` + +Docker-wrapped `gitea-runner` container 必須停用,避免它用同一份 config +搶走 `awoooi-host` job,導致 host job 其實跑在 runner container 裡。 + --- 版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code 變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups