問題: 多個 commit 快速推版時排隊堆積;docker build 卡住阻塞整條 queue 根因: cancel-in-progress:false 讓每個 commit 都排隊等,新的無法取消舊的 修復: cancel-in-progress:true — 新 push 立即取消舊 build,只部署最新 commit 安全: concurrency group 保證同時只有一個 job 跑,kubectl rollout status 防半部署 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
340 lines
17 KiB
YAML
340 lines
17 KiB
YAML
# =============================================================================
|
||
# AWOOOI CD Pipeline (Gitea Actions - 方案 B)
|
||
# =============================================================================
|
||
# 流程: Build → Push to Harbor → Deploy to K8s
|
||
# 加速措施:
|
||
# 1. Docker Layer Cache → Harbor registry cache
|
||
# 2. 內部 Mirror → 192.168.0.110:5001 (Harbor Proxy Cache for DockerHub)
|
||
# 2026-03-29 Claude Code (ADR-039) - Retry after creating Harbor project
|
||
|
||
name: CD Pipeline

on:
  push:
    branches:
      - main
    # Only changes that actually affect what gets deployed trigger CD;
    # docs/, memory/, ADRs, ops/ and similar paths do not.
    paths:
      - "apps/**"
      - "k8s/**"
      - ".gitea/workflows/**"
  # Manual trigger is always available (re-runs, emergency deployments).
  workflow_dispatch:
# 2026-04-02 Claude Code: switched to preemptive mode — a new push immediately
# cancels the older build so that only the latest commit is deployed.
# How:    the concurrency group lets a single run per ref execute at a time,
#         and cancel-in-progress: true lets the newest run replace the old one.
# Why:    rapid successive pushes no longer pile up in a queue, and a hung
#         docker build no longer blocks the deployments behind it.
# Safety: (original author's rationale) the deploy step is guarded by
#         `kubectl rollout status`.
# NOTE(review): a cancellation that lands between the separate
# `kubectl set image` calls of the deploy step could presumably still leave the
# deployments on different image tags until the next run finishes — confirm.
concurrency:
  group: "cd-deploy-${{ github.ref }}"
  cancel-in-progress: true
env:
  # Registry that the build steps tag and push images to.
  HARBOR: "192.168.0.110:5000"
  # Harbor proxy cache (internal mirror in front of DockerHub, avoids pull
  # rate limits).
  HARBOR_MIRROR: "192.168.0.110:5001"
  # OTEL CI/CD monitoring (2026-03-31 #46c - migrated to Gitea).
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://192.168.0.188:24318"
  OTEL_SERVICE_NAME: "awoooi-cd"
  OTEL_RESOURCE_ATTRIBUTES: "service.version=${{ github.sha }},deployment.environment=production"
jobs:
  build-and-deploy:
    # 2026-04-02 ogt: the Gitea runner label is ubuntu-latest (not GitHub's
    # self-hosted label).
    # ADR-039 rule: use the self-built runner; note that Gitea label matching
    # differs from GitHub's.
    runs-on: ubuntu-latest
    # 2026-04-02 Claude Code: timeout added so a hung docker build/push cannot
    # run for more than 45 minutes.
    timeout-minutes: 45
    steps:
      - uses: actions/checkout@v4
# 2026-03-31 ogt: improved alert formatting for readability.
# Publishes three step outputs consumed by the Telegram notification steps:
#   short_sha  - first 7 characters of GITHUB_SHA
#   message    - commit subject, limited to 50 bytes and HTML-escaped
#   start_time - epoch seconds, used to compute the deployment duration
- name: Get Commit Info
  id: commit
  run: |
    echo "short_sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
    # The subject is embedded in Telegram messages sent with parse_mode=HTML.
    # A plain byte cut (head -c 50) can split a multi-byte UTF-8 character, and
    # an unescaped '<' or '&' makes Telegram reject the message, which in turn
    # fails the notification step (curl -f). Decode with "ignore" to drop a
    # truncated trailing character, then escape the HTML metacharacters.
    subject=$(git log -1 --pretty=%s | python3 -c 'import html, sys; print(html.escape(sys.stdin.buffer.read()[:50].decode("utf-8", "ignore"), quote=False))')
    echo "message=${subject}" >> "$GITHUB_OUTPUT"
    echo "start_time=$(date +%s)" >> "$GITHUB_OUTPUT"
# Sends the "deployment started" message to Telegram. The message body is
# piped through stdin and URL-encoded by curl (text@-); curl -f makes the step
# fail on an HTTP error response.
- name: Notify Pipeline Start
  env:
    TG_MSG: "🚀 <b>AWOOOI 部署開始</b>\n├ 📝 ${{ steps.commit.outputs.message }}\n├ 🔖 <code>${{ steps.commit.outputs.short_sha }}</code>\n├ 👤 ${{ github.actor }}\n└ 🌿 main"
    TG_API_URL: "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage"
    TG_CHAT_ID: "${{ secrets.TELEGRAM_CHAT_ID }}"
  run: |
    printf '%b' "$TG_MSG" \
      | curl -fS -X POST "$TG_API_URL" \
          -d "chat_id=${TG_CHAT_ID}" \
          -d "parse_mode=HTML" \
          --data-urlencode "text@-"
# 2026-03-31 ogt: Phase 22.0 CI tests (mocks are forbidden -
# feedback_no_mock_testing.md).
# 2026-04-01 ogt: persistent venv speed-up - /opt/api-venv is kept across runs.
# Dependencies are reinstalled only when the pyproject.toml hash changes;
# otherwise the venv is simply activated (saves ~6-7 min).
- name: Run API Tests
  run: |
    # pipefail: without it the exit status of "pytest ... | tail -50" is
    # tail's, so failing tests would not fail the step when the runner's
    # default shell does not already enable pipefail.
    set -eo pipefail

    VENV=/opt/api-venv
    HASH_FILE="$VENV/.deps_hash"
    CURRENT_HASH=$(md5sum apps/api/pyproject.toml | awk '{print $1}')

    if [ ! -d "$VENV" ] || [ "$(cat "$HASH_FILE" 2>/dev/null)" != "$CURRENT_HASH" ]; then
      echo "📦 deps 已變更,重建 venv..."
      python3 -m venv "$VENV"
      source "$VENV/bin/activate"
      pip install -q uv
      # Install inside a subshell: in an "a && b && c" list errexit ignores a
      # failing non-final command, so a failed install used to fall through
      # and still record the hash, leaving a broken venv marked as up to date.
      (cd apps/api && uv pip install -q -e ".[dev]")
      # Reached only after a successful install.
      echo "$CURRENT_HASH" > "$HASH_FILE"
    else
      echo "⚡ 使用快取 venv (deps 未變更)"
      source "$VENV/bin/activate"
    fi

    cd apps/api
    # CI excludes the tests that need external services
    # (Redis pool / Ollama — 2026-04-01 Claude Code).
    pytest tests/ -v --tb=short -x \
      --ignore=tests/test_anomaly_counter.py \
      --ignore=tests/test_global_repair_cooldown.py \
      --ignore=tests/test_redis_multisig.py \
      --ignore=tests/test_model_regression.py \
      --ignore=tests/test_prompt_validation.py \
      2>&1 | tail -50
    echo "✅ API 測試通過"
# Logs the runner's Docker client in to the registry named by env.HARBOR,
# using the HARBOR_USERNAME / HARBOR_PASSWORD secrets.
- name: Login to Harbor
  uses: docker/login-action@v3
  with:
    registry: "${{ env.HARBOR }}"
    username: "${{ secrets.HARBOR_USERNAME }}"
    password: "${{ secrets.HARBOR_PASSWORD }}"
# ── API image build (with layer-cache acceleration) ─────────────────────────
# 2026-04-01 ogt: CACHE_BUST=<git sha> makes the src/ and models.json layers
# rebuild on every run; the dependency layer (pip install) can still be served
# from cache, so only the code/config layers are force-invalidated.
# 2026-03-31 ogt: the intermediate notification after this step was removed to
# reduce message noise.
- name: Build and Push API
  run: |
    repo="${{ env.HARBOR }}/awoooi/api"
    sha="${{ github.sha }}"

    docker build \
      --file apps/api/Dockerfile \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${repo}:latest" \
      --build-arg "CACHE_BUST=${sha}" \
      --tag "${repo}:${sha}" \
      --tag "${repo}:latest" \
      .

    # Push the immutable sha tag first, then the moving "latest" tag.
    for tag in "${sha}" latest; do
      docker push "${repo}:${tag}"
    done
# ── Web image build (precise cache invalidation) ────────────────────────────
# 2026-03-30 ogt: NEXT_PUBLIC_* must use the public domain (baked in at build
# time).
# 2026-04-01 Claude Code: CACHE_BUST=<git sha> replaces --no-cache
#   - the dependency layer (pnpm install) can still be cached → saves ~2-3 min
#   - everything from "COPY . ." onward is force-invalidated by CACHE_BUST, so
#     code changes (e.g. the CSRF fix) reliably end up in the bundle
# 2026-03-31 ogt: the intermediate notification after this step was removed.
- name: Build and Push Web
  run: |
    repo="${{ env.HARBOR }}/awoooi/web"
    sha="${{ github.sha }}"

    docker build \
      --file apps/web/Dockerfile \
      --build-arg NEXT_PUBLIC_API_URL=https://awoooi.wooo.work \
      --build-arg "CACHE_BUST=${sha}" \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${repo}:latest" \
      --tag "${repo}:${sha}" \
      --tag "${repo}:latest" \
      .

    # Push the immutable sha tag first, then the moving "latest" tag.
    for tag in "${sha}" latest; do
      docker push "${repo}:${tag}"
    done
# 2026-03-31 ogt: P0-1 automatic secret injection (mandated by ADR-035).
# 2026-03-31 ogt: added the AI API keys (fixes the mock_fallback issue).
#
# Writes the deploy SSH key to ~/.ssh/deploy_key (the "Deploy to K8s" step
# reuses that file), then patches the awoooi-secrets Secret in namespace
# awoooi-prod over SSH. Only the Telegram bot token / chat id patch is
# mandatory; every other key is best-effort and merely logs a warning.
- name: Inject K8s Secrets
  env:
    SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
    TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    TG_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
    NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
    GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
    # 2026-04-01 Claude Code: Langfuse LLMOps keys (Phase 15.1, completes the
    # CD injection).
    LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
    LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
    # 2026-04-02 Claude Code: Telegram whitelist (used for approval sign-off).
    TG_USER_WHITELIST: ${{ secrets.OPENCLAW_TG_USER_WHITELIST }}
    # Phase O-4.1 2026-04-02: Sentry API token (Wave A.1 ADR-037).
    SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
  run: |
    mkdir -p ~/.ssh
    # Create the key file under a restrictive umask so it is never readable by
    # other users, not even between the write and the chmod. The chmod stays
    # for the case where the file already exists from an earlier run.
    (umask 077 && printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key)
    chmod 600 ~/.ssh/deploy_key

    # The heredoc delimiter is intentionally UNQUOTED: every ${...} and $(...)
    # below is expanded on the runner, where the secrets live, and only the
    # resulting text is sent to the remote shell. Consequently comments inside
    # the heredoc must not contain dollar signs or backticks.
    # NOTE(review): StrictHostKeyChecking=no disables host-key verification
    # while secrets are being transmitted — consider pinning known_hosts.
    ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
    set -e
    export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

    # Inject the Telegram secrets (ADR-035 hard rule): failure aborts the step.
    sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
      {"op":"add","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"'$(echo -n "${TG_BOT_TOKEN}" | base64 -w 0)'"},
      {"op":"add","path":"/data/OPENCLAW_TG_CHAT_ID","value":"'$(echo -n "${TG_CHAT_ID}" | base64 -w 0)'"}
    ]' || { echo "❌ Telegram Secrets patch 失敗 — ADR-035 鐵律"; exit 1; }

    # 2026-03-31 ogt: inject the AI API keys (fixes NVIDIA/Gemini mock_fallback).
    # 2026-04-01 Claude Code: base64 -w 0 keeps long keys on one line so the
    # JSON patch is not broken by wrapping.
    # NVIDIA NIM (free tier)
    if [ -n "${NVIDIA_API_KEY}" ]; then
      sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
        {"op":"add","path":"/data/NVIDIA_API_KEY","value":"'$(echo -n "${NVIDIA_API_KEY}" | base64 -w 0)'"}
      ]' && echo "✅ NVIDIA_API_KEY 已注入" || echo "⚠️ NVIDIA_API_KEY patch 失敗"
    else
      echo "⚠️ NVIDIA_API_KEY 未設定,跳過"
    fi

    # Gemini (fallback provider)
    if [ -n "${GEMINI_API_KEY}" ]; then
      sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
        {"op":"add","path":"/data/GEMINI_API_KEY","value":"'$(echo -n "${GEMINI_API_KEY}" | base64 -w 0)'"}
      ]' && echo "✅ GEMINI_API_KEY 已注入" || echo "⚠️ GEMINI_API_KEY patch 失敗"
    else
      echo "⚠️ GEMINI_API_KEY 未設定,跳過"
    fi

    # 2026-04-01 Claude Code: Langfuse LLMOps keys (previously set by hand only).
    if [ -n "${LANGFUSE_PUBLIC_KEY}" ] && [ -n "${LANGFUSE_SECRET_KEY}" ]; then
      sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
        {"op":"add","path":"/data/LANGFUSE_PUBLIC_KEY","value":"'$(echo -n "${LANGFUSE_PUBLIC_KEY}" | base64 -w 0)'"},
        {"op":"add","path":"/data/LANGFUSE_SECRET_KEY","value":"'$(echo -n "${LANGFUSE_SECRET_KEY}" | base64 -w 0)'"}
      ]' && echo "✅ LANGFUSE keys 已注入" || echo "⚠️ LANGFUSE keys patch 失敗"
    else
      echo "⚠️ LANGFUSE_PUBLIC_KEY/SECRET_KEY 未設定,跳過 (現有 K8s secret 值維持不變)"
    fi

    # 2026-04-02 Claude Code: Telegram whitelist (user IDs allowed to approve).
    if [ -n "${TG_USER_WHITELIST}" ]; then
      sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
        {"op":"add","path":"/data/OPENCLAW_TG_USER_WHITELIST","value":"'$(echo -n "${TG_USER_WHITELIST}" | base64 -w 0)'"}
      ]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
    fi

    # Phase O-4.1 2026-04-02: Sentry auth token (Wave A.1 ADR-037).
    if [ -n "${SENTRY_AUTH_TOKEN}" ]; then
      sudo kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
        {"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"'$(echo -n "${SENTRY_AUTH_TOKEN}" | base64 -w 0)'"}
      ]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
    else
      echo "⚠️ SENTRY_AUTH_TOKEN 未設定,Sentry Comment API 將跳過"
    fi

    echo "✅ 所有 Secrets 注入完成"
    SECRETS
# 2026-04-01 ogt: ConfigMap + deploy + health check merged into a single step.
# Previously three independent SSH connections → saves ~30s of handshakes.
#
# Uses the key file ~/.ssh/deploy_key written by the "Inject K8s Secrets" step.
- name: Deploy to K8s
  env:
    SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
  run: |
    # Step 1: apply the ConfigMap (fed through stdin, so it needs its own SSH
    # call). Redirecting the file makes a missing manifest fail right here.
    ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
      "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -" \
      < k8s/awoooi-prod/04-configmap.yaml
    echo "✅ ConfigMap 已更新"

    # Step 2: set images + rollout + health check (one SSH session).
    # The delimiter is quoted, so shell variables are expanded on the remote
    # host; the github.sha expressions are substituted by the workflow engine
    # before the script runs.
    ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
    set -e
    export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

    # 2026-03-30 ogt: sudoers NOPASSWD is configured, no password needed.
    sudo kubectl set image deployment/awoooi-api \
      api=192.168.0.110:5000/awoooi/api:${{ github.sha }} \
      -n awoooi-prod
    sudo kubectl set image deployment/awoooi-web \
      web=192.168.0.110:5000/awoooi/web:${{ github.sha }} \
      -n awoooi-prod
    sudo kubectl set image deployment/awoooi-worker \
      worker=192.168.0.110:5000/awoooi/api:${{ github.sha }} \
      -n awoooi-prod

    sudo kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s
    sudo kubectl rollout status deployment/awoooi-web -n awoooi-prod --timeout=120s
    sudo kubectl rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s
    echo "✅ 部署完成"

    # Health check (same SSH session, avoids another handshake).
    # 2026-04-01 Claude Code: uses break + flag instead of "exit 0", which
    # triggered SIGPIPE inside the heredoc.
    sleep 10
    HEALTH_PASS=0
    for i in 1 2 3; do
      # "|| true": when curl cannot connect it exits non-zero, and under
      # "set -e" that assignment would abort the whole script on the first
      # attempt, so the retries below would never run. curl still prints 000
      # as the status code in that case.
      HTTP_CODE=$(curl -s -w "%{http_code}" -o /dev/null --connect-timeout 10 "http://localhost:32334/api/v1/health") || true
      if [ "$HTTP_CODE" = "200" ]; then
        echo "✅ API 健康檢查通過"
        HEALTH_PASS=1
        break
      fi
      echo "⏳ 嘗試 #$i: HTTP $HTTP_CODE,等待 10s..."
      sleep 10
    done
    if [ "$HEALTH_PASS" = "0" ]; then
      echo "❌ API 健康檢查失敗"
      exit 1
    fi
    DEPLOY
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037).
# Verifies the alert chain end to end: API health + webhook + OTEL + event
# exporter. continue-on-error keeps a failure from blocking the pipeline; the
# step outcome is shown in the final Telegram notification.
# NOTE(review): the health check in "Deploy to K8s" queries localhost:32334 on
# the SSH target, whereas this step queries localhost:32334 on the runner —
# confirm the API is reachable from the runner at that address.
- name: Alert Chain Smoke Test
  id: alert_chain_smoke
  continue-on-error: true
  run: |
    # pipefail: otherwise the pipeline's status is tee's and a failing smoke
    # test would still be reported with outcome "success" when the runner's
    # default shell does not enable pipefail.
    set -eo pipefail
    pip install requests --quiet
    python3 scripts/alert_chain_smoke_test.py \
      --api-url http://localhost:32334 \
      --json | tee /tmp/alert_chain_result.json
    echo "alert_chain_status=pass" >> "$GITHUB_OUTPUT"
# Phase O-5 Wave C.2 2026-04-02 ogt: monitoring coverage verification
# (generate_monitoring.py --check).
# continue-on-error: true — insufficient coverage does not block the
# deployment, but it is reflected in the Telegram notification.
- name: Monitoring Coverage Check
  id: monitoring_coverage
  continue-on-error: true
  run: |
    pip install requests --quiet
    # The notification step reads this step's *outcome*. The former
    # "check && echo pass || echo fail" one-liner always exited 0, so the
    # outcome was always "success" and a failed check was still shown as ✅.
    # Exit non-zero on failure; continue-on-error keeps the pipeline going.
    if python3 scripts/generate_monitoring.py --check; then
      echo "coverage_status=pass" >> "$GITHUB_OUTPUT"
    else
      echo "coverage_status=fail" >> "$GITHUB_OUTPUT"
      exit 1
    fi
# [Chief Architect] Playwright E2E smoke-test step added, v1.0.0 2026-04-01
# (Taipei time).
# continue-on-error: true — a smoke failure does not block the deployment, but
# the result is reflected in the Telegram notification.
- name: E2E Smoke Test
  id: smoke
  continue-on-error: true
  env:
    # Per the original author: Playwright uses the already-built pnpm
    # node_modules in the CI environment.
    CI: "true"
  run: |
    cd apps/web
    # Install Playwright's Chromium together with its system dependencies.
    npx playwright install chromium --with-deps
    # Run the smoke spec; the line reporter keeps the CI log readable.
    npx playwright test tests/e2e/smoke.spec.ts --reporter=line
    echo "smoke_status=pass" >> $GITHUB_OUTPUT
# Sends the "deployment finished" summary to Telegram, including the elapsed
# time and the outcomes of the three non-blocking verification steps.
- name: Notify Health Check Success
  env:
    SMOKE_RESULT: ${{ steps.smoke.outcome == 'success' && '✅' || '⚠️' }}
    ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outcome == 'success' && '✅' || '⚠️' }}
    MONITORING_RESULT: ${{ steps.monitoring_coverage.outcome == 'success' && '✅' || '⚠️' }}
    COMMIT_MESSAGE: "${{ steps.commit.outputs.message }}"
    SHORT_SHA: "${{ steps.commit.outputs.short_sha }}"
    START_TIME: "${{ steps.commit.outputs.start_time }}"
  run: |
    # The message is assembled here rather than in an env value: the shell
    # never re-expands ${...} placeholders stored inside a variable, so the
    # duration and result fields used to be sent as literal text.
    # ELAPSED_* is used instead of SECONDS, which is a special bash variable.
    END_TIME=$(date +%s)
    # Fall back to a zero duration if the start time output is missing.
    DURATION=$((END_TIME - ${START_TIME:-$END_TIME}))
    ELAPSED_MIN=$((DURATION / 60))
    ELAPSED_SEC=$((DURATION % 60))
    # Values are passed as %s arguments, so backslashes in the commit subject
    # are not interpreted as escape sequences.
    printf '✅ <b>AWOOOI 部署完成</b>\n├ 📝 %s\n├ 🔖 <code>%s</code>\n├ ⏱️ 耗時: %dm %ds\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n├ 🔗 Alert Chain: %s\n├ 📊 Monitoring: %s\n└ 🎭 Smoke: %s' \
      "$COMMIT_MESSAGE" "$SHORT_SHA" "$ELAPSED_MIN" "$ELAPSED_SEC" \
      "$ALERT_CHAIN_RESULT" "$MONITORING_RESULT" "$SMOKE_RESULT" \
      | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
          -d "chat_id=${{ secrets.TELEGRAM_CHAT_ID }}" \
          -d "parse_mode=HTML" \
          --data-urlencode "text@-"
# Runs only when an earlier step has failed the job: sends the "deployment
# failed" message to Telegram with a link to the Actions log page.
- name: Notify Pipeline Failure
  if: failure()
  env:
    TG_MSG: "❌ <b>AWOOOI 部署失敗</b>\n├ 📝 ${{ steps.commit.outputs.message }}\n├ 🔖 <code>${{ steps.commit.outputs.short_sha }}</code>\n├ 👤 ${{ github.actor }}\n└ 🔗 <a href=\"http://192.168.0.110:3001/wooo/awoooi/actions\">查看日誌</a>"
    TG_API_URL: "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage"
    TG_CHAT_ID: "${{ secrets.TELEGRAM_CHAT_ID }}"
  run: |
    printf '%b' "$TG_MSG" \
      | curl -fS -X POST "$TG_API_URL" \
          -d "chat_id=${TG_CHAT_ID}" \
          -d "parse_mode=HTML" \
          --data-urlencode "text@-"
|