1596 lines
84 KiB
YAML
1596 lines
84 KiB
YAML
# =============================================================================
# AWOOOI CD Pipeline (Gitea Actions - Plan B)
# =============================================================================
# Flow: Build → Push to Harbor → Deploy to K8s
# Speed-ups:
#   1. Docker layer cache → Harbor registry cache
#   2. Internal mirror → 192.168.0.110:5001 (Harbor proxy cache for DockerHub)
# 2026-03-29 Claude Code (ADR-039) - Retry after creating Harbor project
|
||
|
||
name: CD Pipeline

on:
  push:
    branches:
      - main
    # Only paths that actually affect the deployed artifacts trigger CD.
    paths:
      - 'apps/**'
      - 'k8s/**'
      - '.dockerignore'
      # The Dockerfile copies scripts/ into the API image, so the production
      # ops/seed scripts listed here are deploy-coupled rather than repo-only.
      - 'scripts/backup/backup-momo-188-pg.sh'
      - 'scripts/ops/notify-awoooi-ops.sh'
      - 'scripts/ops/awooop-seed-auto-repair-canary-playbook.py'
      # Workflow-only changes do not rebuild runtime images. Use
      # workflow_dispatch when an operator explicitly wants to test the CD
      # pipeline itself.
      # docs/, memory/, ADRs, etc. do not trigger CD.
      # ops/monitoring/alerts-unified.yml is handled separately by
      # deploy-alerts.yaml (I3).
  # Manual trigger is always available (re-runs, emergency deploys).
  workflow_dispatch:
|
||
|
||
# 2026-04-02 Claude Code: pre-emptive mode — a new push immediately cancels
# the older run so only the latest commit is deployed.
# How: the concurrency group allows one run per ref at a time, and
# cancel-in-progress: true lets the newer run replace the older one.
# Why: rapid successive pushes no longer queue up, and a stuck docker build
# cannot block later deployments.
# Safety (original author's note): the deploy step is said to be guarded by
# `kubectl rollout status`, so a cancelled run should not leave a half-deployed
# state — that step is outside this section; confirm before relying on it.
concurrency:
  group: "cd-deploy-${{ github.ref }}"
  cancel-in-progress: true
|
||
|
||
env:
  # Harbor registry that receives the built images.
  HARBOR: "192.168.0.110:5000"
  SRE_GROUP_CHAT_ID: "-1003711974679"
  # Harbor proxy cache (internal mirror of DockerHub, avoids pull rate limits).
  HARBOR_MIRROR: "192.168.0.110:5001"
  # OTEL CI/CD monitoring (2026-03-31 #46c - migrated to Gitea).
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://192.168.0.188:24318"
  OTEL_SERVICE_NAME: "awoooi-cd"
  OTEL_RESOURCE_ATTRIBUTES: "service.version=${{ github.sha }},deployment.environment=production"
  CI_IMAGE: "192.168.0.110:5000/awoooi/ci-runner:act-22.04"
  # 2026-05-24 Codex: deploy through the currently Ready control-plane node.
  # 120 is NotReady/SchedulingDisabled and its SSH/API endpoints are currently
  # unreachable; pinning CD to it blocks secret injection before GitOps deploy.
  K8S_SSH_HOST: "192.168.0.121"
  K8S_API_SERVER: "https://192.168.0.121:6443"
  # 2026-06-01 Codex: post-deploy health/smoke probes use the production
  # public API. The old 192.168.0.125 NodePort VIP can be absent while the
  # public route and in-cluster service are healthy, causing false failures.
  API_HEALTH_URL: "https://awoooi.wooo.work/api/v1/health"
  ALERT_CHAIN_API_URL: "https://awoooi.wooo.work"
|
||
|
||
jobs:
  tests:
    # 2026-04-30 Codex: run the tests job on the host runner and launch the
    # CI image explicitly. The act-managed job container can disappear mid-test
    # with Docker RWLayer=nil on the shared 110 daemon.
    runs-on: awoooi-host
    timeout-minutes: 30
    # 2026-04-10 ogt: B5 starts its database with a local `docker run`; the
    # `services:` declaration was removed because the Gitea act runner leaves
    # the service container name empty, which made CI fail.
    steps:
|
||
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: awoooi-host maps to the long-lived act-runner
  # container. After dirty reboots it may not contain node/curl/git, and
  # actions/checkout@v4 fails before tests can start.
  run: |
    # Nothing to install when the runner is not apk-based.
    if ! command -v apk >/dev/null 2>&1; then
      exit 0
    fi
    apk add --no-cache nodejs npm git curl bash coreutils python3 openssh-client docker-cli docker-cli-buildx
|
||
|
||
# Check out the repository so the scripts/ helpers used below are available.
- uses: actions/checkout@v4

# Runs the repository's own workflow guard script before any other step.
- name: Guard Workflow Secret Surfaces
  run: |
    node scripts/ci/check-gitea-step-env-secrets.js
|
||
|
||
# 2026-03-31 ogt: improved alert format for readability.
- name: Get Commit Info
  id: commit
  run: |
    # Step outputs consumed by the notification steps:
    #   short_sha  - first 7 characters of the commit SHA
    #   message    - commit subject, capped at 50 bytes
    #   start_time - epoch seconds when this step ran
    SUBJECT="$(git log -1 --pretty=%s)"
    # Cap the subject at 50 bytes on a character boundary. A plain
    # `head -c 50` can split a multi-byte UTF-8 character (e.g. CJK commit
    # subjects) and hand invalid UTF-8 to the notifiers. If node is not
    # usable, fall back to the plain byte cut.
    MESSAGE="$(printf '%s' "$SUBJECT" | node -e 'const text = require("fs").readFileSync(0, "utf8"); let out = ""; for (const ch of text) { if (Buffer.byteLength(out + ch, "utf8") > 50) break; out += ch; } process.stdout.write(out);')" \
      || MESSAGE="$(printf '%s' "$SUBJECT" | head -c 50)"
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=${MESSAGE}"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
|
||
|
||
- name: Notify Pipeline Start
  # 2026-04-16 ogt + Claude Sonnet 4.6: structured HTML message for readability.
  # Commit metadata is passed through the step environment instead of being
  # interpolated into the script text, so quotes, `$(...)` or backticks in a
  # commit subject cannot alter the shell script. These values are not secrets.
  env:
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    ACTOR: ${{ github.actor }}
  run: |
    # HTML-escape the commit subject so &, < and > cannot break the Telegram
    # HTML payload. `&` must be escaped first; `\&` keeps sed from treating it
    # as "the whole match".
    COMMIT_ESC=$(printf '%s' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '🚀 <b>AWOOOI 部署開始</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n└ 👤 %s' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    # 2026-05-02 Claude Opus 4.7 + ogt: a notification failure must never
    # block the pipeline (a curl 400 broke build-and-deploy for 14 commits
    # in a row starting 5/1). Try the AWOOI API mirror first and fall back to
    # a direct, non-fatal Telegram call.
    if AWOOI_CICD_STATUS=running \
      AWOOI_CICD_STAGE=tests \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署開始" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD start notification mirrored through AWOOI API"
    else
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
|
||
|
||
|
||
|
||
# 2026-03-31 ogt: Phase 22.0 CI tests (mocks are forbidden - feedback_no_mock_testing.md).
# 2026-04-01 ogt: persistent venv speed-up - /opt/api-venv survives across runs.
#   The venv is rebuilt only when the pyproject.toml hash changes; otherwise it
#   is simply activated (saves roughly 6-7 minutes per the original note).
- name: Run API Tests
  run: |
    # Write the test script to a file and execute it inside the CI image.
    cat > /tmp/awoooi-api-tests.sh <<'CI_SCRIPT'
    VENV=/opt/api-venv
    HASH_FILE=/opt/api-venv/.deps_hash
    CURRENT_HASH=$(md5sum apps/api/pyproject.toml | awk '{print $1}')

    # python3.11 is installed once at runner level; apt-get only runs the
    # first time or when the interpreter has disappeared.
    # 2026-04-05 Claude Code: apt-get is decoupled from the venv hash guard so
    # dependency changes do not re-run apt.
    # 2026-04-16 ogt + Claude Sonnet 4.6: apt index failures are handled with
    # --fix-missing plus a retry.
    if ! command -v python3.11 &>/dev/null; then
      echo "📦 安裝 python3.11..."
      apt-get clean && rm -rf /var/lib/apt/lists/*
      apt-get update -q --fix-missing || apt-get update -q || true
      apt-get install -y -q python3.11-venv python3.11 || \
        (add-apt-repository ppa:deadsnakes/python -y 2>/dev/null && apt-get update -q && apt-get install -y -q python3.11-venv python3.11) || true
    else
      echo "⚡ python3.11 已安裝,跳過 apt-get"
    fi
    # If python3.11 is still missing, fall back to the system python3.
    if ! command -v python3.11 &>/dev/null; then
      echo "⚠️ python3.11 安裝失敗,使用 python3 fallback"
      ln -sf "$(which python3)" /usr/local/bin/python3.11 || true
    fi

    if [ ! -d "$VENV/bin" ] || [ "$(cat "$HASH_FILE" 2>/dev/null)" != "$CURRENT_HASH" ]; then
      echo "📦 deps 已變更,重建 venv..."
      # 2026-04-17 ogt: /opt/api-venv is a volume mount, so the directory
      # itself cannot be removed; empty its contents and keep the mount point.
      find "$VENV" -mindepth 1 -delete 2>/dev/null || true
      # Record the deps hash ONLY when every install step succeeded. Writing
      # it after a failed install would cache a broken venv as "up to date"
      # until pyproject.toml changes again.
      if python3.11 -m venv "$VENV" \
        && source "$VENV/bin/activate" \
        && pip install -q uv \
        && (cd apps/api && uv pip install -q -e ".[dev]"); then
        echo "$CURRENT_HASH" > "$HASH_FILE"
      else
        echo "❌ venv rebuild failed; deps hash not recorded so the next run rebuilds"
        rm -f "$HASH_FILE"
        exit 1
      fi
    else
      echo "⚡ 使用快取 venv (deps 未變更)"
      source "$VENV/bin/activate"
    fi

    cd apps/api
    cleanup_pytest_workspace_cache() {
      # 2026-05-19 Codex: CI image runs as root against a bind-mounted
      # checkout. Remove Python cache artifacts before act-runner cleanup
      # so successful jobs do not end with root-owned __pycache__ noise.
      find tests src -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true
      rm -rf .pytest_cache 2>/dev/null || true
    }
    # Tests needing external services (Redis pool / Ollama) are excluded in CI
    # (2026-04-01 Claude Code).
    # 2026-04-05 Claude Code: `| tail` used to swallow a segfault (exit 139);
    #   tee + PIPESTATUS[0] captures pytest's own exit code.
    # 2026-04-05 Claude Code: --ignore=tests/integration skips DB tests that
    #   need an asyncpg connection; per the original note they are covered by
    #   the post-deploy E2E smoke test.
    # PYTHONFAULTHANDLER=1 prints a full Python stack trace if a C extension
    #   segfaults.
    # 2026-04-05 Claude Code: test_github_webhook.py now uses a minimal app
    #   that mounts only the github_webhook router (no DB import chain), so it
    #   is safe to run in CI.
    # 2026-04-22 ogt: DATABASE_URL became mandatory; unit tests need it so
    #   Settings validation passes. The placeholder is only for validation and
    #   is not meant to open a real connection.
    DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
    PYTHONFAULTHANDLER=1 python3.11 -m pytest tests/ -v --tb=short -x -p no:cacheprovider \
      --ignore=tests/integration \
      --ignore=tests/test_anomaly_counter.py \
      --ignore=tests/test_global_repair_cooldown.py \
      --ignore=tests/test_redis_multisig.py \
      --ignore=tests/test_model_regression.py \
      --ignore=tests/test_prompt_validation.py \
      --ignore=tests/e2e_network_test.py \
      2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
    tail -60 /tmp/pytest-output.txt
    cleanup_pytest_workspace_cache
    exit $PYTEST_EXIT
    CI_SCRIPT
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-api-tests" \
      --cpus "2.0" \
      --memory "2g" \
      -v "$PWD:/workspace" \
      -v /tmp/awoooi-api-tests.sh:/tmp/awoooi-api-tests.sh:ro \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      "${{ env.CI_IMAGE }}" \
      bash /tmp/awoooi-api-tests.sh
|
||
|
||
# ── Integration tests B5 (2026-04-10) ─────────────────────────────────────
# History kept from the original notes:
# - Initially postgres-test came from `services:` and was reached on
#   localhost:15432 (docker exec could not resolve the service container name
#   inside the act runner).
# - 2026-04-10 ogt: replaced the `services:` declaration with a local
#   `docker run`, because the act runner's service container name may be empty.
# - 2026-04-19 ogt + Claude Opus 4.7: two consecutive failures (runs 984/985)
#   were traced to the CI container and pg-test-b5 sitting on different Docker
#   networks; the fix is to put both on one network and connect by name.
- name: Integration Tests (B5 — 真實 DB)
  run: |
    cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT'
    cd apps/api
    # Install the psql client if the image does not ship it.
    if ! command -v psql &>/dev/null; then
      apt-get install -y -q postgresql-client
    fi
    # 2026-04-19 ogt + Claude Opus 4.7 v3: create the shared network ourselves.
    # The default bridge does not resolve container names, so a user-defined
    # network is required. Creation is idempotent (errors are ignored).
    B5_NET="b5-test-net"
    docker network create "$B5_NET" 2>/dev/null || true
    # Attach this container (hostname == short container id) to the network.
    # If it is already attached, `docker network connect` fails; ignore that.
    docker network connect "$B5_NET" "$HOSTNAME" 2>/dev/null || true
    echo "B5 shared network: $B5_NET (ci-runner hostname: $HOSTNAME)"
    # Start the test database on the shared network; it is reached by the
    # container name 'pg-test-b5'.
    docker rm -f pg-test-b5 2>/dev/null || true
    docker run -d --name pg-test-b5 \
      --network="$B5_NET" \
      -e POSTGRES_DB=awoooi_test \
      -e POSTGRES_USER=awoooi \
      -e POSTGRES_PASSWORD=awoooi_test_2026 \
      pgvector/pgvector:pg16
    # Wait for readiness (30 probes, 2s apart). Fail fast with the database
    # logs when it never becomes ready, instead of falling through to psql and
    # pytest and surfacing a confusing connection error later.
    PG_READY=0
    for i in $(seq 1 30); do
      if PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi; then
        PG_READY=1
        break
      fi
      sleep 2
    done
    if [ "$PG_READY" -ne 1 ]; then
      echo "❌ pg-test-b5 did not become ready in time"
      docker logs --tail 50 pg-test-b5 2>&1 || true
      docker rm -f pg-test-b5 || true
      exit 1
    fi
    # Initialise the schema.
    PGPASSWORD=awoooi_test_2026 psql \
      -h pg-test-b5 -p 5432 -U awoooi -d awoooi_test \
      -f tests/integration/setup_test_schema.sql
    # Run the tests.
    # B5 strict mode (2026-04-13 ogt: Break-Glass removed again).
    # -m integration overrides the pyproject.toml addopts "-m 'not integration'"
    # so the marked tests actually run.
    # 2026-04-22 ogt: DATABASE_URL became mandatory; the import chain needs it
    # for Settings validation.
    DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
    TEST_DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
    /opt/api-venv/bin/pytest tests/integration/test_b5_core_flows.py -v --tb=short -m integration -p no:cacheprovider || PYTEST_EXIT=$?
    # Clean up the database container.
    docker rm -f pg-test-b5 || true
    # 2026-05-20 Codex: B5 imports shared tests helpers, so cleanup the
    # whole tests tree to avoid root-owned __pycache__ act-runner noise.
    find tests src -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true
    rm -rf .pytest_cache 2>/dev/null || true
    exit "${PYTEST_EXIT:-0}"
    CI_SCRIPT
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \
      --cpus "2.0" \
      --memory "2g" \
      -v "$PWD:/workspace" \
      -v /tmp/awoooi-b5-tests.sh:/tmp/awoooi-b5-tests.sh:ro \
      -v /var/run/docker.sock:/var/run/docker.sock \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      "${{ env.CI_IMAGE }}" \
      bash /tmp/awoooi-b5-tests.sh
|
||
|
||
# Always runs, even after a failed test step, and hands the CI image name to
# the repository's cleanup script.
- name: Clean Test Workspace Artifacts
  if: always()
  env:
    HOST_RUNNER_CLEANUP_IMAGE: ${{ env.CI_IMAGE }}
  run: |
    bash scripts/ci/cleanup-host-runner-workspace.sh
|
||
|
||
- name: Notify Pipeline Failure
  # 2026-04-30 Codex: tests job failure notifier; no jq dependency for host parity.
  # Commit metadata is passed through the step environment instead of being
  # interpolated into the script text, so quotes, `$(...)` or backticks in a
  # commit subject cannot alter the shell script. These values are not secrets.
  if: failure()
  env:
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    ACTOR: ${{ github.actor }}
  run: |
    # HTML-escape the commit subject so &, < and > cannot break the Telegram
    # HTML payload. `&` must be escaped first; `\&` keeps sed from treating it
    # as "the whole match".
    COMMIT_ESC=$(printf '%s' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🧪 Stage: tests\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    # Try the AWOOI API mirror first; fall back to a direct Telegram call that
    # is deliberately non-fatal.
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=tests \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD tests failure notification mirrored through AWOOI API"
    else
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
|
||
|
||
build-and-deploy:
  # 2026-04-30 Codex: Docker builds run on the host runner. Long docker build
  # steps were killing the transient act job container with RWLayer=nil.
  # Runs only after the tests job has succeeded.
  needs: tests
  runs-on: awoooi-host
  timeout-minutes: 60
  steps:
|
||
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: keep the host-mode runner self-healing before
  # actions/checkout@v4 and Telegram failure notifications run.
  run: |
    # Nothing to install when the runner is not apk-based.
    if ! command -v apk >/dev/null 2>&1; then
      exit 0
    fi
    apk add --no-cache nodejs npm git curl bash coreutils python3 openssh-client docker-cli docker-cli-buildx

# Check out the repository so the Dockerfiles and scripts/ are available.
- uses: actions/checkout@v4
|
||
|
||
- name: Get Commit Info
  id: commit
  run: |
    # Step outputs consumed by the notification steps:
    #   short_sha  - first 7 characters of the commit SHA
    #   message    - commit subject, capped at 50 bytes
    #   start_time - epoch seconds when this step ran
    SUBJECT="$(git log -1 --pretty=%s)"
    # Cap the subject at 50 bytes on a character boundary. A plain
    # `head -c 50` can split a multi-byte UTF-8 character (e.g. CJK commit
    # subjects) and hand invalid UTF-8 to the notifiers. If node is not
    # usable, fall back to the plain byte cut.
    MESSAGE="$(printf '%s' "$SUBJECT" | node -e 'const text = require("fs").readFileSync(0, "utf8"); let out = ""; for (const ch of text) { if (Buffer.byteLength(out + ch, "utf8") > 50) break; out += ch; } process.stdout.write(out);')" \
      || MESSAGE="$(printf '%s' "$SUBJECT" | head -c 50)"
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=${MESSAGE}"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
|
||
|
||
- name: Notify Build Deploy Start
  # Commit metadata is passed through the step environment instead of being
  # interpolated into the script text, so quotes, `$(...)` or backticks in a
  # commit subject cannot alter the shell script. These values are not secrets.
  env:
    ACTOR: ${{ github.actor }}
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
  run: |
    # Best-effort notification through the AWOOI API; a failure is logged and
    # never fails the job.
    if AWOOI_CICD_STATUS=running \
      AWOOI_CICD_STAGE=build-and-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 建置部署開始" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build-deploy start notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD build-deploy start notification failed (non-fatal)"
    fi
|
||
|
||
- name: Login to Harbor
  run: |
    # Feed the password through a quoted heredoc rather than
    # `echo "<secret>"`: inside double quotes a `$`, backtick or `"` in the
    # password would be expanded or would break the command. The quoted
    # delimiter disables all shell expansion of the heredoc body. The heredoc,
    # like echo, supplies one trailing newline on stdin.
    docker login "${{ env.HARBOR }}" \
      -u "${{ secrets.HARBOR_USERNAME }}" \
      --password-stdin <<'AWOOOI_SECRET_HARBOR_PASSWORD'
    ${{ secrets.HARBOR_PASSWORD }}
    AWOOOI_SECRET_HARBOR_PASSWORD
|
||
|
||
# 2026-05-21 Codex: the AWOOI workflow concurrency group and the Docker
# network lock only protect AWOOI/Docker work. Other repositories can still
# run host-side Next/Turbo builds on the same 110 runner and starve this
# deploy, so wait for those foreign web builds before starting our image
# build. Per the original note the gate is read-only and never kills another
# process (the script itself is not shown here).
- name: Wait for Host Web Build Pressure
  run: |
    bash scripts/ci/wait-host-web-build-pressure.sh
|
||
|
||
# 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
# When another repo starts a heavy docker build while AWOOOI Web is still
# building, the job container can disappear and Docker reports RWLayer=nil.
# A Docker-network lock is global to the host daemon and survives container
# namespaces, unlike /tmp/flock inside the transient job container.
- name: Acquire Docker Build Lock
  run: |
    # The lock is a Docker network with a fixed name: creating it succeeds
    # only for the first caller. Waits up to WAIT_ATTEMPTS probes, 10s apart.
    LOCK_NAME="awoooi-cd-docker-build-lock"
    STALE_SECONDS=7200
    EMPTY_LOCK_SECONDS=300
    WAIT_ATTEMPTS=180

    for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
      if docker network create \
        --label awoooi.ci-lock=docker-build \
        --label awoooi.owner=cd-pipeline \
        "$LOCK_NAME" >/dev/null 2>&1; then
        # Publish the lock name so the release step knows what to remove.
        echo "DOCKER_BUILD_LOCK=${LOCK_NAME}" >> "$GITHUB_ENV"
        echo "✅ Docker build lock acquired: ${LOCK_NAME}"
        exit 0
      fi

      CREATED_AT=$(docker network inspect "$LOCK_NAME" \
        --format '{{.Created}}' 2>/dev/null || true)
      if [ -n "$CREATED_AT" ]; then
        # 2026-05-03 ogt: Docker may return
        # "2006-01-02 15:04:05.999999999 -0700 MST"; `date -d` rejects the
        # fractional seconds and the trailing zone abbreviation, which left
        # CREATED_EPOCH=0 and disabled stale detection.
        # 2026-06-18 Codex: the act-runner container may lack GNU date and
        # python3; node (installed by bootstrap) is the stable fallback parser.
        # 2026-06-19 Codex: the timestamp may also be ISO
        # `2026-06-18T16:20:00.123456789Z`; with CREATED_EPOCH=0 an empty lock
        # never self-clears and the next deploy waits the full 30 minutes.
        CREATED_CLEAN=$(printf '%s\n' "$CREATED_AT" | sed -e 's/\.[0-9]*//' -e 's/ [A-Z][A-Z]*$//')
        CREATED_EPOCH=$(date -d "$CREATED_CLEAN" +%s 2>/dev/null || \
          node -e 'const raw = process.argv[1] || ""; const base = raw.replace(/\.\d+/, "").replace(/\s+[A-Z]{2,4}$/, ""); const spaced = base.replace(/^(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}:\d{2})\s+([+-]\d{2})(\d{2})$/, "$1T$2$3:$4"); const iso = base.replace(/^(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}:\d{2})(Z|[+-]\d{2}:?\d{2})$/, "$1T$2$3"); const candidates = [raw, base, spaced, iso]; for (const candidate of candidates) { const ms = Date.parse(candidate); if (Number.isFinite(ms)) { console.log(Math.floor(ms / 1000)); process.exit(0); } } process.exit(1);' \
          "$CREATED_AT" 2>/dev/null || \
          python3 -c "import sys, datetime, re; ts = re.sub(r'\\.\d+', '', sys.argv[1]); ts = re.sub(r'\\s+[A-Z]{2,4}$', '', ts.strip()); print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))" \
          "$CREATED_AT" 2>/dev/null || echo 0)
        NOW_EPOCH=$(date +%s)
        LOCK_AGE=$((NOW_EPOCH - CREATED_EPOCH))
        WAITED_SECONDS=$((attempt * 10))
        # 2026-05-05 Codex: dirty reboot / cancelled Actions can leave
        # the Docker-network lock behind with no active build or push.
        # Waiting the full 30m CD timeout keeps deploys queued even
        # though no job is protected, so clear empty locks after 5m.
        # 2026-06-18 Codex: a bracket pattern alone still matches the
        # lock-check's own bash/awk command lines, so the checker itself is
        # excluded explicitly; otherwise an empty lock left by a cancelled
        # run would never self-clear after 5 minutes.
        ACTIVE_DOCKER_WORK=$(ps -eo pid,args | awk '
          $0 ~ /[d]ocker (build|push)|[b]uildx build/ &&
          $0 !~ /ACTIVE_DOCKER_WORK/ &&
          $0 !~ /awk/ &&
          $0 !~ /ps -eo pid,args/ {print}
        ' || true)

        # Decide whether the existing lock may be removed. The branches are
        # evaluated in this order and at most one applies:
        #   1. creation time unparsable, waited > 2x the empty-lock window,
        #      and no docker build/push is running
        #   2. lock older than the empty-lock window with no docker build/push
        #   3. lock older than the hard stale limit
        REMOVAL_NOTICE=""
        if [ "$CREATED_EPOCH" -eq 0 ] && \
          [ "$WAITED_SECONDS" -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \
          [ -z "$ACTIVE_DOCKER_WORK" ]; then
          REMOVAL_NOTICE="⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after ${WAITED_SECONDS}s, removing ${LOCK_NAME}"
        elif [ "$CREATED_EPOCH" -gt 0 ] && \
          [ "$LOCK_AGE" -gt "$EMPTY_LOCK_SECONDS" ] && \
          [ -z "$ACTIVE_DOCKER_WORK" ]; then
          REMOVAL_NOTICE="⚠️ empty Docker build lock detected (age=${LOCK_AGE}s > ${EMPTY_LOCK_SECONDS}s, no active docker build/push), removing ${LOCK_NAME}"
        elif [ "$CREATED_EPOCH" -gt 0 ] && \
          [ "$LOCK_AGE" -gt "$STALE_SECONDS" ]; then
          REMOVAL_NOTICE="⚠️ stale Docker build lock detected (age=${LOCK_AGE}s > ${STALE_SECONDS}s), removing ${LOCK_NAME}"
        fi
        if [ -n "$REMOVAL_NOTICE" ]; then
          echo "$REMOVAL_NOTICE"
          docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
          # Retry the create immediately, without sleeping.
          continue
        fi
      fi

      echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting..."
      sleep 10
    done

    echo "❌ timed out waiting for Docker build lock"
    exit 1
|
||
|
||
# ── API image build (with layer-cache speed-up) ───────────────────────────
# 2026-04-01 ogt: CACHE_BUST=<git sha> is passed as a build arg; per the
#   original note this forces the src/ and models.json layers to rebuild on
#   every commit while the dependency (pip install) layer stays cacheable.
# 2026-05-05 Codex: host runner bootstrap installs docker-cli-buildx;
#   keep BuildKit enabled because the web Dockerfile uses RUN --mount.
- name: Build and Push API
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    API_IMAGE="${{ env.HARBOR }}/awoooi/api"
    COMMIT_SHA="${{ github.sha }}"
    # Build once, tagged with both the commit SHA and `latest`; `latest` is
    # also the cache source for the next build.
    docker build \
      --file apps/api/Dockerfile \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${API_IMAGE}:latest" \
      --build-arg CACHE_BUST="${COMMIT_SHA}" \
      --tag "${API_IMAGE}:${COMMIT_SHA}" \
      --tag "${API_IMAGE}:latest" \
      .
    # Push the SHA tag first, then `latest`.
    for tag in "${COMMIT_SHA}" latest; do
      docker push "${API_IMAGE}:${tag}"
    done
|
||
|
||
# 2026-03-31 ogt: intermediate notifications removed to reduce message noise.

# ── Web image build (targeted cache invalidation) ─────────────────────────
# 2026-03-30 ogt: NEXT_PUBLIC_* must use the public domain (baked in at build time).
# 2026-04-01 Claude Code: CACHE_BUST=<git sha> replaces --no-cache. Per the
#   original note:
#   - the dependency (pnpm install) layer stays cacheable (saves ~2-3 min)
#   - everything from `COPY . .` down is invalidated by CACHE_BUST, so
#     business-logic / CSRF changes reach the bundle
# 2026-05-05 Codex: mirror API build mode; BuildKit required for cache mounts.
- name: Build and Push Web
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    WEB_IMAGE="${{ env.HARBOR }}/awoooi/web"
    COMMIT_SHA="${{ github.sha }}"
    # Build once, tagged with both the commit SHA and `latest`; `latest` is
    # also the cache source for the next build.
    docker build \
      --file apps/web/Dockerfile \
      --build-arg NEXT_PUBLIC_API_URL=https://awoooi.wooo.work \
      --build-arg CACHE_BUST="${COMMIT_SHA}" \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${WEB_IMAGE}:latest" \
      --tag "${WEB_IMAGE}:${COMMIT_SHA}" \
      --tag "${WEB_IMAGE}:latest" \
      .
    # Push the SHA tag first, then `latest`.
    for tag in "${COMMIT_SHA}" latest; do
      docker push "${WEB_IMAGE}:${tag}"
    done
|
||
|
||
# Always runs so the lock is released even when a build step failed.
# DOCKER_BUILD_LOCK is only set when the acquire step obtained the lock.
- name: Release Docker Build Lock
  if: always()
  run: |
    if [ -z "${DOCKER_BUILD_LOCK:-}" ]; then
      echo "⚡ no Docker build lock to release"
    else
      # Removal is best-effort: errors are ignored.
      docker network rm "$DOCKER_BUILD_LOCK" >/dev/null 2>&1 || true
      echo "✅ Docker build lock released: ${DOCKER_BUILD_LOCK}"
    fi
|
||
|
||
# 2026-03-31 ogt: 移除中間通知
|
||
|
||
# 2026-03-31 ogt: P0-1 Secrets 自動注入 (ADR-035 強制)
|
||
# 2026-03-31 ogt: 加入 AI API Keys (修復 mock_fallback 問題)
|
||
- name: Inject K8s Secrets
|
||
run: |
|
||
# 2026-05-18 Codex: 不把 secrets 放進 step-level env。
|
||
# Gitea/act_runner 的 job log 可能展開 env;這裡只在 shell 內短暫轉
|
||
# base64,並避免輸出原值。
|
||
secret_b64() {
|
||
if command -v python3.11 >/dev/null 2>&1; then
|
||
python3.11 -c 'import base64, sys; data=sys.stdin.buffer.read(); data=data[:-1] if data.endswith(b"\n") else data; sys.stdout.write(base64.b64encode(data).decode())'
|
||
elif command -v python3 >/dev/null 2>&1; then
|
||
python3 -c 'import base64, sys; data=sys.stdin.buffer.read(); data=data[:-1] if data.endswith(b"\n") else data; sys.stdout.write(base64.b64encode(data).decode())'
|
||
else
|
||
secret_value="$(cat)"
|
||
printf '%s' "${secret_value}" | base64 | tr -d '\n'
|
||
fi
|
||
}
|
||
write_deploy_key() {
|
||
mkdir -p "${HOME}/.ssh"
|
||
umask 077
|
||
cat > "${HOME}/.ssh/deploy_key" <<'AWOOOI_DEPLOY_KEY'
|
||
${{ secrets.DEPLOY_SSH_KEY }}
|
||
AWOOOI_DEPLOY_KEY
|
||
chmod 600 "${HOME}/.ssh/deploy_key"
|
||
}
|
||
|
||
TG_BOT_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_BOT_TOKEN'
|
||
${{ secrets.TELEGRAM_BOT_TOKEN }}
|
||
AWOOOI_SECRET_TG_BOT_TOKEN
|
||
)"
|
||
TG_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT'
|
||
${{ secrets.SRE_GROUP_CHAT_ID }}
|
||
AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT
|
||
)"
|
||
NVIDIA_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_NVIDIA_API_KEY'
|
||
${{ secrets.NVIDIA_API_KEY }}
|
||
AWOOOI_SECRET_NVIDIA_API_KEY
|
||
)"
|
||
GEMINI_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_GEMINI_API_KEY'
|
||
${{ secrets.GEMINI_API_KEY }}
|
||
AWOOOI_SECRET_GEMINI_API_KEY
|
||
)"
|
||
LANGFUSE_PUBLIC_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_LANGFUSE_PUBLIC_KEY'
|
||
${{ secrets.LANGFUSE_PUBLIC_KEY }}
|
||
AWOOOI_SECRET_LANGFUSE_PUBLIC_KEY
|
||
)"
|
||
LANGFUSE_SECRET_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_LANGFUSE_SECRET_KEY'
|
||
${{ secrets.LANGFUSE_SECRET_KEY }}
|
||
AWOOOI_SECRET_LANGFUSE_SECRET_KEY
|
||
)"
|
||
TG_USER_WHITELIST_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_USER_WHITELIST'
|
||
${{ secrets.OPENCLAW_TG_USER_WHITELIST }}
|
||
AWOOOI_SECRET_TG_USER_WHITELIST
|
||
)"
|
||
SENTRY_AUTH_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_SENTRY_AUTH_TOKEN'
|
||
${{ secrets.SENTRY_AUTH_TOKEN }}
|
||
AWOOOI_SECRET_SENTRY_AUTH_TOKEN
|
||
)"
|
||
GITEA_WEBHOOK_SECRET_B64="$(secret_b64 <<'AWOOOI_SECRET_GITEA_WEBHOOK_SECRET'
|
||
${{ secrets.AWOOOI_GITEA_WEBHOOK_SECRET }}
|
||
AWOOOI_SECRET_GITEA_WEBHOOK_SECRET
|
||
)"
|
||
ARGOCD_API_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_ARGOCD_API_TOKEN'
|
||
${{ secrets.ARGOCD_API_TOKEN }}
|
||
AWOOOI_SECRET_ARGOCD_API_TOKEN
|
||
)"
|
||
DATABASE_URL_B64="$(secret_b64 <<'AWOOOI_SECRET_DATABASE_URL'
|
||
${{ secrets.DATABASE_URL }}
|
||
AWOOOI_SECRET_DATABASE_URL
|
||
)"
|
||
MIGRATION_DATABASE_URL_B64="$(secret_b64 <<'AWOOOI_SECRET_MIGRATION_DATABASE_URL'
|
||
${{ secrets.MIGRATION_DATABASE_URL }}
|
||
AWOOOI_SECRET_MIGRATION_DATABASE_URL
|
||
)"
|
||
REDIS_URL_B64="$(secret_b64 <<'AWOOOI_SECRET_REDIS_URL'
|
||
${{ secrets.REDIS_URL }}
|
||
AWOOOI_SECRET_REDIS_URL
|
||
)"
|
||
JWT_SECRET_B64="$(secret_b64 <<'AWOOOI_SECRET_JWT_SECRET'
|
||
${{ secrets.JWT_SECRET }}
|
||
AWOOOI_SECRET_JWT_SECRET
|
||
)"
|
||
JWT_ALGORITHM_B64="$(secret_b64 <<'AWOOOI_SECRET_JWT_ALGORITHM'
|
||
${{ secrets.JWT_ALGORITHM }}
|
||
AWOOOI_SECRET_JWT_ALGORITHM
|
||
)"
|
||
WEBHOOK_HMAC_SECRET_B64="$(secret_b64 <<'AWOOOI_SECRET_WEBHOOK_HMAC_SECRET'
|
||
${{ secrets.WEBHOOK_HMAC_SECRET }}
|
||
AWOOOI_SECRET_WEBHOOK_HMAC_SECRET
|
||
)"
|
||
AWOOOP_OPERATOR_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_AWOOOP_OPERATOR_API_KEY'
|
||
${{ secrets.AWOOOP_OPERATOR_API_KEY }}
|
||
AWOOOI_SECRET_AWOOOP_OPERATOR_API_KEY
|
||
)"
|
||
SENTRY_DSN_B64="$(secret_b64 <<'AWOOOI_SECRET_SENTRY_DSN'
|
||
${{ secrets.SENTRY_DSN }}
|
||
AWOOOI_SECRET_SENTRY_DSN
|
||
)"
|
||
CLAUDE_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_CLAUDE_API_KEY'
|
||
${{ secrets.CLAUDE_API_KEY }}
|
||
AWOOOI_SECRET_CLAUDE_API_KEY
|
||
)"
|
||
GITEA_API_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_GITEA_API_TOKEN'
|
||
${{ secrets.AWOOOI_GITEA_API_TOKEN }}
|
||
AWOOOI_SECRET_GITEA_API_TOKEN
|
||
)"
|
||
NEMOTRON_BOT_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_NEMOTRON_BOT_TOKEN'
|
||
${{ secrets.NEMOTRON_BOT_TOKEN }}
|
||
AWOOOI_SECRET_NEMOTRON_BOT_TOKEN
|
||
)"
|
||
OPENCLAW_BOT_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_OPENCLAW_BOT_TOKEN'
|
||
${{ secrets.OPENCLAW_BOT_TOKEN }}
|
||
AWOOOI_SECRET_OPENCLAW_BOT_TOKEN
|
||
)"
|
||
SMTP_HOST_B64="$(secret_b64 <<'AWOOOI_SECRET_SMTP_HOST'
|
||
${{ secrets.SMTP_HOST }}
|
||
AWOOOI_SECRET_SMTP_HOST
|
||
)"
|
||
SRE_GROUP_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_SRE_GROUP_CHAT_ID'
|
||
${{ secrets.SRE_GROUP_CHAT_ID }}
|
||
AWOOOI_SECRET_SRE_GROUP_CHAT_ID
|
||
)"
|
||
|
||
# S1/S2: 統一命名 deploy_key,改用 ssh-keyscan 與強制 host key 驗證。
|
||
write_deploy_key
|
||
# 2026-05-13 Codex: keyscan must include ED25519 explicitly. Some
|
||
# OpenSSH builds otherwise record only RSA/ECDSA, then strict deploy
|
||
# SSH fails with "No ED25519 host key is known" after image push.
|
||
# 2026-06-13 Codex: keep deploy-time host keys in a dedicated file.
|
||
# The runner user's global known_hosts is shared by cold-start and
|
||
# backup checks for 120/188; overwriting it here caused strict SSH
|
||
# recovery gates to flap after every CD run.
|
||
DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
|
||
ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null
|
||
test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; }
|
||
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
|
||
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" << SECRETS
|
||
set -e
|
||
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
|
||
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=\${K8S_API_SERVER}"
|
||
|
||
# 注入 Telegram Secrets (ADR-035 鐵律)
|
||
# 2026-06-12 Codex: OPENCLAW_TG_CHAT_ID 僅作舊欄位相容,
|
||
# 實際值必須與 SRE_GROUP_CHAT_ID 一致,避免正式告警旁路到其他群組。
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"${TG_BOT_TOKEN_B64}"},
|
||
{"op":"add","path":"/data/OPENCLAW_TG_CHAT_ID","value":"${TG_CHAT_ID_B64}"}
|
||
]' || { echo "❌ Telegram Secrets patch 失敗 — ADR-035 鐵律"; exit 1; }
|
||
|
||
# 2026-03-31 ogt: 注入 AI API Keys (修復 NVIDIA/Gemini mock_fallback)
|
||
# 2026-04-01 Claude Code: base64 -w 0 防止長 key 換行破壞 JSON
|
||
# NVIDIA NIM (免費 tier)
|
||
if [ -n "${NVIDIA_API_KEY_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/NVIDIA_API_KEY","value":"${NVIDIA_API_KEY_B64}"}
|
||
]' && echo "✅ NVIDIA_API_KEY 已注入" || echo "⚠️ NVIDIA_API_KEY patch 失敗"
|
||
else
|
||
echo "⚠️ NVIDIA_API_KEY 未設定,跳過"
|
||
fi
|
||
|
||
# Gemini (備援)
|
||
if [ -n "${GEMINI_API_KEY_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/GEMINI_API_KEY","value":"${GEMINI_API_KEY_B64}"}
|
||
]' && echo "✅ GEMINI_API_KEY 已注入" || echo "⚠️ GEMINI_API_KEY patch 失敗"
|
||
else
|
||
echo "⚠️ GEMINI_API_KEY 未設定,跳過"
|
||
fi
|
||
|
||
# 2026-04-01 Claude Code: Langfuse LLMOps keys (補齊 CD 注入,之前只有手動設定)
|
||
if [ -n "${LANGFUSE_PUBLIC_KEY_B64}" ] && [ -n "${LANGFUSE_SECRET_KEY_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/LANGFUSE_PUBLIC_KEY","value":"${LANGFUSE_PUBLIC_KEY_B64}"},
|
||
{"op":"add","path":"/data/LANGFUSE_SECRET_KEY","value":"${LANGFUSE_SECRET_KEY_B64}"}
|
||
]' && echo "✅ LANGFUSE keys 已注入" || echo "⚠️ LANGFUSE keys patch 失敗"
|
||
else
|
||
echo "⚠️ LANGFUSE_PUBLIC_KEY/SECRET_KEY 未設定,跳過 (現有 K8s secret 值維持不變)"
|
||
fi
|
||
|
||
# 2026-04-02 Claude Code: Telegram Whitelist (授權簽核用戶 ID)
|
||
if [ -n "${TG_USER_WHITELIST_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/OPENCLAW_TG_USER_WHITELIST","value":"${TG_USER_WHITELIST_B64}"}
|
||
]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
|
||
fi
|
||
|
||
# Phase O-4.1 2026-04-02: Sentry Auth Token (Wave A.1 ADR-037)
|
||
if [ -n "${SENTRY_AUTH_TOKEN_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"${SENTRY_AUTH_TOKEN_B64}"}
|
||
]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
|
||
else
|
||
echo "⚠️ SENTRY_AUTH_TOKEN 未設定,Sentry Comment API 將跳過"
|
||
fi
|
||
|
||
# ADR-059 2026-04-05 Claude Code: Gitea Webhook Secret
|
||
if [ -n "${GITEA_WEBHOOK_SECRET_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/GITEA_WEBHOOK_SECRET","value":"${GITEA_WEBHOOK_SECRET_B64}"}
|
||
]' && echo "✅ GITEA_WEBHOOK_SECRET 已注入" || echo "⚠️ GITEA_WEBHOOK_SECRET patch 失敗"
|
||
else
|
||
echo "⚠️ GITEA_WEBHOOK_SECRET 未設定,Gitea Webhook 簽章驗證將在 prod 失效"
|
||
fi
|
||
|
||
# MCP Phase 3: ArgoCD API Token (2026-04-11 Claude Sonnet 4.6)
|
||
if [ -n "${ARGOCD_API_TOKEN_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/ARGOCD_API_TOKEN","value":"${ARGOCD_API_TOKEN_B64}"}
|
||
]' && echo "✅ ARGOCD_API_TOKEN 已注入" || echo "⚠️ ARGOCD_API_TOKEN patch 失敗"
|
||
else
|
||
echo "⚠️ ARGOCD_API_TOKEN 未設定,ArgoCD MCP 將使用空 token"
|
||
fi
|
||
|
||
# ============================================================================
|
||
# ADR-090-B 2026-04-18 ogt + Claude Opus 4.7: L3-only 升級 L2(13 個 key)
|
||
# ============================================================================
|
||
# 目的: 消滅「只存 K8s etcd 單點」的災難盲區,Gitea Secret 成為正式真相來源
|
||
# 注意: 每個 block 與上方維持相同結構(if guard + base64 -w 0 + json patch)
|
||
|
||
# DATABASE_URL — PG 應用連線串(2026-04-18 輪替)
|
||
if [ -n "${DATABASE_URL_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/DATABASE_URL","value":"${DATABASE_URL_B64}"}
|
||
]' && echo "✅ DATABASE_URL 已注入" || echo "⚠️ DATABASE_URL patch 失敗"
|
||
else
|
||
echo "⚠️ DATABASE_URL 未設定,awoooi-api 將無法連 PG"
|
||
fi
|
||
|
||
# MIGRATION_DATABASE_URL — CI migration 用 awoooi_migrator 限權帳號(ADR-090-B)
|
||
if [ -n "${MIGRATION_DATABASE_URL_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/MIGRATION_DATABASE_URL","value":"${MIGRATION_DATABASE_URL_B64}"}
|
||
]' && echo "✅ MIGRATION_DATABASE_URL 已注入" || echo "⚠️ MIGRATION_DATABASE_URL patch 失敗"
|
||
fi
|
||
|
||
# REDIS_URL — Redis 連線(6380 on 188)
|
||
if [ -n "${REDIS_URL_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/REDIS_URL","value":"${REDIS_URL_B64}"}
|
||
]' && echo "✅ REDIS_URL 已注入" || echo "⚠️ REDIS_URL patch 失敗"
|
||
else
|
||
echo "⚠️ REDIS_URL 未設定"
|
||
fi
|
||
|
||
# JWT_SECRET / JWT_ALGORITHM — API 認證
|
||
if [ -n "${JWT_SECRET_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/JWT_SECRET","value":"${JWT_SECRET_B64}"}
|
||
]' && echo "✅ JWT_SECRET 已注入" || echo "⚠️ JWT_SECRET patch 失敗"
|
||
fi
|
||
if [ -n "${JWT_ALGORITHM_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/JWT_ALGORITHM","value":"${JWT_ALGORITHM_B64}"}
|
||
]' && echo "✅ JWT_ALGORITHM 已注入" || echo "⚠️ JWT_ALGORITHM patch 失敗"
|
||
fi
|
||
|
||
# WEBHOOK_HMAC_SECRET — Alertmanager webhook HMAC 簽章
|
||
if [ -n "${WEBHOOK_HMAC_SECRET_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/WEBHOOK_HMAC_SECRET","value":"${WEBHOOK_HMAC_SECRET_B64}"}
|
||
]' && echo "✅ WEBHOOK_HMAC_SECRET 已注入" || echo "⚠️ WEBHOOK_HMAC_SECRET patch 失敗"
|
||
fi
|
||
|
||
# AWOOOP_OPERATOR_API_KEY — AwoooP Operator mutation endpoints
|
||
if [ -n "${AWOOOP_OPERATOR_API_KEY_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/AWOOOP_OPERATOR_API_KEY","value":"${AWOOOP_OPERATOR_API_KEY_B64}"}
|
||
]' && echo "✅ AWOOOP_OPERATOR_API_KEY 已注入" || echo "⚠️ AWOOOP_OPERATOR_API_KEY patch 失敗"
|
||
fi
|
||
|
||
# SENTRY_DSN — Sentry 錯誤追蹤(不是 auth token)
|
||
if [ -n "${SENTRY_DSN_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/SENTRY_DSN","value":"${SENTRY_DSN_B64}"}
|
||
]' && echo "✅ SENTRY_DSN 已注入" || echo "⚠️ SENTRY_DSN patch 失敗"
|
||
fi
|
||
|
||
# CLAUDE_API_KEY — Claude 備援 LLM
|
||
if [ -n "${CLAUDE_API_KEY_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/CLAUDE_API_KEY","value":"${CLAUDE_API_KEY_B64}"}
|
||
]' && echo "✅ CLAUDE_API_KEY 已注入" || echo "⚠️ CLAUDE_API_KEY patch 失敗"
|
||
fi
|
||
|
||
# GITEA_API_TOKEN — Gitea API Token(從 AWOOOI_GITEA_API_TOKEN 映射)
|
||
if [ -n "${GITEA_API_TOKEN_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/GITEA_API_TOKEN","value":"${GITEA_API_TOKEN_B64}"}
|
||
]' && echo "✅ GITEA_API_TOKEN 已注入" || echo "⚠️ GITEA_API_TOKEN patch 失敗"
|
||
fi
|
||
|
||
# NEMOTRON_BOT_TOKEN / OPENCLAW_BOT_TOKEN — 多 Bot 架構
|
||
if [ -n "${NEMOTRON_BOT_TOKEN_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/NEMOTRON_BOT_TOKEN","value":"${NEMOTRON_BOT_TOKEN_B64}"}
|
||
]' && echo "✅ NEMOTRON_BOT_TOKEN 已注入" || echo "⚠️ NEMOTRON_BOT_TOKEN patch 失敗"
|
||
fi
|
||
if [ -n "${OPENCLAW_BOT_TOKEN_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/OPENCLAW_BOT_TOKEN","value":"${OPENCLAW_BOT_TOKEN_B64}"}
|
||
]' && echo "✅ OPENCLAW_BOT_TOKEN 已注入" || echo "⚠️ OPENCLAW_BOT_TOKEN patch 失敗"
|
||
fi
|
||
|
||
# SMTP_HOST / SRE_GROUP_CHAT_ID
|
||
if [ -n "${SMTP_HOST_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/SMTP_HOST","value":"${SMTP_HOST_B64}"}
|
||
]' && echo "✅ SMTP_HOST 已注入" || echo "⚠️ SMTP_HOST patch 失敗"
|
||
fi
|
||
if [ -n "${SRE_GROUP_CHAT_ID_B64}" ]; then
|
||
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
|
||
{"op":"add","path":"/data/SRE_GROUP_CHAT_ID","value":"${SRE_GROUP_CHAT_ID_B64}"}
|
||
]' && echo "✅ SRE_GROUP_CHAT_ID 已注入" || echo "⚠️ SRE_GROUP_CHAT_ID patch 失敗"
|
||
fi
|
||
|
||
# 2026-04-06 Claude Code: Sprint 3 T2 — known_hosts Secret (Security Fix A1)
|
||
# 替換關閉 host key 驗證的舊做法,讓 SSH 修復路徑使用已知主機指紋。
|
||
# asyncssh reads /etc/ssh-mcp/known_hosts and requires a non-empty
|
||
# OpenSSH known_hosts file. Keep hosts unhashed so both asyncssh and
|
||
# CLI diagnostics can trust the same secret.
|
||
# 2026-05-02 ogt + Claude Sonnet 4.6: 加 4 台主機完整性檢查
|
||
# 根因:partial scan(如 110 timeout、其他成功)會讓 [-s file] 通過、
|
||
# 後續 patch 推進缺漏的 known_hosts → asyncssh 拒所有 SSH。
|
||
# 修法:scan 完用 grep -c 驗證 4 台主機都在;缺任何一台就 abort,
|
||
# 不能覆蓋現有 secret,防止 production SSH 自動修復路徑癱瘓。
|
||
ssh-keyscan 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/known_hosts_repair 2>/tmp/known_hosts_scan_err || true
|
||
EXPECTED_HOSTS=4
|
||
PRESENT=0
|
||
for ip in 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188; do
|
||
if grep -qE "^\${ip}[[:space:]]" /tmp/known_hosts_repair 2>/dev/null; then
|
||
PRESENT=\$((PRESENT + 1))
|
||
else
|
||
echo "⚠️ ssh-keyscan 缺主機 \${ip}"
|
||
fi
|
||
done
|
||
if [ "\$PRESENT" -eq "\$EXPECTED_HOSTS" ]; then
|
||
\$KUBECTL create secret generic awoooi-repair-known-hosts \
|
||
-n awoooi-prod \
|
||
--from-file=known_hosts=/tmp/known_hosts_repair \
|
||
--dry-run=client -o yaml | \$KUBECTL apply -f - \
|
||
&& echo "✅ awoooi-repair-known-hosts Secret 已建立/更新" \
|
||
|| echo "⚠️ awoooi-repair-known-hosts Secret 建立失敗 (非致命)"
|
||
KNOWN_HOSTS_B64=\$(base64 -w 0 /tmp/known_hosts_repair)
|
||
\$KUBECTL patch secret ssh-mcp-key -n awoooi-prod --type=merge \
|
||
-p="{\"data\":{\"known_hosts\":\"\${KNOWN_HOSTS_B64}\"}}" \
|
||
&& echo "✅ ssh-mcp-key known_hosts 已更新(4 台主機完整)" \
|
||
|| echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)"
|
||
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
|
||
else
|
||
echo "❌ ssh-keyscan 只抓到 \${PRESENT}/\${EXPECTED_HOSTS} 台主機,跳過 patch(保留現有 secret)"
|
||
cat /tmp/known_hosts_scan_err 2>/dev/null | head -10
|
||
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
|
||
fi
|
||
|
||
echo "✅ 所有 Secrets 注入完成"
|
||
SECRETS
|
||
|
||
# 2026-04-11 Claude Sonnet 4.6 (Sprint B-3 ADR-069):
|
||
# Deploy 改為 ArgoCD GitOps 模式:更新 kustomization.yaml → git push [skip ci] → ArgoCD sync
|
||
# 舊做法 (kubectl set image) 與 ArgoCD selfHeal 衝突 — ArgoCD 會 revert 任何直接 kubectl 操作
|
||
# 新做法流程:
|
||
# 1. 更新 kustomization.yaml image tag(用 kustomize edit set image)
|
||
# 2. Apply ConfigMap/ServiceRegistry(不含 Deployment,由 ArgoCD 管)
|
||
# 3. git commit [skip ci] + push → 觸發 ArgoCD automated sync
|
||
# 4. 等待 ArgoCD sync + rollout 完成
|
||
# 5. Health Check
|
||
- name: Deploy to K8s (ArgoCD GitOps)
|
||
run: |
|
||
# Write the deploy SSH private key to ~/.ssh/deploy_key with mode 600.
# The heredoc delimiter is quoted, so the shell performs no expansion on the
# key text; the key line and the delimiter must stay at the left margin.
# NOTE: `umask 077` is not scoped to this function and stays in effect for
# the remainder of the script.
write_deploy_key() {
  local ssh_dir="${HOME}/.ssh"
  local key_path="${ssh_dir}/deploy_key"

  mkdir -p "${ssh_dir}"
  umask 077
  cat <<'AWOOOI_DEPLOY_KEY' > "${key_path}"
${{ secrets.DEPLOY_SSH_KEY }}
AWOOOI_DEPLOY_KEY
  chmod 600 "${key_path}"
}
|
||
|
||
mkdir -p ~/.ssh
write_deploy_key
# 2026-05-13 Codex: mirror Inject K8s Secrets host-key handling so the
# deploy job never reaches SSH with a known_hosts file missing ED25519.
# 2026-06-13 Codex: use the deploy-only known_hosts file so this
# stage cannot wipe cold-start/backup host trust for 120/188.
DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
# `|| true`: when the step runs with errexit, a non-zero ssh-keyscan exit
# would abort the script before the explicit diagnostic below is printed.
# The non-empty-file check is what decides success.
ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null || true
if [ ! -s "${DEPLOY_KNOWN_HOSTS}" ]; then
  echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"
  exit 1
fi
# Intentionally a plain string: it is expanded unquoted (`ssh $SSH_OPTS ...`)
# so that it word-splits into individual ssh options.
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"

# Image tag is the commit SHA of this run; HARBOR is the registry prefix
# used when rewriting kustomization.yaml below.
IMAGE_TAG="${{ github.sha }}"
HARBOR=192.168.0.110:5000
|
||
|
||
# ─── Step 1: Apply ConfigMap + ServiceRegistry ───
# ArgoCD manages the Deployments; the ConfigMaps are still applied directly.
# Streams one local manifest over SSH into `kubectl apply -f -` on the K8s host.
apply_manifest_over_ssh() {
  local manifest_path="$1"
  ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
    "KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -" \
    < "${manifest_path}"
}

apply_manifest_over_ssh k8s/awoooi-prod/04-configmap.yaml
echo "✅ ConfigMap 已更新"

apply_manifest_over_ssh k8s/awoooi-prod/15-service-registry-configmap.yaml
echo "✅ Service Registry ConfigMap 已更新"
|
||
|
||
# ─── Step 2: update the image tags in kustomization.yaml ───
# The host runner is not guaranteed root access, so kustomize is installed
# under the user's home directory.
KUSTOMIZE_BIN_DIR="${HOME}/.local/bin"
export PATH="${KUSTOMIZE_BIN_DIR}:${PATH}"
if ! command -v kustomize >/dev/null 2>&1; then
  KUSTOMIZE_TARBALL_URL="https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz"
  mkdir -p "${KUSTOMIZE_BIN_DIR}"
  curl -sL "${KUSTOMIZE_TARBALL_URL}" | tar xz -C "${KUSTOMIZE_BIN_DIR}"
  chmod +x "${KUSTOMIZE_BIN_DIR}/kustomize"
fi

# Point the api and web image entries at this run's tag. The subshell keeps
# the working-directory change local to the kustomize calls.
(
  cd k8s/awoooi-prod
  for component in api web; do
    kustomize edit set image \
      "192.168.0.110:5000/library/${component}:IMAGE_TAG_PLACEHOLDER=${HARBOR}/awoooi/${component}:${IMAGE_TAG}"
  done
)
|
||
|
||
# ─── Step 3: git commit [skip ci] + push → triggers the ArgoCD sync ───
# DEPLOY_REVISION stays empty when nothing was pushed; the wait loop then
# accepts whatever revision ArgoCD reports as Synced.
git config user.email "cd@awoooi.internal"
git config user.name "AWOOOI CD"
git add k8s/awoooi-prod/kustomization.yaml
DEPLOY_REVISION=""
if git diff --cached --quiet; then
  echo "⚡ kustomization.yaml 無變化,跳過 push"
else
  git commit -m "chore(cd): deploy ${IMAGE_TAG::7} [skip ci]"
  # Push with a token (avoids granting the SSH deploy key push rights).
  # The token URL is passed per command instead of being saved as a named
  # remote, so the credential is not persisted in .git/config on the host
  # runner. Any `gitea` remote left behind by an earlier run is scrubbed.
  git remote remove gitea 2>/dev/null || true
  GITEA_PUSH_URL="http://wooo:${{ secrets.CD_PUSH_TOKEN }}@192.168.0.110:3001/wooo/awoooi.git"
  # Rebase first to avoid a non-fast-forward push (other commits may have
  # landed while CI was running).
  # 2026-04-17 ogt: -X theirs — on a kustomization.yaml conflict, keep the
  # image tag of this deployment.
  git fetch "${GITEA_PUSH_URL}" main
  git rebase -X theirs FETCH_HEAD
  DEPLOY_REVISION=$(git rev-parse HEAD)
  git push "${GITEA_PUSH_URL}" main
  echo "✅ kustomization.yaml 已 push,等待 ArgoCD sync 到 ${DEPLOY_REVISION:0:8}..."
fi
|
||
|
||
# ─── Step 4: 等待 ArgoCD sync + rollout ───
|
||
ROLLOUT_LOG="$(mktemp)"
|
||
set +e
|
||
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
|
||
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" 2>&1 << 'ARGOCD_WAIT' | tee "$ROLLOUT_LOG"
|
||
# Remote script prologue: fail fast, then set up the kubectl wrapper, the
# temp file that collects rollout-risk lines, and the two counters reported
# in the final rollout summary.
set -e
UNKNOWN_STATUS_COUNT=0
HEALTH_FAILURE_COUNT=0
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
RISK_FILE="$(mktemp)"
|
||
|
||
# Append one risk line to $RISK_FILE and echo it to stderr.
#   $1 - risk description (one line)
# Writing to a file means entries recorded inside command substitutions
# are still visible to the parent shell.
record_rollout_risk() {
  local risk_entry="$1"
  printf '%s\n' "$risk_entry" >> "$RISK_FILE"
  echo "⚠️ Rollout risk observed: $risk_entry" >&2
}
|
||
|
||
# EXIT handler: print machine-readable rollout markers on stdout and delete
# the risk file.
#   AWOOOI_ROLLOUT_RISK=0|1   - 1 when at least one risk line was recorded
#   AWOOOI_ROLLOUT_SUMMARY=…  - counters plus the joined risk lines (risk=1 only)
emit_rollout_evidence() {
  local failed_query_count
  local joined_risks

  if [ ! -s "$RISK_FILE" ]; then
    echo "AWOOOI_ROLLOUT_RISK=0"
  else
    failed_query_count=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
    # Flatten the risk lines into one line, strip control characters and cap
    # the length at 700 characters.
    joined_risks=$(tr '\n' '; ' < "$RISK_FILE" | sed 's/[[:cntrl:]]//g' | cut -c1-700)
    echo "AWOOOI_ROLLOUT_RISK=1"
    echo "AWOOOI_ROLLOUT_SUMMARY=unknown_status_count=${UNKNOWN_STATUS_COUNT}; health_failure_count=${HEALTH_FAILURE_COUNT}; kubectl_failure_count=${failed_query_count}; ${joined_risks}"
  fi
  rm -f "$RISK_FILE"
}
trap emit_rollout_evidence EXIT
|
||
|
||
# Print one field of the `awoooi-prod` ArgoCD Application.
#   $1 - kubectl jsonpath expression
#   $2 - short label used in the recorded risk entry
# On a kubectl failure the function prints `Unknown`, still returns 0, and
# records the failure as a rollout risk (only while fewer than 3 query
# failures are already recorded).
app_field() {
  local jsonpath_expr="$1"
  local field_label="$2"
  local kubectl_output
  local kubectl_rc
  local recorded_failures

  # errexit is suspended only around the query so its status can be inspected.
  set +e
  kubectl_output=$($KUBECTL get application awoooi-prod -n argocd -o jsonpath="$jsonpath_expr" 2>&1)
  kubectl_rc=$?
  set -e

  if [ "$kubectl_rc" -eq 0 ]; then
    printf '%s' "$kubectl_output"
    return
  fi

  recorded_failures=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
  if [ "$recorded_failures" -lt 3 ]; then
    record_rollout_risk "argocd_${field_label}_query_failed=$(echo "$kubectl_output" | head -c 180)"
  fi
  printf 'Unknown'
  return 0
}
|
||
|
||
# Probe the public API health URL once and count any non-200 result.
#   $1 - phase label embedded in the recorded risk entry
# Increments HEALTH_FAILURE_COUNT on every failure; only the first three
# failures are written as risk entries.
probe_public_health() {
  local probe_phase="$1"
  local code
  local curl_rc

  set +e
  code=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 3 --max-time 8 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
  curl_rc=$?
  set -e

  # A transport-level failure is reported as curl_error_<exit code>.
  [ "$curl_rc" -eq 0 ] || code="curl_error_${curl_rc}"

  if [ "$code" = "200" ]; then
    return 0
  fi

  HEALTH_FAILURE_COUNT=$((HEALTH_FAILURE_COUNT + 1))
  if [ "$HEALTH_FAILURE_COUNT" -le 3 ]; then
    record_rollout_risk "public_health_${probe_phase}_http=${code}"
  fi
}
|
||
|
||
# Print a one-line, `;`-separated list (at most 5 unique entries) of the
# Application's resources that are not Synced or whose health is not Healthy.
# If the kubectl query itself fails, prints `resource_query_failed=<first
# 180 bytes of the error>` and returns 0.
collect_argocd_resource_evidence() {
  local go_template
  local raw
  local rc

  go_template='{{range .status.resources}}{{if ne .status "Synced"}}{{.kind}}/{{.name}}{{if .namespace}} ns={{.namespace}}{{end}} sync={{.status}}{{if .health.status}} health={{.health.status}}{{end}}{{"\n"}}{{end}}{{if .health.status}}{{if ne .health.status "Healthy"}}{{.kind}}/{{.name}}{{if .namespace}} ns={{.namespace}}{{end}} sync={{.status}} health={{.health.status}}{{if .health.message}} msg={{.health.message}}{{end}}{{"\n"}}{{end}}{{end}}{{end}}'

  set +e
  raw=$($KUBECTL get application awoooi-prod -n argocd -o "go-template=${go_template}" 2>&1)
  rc=$?
  set -e

  if [ "$rc" -eq 0 ]; then
    # Drop blank and duplicate lines, keep the first five, join with `;`,
    # strip control characters and trailing separators.
    echo "$raw" | awk 'NF && !seen[$0]++ {print}' | head -5 | tr '\n' ';' | sed 's/[[:cntrl:]]//g; s/;*$//'
    return
  fi

  echo "resource_query_failed=$(printf '%s' "$raw" | head -c 180)"
  return 0
}
|
||
|
||
# Abort the rollout (exit 1) unless the ArgoCD Application source follows
# the expected contract: targetRevision is `main` and no kustomize image
# override is set on the Application. Each violation is also recorded as a
# rollout risk before exiting.
validate_argocd_source_contract() {
  local source_revision
  local kustomize_images

  source_revision=$(app_field '{.spec.source.targetRevision}' source_target_revision)
  kustomize_images=$(app_field '{.spec.source.kustomize.images}' source_kustomize_images)

  if [ "$source_revision" != "main" ]; then
    record_rollout_risk "argocd_source_target_revision_not_main targetRevision=$source_revision"
    echo "❌ ArgoCD source targetRevision must be main, got: $source_revision" >&2
    exit 1
  fi

  if [ -z "$kustomize_images" ]; then
    return 0
  fi

  # Only the first 180 bytes of the override are kept in the risk entry.
  record_rollout_risk "argocd_source_image_override_present images=$(printf '%s' "$kustomize_images" | head -c 180)"
  echo "❌ ArgoCD source kustomize.images override must be empty; image truth belongs in k8s/awoooi-prod/kustomization.yaml" >&2
  exit 1
}
|
||
|
||
# Wait for the ArgoCD Application to sync to the target revision
# (36 polls, 5 s apart).
# 2026-05-24 Codex: top-level Application health can stay Degraded
# without per-resource health detail. Treat that as rollout evidence,
# then let kubectl rollout status and API health decide pass/fail.
echo "⏳ 等待 ArgoCD sync..."
validate_argocd_source_contract
# Best effort: ask ArgoCD for a hard refresh; a failure here is ignored.
$KUBECTL annotate application awoooi-prod -n argocd \
  argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true

for ((poll = 1; poll <= 36; poll++)); do
  SYNC=$(app_field '{.status.sync.status}' sync)
  HEALTH=$(app_field '{.status.health.status}' health)
  REVISION=$(app_field '{.status.sync.revision}' revision)
  SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
  SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
  echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
  probe_public_health "argocd_wait"

  # Count every poll where a field could not be read; only the first three
  # are written as risk entries.
  if [ "$SYNC" = "Unknown" ] || [ "$HEALTH" = "Unknown" ] || [ "$REVISION" = "Unknown" ]; then
    UNKNOWN_STATUS_COUNT=$((UNKNOWN_STATUS_COUNT + 1))
    if [ "$UNKNOWN_STATUS_COUNT" -le 3 ]; then
      record_rollout_risk "argocd_status_unknown sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
    fi
  fi

  # Done when Synced and, if a revision was pushed by this run, ArgoCD
  # reports exactly that revision. Non-Healthy is recorded, not fatal.
  if [ "$SYNC" = "Synced" ] && { [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; }; then
    if [ "$HEALTH" != "Healthy" ]; then
      RESOURCE_EVIDENCE=$(collect_argocd_resource_evidence)
      record_rollout_risk "argocd_health_not_healthy health=$HEALTH revision=$SHORT_REVISION resources=${RESOURCE_EVIDENCE:-none_visible}"
    fi
    echo "✅ ArgoCD Synced to target revision (health=$HEALTH)"
    break
  fi

  if [ "$poll" -eq 36 ]; then
    echo "❌ ArgoCD 未在期限內同步到目標 revision"
    exit 1
  fi
  sleep 5
done
|
||
|
||
# Confirm that every Deployment finished rolling out (120 s timeout each).
for workload in awoooi-api awoooi-web awoooi-worker; do
  $KUBECTL rollout status "deployment/${workload}" -n awoooi-prod --timeout=120s
done
echo "✅ 部署完成"

# Final health check: up to three attempts against the public API health
# URL, 10 s apart. Anything other than HTTP 200 on all attempts fails the
# rollout.
HEALTH_PASS=0
attempt=0
while [ "$attempt" -lt 3 ]; do
  attempt=$((attempt + 1))
  set +e
  HTTP_CODE=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 10 --max-time 20 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
  CURL_STATUS=$?
  set -e
  [ "$CURL_STATUS" -eq 0 ] || HTTP_CODE="curl_error_${CURL_STATUS}"
  if [ "$HTTP_CODE" = "200" ]; then
    echo "✅ API 健康檢查通過"
    HEALTH_PASS=1
    break
  fi
  echo "⏳ 嘗試 #${attempt}: HTTP ${HTTP_CODE},等待 10s..."
  sleep 10
done

if [ "$HEALTH_PASS" = "0" ]; then
  record_rollout_risk "public_health_final_failed"
  echo "❌ API 健康檢查失敗"
  exit 1
fi
|
||
ARGOCD_WAIT
|
||
ROLLOUT_EXIT=${PIPESTATUS[0]}
set -e
# ROLLOUT_EXIT is the exit status of the ssh side of the `ssh | tee`
# pipeline; it must be read immediately after that pipeline.

# Recover the risk markers that the remote script printed into the log.
ROLLOUT_RISK="0"
ROLLOUT_SUMMARY=""
if grep -q '^AWOOOI_ROLLOUT_RISK=1$' "$ROLLOUT_LOG"; then
  ROLLOUT_RISK="1"
  ROLLOUT_SUMMARY=$(grep '^AWOOOI_ROLLOUT_SUMMARY=' "$ROLLOUT_LOG" | tail -1 | sed 's/^AWOOOI_ROLLOUT_SUMMARY=//' | cut -c1-700)
fi

# Export both values to later steps of this job.
if [ -n "${GITHUB_ENV:-}" ]; then
  printf 'AWOOI_ROLLOUT_RISK=%s\nAWOOI_ROLLOUT_SUMMARY=%s\n' "${ROLLOUT_RISK}" "${ROLLOUT_SUMMARY}" >> "$GITHUB_ENV"
fi
rm -f "$ROLLOUT_LOG"

# A risk notification is only sent for a rollout that succeeded but still
# recorded risk evidence; in every other case just propagate the status.
if [ "$ROLLOUT_EXIT" -ne 0 ] || [ "$ROLLOUT_RISK" != "1" ]; then
  exit "$ROLLOUT_EXIT"
fi

ACTOR="${GITHUB_ACTOR:-${{ github.actor }}}"
if AWOOI_CICD_STATUS=pending \
   AWOOI_CICD_STAGE=rollout-risk \
   AWOOI_CICD_JOB_NAME="AWOOOI 部署完成但仍有風險證據" \
   AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
   AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
   AWOOI_CICD_SUMMARY="${ROLLOUT_SUMMARY}" \
   scripts/ci/notify-awoooi-cicd.sh; then
  echo "✅ CI/CD rollout risk notification mirrored through AWOOI API"
else
  echo "⚠️ CI/CD rollout risk notification failed (non-fatal)"
fi
exit "$ROLLOUT_EXIT"
|
||
|
||
- name: Notify Build Deploy Success
  run: |
    # Report a successful build + deploy through the AWOOI CI/CD notifier.
    # A notifier failure is logged but never fails the job.
    END_TIME=$(date +%s)
    # Fall back to END_TIME (duration 0) when the commit step produced no
    # start_time; an empty value would otherwise make the arithmetic
    # expansion a syntax error and fail this step after a good deploy.
    START_TIME="${{ steps.commit.outputs.start_time }}"
    DURATION=$((END_TIME - ${START_TIME:-$END_TIME}))
    ACTOR="${{ github.actor }}"
    if AWOOI_CICD_STATUS=success \
       AWOOI_CICD_STAGE=build-and-deploy \
       AWOOI_CICD_JOB_NAME="AWOOOI 建置部署完成" \
       AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
       AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
       AWOOI_CICD_DURATION_SECONDS="${DURATION}" \
       AWOOI_CICD_SUMMARY="Image build/push + ArgoCD rollout + API health passed" \
       scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build-deploy success notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD build-deploy success notification failed (non-fatal)"
    fi
|
||
|
||
# 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
|
||
# 188 deploy key is rotated and must not be read by this disabled step.
|
||
# 腳本: docker-health-monitor.sh + pg-backup.sh + notify-awoooi-ops.sh
|
||
# 感知層與備份通知都先走 AWOOI API/AwoooP,Telegram 直發只保留 API 離線 fallback。
|
||
- name: Sync Ops Scripts to 188
  # 2026-05-13 Codex T14e/P0:
  # Disabled until the 188 ops sync path is moved to a file-secret or
  # Ansible-controlled channel. Gitea Actions logs step env values, and
  # multiline SSH secrets must not be exposed through CD logs.
  # `if` is a constant false, so the placeholder command below never runs.
  if: ${{ false }}
  continue-on-error: true
  run: echo "188 ops script sync disabled pending secure key rotation path"
|
||
|
||
- name: Notify Pipeline Failure
  if: failure()
  env:
    # The commit subject is arbitrary text. It is handed to the script as an
    # environment variable rather than interpolated into the script body, so
    # quotes, backticks or `$(` in a subject cannot break or alter the script.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
  run: |
    # Report the failure through the AWOOI CI/CD notifier; if that fails,
    # fall back to a direct Telegram message (HTML parse mode).
    SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
    ACTOR="${{ github.actor }}"
    # Prefer the rollout summary exported by the deploy step, if any.
    FAILURE_SUMMARY="${AWOOI_ROLLOUT_SUMMARY:-${COMMIT_MSG}}"
    # HTML-escape the subject for Telegram. `&` must be replaced first, and
    # `\&` is needed because a bare `&` in a sed replacement means "the match".
    COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    if AWOOI_CICD_STATUS=failed \
       AWOOI_CICD_STAGE=build-and-deploy \
       AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
       AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
       AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
       AWOOI_CICD_SUMMARY="${FAILURE_SUMMARY}" \
       scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build failure notification mirrored through AWOOI API"
    else
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
|
||
|
||
post-deploy-checks:
|
||
needs: build-and-deploy
|
||
timeout-minutes: 30
|
||
# 2026-04-30 Codex: keep post-deploy on the host runner too. Playwright
|
||
# install-deps can also kill the act-managed job container with RWLayer=nil.
|
||
runs-on: awoooi-host
|
||
steps:
|
||
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: post-deploy also uses checkout and curl-based
  # notifications, so it needs the same runner bootstrap as earlier jobs.
  run: |
    # Nothing to do unless the `apk` package manager is available.
    if ! command -v apk >/dev/null 2>&1; then
      exit 0
    fi
    apk add --no-cache \
      nodejs npm git curl bash coreutils python3 \
      openssh-client docker-cli docker-cli-buildx
|
||
|
||
- uses: actions/checkout@v4
|
||
|
||
- name: Get Commit Info
  id: commit
  run: |
    # Step outputs: short_sha (first 7 characters of the commit SHA),
    # message (commit subject, first 50 bytes) and start_time (epoch seconds).
    # The redirect target is quoted so a path containing whitespace cannot
    # be word-split.
    # NOTE(review): `head -c 50` truncates by bytes, so a multi-byte UTF-8
    # subject can be cut in the middle of a character.
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=$(git log -1 --pretty=%s | head -c 50)"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
|
||
|
||
- name: Notify Post Deploy Checks Start
  run: |
    # Announce the start of the post-deploy gates through the AWOOI CI/CD
    # notifier. A notifier failure is logged but never fails the job.
    ACTOR="${{ github.actor }}"
    notify_rc=0
    AWOOI_CICD_STATUS=running \
    AWOOI_CICD_STAGE=post-deploy-checks \
    AWOOI_CICD_JOB_NAME="AWOOOI 部署後驗證開始" \
    AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
    AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
    AWOOI_CICD_SUMMARY="Alert Chain / Source Link / Monitoring / Smoke gates started" \
    scripts/ci/notify-awoooi-cicd.sh || notify_rc=$?
    if [ "${notify_rc}" -eq 0 ]; then
      echo "✅ CI/CD post-deploy start notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD post-deploy start notification failed (non-fatal)"
    fi
|
||
|
||
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
|
||
# 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter
|
||
# 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step
|
||
# 2026-04-10 ogt: 移除 continue-on-error — 告警鏈路失敗必須阻塞部署
|
||
- name: Alert Chain Smoke Test
|
||
id: alert_chain_smoke
|
||
run: |
|
||
# Write the deploy SSH private key to ~/.ssh/deploy_key (mode 600).
# The quoted heredoc delimiter disables shell expansion of the key text;
# the key line and the delimiter must remain at the left margin.
# NOTE: the `umask 077` set here is not restored afterwards.
write_deploy_key() {
  local key_dir="${HOME}/.ssh"
  local key_file="${key_dir}/deploy_key"

  mkdir -p "${key_dir}"
  umask 077
  cat <<'AWOOOI_DEPLOY_KEY' > "${key_file}"
${{ secrets.DEPLOY_SSH_KEY }}
AWOOOI_DEPLOY_KEY
  chmod 600 "${key_file}"
}
|
||
# Print the pod phase (one per line, no header) of every pod in the
# `observability` namespace labelled app.kubernetes.io/name=<component>.
#   $1 - component name used in the label selector
# Runs kubectl on the K8s host over SSH; the exit status is that of ssh.
collect_observability_statuses() {
  local selector="app.kubernetes.io/name=$1"
  local remote_cmd="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get pods -n observability -l ${selector} --no-headers -o custom-columns=STATUS:.status.phase"
  ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" "${remote_cmd}"
}
|
||
# Run collect_observability_statuses for one component and print its
# combined stdout/stderr without a trailing newline.
#   $1 - component name
# Returns 0 when the query succeeded and 1 when it failed; in the failure
# case the printed text is the error output.
capture_observability_statuses() {
  local captured
  local rc=0

  captured="$(collect_observability_statuses "$1" 2>&1)" || rc=1
  printf '%s' "${captured}"
  return "${rc}"
}
|
||
|
||
# 2026-05-19 Codex: the smoke test runs inside CI_IMAGE, but the
# observability pod checks need the K3s host kubectl context. Capture
# those read-only statuses on the host and pass them into the
# container, instead of making the container own kube credentials.
OTEL_COLLECTOR_STATUSES=""
EVENT_EXPORTER_STATUSES=""
OTEL_COLLECTOR_ERROR=""
EVENT_EXPORTER_ERROR=""
OBSERVABILITY_PREFLIGHT_ERROR=""

write_deploy_key
DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
if ! { ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null && [ -s "${DEPLOY_KNOWN_HOSTS}" ]; }; then
  # No usable host key: both components inherit the same preflight error.
  OBSERVABILITY_PREFLIGHT_ERROR="K8s host keyscan failed"
  OTEL_COLLECTOR_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
  EVENT_EXPORTER_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
else
  SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
  # On a failed query the captured text is the error output: keep its last
  # line (at most 200 bytes) as the error and clear the status list.
  OTEL_COLLECTOR_STATUSES="$(capture_observability_statuses otel-collector)" || {
    OTEL_COLLECTOR_ERROR="$(printf '%s' "${OTEL_COLLECTOR_STATUSES}" | tail -1 | head -c 200)"
    OTEL_COLLECTOR_STATUSES=""
  }
  EVENT_EXPORTER_STATUSES="$(capture_observability_statuses event-exporter)" || {
    EVENT_EXPORTER_ERROR="$(printf '%s' "${EVENT_EXPORTER_STATUSES}" | tail -1 | head -c 200)"
    EVENT_EXPORTER_STATUSES=""
  }
fi
|
||
|
||
# Derive per-run canary identifiers from the run id and attempt, and publish
# the work-item id and event id as step outputs for later steps.
SOURCE_LINK_RUN_REF="gitea-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}"
SOURCE_LINK_CANARY_SLUG="awoooi-source-link-canary-${SOURCE_LINK_RUN_REF}"
SOURCE_LINK_CANARY_WORK_ITEM_ID="source-evidence:sentry:upstream_canary:${SOURCE_LINK_CANARY_SLUG}"
SOURCE_LINK_CANARY_EVENT_ID="sentry:source_correlation_linked:${SOURCE_LINK_CANARY_SLUG}"
{
  echo "source_link_canary_work_item_id=${SOURCE_LINK_CANARY_WORK_ITEM_ID}"
  echo "source_link_canary_event_id=${SOURCE_LINK_CANARY_EVENT_ID}"
} >> "$GITHUB_OUTPUT"
|
||
|
||
# Read AWOOOP_OPERATOR_API_KEY from the production `awoooi-secrets` Secret
# over SSH and export it for the smoke-test container.
#
# SSH_OPTS is only assigned when the host keyscan succeeded. Without this
# guard a failed keyscan would let ssh run with its default identity and
# known_hosts files instead of the deploy-only ones.
if [ -n "${OBSERVABILITY_PREFLIGHT_ERROR:-}" ]; then
  echo "❌ ${OBSERVABILITY_PREFLIGHT_ERROR}: ${K8S_SSH_HOST}; cannot read AWOOOP_OPERATOR_API_KEY over SSH"
  exit 1
fi

# Testing the assignment in `if !` keeps errexit from aborting silently
# when ssh/kubectl fails, so the diagnostic below is always printed.
if ! AWOOOP_OPERATOR_API_KEY="$(
  ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" \
    "sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data.AWOOOP_OPERATOR_API_KEY}' | base64 -d"
)"; then
  echo "❌ Failed to read AWOOOP_OPERATOR_API_KEY from production secret over SSH; source-link canary cannot run"
  exit 1
fi
if [ -z "${AWOOOP_OPERATOR_API_KEY}" ]; then
  echo "❌ AWOOOP_OPERATOR_API_KEY missing from production secret; source-link canary cannot run"
  exit 1
fi
export AWOOOP_OPERATOR_API_KEY
|
||
|
||
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
# `-e AWOOOP_OPERATOR_API_KEY` has no value on purpose: docker copies it
# from this shell's environment, keeping the key off the command line.
ALERT_SMOKE_DOCKER_ARGS=(
  --rm
  --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke"
  --cpus "1.0"
  --memory "1g"
  -v "$PWD:/workspace"
  -v awoooi-api-venv-cache:/opt/api-venv
  -w /workspace
  -e AWOOOI_OTEL_COLLECTOR_STATUSES="${OTEL_COLLECTOR_STATUSES}"
  -e AWOOOI_OTEL_COLLECTOR_ERROR="${OTEL_COLLECTOR_ERROR}"
  -e AWOOOI_EVENT_EXPORTER_STATUSES="${EVENT_EXPORTER_STATUSES}"
  -e AWOOOI_EVENT_EXPORTER_ERROR="${EVENT_EXPORTER_ERROR}"
  -e AWOOOP_OPERATOR_API_KEY
  -e AWOOOP_OPERATOR_ID="gitea-cd-post-deploy"
  -e SOURCE_LINK_RUN_REF="${SOURCE_LINK_RUN_REF}"
)
# Single-quoted so ${SOURCE_LINK_RUN_REF} is expanded inside the container.
ALERT_SMOKE_CMD='set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --source-link-canary-target-incident-id INC-20260505-25E744 --run-ref "${SOURCE_LINK_RUN_REF}" --json | tee /tmp/alert_chain_result.json'

# Publish the result as a step output; a failed smoke test fails the step.
if docker run "${ALERT_SMOKE_DOCKER_ARGS[@]}" "${{ env.CI_IMAGE }}" bash -lc "${ALERT_SMOKE_CMD}"; then
  echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
else
  echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
  exit 1
fi
|
||
|
||
# Phase O-5 Wave C.2 2026-04-02 ogt: 監控覆蓋率驗證 (generate_monitoring.py --check)
|
||
# 2026-04-10 ogt: 移除 continue-on-error — 覆蓋率不足必須阻塞部署
|
||
- name: Monitoring Coverage Check
  id: monitoring_coverage
  run: |
    # Run `generate_monitoring.py --check` inside the CI image and publish
    # the result as the `coverage_status` output. A failed check fails the
    # step.
    COVERAGE_DOCKER_ARGS=(
      --rm
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-coverage"
      --cpus "1.0"
      --memory "1g"
      -v "$PWD:/workspace"
      -v awoooi-api-venv-cache:/opt/api-venv
      -w /workspace
    )
    COVERAGE_CMD='source /opt/api-venv/bin/activate && python3 scripts/generate_monitoring.py --check'
    if docker run "${COVERAGE_DOCKER_ARGS[@]}" "${{ env.CI_IMAGE }}" bash -lc "${COVERAGE_CMD}"; then
      echo "coverage_status=pass" >> $GITHUB_OUTPUT
    else
      echo "coverage_status=fail" >> $GITHUB_OUTPUT
      exit 1
    fi
|
||
|
||
- name: AwoooP Source Correlation Applied-Link Smoke
  id: source_correlation_apply_smoke
  run: |
    # Re-use the canary ids published by the alert-chain smoke step and run
    # the applied-link smoke script inside the CI image. The result is
    # published as `source_correlation_apply_status`; a failure fails the step.
    export SOURCE_LINK_CANARY_WORK_ITEM_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_work_item_id }}"
    export SOURCE_LINK_CANARY_EVENT_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_event_id }}"

    # The two `-e NAME` entries carry no value: docker copies them from this
    # shell's environment.
    SOURCE_LINK_DOCKER_ARGS=(
      --rm
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-source-link-smoke"
      --cpus "0.5"
      --memory "512m"
      -v "$PWD:/workspace"
      -v awoooi-api-venv-cache:/opt/api-venv
      -w /workspace
      -e SOURCE_LINK_CANARY_WORK_ITEM_ID
      -e SOURCE_LINK_CANARY_EVENT_ID
    )
    # Single-quoted so the ${SOURCE_LINK_CANARY_*} references are expanded
    # inside the container, not by this shell.
    SOURCE_LINK_CMD='set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/awooop_source_correlation_apply_smoke.py \
      --api-url ${{ env.ALERT_CHAIN_API_URL }} \
      --target-incident-id INC-20260505-25E744 \
      --work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \
      --expected-source-event-provider-event-id "${SOURCE_LINK_CANARY_EVENT_ID}" \
      --allow-existing-apply \
      --refresh-if-stale-days 6 \
      --refresh-work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \
      --verify-refresh-candidate \
      --reviewer-id gitea_cd_source_link_canary \
      --operator-note "CD dedicated source-link canary; append-only status-chain proof" \
      | tee /tmp/source_correlation_apply_smoke.json'

    if docker run "${SOURCE_LINK_DOCKER_ARGS[@]}" "${{ env.CI_IMAGE }}" bash -lc "${SOURCE_LINK_CMD}"; then
      echo "source_correlation_apply_status=pass" >> $GITHUB_OUTPUT
    else
      echo "source_correlation_apply_status=fail" >> $GITHUB_OUTPUT
      exit 1
    fi
|
||
|
||
# [Chief architect] Playwright E2E smoke test step added, v1.0.0, 2026-04-01 (Taipei time).
# continue-on-error: true — a smoke failure does not block the deployment, but
# the result is reflected in the TG notification.
- name: E2E Smoke Test
  id: smoke
  continue-on-error: true
  env:
    CI: "true"
    # Test the already-deployed production environment directly; no local dev
    # server is started.
    PLAYWRIGHT_BASE_URL: "https://awoooi.wooo.work"
  run: |
    # Write the script that runs inside the CI container. The heredoc delimiter
    # is quoted, so nothing in the script body is expanded on the host.
    cat > /tmp/awoooi-smoke.sh <<'CI_SCRIPT'
    # Chief-architect review I4 + 2026-04-05 Claude Code cache optimisation:
    # playwright.config.ts imports @playwright/test, so the pnpm node_modules
    # must be installed first.
    # The pnpm store is persisted at /opt/pnpm-store; when the pnpm-lock.yaml
    # hash is unchanged the install runs with --prefer-offline.
    cleanup_smoke_workspace_artifacts() {
      # 2026-05-19 Codex: pnpm creates a symlink-heavy node_modules tree
      # inside the bind-mounted checkout. Remove it before act-runner's
      # post-job cleanup so successful smoke jobs do not end with
      # errSymlink cleanup noise.
      rm -rf /workspace/node_modules \
        /workspace/apps/web/node_modules \
        /workspace/apps/web/tests/e2e/.auth \
        /workspace/apps/web/test-results \
        /workspace/apps/web/playwright-report \
        2>/dev/null || true
      find /workspace/apps /workspace/packages \
        -mindepth 2 -maxdepth 2 -type d -name node_modules -prune -exec rm -rf {} + \
        2>/dev/null || true
    }
    trap cleanup_smoke_workspace_artifacts EXIT

    PNPM_STORE=/opt/pnpm-store
    PNPM_HASH_FILE=/opt/pnpm-store/.lock_hash
    CURRENT_PNPM_HASH=$(md5sum pnpm-lock.yaml | awk '{print $1}')

    corepack enable 2>/dev/null || npm install -g pnpm@9 -q
    pnpm config set store-dir $PNPM_STORE

    if [ "$(cat $PNPM_HASH_FILE 2>/dev/null)" != "$CURRENT_PNPM_HASH" ]; then
      echo "📦 pnpm lock 已變更,重裝 node_modules..."
      pnpm install --frozen-lockfile 2>&1 | tail -5
      echo "$CURRENT_PNPM_HASH" > $PNPM_HASH_FILE
    else
      echo "⚡ 使用快取 pnpm store (lock 未變更),prefer-offline..."
      pnpm install --frozen-lockfile --prefer-offline 2>&1 | tail -5
    fi

    cd apps/web
    # Playwright Chromium is persisted at /opt/playwright-browsers, guarded by
    # a version marker file.
    export PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
    PLAYWRIGHT_VER=$(node -e "console.log(require('./package.json').devDependencies['@playwright/test'] || '')" 2>/dev/null || echo "unknown")
    PLAYWRIGHT_HASH_FILE=/opt/playwright-browsers/.version_hash
    if [ "$(cat $PLAYWRIGHT_HASH_FILE 2>/dev/null)" != "$PLAYWRIGHT_VER" ]; then
      echo "📦 Playwright 版本變更 ($PLAYWRIGHT_VER),重裝 Chromium..."
      # Record the version marker only after a successful install, so a failed
      # download is retried on the next run instead of being treated as a
      # cache hit. Output goes through a log file because piping straight
      # into `tail` would hide the installer's exit status.
      if npx playwright install chromium --with-deps > /tmp/playwright-install.log 2>&1; then
        tail -5 /tmp/playwright-install.log
        echo "$PLAYWRIGHT_VER" > $PLAYWRIGHT_HASH_FILE
      else
        tail -40 /tmp/playwright-install.log
        echo "Playwright Chromium install failed; version marker not recorded so the next run retries"
      fi
    else
      echo "⚡ 使用快取 Playwright Chromium ($PLAYWRIGHT_VER)"
    fi
    # Even on a browser-cache hit, confirm the OS shared libraries exist;
    # otherwise the smoke test only exercises a Chromium launch failure
    # (for example, libnspr4.so missing).
    if ! ldconfig -p 2>/dev/null | grep -q 'libnspr4'; then
      echo "📦 Playwright system deps missing,補安裝 Chromium deps..."
      npx playwright install-deps chromium > /tmp/playwright-install-deps.log 2>&1 || {
        tail -40 /tmp/playwright-install-deps.log
        exit 1
      }
      tail -20 /tmp/playwright-install-deps.log
    fi
    # Run the smoke test against the already-deployed production environment.
    SMOKE_STATUS=pass
    npx playwright test tests/e2e/smoke.spec.ts --reporter=line || SMOKE_STATUS=fail
    echo "smoke_status=${SMOKE_STATUS}" >> $GITHUB_OUTPUT
    CI_SCRIPT

    # Scratch file inside the checkout that the container writes its
    # smoke_status line to (it is handed to the container as GITHUB_OUTPUT).
    SMOKE_OUTPUT="$PWD/.awoooi-smoke-output"
    rm -f "$SMOKE_OUTPUT"
    touch "$SMOKE_OUTPUT"
    chmod 666 "$SMOKE_OUTPUT"
    SMOKE_DOCKER_STATUS=0

    # Build the container command once in the positional parameters (POSIX sh
    # safe, no arrays) so the timeout and no-timeout paths cannot drift apart.
    set -- docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-e2e-smoke" \
      --cpus "1.5" \
      --memory "2g" \
      -v "$PWD:/workspace" \
      -v /tmp/awoooi-smoke.sh:/tmp/awoooi-smoke.sh:ro \
      -v awoooi-pnpm-store:/opt/pnpm-store \
      -v awoooi-playwright-browsers:/opt/playwright-browsers \
      -w /workspace \
      -e GITHUB_OUTPUT=/workspace/.awoooi-smoke-output \
      -e CI=true \
      -e PLAYWRIGHT_BASE_URL=https://awoooi.wooo.work \
      "${{ env.CI_IMAGE }}" \
      bash /tmp/awoooi-smoke.sh

    # 2026-06-01 Codex: post-deploy smoke can pass, then hang in
    # runner cleanup and incorrectly mark the deploy failed. Bound only
    # the smoke container; preserve pass evidence if it was written.
    if command -v timeout >/dev/null 2>&1; then
      # 2026-06-14 Codex: act-runner host may provide BusyBox timeout,
      # which rejects GNU-only --kill-after. The short -k form works
      # with BusyBox and GNU timeout.
      set -- timeout -k 20s 300s "$@"
    fi
    "$@" || SMOKE_DOCKER_STATUS=$?

    if [ "$SMOKE_DOCKER_STATUS" != "0" ] && ! grep -q '^smoke_status=pass$' "$SMOKE_OUTPUT"; then
      echo "smoke_status=fail" > "$SMOKE_OUTPUT"
      # Publish the failure before exiting; otherwise the step output would
      # stay empty because the final append below is never reached.
      cat "$SMOKE_OUTPUT" >> "$GITHUB_OUTPUT"
      echo "E2E smoke container failed before pass evidence: ${SMOKE_DOCKER_STATUS}"
      exit "$SMOKE_DOCKER_STATUS"
    fi
    if [ "$SMOKE_DOCKER_STATUS" != "0" ]; then
      echo "E2E smoke pass evidence was written; treating container exit ${SMOKE_DOCKER_STATUS} as cleanup timeout"
    fi
    cat "$SMOKE_OUTPUT" >> "$GITHUB_OUTPUT"
|
||
- name: Notify Health Check Success
  env:
    SMOKE_RESULT: ${{ steps.smoke.outputs.smoke_status == 'pass' && '✅' || '⚠️' }}
    ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outputs.alert_chain_status == 'pass' && '✅' || '⚠️' }}
    MONITORING_RESULT: ${{ steps.monitoring_coverage.outputs.coverage_status == 'pass' && '✅' || '⚠️' }}
    SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outputs.source_correlation_apply_status == 'pass' && '✅' || '⚠️' }}
    # Commit metadata is handed over through the environment rather than being
    # templated into the script text, so quotes, `$` or backticks in a commit
    # message cannot break or inject into the shell.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
  run: |
    END_TIME=$(date +%s)
    DURATION=$((END_TIME - ${{ steps.commit.outputs.start_time }}))
    # Do not name these MINUTES/SECONDS: SECONDS is a bash special variable
    # that keeps counting after it is assigned.
    ELAPSED_MIN=$((DURATION / 60))
    ELAPSED_SEC=$((DURATION % 60))
    # 2026-04-05 ogt: the message must be assembled in the shell so the
    # elapsed-time values can be substituted.
    # 2026-04-05 ogt: parse_mode=HTML was removed to avoid HTTP 400 when the
    # commit message contains special characters.
    # Values are inserted with %s, so backslashes in the commit message are
    # sent literally; only the \n in the format string become newlines.
    TG_MSG=$(printf '✅ AWOOOI 部署完成\n├ 📝 %s\n├ 🔖 %s\n├ ⏱️ 耗時: %sm %ss\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n├ 🔗 Alert Chain: %s\n├ 🧷 Source Link: %s\n├ 📊 Monitoring: %s\n└ 🎭 Smoke: %s' \
      "${COMMIT_MSG}" "${SHORT_SHA}" "${ELAPSED_MIN}" "${ELAPSED_SEC}" \
      "${ALERT_CHAIN_RESULT}" "${SOURCE_LINK_RESULT}" "${MONITORING_RESULT}" "${SMOKE_RESULT}")
    # Preferred path: the notify script; if it fails, fall back to posting to
    # the Telegram Bot API directly (best-effort, never fails the step).
    if AWOOI_CICD_STATUS=success \
      AWOOI_CICD_STAGE=post-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署完成" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_DURATION_SECONDS="${DURATION}" \
      AWOOI_CICD_SUMMARY="API=✅; Web=✅; AlertChain=${ALERT_CHAIN_RESULT}; SourceLink=${SOURCE_LINK_RESULT}; Monitoring=${MONITORING_RESULT}; Smoke=${SMOKE_RESULT}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD success notification mirrored through AWOOI API"
    else
      printf '%s' "$TG_MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        --data-urlencode "text@-" || echo "TG notify warning (non-fatal)"
    fi
|
||
- name: Notify Pipeline Failure
  # 2026-04-16 ogt + Claude Sonnet 4.6: switched to an HTML-structured message format.
  if: failure()
  env:
    # Passed through the environment rather than templated into the script
    # text, so quotes, `$` or backticks in a commit message cannot break or
    # inject into the shell.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    ACTOR: ${{ github.actor }}
  run: |
    # Escape &, < and > for Telegram parse_mode=HTML. `&` must be handled
    # first, and `\&` in a sed replacement is a literal ampersand.
    COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🩺 Stage: post-deploy-checks\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    # Preferred path: the notify script; if it fails, fall back to posting to
    # the Telegram Bot API directly (best-effort, never fails the step).
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=post-deploy-checks \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD post-deploy failure notification mirrored through AWOOI API"
    else
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
|
||
- name: Clean Post-Deploy Workspace Artifacts
|
||
if: always()
|
||
env:
|
||
HOST_RUNNER_CLEANUP_IMAGE: ${{ env.CI_IMAGE }}
|
||
run: bash scripts/ci/cleanup-host-runner-workspace.sh
|