Files
awoooi/.gitea/workflows/cd.yaml
Your Name 8f4cb76db7
All checks were successful
Code Review / ai-code-review (push) Successful in 14s
fix(cd): support BusyBox timeout in smoke
2026-06-14 10:32:07 +08:00

1577 lines
82 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =============================================================================
# AWOOOI CD Pipeline (Gitea Actions - 方案 B)
# =============================================================================
# 流程: Build → Push to Harbor → Deploy to K8s
# 加速措施:
# 1. Docker Layer Cache → Harbor registry cache
# 2. 內部 Mirror → 192.168.0.110:5001 (Harbor Proxy Cache for DockerHub)
# 2026-03-29 Claude Code (ADR-039) - Retry after creating Harbor project
name: CD Pipeline
on:
  push:
    branches:
      - main
    paths:
      # Only code that actually affects a deployment triggers CD.
      - "apps/**"
      - "k8s/**"
      - ".dockerignore"
      # The Dockerfile COPYs scripts/ into the API image, so the production
      # ops seed scripts below are coupled to deploys rather than repo-only.
      - "scripts/backup/backup-momo-188-pg.sh"
      - "scripts/ops/notify-awoooi-ops.sh"
      - "scripts/ops/awooop-seed-auto-repair-canary-playbook.py"
      # Workflow-only changes do not rebuild runtime images. Use
      # workflow_dispatch when an operator explicitly wants to exercise the
      # CD pipeline itself.
      # docs/, memory/, ADRs and similar paths do not trigger CD.
      # ops/monitoring/alerts-unified.yml is handled separately by
      # deploy-alerts.yaml (I3).
  # Manual trigger is always available (re-runs, emergency deploys).
  workflow_dispatch:
# 2026-04-02 Claude Code: preemptive mode - a new push immediately cancels the
# older build so that only the latest commit is deployed.
# How: the concurrency group allows a single run per ref at a time, and
# cancel-in-progress: true lets the newest run replace the one in flight.
# Why: rapid consecutive pushes no longer pile up in a queue, and a hung
# docker build cannot block later deploys.
# Safety (per the original note): the deploy step itself is protected by
# kubectl rollout status, so a half-deployed state is not expected.
concurrency:
  group: "cd-deploy-${{ github.ref }}"
  cancel-in-progress: true
env:
  HARBOR: "192.168.0.110:5000"
  SRE_GROUP_CHAT_ID: "-1003711974679"
  # Harbor proxy cache (internal mirror of DockerHub; avoids pull rate limits).
  HARBOR_MIRROR: "192.168.0.110:5001"
  # OTEL CI/CD monitoring (2026-03-31 #46c - migrated to Gitea).
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://192.168.0.188:24318"
  OTEL_SERVICE_NAME: "awoooi-cd"
  OTEL_RESOURCE_ATTRIBUTES: "service.version=${{ github.sha }},deployment.environment=production"
  CI_IMAGE: "192.168.0.110:5000/awoooi/ci-runner:act-22.04"
  # 2026-05-24 Codex: deploy through the control-plane node that is currently
  # Ready. Node 120 is NotReady/SchedulingDisabled and its SSH/API endpoints
  # are unreachable; pinning CD to it blocks secret injection before the
  # GitOps deploy.
  K8S_SSH_HOST: "192.168.0.121"
  K8S_API_SERVER: "https://192.168.0.121:6443"
  # 2026-06-01 Codex: post-deploy health/smoke probes go to the production
  # public API. The old 192.168.0.125 NodePort VIP can be absent while the
  # public route and the in-cluster service are healthy, which caused false
  # failures.
  API_HEALTH_URL: "https://awoooi.wooo.work/api/v1/health"
  ALERT_CHAIN_API_URL: "https://awoooi.wooo.work"
jobs:
tests:
# Test job: unit tests plus the B5 integration tests; build-and-deploy
# declares `needs: tests`, so it only starts after this job succeeds.
# 2026-04-30 Codex: run the tests job on the host runner and launch the
# CI image explicitly. The act-managed job container can disappear mid-test
# with Docker RWLayer=nil on the shared 110 daemon.
timeout-minutes: 30
runs-on: awoooi-host
# 2026-04-10 ogt: B5 now starts its database with a local `docker run`; the
# services: declaration was removed.
# Under the Gitea act runner the services: container name was empty, which
# made CI fail.
steps:
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: awoooi-host maps to the long-lived act-runner
  # container. After dirty reboots it may lack node/curl/git, in which case
  # actions/checkout@v4 fails before the tests can even start.
  run: |
    # Runners without apk need no bootstrap; leave successfully.
    command -v apk >/dev/null 2>&1 || exit 0
    # Install the tools the later steps rely on; a failed install fails the step.
    apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
# Check out the repository; the following steps run scripts from scripts/ci/.
- uses: actions/checkout@v4
# Runs the repository's check-gitea-step-env-secrets.js guard with node.
# NOTE(review): presumably it rejects secrets placed in step-level env - confirm against the script.
- name: Guard Workflow Secret Surfaces
run: node scripts/ci/check-gitea-step-env-secrets.js
# 2026-03-31 ogt: improved alert formatting for readability.
# Publishes three step outputs used by the notification steps:
#   short_sha  - first 7 characters of GITHUB_SHA
#   message    - commit subject, truncated to 50 characters
#   start_time - epoch seconds at which this step ran
- name: Get Commit Info
  id: commit
  run: |
    # Truncate by characters instead of bytes: `head -c 50` could cut a
    # multi-byte UTF-8 character (e.g. CJK text) in half and emit an invalid
    # byte sequence. If the C.UTF-8 locale is unavailable bash only warns and
    # the substring falls back to byte semantics, which is no worse than before.
    LC_ALL=C.UTF-8
    subject="$(git log -1 --pretty=%s)"
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=${subject:0:50}"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
- name: Notify Pipeline Start
  # 2026-04-16 ogt + Claude Sonnet 4.6: structured HTML message for readability.
  # Commit metadata is handed to the script through step-level env instead of
  # being inlined as ${{ }} expressions: an inlined commit subject containing
  # quotes, backticks or $( ) would be parsed as shell code on the runner.
  # None of these values are secrets, so step-level env is acceptable here.
  env:
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    ACTOR: ${{ github.actor }}
  run: |
    # HTML-escape the commit message so special characters cannot break the
    # Telegram HTML markup. printf is used because echo may interpret
    # backslashes or a leading -n in the subject.
    COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '🚀 <b>AWOOOI 部署開始</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n└ 👤 %s' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    # 2026-05-02 Claude Opus 4.7 + ogt: a failed notification must never block
    # the pipeline. Evidence: curl 400 broke build-and-deploy for 14
    # consecutive commits starting 5/1. Matches the non-fatal pattern used by
    # the other notification steps in this workflow.
    # Primary path: mirror the event through the AWOOI API helper script;
    # fallback: post directly to Telegram, ignoring failures.
    if AWOOI_CICD_STATUS=running \
      AWOOI_CICD_STAGE=tests \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署開始" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD start notification mirrored through AWOOI API"
    else
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
# 2026-03-31 ogt: Phase 22.0 CI tests (mocking is forbidden - feedback_no_mock_testing.md)
# 2026-04-01 ogt: persistent venv for speed - /opt/api-venv is kept across runs.
# Dependencies are reinstalled only when the pyproject.toml hash changes;
# otherwise the venv is simply activated (saves ~6-7 min).
# The step writes the test script to /tmp and runs it inside the CI image with
# the checkout mounted at /workspace and the venv on a named Docker volume.
- name: Run API Tests
  run: |
    cat > /tmp/awoooi-api-tests.sh <<'CI_SCRIPT'
    VENV=/opt/api-venv
    HASH_FILE=/opt/api-venv/.deps_hash
    CURRENT_HASH=$(md5sum apps/api/pyproject.toml | awk '{print $1}')
    # python3.11 is installed persistently at runner level; apt-get only runs
    # the first time or when the interpreter has disappeared.
    # 2026-04-05 Claude Code: apt-get is separated from the venv hash guard so
    # a dependency change does not re-run apt every time.
    # 2026-04-16 ogt + Claude Sonnet 4.6: work around apt index failures with
    # --fix-missing plus a retry.
    if ! command -v python3.11 &>/dev/null; then
      echo "📦 安裝 python3.11..."
      apt-get clean && rm -rf /var/lib/apt/lists/*
      apt-get update -q --fix-missing || apt-get update -q || true
      apt-get install -y -q python3.11-venv python3.11 || \
        (add-apt-repository ppa:deadsnakes/python -y 2>/dev/null && apt-get update -q && apt-get install -y -q python3.11-venv python3.11) || true
    else
      echo "⚡ python3.11 已安裝,跳過 apt-get"
    fi
    # Make sure a python3.11 command exists; otherwise fall back to the
    # system python3 via a symlink.
    if ! command -v python3.11 &>/dev/null; then
      echo "⚠️ python3.11 安裝失敗,使用 python3 fallback"
      ln -sf "$(which python3)" /usr/local/bin/python3.11 || true
    fi
    if [ ! -d "$VENV/bin" ] || [ "$(cat $HASH_FILE 2>/dev/null)" != "$CURRENT_HASH" ]; then
      echo "📦 deps 已變更,重建 venv..."
      # 2026-04-17 ogt: /opt/api-venv is a volume mount, so the directory
      # itself cannot be removed; empty it with find and keep the mount point.
      find "$VENV" -mindepth 1 -delete 2>/dev/null || true
      # Record the dependency hash ONLY after the whole rebuild succeeded.
      # Previously the hash was written even when the install failed, so a
      # broken venv was treated as a valid cache on every later run until
      # pyproject.toml changed again. The install runs in a subshell so a
      # failure cannot leave the script's working directory inside apps/api.
      if python3.11 -m venv "$VENV" \
        && source "$VENV/bin/activate" \
        && pip install -q uv \
        && (cd apps/api && uv pip install -q -e ".[dev]"); then
        echo "$CURRENT_HASH" > $HASH_FILE
      else
        echo "❌ venv rebuild failed; dependency hash not recorded so the next run retries"
        exit 1
      fi
    else
      echo "⚡ 使用快取 venv (deps 未變更)"
      source $VENV/bin/activate
    fi
    cd apps/api
    cleanup_pytest_workspace_cache() {
      # 2026-05-19 Codex: the CI image runs as root against a bind-mounted
      # checkout. Remove Python cache artifacts before act-runner cleanup so
      # successful jobs do not end with root-owned __pycache__ noise.
      find tests src -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true
      rm -rf .pytest_cache 2>/dev/null || true
    }
    # CI skips tests that need external services (Redis pool / Ollama - 2026-04-01 Claude Code).
    # 2026-04-05 Claude Code: exit-code fix - `| tail` swallowed a segfault
    #   (exit 139); tee + PIPESTATUS[0] captures pytest's own exit code.
    # 2026-04-05 Claude Code: --ignore=tests/integration excludes DB tests
    #   that need an asyncpg connection; the original note says integration
    #   coverage comes from the post-deploy E2E smoke test on prod K8s.
    # PYTHONFAULTHANDLER=1: print a full Python stack trace if a C extension segfaults.
    # 2026-04-05 Claude Code: test_github_webhook.py root-caused and fixed.
    #   Original problem: import src.main -> asyncpg C-ext segfault (exit 139).
    #   Fix: a minimal app that mounts only the github_webhook router and
    #   skips the DB import chain, so the test is safe to run in CI.
    # 2026-04-22 ogt: DATABASE_URL became mandatory, so unit tests need the
    #   variable for Settings validation. Per the original note, unit tests do
    #   not connect to a DB; the CI placeholder only satisfies Pydantic.
    DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
    PYTHONFAULTHANDLER=1 python3.11 -m pytest tests/ -v --tb=short -x -p no:cacheprovider \
      --ignore=tests/integration \
      --ignore=tests/test_anomaly_counter.py \
      --ignore=tests/test_global_repair_cooldown.py \
      --ignore=tests/test_redis_multisig.py \
      --ignore=tests/test_model_regression.py \
      --ignore=tests/test_prompt_validation.py \
      --ignore=tests/e2e_network_test.py \
      2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
    tail -60 /tmp/pytest-output.txt
    cleanup_pytest_workspace_cache
    exit $PYTEST_EXIT
    CI_SCRIPT
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-api-tests" \
      --cpus "2.0" \
      --memory "2g" \
      -v "$PWD:/workspace" \
      -v /tmp/awoooi-api-tests.sh:/tmp/awoooi-api-tests.sh:ro \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      "${{ env.CI_IMAGE }}" \
      bash /tmp/awoooi-api-tests.sh
# ── Integration tests B5 (2026-04-10) ────────────────────────────────────
# History of this step (kept from the original notes):
# - Initially postgres-test came from services: and was reached directly at
#   localhost:15432.
# - 2026-04-10 Claude Sonnet 4.6: schema initialised with psql against
#   localhost:15432 (docker exec could not resolve the service container name
#   inside the act runner).
# - B5: the Gitea act runner implements services: differently from GitHub
#   Actions; the service container must be reached directly, but its
#   container name may be empty.
# - 2026-04-10 ogt: replaced the services: declaration with a local docker run.
# - 2026-04-19 ogt + Claude Opus 4.7: CD failed twice in a row (runs 984/985).
#   Root cause: the act runner put ci-runner on its own user-defined network
#   while pg-test-b5 used the default bridge, so the two were isolated
#   (172.17.0.2 timeout).
#   Fix: attach pg-test-b5 to a shared network and connect by container name.
- name: Integration Tests (B5 — 真實 DB)
  run: |
    cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT'
    cd apps/api
    # Install the psql client when the image does not ship it.
    if ! command -v psql &>/dev/null; then
      apt-get install -y -q postgresql-client
    fi
    # 2026-04-19 ogt + Claude Opus 4.7 v3: create the shared network up front.
    # The earlier grep for ACT_NET found nothing in run c0f3509, fell back to
    # the default bridge, and container-name DNS stopped working. The default
    # bridge has no container-name DNS; a user-defined network is required.
    # 'b5-test-net' is created idempotently and joined by both containers.
    B5_NET="b5-test-net"
    docker network create "$B5_NET" 2>/dev/null || true
    # Attach this ci-runner container (hostname == short container id).
    # If it is already attached, docker network connect fails; ignore that.
    docker network connect "$B5_NET" "$HOSTNAME" 2>/dev/null || true
    echo "B5 shared network: $B5_NET (ci-runner hostname: $HOSTNAME)"
    # Start the test DB on the shared network, reachable as 'pg-test-b5'.
    docker rm -f pg-test-b5 2>/dev/null || true
    docker run -d --name pg-test-b5 \
      --network="$B5_NET" \
      -e POSTGRES_DB=awoooi_test \
      -e POSTGRES_USER=awoooi \
      -e POSTGRES_PASSWORD=awoooi_test_2026 \
      pgvector/pgvector:pg16
    # Wait for readiness by container name: 30 attempts x 2s = up to 60s.
    DB_READY=0
    for i in $(seq 1 30); do
      if PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi; then
        DB_READY=1
        break
      fi
      sleep 2
    done
    PYTEST_EXIT=0
    if [ "$DB_READY" -ne 1 ]; then
      # Previously a readiness timeout was ignored and the run continued into
      # psql/pytest, hiding the real cause behind connection errors.
      echo "❌ pg-test-b5 did not become ready within 60s"
      docker logs --tail 50 pg-test-b5 2>&1 || true
      PYTEST_EXIT=1
    elif ! PGPASSWORD=awoooi_test_2026 psql \
      -h pg-test-b5 -p 5432 -U awoooi -d awoooi_test \
      -f tests/integration/setup_test_schema.sql; then
      # psql exits non-zero when it cannot connect or cannot read the file;
      # running the tests without a schema would only add misleading failures.
      echo "❌ schema initialisation failed (tests/integration/setup_test_schema.sql)"
      PYTEST_EXIT=1
    else
      # Run the tests.
      # B5 strict mode (2026-04-13 ogt: restored, Break-Glass removed).
      # -m integration overrides the pyproject.toml addopts "-m 'not integration'"
      # so that the marked tests actually execute.
      # 2026-04-22 ogt: DATABASE_URL became mandatory; the import chain needs
      # it for Settings validation.
      DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
      TEST_DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
      /opt/api-venv/bin/pytest tests/integration/test_b5_core_flows.py -v --tb=short -m integration -p no:cacheprovider || PYTEST_EXIT=$?
    fi
    # Cleanup runs on every path (success, test failure, setup failure).
    docker rm -f pg-test-b5 || true
    # 2026-05-20 Codex: B5 imports shared tests helpers, so clean the whole
    # tests tree to avoid root-owned __pycache__ act-runner noise.
    find tests src -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true
    rm -rf .pytest_cache 2>/dev/null || true
    exit "${PYTEST_EXIT:-0}"
    CI_SCRIPT
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \
      --cpus "2.0" \
      --memory "2g" \
      -v "$PWD:/workspace" \
      -v /tmp/awoooi-b5-tests.sh:/tmp/awoooi-b5-tests.sh:ro \
      -v /var/run/docker.sock:/var/run/docker.sock \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      "${{ env.CI_IMAGE }}" \
      bash /tmp/awoooi-b5-tests.sh
# Runs regardless of earlier step results (if: always()) and invokes the
# repository's cleanup-host-runner-workspace.sh with the CI image exposed as
# HOST_RUNNER_CLEANUP_IMAGE.
# NOTE(review): presumably the script uses that image to remove root-owned files from the checkout - confirm against the script.
- name: Clean Test Workspace Artifacts
if: always()
env:
HOST_RUNNER_CLEANUP_IMAGE: ${{ env.CI_IMAGE }}
run: bash scripts/ci/cleanup-host-runner-workspace.sh
- name: Notify Pipeline Failure
  # 2026-04-30 Codex: tests job failure notifier; no jq dependency for host parity.
  # Runs only when an earlier step of this job failed.
  # Commit metadata arrives through step-level env rather than inlined ${{ }}
  # expressions, so a commit subject containing quotes, backticks or $( )
  # cannot be parsed as shell code. None of these values are secrets.
  if: failure()
  env:
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    ACTOR: ${{ github.actor }}
  run: |
    # HTML-escape the commit subject for the Telegram HTML message; printf is
    # used because echo may interpret backslashes or a leading -n.
    COMMIT_ESC=$(printf '%s\n' "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🧪 Stage: tests\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    # Primary path: mirror the failure through the AWOOI API helper script;
    # fallback: post directly to Telegram. Notification errors are non-fatal.
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=tests \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD tests failure notification mirrored through AWOOI API"
    else
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
build-and-deploy:
# Builds and pushes the API and Web images, then injects K8s secrets.
# Starts only after the tests job succeeded (needs: tests).
# 2026-04-30 Codex: Docker builds run on the host runner. Long docker build
# steps were killing the transient act job container with RWLayer=nil.
needs: tests
timeout-minutes: 60
runs-on: awoooi-host
steps:
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: keep the host-mode runner self-healing before
  # actions/checkout@v4 and the Telegram failure notifications run.
  run: |
    # Runners without apk need no bootstrap; leave successfully.
    command -v apk >/dev/null 2>&1 || exit 0
    # Install the tools the later steps rely on; a failed install fails the step.
    apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
# Check out the repository; the Dockerfiles and scripts/ci helpers used by the
# following steps come from this checkout.
- uses: actions/checkout@v4
# Publishes three step outputs used by the notification steps:
#   short_sha  - first 7 characters of GITHUB_SHA
#   message    - commit subject, truncated to 50 characters
#   start_time - epoch seconds at which this step ran
- name: Get Commit Info
  id: commit
  run: |
    # Truncate by characters instead of bytes: `head -c 50` could cut a
    # multi-byte UTF-8 character (e.g. CJK text) in half and emit an invalid
    # byte sequence. If the C.UTF-8 locale is unavailable bash only warns and
    # the substring falls back to byte semantics, which is no worse than before.
    LC_ALL=C.UTF-8
    subject="$(git log -1 --pretty=%s)"
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=${subject:0:50}"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
- name: Notify Build Deploy Start
  # Mirrors a "build-and-deploy started" event through the AWOOI API helper.
  # A failed notification is logged and never fails the step.
  # The commit subject and actor arrive through step-level env rather than
  # inlined ${{ }} expressions, so a subject containing quotes, backticks or
  # $( ) cannot be parsed as shell code. Neither value is a secret.
  env:
    ACTOR: ${{ github.actor }}
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
  run: |
    if AWOOI_CICD_STATUS=running \
      AWOOI_CICD_STAGE=build-and-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 建置部署開始" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build-deploy start notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD build-deploy start notification failed (non-fatal)"
    fi
- name: Login to Harbor
  # Authenticates the runner's Docker CLI against the Harbor registry so the
  # build steps can pull cache layers and push images.
  run: |
    # Both credentials are read through quoted heredocs, the same pattern the
    # secret-injection step uses. Inlining them inside double quotes let the
    # shell expand `$`, backticks, `"` and `\` in the values (Harbor robot
    # account names contain `$`), silently corrupting the login.
    HARBOR_USERNAME_VALUE="$(cat <<'AWOOOI_SECRET_HARBOR_USERNAME'
    ${{ secrets.HARBOR_USERNAME }}
    AWOOOI_SECRET_HARBOR_USERNAME
    )"
    # docker login strips the trailing newline the heredoc adds, exactly as it
    # did for the newline previously added by echo.
    docker login "${{ env.HARBOR }}" \
      -u "${HARBOR_USERNAME_VALUE}" \
      --password-stdin <<'AWOOOI_SECRET_HARBOR_PASSWORD'
    ${{ secrets.HARBOR_PASSWORD }}
    AWOOOI_SECRET_HARBOR_PASSWORD
# 2026-05-21 Codex: AWOOI workflow concurrency and the Docker network
# lock only protect AWOOI/Docker work. Other repos can still run
# host-side Next/Turbo builds on the same 110 runner and starve this
# deploy. Wait for those foreign web builds before starting our image
# build; the gate is read-only and never kills another process.
# The gate logic itself lives in scripts/ci/wait-host-web-build-pressure.sh.
- name: Wait for Host Web Build Pressure
run: bash scripts/ci/wait-host-web-build-pressure.sh
# 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
# When another repo starts a heavy docker build while AWOOOI Web is still
# building, the job container can disappear and Docker reports RWLayer=nil.
# A Docker-network lock is global to the host daemon and survives container
# namespaces, unlike /tmp/flock inside the transient job container.
# Mechanism: creating the named Docker network acts as the mutex. The step
# retries up to WAIT_ATTEMPTS times with a 10s sleep between attempts and, on
# success, exports DOCKER_BUILD_LOCK through GITHUB_ENV for the release step.
- name: Acquire Docker Build Lock
run: |
LOCK_NAME="awoooi-cd-docker-build-lock"
# Locks older than STALE_SECONDS are removed unconditionally; locks older
# than EMPTY_LOCK_SECONDS are removed when no docker build/push is running.
STALE_SECONDS=7200
EMPTY_LOCK_SECONDS=300
WAIT_ATTEMPTS=180
for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
# Network creation succeeds only if the name is free: that is the acquire.
if docker network create \
--label awoooi.ci-lock=docker-build \
--label awoooi.owner=cd-pipeline \
"$LOCK_NAME" >/dev/null 2>&1; then
echo "DOCKER_BUILD_LOCK=${LOCK_NAME}" >> "$GITHUB_ENV"
echo "✅ Docker build lock acquired: ${LOCK_NAME}"
exit 0
fi
# Lock is held: read its creation time to decide whether it is abandoned.
CREATED_AT=$(docker network inspect "$LOCK_NAME" \
--format '{{.Created}}' 2>/dev/null || true)
if [ -n "$CREATED_AT" ]; then
# 2026-05-03 ogt: fix stale detection - Docker returns "2006-01-02 15:04:05.999999999 -0700 MST"
# date -d rejects the nanosecond fraction and the trailing timezone abbreviation (CST/MST, ...), which left CREATED_EPOCH=0 so stale detection never fired.
# Fix: strip the fraction (.NNN...) and the trailing abbreviation (space + capital letters) with sed so GNU date can parse it.
CREATED_CLEAN=$(echo "$CREATED_AT" | sed 's/\.[0-9]*//' | sed 's/ [A-Z][A-Z]*$//')
# Fallback chain: date -d, then python3 parsing the raw value, then 0
# (0 disables both removal branches below for this attempt).
CREATED_EPOCH=$(date -d "$CREATED_CLEAN" +%s 2>/dev/null || \
python3 -c "import sys, datetime, re; ts = re.sub(r'\\.\d+', '', sys.argv[1]); ts = re.sub(r'\\s+[A-Z]{2,4}$', '', ts.strip()); print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))" \
"$CREATED_AT" 2>/dev/null || echo 0)
NOW_EPOCH=$(date +%s)
LOCK_AGE=$((NOW_EPOCH - CREATED_EPOCH))
# 2026-05-05 Codex: dirty reboot / cancelled Actions can leave
# the Docker-network lock behind with no active build or push.
# Waiting the full 30m CD timeout keeps deploys queued even
# though no job is protected, so clear empty locks after 5m.
# 2026-05-12 Codex: use bracket patterns so the lock-check shell's own
# grep/awk pattern is not mistaken for active docker work, which kept empty locks from ever self-clearing.
ACTIVE_DOCKER_WORK=$(ps -eo pid,args | awk '$0 ~ /[d]ocker (build|push)|[b]uildx build/ {print}' || true)
if [ "$CREATED_EPOCH" -gt 0 ] && \
[ "$LOCK_AGE" -gt "$EMPTY_LOCK_SECONDS" ] && \
[ -z "$ACTIVE_DOCKER_WORK" ]; then
echo "⚠️ empty Docker build lock detected (age=${LOCK_AGE}s > ${EMPTY_LOCK_SECONDS}s, no active docker build/push), removing ${LOCK_NAME}"
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
# Retry immediately (no sleep) so this run can take the freed lock.
continue
fi
if [ "$CREATED_EPOCH" -gt 0 ] && \
[ "$LOCK_AGE" -gt "$STALE_SECONDS" ]; then
echo "⚠️ stale Docker build lock detected (age=${LOCK_AGE}s > ${STALE_SECONDS}s), removing ${LOCK_NAME}"
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
continue
fi
fi
echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting..."
sleep 10
done
# All attempts exhausted without acquiring the lock: fail the step.
echo "❌ timed out waiting for Docker build lock"
exit 1
# ── API image build (with layer-cache acceleration) ───────────────────────
# 2026-04-01 ogt: CACHE_BUST=<git sha> makes sure the src/ and models.json
# layers are rebuilt on every run; the deps layer (pip install) can still be
# served from cache for speed, while code/config layers are invalidated.
# 2026-05-05 Codex: host runner bootstrap installs docker-cli-buildx;
# keep BuildKit enabled because the web Dockerfile uses RUN --mount.
- name: Build and Push API
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    API_IMAGE="${{ env.HARBOR }}/awoooi/api"
    GIT_SHA="${{ github.sha }}"
    # Build once, tagged with both the commit SHA and latest; the previous
    # latest image is the inline-cache source.
    docker build \
      --file apps/api/Dockerfile \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${API_IMAGE}:latest" \
      --build-arg "CACHE_BUST=${GIT_SHA}" \
      --tag "${API_IMAGE}:${GIT_SHA}" \
      --tag "${API_IMAGE}:latest" \
      .
    # Push the immutable SHA tag first, then move latest.
    for tag in "${GIT_SHA}" latest; do
      docker push "${API_IMAGE}:${tag}"
    done
# 2026-03-31 ogt: intermediate notification removed to reduce message noise.
# ── Web image build (precise cache invalidation) ──────────────────────────
# 2026-03-30 ogt: NEXT_PUBLIC_* must use the public domain (baked in at build time).
# 2026-04-01 Claude Code: CACHE_BUST=<git sha> replaces --no-cache.
# - the deps layer (pnpm install) can still be cached -> saves ~2-3 min
# - everything from `COPY . .` down is invalidated by CACHE_BUST, so business
#   logic / CSRF and similar changes reliably reach the bundle
# 2026-05-05 Codex: mirror API build mode; BuildKit required for cache mounts.
- name: Build and Push Web
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    WEB_IMAGE="${{ env.HARBOR }}/awoooi/web"
    GIT_SHA="${{ github.sha }}"
    # Build once, tagged with both the commit SHA and latest; the previous
    # latest image is the inline-cache source.
    docker build \
      --file apps/web/Dockerfile \
      --build-arg NEXT_PUBLIC_API_URL=https://awoooi.wooo.work \
      --build-arg "CACHE_BUST=${GIT_SHA}" \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${WEB_IMAGE}:latest" \
      --tag "${WEB_IMAGE}:${GIT_SHA}" \
      --tag "${WEB_IMAGE}:latest" \
      .
    # Push the immutable SHA tag first, then move latest.
    for tag in "${GIT_SHA}" latest; do
      docker push "${WEB_IMAGE}:${tag}"
    done
- name: Release Docker Build Lock
  # Runs even when a build step failed so the Docker-network lock recorded in
  # DOCKER_BUILD_LOCK by the acquire step is not left behind.
  if: always()
  run: |
    # Nothing was acquired (or the acquire step never ran): report and stop.
    if [ -z "${DOCKER_BUILD_LOCK:-}" ]; then
      echo "⚡ no Docker build lock to release"
      exit 0
    fi
    # Removal is best-effort; a missing network is not an error here.
    docker network rm "$DOCKER_BUILD_LOCK" >/dev/null 2>&1 || true
    echo "✅ Docker build lock released: ${DOCKER_BUILD_LOCK}"
# 2026-03-31 ogt: 移除中間通知
# 2026-03-31 ogt: P0-1 Secrets 自動注入 (ADR-035 強制)
# 2026-03-31 ogt: 加入 AI API Keys (修復 mock_fallback 問題)
- name: Inject K8s Secrets
run: |
# 2026-05-18 Codex: 不把 secrets 放進 step-level env。
# Gitea/act_runner 的 job log 可能展開 env這裡只在 shell 內短暫轉
# base64並避免輸出原值。
secret_b64() {
if command -v python3.11 >/dev/null 2>&1; then
python3.11 -c 'import base64, sys; data=sys.stdin.buffer.read(); data=data[:-1] if data.endswith(b"\n") else data; sys.stdout.write(base64.b64encode(data).decode())'
elif command -v python3 >/dev/null 2>&1; then
python3 -c 'import base64, sys; data=sys.stdin.buffer.read(); data=data[:-1] if data.endswith(b"\n") else data; sys.stdout.write(base64.b64encode(data).decode())'
else
secret_value="$(cat)"
printf '%s' "${secret_value}" | base64 | tr -d '\n'
fi
}
write_deploy_key() {
mkdir -p "${HOME}/.ssh"
umask 077
cat > "${HOME}/.ssh/deploy_key" <<'AWOOOI_DEPLOY_KEY'
${{ secrets.DEPLOY_SSH_KEY }}
AWOOOI_DEPLOY_KEY
chmod 600 "${HOME}/.ssh/deploy_key"
}
TG_BOT_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_BOT_TOKEN'
${{ secrets.TELEGRAM_BOT_TOKEN }}
AWOOOI_SECRET_TG_BOT_TOKEN
)"
TG_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT'
${{ secrets.SRE_GROUP_CHAT_ID }}
AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT
)"
NVIDIA_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_NVIDIA_API_KEY'
${{ secrets.NVIDIA_API_KEY }}
AWOOOI_SECRET_NVIDIA_API_KEY
)"
GEMINI_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_GEMINI_API_KEY'
${{ secrets.GEMINI_API_KEY }}
AWOOOI_SECRET_GEMINI_API_KEY
)"
LANGFUSE_PUBLIC_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_LANGFUSE_PUBLIC_KEY'
${{ secrets.LANGFUSE_PUBLIC_KEY }}
AWOOOI_SECRET_LANGFUSE_PUBLIC_KEY
)"
LANGFUSE_SECRET_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_LANGFUSE_SECRET_KEY'
${{ secrets.LANGFUSE_SECRET_KEY }}
AWOOOI_SECRET_LANGFUSE_SECRET_KEY
)"
TG_USER_WHITELIST_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_USER_WHITELIST'
${{ secrets.OPENCLAW_TG_USER_WHITELIST }}
AWOOOI_SECRET_TG_USER_WHITELIST
)"
SENTRY_AUTH_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_SENTRY_AUTH_TOKEN'
${{ secrets.SENTRY_AUTH_TOKEN }}
AWOOOI_SECRET_SENTRY_AUTH_TOKEN
)"
GITEA_WEBHOOK_SECRET_B64="$(secret_b64 <<'AWOOOI_SECRET_GITEA_WEBHOOK_SECRET'
${{ secrets.AWOOOI_GITEA_WEBHOOK_SECRET }}
AWOOOI_SECRET_GITEA_WEBHOOK_SECRET
)"
ARGOCD_API_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_ARGOCD_API_TOKEN'
${{ secrets.ARGOCD_API_TOKEN }}
AWOOOI_SECRET_ARGOCD_API_TOKEN
)"
DATABASE_URL_B64="$(secret_b64 <<'AWOOOI_SECRET_DATABASE_URL'
${{ secrets.DATABASE_URL }}
AWOOOI_SECRET_DATABASE_URL
)"
MIGRATION_DATABASE_URL_B64="$(secret_b64 <<'AWOOOI_SECRET_MIGRATION_DATABASE_URL'
${{ secrets.MIGRATION_DATABASE_URL }}
AWOOOI_SECRET_MIGRATION_DATABASE_URL
)"
REDIS_URL_B64="$(secret_b64 <<'AWOOOI_SECRET_REDIS_URL'
${{ secrets.REDIS_URL }}
AWOOOI_SECRET_REDIS_URL
)"
JWT_SECRET_B64="$(secret_b64 <<'AWOOOI_SECRET_JWT_SECRET'
${{ secrets.JWT_SECRET }}
AWOOOI_SECRET_JWT_SECRET
)"
JWT_ALGORITHM_B64="$(secret_b64 <<'AWOOOI_SECRET_JWT_ALGORITHM'
${{ secrets.JWT_ALGORITHM }}
AWOOOI_SECRET_JWT_ALGORITHM
)"
WEBHOOK_HMAC_SECRET_B64="$(secret_b64 <<'AWOOOI_SECRET_WEBHOOK_HMAC_SECRET'
${{ secrets.WEBHOOK_HMAC_SECRET }}
AWOOOI_SECRET_WEBHOOK_HMAC_SECRET
)"
AWOOOP_OPERATOR_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_AWOOOP_OPERATOR_API_KEY'
${{ secrets.AWOOOP_OPERATOR_API_KEY }}
AWOOOI_SECRET_AWOOOP_OPERATOR_API_KEY
)"
SENTRY_DSN_B64="$(secret_b64 <<'AWOOOI_SECRET_SENTRY_DSN'
${{ secrets.SENTRY_DSN }}
AWOOOI_SECRET_SENTRY_DSN
)"
CLAUDE_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_CLAUDE_API_KEY'
${{ secrets.CLAUDE_API_KEY }}
AWOOOI_SECRET_CLAUDE_API_KEY
)"
GITEA_API_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_GITEA_API_TOKEN'
${{ secrets.AWOOOI_GITEA_API_TOKEN }}
AWOOOI_SECRET_GITEA_API_TOKEN
)"
NEMOTRON_BOT_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_NEMOTRON_BOT_TOKEN'
${{ secrets.NEMOTRON_BOT_TOKEN }}
AWOOOI_SECRET_NEMOTRON_BOT_TOKEN
)"
OPENCLAW_BOT_TOKEN_B64="$(secret_b64 <<'AWOOOI_SECRET_OPENCLAW_BOT_TOKEN'
${{ secrets.OPENCLAW_BOT_TOKEN }}
AWOOOI_SECRET_OPENCLAW_BOT_TOKEN
)"
SMTP_HOST_B64="$(secret_b64 <<'AWOOOI_SECRET_SMTP_HOST'
${{ secrets.SMTP_HOST }}
AWOOOI_SECRET_SMTP_HOST
)"
SRE_GROUP_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_SRE_GROUP_CHAT_ID'
${{ secrets.SRE_GROUP_CHAT_ID }}
AWOOOI_SECRET_SRE_GROUP_CHAT_ID
)"
# S1/S2: 統一命名 deploy_key改用 ssh-keyscan 與強制 host key 驗證。
write_deploy_key
# 2026-05-13 Codex: keyscan must include ED25519 explicitly. Some
# OpenSSH builds otherwise record only RSA/ECDSA, then strict deploy
# SSH fails with "No ED25519 host key is known" after image push.
# 2026-06-13 Codex: keep deploy-time host keys in a dedicated file.
# The runner user's global known_hosts is shared by cold-start and
# backup checks for 120/188; overwriting it here caused strict SSH
# recovery gates to flap after every CD run.
DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null
test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; }
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" << SECRETS
set -e
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=\${K8S_API_SERVER}"
# 注入 Telegram Secrets (ADR-035 鐵律)
# 2026-06-12 Codex: OPENCLAW_TG_CHAT_ID 僅作舊欄位相容,
# 實際值必須與 SRE_GROUP_CHAT_ID 一致,避免正式告警旁路到其他群組。
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"${TG_BOT_TOKEN_B64}"},
{"op":"add","path":"/data/OPENCLAW_TG_CHAT_ID","value":"${TG_CHAT_ID_B64}"}
]' || { echo "❌ Telegram Secrets patch 失敗 — ADR-035 鐵律"; exit 1; }
# 2026-03-31 ogt: 注入 AI API Keys (修復 NVIDIA/Gemini mock_fallback)
# 2026-04-01 Claude Code: base64 -w 0 防止長 key 換行破壞 JSON
# NVIDIA NIM (免費 tier)
if [ -n "${NVIDIA_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/NVIDIA_API_KEY","value":"${NVIDIA_API_KEY_B64}"}
]' && echo "✅ NVIDIA_API_KEY 已注入" || echo "⚠️ NVIDIA_API_KEY patch 失敗"
else
echo "⚠️ NVIDIA_API_KEY 未設定,跳過"
fi
# Gemini (備援)
if [ -n "${GEMINI_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GEMINI_API_KEY","value":"${GEMINI_API_KEY_B64}"}
]' && echo "✅ GEMINI_API_KEY 已注入" || echo "⚠️ GEMINI_API_KEY patch 失敗"
else
echo "⚠️ GEMINI_API_KEY 未設定,跳過"
fi
# 2026-04-01 Claude Code: Langfuse LLMOps keys (補齊 CD 注入,之前只有手動設定)
if [ -n "${LANGFUSE_PUBLIC_KEY_B64}" ] && [ -n "${LANGFUSE_SECRET_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/LANGFUSE_PUBLIC_KEY","value":"${LANGFUSE_PUBLIC_KEY_B64}"},
{"op":"add","path":"/data/LANGFUSE_SECRET_KEY","value":"${LANGFUSE_SECRET_KEY_B64}"}
]' && echo "✅ LANGFUSE keys 已注入" || echo "⚠️ LANGFUSE keys patch 失敗"
else
echo "⚠️ LANGFUSE_PUBLIC_KEY/SECRET_KEY 未設定,跳過 (現有 K8s secret 值維持不變)"
fi
# 2026-04-02 Claude Code: Telegram Whitelist (授權簽核用戶 ID)
if [ -n "${TG_USER_WHITELIST_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_TG_USER_WHITELIST","value":"${TG_USER_WHITELIST_B64}"}
]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
fi
# Phase O-4.1 2026-04-02: Sentry Auth Token (Wave A.1 ADR-037)
if [ -n "${SENTRY_AUTH_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"${SENTRY_AUTH_TOKEN_B64}"}
]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
else
echo "⚠️ SENTRY_AUTH_TOKEN 未設定Sentry Comment API 將跳過"
fi
# ADR-059 2026-04-05 Claude Code: Gitea Webhook Secret
if [ -n "${GITEA_WEBHOOK_SECRET_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GITEA_WEBHOOK_SECRET","value":"${GITEA_WEBHOOK_SECRET_B64}"}
]' && echo "✅ GITEA_WEBHOOK_SECRET 已注入" || echo "⚠️ GITEA_WEBHOOK_SECRET patch 失敗"
else
echo "⚠️ GITEA_WEBHOOK_SECRET 未設定Gitea Webhook 簽章驗證將在 prod 失效"
fi
# MCP Phase 3: ArgoCD API Token (2026-04-11 Claude Sonnet 4.6)
if [ -n "${ARGOCD_API_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/ARGOCD_API_TOKEN","value":"${ARGOCD_API_TOKEN_B64}"}
]' && echo "✅ ARGOCD_API_TOKEN 已注入" || echo "⚠️ ARGOCD_API_TOKEN patch 失敗"
else
echo "⚠️ ARGOCD_API_TOKEN 未設定ArgoCD MCP 將使用空 token"
fi
# ============================================================================
# ADR-090-B 2026-04-18 ogt + Claude Opus 4.7: L3-only 升級 L213 個 key
# ============================================================================
# 目的: 消滅「只存 K8s etcd 單點」的災難盲區Gitea Secret 成為正式真相來源
# 注意: 每個 block 與上方維持相同結構if guard + base64 -w 0 + json patch
# DATABASE_URL — PG 應用連線串2026-04-18 輪替)
if [ -n "${DATABASE_URL_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/DATABASE_URL","value":"${DATABASE_URL_B64}"}
]' && echo "✅ DATABASE_URL 已注入" || echo "⚠️ DATABASE_URL patch 失敗"
else
echo "⚠️ DATABASE_URL 未設定awoooi-api 將無法連 PG"
fi
# MIGRATION_DATABASE_URL — CI migration 用 awoooi_migrator 限權帳號ADR-090-B
if [ -n "${MIGRATION_DATABASE_URL_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/MIGRATION_DATABASE_URL","value":"${MIGRATION_DATABASE_URL_B64}"}
]' && echo "✅ MIGRATION_DATABASE_URL 已注入" || echo "⚠️ MIGRATION_DATABASE_URL patch 失敗"
fi
# REDIS_URL — Redis 連線6380 on 188
if [ -n "${REDIS_URL_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/REDIS_URL","value":"${REDIS_URL_B64}"}
]' && echo "✅ REDIS_URL 已注入" || echo "⚠️ REDIS_URL patch 失敗"
else
echo "⚠️ REDIS_URL 未設定"
fi
# JWT_SECRET / JWT_ALGORITHM — API 認證
if [ -n "${JWT_SECRET_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/JWT_SECRET","value":"${JWT_SECRET_B64}"}
]' && echo "✅ JWT_SECRET 已注入" || echo "⚠️ JWT_SECRET patch 失敗"
fi
if [ -n "${JWT_ALGORITHM_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/JWT_ALGORITHM","value":"${JWT_ALGORITHM_B64}"}
]' && echo "✅ JWT_ALGORITHM 已注入" || echo "⚠️ JWT_ALGORITHM patch 失敗"
fi
# WEBHOOK_HMAC_SECRET — Alertmanager webhook HMAC 簽章
if [ -n "${WEBHOOK_HMAC_SECRET_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/WEBHOOK_HMAC_SECRET","value":"${WEBHOOK_HMAC_SECRET_B64}"}
]' && echo "✅ WEBHOOK_HMAC_SECRET 已注入" || echo "⚠️ WEBHOOK_HMAC_SECRET patch 失敗"
fi
# AWOOOP_OPERATOR_API_KEY — AwoooP Operator mutation endpoints
if [ -n "${AWOOOP_OPERATOR_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/AWOOOP_OPERATOR_API_KEY","value":"${AWOOOP_OPERATOR_API_KEY_B64}"}
]' && echo "✅ AWOOOP_OPERATOR_API_KEY 已注入" || echo "⚠️ AWOOOP_OPERATOR_API_KEY patch 失敗"
fi
# SENTRY_DSN — Sentry 錯誤追蹤(不是 auth token
if [ -n "${SENTRY_DSN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SENTRY_DSN","value":"${SENTRY_DSN_B64}"}
]' && echo "✅ SENTRY_DSN 已注入" || echo "⚠️ SENTRY_DSN patch 失敗"
fi
# CLAUDE_API_KEY — Claude 備援 LLM
if [ -n "${CLAUDE_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/CLAUDE_API_KEY","value":"${CLAUDE_API_KEY_B64}"}
]' && echo "✅ CLAUDE_API_KEY 已注入" || echo "⚠️ CLAUDE_API_KEY patch 失敗"
fi
# GITEA_API_TOKEN — Gitea API Token從 AWOOOI_GITEA_API_TOKEN 映射)
if [ -n "${GITEA_API_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GITEA_API_TOKEN","value":"${GITEA_API_TOKEN_B64}"}
]' && echo "✅ GITEA_API_TOKEN 已注入" || echo "⚠️ GITEA_API_TOKEN patch 失敗"
fi
# NEMOTRON_BOT_TOKEN / OPENCLAW_BOT_TOKEN — 多 Bot 架構
if [ -n "${NEMOTRON_BOT_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/NEMOTRON_BOT_TOKEN","value":"${NEMOTRON_BOT_TOKEN_B64}"}
]' && echo "✅ NEMOTRON_BOT_TOKEN 已注入" || echo "⚠️ NEMOTRON_BOT_TOKEN patch 失敗"
fi
if [ -n "${OPENCLAW_BOT_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_BOT_TOKEN","value":"${OPENCLAW_BOT_TOKEN_B64}"}
]' && echo "✅ OPENCLAW_BOT_TOKEN 已注入" || echo "⚠️ OPENCLAW_BOT_TOKEN patch 失敗"
fi
# SMTP_HOST / SRE_GROUP_CHAT_ID
if [ -n "${SMTP_HOST_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SMTP_HOST","value":"${SMTP_HOST_B64}"}
]' && echo "✅ SMTP_HOST 已注入" || echo "⚠️ SMTP_HOST patch 失敗"
fi
if [ -n "${SRE_GROUP_CHAT_ID_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SRE_GROUP_CHAT_ID","value":"${SRE_GROUP_CHAT_ID_B64}"}
]' && echo "✅ SRE_GROUP_CHAT_ID 已注入" || echo "⚠️ SRE_GROUP_CHAT_ID patch 失敗"
fi
# 2026-04-06 Claude Code: Sprint 3 T2 — known_hosts Secret (Security Fix A1)
# 替換關閉 host key 驗證的舊做法,讓 SSH 修復路徑使用已知主機指紋。
# asyncssh reads /etc/ssh-mcp/known_hosts and requires a non-empty
# OpenSSH known_hosts file. Keep hosts unhashed so both asyncssh and
# CLI diagnostics can trust the same secret.
# 2026-05-02 ogt + Claude Sonnet 4.6: 加 4 台主機完整性檢查
# 根因partial scan如 110 timeout、其他成功會讓 [-s file] 通過、
# 後續 patch 推進缺漏的 known_hosts → asyncssh 拒所有 SSH。
# 修法scan 完用 grep -c 驗證 4 台主機都在;缺任何一台就 abort
# 不能覆蓋現有 secret防止 production SSH 自動修復路徑癱瘓。
ssh-keyscan 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/known_hosts_repair 2>/tmp/known_hosts_scan_err || true
EXPECTED_HOSTS=4
PRESENT=0
for ip in 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188; do
if grep -qE "^\${ip}[[:space:]]" /tmp/known_hosts_repair 2>/dev/null; then
PRESENT=\$((PRESENT + 1))
else
echo "⚠️ ssh-keyscan 缺主機 \${ip}"
fi
done
if [ "\$PRESENT" -eq "\$EXPECTED_HOSTS" ]; then
\$KUBECTL create secret generic awoooi-repair-known-hosts \
-n awoooi-prod \
--from-file=known_hosts=/tmp/known_hosts_repair \
--dry-run=client -o yaml | \$KUBECTL apply -f - \
&& echo "✅ awoooi-repair-known-hosts Secret 已建立/更新" \
|| echo "⚠️ awoooi-repair-known-hosts Secret 建立失敗 (非致命)"
KNOWN_HOSTS_B64=\$(base64 -w 0 /tmp/known_hosts_repair)
\$KUBECTL patch secret ssh-mcp-key -n awoooi-prod --type=merge \
-p="{\"data\":{\"known_hosts\":\"\${KNOWN_HOSTS_B64}\"}}" \
&& echo "✅ ssh-mcp-key known_hosts 已更新4 台主機完整)" \
|| echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)"
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
else
echo "❌ ssh-keyscan 只抓到 \${PRESENT}/\${EXPECTED_HOSTS} 台主機,跳過 patch保留現有 secret"
cat /tmp/known_hosts_scan_err 2>/dev/null | head -10
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
fi
echo "✅ 所有 Secrets 注入完成"
SECRETS
# 2026-04-11 Claude Sonnet 4.6 (Sprint B-3 ADR-069):
# Deploy 改為 ArgoCD GitOps 模式:更新 kustomization.yaml → git push [skip ci] → ArgoCD sync
# 舊做法 (kubectl set image) 與 ArgoCD selfHeal 衝突 — ArgoCD 會 revert 任何直接 kubectl 操作
# 新做法流程:
# 1. 更新 kustomization.yaml image tag用 kustomize edit set image
# 2. Apply ConfigMap/ServiceRegistry不含 Deployment由 ArgoCD 管)
# 3. git commit [skip ci] + push → 觸發 ArgoCD automated sync
# 4. 等待 ArgoCD sync + rollout 完成
# 5. Health Check
- name: Deploy to K8s (ArgoCD GitOps)
  run: |
    # GitOps deploy: apply the ConfigMaps over SSH, pin the new image tags in
    # k8s/awoooi-prod/kustomization.yaml, push that commit, then wait on the
    # K8s host for ArgoCD sync, the Deployment rollouts and the API health URL.
    # The step exits with the exit status of the remote wait script.

    # Write the DEPLOY_SSH_KEY secret to ~/.ssh/deploy_key with mode 600.
    # The heredoc body and terminator stay at the script's left margin so the
    # key material is written without leading whitespace.
    write_deploy_key() {
      mkdir -p "${HOME}/.ssh"
      umask 077
      cat > "${HOME}/.ssh/deploy_key" <<'AWOOOI_DEPLOY_KEY'
    ${{ secrets.DEPLOY_SSH_KEY }}
    AWOOOI_DEPLOY_KEY
      chmod 600 "${HOME}/.ssh/deploy_key"
    }
    write_deploy_key
    # 2026-05-13 Codex: mirror Inject K8s Secrets host-key handling so the
    # deploy job never reaches SSH with a known_hosts file missing ED25519.
    # 2026-06-13 Codex: use the deploy-only known_hosts file so this
    # stage cannot wipe cold-start/backup host trust for 120/188.
    DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
    # Test the scan's exit status as well as the empty-file case, so a failing
    # ssh-keyscan prints the diagnostic instead of aborting silently under
    # errexit (same guard shape as the Alert Chain Smoke Test step).
    if ! ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null \
      || ! test -s "${DEPLOY_KNOWN_HOSTS}"; then
      echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"
      exit 1
    fi
    SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
    IMAGE_TAG="${{ github.sha }}"
    HARBOR=192.168.0.110:5000

    # ─── Step 1: apply ConfigMap + Service Registry ConfigMap directly ───
    # (ArgoCD manages the Deployments; these ConfigMaps are still applied here.)
    ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
      "KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -" \
      < k8s/awoooi-prod/04-configmap.yaml
    echo "✅ ConfigMap 已更新"
    ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
      "KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -" \
      < k8s/awoooi-prod/15-service-registry-configmap.yaml
    echo "✅ Service Registry ConfigMap 已更新"

    # ─── Step 2: update the image tags in kustomization.yaml ───
    # The host runner is not guaranteed root, so kustomize is installed
    # under the user's home directory.
    export PATH="${HOME}/.local/bin:${PATH}"
    if ! command -v kustomize &>/dev/null; then
      mkdir -p "${HOME}/.local/bin"
      # -f: fail on an HTTP error instead of piping an error page into tar.
      curl -fsSL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz \
        | tar xz -C "${HOME}/.local/bin"
      chmod +x "${HOME}/.local/bin/kustomize"
    fi
    cd k8s/awoooi-prod
    kustomize edit set image \
      192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER=${HARBOR}/awoooi/api:${IMAGE_TAG}
    kustomize edit set image \
      192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER=${HARBOR}/awoooi/web:${IMAGE_TAG}
    cd ../..

    # ─── Step 3: git commit [skip ci] + push → triggers ArgoCD sync ───
    git config user.email "cd@awoooi.internal"
    git config user.name "AWOOOI CD"
    git add k8s/awoooi-prod/kustomization.yaml
    # Stays empty when nothing was pushed; the remote wait then accepts any
    # synced revision.
    DEPLOY_REVISION=""
    if git diff --cached --quiet; then
      echo "⚡ kustomization.yaml 無變化,跳過 push"
    else
      git commit -m "chore(cd): deploy ${IMAGE_TAG::7} [skip ci]"
      # Push with a token (avoids granting push rights to the SSH deploy key).
      git remote remove gitea 2>/dev/null || true
      # The remote URL embeds CD_PUSH_TOKEN. Remove the remote on every exit
      # path so the token is not left behind in .git/config on the runner.
      trap 'git remote remove gitea 2>/dev/null || true' EXIT
      git remote add gitea "http://wooo:${{ secrets.CD_PUSH_TOKEN }}@192.168.0.110:3001/wooo/awoooi.git"
      # Rebase first to avoid a non-fast-forward push (other commits may have
      # landed while CI was running).
      # 2026-04-17 ogt: -X theirs — on a kustomization.yaml conflict keep the
      # image tag of this deployment.
      git fetch gitea main
      git rebase -X theirs gitea/main
      DEPLOY_REVISION=$(git rev-parse HEAD)
      git push gitea main
      echo "✅ kustomization.yaml 已 push等待 ArgoCD sync 到 ${DEPLOY_REVISION:0:8}..."
    fi

    # ─── Step 4: wait for ArgoCD sync + rollout on the K8s host ───
    # The remote script's combined output is teed into ROLLOUT_LOG so the
    # AWOOOI_ROLLOUT_* marker lines it prints can be parsed afterwards.
    ROLLOUT_LOG="$(mktemp)"
    set +e
    ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
      "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" 2>&1 << 'ARGOCD_WAIT' | tee "$ROLLOUT_LOG"
    set -e
    K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
    KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
    # Risk messages are appended to a file so they survive the subshells
    # created by command substitution.
    RISK_FILE="$(mktemp)"
    UNKNOWN_STATUS_COUNT=0
    HEALTH_FAILURE_COUNT=0
    # Append one risk message to RISK_FILE and echo it to stderr.
    record_rollout_risk() {
      local message="$1"
      printf '%s\n' "$message" >> "$RISK_FILE"
      echo "⚠️ Rollout risk observed: $message" >&2
    }
    # EXIT handler: print the AWOOOI_ROLLOUT_RISK / AWOOOI_ROLLOUT_SUMMARY
    # marker lines that the calling script greps out of the log.
    emit_rollout_evidence() {
      if [ -s "$RISK_FILE" ]; then
        local summary
        local kubectl_count
        kubectl_count=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
        # Join the recorded lines with "; ". tr cannot do this: it maps one
        # character to one character, so a two-character separator needs awk.
        summary=$(awk 'NF { printf "%s%s", sep, $0; sep = "; " }' "$RISK_FILE" | sed 's/[[:cntrl:]]//g' | cut -c1-700)
        echo "AWOOOI_ROLLOUT_RISK=1"
        echo "AWOOOI_ROLLOUT_SUMMARY=unknown_status_count=${UNKNOWN_STATUS_COUNT}; health_failure_count=${HEALTH_FAILURE_COUNT}; kubectl_failure_count=${kubectl_count}; ${summary}"
      else
        echo "AWOOOI_ROLLOUT_RISK=0"
      fi
      rm -f "$RISK_FILE"
    }
    trap emit_rollout_evidence EXIT
    # Print one jsonpath field of the awoooi-prod ArgoCD Application.
    #   $1 = jsonpath expression, $2 = label used in the risk message.
    # On a kubectl failure prints "Unknown" and records the failure (only
    # while fewer than 3 query failures are on file); always returns 0.
    app_field() {
      local jsonpath="$1"
      local label="$2"
      local output
      local status
      local kubectl_seen
      set +e
      output=$($KUBECTL get application awoooi-prod -n argocd -o jsonpath="$jsonpath" 2>&1)
      status=$?
      set -e
      if [ "$status" -ne 0 ]; then
        kubectl_seen=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
        if [ "$kubectl_seen" -lt 3 ]; then
          record_rollout_risk "argocd_${label}_query_failed=$(echo "$output" | head -c 180)"
        fi
        printf 'Unknown'
        return 0
      fi
      printf '%s' "$output"
    }
    # Probe the API health URL once; $1 names the phase for the risk message.
    # Every non-200 result is counted, only the first three are recorded.
    probe_public_health() {
      local phase="$1"
      local http_code
      local status
      set +e
      http_code=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 3 --max-time 8 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
      status=$?
      set -e
      if [ "$status" -ne 0 ]; then
        http_code="curl_error_${status}"
      fi
      if [ "$http_code" != "200" ]; then
        HEALTH_FAILURE_COUNT=$((HEALTH_FAILURE_COUNT + 1))
        if [ "$HEALTH_FAILURE_COUNT" -le 3 ]; then
          record_rollout_risk "public_health_${phase}_http=${http_code}"
        fi
      fi
    }
    # Print up to five de-duplicated, ';'-joined lines describing Application
    # resources that are not Synced or not Healthy. Prints
    # resource_query_failed=... when kubectl fails.
    collect_argocd_resource_evidence() {
      local template
      local output
      local status
      template='{{range .status.resources}}{{if ne .status "Synced"}}{{.kind}}/{{.name}}{{if .namespace}} ns={{.namespace}}{{end}} sync={{.status}}{{if .health.status}} health={{.health.status}}{{end}}{{"\n"}}{{end}}{{if .health.status}}{{if ne .health.status "Healthy"}}{{.kind}}/{{.name}}{{if .namespace}} ns={{.namespace}}{{end}} sync={{.status}} health={{.health.status}}{{if .health.message}} msg={{.health.message}}{{end}}{{"\n"}}{{end}}{{end}}{{end}}'
      set +e
      output=$($KUBECTL get application awoooi-prod -n argocd -o "go-template=${template}" 2>&1)
      status=$?
      set -e
      if [ "$status" -ne 0 ]; then
        local output_snippet
        output_snippet=$(printf '%s' "$output" | head -c 180)
        echo "resource_query_failed=${output_snippet}"
        return 0
      fi
      echo "$output" \
        | awk 'NF && !seen[$0]++ {print}' \
        | head -5 \
        | tr '\n' ';' \
        | sed 's/[[:cntrl:]]//g; s/;*$//'
    }
    # Abort unless the Application tracks "main" and has no kustomize image
    # override. A failed query yields "Unknown" from app_field and therefore
    # also aborts here (fail closed).
    validate_argocd_source_contract() {
      local target_revision
      local image_override
      target_revision=$(app_field '{.spec.source.targetRevision}' source_target_revision)
      image_override=$(app_field '{.spec.source.kustomize.images}' source_kustomize_images)
      if [ "$target_revision" != "main" ]; then
        record_rollout_risk "argocd_source_target_revision_not_main targetRevision=$target_revision"
        echo "❌ ArgoCD source targetRevision must be main, got: $target_revision" >&2
        exit 1
      fi
      if [ -n "$image_override" ]; then
        local image_override_snippet
        image_override_snippet=$(printf '%s' "$image_override" | head -c 180)
        record_rollout_risk "argocd_source_image_override_present images=${image_override_snippet}"
        echo "❌ ArgoCD source kustomize.images override must be empty; image truth belongs in k8s/awoooi-prod/kustomization.yaml" >&2
        exit 1
      fi
    }
    # Wait for the ArgoCD Application to reach the target revision:
    # 36 polls with a 5 s sleep between them.
    # 2026-05-24 Codex: top-level Application health can stay Degraded
    # without per-resource health detail. Treat that as rollout evidence,
    # then let kubectl rollout status and API health decide pass/fail.
    echo "⏳ 等待 ArgoCD sync..."
    validate_argocd_source_contract
    $KUBECTL annotate application awoooi-prod -n argocd \
      argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
    for i in $(seq 1 36); do
      SYNC=$(app_field '{.status.sync.status}' sync)
      HEALTH=$(app_field '{.status.health.status}' health)
      REVISION=$(app_field '{.status.sync.revision}' revision)
      SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
      SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
      echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
      probe_public_health "argocd_wait"
      if [ "$SYNC" = "Unknown" ] || [ "$HEALTH" = "Unknown" ] || [ "$REVISION" = "Unknown" ]; then
        UNKNOWN_STATUS_COUNT=$((UNKNOWN_STATUS_COUNT + 1))
        if [ "$UNKNOWN_STATUS_COUNT" -le 3 ]; then
          record_rollout_risk "argocd_status_unknown sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
        fi
      fi
      if [ "$SYNC" = "Synced" ]; then
        # An empty EXPECTED_REVISION (nothing was pushed) accepts any revision.
        if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then
          if [ "$HEALTH" != "Healthy" ]; then
            RESOURCE_EVIDENCE=$(collect_argocd_resource_evidence)
            if [ -n "$RESOURCE_EVIDENCE" ]; then
              record_rollout_risk "argocd_health_not_healthy health=$HEALTH revision=$SHORT_REVISION resources=$RESOURCE_EVIDENCE"
            else
              record_rollout_risk "argocd_health_not_healthy health=$HEALTH revision=$SHORT_REVISION resources=none_visible"
            fi
          fi
          echo "✅ ArgoCD Synced to target revision (health=$HEALTH)"
          break
        fi
      fi
      if [ "$i" = "36" ]; then
        echo "❌ ArgoCD 未在期限內同步到目標 revision"
        exit 1
      fi
      sleep 5
    done
    # Confirm each Deployment rollout finished.
    $KUBECTL rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s
    $KUBECTL rollout status deployment/awoooi-web -n awoooi-prod --timeout=120s
    $KUBECTL rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s
    echo "✅ 部署完成"
    # Final health check: up to three attempts against the API health URL.
    HEALTH_PASS=0
    for i in 1 2 3; do
      set +e
      HTTP_CODE=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 10 --max-time 20 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
      CURL_STATUS=$?
      set -e
      if [ "$CURL_STATUS" -ne 0 ]; then
        HTTP_CODE="curl_error_${CURL_STATUS}"
      fi
      if [ "$HTTP_CODE" = "200" ]; then
        echo "✅ API 健康檢查通過"
        HEALTH_PASS=1
        break
      fi
      echo "⏳ 嘗試 #$i: HTTP $HTTP_CODE等待 10s..."
      sleep 10
    done
    if [ "$HEALTH_PASS" = "0" ]; then
      record_rollout_risk "public_health_final_failed"
      echo "❌ API 健康檢查失敗"
      exit 1
    fi
    ARGOCD_WAIT
    # PIPESTATUS[0] is the ssh exit status, not tee's.
    ROLLOUT_EXIT=${PIPESTATUS[0]}
    set -e
    ROLLOUT_RISK="0"
    ROLLOUT_SUMMARY=""
    if grep -q '^AWOOOI_ROLLOUT_RISK=1$' "$ROLLOUT_LOG"; then
      ROLLOUT_RISK="1"
      ROLLOUT_SUMMARY=$(grep '^AWOOOI_ROLLOUT_SUMMARY=' "$ROLLOUT_LOG" | tail -1 | sed 's/^AWOOOI_ROLLOUT_SUMMARY=//' | cut -c1-700)
    fi
    # Export the result so later steps can read AWOOI_ROLLOUT_SUMMARY
    # (the failure notifier uses it).
    if [ -n "${GITHUB_ENV:-}" ]; then
      {
        echo "AWOOI_ROLLOUT_RISK=${ROLLOUT_RISK}"
        echo "AWOOI_ROLLOUT_SUMMARY=${ROLLOUT_SUMMARY}"
      } >> "$GITHUB_ENV"
    fi
    rm -f "$ROLLOUT_LOG"
    # A rollout that succeeded but recorded risk evidence is reported as
    # "pending"; a failing notification does not fail the step.
    if [ "$ROLLOUT_EXIT" -eq 0 ] && [ "$ROLLOUT_RISK" = "1" ]; then
      ACTOR="${GITHUB_ACTOR:-${{ github.actor }}}"
      if AWOOI_CICD_STATUS=pending \
        AWOOI_CICD_STAGE=rollout-risk \
        AWOOI_CICD_JOB_NAME="AWOOOI 部署完成但仍有風險證據" \
        AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
        AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
        AWOOI_CICD_SUMMARY="${ROLLOUT_SUMMARY}" \
        scripts/ci/notify-awoooi-cicd.sh; then
        echo "✅ CI/CD rollout risk notification mirrored through AWOOI API"
      else
        echo "⚠️ CI/CD rollout risk notification failed (non-fatal)"
      fi
    fi
    exit "$ROLLOUT_EXIT"
- name: Notify Build Deploy Success
  run: |
    # Report a successful build + deploy through the notify script.
    # A failing notification is logged and does not fail the job.
    END_TIME=$(date +%s)
    START_TIME="${{ steps.commit.outputs.start_time }}"
    # If the commit step produced no (or a non-numeric) start_time, report a
    # zero duration instead of aborting on an arithmetic syntax error.
    case "${START_TIME}" in
      ''|*[!0-9]*) START_TIME="${END_TIME}" ;;
    esac
    DURATION=$((END_TIME - START_TIME))
    ACTOR="${{ github.actor }}"
    if AWOOI_CICD_STATUS=success \
      AWOOI_CICD_STAGE=build-and-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 建置部署完成" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_DURATION_SECONDS="${DURATION}" \
      AWOOI_CICD_SUMMARY="Image build/push + ArgoCD rollout + API health passed" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build-deploy success notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD build-deploy success notification failed (non-fatal)"
    fi
# 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
# 188 deploy key is rotated and must not be read by this disabled step.
# 腳本: docker-health-monitor.sh + pg-backup.sh + notify-awoooi-ops.sh
# 感知層與備份通知都先走 AWOOI API/AwoooP；Telegram 直發只保留為 API 離線時的 fallback。
- name: Sync Ops Scripts to 188
  # 2026-05-13 Codex T14e/P0:
  # This step is switched off until the 188 ops sync path moves to a
  # file-secret or Ansible-controlled channel. Gitea Actions logs step env
  # values, so multiline SSH secrets must not pass through CD logs.
  # The condition below is constant-false; the run line is a placeholder.
  continue-on-error: true
  if: ${{ false }}
  run: echo "188 ops script sync disabled pending secure key rotation path"
- name: Notify Pipeline Failure
  if: failure()
  env:
    # Step outputs reach the script through the environment instead of being
    # pasted into the script text. A commit subject containing backticks,
    # $(...) or quotes would otherwise be parsed as shell code.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
  run: |
    # Report the failed job through the notify script; if that fails, fall
    # back to posting directly to the Telegram Bot API.
    ACTOR="${{ github.actor }}"
    # Prefer the rollout summary exported by the deploy step when present.
    FAILURE_SUMMARY="${AWOOI_ROLLOUT_SUMMARY:-${COMMIT_MSG}}"
    # Escape &, < and > because the Telegram message uses parse_mode=HTML.
    COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=build-and-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${FAILURE_SUMMARY}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build failure notification mirrored through AWOOI API"
    else
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
post-deploy-checks:
needs: build-and-deploy
timeout-minutes: 30
# 2026-04-30 Codex: keep post-deploy on the host runner too. Playwright
# install-deps can also kill the act-managed job container with RWLayer=nil.
runs-on: awoooi-host
steps:
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: post-deploy also uses checkout and curl-based
  # notifications, so it needs the same runner bootstrap as earlier jobs.
  run: |
    # Nothing to install when the runner has no apk package manager.
    command -v apk >/dev/null 2>&1 || exit 0
    apk add --no-cache \
      nodejs npm git curl bash openssh-client \
      docker-cli docker-cli-buildx
# Check out the repository; later steps in this job run files under scripts/
# and read the latest commit with git log.
- uses: actions/checkout@v4
- name: Get Commit Info
  id: commit
  run: |
    # Publish short_sha, message (first 50 bytes of the commit subject) and
    # start_time (epoch seconds) as step outputs.
    # NOTE(review): head -c cuts on bytes, so a multibyte subject can be
    # truncated mid-character — confirm the consumers tolerate that.
    # One quoted redirect covers all three lines, so the output path is not
    # subject to word splitting.
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=$(git log -1 --pretty=%s | head -c 50)"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
- name: Notify Post Deploy Checks Start
  run: |
    # Announce the start of the post-deploy gates through the notify script.
    # The script runs in a subshell with its settings exported; a non-zero
    # exit is logged and does not fail the step.
    ACTOR="${{ github.actor }}"
    if (
      export AWOOI_CICD_STATUS=running
      export AWOOI_CICD_STAGE=post-deploy-checks
      export AWOOI_CICD_JOB_NAME="AWOOOI 部署後驗證開始"
      export AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}"
      export AWOOI_CICD_TRIGGERED_BY="${ACTOR}"
      export AWOOI_CICD_SUMMARY="Alert Chain / Source Link / Monitoring / Smoke gates started"
      exec scripts/ci/notify-awoooi-cicd.sh
    ); then
      echo "✅ CI/CD post-deploy start notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD post-deploy start notification failed (non-fatal)"
    fi
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
# 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter
# 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step
# 2026-04-10 ogt: 移除 continue-on-error — 告警鏈路失敗必須阻塞部署
- name: Alert Chain Smoke Test
  id: alert_chain_smoke
  run: |
    # Runs scripts/alert_chain_smoke_test.py inside CI_IMAGE. Observability
    # pod phases and the operator API key are read over SSH on the host and
    # handed to the container as environment variables.
    # Outputs: source_link_canary_work_item_id, source_link_canary_event_id,
    # alert_chain_status (pass|fail). The step fails when the smoke test fails.

    # Write the DEPLOY_SSH_KEY secret to ~/.ssh/deploy_key with mode 600.
    # The heredoc body and terminator stay at the script's left margin so the
    # key material is written without leading whitespace.
    write_deploy_key() {
      mkdir -p "${HOME}/.ssh"
      umask 077
      cat > "${HOME}/.ssh/deploy_key" <<'AWOOOI_DEPLOY_KEY'
    ${{ secrets.DEPLOY_SSH_KEY }}
    AWOOOI_DEPLOY_KEY
      chmod 600 "${HOME}/.ssh/deploy_key"
    }
    # Print the phase of every pod in the observability namespace labelled
    # app.kubernetes.io/name=$1, one per line.
    collect_observability_statuses() {
      local component="$1"
      ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" \
        "sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get pods -n observability -l app.kubernetes.io/name=${component} --no-headers -o custom-columns=STATUS:.status.phase"
    }
    # Same as above with stderr folded into the printed text. Returns 1 when
    # the query failed, so the caller can treat the text as an error message.
    capture_observability_statuses() {
      local component="$1"
      local output
      if output="$(collect_observability_statuses "${component}" 2>&1)"; then
        printf '%s' "${output}"
        return 0
      fi
      printf '%s' "${output}"
      return 1
    }
    # 2026-05-19 Codex: the smoke test runs inside CI_IMAGE, but the
    # observability pod checks need the K3s host kubectl context. Capture
    # those read-only statuses on the host and pass them into the
    # container, instead of making the container own kube credentials.
    OBSERVABILITY_PREFLIGHT_ERROR=""
    OTEL_COLLECTOR_ERROR=""
    EVENT_EXPORTER_ERROR=""
    OTEL_COLLECTOR_STATUSES=""
    EVENT_EXPORTER_STATUSES=""
    write_deploy_key
    DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
    # Defined before the keyscan branch because the operator-key fetch below
    # always uses it. If the keyscan failed, ssh must still run with the
    # pinned identity and known_hosts file rather than its defaults.
    SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
    if ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null && test -s "${DEPLOY_KNOWN_HOSTS}"; then
      if ! OTEL_COLLECTOR_STATUSES="$(capture_observability_statuses otel-collector)"; then
        OTEL_COLLECTOR_ERROR="$(printf '%s' "${OTEL_COLLECTOR_STATUSES}" | tail -1 | head -c 200)"
        OTEL_COLLECTOR_STATUSES=""
      fi
      if ! EVENT_EXPORTER_STATUSES="$(capture_observability_statuses event-exporter)"; then
        EVENT_EXPORTER_ERROR="$(printf '%s' "${EVENT_EXPORTER_STATUSES}" | tail -1 | head -c 200)"
        EVENT_EXPORTER_STATUSES=""
      fi
    else
      OBSERVABILITY_PREFLIGHT_ERROR="K8s host keyscan failed"
      OTEL_COLLECTOR_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
      EVENT_EXPORTER_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
    fi
    # Per-run canary identifiers, published as step outputs for later steps.
    SOURCE_LINK_RUN_REF="gitea-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}"
    SOURCE_LINK_CANARY_WORK_ITEM_ID="source-evidence:sentry:upstream_canary:awoooi-source-link-canary-${SOURCE_LINK_RUN_REF}"
    SOURCE_LINK_CANARY_EVENT_ID="sentry:source_correlation_linked:awoooi-source-link-canary-${SOURCE_LINK_RUN_REF}"
    echo "source_link_canary_work_item_id=${SOURCE_LINK_CANARY_WORK_ITEM_ID}" >> "$GITHUB_OUTPUT"
    echo "source_link_canary_event_id=${SOURCE_LINK_CANARY_EVENT_ID}" >> "$GITHUB_OUTPUT"
    # A failed fetch is treated like a missing key, so the explicit
    # diagnostic below is printed instead of a bare errexit abort.
    if ! AWOOOP_OPERATOR_API_KEY="$(
      ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" \
        "sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data.AWOOOP_OPERATOR_API_KEY}' | base64 -d"
    )"; then
      AWOOOP_OPERATOR_API_KEY=""
    fi
    if [ -z "${AWOOOP_OPERATOR_API_KEY}" ]; then
      echo "❌ AWOOOP_OPERATOR_API_KEY missing from production secret; source-link canary cannot run"
      exit 1
    fi
    # Exported so "docker run -e NAME" forwards it without the value
    # appearing on the command line.
    export AWOOOP_OPERATOR_API_KEY
    # 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
    # Host runner launches the CI image explicitly to avoid act RWLayer=nil.
    if docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
      --cpus "1.0" \
      --memory "1g" \
      -v "$PWD:/workspace" \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      -e AWOOOI_OTEL_COLLECTOR_STATUSES="${OTEL_COLLECTOR_STATUSES}" \
      -e AWOOOI_OTEL_COLLECTOR_ERROR="${OTEL_COLLECTOR_ERROR}" \
      -e AWOOOI_EVENT_EXPORTER_STATUSES="${EVENT_EXPORTER_STATUSES}" \
      -e AWOOOI_EVENT_EXPORTER_ERROR="${EVENT_EXPORTER_ERROR}" \
      -e AWOOOP_OPERATOR_API_KEY \
      -e AWOOOP_OPERATOR_ID="gitea-cd-post-deploy" \
      -e SOURCE_LINK_RUN_REF="${SOURCE_LINK_RUN_REF}" \
      "${{ env.CI_IMAGE }}" \
      bash -lc 'set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --source-link-canary-target-incident-id INC-20260505-25E744 --run-ref "${SOURCE_LINK_RUN_REF}" --json | tee /tmp/alert_chain_result.json'; then
      echo "alert_chain_status=pass" >> "$GITHUB_OUTPUT"
    else
      echo "alert_chain_status=fail" >> "$GITHUB_OUTPUT"
      exit 1
    fi
# Phase O-5 Wave C.2 2026-04-02 ogt: 監控覆蓋率驗證 (generate_monitoring.py --check)
# 2026-04-10 ogt: 移除 continue-on-error — 覆蓋率不足必須阻塞部署
- name: Monitoring Coverage Check
  id: monitoring_coverage
  run: |
    # Run generate_monitoring.py --check inside CI_IMAGE, publish the result
    # as the coverage_status output, and fail the step when the check fails.
    COVERAGE_RESULT=pass
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-coverage" \
      --cpus "1.0" \
      --memory "1g" \
      -v "$PWD:/workspace" \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      "${{ env.CI_IMAGE }}" \
      bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/generate_monitoring.py --check' \
      || COVERAGE_RESULT=fail
    echo "coverage_status=${COVERAGE_RESULT}" >> $GITHUB_OUTPUT
    [ "${COVERAGE_RESULT}" = "pass" ] || exit 1
- name: AwoooP Source Correlation Applied-Link Smoke
  id: source_correlation_apply_smoke
  run: |
    # Run awooop_source_correlation_apply_smoke.py inside CI_IMAGE against
    # the canary identifiers published by the alert_chain_smoke step.
    # Publishes source_correlation_apply_status (pass|fail) and fails the
    # step when the smoke script fails.
    export SOURCE_LINK_CANARY_WORK_ITEM_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_work_item_id }}"
    export SOURCE_LINK_CANARY_EVENT_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_event_id }}"
    APPLY_RESULT=pass
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-source-link-smoke" \
      --cpus "0.5" \
      --memory "512m" \
      -v "$PWD:/workspace" \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      -e SOURCE_LINK_CANARY_WORK_ITEM_ID \
      -e SOURCE_LINK_CANARY_EVENT_ID \
      "${{ env.CI_IMAGE }}" \
      bash -lc 'set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/awooop_source_correlation_apply_smoke.py \
        --api-url ${{ env.ALERT_CHAIN_API_URL }} \
        --target-incident-id INC-20260505-25E744 \
        --work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \
        --expected-source-event-provider-event-id "${SOURCE_LINK_CANARY_EVENT_ID}" \
        --allow-existing-apply \
        --refresh-if-stale-days 6 \
        --refresh-work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \
        --verify-refresh-candidate \
        --reviewer-id gitea_cd_source_link_canary \
        --operator-note "CD dedicated source-link canary; append-only status-chain proof" \
        | tee /tmp/source_correlation_apply_smoke.json' \
      || APPLY_RESULT=fail
    echo "source_correlation_apply_status=${APPLY_RESULT}" >> $GITHUB_OUTPUT
    [ "${APPLY_RESULT}" = "pass" ] || exit 1
# [Chief Architect] Playwright E2E smoke test step, v1.0.0, added 2026-04-01 (Taipei time).
# continue-on-error: true — a smoke failure does not block the deployment, but
# the result is still reflected in the Telegram notification.
- name: E2E Smoke Test
  id: smoke
  continue-on-error: true
  run: |
    # Write the script that runs inside the CI container. The quoted heredoc
    # delimiter stops the host shell from expanding anything inside it.
    cat > /tmp/awoooi-smoke.sh <<'CI_SCRIPT'
    # Chief Architect review I4 + 2026-04-05 Claude Code cache optimisation:
    # playwright.config.ts imports @playwright/test, so the pnpm node_modules
    # must be installed first. The pnpm store is persisted at /opt/pnpm-store;
    # when the pnpm-lock.yaml hash is unchanged, install with --prefer-offline.
    cleanup_smoke_workspace_artifacts() {
      # 2026-05-19 Codex: pnpm creates a symlink-heavy node_modules tree
      # inside the bind-mounted checkout. Remove it before act-runner's
      # post-job cleanup so successful smoke jobs do not end with
      # errSymlink cleanup noise.
      rm -rf /workspace/node_modules \
        /workspace/apps/web/node_modules \
        /workspace/apps/web/tests/e2e/.auth \
        /workspace/apps/web/test-results \
        /workspace/apps/web/playwright-report \
        2>/dev/null || true
      find /workspace/apps /workspace/packages \
        -mindepth 2 -maxdepth 2 -type d -name node_modules -prune -exec rm -rf {} + \
        2>/dev/null || true
    }
    trap cleanup_smoke_workspace_artifacts EXIT

    # Usage: run_logged LOG_FILE TAIL_LINES COMMAND [ARGS...]
    # Runs COMMAND with stdout+stderr captured in LOG_FILE. On success prints
    # the last TAIL_LINES lines and returns 0; on failure prints the last 40
    # lines and returns 1. Unlike `cmd | tail`, the command's own exit status
    # decides the result, so a failed install is never mistaken for success.
    run_logged() {
      _log_file=$1
      _tail_lines=$2
      shift 2
      if "$@" > "$_log_file" 2>&1; then
        tail -n "$_tail_lines" "$_log_file"
        return 0
      fi
      tail -n 40 "$_log_file"
      return 1
    }

    PNPM_STORE=/opt/pnpm-store
    PNPM_HASH_FILE=/opt/pnpm-store/.lock_hash
    CURRENT_PNPM_HASH=$(md5sum pnpm-lock.yaml | awk '{print $1}')
    corepack enable 2>/dev/null || npm install -g pnpm@9 -q
    pnpm config set store-dir "$PNPM_STORE"
    if [ "$(cat "$PNPM_HASH_FILE" 2>/dev/null)" != "$CURRENT_PNPM_HASH" ]; then
      echo "📦 pnpm lock 已變更,重裝 node_modules..."
      run_logged /tmp/pnpm-install.log 5 pnpm install --frozen-lockfile || exit 1
      # Record the hash only after a successful install, so a failed install
      # is retried on the next run instead of being treated as cached.
      echo "$CURRENT_PNPM_HASH" > "$PNPM_HASH_FILE"
    else
      echo "⚡ 使用快取 pnpm store (lock 未變更)prefer-offline..."
      run_logged /tmp/pnpm-install.log 5 pnpm install --frozen-lockfile --prefer-offline || exit 1
    fi
    cd apps/web
    # Playwright Chromium is persisted at /opt/playwright-browsers, guarded by
    # a marker file holding the @playwright/test version string.
    export PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
    PLAYWRIGHT_VER=$(node -e "console.log(require('./package.json').devDependencies['@playwright/test'] || '')" 2>/dev/null || echo "unknown")
    PLAYWRIGHT_HASH_FILE=/opt/playwright-browsers/.version_hash
    if [ "$(cat "$PLAYWRIGHT_HASH_FILE" 2>/dev/null)" != "$PLAYWRIGHT_VER" ]; then
      echo "📦 Playwright 版本變更 ($PLAYWRIGHT_VER),重裝 Chromium..."
      run_logged /tmp/playwright-install.log 5 npx playwright install chromium --with-deps || exit 1
      # Same rule as the pnpm hash: only mark the browser cache as valid once
      # the install really succeeded.
      echo "$PLAYWRIGHT_VER" > "$PLAYWRIGHT_HASH_FILE"
    else
      echo "⚡ 使用快取 Playwright Chromium ($PLAYWRIGHT_VER)"
    fi
    # Even on a browser-cache hit, confirm the OS shared libraries exist;
    # otherwise the smoke run only exercises a Chromium launch failure
    # (for example: libnspr4.so missing).
    if ! ldconfig -p 2>/dev/null | grep -q 'libnspr4'; then
      echo "📦 Playwright system deps missing補安裝 Chromium deps..."
      run_logged /tmp/playwright-install-deps.log 20 npx playwright install-deps chromium || exit 1
    fi
    # Run the smoke test against the already-deployed production environment.
    SMOKE_STATUS=pass
    npx playwright test tests/e2e/smoke.spec.ts --reporter=line || SMOKE_STATUS=fail
    echo "smoke_status=${SMOKE_STATUS}" >> "$GITHUB_OUTPUT"
    CI_SCRIPT

    # Scratch file the container writes its step output to (world-writable so
    # the container user can append to it through the bind mount).
    SMOKE_OUTPUT="$PWD/.awoooi-smoke-output"
    rm -f "$SMOKE_OUTPUT"
    touch "$SMOKE_OUTPUT"
    chmod 666 "$SMOKE_OUTPUT"
    SMOKE_CONTAINER="awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-e2e-smoke"
    SMOKE_DOCKER_STATUS=0

    # Build the docker invocation once in the positional parameters (POSIX,
    # no bash arrays needed) so both branches below run exactly the same
    # command line.
    set -- docker run --rm \
      --name "$SMOKE_CONTAINER" \
      --cpus "1.5" \
      --memory "2g" \
      -v "$PWD:/workspace" \
      -v /tmp/awoooi-smoke.sh:/tmp/awoooi-smoke.sh:ro \
      -v awoooi-pnpm-store:/opt/pnpm-store \
      -v awoooi-playwright-browsers:/opt/playwright-browsers \
      -w /workspace \
      -e GITHUB_OUTPUT=/workspace/.awoooi-smoke-output \
      -e CI=true \
      -e PLAYWRIGHT_BASE_URL=https://awoooi.wooo.work \
      "${{ env.CI_IMAGE }}" \
      bash /tmp/awoooi-smoke.sh

    # 2026-06-01 Codex: post-deploy smoke can pass, then hang in
    # runner cleanup and incorrectly mark the deploy failed. Bound only
    # the smoke container; preserve pass evidence if it was written.
    if command -v timeout >/dev/null 2>&1; then
      # 2026-06-14 Codex: act-runner host may provide BusyBox timeout,
      # which rejects GNU-only --kill-after. The short -k form works
      # with BusyBox and GNU timeout.
      timeout -k 20s 300s "$@" || SMOKE_DOCKER_STATUS=$?
    else
      "$@" || SMOKE_DOCKER_STATUS=$?
    fi

    if [ "$SMOKE_DOCKER_STATUS" != "0" ]; then
      # If the docker client was killed by the timeout, the container itself
      # may still be running with the workspace bind-mounted. Force-remove it;
      # this is a no-op (error ignored) when --rm already cleaned it up.
      docker rm -f "$SMOKE_CONTAINER" >/dev/null 2>&1 || true
    fi

    if [ "$SMOKE_DOCKER_STATUS" != "0" ] && ! grep -q '^smoke_status=pass$' "$SMOKE_OUTPUT"; then
      echo "smoke_status=fail" > "$SMOKE_OUTPUT"
      # Publish the failure as a step output before exiting so downstream
      # steps see an explicit smoke_status instead of an empty value.
      cat "$SMOKE_OUTPUT" >> "$GITHUB_OUTPUT"
      echo "E2E smoke container failed before pass evidence: ${SMOKE_DOCKER_STATUS}"
      exit "$SMOKE_DOCKER_STATUS"
    fi
    if [ "$SMOKE_DOCKER_STATUS" != "0" ]; then
      echo "E2E smoke pass evidence was written; treating container exit ${SMOKE_DOCKER_STATUS} as cleanup timeout"
    fi
    cat "$SMOKE_OUTPUT" >> "$GITHUB_OUTPUT"
  env:
    CI: "true"
    # Test the already-deployed production environment directly; do not start
    # a local dev server.
    PLAYWRIGHT_BASE_URL: "https://awoooi.wooo.work"
# Sends the "deployment finished" notification: first through the AWOOI CI/CD
# notify script, falling back to a direct Telegram sendMessage call if that
# script fails. The per-check icons are derived from earlier step outputs.
- name: Notify Health Check Success
  env:
    SMOKE_RESULT: ${{ steps.smoke.outputs.smoke_status == 'pass' && '✅' || '⚠️' }}
    ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outputs.alert_chain_status == 'pass' && '✅' || '⚠️' }}
    MONITORING_RESULT: ${{ steps.monitoring_coverage.outputs.coverage_status == 'pass' && '✅' || '⚠️' }}
    SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outputs.source_correlation_apply_status == 'pass' && '✅' || '⚠️' }}
    # Commit-derived values are passed through the environment instead of
    # being templated into the script text, so quotes, backticks or $(...)
    # in a commit message can neither break nor inject into the shell.
    CD_COMMIT_MESSAGE: "${{ steps.commit.outputs.message }}"
    CD_SHORT_SHA: "${{ steps.commit.outputs.short_sha }}"
    CD_START_TIME: "${{ steps.commit.outputs.start_time }}"
  run: |
    END_TIME=$(date +%s)
    # Fall back to "now" (duration 0) if the start time output is missing,
    # rather than failing the step with an arithmetic syntax error.
    START_TIME="${CD_START_TIME:-$END_TIME}"
    DURATION=$((END_TIME - START_TIME))
    MINUTES=$((DURATION / 60))
    # Named SECS on purpose: SECONDS is a special self-incrementing variable
    # in bash and must not be used as an ordinary variable.
    SECS=$((DURATION % 60))
    COMMIT_MSG="${CD_COMMIT_MESSAGE}"
    SHORT_SHA="${CD_SHORT_SHA}"
    # 2026-04-05 ogt: the message is assembled in the shell so that shell
    # variables such as the duration are expanded.
    # 2026-04-05 ogt: parse_mode=HTML was removed so that special characters
    # in the commit message cannot cause a Telegram 400 response.
    # The layout lives in the printf format string and every value is passed
    # as a %s argument, so backslashes inside the commit message stay literal.
    TG_MSG=$(printf '✅ AWOOOI 部署完成\n├ 📝 %s\n├ 🔖 %s\n├ ⏱️ 耗時: %sm %ss\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n├ 🔗 Alert Chain: %s\n├ 🧷 Source Link: %s\n├ 📊 Monitoring: %s\n└ 🎭 Smoke: %s' \
      "${COMMIT_MSG}" "${SHORT_SHA}" "${MINUTES}" "${SECS}" \
      "${ALERT_CHAIN_RESULT}" "${SOURCE_LINK_RESULT}" "${MONITORING_RESULT}" "${SMOKE_RESULT}")
    if AWOOI_CICD_STATUS=success \
      AWOOI_CICD_STAGE=post-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署完成" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_DURATION_SECONDS="${DURATION}" \
      AWOOI_CICD_SUMMARY="API=✅; Web=✅; AlertChain=${ALERT_CHAIN_RESULT}; SourceLink=${SOURCE_LINK_RESULT}; Monitoring=${MONITORING_RESULT}; Smoke=${SMOKE_RESULT}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD success notification mirrored through AWOOI API"
    else
      # Fallback: post straight to Telegram. The text is read from stdin
      # (text@-) and URL-encoded by curl; a failure here is non-fatal.
      printf '%s' "$TG_MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        --data-urlencode "text@-" || echo "TG notify warning (non-fatal)"
    fi
# Sends the "deployment failed" notification when an earlier step in this job
# failed: first through the AWOOI CI/CD notify script, falling back to a
# direct Telegram sendMessage call (HTML parse mode) if that script fails.
- name: Notify Pipeline Failure
  # 2026-04-16 ogt + Claude Sonnet 4.6: switched to an HTML structured format.
  if: failure()
  env:
    # Commit-derived values are passed through the environment instead of
    # being templated into the script text, so quotes, backticks or $(...)
    # in a commit message can neither break nor inject into the shell.
    CD_COMMIT_MESSAGE: "${{ steps.commit.outputs.message }}"
    CD_SHORT_SHA: "${{ steps.commit.outputs.short_sha }}"
    CD_ACTOR: "${{ github.actor }}"
  run: |
    COMMIT_MSG="${CD_COMMIT_MESSAGE}"
    SHORT_SHA="${CD_SHORT_SHA}"
    ACTOR="${CD_ACTOR}"
    # Escape the three characters that are significant to Telegram's HTML
    # parse mode. printf '%s' is used instead of echo so backslashes or a
    # leading "-n" in the input are passed through untouched.
    html_escape() {
      printf '%s' "$1" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g'
    }
    COMMIT_ESC=$(html_escape "$COMMIT_MSG")
    ACTOR_ESC=$(html_escape "$ACTOR")
    MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🩺 Stage: post-deploy-checks\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR_ESC}")
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=post-deploy-checks \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD post-deploy failure notification mirrored through AWOOI API"
    else
      # Fallback: post straight to Telegram; a failure here is non-fatal and
      # only logs curl's exit status.
      curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
        -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \
        -d "parse_mode=HTML" \
        --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
    fi
# Housekeeping step: `if: always()` makes it run regardless of the outcome of
# the earlier steps in this job.
- name: Clean Post-Deploy Workspace Artifacts
if: always()
env:
# Exposes the `env.CI_IMAGE` value to the cleanup script under this name.
# NOTE(review): presumably the script uses it as the container image for the
# cleanup work — confirm in scripts/ci/cleanup-host-runner-workspace.sh.
HOST_RUNNER_CLEANUP_IMAGE: ${{ env.CI_IMAGE }}
run: bash scripts/ci/cleanup-host-runner-workspace.sh