Files
awoooi/.gitea/workflows/cd.yaml
Your Name d2aebdd477
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
fix(cd): avoid host-key prompt during deploy
2026-05-06 07:27:57 +08:00

1014 lines
55 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =============================================================================
# AWOOOI CD Pipeline (Gitea Actions - 方案 B)
# =============================================================================
# 流程: Build → Push to Harbor → Deploy to K8s
# 加速措施:
# 1. Docker Layer Cache → Harbor registry cache
# 2. 內部 Mirror → 192.168.0.110:5001 (Harbor Proxy Cache for DockerHub)
# 2026-03-29 Claude Code (ADR-039) - Retry after creating Harbor project
name: CD Pipeline
on:
push:
branches: [main]
paths:
# 只有實際影響部署的程式碼才觸發 CD
- 'apps/**'
- 'k8s/**'
- '.dockerignore'
# Workflow-only changes do not rebuild runtime images. Use workflow_dispatch
# when an operator explicitly wants to test the CD pipeline itself.
# docs/、memory/、ADR 等不觸發
# ops/monitoring/alerts-unified.yml 由 deploy-alerts.yaml 獨立處理 (I3)
workflow_dispatch:
# 手動觸發永遠可用(用於補跑、緊急部署)
# 2026-04-02 Claude Code: 改為搶佔模式 — 新 push 立即取消舊 build只部署最新
# 原理: concurrency group 保證同時只有一個 job 跑cancel-in-progress:true 讓新的取代舊的
# 解決: 多個 commit 快速連推時不再排隊堆積,且 docker build 卡住時不會阻塞後續部署
# 安全: deploy 步驟本身有 kubectl rollout status 保護,不會出現半部署狀態
concurrency:
group: cd-deploy-${{ github.ref }}
cancel-in-progress: true
env:
HARBOR: 192.168.0.110:5000
TELEGRAM_ALERT_CHAT_ID: "-1003711974679"
# Harbor Proxy Cache (指向 DockerHub 的內部 Mirror避免拉取限額)
HARBOR_MIRROR: 192.168.0.110:5001
# OTEL CI/CD 監控 (2026-03-31 #46c - 遷移到 Gitea)
OTEL_EXPORTER_OTLP_ENDPOINT: http://192.168.0.188:24318
OTEL_SERVICE_NAME: awoooi-cd
OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=production
CI_IMAGE: 192.168.0.110:5000/awoooi/ci-runner:act-22.04
# 2026-05-06 Codex: deploy through the 120 control-plane node. After dirty
# reboots, 121 host-key prompts can block the non-interactive host runner.
# Both nodes support the sudo kubectl path, but 120 removes the extra hop.
K8S_SSH_HOST: 192.168.0.120
K8S_API_SERVER: https://192.168.0.120:6443
# 2026-05-05 Codex: health/smoke probes use the keepalived VIP instead of a
# fixed node. Kubectl still tunnels through K8S_SSH_HOST with --server=120.
API_HEALTH_URL: http://192.168.0.125:32334/api/v1/health
ALERT_CHAIN_API_URL: http://192.168.0.125:32334
jobs:
tests:
# 2026-04-30 Codex: run the tests job on the host runner and launch the
# CI image explicitly. The act-managed job container can disappear mid-test
# with Docker RWLayer=nil on the shared 110 daemon.
timeout-minutes: 30
runs-on: awoooi-host
# 2026-04-10 ogt: B5 改用 docker run 本地啟動,移除 services: 宣告
# Gitea act runner 的 services: container name 為空,導致 CI 失敗
steps:
- name: Bootstrap Host Runner Tools
# 2026-05-05 Codex: awoooi-host maps to the long-lived act-runner
# container. After dirty reboots it may not contain node/curl/git, and
# actions/checkout@v4 fails before tests can start.
run: |
if command -v apk >/dev/null 2>&1; then
apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
fi
- uses: actions/checkout@v4
# 2026-03-31 ogt: 優化告警格式 - 提高可讀性
- name: Get Commit Info
id: commit
run: |
echo "short_sha=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT
echo "message=$(git log -1 --pretty=%s | head -c 50)" >> $GITHUB_OUTPUT
echo "start_time=$(date +%s)" >> $GITHUB_OUTPUT
- name: Notify Pipeline Start
# 2026-04-16 ogt + Claude Sonnet 4.6: 改用 HTML 結構化格式,提升可讀性
run: |
COMMIT_MSG="${{ steps.commit.outputs.message }}"
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
ACTOR="${{ github.actor }}"
# HTML escape commit message防特殊字元破壞 HTML
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
MSG=$(printf '🚀 <b>AWOOOI 部署開始</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n└ 👤 %s' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
# 2026-05-02 Claude Opus 4.7 + 統帥 ogt: notify 失敗不該擋整條 CI鐵證:
# curl 400 從 5/1 起連續炸 14 個 commit 的 build-and-deploy— 對齊 line 922 既有 pattern
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
# 2026-03-31 ogt: Phase 22.0 CI 測試 (禁止 Mock - feedback_no_mock_testing.md)
# 2026-04-01 ogt: 持久化 venv 加速 - /opt/api-venv 跨 run 保留
# pyproject.toml hash 變才重裝,其餘直接 activate (節省 ~6-7 min)
- name: Run API Tests
run: |
cat > /tmp/awoooi-api-tests.sh <<'CI_SCRIPT'
VENV=/opt/api-venv
HASH_FILE=/opt/api-venv/.deps_hash
CURRENT_HASH=$(md5sum apps/api/pyproject.toml | awk '{print $1}')
# python3.11 是 runner 層級持久安裝,只在首次或版本消失時才 apt-get
# 2026-04-05 Claude Code: 分離 apt-get 與 venv hash-guard避免每次 deps 變更都重跑 apt
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 apt index 失敗 → 改用 --fix-missing + retry
if ! command -v python3.11 &>/dev/null; then
echo "📦 安裝 python3.11..."
apt-get clean && rm -rf /var/lib/apt/lists/*
apt-get update -q --fix-missing || apt-get update -q || true
apt-get install -y -q python3.11-venv python3.11 || \
(add-apt-repository ppa:deadsnakes/python -y 2>/dev/null && apt-get update -q && apt-get install -y -q python3.11-venv python3.11) || true
else
echo "⚡ python3.11 已安裝,跳過 apt-get"
fi
# 確保 python3.11 存在,否則 fallback 到系統 python3
if ! command -v python3.11 &>/dev/null; then
echo "⚠️ python3.11 安裝失敗,使用 python3 fallback"
ln -sf "$(which python3)" /usr/local/bin/python3.11 || true
fi
if [ ! -d "$VENV/bin" ] || [ "$(cat $HASH_FILE 2>/dev/null)" != "$CURRENT_HASH" ]; then
echo "📦 deps 已變更,重建 venv..."
# 2026-04-17 ogt: /opt/api-venv 是 volume mount不能 rm -rf 目錄本身
# 改用 find 清空內容,保留 mount point 目錄
find "$VENV" -mindepth 1 -delete 2>/dev/null || true
python3.11 -m venv $VENV
source $VENV/bin/activate
pip install -q uv
cd apps/api && uv pip install -q -e ".[dev]" && cd -
echo "$CURRENT_HASH" > $HASH_FILE
else
echo "⚡ 使用快取 venv (deps 未變更)"
source $VENV/bin/activate
fi
cd apps/api
# CI 排除需外部服務的測試 (Redis pool / Ollama — 2026-04-01 Claude Code)
# 2026-04-05 Claude Code: 修正 exit code — | tail 會吃掉 segfault (exit 139)
# 改用 tee + PIPESTATUS[0] 正確捕捉 pytest 本身的 exit code
# 2026-04-05 Claude Code: 加 --ignore=tests/integration 排除需 asyncpg 連線的 DB 測試
# integration tests 在 prod K8s 部署後由 E2E Smoke Test 覆蓋
# PYTHONFAULTHANDLER=1: 若 C extension segfault輸出完整 Python stacktrace
# 2026-04-05 Claude Code: test_github_webhook.py 已根治
# 原問題: import src.main → asyncpg C ext segfault (exit 139)
# 修復: 改用最小化 app只掛載 github_webhook router不走 DB import chain
# 現在可安全加入 CI 測試
# 2026-04-22 ogt: DATABASE_URL 改為必填後,單元測試需要此 env var 讓 Settings 通過驗證
# 單元測試不連 DB此 CI placeholder 僅供 Pydantic 驗證,不產生真實連線
DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
PYTHONFAULTHANDLER=1 python3.11 -m pytest tests/ -v --tb=short -x \
--ignore=tests/integration \
--ignore=tests/test_anomaly_counter.py \
--ignore=tests/test_global_repair_cooldown.py \
--ignore=tests/test_redis_multisig.py \
--ignore=tests/test_model_regression.py \
--ignore=tests/test_prompt_validation.py \
--ignore=tests/e2e_network_test.py \
2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
tail -60 /tmp/pytest-output.txt
exit $PYTEST_EXIT
CI_SCRIPT
docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-api-tests" \
--cpus "2.0" \
--memory "2g" \
-v "$PWD:/workspace" \
-v /tmp/awoooi-api-tests.sh:/tmp/awoooi-api-tests.sh:ro \
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash /tmp/awoooi-api-tests.sh
# ── 整合測試 B5 (2026-04-10) ──────────────────────────────────────────
# B5 整合測試 — postgres-test 由 services: 提供localhost:15432 直連
# 2026-04-10 Claude Sonnet 4.6: 用 psql 直連 localhost:15432 初始化 schema
# (docker exec 在 act runner 內無法取得 service container name)
# B5: Gitea act runner 的 services: 實作與 GitHub Actions 不同
# service container 啟動後需直連,但 act 的 container name 可能為空
# 2026-04-10 ogt: 改用 docker run 本地啟動取代 services: 宣告
# 2026-04-19 ogt + Claude Opus 4.7: cd 連續 2 次 fail (run 984/985)
# 真因: act runner 把 ci-runner 跑在獨立 user-defined network,
# pg-test-b5 預設用 host bridge → 兩邊隔離無法連 (172.17.0.2 timeout)
# 修法: 把 pg-test-b5 加入 act task 的 network,用 container name 連線
- name: Integration Tests (B5 — 真實 DB)
run: |
cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT'
cd apps/api
# 安裝 psql client
if ! command -v psql &>/dev/null; then
apt-get install -y -q postgresql-client
fi
# 2026-04-19 ogt + Claude Opus 4.7 v3: 主動創 shared network
# 之前 grep ACT_NET 在 c0f3509 run 沒 match → fallback bridge → container name DNS 失效
# 真因: default bridge 不支援 container name DNS,必須 user-defined network
# 修法: 主動建 'b5-test-net' (idempotent),ci-runner + pg-test-b5 都加入
B5_NET="b5-test-net"
docker network create "$B5_NET" 2>/dev/null || true
# 當前 ci-runner container (hostname == short container id) 連上此 network
# 若已連 → docker network connect 回 error 1,用 || true 吞掉
docker network connect "$B5_NET" "$HOSTNAME" 2>/dev/null || true
echo "B5 shared network: $B5_NET (ci-runner hostname: $HOSTNAME)"
# 啟動測試 DB 於 shared network,用 container name 'pg-test-b5' 連線
docker rm -f pg-test-b5 2>/dev/null || true
docker run -d --name pg-test-b5 \
--network="$B5_NET" \
-e POSTGRES_DB=awoooi_test \
-e POSTGRES_USER=awoooi \
-e POSTGRES_PASSWORD=awoooi_test_2026 \
pgvector/pgvector:pg16
# 等待就緒(用 container name,最多 60 秒)
for i in $(seq 1 30); do
PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi && break || sleep 2
done
# 初始化 schema
PGPASSWORD=awoooi_test_2026 psql \
-h pg-test-b5 -p 5432 -U awoooi -d awoooi_test \
-f tests/integration/setup_test_schema.sql
# 跑測試
# B5 整合測試嚴格模式 (2026-04-13 ogt: 恢復 Break-Glass 移除)
# -m integration: override pyproject.toml addopts "-m 'not integration'",讓標記測試可執行
# 2026-04-22 ogt: DATABASE_URL 改為必填後import chain 需要此 env var 讓 Settings 通過驗證
DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
TEST_DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
/opt/api-venv/bin/pytest tests/integration/test_b5_core_flows.py -v --tb=short -m integration
# 清理
docker rm -f pg-test-b5 || true
CI_SCRIPT
docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \
--cpus "2.0" \
--memory "2g" \
-v "$PWD:/workspace" \
-v /tmp/awoooi-b5-tests.sh:/tmp/awoooi-b5-tests.sh:ro \
-v /var/run/docker.sock:/var/run/docker.sock \
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash /tmp/awoooi-b5-tests.sh
- name: Notify Pipeline Failure
# 2026-04-30 Codex: tests job failure notifier; no jq dependency for host parity.
if: failure()
run: |
COMMIT_MSG="${{ steps.commit.outputs.message }}"
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
ACTOR="${{ github.actor }}"
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🧪 Stage: tests\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
build-and-deploy:
# 2026-04-30 Codex: Docker builds run on the host runner. Long docker build
# steps were killing the transient act job container with RWLayer=nil.
needs: tests
timeout-minutes: 60
runs-on: awoooi-host
steps:
- name: Bootstrap Host Runner Tools
# 2026-05-05 Codex: keep the host-mode runner self-healing before
# actions/checkout@v4 and Telegram failure notifications run.
run: |
if command -v apk >/dev/null 2>&1; then
apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
fi
- uses: actions/checkout@v4
- name: Get Commit Info
id: commit
run: |
echo "short_sha=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT
echo "message=$(git log -1 --pretty=%s | head -c 50)" >> $GITHUB_OUTPUT
echo "start_time=$(date +%s)" >> $GITHUB_OUTPUT
- name: Login to Harbor
run: |
echo "${{ secrets.HARBOR_PASSWORD }}" | \
docker login "${{ env.HARBOR }}" \
-u "${{ secrets.HARBOR_USERNAME }}" \
--password-stdin
# 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
# When another repo starts a heavy docker build while AWOOOI Web is still
# building, the job container can disappear and Docker reports RWLayer=nil.
# A Docker-network lock is global to the host daemon and survives container
# namespaces, unlike /tmp/flock inside the transient job container.
- name: Acquire Docker Build Lock
run: |
LOCK_NAME="awoooi-cd-docker-build-lock"
STALE_SECONDS=7200
EMPTY_LOCK_SECONDS=300
WAIT_ATTEMPTS=180
for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
if docker network create \
--label awoooi.ci-lock=docker-build \
--label awoooi.owner=cd-pipeline \
"$LOCK_NAME" >/dev/null 2>&1; then
echo "DOCKER_BUILD_LOCK=${LOCK_NAME}" >> "$GITHUB_ENV"
echo "✅ Docker build lock acquired: ${LOCK_NAME}"
exit 0
fi
CREATED_AT=$(docker network inspect "$LOCK_NAME" \
--format '{{.Created}}' 2>/dev/null || true)
if [ -n "$CREATED_AT" ]; then
# 2026-05-03 ogt: 修復 stale 偵測 — Docker 回傳 "2006-01-02 15:04:05.999999999 -0700 MST"
# date -d 不接受奈秒小數點與末尾時區縮寫CST/MST 等),導致 CREATED_EPOCH=0 → stale 永不觸發
# 修法sed 去除奈秒 (.NNN...) 和末尾縮寫 (空格+大寫字母)GNU date 才能正確解析
CREATED_CLEAN=$(echo "$CREATED_AT" | sed 's/\.[0-9]*//' | sed 's/ [A-Z][A-Z]*$//')
CREATED_EPOCH=$(date -d "$CREATED_CLEAN" +%s 2>/dev/null || \
python3 -c "import sys, datetime, re; ts = re.sub(r'\\.\d+', '', sys.argv[1]); ts = re.sub(r'\\s+[A-Z]{2,4}$', '', ts.strip()); print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))" \
"$CREATED_AT" 2>/dev/null || echo 0)
NOW_EPOCH=$(date +%s)
LOCK_AGE=$((NOW_EPOCH - CREATED_EPOCH))
# 2026-05-05 Codex: dirty reboot / cancelled Actions can leave
# the Docker-network lock behind with no active build or push.
# Waiting the full 30m CD timeout keeps deploys queued even
# though no job is protected, so clear empty locks after 5m.
ACTIVE_DOCKER_WORK=$(ps -eo args | grep -E 'docker (build|push)|buildx build' | grep -v grep || true)
if [ "$CREATED_EPOCH" -gt 0 ] && \
[ "$LOCK_AGE" -gt "$EMPTY_LOCK_SECONDS" ] && \
[ -z "$ACTIVE_DOCKER_WORK" ]; then
echo "⚠️ empty Docker build lock detected (age=${LOCK_AGE}s > ${EMPTY_LOCK_SECONDS}s, no active docker build/push), removing ${LOCK_NAME}"
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
continue
fi
if [ "$CREATED_EPOCH" -gt 0 ] && \
[ "$LOCK_AGE" -gt "$STALE_SECONDS" ]; then
echo "⚠️ stale Docker build lock detected (age=${LOCK_AGE}s > ${STALE_SECONDS}s), removing ${LOCK_NAME}"
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
continue
fi
fi
echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting..."
sleep 10
done
echo "❌ timed out waiting for Docker build lock"
exit 1
# ── API 鏡像建置(含 Layer Cache 加速)──────────────────────────────
# 2026-04-01 ogt: CACHE_BUST=git_sha 確保 src/ 和 models.json 層每次重建
# deps 層 (pip install) 仍可 cache → 加速;代碼/配置層強制失效
# 2026-05-05 Codex: host runner bootstrap installs docker-cli-buildx;
# keep BuildKit enabled because the web Dockerfile uses RUN --mount.
- name: Build and Push API
env:
DOCKER_BUILDKIT: "1"
run: |
docker build -f apps/api/Dockerfile \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--cache-from ${{ env.HARBOR }}/awoooi/api:latest \
--build-arg CACHE_BUST=${{ github.sha }} \
-t ${{ env.HARBOR }}/awoooi/api:${{ github.sha }} \
-t ${{ env.HARBOR }}/awoooi/api:latest \
.
docker push ${{ env.HARBOR }}/awoooi/api:${{ github.sha }}
docker push ${{ env.HARBOR }}/awoooi/api:latest
# 2026-03-31 ogt: 移除中間通知,減少訊息雜訊
# ── Web 鏡像建置(精準快取失效)──────────────────────────────
# 2026-03-30 ogt: NEXT_PUBLIC_* 必須用公網域名 (build-time 寫死)
# 2026-04-01 Claude Code: CACHE_BUST=git_sha 取代 --no-cache
# - deps 層 (pnpm install) 仍可 cache → 節省 ~2-3 min
# - COPY . . 以下由 CACHE_BUST 強制失效 → 業務邏輯/CSRF 等變更正確進入 bundle
# 2026-05-05 Codex: mirror API build mode; BuildKit required for cache mounts.
- name: Build and Push Web
env:
DOCKER_BUILDKIT: "1"
run: |
docker build -f apps/web/Dockerfile \
--build-arg NEXT_PUBLIC_API_URL=https://awoooi.wooo.work \
--build-arg CACHE_BUST=${{ github.sha }} \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--cache-from ${{ env.HARBOR }}/awoooi/web:latest \
-t ${{ env.HARBOR }}/awoooi/web:${{ github.sha }} \
-t ${{ env.HARBOR }}/awoooi/web:latest \
.
docker push ${{ env.HARBOR }}/awoooi/web:${{ github.sha }}
docker push ${{ env.HARBOR }}/awoooi/web:latest
- name: Release Docker Build Lock
if: always()
run: |
if [ -n "${DOCKER_BUILD_LOCK:-}" ]; then
docker network rm "$DOCKER_BUILD_LOCK" >/dev/null 2>&1 || true
echo "✅ Docker build lock released: ${DOCKER_BUILD_LOCK}"
else
echo "⚡ no Docker build lock to release"
fi
# 2026-03-31 ogt: 移除中間通知
# 2026-03-31 ogt: P0-1 Secrets 自動注入 (ADR-035 強制)
# 2026-03-31 ogt: 加入 AI API Keys (修復 mock_fallback 問題)
- name: Inject K8s Secrets
env:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TG_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
# 2026-04-01 Claude Code: Langfuse LLMOps keys (Phase 15.1 補齊 CD 注入)
LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
# 2026-04-02 Claude Code: Telegram 白名單 (授權簽核用)
TG_USER_WHITELIST: ${{ secrets.OPENCLAW_TG_USER_WHITELIST }}
# Phase O-4.1 2026-04-02: Sentry API Token (Wave A.1 ADR-037)
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
# ADR-059 2026-04-05: Gitea Webhook Secret (GITEA_ 前綴為保留字,改用 AWOOOI_ 前綴)
GITEA_WEBHOOK_SECRET: ${{ secrets.AWOOOI_GITEA_WEBHOOK_SECRET }}
# MCP Phase 3: ArgoCD API Token (2026-04-11 Claude Sonnet 4.6)
ARGOCD_API_TOKEN: ${{ secrets.ARGOCD_API_TOKEN }}
# 2026-04-18 ogt + Claude Opus 4.7: ADR-090-B L3-only 升級 L2永久連線串 + 應用 secret
DATABASE_URL: ${{ secrets.DATABASE_URL }}
MIGRATION_DATABASE_URL: ${{ secrets.MIGRATION_DATABASE_URL }}
REDIS_URL: ${{ secrets.REDIS_URL }}
JWT_SECRET: ${{ secrets.JWT_SECRET }}
JWT_ALGORITHM: ${{ secrets.JWT_ALGORITHM }}
WEBHOOK_HMAC_SECRET: ${{ secrets.WEBHOOK_HMAC_SECRET }}
SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
CLAUDE_API_KEY: ${{ secrets.CLAUDE_API_KEY }}
# AWOOOI_ 前綴避開 Gitea 保留字(同 AWOOOI_GITEA_WEBHOOK_SECRET 模式)
GITEA_API_TOKEN: ${{ secrets.AWOOOI_GITEA_API_TOKEN }}
NEMOTRON_BOT_TOKEN: ${{ secrets.NEMOTRON_BOT_TOKEN }}
OPENCLAW_BOT_TOKEN: ${{ secrets.OPENCLAW_BOT_TOKEN }}
SMTP_HOST: ${{ secrets.SMTP_HOST }}
SRE_GROUP_CHAT_ID: ${{ secrets.SRE_GROUP_CHAT_ID }}
run: |
# S1/S2: 統一命名 deploy_key改用 ssh-keyscan比 StrictHostKeyChecking=no 更安全)
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan -T 5 "${{ env.K8S_SSH_HOST }}" > ~/.ssh/known_hosts 2>/dev/null
SSH_OPTS="-i ~/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10"
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" << SECRETS
set -e
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=\${K8S_API_SERVER}"
# 注入 Telegram Secrets (ADR-035 鐵律)
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"'$(echo -n "${TG_BOT_TOKEN}" | base64 -w 0)'"},
{"op":"add","path":"/data/OPENCLAW_TG_CHAT_ID","value":"'$(echo -n "${TG_CHAT_ID}" | base64 -w 0)'"}
]' || { echo "❌ Telegram Secrets patch 失敗 — ADR-035 鐵律"; exit 1; }
# 2026-03-31 ogt: 注入 AI API Keys (修復 NVIDIA/Gemini mock_fallback)
# 2026-04-01 Claude Code: base64 -w 0 防止長 key 換行破壞 JSON
# NVIDIA NIM (免費 tier)
if [ -n "${NVIDIA_API_KEY}" ] && [ "${NVIDIA_API_KEY}" != "" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/NVIDIA_API_KEY","value":"'$(echo -n "${NVIDIA_API_KEY}" | base64 -w 0)'"}
]' && echo "✅ NVIDIA_API_KEY 已注入" || echo "⚠️ NVIDIA_API_KEY patch 失敗"
else
echo "⚠️ NVIDIA_API_KEY 未設定,跳過"
fi
# Gemini (備援)
if [ -n "${GEMINI_API_KEY}" ] && [ "${GEMINI_API_KEY}" != "" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GEMINI_API_KEY","value":"'$(echo -n "${GEMINI_API_KEY}" | base64 -w 0)'"}
]' && echo "✅ GEMINI_API_KEY 已注入" || echo "⚠️ GEMINI_API_KEY patch 失敗"
else
echo "⚠️ GEMINI_API_KEY 未設定,跳過"
fi
# 2026-04-01 Claude Code: Langfuse LLMOps keys (補齊 CD 注入,之前只有手動設定)
if [ -n "${LANGFUSE_PUBLIC_KEY}" ] && [ -n "${LANGFUSE_SECRET_KEY}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/LANGFUSE_PUBLIC_KEY","value":"'$(echo -n "${LANGFUSE_PUBLIC_KEY}" | base64 -w 0)'"},
{"op":"add","path":"/data/LANGFUSE_SECRET_KEY","value":"'$(echo -n "${LANGFUSE_SECRET_KEY}" | base64 -w 0)'"}
]' && echo "✅ LANGFUSE keys 已注入" || echo "⚠️ LANGFUSE keys patch 失敗"
else
echo "⚠️ LANGFUSE_PUBLIC_KEY/SECRET_KEY 未設定,跳過 (現有 K8s secret 值維持不變)"
fi
# 2026-04-02 Claude Code: Telegram Whitelist (授權簽核用戶 ID)
if [ -n "${TG_USER_WHITELIST}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_TG_USER_WHITELIST","value":"'$(echo -n "${TG_USER_WHITELIST}" | base64 -w 0)'"}
]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
fi
# Phase O-4.1 2026-04-02: Sentry Auth Token (Wave A.1 ADR-037)
if [ -n "${SENTRY_AUTH_TOKEN}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"'$(echo -n "${SENTRY_AUTH_TOKEN}" | base64 -w 0)'"}
]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
else
echo "⚠️ SENTRY_AUTH_TOKEN 未設定Sentry Comment API 將跳過"
fi
# ADR-059 2026-04-05 Claude Code: Gitea Webhook Secret
if [ -n "${GITEA_WEBHOOK_SECRET}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GITEA_WEBHOOK_SECRET","value":"'$(echo -n "${GITEA_WEBHOOK_SECRET}" | base64 -w 0)'"}
]' && echo "✅ GITEA_WEBHOOK_SECRET 已注入" || echo "⚠️ GITEA_WEBHOOK_SECRET patch 失敗"
else
echo "⚠️ GITEA_WEBHOOK_SECRET 未設定Gitea Webhook 簽章驗證將在 prod 失效"
fi
# MCP Phase 3: ArgoCD API Token (2026-04-11 Claude Sonnet 4.6)
if [ -n "${ARGOCD_API_TOKEN}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/ARGOCD_API_TOKEN","value":"'$(echo -n "${ARGOCD_API_TOKEN}" | base64 -w 0)'"}
]' && echo "✅ ARGOCD_API_TOKEN 已注入" || echo "⚠️ ARGOCD_API_TOKEN patch 失敗"
else
echo "⚠️ ARGOCD_API_TOKEN 未設定ArgoCD MCP 將使用空 token"
fi
# ============================================================================
# ADR-090-B 2026-04-18 ogt + Claude Opus 4.7: L3-only 升級 L213 個 key
# ============================================================================
# 目的: 消滅「只存 K8s etcd 單點」的災難盲區Gitea Secret 成為正式真相來源
# 注意: 每個 block 與上方維持相同結構if guard + base64 -w 0 + json patch
# DATABASE_URL — PG 應用連線串2026-04-18 輪替)
if [ -n "${DATABASE_URL}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/DATABASE_URL","value":"'$(echo -n "${DATABASE_URL}" | base64 -w 0)'"}
]' && echo "✅ DATABASE_URL 已注入" || echo "⚠️ DATABASE_URL patch 失敗"
else
echo "⚠️ DATABASE_URL 未設定awoooi-api 將無法連 PG"
fi
# MIGRATION_DATABASE_URL — CI migration 用 awoooi_migrator 限權帳號ADR-090-B
if [ -n "${MIGRATION_DATABASE_URL}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/MIGRATION_DATABASE_URL","value":"'$(echo -n "${MIGRATION_DATABASE_URL}" | base64 -w 0)'"}
]' && echo "✅ MIGRATION_DATABASE_URL 已注入" || echo "⚠️ MIGRATION_DATABASE_URL patch 失敗"
fi
# REDIS_URL — Redis 連線6380 on 188
if [ -n "${REDIS_URL}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/REDIS_URL","value":"'$(echo -n "${REDIS_URL}" | base64 -w 0)'"}
]' && echo "✅ REDIS_URL 已注入" || echo "⚠️ REDIS_URL patch 失敗"
else
echo "⚠️ REDIS_URL 未設定"
fi
# JWT_SECRET / JWT_ALGORITHM — API 認證
if [ -n "${JWT_SECRET}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/JWT_SECRET","value":"'$(echo -n "${JWT_SECRET}" | base64 -w 0)'"}
]' && echo "✅ JWT_SECRET 已注入" || echo "⚠️ JWT_SECRET patch 失敗"
fi
if [ -n "${JWT_ALGORITHM}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/JWT_ALGORITHM","value":"'$(echo -n "${JWT_ALGORITHM}" | base64 -w 0)'"}
]' && echo "✅ JWT_ALGORITHM 已注入" || echo "⚠️ JWT_ALGORITHM patch 失敗"
fi
# WEBHOOK_HMAC_SECRET — Alertmanager webhook HMAC 簽章
if [ -n "${WEBHOOK_HMAC_SECRET}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/WEBHOOK_HMAC_SECRET","value":"'$(echo -n "${WEBHOOK_HMAC_SECRET}" | base64 -w 0)'"}
]' && echo "✅ WEBHOOK_HMAC_SECRET 已注入" || echo "⚠️ WEBHOOK_HMAC_SECRET patch 失敗"
fi
# SENTRY_DSN — Sentry 錯誤追蹤(不是 auth token
if [ -n "${SENTRY_DSN}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SENTRY_DSN","value":"'$(echo -n "${SENTRY_DSN}" | base64 -w 0)'"}
]' && echo "✅ SENTRY_DSN 已注入" || echo "⚠️ SENTRY_DSN patch 失敗"
fi
# CLAUDE_API_KEY — Claude 備援 LLM
if [ -n "${CLAUDE_API_KEY}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/CLAUDE_API_KEY","value":"'$(echo -n "${CLAUDE_API_KEY}" | base64 -w 0)'"}
]' && echo "✅ CLAUDE_API_KEY 已注入" || echo "⚠️ CLAUDE_API_KEY patch 失敗"
fi
# GITEA_API_TOKEN — Gitea API Token從 AWOOOI_GITEA_API_TOKEN 映射)
if [ -n "${GITEA_API_TOKEN}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GITEA_API_TOKEN","value":"'$(echo -n "${GITEA_API_TOKEN}" | base64 -w 0)'"}
]' && echo "✅ GITEA_API_TOKEN 已注入" || echo "⚠️ GITEA_API_TOKEN patch 失敗"
fi
# NEMOTRON_BOT_TOKEN / OPENCLAW_BOT_TOKEN — 多 Bot 架構
if [ -n "${NEMOTRON_BOT_TOKEN}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/NEMOTRON_BOT_TOKEN","value":"'$(echo -n "${NEMOTRON_BOT_TOKEN}" | base64 -w 0)'"}
]' && echo "✅ NEMOTRON_BOT_TOKEN 已注入" || echo "⚠️ NEMOTRON_BOT_TOKEN patch 失敗"
fi
if [ -n "${OPENCLAW_BOT_TOKEN}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_BOT_TOKEN","value":"'$(echo -n "${OPENCLAW_BOT_TOKEN}" | base64 -w 0)'"}
]' && echo "✅ OPENCLAW_BOT_TOKEN 已注入" || echo "⚠️ OPENCLAW_BOT_TOKEN patch 失敗"
fi
# SMTP_HOST / SRE_GROUP_CHAT_ID
if [ -n "${SMTP_HOST}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SMTP_HOST","value":"'$(echo -n "${SMTP_HOST}" | base64 -w 0)'"}
]' && echo "✅ SMTP_HOST 已注入" || echo "⚠️ SMTP_HOST patch 失敗"
fi
if [ -n "${SRE_GROUP_CHAT_ID}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SRE_GROUP_CHAT_ID","value":"'$(echo -n "${SRE_GROUP_CHAT_ID}" | base64 -w 0)'"}
]' && echo "✅ SRE_GROUP_CHAT_ID 已注入" || echo "⚠️ SRE_GROUP_CHAT_ID patch 失敗"
fi
# 2026-04-06 Claude Code: Sprint 3 T2 — known_hosts Secret (Security Fix A1)
# 替換 StrictHostKeyChecking=no讓 SSH 修復路徑使用已知主機指紋
# asyncssh reads /etc/ssh-mcp/known_hosts and requires a non-empty
# OpenSSH known_hosts file. Keep hosts unhashed so both asyncssh and
# CLI diagnostics can trust the same secret.
# 2026-05-02 ogt + Claude Sonnet 4.6: 加 4 台主機完整性檢查
# 根因partial scan如 110 timeout、其他成功會讓 [-s file] 通過、
# 後續 patch 推進缺漏的 known_hosts → asyncssh 拒所有 SSH。
# 修法scan 完用 grep -c 驗證 4 台主機都在;缺任何一台就 abort
# 不能覆蓋現有 secret防止 production SSH 自動修復路徑癱瘓。
ssh-keyscan 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/known_hosts_repair 2>/tmp/known_hosts_scan_err || true
EXPECTED_HOSTS=4
PRESENT=0
for ip in 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188; do
if grep -qE "^\${ip}[[:space:]]" /tmp/known_hosts_repair 2>/dev/null; then
PRESENT=\$((PRESENT + 1))
else
echo "⚠️ ssh-keyscan 缺主機 \${ip}"
fi
done
if [ "\$PRESENT" -eq "\$EXPECTED_HOSTS" ]; then
\$KUBECTL create secret generic awoooi-repair-known-hosts \
-n awoooi-prod \
--from-file=known_hosts=/tmp/known_hosts_repair \
--dry-run=client -o yaml | \$KUBECTL apply -f - \
&& echo "✅ awoooi-repair-known-hosts Secret 已建立/更新" \
|| echo "⚠️ awoooi-repair-known-hosts Secret 建立失敗 (非致命)"
KNOWN_HOSTS_B64=\$(base64 -w 0 /tmp/known_hosts_repair)
\$KUBECTL patch secret ssh-mcp-key -n awoooi-prod --type=merge \
-p="{\"data\":{\"known_hosts\":\"\${KNOWN_HOSTS_B64}\"}}" \
&& echo "✅ ssh-mcp-key known_hosts 已更新4 台主機完整)" \
|| echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)"
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
else
echo "❌ ssh-keyscan 只抓到 \${PRESENT}/\${EXPECTED_HOSTS} 台主機,跳過 patch保留現有 secret"
cat /tmp/known_hosts_scan_err 2>/dev/null | head -10
rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
fi
echo "✅ 所有 Secrets 注入完成"
SECRETS
# 2026-04-11 Claude Sonnet 4.6 (Sprint B-3 ADR-069):
# Deploy 改為 ArgoCD GitOps 模式:更新 kustomization.yaml → git push [skip ci] → ArgoCD sync
# 舊做法 (kubectl set image) 與 ArgoCD selfHeal 衝突 — ArgoCD 會 revert 任何直接 kubectl 操作
# 新做法流程:
# 1. 更新 kustomization.yaml image tag用 kustomize edit set image
# 2. Apply ConfigMap/ServiceRegistry不含 Deployment由 ArgoCD 管)
# 3. git commit [skip ci] + push → 觸發 ArgoCD automated sync
# 4. 等待 ArgoCD sync + rollout 完成
# 5. Health Check
- name: Deploy to K8s (ArgoCD GitOps)
env:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
GITEA_TOKEN: ${{ secrets.CD_PUSH_TOKEN }}
run: |
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan -T 5 "${{ env.K8S_SSH_HOST }}" > ~/.ssh/known_hosts 2>/dev/null
SSH_OPTS="-i ~/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10"
IMAGE_TAG="${{ github.sha }}"
HARBOR=192.168.0.110:5000
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 DeploymentConfigMap 仍直接 apply) ───
cat k8s/awoooi-prod/04-configmap.yaml | \
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
"KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -"
echo "✅ ConfigMap 已更新"
cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
"KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -"
echo "✅ Service Registry ConfigMap 已更新"
# ─── Step 2: 更新 kustomization.yaml image tag ───
# host runner 不保證有 root 權限kustomize 安裝在使用者目錄。
export PATH="${HOME}/.local/bin:${PATH}"
if ! command -v kustomize &>/dev/null; then
mkdir -p "${HOME}/.local/bin"
curl -sL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz \
| tar xz -C "${HOME}/.local/bin"
chmod +x "${HOME}/.local/bin/kustomize"
fi
cd k8s/awoooi-prod
# kustomize edit set image 更新 tag
kustomize edit set image \
192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER=${HARBOR}/awoooi/api:${IMAGE_TAG}
kustomize edit set image \
192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER=${HARBOR}/awoooi/web:${IMAGE_TAG}
cd ../..
# ─── Step 3: git commit [skip ci] + push → 觸發 ArgoCD sync ───
git config user.email "cd@awoooi.internal"
git config user.name "AWOOOI CD"
git add k8s/awoooi-prod/kustomization.yaml
DEPLOY_REVISION=""
git diff --cached --quiet && echo "⚡ kustomization.yaml 無變化,跳過 push" || {
git commit -m "chore(cd): deploy ${IMAGE_TAG::7} [skip ci]"
# 用 token 推送(避免 SSH key 需要額外設定 push 權限)
git remote remove gitea 2>/dev/null || true
git remote add gitea http://wooo:${GITEA_TOKEN}@192.168.0.110:3001/wooo/awoooi.git
# 先 rebase 避免 non-fast-forward (其他 commit 在 CI 期間已推入)
# 2026-04-17 ogt: -X theirs — kustomization.yaml 衝突時採用當次部署的 image tag
git fetch gitea main
git rebase -X theirs gitea/main
DEPLOY_REVISION=$(git rev-parse HEAD)
git push gitea main
echo "✅ kustomization.yaml 已 push等待 ArgoCD sync 到 ${DEPLOY_REVISION:0:8}..."
}
# ─── Step 4: 等待 ArgoCD sync + rollout ───
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
set -e
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
# 等待 ArgoCD Application Synced最多 180s。只看
# Synced/Healthy 可能誤判成上一個 revision 已同步,因此有
# deploy commit 時必須同時確認 status.sync.revision。
echo "⏳ 等待 ArgoCD sync..."
$KUBECTL annotate application awoooi-prod -n argocd \
argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
for i in $(seq 1 36); do
SYNC=$($KUBECTL get application awoooi-prod -n argocd \
-o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown")
HEALTH=$($KUBECTL get application awoooi-prod -n argocd \
-o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown")
REVISION=$($KUBECTL get application awoooi-prod -n argocd \
-o jsonpath='{.status.sync.revision}' 2>/dev/null || echo "Unknown")
SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
if [ "$SYNC" = "Synced" ] && [ "$HEALTH" = "Healthy" ]; then
if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then
echo "✅ ArgoCD Synced + Healthy"
break
fi
fi
if [ "$i" = "36" ]; then
echo "❌ ArgoCD 未在期限內同步到目標 revision"
exit 1
fi
sleep 5
done
# 確認 rollout 完成
$KUBECTL rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s
$KUBECTL rollout status deployment/awoooi-web -n awoooi-prod --timeout=120s
$KUBECTL rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s
echo "✅ 部署完成"
# Health Check
HEALTH_PASS=0
for i in 1 2 3; do
HTTP_CODE=$(curl -s -w "%{http_code}" -o /dev/null --connect-timeout 10 "${{ env.API_HEALTH_URL }}")
if [ "$HTTP_CODE" = "200" ]; then
echo "✅ API 健康檢查通過"
HEALTH_PASS=1
break
fi
echo "⏳ 嘗試 #$i: HTTP $HTTP_CODE等待 10s..."
sleep 10
done
if [ "$HEALTH_PASS" = "0" ]; then
echo "❌ API 健康檢查失敗"
exit 1
fi
ARGOCD_WAIT
# 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
# DEPLOY_SSH_KEY_188 = gitea-cd-deploy-188 (ed25519只有 188 authorized_keys)
# 腳本: docker-health-monitor.sh + pg-backup.sh (感知層 + 備份)
- name: Sync Ops Scripts to 188
continue-on-error: true
env:
SSH_KEY_188: ${{ secrets.DEPLOY_SSH_KEY_188 }}
run: |
mkdir -p ~/.ssh
echo "$SSH_KEY_188" > ~/.ssh/deploy_key_188
chmod 600 ~/.ssh/deploy_key_188
ssh-keyscan 192.168.0.188 >> ~/.ssh/known_hosts 2>/dev/null
# 同步 docker-health-monitor.sh
scp -i ~/.ssh/deploy_key_188 \
scripts/ops/docker-health-monitor.sh \
ollama@192.168.0.188:~/awoooi-ops/docker-health-monitor.sh \
&& echo "✅ docker-health-monitor.sh 已同步" \
|| echo "⚠️ docker-health-monitor.sh 同步失敗"
# 同步 pg-backup.sh
scp -i ~/.ssh/deploy_key_188 \
scripts/ops/pg-backup.sh \
ollama@192.168.0.188:~/awoooi-ops/pg-backup.sh \
&& echo "✅ pg-backup.sh 已同步" \
|| echo "⚠️ pg-backup.sh 同步失敗"
# 確保執行權限
ssh -i ~/.ssh/deploy_key_188 ollama@192.168.0.188 \
"chmod +x ~/awoooi-ops/docker-health-monitor.sh ~/awoooi-ops/pg-backup.sh && echo '✅ 權限設定完成'" \
|| echo "⚠️ 權限設定失敗"
- name: Notify Pipeline Failure
if: failure()
run: |
COMMIT_MSG="${{ steps.commit.outputs.message }}"
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
ACTOR="${{ github.actor }}"
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"
post-deploy-checks:
needs: build-and-deploy
timeout-minutes: 30
# 2026-04-30 Codex: keep post-deploy on the host runner too. Playwright
# install-deps can also kill the act-managed job container with RWLayer=nil.
runs-on: awoooi-host
steps:
- name: Bootstrap Host Runner Tools
# 2026-05-05 Codex: post-deploy also uses checkout and curl-based
# notifications, so it needs the same runner bootstrap as earlier jobs.
run: |
if command -v apk >/dev/null 2>&1; then
apk add --no-cache nodejs npm git curl bash openssh-client docker-cli docker-cli-buildx
fi
- uses: actions/checkout@v4
- name: Get Commit Info
id: commit
run: |
echo "short_sha=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT
echo "message=$(git log -1 --pretty=%s | head -c 50)" >> $GITHUB_OUTPUT
echo "start_time=$(date +%s)" >> $GITHUB_OUTPUT
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
# 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter
# 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step
# 2026-04-10 ogt: 移除 continue-on-error — 告警鏈路失敗必須阻塞部署
- name: Alert Chain Smoke Test
id: alert_chain_smoke
run: |
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
if docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
--cpus "1.0" \
--memory "1g" \
-v "$PWD:/workspace" \
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --json | tee /tmp/alert_chain_result.json'; then
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
else
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
fi
# Phase O-5 Wave C.2 2026-04-02 ogt: 監控覆蓋率驗證 (generate_monitoring.py --check)
# 2026-04-10 ogt: 移除 continue-on-error — 覆蓋率不足必須阻塞部署
- name: Monitoring Coverage Check
id: monitoring_coverage
run: |
if docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-coverage" \
--cpus "1.0" \
--memory "1g" \
-v "$PWD:/workspace" \
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/generate_monitoring.py --check'; then
echo "coverage_status=pass" >> $GITHUB_OUTPUT
else
echo "coverage_status=fail" >> $GITHUB_OUTPUT
fi
# [首席架構師] 新增 Playwright E2E Smoke Test 步驟 v1.0.0 2026-04-01 (台北時間)
# continue-on-error: true — smoke 失敗不阻塞部署,但結果會反映在 TG 通知
- name: E2E Smoke Test
id: smoke
continue-on-error: true
run: |
cat > /tmp/awoooi-smoke.sh <<'CI_SCRIPT'
# 首席架構師 Review I4 + 2026-04-05 Claude Code cache優化:
# playwright.config.ts import @playwright/test — 必須先安裝 pnpm node_modules
# pnpm store 持久化到 /opt/pnpm-storepnpm-lock.yaml hash 未變則 --prefer-offline
PNPM_STORE=/opt/pnpm-store
PNPM_HASH_FILE=/opt/pnpm-store/.lock_hash
CURRENT_PNPM_HASH=$(md5sum pnpm-lock.yaml | awk '{print $1}')
corepack enable 2>/dev/null || npm install -g pnpm@9 -q
pnpm config set store-dir $PNPM_STORE
if [ "$(cat $PNPM_HASH_FILE 2>/dev/null)" != "$CURRENT_PNPM_HASH" ]; then
echo "📦 pnpm lock 已變更,重裝 node_modules..."
pnpm install --frozen-lockfile 2>&1 | tail -5
echo "$CURRENT_PNPM_HASH" > $PNPM_HASH_FILE
else
echo "⚡ 使用快取 pnpm store (lock 未變更)prefer-offline..."
pnpm install --frozen-lockfile --prefer-offline 2>&1 | tail -5
fi
cd apps/web
# Playwright Chromium 持久化到 /opt/playwright-browsers版本 hash guard
export PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
PLAYWRIGHT_VER=$(node -e "console.log(require('./package.json').devDependencies['@playwright/test'] || '')" 2>/dev/null || echo "unknown")
PLAYWRIGHT_HASH_FILE=/opt/playwright-browsers/.version_hash
if [ "$(cat $PLAYWRIGHT_HASH_FILE 2>/dev/null)" != "$PLAYWRIGHT_VER" ]; then
echo "📦 Playwright 版本變更 ($PLAYWRIGHT_VER),重裝 Chromium..."
npx playwright install chromium --with-deps 2>&1 | tail -5
echo "$PLAYWRIGHT_VER" > $PLAYWRIGHT_HASH_FILE
else
echo "⚡ 使用快取 Playwright Chromium ($PLAYWRIGHT_VER)"
fi
# Browser cache 命中時也要確認 OS shared libs 存在;否則 smoke 會只測到
# chromium launch failure例如 libnspr4.so missing
if ! ldconfig -p 2>/dev/null | grep -q 'libnspr4'; then
echo "📦 Playwright system deps missing補安裝 Chromium deps..."
npx playwright install-deps chromium > /tmp/playwright-install-deps.log 2>&1 || {
tail -40 /tmp/playwright-install-deps.log
exit 1
}
tail -20 /tmp/playwright-install-deps.log
fi
# 對已部署的生產環境跑 smoke test
npx playwright test tests/e2e/smoke.spec.ts --reporter=line \
&& echo "smoke_status=pass" >> $GITHUB_OUTPUT \
|| echo "smoke_status=fail" >> $GITHUB_OUTPUT
CI_SCRIPT
SMOKE_OUTPUT="$PWD/.awoooi-smoke-output"
rm -f "$SMOKE_OUTPUT"
touch "$SMOKE_OUTPUT"
chmod 666 "$SMOKE_OUTPUT"
docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-e2e-smoke" \
--cpus "1.5" \
--memory "2g" \
-v "$PWD:/workspace" \
-v /tmp/awoooi-smoke.sh:/tmp/awoooi-smoke.sh:ro \
-v awoooi-pnpm-store:/opt/pnpm-store \
-v awoooi-playwright-browsers:/opt/playwright-browsers \
-w /workspace \
-e GITHUB_OUTPUT=/workspace/.awoooi-smoke-output \
-e CI=true \
-e PLAYWRIGHT_BASE_URL=https://awoooi.wooo.work \
"${{ env.CI_IMAGE }}" \
bash /tmp/awoooi-smoke.sh
cat "$SMOKE_OUTPUT" >> "$GITHUB_OUTPUT"
env:
CI: "true"
# 直接測試已部署的生產環境,不啟動本地 dev server
PLAYWRIGHT_BASE_URL: "https://awoooi.wooo.work"
- name: Notify Health Check Success
env:
SMOKE_RESULT: ${{ steps.smoke.outcome == 'success' && '✅' || '⚠️' }}
ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outcome == 'success' && '✅' || '⚠️' }}
MONITORING_RESULT: ${{ steps.monitoring_coverage.outcome == 'success' && '✅' || '⚠️' }}
run: |
END_TIME=$(date +%s)
DURATION=$((END_TIME - ${{ steps.commit.outputs.start_time }}))
MINUTES=$((DURATION / 60))
SECONDS=$((DURATION % 60))
# 2026-04-05 ogt: TG_MSG 必須在 shell 中組裝,才能展開 ${MINUTES}/${SECONDS} 等 shell 變數
# 2026-04-05 ogt: 移除 parse_mode=HTML避免 commit message 含特殊字元導致 400
COMMIT_MSG="${{ steps.commit.outputs.message }}"
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
TG_MSG="✅ AWOOOI 部署完成\n├ 📝 ${COMMIT_MSG}\n├ 🔖 ${SHORT_SHA}\n├ ⏱️ 耗時: ${MINUTES}m ${SECONDS}s\n├ 📦 API: ✅ Web: ✅\n├ 🩺 Health: ✅\n├ 🔗 Alert Chain: ${ALERT_CHAIN_RESULT}\n├ 📊 Monitoring: ${MONITORING_RESULT}\n└ 🎭 Smoke: ${SMOKE_RESULT}"
printf '%b' "$TG_MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
--data-urlencode "text@-" || echo "TG notify warning (non-fatal)"
- name: Notify Pipeline Failure
# 2026-04-16 ogt + Claude Sonnet 4.6: 改用 HTML 結構化格式
if: failure()
run: |
COMMIT_MSG="${{ steps.commit.outputs.message }}"
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
ACTOR="${{ github.actor }}"
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🩺 Stage: post-deploy-checks\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \
-d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \
-d "parse_mode=HTML" \
--data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?"