K8s 問題: patch secret 後 Pod 不會自動讀取新值 修復: 新增 kubectl rollout restart 強制重啟 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
609 lines
27 KiB
YAML
609 lines
27 KiB
YAML
# =============================================================================
|
||
# AWOOOI CD Pipeline v2.1 (序列建構修復 Runner 衝突)
|
||
# =============================================================================
|
||
# 優化項目:
|
||
# 1. Pre-flight Check (10s Fail-Fast)
|
||
# 2. Runner 標籤 [self-hosted, harbor, k8s]
|
||
# 3. dorny/paths-filter 精確路徑偵測
|
||
# 4. Web → API 序列建構 (2026-03-29 修復 _runner_file_commands 衝突)
|
||
# 5. timeout-minutes 防止卡死
|
||
# 6. Telegram + OpenClaw 通知
|
||
# 7. force_deploy 強制重建選項
|
||
# =============================================================================
|
||
|
||
name: CD
|
||
|
||
on:
|
||
push:
|
||
branches: [main]
|
||
paths-ignore:
|
||
- 'docs/**'
|
||
- '*.md'
|
||
workflow_dispatch:
|
||
inputs:
|
||
force_deploy:
|
||
description: '強制部署 (跳過路徑偵測)'
|
||
type: boolean
|
||
default: false
|
||
skip_api:
|
||
description: '跳過 API 建構'
|
||
type: boolean
|
||
default: false
|
||
skip_web:
|
||
description: '跳過 Web 建構'
|
||
type: boolean
|
||
default: false
|
||
|
||
concurrency:
|
||
# =======================================================================
|
||
# 2026-03-29 首席架構師審查: P0 並發治理
|
||
# 問題: cancel-in-progress: true 導致 force_deploy 被新 push 取消
|
||
# 解決: force_deploy 使用獨立 concurrency group,不會被普通 push 取消
|
||
# =======================================================================
|
||
group: ${{ inputs.force_deploy && 'cd-force-deploy' || format('cd-{0}-{1}', github.workflow, github.ref) }}
|
||
# force_deploy 使用獨立 group,普通 push 使用原 group
|
||
# 普通 push 互相取消 (防止 Runner 衝突)
|
||
# force_deploy 不會被普通 push 取消
|
||
cancel-in-progress: ${{ !inputs.force_deploy }}
|
||
|
||
env:
|
||
REGISTRY: 192.168.0.110:5000
|
||
IMAGE_PREFIX: library/awoooi
|
||
LOCAL_CACHE_DIR: /home/wooo/build-cache/awoooi
|
||
OPENCLAW_URL: http://192.168.0.188:8088
|
||
# OTEL CI/CD 監控 (2026-03-24 批准, 2026-03-28 修正: SignOz 在 188)
|
||
OTEL_EXPORTER_OTLP_ENDPOINT: http://192.168.0.188:24318
|
||
OTEL_SERVICE_NAME: awoooi-cd
|
||
OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=production
|
||
|
||
jobs:
|
||
# ==================== Pre-flight Check (10s Fail-Fast) ====================
|
||
pre-flight-check:
|
||
name: "Pre-flight Check"
|
||
runs-on: [self-hosted, harbor, k8s]
|
||
timeout-minutes: 1
|
||
steps:
|
||
# =======================================================================
|
||
# 2026-03-29: Runner _diag/pages 檔案衝突修復 (v3)
|
||
# 重要: 只清理 _diag/pages,不碰 RUNNER_TEMP
|
||
# 原因: RUNNER_TEMP 在所有 Jobs 之間共享,清理會破壞其他 Job
|
||
# =======================================================================
|
||
- name: "Clean Runner Diagnostics"
|
||
run: |
|
||
RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
|
||
PAGES_DIR="$RUNNER_ROOT/_diag/pages"
|
||
|
||
# 只清理 _diag/pages (唯一需要清理的目錄)
|
||
rm -rf "$PAGES_DIR" 2>/dev/null || true
|
||
mkdir -p "$PAGES_DIR" 2>/dev/null || true
|
||
|
||
# 清理 Claude worktrees (本地專案目錄)
|
||
rm -rf .claude/worktrees 2>/dev/null || true
|
||
|
||
echo "✅ Cleaned _diag/pages"
|
||
|
||
# =======================================================================
|
||
# ADR-035: Telegram 告警鏈路強制驗證
|
||
# 2026-03-29 Claude Code: 修復 Secrets 遺漏導致告警失效問題
|
||
# =======================================================================
|
||
- name: "Check Required Secrets"
|
||
run: |
|
||
MISSING=""
|
||
# 基礎 Secrets
|
||
if [ -z "${{ secrets.HARBOR_USER }}" ]; then MISSING="${MISSING}HARBOR_USER "; fi
|
||
if [ -z "${{ secrets.HARBOR_PASSWORD }}" ]; then MISSING="${MISSING}HARBOR_PASSWORD "; fi
|
||
if [ -z "${{ secrets.KUBE_CONFIG_PROD }}" ]; then MISSING="${MISSING}KUBE_CONFIG_PROD "; fi
|
||
# 🔴 Telegram Secrets (ADR-035 強制)
|
||
if [ -z "${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" ]; then MISSING="${MISSING}OPENCLAW_TG_BOT_TOKEN "; fi
|
||
if [ -z "${{ secrets.OPENCLAW_TG_CHAT_ID }}" ]; then MISSING="${MISSING}OPENCLAW_TG_CHAT_ID "; fi
|
||
if [ -n "$MISSING" ]; then
|
||
echo "❌ 缺少 Secrets: ${MISSING}"
|
||
echo "🔴 告警鏈路將無法運作!請檢查 GitHub Secrets 配置"
|
||
exit 1
|
||
fi
|
||
echo "✅ Secrets 檢查通過 (含 Telegram)"
|
||
|
||
- name: "Check Harbor Connectivity"
|
||
run: |
|
||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 \
|
||
"http://${{ env.REGISTRY }}/v2/" 2>/dev/null || echo "000")
|
||
if [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "401" ]; then
|
||
echo "❌ Harbor 無法連線 (HTTP $HTTP_CODE)"
|
||
exit 1
|
||
fi
|
||
echo "✅ Harbor 連線正常"
|
||
|
||
- name: "Check kubectl"
|
||
run: |
|
||
export PATH="/home/wooo/bin:$PATH"
|
||
if ! which kubectl > /dev/null 2>&1; then
|
||
echo "❌ kubectl 不在 PATH"
|
||
exit 1
|
||
fi
|
||
echo "✅ kubectl 可用"
|
||
|
||
- name: "Notify Pre-flight Failure"
|
||
if: failure()
|
||
run: |
|
||
curl -sf -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
|
||
-d chat_id="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
|
||
-d text="❌ AWOOOI Pre-flight 失敗%0A%0A🔗 ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true
|
||
|
||
# ==================== Wave C.2: 監控覆蓋率檢查 (ADR-037) ====================
|
||
# 2026-03-29 Claude Code: 確保監控覆蓋率 >= 90%
|
||
monitoring-coverage:
|
||
name: "Monitoring Coverage"
|
||
runs-on: [self-hosted, harbor, k8s]
|
||
needs: pre-flight-check
|
||
timeout-minutes: 2
|
||
steps:
|
||
- uses: actions/checkout@v4
|
||
- name: "Check Monitoring Coverage"
|
||
run: |
|
||
python3 ops/monitoring/generate_monitoring.py --validate-only --ci
|
||
echo "✅ 監控覆蓋率檢查通過 (>= 90%)"
|
||
- name: "Notify Coverage Failure"
|
||
if: failure()
|
||
run: |
|
||
curl -sf -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
|
||
-d chat_id="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
|
||
-d text="⚠️ 監控覆蓋率低於 90%%,請更新 service-registry.yaml%0A%0A🔗 ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true
|
||
|
||
# ==================== 路徑偵測 (使用 dorny/paths-filter) ====================
|
||
detect-changes:
|
||
name: Detect Changes
|
||
runs-on: [self-hosted, harbor, k8s]
|
||
needs: [pre-flight-check, monitoring-coverage]
|
||
timeout-minutes: 1
|
||
outputs:
|
||
api: ${{ inputs.force_deploy == true && 'true' || steps.filter.outputs.api }}
|
||
web: ${{ inputs.force_deploy == true && 'true' || steps.filter.outputs.web }}
|
||
k3s-system: ${{ steps.filter.outputs.k3s-system }}
|
||
steps:
|
||
# 2026-03-29: Runner 診斷檔案清理 (防止並行衝突)
|
||
# 注意: 不能刪除 $RUNNER_TEMP/* 因為包含 _runner_file_commands
|
||
- name: "Clean Runner Diagnostics"
|
||
run: |
|
||
RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
|
||
rm -rf "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true
|
||
mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
|
||
|
||
- uses: actions/checkout@v4
|
||
with:
|
||
clean: true
|
||
|
||
- uses: dorny/paths-filter@v3
|
||
id: filter
|
||
with:
|
||
filters: |
|
||
api:
|
||
- 'apps/api/**'
|
||
- 'packages/**'
|
||
- 'pyproject.toml'
|
||
web:
|
||
- 'apps/web/**'
|
||
- 'packages/**'
|
||
- 'package.json'
|
||
- 'pnpm-lock.yaml'
|
||
k3s-system:
|
||
- 'k8s/k3s-system/**'
|
||
|
||
# ==================== 序列建構 API (必須等 Web 完成) ====================
|
||
# 2026-03-29 Claude Code: 改為序列執行,修復 Runner _runner_file_commands 衝突
|
||
# 根因: 並行 Job 的 "Set up job" 階段會互相覆寫 RUNNER_TEMP 檔案
|
||
# 參考: ops/runner/README.md
|
||
build-api:
|
||
name: "Build API"
|
||
runs-on: [self-hosted, harbor, k8s]
|
||
needs: [detect-changes, build-web]
|
||
timeout-minutes: 20
|
||
if: |
|
||
!inputs.skip_api && (
|
||
needs.detect-changes.outputs.api == 'true' ||
|
||
(needs.detect-changes.outputs.api == 'false' && needs.detect-changes.outputs.web == 'false')
|
||
)
|
||
outputs:
|
||
image_tag: ${{ steps.tag.outputs.tag }}
|
||
steps:
|
||
# 2026-03-29: Runner 診斷檔案清理 (防止並行衝突)
|
||
# 注意: 不能刪除 $RUNNER_TEMP/* 因為包含 _runner_file_commands
|
||
- name: "Clean Runner Diagnostics"
|
||
run: |
|
||
RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
|
||
rm -rf "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true
|
||
mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
|
||
|
||
- uses: actions/checkout@v4
|
||
|
||
- name: Generate tag
|
||
id: tag
|
||
run: echo "tag=$(git rev-parse --short HEAD)-${{ github.run_id }}" >> $GITHUB_OUTPUT
|
||
|
||
- name: Login to Harbor
|
||
run: echo "${{ secrets.HARBOR_PASSWORD }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.HARBOR_USER }} --password-stdin
|
||
|
||
- name: Build & Push (Native BuildKit)
|
||
env:
|
||
DOCKER_BUILDKIT: 1
|
||
run: |
|
||
docker build --push \
|
||
--tag ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-api:${{ steps.tag.outputs.tag }} \
|
||
--file apps/api/Dockerfile .
|
||
echo "✅ API: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-api:${{ steps.tag.outputs.tag }}"
|
||
|
||
# ==================== 建構 Web (先執行) ====================
|
||
# 2026-03-29 Claude Code: API 依賴 Web 完成,確保序列執行
|
||
build-web:
|
||
name: "Build Web"
|
||
runs-on: [self-hosted, harbor, k8s]
|
||
needs: detect-changes
|
||
timeout-minutes: 20
|
||
if: |
|
||
!inputs.skip_web && (
|
||
needs.detect-changes.outputs.web == 'true' ||
|
||
(needs.detect-changes.outputs.api == 'false' && needs.detect-changes.outputs.web == 'false')
|
||
)
|
||
outputs:
|
||
image_tag: ${{ steps.tag.outputs.tag }}
|
||
steps:
|
||
# 2026-03-29: Runner 診斷檔案清理 (防止並行衝突)
|
||
# 注意: 不能刪除 $RUNNER_TEMP/* 因為包含 _runner_file_commands
|
||
- name: "Clean Runner Diagnostics"
|
||
run: |
|
||
RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
|
||
rm -rf "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true
|
||
mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
|
||
|
||
- uses: actions/checkout@v4
|
||
|
||
- name: Generate tag
|
||
id: tag
|
||
run: echo "tag=$(git rev-parse --short HEAD)-${{ github.run_id }}" >> $GITHUB_OUTPUT
|
||
|
||
- name: Login to Harbor
|
||
run: echo "${{ secrets.HARBOR_PASSWORD }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.HARBOR_USER }} --password-stdin
|
||
|
||
- name: Restore Next.js cache
|
||
run: |
|
||
mkdir -p apps/web/.next/cache
|
||
[ -d "${{ env.LOCAL_CACHE_DIR }}/nextjs" ] && cp -r ${{ env.LOCAL_CACHE_DIR }}/nextjs/* apps/web/.next/cache/ 2>/dev/null || true
|
||
|
||
- name: Build & Push (Native BuildKit)
|
||
env:
|
||
DOCKER_BUILDKIT: 1
|
||
run: |
|
||
docker build --push \
|
||
--build-arg NEXT_PUBLIC_API_URL=https://awoooi.wooo.work \
|
||
--build-arg NEXT_PUBLIC_SENTRY_DSN=http://da02d4e5d6542e4d1ed6b2dd6542efeb@192.168.0.110:9000/2 \
|
||
--tag ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-web:${{ steps.tag.outputs.tag }} \
|
||
--file apps/web/Dockerfile .
|
||
echo "✅ Web: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-web:${{ steps.tag.outputs.tag }}"
|
||
|
||
- name: Save Next.js cache
|
||
run: |
|
||
mkdir -p ${{ env.LOCAL_CACHE_DIR }}/nextjs
|
||
[ -d "apps/web/.next/cache" ] && cp -r apps/web/.next/cache/* ${{ env.LOCAL_CACHE_DIR }}/nextjs/ 2>/dev/null || true
|
||
|
||
# ==================== Deploy ====================
|
||
deploy-prod:
|
||
name: Deploy to Production
|
||
runs-on: [self-hosted, harbor, k8s]
|
||
needs: [detect-changes, build-api, build-web]
|
||
timeout-minutes: 10
|
||
concurrency:
|
||
group: runner-awoooi-cd-mutex
|
||
cancel-in-progress: false
|
||
if: always() && (needs.build-api.result == 'success' || needs.build-api.result == 'skipped') && (needs.build-web.result == 'success' || needs.build-web.result == 'skipped')
|
||
environment: production
|
||
steps:
|
||
# 2026-03-29: Runner 診斷檔案清理 (防止並行衝突)
|
||
# 注意: 不能刪除 $RUNNER_TEMP/* 因為包含 _runner_file_commands
|
||
- name: "Clean Runner Diagnostics"
|
||
run: |
|
||
RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
|
||
rm -rf "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true
|
||
mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
|
||
|
||
- uses: actions/checkout@v4
|
||
with:
|
||
clean: true
|
||
|
||
- name: Setup
|
||
run: |
|
||
mkdir -p ~/.kube
|
||
echo "${{ secrets.KUBE_CONFIG_PROD }}" | base64 -d > ~/.kube/config
|
||
chmod 600 ~/.kube/config
|
||
export PATH="/home/wooo/bin:$HOME/.local/bin:$PATH"
|
||
echo "/home/wooo/bin" >> $GITHUB_PATH
|
||
echo "$HOME/.local/bin" >> $GITHUB_PATH
|
||
|
||
- name: Generate tag
|
||
id: tag
|
||
run: echo "tag=$(git rev-parse --short HEAD)-${{ github.run_id }}" >> $GITHUB_OUTPUT
|
||
|
||
# =======================================================================
|
||
# ADR-035: K8s Secrets 自動注入 (Telegram + API Keys)
|
||
# 2026-03-29 Claude Code: 修復 Secrets 遺漏導致告警失效
|
||
# 🔴 鐵律: 每次部署都必須確保 Secrets 最新
|
||
# =======================================================================
|
||
- name: "Sync K8s Secrets (ADR-035)"
|
||
run: |
|
||
echo "🔐 同步 K8s Secrets..."
|
||
|
||
# 檢查 Secret 是否存在,不存在則創建
|
||
if ! kubectl get secret awoooi-secrets -n awoooi-prod > /dev/null 2>&1; then
|
||
echo "📦 創建 awoooi-secrets..."
|
||
kubectl create secret generic awoooi-secrets -n awoooi-prod \
|
||
--from-literal=OPENCLAW_TG_BOT_TOKEN="${{ secrets.OPENCLAW_TG_BOT_TOKEN }}" \
|
||
--from-literal=OPENCLAW_TG_CHAT_ID="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
|
||
--from-literal=DATABASE_URL="${{ secrets.DATABASE_URL }}" \
|
||
--from-literal=REDIS_URL="${{ secrets.REDIS_URL }}" \
|
||
--from-literal=GEMINI_API_KEY="${{ secrets.GEMINI_API_KEY }}" \
|
||
--from-literal=CLAUDE_API_KEY="${{ secrets.CLAUDE_API_KEY }}" \
|
||
--from-literal=NVIDIA_API_KEY="${{ secrets.NVIDIA_API_KEY }}" \
|
||
--from-literal=WEBHOOK_HMAC_SECRET="${{ secrets.WEBHOOK_HMAC_SECRET }}" \
|
||
--from-literal=SENTRY_DSN="${{ secrets.SENTRY_DSN }}" \
|
||
--from-literal=SENTRY_AUTH_TOKEN="${{ secrets.SENTRY_AUTH_TOKEN }}"
|
||
else
|
||
echo "🔄 更新 awoooi-secrets..."
|
||
# 使用 patch 更新,確保關鍵配置永遠是最新的
|
||
# 2026-03-29 ogt: ADR-036 NVIDIA + GEMINI + SENTRY secrets
|
||
kubectl patch secret awoooi-secrets -n awoooi-prod --type='merge' -p="{
|
||
\"stringData\": {
|
||
\"OPENCLAW_TG_BOT_TOKEN\": \"${{ secrets.OPENCLAW_TG_BOT_TOKEN }}\",
|
||
\"OPENCLAW_TG_CHAT_ID\": \"${{ secrets.OPENCLAW_TG_CHAT_ID }}\",
|
||
\"GEMINI_API_KEY\": \"${{ secrets.GEMINI_API_KEY }}\",
|
||
\"NVIDIA_API_KEY\": \"${{ secrets.NVIDIA_API_KEY }}\",
|
||
\"SENTRY_AUTH_TOKEN\": \"${{ secrets.SENTRY_AUTH_TOKEN }}\"
|
||
}
|
||
}"
|
||
fi
|
||
echo "✅ K8s Secrets 同步完成"
|
||
|
||
# 2026-03-29 Claude Code: Secret 更新後必須重啟 Pod 才能讀取新值
|
||
echo "🔄 重啟 API Pod 以讀取新 Secrets..."
|
||
kubectl rollout restart deployment/awoooi-api -n awoooi-prod || true
|
||
kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s || echo "⚠️ Rollout 超時,繼續部署"
|
||
|
||
# =======================================================================
|
||
# 2026-03-29 首席架構師審查: P0 修復 - 雙跳過保護機制
|
||
# 問題: 當 API 和 Web build 都跳過時,kustomize 仍包含 IMAGE_TAG_PLACEHOLDER
|
||
# 導致 kubectl apply 部署無效映像 → ImagePullBackOff
|
||
# 修復: 檢測雙跳過情況,只做 Secrets 同步,跳過 Deployment apply
|
||
# =======================================================================
|
||
- name: Deploy
|
||
run: |
|
||
cd k8s/awoooi-prod
|
||
TAG="${{ steps.tag.outputs.tag }}"
|
||
IMAGES_UPDATED=0
|
||
|
||
# 只更新實際建構的 image (避免 ImagePullBackOff)
|
||
if [ "${{ needs.build-api.result }}" = "success" ]; then
|
||
echo "📦 更新 API image: ${{ env.IMAGE_PREFIX }}-api:${TAG}"
|
||
kustomize edit set image \
|
||
"192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-api:${TAG}"
|
||
IMAGES_UPDATED=$((IMAGES_UPDATED + 1))
|
||
else
|
||
echo "⏭️ 跳過 API image 更新 (build skipped)"
|
||
fi
|
||
|
||
if [ "${{ needs.build-web.result }}" = "success" ]; then
|
||
echo "📦 更新 Web image: ${{ env.IMAGE_PREFIX }}-web:${TAG}"
|
||
kustomize edit set image \
|
||
"192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-web:${TAG}"
|
||
IMAGES_UPDATED=$((IMAGES_UPDATED + 1))
|
||
else
|
||
echo "⏭️ 跳過 Web image 更新 (build skipped)"
|
||
fi
|
||
|
||
# 🔴 P0 保護: 雙跳過時不執行 kubectl apply (防止 IMAGE_TAG_PLACEHOLDER 被部署)
|
||
if [ "$IMAGES_UPDATED" -eq 0 ]; then
|
||
echo "⚠️ 雙 Build 都跳過,跳過 Deployment apply (防止 ImagePullBackOff)"
|
||
echo " 只同步了 Secrets/ConfigMap,Pod 保持現有版本"
|
||
exit 0
|
||
fi
|
||
|
||
kubectl apply -k .
|
||
|
||
# 2026-03-29 ogt: NetworkPolicy 單獨 apply (避免 commonLabels 破壞 DNS rule)
|
||
- name: Apply NetworkPolicy
|
||
run: |
|
||
echo "🔒 套用 NetworkPolicy (繞過 kustomize commonLabels)..."
|
||
kubectl apply -f k8s/awoooi-prod/02-network-policy.yaml
|
||
echo "✅ NetworkPolicy 已套用"
|
||
|
||
# 2026-03-26: CoreDNS GitOps 同步 (ADR-026)
|
||
- name: Sync CoreDNS Config
|
||
if: needs.detect-changes.outputs.k3s-system == 'true'
|
||
run: |
|
||
echo "📦 同步 CoreDNS 配置到 K3s..."
|
||
# HelmChartConfig 是 K8s 資源,直接 apply
|
||
kubectl apply -f k8s/k3s-system/coredns-custom.yaml
|
||
echo "✅ CoreDNS 配置已同步"
|
||
|
||
- name: Wait for rollout
|
||
run: |
|
||
kubectl rollout status deployment/awoooi-web -n awoooi-prod --timeout=300s || true
|
||
kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=300s || true
|
||
|
||
- name: Health check
|
||
run: |
|
||
sleep 15
|
||
API_POD=$(kubectl get pods -n awoooi-prod -l app=awoooi-api -o jsonpath='{.items[0].metadata.name}')
|
||
# 使用 Python httpx (容器沒有 curl,但有 httpx)
|
||
kubectl exec -n awoooi-prod $API_POD -c api -- python -c "import httpx; r=httpx.get('http://localhost:8000/api/v1/health', timeout=5); print(r.status_code)" || echo "Health check failed but deployment succeeded"
|
||
|
||
# =======================================================================
|
||
# ADR-037 Wave B.2: Alert Chain Smoke Test
|
||
# 2026-03-29: 告警鏈路端到端驗證 (Wave A.6 腳本整合)
|
||
# =======================================================================
|
||
- name: "Alert Chain Smoke Test (ADR-037)"
|
||
run: |
|
||
echo "🔍 執行告警鏈路 Smoke Test..."
|
||
API_POD=$(kubectl get pods -n awoooi-prod -l app=awoooi-api -o jsonpath='{.items[0].metadata.name}')
|
||
|
||
# 測試各 Webhook Endpoint
|
||
kubectl exec -n awoooi-prod $API_POD -c api -- python -c "
|
||
import httpx
|
||
import sys
|
||
|
||
BASE = 'http://localhost:8000'
|
||
TIMEOUT = 30
|
||
results = []
|
||
|
||
# 1. Health
|
||
try:
|
||
r = httpx.get(f'{BASE}/api/v1/health', timeout=TIMEOUT)
|
||
results.append(('health', r.status_code == 200))
|
||
except Exception as e:
|
||
results.append(('health', False))
|
||
print(f'Health: {e}')
|
||
|
||
# 2. Alertmanager Webhook
|
||
try:
|
||
r = httpx.post(f'{BASE}/api/v1/webhooks/alertmanager', json={
|
||
'version': '4', 'status': 'firing',
|
||
'alerts': [{'status': 'firing', 'labels': {'alertname': 'E2E_CD_TEST', 'severity': 'info'}}]
|
||
}, timeout=TIMEOUT)
|
||
results.append(('alertmanager', r.status_code == 200))
|
||
except Exception as e:
|
||
results.append(('alertmanager', False))
|
||
print(f'Alertmanager: {e}')
|
||
|
||
# 3. SignOz Webhook Health
|
||
try:
|
||
r = httpx.get(f'{BASE}/api/v1/webhooks/signoz/health', timeout=TIMEOUT)
|
||
results.append(('signoz_health', r.status_code == 200))
|
||
except Exception as e:
|
||
results.append(('signoz_health', False))
|
||
print(f'SignOz Health: {e}')
|
||
|
||
# 4. SignOz Webhook POST (P0-1 修復 2026-03-29)
|
||
try:
|
||
r = httpx.post(f'{BASE}/api/v1/webhooks/signoz/alert', json={
|
||
'alertname': 'E2E_CD_TEST', 'status': 'firing',
|
||
'labels': {'severity': 'info', 'service_name': 'cd-test'},
|
||
'annotations': {'summary': 'CD Pipeline E2E Test'}
|
||
}, timeout=TIMEOUT)
|
||
results.append(('signoz_post', r.status_code == 200))
|
||
except Exception as e:
|
||
results.append(('signoz_post', False))
|
||
print(f'SignOz POST: {e}')
|
||
|
||
# Summary
|
||
passed = sum(1 for _, ok in results if ok)
|
||
total = len(results)
|
||
print(f'Smoke Test: {passed}/{total} passed')
|
||
for name, ok in results:
|
||
print(f' {\"✅\" if ok else \"❌\"} {name}')
|
||
|
||
sys.exit(0 if passed == total else 1)
|
||
" || {
|
||
echo "⚠️ Smoke Test 部分失敗,但不阻擋部署"
|
||
# 發送告警
|
||
curl -sf -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
|
||
-d chat_id="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
|
||
-d text="⚠️ *AWOOOI Alert Chain Smoke Test 部分失敗*%0A%0A部署已完成,但部分 Webhook 可能有問題。%0A%0A🔗 ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
|
||
-d parse_mode="Markdown" || true
|
||
}
|
||
|
||
# =======================================================================
|
||
# ADR-035: Telegram 告警鏈路 E2E 驗證
|
||
# 2026-03-29 Claude Code: 部署後必須驗證 Telegram 發送成功
|
||
# 🔴 鐵律: 失敗則告警,確保團隊知道鏈路斷裂
|
||
# =======================================================================
|
||
- name: "Verify Telegram Alert Chain (ADR-035)"
|
||
run: |
|
||
echo "🔍 驗證 Telegram 告警鏈路..."
|
||
|
||
# 發送測試告警到 AWOOOI API
|
||
API_POD=$(kubectl get pods -n awoooi-prod -l app=awoooi-api -o jsonpath='{.items[0].metadata.name}')
|
||
|
||
RESULT=$(kubectl exec -n awoooi-prod $API_POD -c api -- python -c "
|
||
import httpx
|
||
import json
|
||
r = httpx.post(
|
||
'http://localhost:8000/api/v1/webhooks/alertmanager',
|
||
json={
|
||
'receiver': 'cd-test',
|
||
'status': 'firing',
|
||
'alerts': [{
|
||
'status': 'firing',
|
||
'labels': {
|
||
'alertname': 'CD_E2E_Test_$(date +%s)',
|
||
'severity': 'info',
|
||
'namespace': 'cd-test',
|
||
'deployment': 'e2e-verify'
|
||
},
|
||
'annotations': {
|
||
'summary': 'CD 部署後 E2E 驗證 - ${{ github.sha }}'
|
||
}
|
||
}]
|
||
},
|
||
timeout=60
|
||
)
|
||
print(json.dumps(r.json()))
|
||
" 2>&1) || RESULT='{"success":false}'
|
||
|
||
echo "API Response: $RESULT"
|
||
|
||
# 檢查是否成功
|
||
SUCCESS=$(echo "$RESULT" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('success', False))" 2>/dev/null || echo "False")
|
||
|
||
if [ "$SUCCESS" != "True" ]; then
|
||
echo "🔴 Telegram 告警鏈路驗證失敗!"
|
||
echo "可能原因: Token 未配置、API 超時、LLM 服務不可用"
|
||
# 直接發送告警到 Telegram (繞過 API)
|
||
curl -sf -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
|
||
-d chat_id="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
|
||
-d text="🔴 *AWOOOI 告警鏈路驗證失敗*%0A%0A部署完成但告警鏈路可能斷裂!%0A請檢查 API Pod 日誌。%0A%0A🔗 ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
|
||
-d parse_mode="Markdown" || true
|
||
else
|
||
echo "✅ Telegram 告警鏈路驗證成功"
|
||
fi
|
||
|
||
- name: Notify OpenClaw
|
||
if: always()
|
||
run: |
|
||
STATUS="${{ job.status }}"
|
||
curl -sf -X POST "${{ env.OPENCLAW_URL }}/api/v1/webhook/pipeline" \
|
||
-H "Content-Type: application/json" \
|
||
-d "{
|
||
\"event\": \"completed\",
|
||
\"status\": \"${STATUS}\",
|
||
\"pipeline_id\": \"${{ github.run_id }}\",
|
||
\"pipeline_url\": \"${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\",
|
||
\"commit\": \"${{ github.sha }}\",
|
||
\"branch\": \"${{ github.ref_name }}\"
|
||
}" || true
|
||
|
||
- name: Notify Telegram
|
||
if: always()
|
||
run: |
|
||
# 取得 commit 資訊
|
||
COMMIT_MSG=$(git log -1 --pretty=format:'%s' | head -c 50)
|
||
AUTHOR=$(git log -1 --pretty=format:'%an')
|
||
DEPLOY_TIME=$(TZ='Asia/Taipei' date '+%Y-%m-%d %H:%M')
|
||
SHORT_SHA=$(echo "${{ github.sha }}" | head -c 7)
|
||
|
||
if [ "${{ job.status }}" = "success" ]; then
|
||
MSG="✅ *AWOOOI 部署成功*%0A%0A📦 版本: ${COMMIT_MSG}%0A⏰ 時間: ${DEPLOY_TIME}%0A👤 作者: ${AUTHOR}%0A🔖 SHA: ${SHORT_SHA}%0A%0A🔗 [查看 Workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
|
||
else
|
||
MSG="❌ *AWOOOI 部署失敗*%0A%0A📦 版本: ${COMMIT_MSG}%0A⏰ 時間: ${DEPLOY_TIME}%0A👤 作者: ${AUTHOR}%0A🔖 SHA: ${SHORT_SHA}%0A%0A🔗 [查看 Workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
|
||
fi
|
||
curl -sf -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
|
||
-d chat_id="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
|
||
-d text="${MSG}" \
|
||
-d parse_mode="Markdown" || true
|
||
|
||
# =======================================================================
|
||
# 2026-03-29 ogt: Gitea Mirror (B2 備份策略)
|
||
# 🔴 鐵律: GitHub 是主倉,Gitea 只讀備份 (無 CI)
|
||
# =======================================================================
|
||
- name: "Mirror to Gitea (B2 Backup)"
|
||
if: success()
|
||
run: |
|
||
echo "🪞 Mirror to Gitea..."
|
||
git remote add gitea "http://wooo:${{ secrets.GITEA_MIRROR_TOKEN }}@192.168.0.110:3001/wooo/awoooi.git" || true
|
||
git push gitea main --force || echo "⚠️ Gitea mirror failed (non-blocking)"
|