fix(cd): ADR-035 Telegram Secrets 自動注入三層防護
🔴 事故根因: K8s Secrets 未注入,Telegram 告警長時間失效
- kustomization.yaml 說「由 CI/CD 處理」但 CD 從未執行

🛡️ 三層防護機制:
- Layer 1: Pre-flight 檢查 GitHub Secrets 存在
- Layer 2: Deploy 時 kubectl patch secret 自動注入
- Layer 3: Post-Deploy E2E 測試告警驗證

📄 文件更新:
- ADR-035: docs/adr/ADR-035-telegram-alert-chain-enforcement.md
- DevOps Skill v1.9: 新增 Secrets 注入鐵律
- CLAUDE.md: 新增告警鏈路章節
- LOGBOOK: 事故記錄

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
99
.github/workflows/cd.yaml
vendored
99
.github/workflows/cd.yaml
vendored
@@ -64,17 +64,26 @@ jobs:
|
||||
rm -rf "$RUNNER_ROOT/_diag/pages"/* 2>/dev/null || true
|
||||
rm -rf .claude/worktrees 2>/dev/null || true
|
||||
|
||||
# =======================================================================
# ADR-035: mandatory verification of the Telegram alert chain
# 2026-03-29 Claude Code: fixes alerts going silent because Secrets were missing
# =======================================================================
- name: "Check Required Secrets"
  # Pre-flight gate: fail the job early when any required GitHub Secret is
  # empty (an undefined secret expands to an empty string).
  #
  # The values reach the script through `env` instead of being pasted into
  # the script text with ${{ secrets.* }}. Pasted values become part of the
  # shell source, so a quote, `$`, backtick or newline inside a secret
  # (KUBE_CONFIG_PROD is presumably a multi-line kubeconfig) could break the
  # test or be executed as code.
  env:
    HARBOR_USER: ${{ secrets.HARBOR_USER }}
    HARBOR_PASSWORD: ${{ secrets.HARBOR_PASSWORD }}
    KUBE_CONFIG_PROD: ${{ secrets.KUBE_CONFIG_PROD }}
    OPENCLAW_TG_BOT_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
    OPENCLAW_TG_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }}
  run: |
    # Space-separated names of every secret found empty.
    MISSING=""

    # Base secrets
    if [ -z "$HARBOR_USER" ]; then MISSING="${MISSING}HARBOR_USER "; fi
    if [ -z "$HARBOR_PASSWORD" ]; then MISSING="${MISSING}HARBOR_PASSWORD "; fi
    if [ -z "$KUBE_CONFIG_PROD" ]; then MISSING="${MISSING}KUBE_CONFIG_PROD "; fi

    # Telegram secrets (mandatory per ADR-035)
    if [ -z "$OPENCLAW_TG_BOT_TOKEN" ]; then MISSING="${MISSING}OPENCLAW_TG_BOT_TOKEN "; fi
    if [ -z "$OPENCLAW_TG_CHAT_ID" ]; then MISSING="${MISSING}OPENCLAW_TG_CHAT_ID "; fi

    # Report all missing names at once so a single run shows the full list.
    if [ -n "$MISSING" ]; then
      echo "❌ 缺少 Secrets: ${MISSING}"
      echo "🔴 告警鏈路將無法運作!請檢查 GitHub Secrets 配置"
      exit 1
    fi
    echo "✅ Secrets 檢查通過 (含 Telegram)"
|
||||
|
||||
- name: "Check Harbor Connectivity"
|
||||
run: |
|
||||
@@ -259,6 +268,39 @@ jobs:
|
||||
id: tag
|
||||
run: echo "tag=$(git rev-parse --short HEAD)-${{ github.run_id }}" >> $GITHUB_OUTPUT
|
||||
|
||||
# =======================================================================
# ADR-035: automatic K8s Secrets injection (Telegram + API keys)
# 2026-03-29 Claude Code: fixes alerts going silent because Secrets were missing
# Iron rule: every deployment must make sure the Secrets are current
# =======================================================================
- name: "Sync K8s Secrets (ADR-035)"
  # Creates `awoooi-secrets` in `awoooi-prod` when it is absent; otherwise
  # refreshes only the two Telegram keys on the existing Secret.
  #
  # Secret values are passed through `env`, never pasted into the script
  # text, so special characters in a value cannot break shell quoting or
  # be executed as code.
  env:
    OPENCLAW_TG_BOT_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
    OPENCLAW_TG_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }}
    DATABASE_URL: ${{ secrets.DATABASE_URL }}
    REDIS_URL: ${{ secrets.REDIS_URL }}
    GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
    CLAUDE_API_KEY: ${{ secrets.CLAUDE_API_KEY }}
    WEBHOOK_HMAC_SECRET: ${{ secrets.WEBHOOK_HMAC_SECRET }}
    SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
  run: |
    echo "🔐 同步 K8s Secrets..."

    # Any failure of `kubectl get` (including auth/network errors) takes the
    # create path; a real connectivity problem then surfaces as a failing
    # `kubectl create`.
    if ! kubectl get secret awoooi-secrets -n awoooi-prod > /dev/null 2>&1; then
      echo "📦 創建 awoooi-secrets..."
      kubectl create secret generic awoooi-secrets -n awoooi-prod \
        --from-literal=OPENCLAW_TG_BOT_TOKEN="$OPENCLAW_TG_BOT_TOKEN" \
        --from-literal=OPENCLAW_TG_CHAT_ID="$OPENCLAW_TG_CHAT_ID" \
        --from-literal=DATABASE_URL="$DATABASE_URL" \
        --from-literal=REDIS_URL="$REDIS_URL" \
        --from-literal=GEMINI_API_KEY="$GEMINI_API_KEY" \
        --from-literal=CLAUDE_API_KEY="$CLAUDE_API_KEY" \
        --from-literal=WEBHOOK_HMAC_SECRET="$WEBHOOK_HMAC_SECRET" \
        --from-literal=SENTRY_DSN="$SENTRY_DSN"
    else
      echo "🔄 更新 awoooi-secrets..."
      # Merge-patch only the Telegram keys so they always match GitHub
      # Secrets; every other key keeps its current in-cluster value.
      # The JSON is produced by json.dumps so that quotes or backslashes in
      # a value are escaped correctly instead of corrupting the patch.
      # NOTE(review): assumes python3 is available on the runner for this
      # step - confirm.
      PATCH=$(python3 -c 'import json, os; print(json.dumps({"stringData": {k: os.environ[k] for k in ("OPENCLAW_TG_BOT_TOKEN", "OPENCLAW_TG_CHAT_ID")}}))')
      kubectl patch secret awoooi-secrets -n awoooi-prod --type='merge' -p "$PATCH"
    fi
    echo "✅ K8s Secrets 同步完成"
|
||||
|
||||
- name: Deploy
|
||||
run: |
|
||||
cd k8s/awoooi-prod
|
||||
@@ -304,6 +346,61 @@ jobs:
|
||||
# 使用 Python httpx (容器沒有 curl,但有 httpx)
|
||||
kubectl exec -n awoooi-prod $API_POD -c api -- python -c "import httpx; r=httpx.get('http://localhost:8000/api/v1/health', timeout=5); print(r.status_code)" || echo "Health check failed but deployment succeeded"
|
||||
|
||||
# =======================================================================
# ADR-035: end-to-end verification of the Telegram alert chain
# 2026-03-29 Claude Code: after deploying, Telegram delivery must be verified
# Iron rule: on failure send an alert so the team knows the chain is broken
# =======================================================================
- name: "Verify Telegram Alert Chain (ADR-035)"
  # Posts a synthetic Alertmanager payload to the API from inside an API pod
  # and checks the `success` field of the JSON reply. When verification
  # fails, a message is sent straight to the Telegram Bot API (bypassing the
  # application); the step itself still exits 0 in that case.
  #
  # The Telegram credentials are passed through `env` rather than pasted
  # into the script text, so special characters in them cannot break quoting.
  env:
    OPENCLAW_TG_BOT_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
    OPENCLAW_TG_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }}
  run: |
    echo "🔍 驗證 Telegram 告警鏈路..."

    # Pick the first API pod. A failed lookup (e.g. no pod matches) must not
    # abort the step, otherwise the fallback alert below would never be
    # sent; it is treated as a failed verification instead.
    API_POD=$(kubectl get pods -n awoooi-prod -l app=awoooi-api -o jsonpath='{.items[0].metadata.name}') || API_POD=""

    # Send the test alert to the AWOOOI API from inside the pod.
    # - Only stdout is captured: stderr (kubectl notices, Python tracebacks)
    #   goes to the step log so it cannot corrupt the JSON parsed below.
    # - $(date +%s) is expanded by the runner's shell before Python runs and
    #   makes each alert name unique.
    # - The top-level Python statements must stay at the script's base
    #   indentation: that indentation is stripped by the YAML block scalar,
    #   leaving them at column 0 for `python -c`.
    RESULT=$([ -n "$API_POD" ] && kubectl exec -n awoooi-prod "$API_POD" -c api -- python -c "
    import httpx
    import json
    r = httpx.post(
        'http://localhost:8000/api/v1/webhooks/alertmanager',
        json={
            'receiver': 'cd-test',
            'status': 'firing',
            'alerts': [{
                'status': 'firing',
                'labels': {
                    'alertname': 'CD_E2E_Test_$(date +%s)',
                    'severity': 'info',
                    'namespace': 'cd-test',
                    'deployment': 'e2e-verify'
                },
                'annotations': {
                    'summary': 'CD 部署後 E2E 驗證 - ${{ github.sha }}'
                }
            }]
        },
        timeout=60
    )
    print(json.dumps(r.json()))
    ") || RESULT='{"success":false}'

    echo "API Response: $RESULT"

    # Extract the `success` flag; anything unparsable counts as failure.
    # Python prints the JSON boolean true as the string "True".
    SUCCESS=$(echo "$RESULT" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('success', False))" 2>/dev/null || echo "False")

    if [ "$SUCCESS" != "True" ]; then
      echo "🔴 Telegram 告警鏈路驗證失敗!"
      echo "可能原因: Token 未配置、API 超時、LLM 服務不可用"
      # Alert Telegram directly (bypassing the API). Best effort: a failure
      # to deliver this message is deliberately ignored.
      curl -sf -X POST "https://api.telegram.org/bot${OPENCLAW_TG_BOT_TOKEN}/sendMessage" \
        -d chat_id="${OPENCLAW_TG_CHAT_ID}" \
        -d text="🔴 *AWOOOI 告警鏈路驗證失敗*%0A%0A部署完成但告警鏈路可能斷裂!%0A請檢查 API Pod 日誌。%0A%0A🔗 ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
        -d parse_mode="Markdown" || true
    else
      echo "✅ Telegram 告警鏈路驗證成功"
    fi
|
||||
|
||||
- name: Notify OpenClaw
|
||||
if: always()
|
||||
run: |
|
||||
|
||||
Reference in New Issue
Block a user