diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index 08e96e94..14aa99db 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -891,11 +891,97 @@ jobs:
           }
 
           # ─── Step 4: 等待 ArgoCD sync + rollout ───
+          # Keep a local copy of the remote output: the AWOOOI_ROLLOUT_* markers
+          # printed by the remote EXIT trap are parsed from it after ssh returns.
+          # errexit is off so the ssh status can be read from PIPESTATUS instead
+          # of aborting the step.
+          ROLLOUT_LOG="$(mktemp)"
+          set +e
           ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
-            "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
+            "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" 2>&1 << 'ARGOCD_WAIT' | tee "$ROLLOUT_LOG"
             set -e
             K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
             KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
+            RISK_FILE="$(mktemp)"
+            KUBECTL_ERR_FILE="$(mktemp)"
+            UNKNOWN_STATUS_COUNT=0
+            HEALTH_FAILURE_COUNT=0
+
+            # Append one risk record to RISK_FILE and echo it to stderr.
+            record_rollout_risk() {
+              local message="$1"
+              printf '%s\n' "$message" >> "$RISK_FILE"
+              echo "⚠️ Rollout risk observed: $message" >&2
+            }
+
+            # EXIT trap: print AWOOOI_ROLLOUT_RISK=0|1 on stdout, plus a one-line
+            # AWOOOI_ROLLOUT_SUMMARY when risks were recorded, then delete the
+            # scratch files.
+            emit_rollout_evidence() {
+              if [ -s "$RISK_FILE" ]; then
+                local summary
+                local kubectl_count
+                kubectl_count=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
+                # tr maps character to character, so it cannot turn a newline
+                # into "; ". Append the separator per record with sed, drop the
+                # newlines, then trim the trailing separator.
+                summary=$(sed -e 's/[[:cntrl:]]//g' -e 's/$/; /' "$RISK_FILE" | tr -d '\n' | sed 's/; $//' | cut -c1-700)
+                echo "AWOOOI_ROLLOUT_RISK=1"
+                echo "AWOOOI_ROLLOUT_SUMMARY=unknown_status_count=${UNKNOWN_STATUS_COUNT}; health_failure_count=${HEALTH_FAILURE_COUNT}; kubectl_failure_count=${kubectl_count}; ${summary}"
+              else
+                echo "AWOOOI_ROLLOUT_RISK=0"
+              fi
+              rm -f "$RISK_FILE" "$KUBECTL_ERR_FILE"
+            }
+            trap emit_rollout_evidence EXIT
+
+            # Print one jsonpath field of the awoooi-prod Application, or the
+            # literal "Unknown" when kubectl fails; the first three failures are
+            # recorded as risks. $1 = jsonpath expression, $2 = label for the record.
+            app_field() {
+              local jsonpath="$1"
+              local label="$2"
+              local output
+              local status
+              local kubectl_seen
+              set +e
+              # Keep stderr out of the captured value: a warning printed by a
+              # successful sudo/kubectl call must not leak into SYNC/HEALTH/REVISION.
+              output=$($KUBECTL get application awoooi-prod -n argocd -o jsonpath="$jsonpath" 2>"$KUBECTL_ERR_FILE")
+              status=$?
+              set -e
+              if [ "$status" -ne 0 ]; then
+                kubectl_seen=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
+                if [ "$kubectl_seen" -lt 3 ]; then
+                  record_rollout_risk "argocd_${label}_query_failed=$(head -c 180 "$KUBECTL_ERR_FILE")"
+                fi
+                printf 'Unknown'
+                return 0
+              fi
+              printf '%s' "$output"
+            }
+
+            # Probe the public health URL once. Every non-200 result increments
+            # HEALTH_FAILURE_COUNT and the first three are recorded as risks.
+            # $1 = phase label used in the risk record.
+            probe_public_health() {
+              local phase="$1"
+              local http_code
+              local status
+              set +e
+              http_code=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 3 --max-time 5 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
+              status=$?
+              set -e
+              if [ "$status" -ne 0 ]; then
+                http_code="curl_error_${status}"
+              fi
+              if [ "$http_code" != "200" ]; then
+                HEALTH_FAILURE_COUNT=$((HEALTH_FAILURE_COUNT + 1))
+                if [ "$HEALTH_FAILURE_COUNT" -le 3 ]; then
+                  record_rollout_risk "public_health_${phase}_http=${http_code}"
+                fi
+              fi
+            }
 
             # 等待 ArgoCD Application Synced(最多 180s)。只看
             # Synced/Healthy 可能誤判成上一個 revision 已同步,因此有
@@ -904,15 +990,19 @@ jobs:
             $KUBECTL annotate application awoooi-prod -n argocd \
               argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
             for i in $(seq 1 36); do
-              SYNC=$($KUBECTL get application awoooi-prod -n argocd \
-                -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown")
-              HEALTH=$($KUBECTL get application awoooi-prod -n argocd \
-                -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown")
-              REVISION=$($KUBECTL get application awoooi-prod -n argocd \
-                -o jsonpath='{.status.sync.revision}' 2>/dev/null || echo "Unknown")
+              SYNC=$(app_field '{.status.sync.status}' sync)
+              HEALTH=$(app_field '{.status.health.status}' health)
+              REVISION=$(app_field '{.status.sync.revision}' revision)
               SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
               SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
               echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
+              probe_public_health "argocd_wait"
+              if [ "$SYNC" = "Unknown" ] || [ "$HEALTH" = "Unknown" ] || [ "$REVISION" = "Unknown" ]; then
+                UNKNOWN_STATUS_COUNT=$((UNKNOWN_STATUS_COUNT + 1))
+                if [ "$UNKNOWN_STATUS_COUNT" -le 3 ]; then
+                  record_rollout_risk "argocd_status_unknown sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
+                fi
+              fi
               if [ "$SYNC" = "Synced" ] && [ "$HEALTH" = "Healthy" ]; then
                 if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then
                   echo "✅ ArgoCD Synced + Healthy"
@@ -945,10 +1035,47 @@ jobs:
               sleep 10
             done
             if [ "$HEALTH_PASS" = "0" ]; then
+              record_rollout_risk "public_health_final_failed"
               echo "❌ API 健康檢查失敗"
               exit 1
             fi
           ARGOCD_WAIT
+          # PIPESTATUS[0] is the exit status of ssh; $? would report tee instead.
+          ROLLOUT_EXIT=${PIPESTATUS[0]}
+          set -e
+
+          ROLLOUT_RISK="0"
+          ROLLOUT_SUMMARY=""
+          if grep -q '^AWOOOI_ROLLOUT_RISK=1$' "$ROLLOUT_LOG"; then
+            ROLLOUT_RISK="1"
+            ROLLOUT_SUMMARY=$(grep '^AWOOOI_ROLLOUT_SUMMARY=' "$ROLLOUT_LOG" | tail -1 | sed 's/^AWOOOI_ROLLOUT_SUMMARY=//' | cut -c1-700)
+          fi
+          # Hand the parsed evidence to later steps through the job environment file.
+          if [ -n "${GITHUB_ENV:-}" ]; then
+            {
+              echo "AWOOI_ROLLOUT_RISK=${ROLLOUT_RISK}"
+              echo "AWOOI_ROLLOUT_SUMMARY=${ROLLOUT_SUMMARY}"
+            } >> "$GITHUB_ENV"
+          fi
+          rm -f "$ROLLOUT_LOG"
+
+          # The rollout succeeded but risks were observed on the way: send a
+          # best-effort "pending" notification; its failure never fails the job.
+          if [ "$ROLLOUT_EXIT" -eq 0 ] && [ "$ROLLOUT_RISK" = "1" ]; then
+            ACTOR="${GITHUB_ACTOR:-${{ github.actor }}}"
+            if AWOOI_CICD_STATUS=pending \
+              AWOOI_CICD_STAGE=rollout-risk \
+              AWOOI_CICD_JOB_NAME="AWOOOI 部署風險已恢復" \
+              AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
+              AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
+              AWOOI_CICD_SUMMARY="${ROLLOUT_SUMMARY}" \
+              scripts/ci/notify-awoooi-cicd.sh; then
+              echo "✅ CI/CD rollout risk notification mirrored through AWOOI API"
+            else
+              echo "⚠️ CI/CD rollout risk notification failed (non-fatal)"
+            fi
+          fi
+          exit "$ROLLOUT_EXIT"
 
       # 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
       # 188 deploy key is rotated and must not be read by this disabled step.
@@ -970,6 +1097,8 @@ jobs:
           COMMIT_MSG="${{ steps.commit.outputs.message }}"
           SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
           ACTOR="${{ github.actor }}"
+          # Prefer the rollout evidence exported by the deploy step; fall back to the commit message.
+          FAILURE_SUMMARY="${AWOOI_ROLLOUT_SUMMARY:-${COMMIT_MSG}}"
           COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
           MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
           if AWOOI_CICD_STATUS=failed \
@@ -977,7 +1106,7 @@ jobs:
             AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
             AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
             AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
-            AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
+            AWOOI_CICD_SUMMARY="${FAILURE_SUMMARY}" \
             scripts/ci/notify-awoooi-cicd.sh; then
             echo "✅ CI/CD build failure notification mirrored through AWOOI API"
           else