ci(cd): surface recovered rollout risk evidence
All checks were successful
Code Review / ai-code-review (push) Successful in 10s

This commit is contained in:
Your Name
2026-05-21 19:37:30 +08:00
parent 4887708717
commit 8e68dc1e35

View File

@@ -891,11 +891,77 @@ jobs:
}
# ─── Step 4: wait for ArgoCD sync + rollout ───
# Temp file on the runner; the `tee` variant of the ssh command below copies
# everything the remote script prints into it.
ROLLOUT_LOG="$(mktemp)"
# errexit is switched off for the ssh pipeline and re-enabled after the
# heredoc, once the ssh exit status has been captured from PIPESTATUS.
set +e
# NOTE(review): the two `bash -s` lines below look like the before/after
# versions of a single line from a diff. Only the second one (2>&1 … | tee)
# matches the later PIPESTATUS / ROLLOUT_LOG handling — confirm which one is
# actually in the file.
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" 2>&1 << 'ARGOCD_WAIT' | tee "$ROLLOUT_LOG"
# ── Everything from here to the ARGOCD_WAIT terminator is fed to the remote
# ── `bash -s` on stdin. The heredoc delimiter is quoted, so the local shell
# ── does not expand any $… in it.
set -e
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
# Command prefix used (unquoted, so it word-splits) for every kubectl call.
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
# Scratch file: one line is appended per recorded rollout risk.
RISK_FILE="$(mktemp)"
# Counters that are reported in the final AWOOOI_ROLLOUT_SUMMARY line.
UNKNOWN_STATUS_COUNT=0
HEALTH_FAILURE_COUNT=0
# Record one rollout-risk observation.
# Globals:   RISK_FILE (appended to)
# Arguments: $1 - free-form risk message
# Outputs:   a warning line on stderr
#
# RISK_FILE is consumed line by line (records are counted with `grep -c '^…'`
# and joined per line into the summary), so the message is flattened first:
# every control character, including embedded newlines, becomes a space and
# each call therefore produces exactly one record.
record_rollout_risk() {
  local message="$1"
  message=${message//[[:cntrl:]]/ }
  printf '%s\n' "$message" >> "$RISK_FILE"
  echo "⚠️ Rollout risk observed: $message" >&2
}
# Print the machine-readable rollout evidence on stdout and delete RISK_FILE.
# Globals:   RISK_FILE (read, then removed), UNKNOWN_STATUS_COUNT (read),
#            HEALTH_FAILURE_COUNT (read)
# Outputs:   "AWOOOI_ROLLOUT_RISK=0" when no risk was recorded; otherwise
#            "AWOOOI_ROLLOUT_RISK=1" followed by one "AWOOOI_ROLLOUT_SUMMARY=…"
#            line holding the counters and the recorded risks joined by "; ".
emit_rollout_evidence() {
  local summary=""
  local kubectl_count
  local line
  if [ -s "$RISK_FILE" ]; then
    # grep -c prints 0 but exits 1 when nothing matches; `|| true` keeps the
    # count without tripping errexit.
    kubectl_count=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
    # Join the records with a real "; " separator. (`tr '\n' '; '` cannot do
    # this: tr maps character to character, so only ';' was ever emitted.)
    while IFS= read -r line || [ -n "$line" ]; do
      line=${line//[[:cntrl:]]/}
      [ -n "$line" ] || continue
      summary="${summary:+${summary}; }${line}"
    done < "$RISK_FILE"
    # Keep the summary to at most 700 characters.
    summary=${summary:0:700}
    echo "AWOOOI_ROLLOUT_RISK=1"
    echo "AWOOOI_ROLLOUT_SUMMARY=unknown_status_count=${UNKNOWN_STATUS_COUNT}; health_failure_count=${HEALTH_FAILURE_COUNT}; kubectl_failure_count=${kubectl_count:-0}; ${summary}"
  else
    echo "AWOOOI_ROLLOUT_RISK=0"
  fi
  rm -f "$RISK_FILE"
}
# Run emit_rollout_evidence whenever the remote script exits, so the
# AWOOOI_ROLLOUT_RISK marker is printed on success and on `exit 1` alike.
trap emit_rollout_evidence EXIT
# Read one field of the ArgoCD application `awoooi-prod` via kubectl.
# Globals:   KUBECTL (read, word-split on purpose), RISK_FILE (read)
# Arguments: $1 - jsonpath expression, e.g. '{.status.sync.status}'
#            $2 - short label used in the risk record (sync/health/revision)
# Outputs:   the field value on stdout, or the literal "Unknown" when the
#            query fails
# Returns:   always 0, so a failed query never aborts the caller under errexit
#
# stderr is captured separately from stdout: a warning printed on stderr by a
# successful query must not leak into the returned value, otherwise
# comparisons such as `= "Synced"` would never match. The stderr text is only
# used as failure detail.
app_field() {
  local jsonpath="$1"
  local label="$2"
  local output
  local status
  local kubectl_seen
  local err_file
  local detail
  set +e
  # Fall back to discarding stderr if no temp file can be created.
  err_file=$(mktemp 2>/dev/null) || err_file=/dev/null
  output=$($KUBECTL get application awoooi-prod -n argocd -o jsonpath="$jsonpath" 2>"$err_file")
  status=$?
  detail=$(cat "$err_file" 2>/dev/null)
  [ "$err_file" = /dev/null ] || rm -f "$err_file"
  set -e
  if [ "$status" -ne 0 ]; then
    # Prefer the error text; fall back to whatever was printed on stdout.
    [ -n "$detail" ] || detail="$output"
    # One record per line: flatten control characters (incl. newlines).
    detail=${detail//[[:cntrl:]]/ }
    # Record at most three query failures to keep the summary short.
    kubectl_seen=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
    if [ "${kubectl_seen:-0}" -lt 3 ]; then
      record_rollout_risk "argocd_${label}_query_failed=${detail:0:180}"
    fi
    printf 'Unknown'
    return 0
  fi
  printf '%s' "$output"
}
# Probe the public API health URL once and record a risk when it is not 200.
# Globals:   HEALTH_FAILURE_COUNT (incremented on every non-200 probe)
# Arguments: $1 - phase label embedded in the risk record
# Only the first three failures are written as risk records; later ones are
# still counted.
probe_public_health() {
  local phase="$1"
  local code
  local rc

  # errexit is suspended so a curl failure is captured instead of being fatal.
  set +e
  code=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 3 --max-time 5 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
  rc=$?
  set -e

  # A transport-level failure is reported with curl's exit code instead of
  # an HTTP status.
  [ "$rc" -eq 0 ] || code="curl_error_${rc}"

  # Healthy response: nothing to count or record.
  [ "$code" != "200" ] || return 0

  HEALTH_FAILURE_COUNT=$((HEALTH_FAILURE_COUNT + 1))
  if [ "$HEALTH_FAILURE_COUNT" -gt 3 ]; then
    return 0
  fi
  record_rollout_risk "public_health_${phase}_http=${code}"
}
# Wait for the ArgoCD Application to reach Synced, for at most 180s. Checking only
# Synced/Healthy could mistake an already-synced previous revision for success, so there is
@@ -904,15 +970,19 @@ jobs:
# Request a hard refresh of the application; output is discarded and any
# failure is ignored (best effort).
$KUBECTL annotate application awoooi-prod -n argocd \
argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
# Poll the application state up to 36 times.
for i in $(seq 1 36); do
# NOTE(review): SYNC/HEALTH/REVISION are assigned twice in a row here. The
# direct kubectl queries look like the pre-change side of a diff and are
# overwritten by the app_field calls that follow — confirm which set is
# actually in the file.
SYNC=$($KUBECTL get application awoooi-prod -n argocd \
-o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown")
HEALTH=$($KUBECTL get application awoooi-prod -n argocd \
-o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown")
REVISION=$($KUBECTL get application awoooi-prod -n argocd \
-o jsonpath='{.status.sync.revision}' 2>/dev/null || echo "Unknown")
SYNC=$(app_field '{.status.sync.status}' sync)
HEALTH=$(app_field '{.status.health.status}' health)
REVISION=$(app_field '{.status.sync.revision}' revision)
# First 8 characters of each revision, used for log output and risk records.
SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
# Probe the public health endpoint on every poll iteration.
probe_public_health "argocd_wait"
# Count every iteration in which any of the three fields is "Unknown"; only
# the first three such iterations are written as risk records.
if [ "$SYNC" = "Unknown" ] || [ "$HEALTH" = "Unknown" ] || [ "$REVISION" = "Unknown" ]; then
UNKNOWN_STATUS_COUNT=$((UNKNOWN_STATUS_COUNT + 1))
if [ "$UNKNOWN_STATUS_COUNT" -le 3 ]; then
record_rollout_risk "argocd_status_unknown sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
fi
fi
# Success requires Synced + Healthy and, when EXPECTED_REVISION is non-empty,
# an exact match of the full revision.
if [ "$SYNC" = "Synced" ] && [ "$HEALTH" = "Healthy" ]; then
if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then
echo "✅ ArgoCD Synced + Healthy"
@@ -945,10 +1015,43 @@ jobs:
sleep 10
done
# Final gate of the remote script. HEALTH_PASS is set outside the visible
# hunk; the value "0" is treated as failure here. The risk is recorded before
# exiting so that the EXIT trap includes it in the emitted evidence.
if [ "$HEALTH_PASS" = "0" ]; then
record_rollout_risk "public_health_final_failed"
echo "❌ API 健康檢查失敗"
exit 1
fi
ARGOCD_WAIT
# Exit status of the first pipeline stage (ssh), not of tee. This must stay
# the first command after the pipeline: any other command overwrites
# PIPESTATUS.
ROLLOUT_EXIT=${PIPESTATUS[0]}
set -e
ROLLOUT_RISK="0"
ROLLOUT_SUMMARY=""
# The remote script prints an AWOOOI_ROLLOUT_RISK=1 marker line plus an
# AWOOOI_ROLLOUT_SUMMARY=… line when it recorded at least one risk. Take the
# last summary line, strip the key, and cap it at 700 characters.
if grep -q '^AWOOOI_ROLLOUT_RISK=1$' "$ROLLOUT_LOG"; then
ROLLOUT_RISK="1"
ROLLOUT_SUMMARY=$(grep '^AWOOOI_ROLLOUT_SUMMARY=' "$ROLLOUT_LOG" | tail -1 | sed 's/^AWOOOI_ROLLOUT_SUMMARY=//' | cut -c1-700)
fi
# Append both values to the file named by GITHUB_ENV when that variable is
# set. Note that the exported names use the AWOOI_ prefix, unlike the AWOOOI_
# markers parsed above.
if [ -n "${GITHUB_ENV:-}" ]; then
{
echo "AWOOI_ROLLOUT_RISK=${ROLLOUT_RISK}"
echo "AWOOI_ROLLOUT_SUMMARY=${ROLLOUT_SUMMARY}"
} >> "$GITHUB_ENV"
fi
rm -f "$ROLLOUT_LOG"
# Rollout succeeded but risks were observed along the way: send a
# "rollout-risk" notification. The notifier runs as the `if` condition, so
# its failure is only logged and never fails the step.
if [ "$ROLLOUT_EXIT" -eq 0 ] && [ "$ROLLOUT_RISK" = "1" ]; then
ACTOR="${GITHUB_ACTOR:-${{ github.actor }}}"
if AWOOI_CICD_STATUS=pending \
AWOOI_CICD_STAGE=rollout-risk \
AWOOI_CICD_JOB_NAME="AWOOOI 部署風險已恢復" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
AWOOI_CICD_SUMMARY="${ROLLOUT_SUMMARY}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "✅ CI/CD rollout risk notification mirrored through AWOOI API"
else
echo "⚠️ CI/CD rollout risk notification failed (non-fatal)"
fi
fi
# Finish the step with the remote script's exit status.
exit "$ROLLOUT_EXIT"
# 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
# 188 deploy key is rotated and must not be read by this disabled step.
@@ -970,6 +1073,7 @@ jobs:
COMMIT_MSG="${{ steps.commit.outputs.message }}"
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
ACTOR="${{ github.actor }}"
# Use the rollout risk summary from the environment when it is set and
# non-empty; otherwise fall back to the commit message.
FAILURE_SUMMARY="${AWOOI_ROLLOUT_SUMMARY:-${COMMIT_MSG}}"
# Escape &, < and > so the commit message can be embedded in the HTML-style
# markup of the message below.
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')
# Failure message text with the escaped commit message, short SHA and actor
# filled in.
MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
if AWOOI_CICD_STATUS=failed \
@@ -977,7 +1081,7 @@ jobs:
AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
AWOOI_CICD_SUMMARY="${FAILURE_SUMMARY}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "✅ CI/CD build failure notification mirrored through AWOOI API"
else