ci(cd): surface recovered rollout risk evidence
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
This commit is contained in:
@@ -891,11 +891,77 @@ jobs:
|
||||
}
|
||||
|
||||
# ─── Step 4: 等待 ArgoCD sync + rollout ───
|
||||
ROLLOUT_LOG="$(mktemp)"
|
||||
set +e
|
||||
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
|
||||
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
|
||||
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" 2>&1 << 'ARGOCD_WAIT' | tee "$ROLLOUT_LOG"
|
||||
set -e
|
||||
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
|
||||
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
|
||||
RISK_FILE="$(mktemp)"
|
||||
UNKNOWN_STATUS_COUNT=0
|
||||
HEALTH_FAILURE_COUNT=0
|
||||
|
||||
# Record one rollout-risk observation.
# Globals:   RISK_FILE (appended to)
# Arguments: $1 - risk message, e.g. "argocd_sync_query_failed=<detail>"
# Outputs:   appends exactly one line to RISK_FILE; writes a warning to stderr
record_rollout_risk() {
  local message="$1"

  # Callers may pass raw command output that contains line breaks. The risk
  # file is consumed line by line (grep -c on record prefixes, summary join),
  # so collapse the message to a single line: one risk == one record.
  message=${message//$'\r'/}
  message=${message//$'\n'/ }

  printf '%s\n' "$message" >> "$RISK_FILE"
  printf '%s\n' "⚠️ Rollout risk observed: $message" >&2
}
|
||||
|
||||
# Print the machine-readable rollout evidence markers and remove the risk file.
# Globals:   RISK_FILE (read, then deleted)
#            UNKNOWN_STATUS_COUNT, HEALTH_FAILURE_COUNT (read)
# Outputs:   "AWOOOI_ROLLOUT_RISK=1" plus one "AWOOOI_ROLLOUT_SUMMARY=..." line
#            when RISK_FILE is non-empty, otherwise "AWOOOI_ROLLOUT_RISK=0".
emit_rollout_evidence() {
  local summary
  local kubectl_count

  if [ -s "$RISK_FILE" ]; then
    # grep -c exits non-zero when nothing matches; that is not an error here.
    kubectl_count=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)

    # Join the recorded risks with "; ". tr cannot do this: it maps one
    # character to one character, so `tr '\n' '; '` emits only ';' and leaves
    # a trailing separator. Control characters are stripped per record and the
    # joined text is capped at 700 characters.
    summary=$(
      sed 's/[[:cntrl:]]//g' "$RISK_FILE" \
        | awk 'length($0) { printf "%s%s", sep, $0; sep = "; " } END { if (sep != "") print "" }' \
        | cut -c1-700
    )

    echo "AWOOOI_ROLLOUT_RISK=1"
    echo "AWOOOI_ROLLOUT_SUMMARY=unknown_status_count=${UNKNOWN_STATUS_COUNT}; health_failure_count=${HEALTH_FAILURE_COUNT}; kubectl_failure_count=${kubectl_count:-0}; ${summary}"
  else
    echo "AWOOOI_ROLLOUT_RISK=0"
  fi

  rm -f "$RISK_FILE"
}
|
||||
trap emit_rollout_evidence EXIT
|
||||
|
||||
# Read one field of the ArgoCD Application "awoooi-prod" via kubectl.
# Globals:   KUBECTL (read; command line, intentionally word-split)
#            RISK_FILE (read, to cap how many query failures are recorded)
# Arguments: $1 - jsonpath expression, e.g. '{.status.sync.status}'
#            $2 - short label used in the risk record (sync/health/revision)
# Outputs:   the field value on stdout, or the literal "Unknown" when the
#            query fails
# Returns:   0 in both cases, so callers can use it inside $(...) safely
app_field() {
  local jsonpath="$1"
  local label="$2"
  local output
  local status=0
  local err_file
  local detail
  local kubectl_seen

  # Keep stderr out of the returned value: a warning printed by kubectl/sudo
  # on an otherwise successful call must not end up in the field, or the
  # caller's exact comparisons (e.g. = "Synced") would never match.
  err_file=$(mktemp 2>/dev/null) || err_file=/dev/null

  # shellcheck disable=SC2086 — KUBECTL holds a command plus its flags.
  output=$($KUBECTL get application awoooi-prod -n argocd -o jsonpath="$jsonpath" 2>"$err_file") || status=$?

  if [ "$status" -eq 0 ]; then
    [ "$err_file" = /dev/null ] || rm -f "$err_file"
    printf '%s' "$output"
    return 0
  fi

  # Failure: prefer the error text, fall back to whatever stdout carried.
  detail=$(head -c 180 "$err_file" 2>/dev/null || true)
  [ "$err_file" = /dev/null ] || rm -f "$err_file"
  if [ -z "$detail" ]; then
    detail=$(printf '%s' "$output" | head -c 180)
  fi

  # Record at most three query failures so the evidence stays short.
  kubectl_seen=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
  if [ "${kubectl_seen:-0}" -lt 3 ]; then
    # Evidence is best-effort; never let it break the status query itself.
    record_rollout_risk "argocd_${label}_query_failed=${detail}" || true
  fi

  printf 'Unknown'
  return 0
}
|
||||
|
||||
# Probe the public health endpoint once and note any non-200 result.
# Globals:   HEALTH_FAILURE_COUNT (incremented on every non-200 probe)
# Arguments: $1 - phase label embedded in the risk record
# Outputs:   nothing on stdout; the first three failures are handed to
#            record_rollout_risk
probe_public_health() {
  local phase="$1"
  local code
  local rc

  # curl failing is an expected outcome here, so errexit is suspended around
  # the call and restored right after.
  set +e
  code=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 3 --max-time 5 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
  rc=$?
  set -e

  # A transport-level failure is reported through curl's exit status instead
  # of an HTTP status.
  [ "$rc" -eq 0 ] || code="curl_error_${rc}"

  # Healthy answer: nothing to count or record.
  if [ "$code" = "200" ]; then
    return 0
  fi

  HEALTH_FAILURE_COUNT=$((HEALTH_FAILURE_COUNT + 1))

  # Only the first three failures are recorded; later ones are just counted.
  if [ "$HEALTH_FAILURE_COUNT" -gt 3 ]; then
    return 0
  fi
  record_rollout_risk "public_health_${phase}_http=${code}"
}
|
||||
|
||||
# 等待 ArgoCD Application Synced(最多 180s)。只看
|
||||
# Synced/Healthy 可能誤判成上一個 revision 已同步,因此有
|
||||
@@ -904,15 +970,19 @@ jobs:
|
||||
$KUBECTL annotate application awoooi-prod -n argocd \
|
||||
argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
|
||||
for i in $(seq 1 36); do
|
||||
SYNC=$($KUBECTL get application awoooi-prod -n argocd \
|
||||
-o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown")
|
||||
HEALTH=$($KUBECTL get application awoooi-prod -n argocd \
|
||||
-o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown")
|
||||
REVISION=$($KUBECTL get application awoooi-prod -n argocd \
|
||||
-o jsonpath='{.status.sync.revision}' 2>/dev/null || echo "Unknown")
|
||||
SYNC=$(app_field '{.status.sync.status}' sync)
|
||||
HEALTH=$(app_field '{.status.health.status}' health)
|
||||
REVISION=$(app_field '{.status.sync.revision}' revision)
|
||||
SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
|
||||
SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
|
||||
echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
|
||||
probe_public_health "argocd_wait"
|
||||
if [ "$SYNC" = "Unknown" ] || [ "$HEALTH" = "Unknown" ] || [ "$REVISION" = "Unknown" ]; then
|
||||
UNKNOWN_STATUS_COUNT=$((UNKNOWN_STATUS_COUNT + 1))
|
||||
if [ "$UNKNOWN_STATUS_COUNT" -le 3 ]; then
|
||||
record_rollout_risk "argocd_status_unknown sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
|
||||
fi
|
||||
fi
|
||||
if [ "$SYNC" = "Synced" ] && [ "$HEALTH" = "Healthy" ]; then
|
||||
if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then
|
||||
echo "✅ ArgoCD Synced + Healthy"
|
||||
@@ -945,10 +1015,43 @@ jobs:
|
||||
sleep 10
|
||||
done
|
||||
if [ "$HEALTH_PASS" = "0" ]; then
|
||||
record_rollout_risk "public_health_final_failed"
|
||||
echo "❌ API 健康檢查失敗"
|
||||
exit 1
|
||||
fi
|
||||
ARGOCD_WAIT
|
||||
ROLLOUT_EXIT=${PIPESTATUS[0]}
|
||||
set -e
|
||||
|
||||
ROLLOUT_RISK="0"
|
||||
ROLLOUT_SUMMARY=""
|
||||
if grep -q '^AWOOOI_ROLLOUT_RISK=1$' "$ROLLOUT_LOG"; then
|
||||
ROLLOUT_RISK="1"
|
||||
ROLLOUT_SUMMARY=$(grep '^AWOOOI_ROLLOUT_SUMMARY=' "$ROLLOUT_LOG" | tail -1 | sed 's/^AWOOOI_ROLLOUT_SUMMARY=//' | cut -c1-700)
|
||||
fi
|
||||
if [ -n "${GITHUB_ENV:-}" ]; then
|
||||
{
|
||||
echo "AWOOI_ROLLOUT_RISK=${ROLLOUT_RISK}"
|
||||
echo "AWOOI_ROLLOUT_SUMMARY=${ROLLOUT_SUMMARY}"
|
||||
} >> "$GITHUB_ENV"
|
||||
fi
|
||||
rm -f "$ROLLOUT_LOG"
|
||||
|
||||
if [ "$ROLLOUT_EXIT" -eq 0 ] && [ "$ROLLOUT_RISK" = "1" ]; then
|
||||
ACTOR="${GITHUB_ACTOR:-${{ github.actor }}}"
|
||||
if AWOOI_CICD_STATUS=pending \
|
||||
AWOOI_CICD_STAGE=rollout-risk \
|
||||
AWOOI_CICD_JOB_NAME="AWOOOI 部署風險已恢復" \
|
||||
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
|
||||
AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
|
||||
AWOOI_CICD_SUMMARY="${ROLLOUT_SUMMARY}" \
|
||||
scripts/ci/notify-awoooi-cicd.sh; then
|
||||
echo "✅ CI/CD rollout risk notification mirrored through AWOOI API"
|
||||
else
|
||||
echo "⚠️ CI/CD rollout risk notification failed (non-fatal)"
|
||||
fi
|
||||
fi
|
||||
exit "$ROLLOUT_EXIT"
|
||||
|
||||
# 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
|
||||
# 188 deploy key is rotated and must not be read by this disabled step.
|
||||
@@ -970,6 +1073,7 @@ jobs:
|
||||
COMMIT_MSG="${{ steps.commit.outputs.message }}"
|
||||
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
|
||||
ACTOR="${{ github.actor }}"
|
||||
FAILURE_SUMMARY="${AWOOI_ROLLOUT_SUMMARY:-${COMMIT_MSG}}"
|
||||
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&/g; s/</\</g; s/>/\>/g')
|
||||
MSG=$(printf '❌ <b>AWOOOI 部署失敗</b>\n├ 📝 <code>%s</code>\n├ 🔖 <code>%s</code>\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
|
||||
if AWOOI_CICD_STATUS=failed \
|
||||
@@ -977,7 +1081,7 @@ jobs:
|
||||
AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
|
||||
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
|
||||
AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
|
||||
AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
|
||||
AWOOI_CICD_SUMMARY="${FAILURE_SUMMARY}" \
|
||||
scripts/ci/notify-awoooi-cicd.sh; then
|
||||
echo "✅ CI/CD build failure notification mirrored through AWOOI API"
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user