diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index 08e96e94..14aa99db 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -891,11 +891,77 @@ jobs:
}
# ─── Step 4: 等待 ArgoCD sync + rollout ───
+ ROLLOUT_LOG="$(mktemp)"  # local copy of the remote script's output (filled by tee), parsed after the ssh pipeline returns
+ set +e  # let the ssh pipeline fail without aborting this step; its status is read from PIPESTATUS afterwards
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
- "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
+ "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" 2>&1 << 'ARGOCD_WAIT' | tee "$ROLLOUT_LOG"
set -e
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
+ RISK_FILE="$(mktemp)"  # scratch file collecting recorded risk messages; deleted by emit_rollout_evidence
+ UNKNOWN_STATUS_COUNT=0  # wait-loop polls in which sync, health or revision was "Unknown"
+ HEALTH_FAILURE_COUNT=0  # public health probes whose result was anything other than HTTP 200
+
+ record_rollout_risk() {  # Append one risk message to $RISK_FILE and echo a warning to stderr.
+ local message="$1"  # $1: risk description; written to $RISK_FILE followed by a newline
+ printf '%s\n' "$message" >> "$RISK_FILE"  # a file is used so entries written from $(...) subshells (see app_field callers) are not lost
+ echo "⚠️ Rollout risk observed: $message" >&2  # stderr keeps this out of values captured with $(...)
+ }
+
+ emit_rollout_evidence() {  # EXIT-trap handler: print the AWOOOI_ROLLOUT_* marker lines on stdout, then delete $RISK_FILE.
+   local summary kubectl_count
+   if [ -s "$RISK_FILE" ]; then  # a non-empty file means at least one risk was recorded
+     kubectl_count=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)  # grep -c still prints 0 but exits 1 when nothing matches
+     # Join the recorded lines with "; ". tr maps one character to one character, so the former tr '\n' '; ' could only emit a bare ';' (plus a trailing one).
+     summary=$(sed 's/[[:cntrl:]]//g' "$RISK_FILE" | awk 'NR > 1 { printf "; " } { printf "%s", $0 } END { print "" }' | cut -c1-700)
+     echo "AWOOOI_ROLLOUT_RISK=1"
+     echo "AWOOOI_ROLLOUT_SUMMARY=unknown_status_count=${UNKNOWN_STATUS_COUNT}; health_failure_count=${HEALTH_FAILURE_COUNT}; kubectl_failure_count=${kubectl_count}; ${summary}"
+   else
+     echo "AWOOOI_ROLLOUT_RISK=0"
+   fi
+   rm -f "$RISK_FILE"
+ }
+ trap emit_rollout_evidence EXIT  # emit the markers on every exit path of this remote script, including the exit 1 failure paths
+
+ app_field() {  # Print one jsonpath field of the awoooi-prod ArgoCD Application on stdout, or "Unknown" when the kubectl query fails; always returns 0.
+   local jsonpath="$1"  # $1: jsonpath expression passed to kubectl -o jsonpath=
+   local label="$2"  # $2: short tag used in the argocd_<label>_query_failed risk entry
+   local output err_file kubectl_seen
+   local status=0
+   err_file=$(mktemp)
+   # Keep stderr out of the captured value: with 2>&1 any warning printed by a successful sudo/kubectl call was glued onto the field and broke the Synced/Healthy comparisons.
+   output=$($KUBECTL get application awoooi-prod -n argocd -o jsonpath="$jsonpath" 2>"$err_file") || status=$?
+   if [ "$status" -ne 0 ]; then
+     kubectl_seen=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
+     if [ "${kubectl_seen:-0}" -lt 3 ]; then  # record at most three query failures; default to 0 if the count could not be read
+       record_rollout_risk "argocd_${label}_query_failed=$(tr '\n' ' ' < "$err_file" | head -c 180)"  # flatten to one line, cap at 180 bytes
+     fi
+     rm -f "$err_file"
+     printf 'Unknown'
+     return 0
+   fi
+   rm -f "$err_file"
+   printf '%s' "$output"
+ }
+
+ probe_public_health() {  # Probe the public API health URL once; count and record any result other than HTTP 200.
+ local phase="$1"  # $1: label embedded in the risk entry (public_health_<phase>_http=...)
+ local http_code
+ local status
+ set +e  # a failing curl must not abort the script
+ http_code=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 3 --max-time 5 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
+ status=$?
+ set -e
+ if [ "$status" -ne 0 ]; then
+ http_code="curl_error_${status}"  # report the curl exit status in place of an HTTP code
+ fi
+ if [ "$http_code" != "200" ]; then
+ HEALTH_FAILURE_COUNT=$((HEALTH_FAILURE_COUNT + 1))  # counts every failure, including those not recorded below
+ if [ "$HEALTH_FAILURE_COUNT" -le 3 ]; then  # only the first three failures are written to the risk file
+ record_rollout_risk "public_health_${phase}_http=${http_code}"
+ fi
+ fi
+ }
# 等待 ArgoCD Application Synced(最多 180s)。只看
# Synced/Healthy 可能誤判成上一個 revision 已同步,因此有
@@ -904,15 +970,19 @@ jobs:
$KUBECTL annotate application awoooi-prod -n argocd \
argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
for i in $(seq 1 36); do
- SYNC=$($KUBECTL get application awoooi-prod -n argocd \
- -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown")
- HEALTH=$($KUBECTL get application awoooi-prod -n argocd \
- -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown")
- REVISION=$($KUBECTL get application awoooi-prod -n argocd \
- -o jsonpath='{.status.sync.revision}' 2>/dev/null || echo "Unknown")
+ SYNC=$(app_field '{.status.sync.status}' sync)
+ HEALTH=$(app_field '{.status.health.status}' health)
+ REVISION=$(app_field '{.status.sync.revision}' revision)
SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
+ probe_public_health "argocd_wait"  # sample the public health endpoint on every poll of this wait loop
+ if [ "$SYNC" = "Unknown" ] || [ "$HEALTH" = "Unknown" ] || [ "$REVISION" = "Unknown" ]; then  # app_field prints "Unknown" when its kubectl query fails
+ UNKNOWN_STATUS_COUNT=$((UNKNOWN_STATUS_COUNT + 1))  # counts every such poll, including those not recorded below
+ if [ "$UNKNOWN_STATUS_COUNT" -le 3 ]; then  # record at most three of these entries
+ record_rollout_risk "argocd_status_unknown sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
+ fi
+ fi
if [ "$SYNC" = "Synced" ] && [ "$HEALTH" = "Healthy" ]; then
if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then
echo "✅ ArgoCD Synced + Healthy"
@@ -945,10 +1015,43 @@ jobs:
sleep 10
done
if [ "$HEALTH_PASS" = "0" ]; then
+ record_rollout_risk "public_health_final_failed"
echo "❌ API 健康檢查失敗"
exit 1
fi
ARGOCD_WAIT
+ ROLLOUT_EXIT=${PIPESTATUS[0]}  # exit status of ssh (first pipeline stage), not of tee
+ set -e
+
+ ROLLOUT_RISK="0"
+ ROLLOUT_SUMMARY=""
+ if grep -q '^AWOOOI_ROLLOUT_RISK=1$' "$ROLLOUT_LOG"; then  # marker line printed by the remote script's EXIT trap
+ ROLLOUT_RISK="1"
+ ROLLOUT_SUMMARY=$(grep '^AWOOOI_ROLLOUT_SUMMARY=' "$ROLLOUT_LOG" | tail -1 | sed 's/^AWOOOI_ROLLOUT_SUMMARY=//' | cut -c1-700)  # last summary line, prefix stripped, capped at 700 characters
+ fi
+ if [ -n "${GITHUB_ENV:-}" ]; then  # append to the runner env file when one is provided; note the AWOOI_ (two O) prefix here vs. the AWOOOI_ log markers
+ {
+ echo "AWOOI_ROLLOUT_RISK=${ROLLOUT_RISK}"
+ echo "AWOOI_ROLLOUT_SUMMARY=${ROLLOUT_SUMMARY}"
+ } >> "$GITHUB_ENV"
+ fi
+ rm -f "$ROLLOUT_LOG"
+
+ if [ "$ROLLOUT_EXIT" -eq 0 ] && [ "$ROLLOUT_RISK" = "1" ]; then  # rollout succeeded but risks were observed along the way
+ ACTOR="${GITHUB_ACTOR:-${{ github.actor }}}"
+ if AWOOI_CICD_STATUS=pending \
+ AWOOI_CICD_STAGE=rollout-risk \
+ AWOOI_CICD_JOB_NAME="AWOOOI 部署風險已恢復" \
+ AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
+ AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
+ AWOOI_CICD_SUMMARY="${ROLLOUT_SUMMARY}" \
+ scripts/ci/notify-awoooi-cicd.sh; then
+ echo "✅ CI/CD rollout risk notification mirrored through AWOOI API"
+ else
+ echo "⚠️ CI/CD rollout risk notification failed (non-fatal)"  # a failed notification does not change the step result
+ fi
+ fi
+ exit "$ROLLOUT_EXIT"  # the step result is the remote script's result
# 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
# 188 deploy key is rotated and must not be read by this disabled step.
@@ -970,6 +1073,7 @@ jobs:
COMMIT_MSG="${{ steps.commit.outputs.message }}"
SHORT_SHA="${{ steps.commit.outputs.short_sha }}"
ACTOR="${{ github.actor }}"
+ FAILURE_SUMMARY="${AWOOI_ROLLOUT_SUMMARY:-${COMMIT_MSG}}"
COMMIT_ESC=$(echo "$COMMIT_MSG" | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g')  # escape &, <, > as HTML entities; & goes first so the entities added afterwards are not re-escaped
MSG=$(printf '❌ AWOOOI 部署失敗\n├ 📝 %s\n├ 🔖 %s\n├ 👤 %s\n├ 🏗️ Stage: build-and-deploy\n└ 🔗 http://192.168.0.110:3001/wooo/awoooi/actions' "${COMMIT_ESC}" "${SHORT_SHA}" "${ACTOR}")
if AWOOI_CICD_STATUS=failed \
@@ -977,7 +1081,7 @@ jobs:
AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
- AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
+ AWOOI_CICD_SUMMARY="${FAILURE_SUMMARY}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "✅ CI/CD build failure notification mirrored through AWOOI API"
else