Files
ewoooc/scripts/auto-repair/auto-rollback-gcp.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

209 lines
5.7 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# GCP auto-rollback script.
# Purpose: detect anomalies in the GCP-hosted application and automatically
# roll back the deployment to its previous version.
# Runs on the UAT host and reaches the GCP VM through gcloud SSH.
set -e
# GCP settings: project/zone/VM used for gcloud SSH, plus the Kubernetes
# namespace and deployment that are monitored and rolled back.
GCP_PROJECT="astral-gateway-484913-d7"
GCP_ZONE="asia-east1-b"
GCP_VM="momo-pro-gcp"
NAMESPACE="momo"
DEPLOYMENT="momo-app"
HEALTH_URL="https://momo.wooo.work/health"
ERROR_THRESHOLD=5 # number of consecutive failures that triggers a rollback
# Notification settings (Telegram bot token and target chat id).
TELEGRAM_BOT="<TELEGRAM_BOT_TOKEN>"
TELEGRAM_CHAT="5619078117"
# Log file appended to by log(); state file holds the JSON counters/timestamps
# read and written by get_state()/set_state().
LOG_FILE="/var/log/auto_rollback_gcp.log"
STATE_FILE="/tmp/rollback_state_gcp.json"
# Write one timestamped, "[GCP]"-tagged line to stdout and append the same
# line to $LOG_FILE.
# Arguments: $1 - message text
log() {
  printf '[%s] [GCP] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a "$LOG_FILE"
}
# Send an HTML-formatted message to the configured Telegram chat.
# Globals:   TELEGRAM_BOT (read), TELEGRAM_CHAT (read)
# Arguments: $1 - message text (Telegram HTML markup allowed)
# Returns:   curl's exit status
send_telegram() {
  # --data-urlencode keeps '&', '+', '%' and newlines in the message intact;
  # plain -d would send them raw and corrupt the form-encoded body.
  # --max-time bounds the call so a stalled connection cannot hang the run.
  curl -s --max-time 10 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=$1" > /dev/null
}
# Run a shell command on the GCP VM through `gcloud compute ssh`.
# Globals:   GCP_VM, GCP_ZONE, GCP_PROJECT (read)
# Arguments: $1 - command line to execute remotely
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   gcloud's exit status
gcp_exec() {
  local -a ssh_args=(
    compute ssh "$GCP_VM"
    --zone="$GCP_ZONE"
    --project="$GCP_PROJECT"
    --command="$1"
  )
  gcloud "${ssh_args[@]}" 2>/dev/null
}
# Read one value from the JSON state file.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   the stored value on stdout; an empty line when the key is
#            missing or the file cannot be parsed; nothing when the state
#            file does not exist
# Returns:   0
get_state() {
  local key=$1
  [ -f "$STATE_FILE" ] || return 0
  # Path and key travel through argv instead of being spliced into the
  # Python source, so quotes or backslashes in either cannot break (or
  # inject into) the interpreter code.
  python3 -c '
import json
import sys

with open(sys.argv[1]) as fh:
    print(json.load(fh).get(sys.argv[2], ""))
' "$STATE_FILE" "$key" 2>/dev/null || echo ""
}
# Store one key/value pair (value kept as a string) in the JSON state file,
# creating the file when it does not exist yet.
# Globals:   STATE_FILE (read; the file itself is rewritten)
# Arguments: $1 - key
#            $2 - value
# Returns:   python3's exit status
set_state() {
  local key=$1
  local value=$2
  # Key, value and path are passed through argv and serialised by the json
  # module, so quotes/backslashes cannot break the Python source or the JSON.
  # An unreadable, empty or corrupt state file is replaced by a fresh object
  # instead of failing, which under `set -e` would abort every later run.
  python3 - "$STATE_FILE" "$key" "$value" <<'PY'
import json
import sys

path, key, value = sys.argv[1:4]
try:
    with open(path) as fh:
        state = json.load(fh)
except (OSError, ValueError):
    state = {}
if not isinstance(state, dict):
    state = {}
state[key] = value
with open(path, "w") as fh:
    json.dump(state, fh)
PY
}
# Probe the public health endpoint.
# Globals: HEALTH_URL (read)
# Returns: 0 when the endpoint answers HTTP 200 within 10 seconds, else 1
check_health() {
  local http_code
  # curl's own exit status is deliberately ignored; only the reported
  # HTTP status code decides the outcome.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' -m 10 "$HEALTH_URL" 2>/dev/null) || true
  if [ "$http_code" != "200" ]; then
    return 1
  fi
  return 0
}
# Compare the deployment's ready replica count with its desired count,
# querying kubectl on the GCP VM.
# Globals: DEPLOYMENT, NAMESPACE (read)
# Returns: 0 when ready >= desired, else 1. A failed remote query counts as
#          0 ready / 1 desired, i.e. unhealthy.
check_pod_status() {
  local kubectl_get="sudo kubectl get deployment $DEPLOYMENT -n $NAMESPACE -o"
  local ready_count wanted_count
  ready_count=$(gcp_exec "$kubectl_get jsonpath='{.status.readyReplicas}'" 2>/dev/null || echo "0")
  wanted_count=$(gcp_exec "$kubectl_get jsonpath='{.spec.replicas}'" 2>/dev/null || echo "1")
  # Empty output falls back to the same defaults as a failed query.
  if ! [ "${ready_count:-0}" -ge "${wanted_count:-1}" ]; then
    return 1
  fi
  return 0
}
# Print the current revision number: the first field of the last
# digit-led row in the `kubectl rollout history` output.
# Globals: DEPLOYMENT, NAMESPACE (read)
# Outputs: the revision number, or nothing when no such row exists
get_current_revision() {
  gcp_exec "sudo kubectl rollout history deployment/$DEPLOYMENT -n $NAMESPACE" 2>/dev/null \
    | awk '/^[0-9]/ { newest = $1; seen = 1 } END { if (seen) print newest }'
}
# Print the previous revision number: the first field of the
# second-to-last digit-led row in the `kubectl rollout history` output.
# When only one such row exists, that row's revision is printed.
# Globals: DEPLOYMENT, NAMESPACE (read)
# Outputs: the revision number, or nothing when no such row exists
get_previous_revision() {
  gcp_exec "sudo kubectl rollout history deployment/$DEPLOYMENT -n $NAMESPACE" 2>/dev/null \
    | awk '
        /^[0-9]/ { older = newest; newest = $1; rows++ }
        END {
          if (rows >= 2) print older
          else if (rows == 1) print newest
        }'
}
# Roll the deployment back one revision with `kubectl rollout undo`.
# Globals:   DEPLOYMENT, NAMESPACE (read); state keys last_rollback,
#            rollback_from and rollback_to (written via set_state)
# Returns:   0 when the undo command was issued successfully;
#            1 when there is no earlier revision, a rollback already ran
#            within the last hour, or the undo command itself failed
do_rollback() {
  local current_rev previous_rev
  current_rev=$(get_current_revision) || current_rev=""
  previous_rev=$(get_previous_revision) || previous_rev=""

  if [[ -z "$previous_rev" || "$previous_rev" == "$current_rev" ]]; then
    log "沒有可回滾的版本"
    return 1
  fi
  log "執行 GCP 回滾: $current_rev -> $previous_rev"

  # Cooldown: never roll back twice within one hour. A missing or
  # non-numeric timestamp is treated as "no previous rollback" rather than
  # being fed into arithmetic.
  local last_rollback now
  last_rollback=$(get_state "last_rollback") || last_rollback=""
  now=$(date +%s)
  if [[ "$last_rollback" =~ ^[0-9]+$ ]]; then
    if (( now - 10#$last_rollback < 3600 )); then
      log "最近 1 小時內已回滾過,跳過"
      return 1
    fi
  fi

  # This function is called from an `if` condition, where `set -e` is
  # suppressed, so the remote commands must be checked explicitly. Without
  # the check a failed undo would still be recorded and reported as done.
  if ! gcp_exec "sudo kubectl rollout undo deployment/$DEPLOYMENT -n $NAMESPACE"; then
    log "GCP 回滾指令執行失敗"
    return 1
  fi
  # A slow rollout is only logged: the undo has already been issued, so the
  # cooldown must still be recorded and the caller's health verification
  # decides whether the rollback actually worked.
  if ! gcp_exec "sudo kubectl rollout status deployment/$DEPLOYMENT -n $NAMESPACE --timeout=120s"; then
    log "GCP 回滾後 rollout 未在 120 秒內完成"
  fi

  set_state "last_rollback" "$now"
  set_state "rollback_from" "$current_rev"
  set_state "rollback_to" "$previous_rev"
  log "GCP 回滾完成"
  return 0
}
# Confirm the service recovered after a rollback.
# Returns: 0 when both the health endpoint and the pod status check pass
#          after a 30-second settling delay, else 1
verify_rollback() {
  sleep 30 # give the service time to stabilise
  if ! { check_health && check_pod_status; }; then
    return 1
  fi
  return 0
}
# Main flow: run one detection cycle.
#   1. Probe the health endpoint and the pod status.
#   2. Healthy: reset the stored consecutive-failure counter and return.
#   3. Unhealthy: increment and persist the counter.
#   4. Once the counter reaches ERROR_THRESHOLD: roll back, verify, and
#      report the outcome through Telegram.
# Globals: ERROR_THRESHOLD, DEPLOYMENT, GCP_VM, GCP_ZONE (read);
#          state key fail_count (read and written)
# Returns: 0
main() {
  log "===== 開始 GCP 異常檢測 ====="

  # A missing, empty or corrupted counter is treated as zero so the numeric
  # comparisons below cannot fail.
  local fail_count
  fail_count=$(get_state "fail_count") || fail_count=""
  if ! [[ "$fail_count" =~ ^[0-9]+$ ]]; then
    fail_count=0
  fi

  if check_health && check_pod_status; then
    if (( 10#$fail_count > 0 )); then
      log "GCP 服務已恢復正常,重置失敗計數"
      set_state "fail_count" "0"
    fi
    log "GCP 服務狀態正常"
    return 0
  fi

  # Not `((fail_count++))`: that expression evaluates to 0 on the first
  # failure, returns status 1, and `set -e` would terminate the script
  # before the counter is ever persisted.
  fail_count=$(( 10#$fail_count + 1 ))
  set_state "fail_count" "$fail_count"
  log "檢測到 GCP 異常,連續失敗次數: $fail_count"

  if (( fail_count >= ERROR_THRESHOLD )); then
    log "達到回滾閾值 ($ERROR_THRESHOLD),開始 GCP 自動回滾"
    local current_rev
    current_rev=$(get_current_revision) || current_rev=""
    if do_rollback; then
      if verify_rollback; then
        # A failed notification is logged, not fatal: the counter reset
        # below must still happen.
        send_telegram "🔄 <b>【GCP】自動回滾成功</b>
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
📦 Deployment: <code>$DEPLOYMENT</code>
📊 版本回滾: $current_rev → $(get_current_revision)
🔍 原因: 連續 $ERROR_THRESHOLD 次健康檢查失敗
✅ 服務已恢復正常
⚠️ <b>請檢查最新部署的程式碼問題</b>
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
🏷️ <i>MOMO Pro 自動修復系統</i>" || log "Telegram 通知發送失敗"
        set_state "fail_count" "0"
        log "GCP 回滾驗證成功"
      else
        send_telegram "🔴 <b>【GCP】自動回滾失敗</b>
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
📦 Deployment: <code>$DEPLOYMENT</code>
❌ 回滾後服務仍然異常
📋 需要人工介入
🔗 SSH: <code>gcloud compute ssh $GCP_VM --zone=$GCP_ZONE</code>
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')" || log "Telegram 通知發送失敗"
      fi
    fi
  else
    log "GCP 異常次數 $fail_count 未達閾值 $ERROR_THRESHOLD"
  fi
  log "===== GCP 異常檢測完成 ====="
}
# Script entry point: run one detection cycle, forwarding all command-line
# arguments to main.
main "$@"