#!/bin/bash
# GCP automatic rollback script.
# Purpose: detect a failing GCP application and automatically roll the
#          deployment back to its previous revision.
# Runs on: the UAT host, reaching the GCP VM through `gcloud compute ssh`.

set -e

# GCP settings
GCP_PROJECT="astral-gateway-484913-d7"
GCP_ZONE="asia-east1-b"
GCP_VM="momo-pro-gcp"
NAMESPACE="momo"
DEPLOYMENT="momo-app"
HEALTH_URL="https://momo.wooo.work/health"
ERROR_THRESHOLD=5 # consecutive failed checks required before rolling back

# Notification settings
# The bot token can be injected through the environment so the secret does
# not have to live in this file; the literal default is kept for
# backward compatibility.
TELEGRAM_BOT="${TELEGRAM_BOT:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT="5619078117"
LOG_FILE="/var/log/auto_rollback_gcp.log"
# NOTE(review): /tmp is world-writable and may be cleaned on reboot; consider
# a private directory for this persistent state file.
STATE_FILE="/tmp/rollback_state_gcp.json"
#######################################
# Print a timestamped, "[GCP]"-tagged message and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   the formatted line on stdout; a warning on stderr when the log
#            file cannot be written
# Returns:   always 0 - logging is best-effort so that an unwritable log
#            file cannot abort the script under `set -e`
#######################################
log() {
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] [GCP] $1"
  printf '%s\n' "$line" || true
  # The group lets 2>/dev/null also swallow the shell's own redirection error.
  if ! { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null; then
    printf 'log: cannot append to %s\n' "$LOG_FILE" >&2
  fi
  return 0
}
#######################################
# Send an HTML-formatted message to the configured Telegram chat.
# Globals:   TELEGRAM_BOT (read), TELEGRAM_CHAT (read)
# Arguments: $1 - message text (Telegram HTML markup)
# Outputs:   a warning on stderr when curl reports a failure
# Returns:   always 0 - notifications are best-effort, so a network error
#            must not abort the caller under `set -e`
#######################################
send_telegram() {
  local text=$1
  # --data-urlencode keeps '&', '+', '%' and newlines in the message intact;
  # a plain -d would send them unescaped and corrupt the form body.
  # --max-time bounds the call so an unattended run cannot hang here.
  if ! curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
    -d "chat_id=${TELEGRAM_CHAT}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=${text}" >/dev/null; then
    printf 'send_telegram: failed to deliver notification\n' >&2
  fi
  return 0
}
#######################################
# Run a command on the GCP VM through `gcloud compute ssh`.
# Globals:   GCP_VM, GCP_ZONE, GCP_PROJECT (read)
# Arguments: $1 - command line to execute on the VM
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   the exit status of gcloud
#######################################
gcp_exec() {
  local -a ssh_args=(
    compute ssh "$GCP_VM"
    --zone="$GCP_ZONE"
    --project="$GCP_PROJECT"
    --command="$1"
  )
  # stderr is dropped on purpose, as in the original: callers parse stdout.
  gcloud "${ssh_args[@]}" 2>/dev/null
}
#######################################
# Read one value from the JSON state file.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   the stored value on stdout; nothing useful (empty) when the
#            file is missing or unreadable, or the key is absent or null
# Returns:   0
#######################################
get_state() {
  local key=$1
  if [[ ! -f "$STATE_FILE" ]]; then
    return 0
  fi
  # Path and key travel as argv instead of being pasted into the Python
  # source, so quotes or backslashes in them cannot break or inject code.
  python3 -c '
import json
import sys

with open(sys.argv[1]) as fh:
    data = json.load(fh)
value = data.get(sys.argv[2], "")
print("" if value is None else value)
' "$STATE_FILE" "$key" 2>/dev/null || echo ""
}
#######################################
# Store one key/value pair (value saved as a string) in the JSON state file.
# Globals:   STATE_FILE (read; the file is created or rewritten)
# Arguments: $1 - key, $2 - value
# Returns:   0 on success, non-zero if the file cannot be written
#######################################
set_state() {
  local key=$1
  local value=$2
  # Arguments are passed as argv rather than interpolated into the Python
  # source, so quotes in the path, key or value are handled safely.
  python3 -c '
import json
import os
import sys
import tempfile

path, key, value = sys.argv[1:4]

# A missing, empty or corrupt state file is treated as "no state yet";
# otherwise one truncated write would make every later run fail here.
try:
    with open(path) as fh:
        state = json.load(fh)
except (OSError, ValueError):
    state = {}
if not isinstance(state, dict):
    state = {}
state[key] = value

# Keep the permissions an existing file has, or what a plain shell
# redirection would have created.
try:
    mode = os.stat(path).st_mode & 0o777
except OSError:
    mask = os.umask(0)
    os.umask(mask)
    mode = 0o666 & ~mask

# Write to a temporary file in the same directory and rename it into place
# so an interrupted run cannot leave a half-written state file behind.
fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
try:
    with os.fdopen(fd, "w") as fh:
        json.dump(state, fh)
    os.chmod(tmp_path, mode)
    os.replace(tmp_path, path)
except BaseException:
    if os.path.exists(tmp_path):
        os.unlink(tmp_path)
    raise
' "$STATE_FILE" "$key" "$value"
}
#######################################
# Health check: request HEALTH_URL and test for HTTP 200.
# Globals:   HEALTH_URL (read)
# Returns:   0 when the endpoint answers 200 within 10 seconds, 1 otherwise
#######################################
check_health() {
  local status
  # A curl failure (timeout, DNS, refused connection) is simply "unhealthy";
  # `|| true` keeps it from tripping `set -e` before the status is examined.
  status=$(curl -s -o /dev/null -w '%{http_code}' -m 10 "$HEALTH_URL" 2>/dev/null) || true
  [[ "$status" == "200" ]]
}
#######################################
# Check that the deployment has at least as many ready replicas as desired.
# Globals:   DEPLOYMENT, NAMESPACE (read)
# Returns:   0 when readyReplicas >= spec.replicas, 1 otherwise (including
#            when the remote query fails or returns unparseable output)
#######################################
check_pod_status() {
  local replicas ready desired

  # One SSH round trip fetches both numbers as "<ready>/<desired>".
  # A failed query falls back to 0 ready of 1 desired, i.e. "not ready".
  replicas=$(gcp_exec "sudo kubectl get deployment $DEPLOYMENT -n $NAMESPACE -o jsonpath='{.status.readyReplicas}/{.spec.replicas}'" 2>/dev/null) || replicas="0/1"
  if [[ "$replicas" != */* ]]; then
    return 1
  fi

  # readyReplicas is empty when no pod is ready, so default it to 0.
  ready=${replicas%%/*}
  ready=${ready:-0}
  desired=${replicas#*/}
  desired=${desired:-1}

  # Reject anything non-numeric before the integer comparison.
  if [[ ! "$ready" =~ ^[0-9]+$ || ! "$desired" =~ ^[0-9]+$ ]]; then
    return 1
  fi

  if [ "$ready" -ge "$desired" ]; then
    return 0
  fi
  return 1
}
#######################################
# Print the current revision: the first column of the last numeric row of
# `kubectl rollout history`.
# Globals:   DEPLOYMENT, NAMESPACE (read)
# Outputs:   the revision number on stdout, or nothing when no numeric row
#            is found (for example when the remote call fails)
#######################################
get_current_revision() {
  gcp_exec "sudo kubectl rollout history deployment/$DEPLOYMENT -n $NAMESPACE" 2>/dev/null |
    awk '/^[0-9]+/ { rev = $1 } END { if (rev != "") print rev }'
}
#######################################
# Print the revision before the current one: the first column of the
# second-to-last numeric row of `kubectl rollout history`.
# Globals:   DEPLOYMENT, NAMESPACE (read)
# Outputs:   the revision number on stdout. With a single revision that one
#            revision is printed (so it equals the current revision); with
#            none, nothing is printed.
#######################################
get_previous_revision() {
  gcp_exec "sudo kubectl rollout history deployment/$DEPLOYMENT -n $NAMESPACE" 2>/dev/null |
    awk '
      /^[0-9]+/ { prev = cur; cur = $1; n++ }
      END {
        if (n >= 2) print prev
        else if (n == 1) print cur
      }'
}
#######################################
# Roll the deployment back with `kubectl rollout undo`.
# Globals:   DEPLOYMENT, NAMESPACE, STATE_FILE (read)
# State:     writes last_rollback, rollback_from and rollback_to
# Returns:   0 when the undo command was issued successfully;
#            1 when a rollback happened within the last hour, when there is
#            no earlier revision, or when the undo command itself failed
#######################################
do_rollback() {
  local current_rev previous_rev last_rollback now elapsed

  # Cooldown first: it needs no SSH round trip and avoids logging a rollback
  # that is then skipped. Never roll back twice within one hour.
  last_rollback=$(get_state "last_rollback") || last_rollback=""
  now=$(date +%s)
  # Only a plain integer timestamp is trusted; anything else is treated as
  # "never rolled back" instead of breaking the arithmetic below.
  if [[ "$last_rollback" =~ ^[0-9]+$ ]]; then
    elapsed=$((now - last_rollback))
    if [ "$elapsed" -lt 3600 ]; then
      log "最近 1 小時內已回滾過,跳過"
      return 1
    fi
  fi

  current_rev=$(get_current_revision) || current_rev=""
  previous_rev=$(get_previous_revision) || previous_rev=""

  if [[ -z "$previous_rev" || "$previous_rev" == "$current_rev" ]]; then
    log "沒有可回滾的版本"
    return 1
  fi

  log "執行 GCP 回滾: $current_rev -> $previous_rev"

  # This function is called as `if do_rollback`, where `set -e` is ignored,
  # so the undo command's status has to be checked explicitly.
  if ! gcp_exec "sudo kubectl rollout undo deployment/$DEPLOYMENT -n $NAMESPACE"; then
    log "GCP 回滾指令執行失敗"
    return 1
  fi

  # A rollout that is slow to settle is not fatal here: the rollback has been
  # issued, and the caller's verification decides whether the service is up.
  if ! gcp_exec "sudo kubectl rollout status deployment/$DEPLOYMENT -n $NAMESPACE --timeout=120s"; then
    log "GCP 回滾後 rollout 未在 120 秒內完成"
  fi

  if ! {
    set_state "last_rollback" "$now" &&
      set_state "rollback_from" "$current_rev" &&
      set_state "rollback_to" "$previous_rev"
  }; then
    log "無法寫入回滾狀態檔 $STATE_FILE"
  fi

  log "GCP 回滾完成"
  return 0
}
#######################################
# Verify that the service is healthy after a rollback.
# Arguments: $1 - optional settle time in seconds before checking
#                 (default: 30)
# Returns:   0 when both the health check and the pod status check pass,
#            1 otherwise
#######################################
verify_rollback() {
  local settle_seconds=${1:-30}

  # Give the service time to stabilise before probing it.
  sleep "$settle_seconds"

  if check_health && check_pod_status; then
    return 0
  fi
  return 1
}
#######################################
# Send the "automatic rollback succeeded" Telegram notification.
# Globals:   DEPLOYMENT, ERROR_THRESHOLD (read)
# Arguments: $1 - revision before the rollback
#            $2 - revision after the rollback
#######################################
_notify_rollback_success() {
  local from_rev=$1
  local to_rev=$2
  local message
  message=$(printf '%s\n' \
    "🔄 <b>【GCP】自動回滾成功</b>" \
    "" \
    "🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)" \
    "📦 Deployment: <code>$DEPLOYMENT</code>" \
    "📊 版本回滾: $from_rev → $to_rev" \
    "🔍 原因: 連續 $ERROR_THRESHOLD 次健康檢查失敗" \
    "" \
    "✅ 服務已恢復正常" \
    "" \
    "⚠️ <b>請檢查最新部署的程式碼問題</b>" \
    "" \
    "⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')" \
    "🏷️ <i>MOMO Pro 自動修復系統</i>")
  # A notification problem must not abort the run before state is updated.
  send_telegram "$message" || log "Telegram 通知發送失敗"
}

#######################################
# Send the "automatic rollback failed" Telegram notification.
# Globals:   DEPLOYMENT, GCP_VM, GCP_ZONE (read)
#######################################
_notify_rollback_failure() {
  local message
  message=$(printf '%s\n' \
    "🔴 <b>【GCP】自動回滾失敗</b>" \
    "" \
    "🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)" \
    "📦 Deployment: <code>$DEPLOYMENT</code>" \
    "❌ 回滾後服務仍然異常" \
    "📋 需要人工介入" \
    "" \
    "🔗 SSH: <code>gcloud compute ssh $GCP_VM --zone=$GCP_ZONE</code>" \
    "" \
    "⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')")
  send_telegram "$message" || log "Telegram 通知發送失敗"
}

#######################################
# Main logic: run one detection pass.
# Checks the health endpoint and pod status, keeps a persistent count of
# consecutive failures, and once the count reaches ERROR_THRESHOLD performs
# a rollback, verifies it and reports the outcome over Telegram.
# Globals:   ERROR_THRESHOLD (read)
# State:     reads and writes fail_count
# Returns:   0
#######################################
main() {
  log "===== 開始 GCP 異常檢測 ====="

  local fail_count current_rev new_rev
  fail_count=$(get_state "fail_count") || fail_count=""
  # A missing or corrupt counter is treated as zero.
  [[ "$fail_count" =~ ^[0-9]+$ ]] || fail_count=0

  # Health check
  if check_health && check_pod_status; then
    # Service is healthy: reset the counter.
    if [ "$fail_count" -gt 0 ]; then
      log "GCP 服務已恢復正常,重置失敗計數"
      set_state "fail_count" "0"
    fi
    log "GCP 服務狀態正常"
    return 0
  fi

  # Service is unhealthy.
  # Do not use ((fail_count++)): it returns status 1 when the old value is
  # 0, which under `set -e` aborts the script before the count is saved.
  fail_count=$((10#$fail_count + 1))
  set_state "fail_count" "$fail_count"
  log "檢測到 GCP 異常,連續失敗次數: $fail_count"

  if [ "$fail_count" -ge "$ERROR_THRESHOLD" ]; then
    log "達到回滾閾值 ($ERROR_THRESHOLD),開始 GCP 自動回滾"

    current_rev=$(get_current_revision) || current_rev=""

    if do_rollback; then
      # Wait, then verify.
      if verify_rollback; then
        new_rev=$(get_current_revision) || new_rev=""
        _notify_rollback_success "$current_rev" "$new_rev"
        set_state "fail_count" "0"
        log "GCP 回滾驗證成功"
      else
        _notify_rollback_failure
      fi
    fi
  else
    log "GCP 異常次數 $fail_count 未達閾值 $ERROR_THRESHOLD"
  fi

  log "===== GCP 異常檢測完成 ====="
}
# Script entry point: run one detection pass with the script's arguments.
main "$@"