Files
ewoooc/scripts/auto-repair/auto-rollback.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

210 lines
5.5 KiB
Bash
Executable File

#!/bin/bash
# Automatic rollback script.
# Purpose: detect an unhealthy application (failed health checks, pods not
# ready) and roll the Kubernetes deployment back to its previous revision.
#
# Every setting below may be overridden by exporting an environment variable
# of the same name before running the script; when the variable is unset or
# empty the original hard-coded default is used.
set -e
NAMESPACE="${NAMESPACE:-momo}"                              # Kubernetes namespace of the deployment
DEPLOYMENT="${DEPLOYMENT:-momo-app}"                        # deployment that is monitored / rolled back
HEALTH_URL="${HEALTH_URL:-https://mo.wooo.work/health}"     # endpoint that must answer HTTP 200
ERROR_THRESHOLD="${ERROR_THRESHOLD:-5}"                     # consecutive failed runs before rolling back
TELEGRAM_BOT="${TELEGRAM_BOT:-<TELEGRAM_BOT_TOKEN>}"        # bot token; inject via environment, do not commit it
TELEGRAM_CHAT="${TELEGRAM_CHAT:-5619078117}"                # chat that receives notifications
LOG_FILE="${LOG_FILE:-/var/log/auto_rollback.log}"          # log() appends to this file
STATE_FILE="${STATE_FILE:-/tmp/rollback_state.json}"        # JSON file holding counters between runs
# Print a timestamped message to stdout and append the same line to $LOG_FILE.
# Arguments: $1 - message text
# Returns:   exit status of tee (last command of the pipeline)
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a "$LOG_FILE"
}
# Send an HTML-formatted notification through the Telegram Bot API.
# Globals:   TELEGRAM_BOT (read), TELEGRAM_CHAT (read)
# Arguments: $1 - message text (may contain newlines and Telegram HTML tags)
# Returns:   always 0 - notifications are best-effort, so a Telegram outage
#            must not abort the caller under `set -e`; failures are logged.
send_telegram() {
  local text=$1
  # --data-urlencode: the message contains newlines, '<', '>' and possibly
  #   '&', '+' or '%', which a plain -d would send unencoded and corrupt.
  # -f: treat HTTP >= 400 (e.g. a rejected message) as a failure.
  # -m 10: never hang the rollback run on an unreachable API.
  if ! curl -sS -f -m 10 -X POST "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
      -d "chat_id=${TELEGRAM_CHAT}" \
      -d "parse_mode=HTML" \
      --data-urlencode "text=${text}" > /dev/null; then
    log "Telegram 通知發送失敗"
  fi
  return 0
}
# Read one value from the JSON state file.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   the stored value on stdout; an empty line when the key is missing
#            or the file cannot be parsed; nothing when the file does not exist
# Returns:   0 in all of those cases
get_state() {
  local key=$1
  if [ ! -f "$STATE_FILE" ]; then
    return 0
  fi
  # The path and key are passed as arguments rather than spliced into the
  # Python source, so quotes in either cannot break (or inject into) the code.
  python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    print(json.load(fh).get(sys.argv[2], ""))
' "$STATE_FILE" "$key" 2>/dev/null || echo ""
}
# Store one key/value pair in the JSON state file, keeping the other keys.
# Globals:   STATE_FILE (read; the file is created or rewritten)
# Arguments: $1 - key
#            $2 - value (always stored as a JSON string)
# Returns:   python3's exit status (non-zero if the file cannot be written)
set_state() {
  local key=$1
  local value=$2
  # Key, value and path travel as arguments, so quotes cannot break the
  # Python code or the generated JSON. A missing, empty or corrupt state
  # file is treated as an empty state instead of failing: get_state already
  # tolerates that case, and a failure here would abort every run.
  python3 -c '
import json, sys
path, key, value = sys.argv[1:4]
try:
    with open(path) as fh:
        state = json.load(fh)
except (OSError, ValueError):
    state = {}
if not isinstance(state, dict):
    state = {}
state[key] = value
with open(path, "w") as fh:
    json.dump(state, fh)
' "$STATE_FILE" "$key" "$value"
}
# Health check: request $HEALTH_URL (10 s limit) and inspect the HTTP status.
# Globals: HEALTH_URL (read)
# Returns: 0 when the status code is exactly 200, 1 otherwise
#          (including when curl itself fails)
check_health() {
  local http_code
  # A curl failure is deliberately not fatal; it just yields a non-200 code.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' -m 10 "$HEALTH_URL" 2>/dev/null) || true
  if [ "$http_code" != "200" ]; then
    return 1
  fi
  return 0
}
# Pod status check: compare the deployment's ready replica count with the
# desired replica count.
# Globals: DEPLOYMENT (read), NAMESPACE (read)
# Returns: 0 when ready >= desired, 1 otherwise
check_pod_status() {
  local ready_replicas desired_replicas
  # If kubectl fails, fall back to 0 ready / 1 desired so the check fails.
  ready_replicas=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
  desired_replicas=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")
  # Empty output (field absent) is treated like the fallbacks above.
  if ! [ "${ready_replicas:-0}" -ge "${desired_replicas:-1}" ]; then
    return 1
  fi
  return 0
}
# Estimate the error rate by probing the health endpoint directly.
# Simplified implementation: it does not query Prometheus; it runs
# check_health 5 times, one second apart, and counts the failures.
# Returns: 1 when at least 3 of the 5 probes failed (>= 60% failure rate),
#          0 otherwise
check_error_rate() {
  local samples=5
  local max_failures=3
  local failures=0
  local i
  for ((i = 0; i < samples; i++)); do
    if ! check_health; then
      # Not ((failures++)): that returns status 1 when the old value is 0
      # and would abort the script under `set -e` on the first failed probe.
      failures=$((failures + 1))
    fi
    sleep 1
  done
  if [ "$failures" -ge "$max_failures" ]; then
    return 1
  fi
  return 0
}
# Print the current revision: the first field of the last line of
# `kubectl rollout history` output that starts with a digit.
# Globals: DEPLOYMENT (read), NAMESPACE (read)
# Outputs: the revision number, or nothing when no such line exists
get_current_revision() {
  kubectl rollout history deployment/"$DEPLOYMENT" -n "$NAMESPACE" 2>/dev/null \
    | awk '/^[0-9]+/ { newest = $1; seen = 1 } END { if (seen) print newest }'
}
# Print the previous revision: the first field of the second-to-last line of
# `kubectl rollout history` output that starts with a digit. When only one
# such line exists, that single revision is printed (callers detect this by
# comparing it with the current revision).
# Globals: DEPLOYMENT (read), NAMESPACE (read)
# Outputs: the revision number, or nothing when no such line exists
get_previous_revision() {
  kubectl rollout history deployment/"$DEPLOYMENT" -n "$NAMESPACE" 2>/dev/null \
    | awk '/^[0-9]+/ { older = newest; newest = $1; count++ }
           END {
             if (count >= 2) print older
             else if (count == 1) print newest
           }'
}
# Roll the deployment back to its previous revision.
# Globals: DEPLOYMENT (read), NAMESPACE (read); the state keys last_rollback,
#          rollback_from and rollback_to are written via set_state
# Returns: 0 when `kubectl rollout undo` was issued successfully,
#          1 when there is no earlier revision, when a rollback already
#          happened within the last hour, or when the undo command failed
do_rollback() {
  local current_rev previous_rev last_rollback now elapsed
  current_rev=$(get_current_revision) || current_rev=""
  previous_rev=$(get_previous_revision) || previous_rev=""
  if [ -z "$previous_rev" ] || [ "$previous_rev" = "$current_rev" ]; then
    log "沒有可回滾的版本"
    return 1
  fi

  # Cooldown: never roll back twice within one hour. Checked before the
  # "executing" log line so the log does not claim a rollback that is skipped.
  # A missing or non-numeric timestamp is treated as "never rolled back".
  last_rollback=$(get_state "last_rollback") || last_rollback=""
  now=$(date +%s)
  if [[ "$last_rollback" =~ ^[0-9]+$ ]]; then
    elapsed=$((now - 10#$last_rollback))
    if [ "$elapsed" -lt 3600 ]; then
      log "最近 1 小時內已回滾過,跳過"
      return 1
    fi
  fi

  log "執行回滾: $current_rev -> $previous_rev"

  # This function is called from an `if`, where `set -e` is ignored, so the
  # kubectl results must be checked explicitly.
  if ! kubectl rollout undo deployment/"$DEPLOYMENT" -n "$NAMESPACE"; then
    log "kubectl rollout undo 失敗,未執行回滾"
    return 1
  fi

  # The undo has been issued at this point, so the state is recorded even if
  # the rollout does not finish in time; the caller's verification step then
  # decides whether the service actually recovered.
  set_state "last_rollback" "$now"
  set_state "rollback_from" "$current_rev"
  set_state "rollback_to" "$previous_rev"

  if kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=120s; then
    log "回滾完成"
  else
    log "回滾後 rollout 未在 120 秒內完成"
  fi
  return 0
}
# Verify that a rollback restored the service.
# Waits 30 seconds for the service to settle, then runs the health check
# followed by the pod status check.
# Returns: 0 when both checks pass, 1 otherwise
verify_rollback() {
  sleep 30
  check_health || return 1
  check_pod_status || return 1
  return 0
}
# Main logic for one detection run.
# Reads the persisted consecutive-failure counter, runs the health and pod
# checks, and either resets the counter (healthy) or increments it
# (unhealthy). Once the counter reaches ERROR_THRESHOLD it rolls back,
# verifies the result and sends a Telegram notification.
# Globals: ERROR_THRESHOLD, DEPLOYMENT, NAMESPACE (read)
# Returns: 0 on a completed run
main() {
  log "===== 開始異常檢測 ====="
  local fail_count current_rev
  fail_count=$(get_state "fail_count") || fail_count=""
  # A missing or corrupted counter counts as 0; 10# forces base 10 so a
  # value with leading zeros is not parsed as octal.
  if [[ "$fail_count" =~ ^[0-9]+$ ]]; then
    fail_count=$((10#$fail_count))
  else
    fail_count=0
  fi

  # Check the health status.
  if check_health && check_pod_status; then
    # Service is healthy: reset the counter.
    if [ "$fail_count" -gt 0 ]; then
      log "服務已恢復正常,重置失敗計數"
      set_state "fail_count" "0"
    fi
    log "服務狀態正常"
    return 0
  fi

  # Service is unhealthy.
  # Not ((fail_count++)): that returns status 1 when the old value is 0, and
  # `set -e` would then terminate the script before the counter is saved.
  fail_count=$((fail_count + 1))
  set_state "fail_count" "$fail_count"
  log "檢測到異常,連續失敗次數: $fail_count"

  if [ "$fail_count" -ge "$ERROR_THRESHOLD" ]; then
    log "達到回滾閾值 ($ERROR_THRESHOLD),開始自動回滾"
    # Captured before the rollback so the notification can show old → new.
    current_rev=$(get_current_revision) || current_rev=""
    if do_rollback; then
      # Wait, then verify.
      if verify_rollback; then
        # Reset the counter before notifying so that a notification problem
        # can never leave a stale counter behind.
        set_state "fail_count" "0"
        log "回滾驗證成功"
        # "→" rather than "->": with parse_mode=HTML a bare '>' would need
        # to be escaped as an entity.
        send_telegram "🔄 <b>自動回滾成功</b>
📦 Deployment: <code>$DEPLOYMENT</code>
📊 版本回滾: $current_rev → $(get_current_revision)
🔍 原因: 連續 $ERROR_THRESHOLD 次健康檢查失敗
✅ 服務已恢復正常
⚠️ <b>請檢查最新部署的程式碼問題</b>
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
🏷️ <i>MOMO Pro 自動修復系統</i>" || log "Telegram 通知發送失敗"
      else
        send_telegram "🔴 <b>自動回滾失敗</b>
📦 Deployment: <code>$DEPLOYMENT</code>
❌ 回滾後服務仍然異常
📋 需要人工介入
🔗 SSH: <code>ssh wooo@192.168.0.110</code>
🔗 查看日誌: <code>kubectl logs -f deployment/$DEPLOYMENT -n $NAMESPACE</code>
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')" || log "Telegram 通知發送失敗"
      fi
    fi
  else
    log "異常次數 $fail_count 未達閾值 $ERROR_THRESHOLD"
  fi
  log "===== 異常檢測完成 ====="
}
# Script entry point: invoke main, passing along the command-line arguments.
main "$@"