210 lines
5.5 KiB
Bash
Executable File
210 lines
5.5 KiB
Bash
Executable File
#!/bin/bash
# Automatic rollback script.
# Purpose: detect application anomalies (high error rate, failed health
# checks) and automatically roll back to the previous revision.
#
# Every setting below can be overridden by exporting an environment variable
# of the same name before running the script; the defaults are the values the
# script has always used.

set -e

NAMESPACE="${NAMESPACE:-momo}"
DEPLOYMENT="${DEPLOYMENT:-momo-app}"
HEALTH_URL="${HEALTH_URL:-https://mo.wooo.work/health}"
ERROR_THRESHOLD="${ERROR_THRESHOLD:-5}" # consecutive-failure threshold
# Prefer supplying the bot token through the environment instead of editing
# it into this file.
TELEGRAM_BOT="${TELEGRAM_BOT:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT="${TELEGRAM_CHAT:-5619078117}"
LOG_FILE="${LOG_FILE:-/var/log/auto_rollback.log}"
STATE_FILE="${STATE_FILE:-/tmp/rollback_state.json}"

# Configuration is fixed once resolved; nothing below may reassign it.
readonly NAMESPACE DEPLOYMENT HEALTH_URL ERROR_THRESHOLD
readonly TELEGRAM_BOT TELEGRAM_CHAT LOG_FILE STATE_FILE
|
|
|
|
# Print a timestamped message to stdout and append it to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] <message>" on stdout
# Returns:   always 0. Logging is best-effort: when the log file cannot be
#            appended to, the line is still printed and a warning goes to
#            stderr, so that an unwritable log never aborts the script
#            under `set -e`.
log() {
  local entry
  entry="[$(date '+%Y-%m-%d %H:%M:%S')] ${1-}"
  # tee still copies the line to stdout when it cannot open the file; only
  # its exit status reports the problem.
  if ! printf '%s\n' "$entry" | tee -a "$LOG_FILE" 2>/dev/null; then
    printf 'log: could not append to %s\n' "$LOG_FILE" >&2
  fi
  return 0
}
|
|
|
|
# Send an HTML-formatted message to the configured Telegram chat.
# Globals:   TELEGRAM_BOT, TELEGRAM_CHAT (read)
# Arguments: $1 - message text (Telegram HTML markup allowed)
# Returns:   always 0. Delivery is best-effort: a failed request is reported
#            on stderr instead of aborting the caller under `set -e`.
send_telegram() {
  local text=${1-}
  # --data-urlencode keeps '&', '+', '%' and newlines in the message intact;
  # plain -d would send them unencoded and corrupt the form body.
  # -m bounds the request so a stalled connection cannot hang the script.
  if ! curl -s -m 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT}" \
    --data-urlencode "parse_mode=HTML" \
    --data-urlencode "text=${text}" >/dev/null; then
    printf 'send_telegram: notification could not be delivered\n' >&2
  fi
  return 0
}
|
|
|
|
# Read / update persisted state.

# Print the value stored under a key in the JSON state file.
# Globals:   STATE_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   the stored value; an empty line when the key is absent or the
#            file cannot be parsed; nothing when the file does not exist
# Returns:   0
get_state() {
  local key=$1
  if [ -f "$STATE_FILE" ]; then
    # The path and key travel as argv rather than being spliced into the
    # Python source, so quotes or backslashes in either cannot break (or
    # inject into) the program text.
    python3 -c 'import json, sys; print(json.load(open(sys.argv[1])).get(sys.argv[2], ""))' \
      "$STATE_FILE" "$key" 2>/dev/null || echo ""
  fi
}
|
|
|
|
# Store a key/value pair in the JSON state file, creating the file if needed.
# Globals:   STATE_FILE (read; the file it names is rewritten)
# Arguments: $1 - key
#            $2 - value (always stored as a JSON string)
# Returns:   python3's exit status (non-zero when the file cannot be written)
set_state() {
  local key=$1
  local value=$2
  # Path, key and value are passed as argv so that quotes or backslashes in
  # them are stored verbatim instead of corrupting Python source or JSON.
  # A missing, unreadable or malformed state file is treated as empty state
  # rather than aborting the script.
  python3 -c '
import json, sys

path, key, value = sys.argv[1:4]
try:
    with open(path) as fh:
        state = json.load(fh)
    if not isinstance(state, dict):
        state = {}
except OSError:
    state = {}
except ValueError:
    sys.stderr.write("set_state: ignoring malformed state file %s\n" % path)
    state = {}
state[key] = value
with open(path, "w") as fh:
    json.dump(state, fh)
' "$STATE_FILE" "$key" "$value"
}
|
|
|
|
# Health check: probe $HEALTH_URL once.
# Globals:   HEALTH_URL (read)
# Returns:   0 when the endpoint answers HTTP 200 within 10 seconds,
#            1 for any other status code or when the request fails
check_health() {
  local http_code
  # A failed request must not abort the script; it simply yields a code that
  # is not 200.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' -m 10 "$HEALTH_URL" 2>/dev/null) || true
  case "$http_code" in
    200) return 0 ;;
    *) return 1 ;;
  esac
}
|
|
|
|
# Check pod status: compare the deployment's ready replicas with the desired
# replica count.
# Globals:   DEPLOYMENT, NAMESPACE (read)
# Returns:   0 when readyReplicas >= spec.replicas, 1 otherwise
check_pod_status() {
  local -a query=(get deployment "$DEPLOYMENT" -n "$NAMESPACE")
  local ready_now wanted

  # When kubectl fails, fall back to "0 ready of 1 wanted", which reports the
  # deployment as not ready.
  ready_now=$(kubectl "${query[@]}" -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
  wanted=$(kubectl "${query[@]}" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")

  # Empty output is treated the same way as the kubectl-failure fallback.
  if [ "${ready_now:-0}" -ge "${wanted:-1}" ]; then
    return 0
  fi
  return 1
}
|
|
|
|
# Estimate the failure rate by probing the health endpoint.
# Simplified approach: run 5 health checks one second apart and count how
# many fail (no metrics backend is queried).
# Returns:   1 when 3 or more of the 5 probes fail (>= 60% failure rate),
#            0 otherwise
check_error_rate() {
  local -r probes=5 max_failures=3
  local failures=0
  local attempt

  for ((attempt = 1; attempt <= probes; attempt++)); do
    if ! check_health; then
      # Plain assignment rather than ((failures++)): the post-increment
      # evaluates to 0 on the first failure, which gives exit status 1 and
      # would terminate the script under `set -e`.
      failures=$((failures + 1))
    fi
    sleep 1
  done

  if [ "$failures" -ge "$max_failures" ]; then
    return 1 # at least 60% of the probes failed
  fi
  return 0
}
|
|
|
|
# Get the current revision: print the revision number on the last numbered
# line of the deployment's rollout history.
# Globals:   DEPLOYMENT, NAMESPACE (read)
# Outputs:   the revision number, or nothing when no history line is found
get_current_revision() {
  kubectl rollout history deployment/"$DEPLOYMENT" -n "$NAMESPACE" 2>/dev/null \
    | awk '/^[0-9]+/ { newest = $1; found = 1 } END { if (found) print newest }'
}
|
|
|
|
# Get the previous revision: print the revision number on the second-to-last
# numbered line of the deployment's rollout history.
# Globals:   DEPLOYMENT, NAMESPACE (read)
# Outputs:   the revision number; when the history holds a single revision
#            that revision is printed; nothing when no history line is found
get_previous_revision() {
  kubectl rollout history deployment/"$DEPLOYMENT" -n "$NAMESPACE" 2>/dev/null \
    | awk '
        /^[0-9]+/ { older = newer; newer = $1; seen++ }
        END {
          if (seen >= 2) print older
          else if (seen == 1) print newer
        }'
}
|
|
|
|
# Perform the rollback: undo the deployment to its previous revision, at most
# once per hour.
# Globals:   DEPLOYMENT, NAMESPACE (read)
# State:     reads last_rollback; writes last_rollback, rollback_from and
#            rollback_to once the undo command has succeeded
# Returns:   0 when `kubectl rollout undo` succeeded,
#            1 when there is no earlier revision, a rollback already happened
#              within the last hour, or the undo command failed
do_rollback() {
  local current_rev previous_rev last_rollback now elapsed

  current_rev=$(get_current_revision) || true
  previous_rev=$(get_previous_revision) || true

  if [ -z "$previous_rev" ] || [ "$previous_rev" = "$current_rev" ]; then
    log "沒有可回滾的版本"
    return 1
  fi

  # Cooldown (prevents rollback flapping). Only a purely numeric timestamp is
  # trusted, so a damaged state file cannot break the arithmetic below.
  last_rollback=$(get_state "last_rollback") || true
  now=$(date +%s)
  if [[ "$last_rollback" =~ ^[0-9]+$ ]]; then
    elapsed=$((now - last_rollback))
    if [ "$elapsed" -lt 3600 ]; then # no repeated rollback within 1 hour
      log "最近 1 小時內已回滾過,跳過"
      return 1
    fi
  fi

  log "執行回滾: $current_rev -> $previous_rev"

  # This function is normally called from an `if`, where `set -e` is
  # inactive, so kubectl's exit status has to be checked explicitly.
  if ! kubectl rollout undo deployment/"$DEPLOYMENT" -n "$NAMESPACE"; then
    log "kubectl rollout undo 失敗,回滾中止"
    return 1
  fi

  # The undo has been issued, so the cooldown applies from this point even if
  # the rollout is slow to finish.
  set_state "last_rollback" "$now"
  set_state "rollback_from" "$current_rev"
  set_state "rollback_to" "$previous_rev"

  if kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=120s; then
    log "回滾完成"
  else
    # Still return 0: the caller's verification step decides whether the
    # service actually recovered.
    log "回滾未在 120 秒內完成,交由後續驗證判斷"
  fi
  return 0
}
|
|
|
|
# Verify the rollback: after a settling period, confirm the service is
# healthy again.
# Returns:   0 when both the health check and the pod-status check pass,
#            1 otherwise
verify_rollback() {
  # Wait for the service to stabilise before probing it.
  sleep 30

  check_health || return 1
  check_pod_status || return 1
  return 0
}
|
|
|
|
# Main logic: run one detection cycle.
# Counts consecutive unhealthy runs in the state file; once the count reaches
# ERROR_THRESHOLD it rolls the deployment back, verifies the result and sends
# a Telegram notification.
# Globals:   ERROR_THRESHOLD, DEPLOYMENT, NAMESPACE (read)
# State:     reads and writes fail_count
# Returns:   0
main() {
  local fail_count current_rev message

  log "===== 開始異常檢測 ====="

  # Anything that is not a plain non-negative integer (missing key, damaged
  # state file) counts as zero.
  fail_count=$(get_state "fail_count") || true
  [[ "$fail_count" =~ ^[0-9]+$ ]] || fail_count=0

  # Check health.
  if check_health && check_pod_status; then
    # Service is healthy: reset the counter.
    if [ "$fail_count" -gt 0 ]; then
      log "服務已恢復正常,重置失敗計數"
      set_state "fail_count" "0"
    fi
    log "服務狀態正常"
    return 0
  fi

  # Service is unhealthy.
  # Plain assignment rather than ((fail_count++)): the post-increment
  # evaluates to 0 on the first failure, which gives exit status 1 and would
  # terminate the script under `set -e` before the count is ever saved.
  # The 10# prefix keeps a value with leading zeros from being read as octal.
  fail_count=$((10#$fail_count + 1))
  set_state "fail_count" "$fail_count"
  log "檢測到異常,連續失敗次數: $fail_count"

  if [ "$fail_count" -ge "$ERROR_THRESHOLD" ]; then
    log "達到回滾閾值 ($ERROR_THRESHOLD),開始自動回滾"

    current_rev=$(get_current_revision) || true

    if do_rollback; then
      # Wait, then verify.
      if verify_rollback; then
        # Reset the counter before notifying, so that a delivery problem can
        # never leave the counter stuck at the threshold.
        set_state "fail_count" "0"
        log "回滾驗證成功"

        printf -v message '%s\n' \
          "🔄 <b>自動回滾成功</b>" \
          "" \
          "📦 Deployment: <code>$DEPLOYMENT</code>" \
          "📊 版本回滾: $current_rev → $(get_current_revision)" \
          "🔍 原因: 連續 $ERROR_THRESHOLD 次健康檢查失敗" \
          "" \
          "✅ 服務已恢復正常" \
          "" \
          "⚠️ <b>請檢查最新部署的程式碼問題</b>" \
          "" \
          "⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')" \
          "🏷️ <i>MOMO Pro 自動修復系統</i>"
        send_telegram "${message%$'\n'}"
      else
        log "回滾後服務仍然異常,需要人工介入"

        printf -v message '%s\n' \
          "🔴 <b>自動回滾失敗</b>" \
          "" \
          "📦 Deployment: <code>$DEPLOYMENT</code>" \
          "❌ 回滾後服務仍然異常" \
          "📋 需要人工介入" \
          "" \
          "🔗 SSH: <code>ssh wooo@192.168.0.110</code>" \
          "🔗 查看日誌: <code>kubectl logs -f deployment/$DEPLOYMENT -n $NAMESPACE</code>" \
          "" \
          "⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')"
        send_telegram "${message%$'\n'}"
      fi
    else
      # Log only (no notification): while the cooldown is active this branch
      # is reached on every run.
      log "本次未執行回滾"
    fi
  else
    log "異常次數 $fail_count 未達閾值 $ERROR_THRESHOLD"
  fi

  log "===== 異常檢測完成 ====="
}
|
|
|
|
# Script entry point: run main, forwarding the command-line arguments.
main "$@"
|