#!/bin/bash
#
# GCP OOM auto-remediation script.
#
# Purpose:  detect recent OOM events on the GCP cluster, raise the memory
#           limit of each affected Deployment by 50% and wait for the rollout.
# Runs on:  the UAT host; every kubectl command is executed on the GCP VM
#           through `gcloud compute ssh`.
# Env:      TELEGRAM_BOT  (optional) Telegram bot token; when empty, the
#                         notification step is skipped.
#
# NOTE(review): the script keeps no state between runs. An event stays inside
# the 30-minute window used by check_oom_events, so running this more often
# than every 30 minutes may bump the same Deployment repeatedly - confirm the
# schedule.

set -euo pipefail

# GCP settings
readonly GCP_PROJECT="astral-gateway-484913-d7"
readonly GCP_ZONE="asia-east1-b"
readonly GCP_VM="momo-pro-gcp"
readonly NAMESPACE="momo"

# Notification settings. The token is taken from the environment so that it
# does not have to be hard-coded in this file.
readonly TELEGRAM_BOT="${TELEGRAM_BOT:-}"
readonly TELEGRAM_CHAT="5619078117"

readonly LOG_FILE="/var/log/oom_handler_gcp.log"

# Upper bound for an automatically raised memory limit, in Mi (8Gi).
readonly MAX_MEMORY_MI=8192

#######################################
# Write a timestamped message to stdout and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   the formatted line on stdout and in LOG_FILE
#######################################
log() {
  printf '[%s] [GCP-OOM] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a "$LOG_FILE"
}

#######################################
# Send a message to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_BOT, TELEGRAM_CHAT (read)
# Arguments: $1 - message text (sent with parse_mode=HTML)
# Returns:   always 0; a failed or skipped notification is only logged so
#            that it never aborts the remediation run.
#######################################
send_telegram() {
  if [[ -z "$TELEGRAM_BOT" ]]; then
    log "未設定 TELEGRAM_BOT,略過 Telegram 通知"
    return 0
  fi

  # --data-urlencode keeps newlines and characters such as '&', '+' and '%'
  # intact; a plain -d would send them un-encoded.
  if ! curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
    -d "chat_id=${TELEGRAM_CHAT}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=$1" > /dev/null; then
    log "Telegram 通知發送失敗"
  fi
  return 0
}

#######################################
# Run a command on the GCP VM over gcloud SSH.
# Globals:   GCP_VM, GCP_ZONE, GCP_PROJECT (read)
# Arguments: $1 - command line to run remotely
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   the exit status of gcloud
#######################################
gcp_exec() {
  gcloud compute ssh "$GCP_VM" \
    --zone="$GCP_ZONE" \
    --project="$GCP_PROJECT" \
    --command="$1" 2> /dev/null
}

#######################################
# List pods that had an OOM event within the last 30 minutes.
# Globals:   NAMESPACE (read)
# Outputs:   comma-separated, de-duplicated pod names on stdout (an empty
#            line when there are none)
# Returns:   non-zero when the remote query fails or its output is not JSON
#
# NOTE(review): this filters events by reason=OOMKilled - confirm that the
# cluster actually emits events with that reason.
#######################################
check_oom_events() {
  # The Python program is single-quoted for the shell, so it must not contain
  # any single quote characters.
  gcp_exec "sudo kubectl get events -n $NAMESPACE --field-selector reason=OOMKilled --sort-by='.lastTimestamp' -o json" \
    | python3 -c '
import json
import sys
from datetime import datetime, timedelta, timezone

try:
    data = json.load(sys.stdin)
except ValueError as exc:
    sys.stderr.write("cannot parse kubectl output: %s\n" % exc)
    sys.exit(1)

threshold = datetime.now(timezone.utc) - timedelta(minutes=30)
recent = set()
for item in data.get("items", []):
    ts = item.get("lastTimestamp")
    if not ts:
        continue
    try:
        event_time = datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except ValueError:
        continue
    if event_time.tzinfo is None:
        event_time = event_time.replace(tzinfo=timezone.utc)
    if event_time > threshold:
        recent.add(item.get("involvedObject", {}).get("name", "unknown"))

print(",".join(sorted(recent)))
'
}

#######################################
# Print the memory limit of the first container of a Deployment.
# Globals:   NAMESPACE (read)
# Arguments: $1 - deployment name
# Outputs:   the limit as stored in the Deployment (e.g. 512Mi); empty when
#            no limit is set
# Returns:   the exit status of the remote kubectl call
#######################################
get_current_memory_limit() {
  local deployment=$1
  gcp_exec "sudo kubectl get deployment $deployment -n $NAMESPACE -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}'"
}

#######################################
# Compute the limit that is 50% above the given one.
# Globals:   MAX_MEMORY_MI (read)
# Arguments: $1 - current limit: an integer followed by an optional unit
# Outputs:   the new limit on stdout
# Returns:   1 when the value cannot be parsed, is already at or above the
#            cap, or would not grow
#######################################
_next_memory_limit() {
  local current=$1
  local value unit current_mi new_mi new_value

  if [[ ! "$current" =~ ^([0-9]+)([A-Za-z]*)$ ]]; then
    return 1
  fi
  value=$((10#${BASH_REMATCH[1]})) # force base 10: "08" would be bad octal
  unit=${BASH_REMATCH[2]}

  case "$unit" in
    Gi | Mi)
      # Work in Mi so that small Gi values do not truncate (1Gi * 3 / 2 is
      # 1 in integer Gi, but 1536 in Mi).
      current_mi=$value
      if [[ "$unit" == "Gi" ]]; then
        current_mi=$((value * 1024))
      fi
      # Never "raise" a limit that is already at or above the cap: clamping
      # it would lower the memory of a pod that is running out of memory.
      if ((current_mi >= MAX_MEMORY_MI)); then
        return 1
      fi
      new_mi=$((current_mi * 3 / 2))
      if ((new_mi > MAX_MEMORY_MI)); then
        new_mi=$MAX_MEMORY_MI
      fi
      if ((new_mi <= current_mi)); then
        return 1
      fi
      # Keep the caller's unit whenever the result is a whole number of it.
      if [[ "$unit" == "Gi" ]] && ((new_mi % 1024 == 0)); then
        printf '%s\n' "$((new_mi / 1024))Gi"
      else
        printf '%s\n' "${new_mi}Mi"
      fi
      ;;
    *)
      # Other units are scaled in place; the 8Gi cap is only enforced for
      # Mi and Gi.
      new_value=$((value * 3 / 2))
      if ((new_value <= value)); then
        return 1
      fi
      printf '%s\n' "${new_value}${unit}"
      ;;
  esac
}

#######################################
# Raise the memory limit of a Deployment by 50% (capped at 8Gi).
# Globals:   NAMESPACE (read)
# Arguments: $1 - deployment name
# Outputs:   ONLY the new limit on stdout, because callers capture it with
#            $(...); log lines and kubectl output are sent to stderr.
# Returns:   0 when the Deployment was patched, 1 otherwise
#######################################
increase_memory_limit() {
  local deployment=$1
  local current_limit new_limit

  if ! current_limit=$(get_current_memory_limit "$deployment"); then
    log "無法取得 GCP $deployment 的記憶體限制" >&2
    return 1
  fi

  if ! new_limit=$(_next_memory_limit "$current_limit"); then
    log "無法調高 GCP $deployment 記憶體限制 (目前: ${current_limit:-未設定})" >&2
    return 1
  fi

  log "增加 GCP $deployment 記憶體限制: $current_limit -> $new_limit" >&2

  # The patch addresses the container by name and assumes it is named after
  # the Deployment.
  if ! gcp_exec "sudo kubectl patch deployment $deployment -n $NAMESPACE -p '{\"spec\":{\"template\":{\"spec\":{\"containers\":[{\"name\":\"$deployment\",\"resources\":{\"limits\":{\"memory\":\"$new_limit\"}}}]}}}}'" >&2; then
    log "更新 GCP $deployment 記憶體限制失敗" >&2
    return 1
  fi

  printf '%s\n' "$new_limit"
}

#######################################
# Entry point: find recent OOM events and remediate each affected
# Deployment once.
# Returns:   0 when the check completed, 1 when the event query failed
#######################################
main() {
  local oom_pods pod deployment current_limit new_limit message
  local handled=" " # space-delimited list of deployments already processed
  local -a pods=()

  log "===== 開始 GCP OOM 檢測 ====="

  if ! oom_pods=$(check_oom_events); then
    log "查詢 GCP OOM 事件失敗"
    return 1
  fi

  if [[ -z "$oom_pods" ]]; then
    log "沒有發現最近的 GCP OOM 事件"
    return 0
  fi

  log "發現 GCP OOM 事件: $oom_pods"

  IFS=',' read -r -a pods <<< "$oom_pods"
  for pod in "${pods[@]}"; do
    # Derive the Deployment name by stripping the last two dash-separated
    # segments of the pod name.
    deployment=$(sed 's/-[a-z0-9]*-[a-z0-9]*$//' <<< "$pod")
    if [[ -z "$deployment" ]]; then
      continue
    fi

    # Several pods of one Deployment must trigger a single increase only.
    if [[ "$handled" == *" $deployment "* ]]; then
      continue
    fi
    handled+="$deployment "

    current_limit=$(get_current_memory_limit "$deployment") || current_limit=""

    if new_limit=$(increase_memory_limit "$deployment"); then
      # Give the new ReplicaSet a moment to start before polling the rollout.
      sleep 10
      if ! gcp_exec "sudo kubectl rollout status deployment/$deployment -n $NAMESPACE --timeout=120s"; then
        log "GCP $deployment rollout 未在時限內完成,略過通知"
        continue
      fi

      message=$(printf '%s\n' \
        "🔧 【GCP】OOM 自動修復完成" \
        "🏢 環境: 🟥 PROD (momo.wooo.work)" \
        "📦 Deployment: $deployment" \
        "📊 記憶體調整: $current_limit → $new_limit" \
        "✅ Pod 已自動重啟" \
        "⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')" \
        "🏷️ MOMO Pro 自動修復系統")
      send_telegram "$message"

      log "已修復 GCP $deployment OOM 問題"
    fi
  done

  log "===== GCP OOM 檢測完成 ====="
}

main "$@"