Files
ewoooc/scripts/auto-repair/oom-handler-gcp.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

151 lines
4.2 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# GCP OOM auto-repair script.
# Purpose: detect OOM events on GCP, automatically raise the resource limit
#          of the affected workloads and restart them.
# Runs on: the UAT host, connecting to GCP through gcloud SSH.
#
# Every setting below may be overridden from the environment; the defaults
# are the values this script has always used.
set -e

# GCP settings
GCP_PROJECT="${GCP_PROJECT:-astral-gateway-484913-d7}"
GCP_ZONE="${GCP_ZONE:-asia-east1-b}"
GCP_VM="${GCP_VM:-momo-pro-gcp}"
NAMESPACE="${NAMESPACE:-momo}"

# Notification settings. Prefer supplying TELEGRAM_BOT through the
# environment so the bot token does not have to be stored in this file.
TELEGRAM_BOT="${TELEGRAM_BOT:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT="${TELEGRAM_CHAT:-5619078117}"

# File that log() appends to.
LOG_FILE="${LOG_FILE:-/var/log/oom_handler_gcp.log}"
# Print one timestamped, [GCP-OOM]-tagged line to stdout and append the same
# line to $LOG_FILE.
#   $1 - message text
log() {
  printf '[%s] [GCP-OOM] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a "$LOG_FILE"
}
# Send an HTML-formatted message to the configured Telegram chat.
#   $1 - message text (may span several lines and contain HTML tags)
# Reads TELEGRAM_BOT and TELEGRAM_CHAT. The API response body is discarded;
# the exit status is curl's.
send_telegram() {
  # The text goes through --data-urlencode rather than -d: -d sends the value
  # verbatim, so an '&', '+' or '%' in the message would corrupt the form body.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
    -d "chat_id=${TELEGRAM_CHAT}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=$1" > /dev/null
}
# Run a command on the GCP VM over `gcloud compute ssh`.
#   $1 - command line to execute remotely
# The remote command's stdout is passed through; gcloud's stderr is discarded.
# The exit status is gcloud's.
gcp_exec() {
  local -a ssh_args=(
    compute ssh "$GCP_VM"
    --zone="$GCP_ZONE"
    --project="$GCP_PROJECT"
    --command="$1"
  )
  gcloud "${ssh_args[@]}" 2>/dev/null
}
# Detect recent OOM events.
# Prints a comma-separated, de-duplicated, sorted list of the pod names found
# in OOMKilled events of $NAMESPACE whose lastTimestamp lies within the last
# 30 minutes. Prints an empty line when there are none, or when the event
# list cannot be fetched or parsed.
check_oom_events() {
  # The Python program is single-quoted so the shell expands nothing in it;
  # it must therefore not contain any single-quote character.
  gcp_exec "sudo kubectl get events -n $NAMESPACE --field-selector reason=OOMKilled --sort-by='.lastTimestamp' -o json" 2>/dev/null \
    | python3 -c '
import json
import sys
from datetime import datetime, timedelta, timezone

try:
    data = json.load(sys.stdin)
    threshold = datetime.now(timezone.utc) - timedelta(minutes=30)
    recent_pods = set()
    for item in data.get("items", []):
        try:
            ts = item.get("lastTimestamp")
            if not ts:
                continue
            event_time = datetime.fromisoformat(ts.replace("Z", "+00:00"))
            if event_time.tzinfo is None:
                # Timestamp without an offset: treat it as UTC.
                event_time = event_time.replace(tzinfo=timezone.utc)
            if event_time > threshold:
                pod = item.get("involvedObject", {}).get("name", "unknown")
                recent_pods.add(pod)
        except (AttributeError, TypeError, ValueError):
            # Malformed event entry: skip it and keep the others.
            continue
    # Sorted so the output is stable from run to run.
    print(",".join(sorted(recent_pods)))
except Exception:
    # Empty or non-JSON input (for example a failed SSH call): no events.
    print("")
'
}
# Print the memory limit currently configured on the first container of a
# Deployment in $NAMESPACE.
#   $1 - Deployment name
# The exit status is that of gcp_exec.
get_current_memory_limit() {
  local target=$1
  local jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}'
  local remote_cmd="sudo kubectl get deployment ${target} -n ${NAMESPACE} -o jsonpath='${jsonpath}'"
  gcp_exec "$remote_cmd" 2>/dev/null
}
# Raise a Deployment's memory limit by 50% (capped at 8Gi) and patch it.
#   $1 - Deployment name
# Stdout:  only the new limit (e.g. "768Mi"), so callers can capture it with
#          $(...). Log lines and kubectl output are sent to stderr.
# Returns: 0 on success; 1 when the current limit cannot be read or parsed,
#          or when the patch command fails.
increase_memory_limit() {
  local deployment=$1
  local current_limit value unit orig_unit new_value new_limit
  local -r cap_mi=8192
  local -r quantity_re='^([0-9]+)([A-Za-z]*)$'

  current_limit=$(get_current_memory_limit "$deployment") || current_limit=""

  # Only whole-number quantities are handled. An empty limit would break the
  # arithmetic below and a fractional one ("1.5Gi") would be misread.
  if [[ ! "$current_limit" =~ $quantity_re ]]; then
    log "無法解析 GCP $deployment 的記憶體限制 '$current_limit',略過調整" >&2
    return 1
  fi
  value=$((10#${BASH_REMATCH[1]}))  # 10# keeps a leading zero from meaning octal
  orig_unit=${BASH_REMATCH[2]}
  unit=$orig_unit

  # Do the maths in Mi for Gi limits: with integer arithmetic a small Gi value
  # would not grow at all (1Gi * 3 / 2 is still 1Gi).
  if [[ "$unit" == "Gi" ]]; then
    value=$((value * 1024))
    unit="Mi"
  fi

  # Add 50%, at most 8Gi, but never lower a limit that is already above the cap.
  new_value=$((value * 3 / 2))
  if [[ "$unit" == "Mi" ]] && ((new_value > cap_mi)); then
    new_value=$((value > cap_mi ? value : cap_mi))
  fi

  # Report a Gi limit in Gi again when the result is a whole number of Gi.
  if [[ "$orig_unit" == "Gi" ]] && ((new_value % 1024 == 0)); then
    new_limit="$((new_value / 1024))Gi"
  else
    new_limit="${new_value}${unit}"
  fi

  log "增加 GCP $deployment 記憶體限制: $current_limit -> $new_limit" >&2

  # Update the Deployment.
  # NOTE(review): the patch addresses the container by the Deployment's name,
  # while the current limit is read from containers[0] — confirm they match.
  if ! gcp_exec "sudo kubectl patch deployment $deployment -n $NAMESPACE -p '{\"spec\":{\"template\":{\"spec\":{\"containers\":[{\"name\":\"$deployment\",\"resources\":{\"limits\":{\"memory\":\"$new_limit\"}}}]}}}}'" >&2; then
    log "更新 GCP $deployment 記憶體限制失敗" >&2
    return 1
  fi

  printf '%s\n' "$new_limit"
}
# Main flow: look for pods with recent OOM events, raise the memory limit of
# each affected Deployment once, wait for its rollout and send a Telegram
# notification. A failure on one Deployment is logged and the loop moves on
# to the next one.
main() {
  local oom_pods pod deployment current_limit new_limit message
  local -a pods=()
  local seen=","

  log "===== 開始 GCP OOM 檢測 ====="

  oom_pods=$(check_oom_events)
  if [[ -z "$oom_pods" ]]; then
    log "沒有發現最近的 GCP OOM 事件"
    return 0
  fi
  log "發現 GCP OOM 事件: $oom_pods"

  # Work out the affected Deployments.
  IFS=',' read -r -a pods <<< "$oom_pods"
  for pod in "${pods[@]}"; do
    # Derive the Deployment name by dropping the two trailing name segments
    # of the pod name.
    deployment=$(printf '%s\n' "$pod" | sed 's/-[a-z0-9]*-[a-z0-9]*$//')
    [[ -n "$deployment" ]] || continue

    # Several OOM-killed pods can belong to one Deployment; raise its limit
    # only once per run.
    case "$seen" in
      *",$deployment,"*) continue ;;
    esac
    seen+="$deployment,"

    current_limit=$(get_current_memory_limit "$deployment") || current_limit=""

    # Test the assignment directly: under `set -e` a separate `$?` check
    # after it would never see a failure.
    if ! new_limit=$(increase_memory_limit "$deployment"); then
      log "GCP $deployment 記憶體限制調整失敗,略過"
      continue
    fi
    # Keep only the last output line: that is the new limit, anything before
    # it is log or kubectl output.
    new_limit=${new_limit##*$'\n'}

    # Wait for the pods to restart.
    sleep 10
    if ! gcp_exec "sudo kubectl rollout status deployment/$deployment -n $NAMESPACE --timeout=120s"; then
      log "GCP $deployment 未在 120 秒內完成 rollout,略過通知"
      continue
    fi

    # Send the notification.
    message="🔧 <b>【GCP】OOM 自動修復完成</b>
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
📦 Deployment: <code>$deployment</code>
📊 記憶體調整: $current_limit -> $new_limit
✅ Pod 已自動重啟
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
🏷️ <i>MOMO Pro 自動修復系統</i>"
    send_telegram "$message" || log "Telegram 通知發送失敗: $deployment"

    log "已修復 GCP $deployment OOM 問題"
  done

  log "===== GCP OOM 檢測完成 ====="
}
# Entry point: run main with the script's command-line arguments.
main "$@"