Files
ewoooc/scripts/auto-repair/oom-handler.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

131 lines
3.6 KiB
Bash
Executable File

#!/bin/bash
# OOM auto-remediation script.
# Purpose: detect OOM events, automatically raise the resource limits of the
# affected workloads and restart them.
#
# Configuration (each value may be overridden from the environment; the
# defaults below are the values the script has always used):
#   NAMESPACE      Kubernetes namespace to inspect
#   TELEGRAM_BOT   Telegram bot token - supply it via the environment rather
#                  than committing a real token to this file
#   TELEGRAM_CHAT  Telegram chat id that receives the notifications
#   LOG_FILE       File that log() appends to
set -e

NAMESPACE="${NAMESPACE:-momo}"
TELEGRAM_BOT="${TELEGRAM_BOT:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT="${TELEGRAM_CHAT:-5619078117}"
LOG_FILE="${LOG_FILE:-/var/log/oom_handler.log}"
# Print a timestamped message and append it to the log file.
#
# Globals:   LOG_FILE (read) - file the message is appended to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in $LOG_FILE
# Returns:   always 0; logging is best-effort
log() {
  local message="${1-}"
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] ${message}"
  # tee exits non-zero when it cannot open $LOG_FILE (it still copies the line
  # to stdout and reports the problem on stderr). Under `set -e` that status
  # would abort the whole remediation run, so it is deliberately ignored.
  printf '%s\n' "$line" | tee -a "$LOG_FILE" || true
}
# Send an HTML-formatted message through the Telegram Bot API.
#
# Globals:   TELEGRAM_BOT (read)  - bot token
#            TELEGRAM_CHAT (read) - destination chat id
# Arguments: $1 - message text (Telegram HTML markup)
# Outputs:   nothing on stdout; curl errors are reported on stderr
# Returns:   curl's exit status
send_telegram() {
  local text="${1-}"
  # --data-urlencode keeps '&', '+', '%' and newlines in the message intact;
  # with plain -d they would be interpreted as form syntax and corrupt the
  # request. --max-time stops a stalled connection from hanging the run.
  curl -sS --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT}" \
    --data-urlencode "parse_mode=HTML" \
    --data-urlencode "text=${text}" > /dev/null
}
# Detect recent OOM events.
#
# Queries the namespace's events with reason=OOMKilled and keeps those whose
# lastTimestamp falls within the last 30 minutes.
# NOTE(review): confirm that OOM kills in this cluster are actually reported
# as events with reason "OOMKilled"; the selector is kept as it was.
#
# Globals:   NAMESPACE (read)
# Arguments: none
# Outputs:   one line on stdout: the affected pod names, de-duplicated,
#            sorted and comma-separated (an empty line when there are none)
# Returns:   non-zero when kubectl produced no parseable JSON
check_oom_events() {
  # The Python program must start at column 0 inside the quoted string, and
  # it must not contain double quotes, '$' or backticks.
  kubectl get events -n "$NAMESPACE" --field-selector reason=OOMKilled \
    --sort-by='.lastTimestamp' -o json 2>/dev/null \
    | python3 -c "
import json
import sys
from datetime import datetime, timedelta, timezone

try:
    data = json.load(sys.stdin)
except ValueError:
    # kubectl failed (its stderr is suppressed) or printed something else.
    print('check_oom_events: kubectl returned no valid JSON', file=sys.stderr)
    sys.exit(1)

cutoff = datetime.now(timezone.utc) - timedelta(minutes=30)
pods = set()
for item in data.get('items') or []:
    ts = item.get('lastTimestamp')
    if not ts:
        continue
    seen_at = datetime.fromisoformat(ts.replace('Z', '+00:00'))
    if seen_at.tzinfo is None:
        # Treat a timestamp without an offset as UTC.
        seen_at = seen_at.replace(tzinfo=timezone.utc)
    if seen_at > cutoff:
        pods.add((item.get('involvedObject') or {}).get('name', 'unknown'))

# Sorted so the output is stable from run to run.
print(','.join(sorted(pods)))
"
}
# Look up the memory limit currently configured on a Deployment.
#
# Globals:   NAMESPACE (read)
# Arguments: $1 - Deployment name
# Outputs:   the limits.memory value of the first container in the pod
#            template, printed without a trailing newline (empty when unset)
# Returns:   kubectl's exit status; kubectl's stderr is discarded
get_current_memory_limit() {
  local target="$1"
  local field='{.spec.template.spec.containers[0].resources.limits.memory}'
  kubectl get deployment "$target" \
    -n "$NAMESPACE" \
    -o "jsonpath=${field}" \
    2>/dev/null
}
# Raise a Deployment's memory limit by 50%, up to a ceiling of 8Gi.
#
# Mi and Gi limits are computed in Mi so that small Gi values really grow
# (with integer maths 1Gi * 3 / 2 is still 1Gi). A limit that is already at or
# above the ceiling is left untouched rather than being lowered to it. Other
# integer units (M, G, Ki, plain bytes, ...) are scaled by 3/2 with no ceiling.
#
# Globals:   NAMESPACE (read)
# Arguments: $1 - Deployment name
# Outputs:   stdout carries ONLY the resulting limit (e.g. "768Mi") so callers
#            can capture it; log lines and kubectl output are sent to stderr
# Returns:   0 when the limit was raised or was already at/above the ceiling;
#            non-zero when the current limit cannot be read or parsed, or when
#            the patch is rejected
increase_memory_limit() {
  local deployment=$1
  local -r ceiling_mi=8192 # 8Gi expressed in Mi
  local -r quantity_re='^([0-9]+)([A-Za-z]*)$'
  local current_limit value unit current_mi new_mi new_limit container patch

  current_limit=$(get_current_memory_limit "$deployment") || return 1

  # An empty, zero or fractional limit cannot be scaled safely; blindly
  # patching would set the limit to "0" or to a malformed quantity.
  if [[ ! "$current_limit" =~ $quantity_re ]] \
    || ((10#${BASH_REMATCH[1]} == 0)); then
    log "無法解析 $deployment 的記憶體限制 '$current_limit',略過調整" >&2
    return 1
  fi
  value=$((10#${BASH_REMATCH[1]})) # 10# avoids octal parsing of leading zeros
  unit=${BASH_REMATCH[2]}

  if [[ "$unit" == "Gi" || "$unit" == "Mi" ]]; then
    current_mi=$value
    if [[ "$unit" == "Gi" ]]; then
      current_mi=$((value * 1024))
    fi

    if ((current_mi >= ceiling_mi)); then
      log "$deployment 記憶體限制 $current_limit 已達上限 8Gi,不再調整" >&2
      printf '%s\n' "$current_limit"
      return 0
    fi

    new_mi=$((current_mi * 3 / 2))
    if ((new_mi > ceiling_mi)); then
      new_mi=$ceiling_mi
    fi

    # Keep reporting in Gi when the result is a whole number of Gi.
    if [[ "$unit" == "Gi" ]] && ((new_mi % 1024 == 0)); then
      new_limit="$((new_mi / 1024))Gi"
    else
      new_limit="${new_mi}Mi"
    fi
  else
    new_limit="$((value * 3 / 2))${unit}"
  fi

  log "增加 $deployment 記憶體限制: $current_limit -> $new_limit" >&2

  # The limit is read from containers[0], so patch that same container.
  # Fall back to the Deployment name when the name cannot be looked up.
  container=$(kubectl get deployment "$deployment" -n "$NAMESPACE" \
    -o jsonpath='{.spec.template.spec.containers[0].name}' 2>/dev/null) \
    || container=""
  if [[ -z "$container" ]]; then
    container=$deployment
  fi

  patch=$(printf '{"spec":{"template":{"spec":{"containers":[{"name":"%s","resources":{"limits":{"memory":"%s"}}}]}}}}' \
    "$container" "$new_limit")

  # Update the Deployment. A rejected patch must surface as a failure; without
  # the explicit return the trailing printf would report success.
  kubectl patch deployment "$deployment" -n "$NAMESPACE" -p "$patch" >&2 \
    || return

  printf '%s\n' "$new_limit"
}
# Main logic: find recent OOM events, raise the memory limit of every affected
# Deployment once, wait for its rollout and send a Telegram notification.
#
# Globals:   NAMESPACE (read)
# Arguments: none used
# Returns:   0 when there was nothing to do or every Deployment was handled;
#            1 when detection failed or any Deployment could not be repaired
main() {
  local oom_pods pod deployment current_limit new_limit
  local handled=" " # space-delimited list of Deployments already processed
  local rc=0
  local -a pods=()
  # Strips the trailing "-<replicaset-hash>-<pod-suffix>" from a pod name.
  local -r pod_suffix_re='^(.*)-[a-z0-9]*-[a-z0-9]*$'

  log "===== 開始 OOM 檢測 ====="

  if ! oom_pods=$(check_oom_events); then
    log "OOM 事件查詢失敗"
    return 1
  fi

  if [[ -z "$oom_pods" ]]; then
    log "沒有發現最近的 OOM 事件"
    return 0
  fi
  log "發現 OOM 事件: $oom_pods"

  # Resolve the affected Deployments.
  IFS=',' read -r -a pods <<< "$oom_pods"
  for pod in "${pods[@]}"; do
    # Derive the Deployment name from the pod name.
    if [[ "$pod" =~ $pod_suffix_re ]]; then
      deployment=${BASH_REMATCH[1]}
    else
      deployment=$pod
    fi
    if [[ -z "$deployment" ]]; then
      continue
    fi

    # Several OOM-killed pods can belong to one Deployment; bump it only once
    # per run instead of compounding +50% for every pod.
    case "$handled" in
      *" $deployment "*) continue ;;
    esac
    handled+="$deployment "

    # Failures are handled per Deployment so that one bad entry (for example
    # a pod that is not owned by a Deployment) does not stop the others.
    if ! current_limit=$(get_current_memory_limit "$deployment"); then
      log "無法取得 $deployment 的記憶體限制,略過"
      rc=1
      continue
    fi

    if ! new_limit=$(increase_memory_limit "$deployment"); then
      log "調整 $deployment 記憶體限制失敗"
      rc=1
      continue
    fi
    # The limit is the last line printed; drop anything emitted before it.
    new_limit=${new_limit##*$'\n'}

    # Give the rollout a moment to start, then wait for the pods to restart.
    sleep 10
    if ! kubectl rollout status deployment/"$deployment" -n "$NAMESPACE" \
      --timeout=120s; then
      log "$deployment rollout 未在時限內完成"
      rc=1
      continue
    fi

    # Send the notification; a failed notification must not undo or hide a
    # successful repair.
    if ! send_telegram "🔧 <b>OOM 自動修復完成</b>
📦 Deployment: <code>$deployment</code>
📊 記憶體調整: $current_limit → $new_limit
✅ Pod 已自動重啟
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
🏷️ <i>MOMO Pro 自動修復系統</i>"; then
      log "Telegram 通知發送失敗: $deployment"
    fi
    log "已修復 $deployment OOM 問題"
  done

  log "===== OOM 檢測完成 ====="
  return "$rc"
}
# Entry point: run main with the script's command-line arguments.
main "$@"