Files
ewoooc/scripts/auto-repair/oom-handler-gcp.sh
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

151 lines
4.2 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# GCP OOM auto-repair script.
# Purpose: detect OOM events on GCP, automatically raise the resource limit
#          and restart.
# Runs on: the UAT host, connecting to GCP through gcloud SSH.
#
# Every setting below can be overridden through an environment variable of the
# same name; the values shown are the defaults used when the variable is unset
# or empty.
set -e

# GCP settings
GCP_PROJECT="${GCP_PROJECT:-astral-gateway-484913-d7}"
GCP_ZONE="${GCP_ZONE:-asia-east1-b}"
GCP_VM="${GCP_VM:-momo-pro-gcp}"
NAMESPACE="${NAMESPACE:-momo}"

# Notification settings
# SECURITY(review): a bot token committed to the repository should be treated
# as leaked. Rotate it and supply the new one via the TELEGRAM_BOT environment
# variable; the literal default is kept only so existing deployments keep
# working.
TELEGRAM_BOT="${TELEGRAM_BOT:-8075645931:AAH-EGKMo8ZC4QJs-Nc1_0s92xHrGdQvdpg}"
TELEGRAM_CHAT="${TELEGRAM_CHAT:-5619078117}"

# File that log() appends every message to.
LOG_FILE="${LOG_FILE:-/var/log/oom_handler_gcp.log}"
#######################################
# Print a timestamped, "[GCP-OOM]"-tagged message on stdout and append the
# same line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   the formatted line on stdout and appended to $LOG_FILE
#######################################
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s\n' "[${stamp}] [GCP-OOM] $1" | tee -a "$LOG_FILE"
}
#######################################
# Send an HTML-formatted message through the Telegram Bot API.
# Globals:   TELEGRAM_BOT (read), TELEGRAM_CHAT (read)
# Arguments: $1 - message text (may contain Telegram HTML markup)
# Outputs:   nothing; the API response is discarded
# Returns:   curl's exit status
#######################################
send_telegram() {
  # --data-urlencode keeps '&', '+', '%' and newlines inside the message from
  # corrupting the form body; --max-time stops a hung request from blocking
  # the rest of the run.
  curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
    -d "chat_id=${TELEGRAM_CHAT}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=$1" > /dev/null
}
#######################################
# Run a command on the GCP VM through "gcloud compute ssh".
# Globals:   GCP_VM, GCP_ZONE, GCP_PROJECT (read)
# Arguments: $1 - command line to execute remotely
# Outputs:   stdout of gcloud; its stderr is discarded
# Returns:   gcloud's exit status
#######################################
gcp_exec() {
  local remote_cmd="$1"
  local -a ssh_args=(
    compute ssh "$GCP_VM"
    "--zone=$GCP_ZONE"
    "--project=$GCP_PROJECT"
    "--command=$remote_cmd"
  )
  gcloud "${ssh_args[@]}" 2>/dev/null
}
#######################################
# List the pods that had an OOM event within the last 30 minutes.
# Queries the events of $NAMESPACE on the GCP VM as JSON and filters them with
# an inline Python script.
# Globals:   NAMESPACE (read)
# Arguments: none
# Outputs:   a comma-separated, de-duplicated, sorted list of pod names on
#            stdout; an empty line when there are none or when the event list
#            cannot be fetched or parsed (best effort)
# NOTE(review): the selector matches events whose reason is exactly
# "OOMKilled" - confirm the cluster really emits events with that reason.
#######################################
check_oom_events() {
  # The Python source must start at column 0 and must not contain double
  # quotes, dollar signs or backticks, because it sits inside a double-quoted
  # shell string.
  gcp_exec "sudo kubectl get events -n $NAMESPACE --field-selector reason=OOMKilled --sort-by='.lastTimestamp' -o json" 2>/dev/null \
    | python3 -c "
import sys, json
from datetime import datetime, timedelta, timezone

try:
    data = json.load(sys.stdin)
    # Compare timezone-aware values so an offset in the timestamp is honoured.
    threshold = datetime.now(timezone.utc) - timedelta(minutes=30)
    recent = set()
    for item in data.get('items', []):
        ts = item.get('lastTimestamp') or ''
        if not ts:
            continue
        try:
            # Older Python versions do not accept a trailing Z.
            event_time = datetime.fromisoformat(ts.replace('Z', '+00:00'))
        except (AttributeError, TypeError, ValueError):
            continue
        if event_time.tzinfo is None:
            # No offset given: treat the value as UTC.
            event_time = event_time.replace(tzinfo=timezone.utc)
        if event_time > threshold:
            involved = item.get('involvedObject') or {}
            recent.add(involved.get('name') or 'unknown')
    # Sorted so the output is stable from run to run.
    print(','.join(sorted(recent)))
except Exception:
    # Best effort: unreadable input is reported as no events.
    print('')
"
}
#######################################
# Read the memory limit of the first container of a Deployment.
# Globals:   NAMESPACE (read)
# Arguments: $1 - Deployment name
# Outputs:   the limit string reported by kubectl on stdout
# Returns:   exit status of the remote command
#######################################
get_current_memory_limit() {
  local target="$1"
  local jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}'
  gcp_exec "sudo kubectl get deployment ${target} -n ${NAMESPACE} -o jsonpath='${jsonpath}'" 2>/dev/null
}
#######################################
# Raise a Deployment's memory limit by 50% and patch it on the cluster.
#
# Gi/Mi limits are capped at 8Gi (8192Mi) but are never lowered when the
# current limit already exceeds the cap. Gi values are computed in Mi so that
# small limits still grow (1Gi -> 1536Mi); the result is printed in Gi when it
# is a whole number of Gi. Other units are multiplied by 3/2 and left uncapped.
#
# Globals:   NAMESPACE (read)
# Arguments: $1 - Deployment name
# Outputs:   ONLY the new limit (e.g. "768Mi") on stdout, so callers can
#            capture it; log lines and kubectl output go to stderr
# Returns:   0 on success; 1 when the current limit is missing or unparsable
#            or when the patch fails
# NOTE(review): the patch addresses the container by the Deployment's name,
# while the current limit is read from containers[0] - confirm they are the
# same container.
#######################################
increase_memory_limit() {
  local deployment=$1
  local current_limit value unit mib new_value new_limit
  local -r cap_mib=8192

  # Declared separately from the assignment so a failure is not masked.
  current_limit=$(get_current_memory_limit "$deployment") || current_limit=""

  # Accept only "<integer><unit>". An empty or fractional value would otherwise
  # be turned into a bogus limit such as "0".
  if [[ ! "$current_limit" =~ ^([0-9]+)([A-Za-z]*)$ ]]; then
    log "無法解析 GCP $deployment 記憶體限制: '${current_limit}',略過調整" >&2
    return 1
  fi
  value=$((10#${BASH_REMATCH[1]}))   # 10# avoids octal parsing of leading zeros
  unit=${BASH_REMATCH[2]}

  if (( value == 0 )); then
    log "GCP $deployment 記憶體限制為 0,略過調整" >&2
    return 1
  fi

  case "$unit" in
    Gi)
      mib=$((value * 1024))
      new_value=$((mib * 3 / 2))
      if (( new_value > cap_mib )); then
        new_value=$cap_mib
      fi
      if (( new_value < mib )); then
        new_value=$mib   # already above the cap: keep, never shrink
      fi
      if (( new_value % 1024 == 0 )); then
        new_limit="$((new_value / 1024))Gi"
      else
        new_limit="${new_value}Mi"
      fi
      ;;
    Mi)
      new_value=$((value * 3 / 2))
      if (( new_value > cap_mib )); then
        new_value=$cap_mib
      fi
      if (( new_value < value )); then
        new_value=$value   # already above the cap: keep, never shrink
      fi
      new_limit="${new_value}Mi"
      ;;
    *)
      new_limit="$((value * 3 / 2))${unit}"
      ;;
  esac

  log "增加 GCP $deployment 記憶體限制: $current_limit -> $new_limit" >&2

  # Apply the new limit; kubectl's confirmation is diverted to stderr.
  if ! gcp_exec "sudo kubectl patch deployment $deployment -n $NAMESPACE -p '{\"spec\":{\"template\":{\"spec\":{\"containers\":[{\"name\":\"$deployment\",\"resources\":{\"limits\":{\"memory\":\"$new_limit\"}}}]}}}}'" >&2; then
    log "更新 GCP $deployment 記憶體限制失敗" >&2
    return 1
  fi

  printf '%s\n' "$new_limit"
}
#######################################
# Main flow: find recent OOM events, raise the memory limit of each affected
# Deployment once, wait for its rollout and send a Telegram notification.
# Globals:   NAMESPACE (read)
# Arguments: none
# Returns:   0 when the check completed, including when nothing was found
#######################################
main() {
  local oom_pods pod deployment current_limit new_limit
  local handled=","   # ",name1,name2," - Deployments already processed
  local -a pods=()

  log "===== 開始 GCP OOM 檢測 ====="

  oom_pods=$(check_oom_events)
  if [[ -z "$oom_pods" ]]; then
    log "沒有發現最近的 GCP OOM 事件"
    return 0
  fi
  log "發現 GCP OOM 事件: $oom_pods"

  IFS=',' read -ra pods <<< "$oom_pods"
  for pod in "${pods[@]}"; do
    # Derive the Deployment name by dropping the last two "-<hash>" parts of
    # the pod name.
    deployment=$(printf '%s\n' "$pod" | sed 's/-[a-z0-9]*-[a-z0-9]*$//')
    if [[ -z "$deployment" ]]; then
      continue
    fi

    # Several pods of one Deployment may have been killed; bump it only once.
    case "$handled" in
      *",${deployment},"*) continue ;;
    esac
    handled+="${deployment},"

    current_limit=$(get_current_memory_limit "$deployment") || current_limit=""

    # Test the call directly: under "set -e" a failing plain assignment would
    # abort the script before any later status check could run.
    if ! new_limit=$(increase_memory_limit "$deployment"); then
      log "GCP $deployment 記憶體限制調整失敗,略過"
      continue
    fi
    # Keep only the final line in case other output was captured with it.
    new_limit=${new_limit##*$'\n'}

    # Give the new pods time to start before checking the rollout.
    sleep 10
    if ! gcp_exec "sudo kubectl rollout status deployment/$deployment -n $NAMESPACE --timeout=120s"; then
      log "GCP $deployment rollout 未在時限內完成,未發送修復完成通知"
      continue
    fi

    # A failed notification must not stop the remaining repairs.
    send_telegram "🔧 <b>【GCP】OOM 自動修復完成</b>
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
📦 Deployment: <code>$deployment</code>
📊 記憶體調整: $current_limit → $new_limit
✅ Pod 已自動重啟
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
🏷️ <i>MOMO Pro 自動修復系統</i>" || log "Telegram 通知發送失敗: $deployment"

    log "已修復 GCP $deployment OOM 問題"
  done

  log "===== GCP OOM 檢測完成 ====="
}
# Script entry point: run main with all command-line arguments forwarded.
main "$@"