Files
ewoooc/scripts/auto-repair/postgres-repair-gcp.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

187 lines
5.0 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# GCP PostgreSQL 自動修復腳本
# 功能:檢測 GCP 資料庫問題,自動 VACUUM、重建索引、恢復連線
# 執行位置UAT 主機,透過 gcloud SSH 連接 GCP
set -e
# GCP 配置
GCP_PROJECT="astral-gateway-484913-d7"
GCP_ZONE="asia-east1-b"
GCP_VM="momo-pro-gcp"
NAMESPACE="momo"
DB_NAME="momo_analytics"
DB_USER="momo"
# 通知配置
TELEGRAM_BOT="<TELEGRAM_BOT_TOKEN>"
TELEGRAM_CHAT="5619078117"
LOG_FILE="/var/log/postgres_repair_gcp.log"
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] [GCP-PG] $1" | tee -a "$LOG_FILE"
}
send_telegram() {
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
-d "chat_id=${TELEGRAM_CHAT}" \
-d "parse_mode=HTML" \
-d "text=$1" > /dev/null
}
# 在 GCP 上執行命令
gcp_exec() {
gcloud compute ssh "$GCP_VM" \
--zone="$GCP_ZONE" \
--project="$GCP_PROJECT" \
--command="$1" 2>/dev/null
}
# 在 GCP Pod 內執行 SQL
exec_sql() {
gcp_exec "sudo kubectl exec -n $NAMESPACE momo-postgres-0 -- psql -U $DB_USER -d $DB_NAME -t -c \"$1\"" 2>/dev/null
}
# 檢查資料庫連線
check_connection() {
if exec_sql "SELECT 1" | grep -q "1"; then
return 0
else
return 1
fi
}
# 檢查死鎖
check_deadlocks() {
local deadlocks=$(exec_sql "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock'")
echo "$deadlocks" | tr -d ' '
}
# 檢查 Dead Tuples (需要 VACUUM)
check_dead_tuples() {
local dead_tuples=$(exec_sql "SELECT sum(n_dead_tup) FROM pg_stat_user_tables")
echo "$dead_tuples" | tr -d ' '
}
# 檢查膨脹比例
check_bloat() {
local bloat=$(exec_sql "
SELECT round(100 * sum(n_dead_tup)::numeric / NULLIF(sum(n_live_tup + n_dead_tup), 0), 2)
FROM pg_stat_user_tables
")
echo "$bloat" | tr -d ' '
}
# 執行 VACUUM ANALYZE
do_vacuum() {
log "執行 GCP VACUUM ANALYZE..."
gcp_exec "sudo kubectl exec -n $NAMESPACE momo-postgres-0 -- psql -U $DB_USER -d $DB_NAME -c 'VACUUM ANALYZE'" 2>/dev/null
log "GCP VACUUM ANALYZE 完成"
}
# 終止長時間查詢
kill_long_queries() {
local threshold_minutes=30
log "終止 GCP 超過 ${threshold_minutes} 分鐘的查詢..."
local killed=$(exec_sql "
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'active'
AND query_start < now() - interval '${threshold_minutes} minutes'
AND pid <> pg_backend_pid()
" | grep -c "t" || echo "0")
log "已終止 $killed 個長時間查詢"
echo "$killed"
}
# 重啟 PostgreSQL Pod
restart_postgres() {
log "重啟 GCP PostgreSQL Pod..."
gcp_exec "sudo kubectl rollout restart statefulset/momo-postgres -n $NAMESPACE"
gcp_exec "sudo kubectl rollout status statefulset/momo-postgres -n $NAMESPACE --timeout=180s"
log "GCP PostgreSQL 重啟完成"
}
# 主修復邏輯
main() {
log "===== 開始 GCP PostgreSQL 健康檢查 ====="
local issues_found=0
local actions_taken=""
# 1. 檢查連線
if ! check_connection; then
log "錯誤:無法連線到 GCP 資料庫"
# 嘗試重啟
restart_postgres
sleep 30
if check_connection; then
actions_taken="${actions_taken}• 重啟 PostgreSQL Pod\\n"
issues_found=1
else
send_telegram "🔴 <b>【GCP】PostgreSQL 連線失敗</b>
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
❌ 自動重啟後仍無法連線
📋 需要人工檢查
🔗 SSH: <code>gcloud compute ssh $GCP_VM --zone=$GCP_ZONE</code>
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')"
exit 1
fi
fi
# 2. 檢查死鎖
deadlocks=$(check_deadlocks)
if [ "${deadlocks:-0}" -gt 0 ]; then
log "發現 $deadlocks 個死鎖"
killed=$(kill_long_queries)
actions_taken="${actions_taken}• 終止 $killed 個死鎖查詢\\n"
issues_found=1
fi
# 3. 檢查 Dead Tuples
dead_tuples=$(check_dead_tuples)
if [ "${dead_tuples:-0}" -gt 100000 ]; then
log "Dead tuples 過多: $dead_tuples"
do_vacuum
actions_taken="${actions_taken}• 執行 VACUUM ANALYZE\\n"
issues_found=1
fi
# 4. 檢查膨脹比例
bloat=$(check_bloat)
if [ "$(echo "$bloat > 30" | bc -l 2>/dev/null || echo 0)" -eq 1 ]; then
log "表膨脹過高: ${bloat}%"
do_vacuum
actions_taken="${actions_taken}• 表膨脹 ${bloat}%,已執行 VACUUM\\n"
issues_found=1
fi
# 發送通知 (如有問題)
if [ $issues_found -gt 0 ]; then
send_telegram "🔧 <b>【GCP】PostgreSQL 自動維護完成</b>
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
📋 <b>執行動作:</b>
${actions_taken}
📊 目前狀態:
• Dead Tuples: ${dead_tuples:-0}
• 死鎖數: ${deadlocks:-0}
• 膨脹比例: ${bloat:-0}%
✅ 資料庫已恢復正常
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
🏷️ <i>MOMO Pro 自動修復系統</i>"
fi
log "===== GCP PostgreSQL 健康檢查完成 ====="
}
main "$@"