187 lines
5.0 KiB
Bash
187 lines
5.0 KiB
Bash
#!/bin/bash
|
||
# GCP PostgreSQL 自動修復腳本
|
||
# 功能:檢測 GCP 資料庫問題,自動 VACUUM、重建索引、恢復連線
|
||
# 執行位置:UAT 主機,透過 gcloud SSH 連接 GCP
|
||
|
||
set -e
|
||
|
||
# GCP 配置
|
||
GCP_PROJECT="astral-gateway-484913-d7"
|
||
GCP_ZONE="asia-east1-b"
|
||
GCP_VM="momo-pro-gcp"
|
||
NAMESPACE="momo"
|
||
DB_NAME="momo_analytics"
|
||
DB_USER="momo"
|
||
|
||
# 通知配置
|
||
TELEGRAM_BOT="<TELEGRAM_BOT_TOKEN>"
|
||
TELEGRAM_CHAT="5619078117"
|
||
LOG_FILE="/var/log/postgres_repair_gcp.log"
|
||
|
||
log() {
|
||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] [GCP-PG] $1" | tee -a "$LOG_FILE"
|
||
}
|
||
|
||
send_telegram() {
|
||
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
|
||
-d "chat_id=${TELEGRAM_CHAT}" \
|
||
-d "parse_mode=HTML" \
|
||
-d "text=$1" > /dev/null
|
||
}
|
||
|
||
# 在 GCP 上執行命令
|
||
gcp_exec() {
|
||
gcloud compute ssh "$GCP_VM" \
|
||
--zone="$GCP_ZONE" \
|
||
--project="$GCP_PROJECT" \
|
||
--command="$1" 2>/dev/null
|
||
}
|
||
|
||
# 在 GCP Pod 內執行 SQL
|
||
exec_sql() {
|
||
gcp_exec "sudo kubectl exec -n $NAMESPACE momo-postgres-0 -- psql -U $DB_USER -d $DB_NAME -t -c \"$1\"" 2>/dev/null
|
||
}
|
||
|
||
# 檢查資料庫連線
|
||
check_connection() {
|
||
if exec_sql "SELECT 1" | grep -q "1"; then
|
||
return 0
|
||
else
|
||
return 1
|
||
fi
|
||
}
|
||
|
||
# 檢查死鎖
|
||
check_deadlocks() {
|
||
local deadlocks=$(exec_sql "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock'")
|
||
echo "$deadlocks" | tr -d ' '
|
||
}
|
||
|
||
# 檢查 Dead Tuples (需要 VACUUM)
|
||
check_dead_tuples() {
|
||
local dead_tuples=$(exec_sql "SELECT sum(n_dead_tup) FROM pg_stat_user_tables")
|
||
echo "$dead_tuples" | tr -d ' '
|
||
}
|
||
|
||
# 檢查膨脹比例
|
||
check_bloat() {
|
||
local bloat=$(exec_sql "
|
||
SELECT round(100 * sum(n_dead_tup)::numeric / NULLIF(sum(n_live_tup + n_dead_tup), 0), 2)
|
||
FROM pg_stat_user_tables
|
||
")
|
||
echo "$bloat" | tr -d ' '
|
||
}
|
||
|
||
# 執行 VACUUM ANALYZE
|
||
do_vacuum() {
|
||
log "執行 GCP VACUUM ANALYZE..."
|
||
gcp_exec "sudo kubectl exec -n $NAMESPACE momo-postgres-0 -- psql -U $DB_USER -d $DB_NAME -c 'VACUUM ANALYZE'" 2>/dev/null
|
||
log "GCP VACUUM ANALYZE 完成"
|
||
}
|
||
|
||
# 終止長時間查詢
|
||
kill_long_queries() {
|
||
local threshold_minutes=30
|
||
log "終止 GCP 超過 ${threshold_minutes} 分鐘的查詢..."
|
||
|
||
local killed=$(exec_sql "
|
||
SELECT pg_terminate_backend(pid)
|
||
FROM pg_stat_activity
|
||
WHERE state = 'active'
|
||
AND query_start < now() - interval '${threshold_minutes} minutes'
|
||
AND pid <> pg_backend_pid()
|
||
" | grep -c "t" || echo "0")
|
||
|
||
log "已終止 $killed 個長時間查詢"
|
||
echo "$killed"
|
||
}
|
||
|
||
# 重啟 PostgreSQL Pod
|
||
restart_postgres() {
|
||
log "重啟 GCP PostgreSQL Pod..."
|
||
gcp_exec "sudo kubectl rollout restart statefulset/momo-postgres -n $NAMESPACE"
|
||
gcp_exec "sudo kubectl rollout status statefulset/momo-postgres -n $NAMESPACE --timeout=180s"
|
||
log "GCP PostgreSQL 重啟完成"
|
||
}
|
||
|
||
# 主修復邏輯
|
||
main() {
|
||
log "===== 開始 GCP PostgreSQL 健康檢查 ====="
|
||
|
||
local issues_found=0
|
||
local actions_taken=""
|
||
|
||
# 1. 檢查連線
|
||
if ! check_connection; then
|
||
log "錯誤:無法連線到 GCP 資料庫"
|
||
|
||
# 嘗試重啟
|
||
restart_postgres
|
||
sleep 30
|
||
|
||
if check_connection; then
|
||
actions_taken="${actions_taken}• 重啟 PostgreSQL Pod\\n"
|
||
issues_found=1
|
||
else
|
||
send_telegram "🔴 <b>【GCP】PostgreSQL 連線失敗</b>
|
||
|
||
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
|
||
❌ 自動重啟後仍無法連線
|
||
📋 需要人工檢查
|
||
|
||
🔗 SSH: <code>gcloud compute ssh $GCP_VM --zone=$GCP_ZONE</code>
|
||
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')"
|
||
exit 1
|
||
fi
|
||
fi
|
||
|
||
# 2. 檢查死鎖
|
||
deadlocks=$(check_deadlocks)
|
||
if [ "${deadlocks:-0}" -gt 0 ]; then
|
||
log "發現 $deadlocks 個死鎖"
|
||
killed=$(kill_long_queries)
|
||
actions_taken="${actions_taken}• 終止 $killed 個死鎖查詢\\n"
|
||
issues_found=1
|
||
fi
|
||
|
||
# 3. 檢查 Dead Tuples
|
||
dead_tuples=$(check_dead_tuples)
|
||
if [ "${dead_tuples:-0}" -gt 100000 ]; then
|
||
log "Dead tuples 過多: $dead_tuples"
|
||
do_vacuum
|
||
actions_taken="${actions_taken}• 執行 VACUUM ANALYZE\\n"
|
||
issues_found=1
|
||
fi
|
||
|
||
# 4. 檢查膨脹比例
|
||
bloat=$(check_bloat)
|
||
if [ "$(echo "$bloat > 30" | bc -l 2>/dev/null || echo 0)" -eq 1 ]; then
|
||
log "表膨脹過高: ${bloat}%"
|
||
do_vacuum
|
||
actions_taken="${actions_taken}• 表膨脹 ${bloat}%,已執行 VACUUM\\n"
|
||
issues_found=1
|
||
fi
|
||
|
||
# 發送通知 (如有問題)
|
||
if [ $issues_found -gt 0 ]; then
|
||
send_telegram "🔧 <b>【GCP】PostgreSQL 自動維護完成</b>
|
||
|
||
🏢 環境: 🟥 <code>PROD</code> (momo.wooo.work)
|
||
📋 <b>執行動作:</b>
|
||
${actions_taken}
|
||
📊 目前狀態:
|
||
• Dead Tuples: ${dead_tuples:-0}
|
||
• 死鎖數: ${deadlocks:-0}
|
||
• 膨脹比例: ${bloat:-0}%
|
||
|
||
✅ 資料庫已恢復正常
|
||
|
||
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
|
||
🏷️ <i>MOMO Pro 自動修復系統</i>"
|
||
fi
|
||
|
||
log "===== GCP PostgreSQL 健康檢查完成 ====="
|
||
}
|
||
|
||
main "$@"
|