203 lines
5.3 KiB
Bash
Executable File
203 lines
5.3 KiB
Bash
Executable File
#!/bin/bash
|
|
# PostgreSQL 自動修復腳本
|
|
# 功能:檢測資料庫問題,自動 VACUUM、重建索引、恢復連線
|
|
|
|
set -e
|
|
|
|
NAMESPACE="momo"
|
|
DB_HOST="momo-postgres"
|
|
DB_NAME="momo_analytics"
|
|
DB_USER="momo"
|
|
TELEGRAM_BOT="<TELEGRAM_BOT_TOKEN>"
|
|
TELEGRAM_CHAT="5619078117"
|
|
LOG_FILE="/var/log/postgres_repair.log"
|
|
BACKUP_DIR="/home/wooo/backups/postgres"
|
|
|
|
log() {
|
|
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
|
|
}
|
|
|
|
send_telegram() {
|
|
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
|
|
-d "chat_id=${TELEGRAM_CHAT}" \
|
|
-d "parse_mode=HTML" \
|
|
-d "text=$1" > /dev/null
|
|
}
|
|
|
|
# 在 Pod 內執行 SQL
|
|
exec_sql() {
|
|
kubectl exec -n "$NAMESPACE" momo-postgres-0 -- \
|
|
psql -U "$DB_USER" -d "$DB_NAME" -t -c "$1" 2>/dev/null
|
|
}
|
|
|
|
# 檢查資料庫連線
|
|
check_connection() {
|
|
if exec_sql "SELECT 1" | grep -q "1"; then
|
|
return 0
|
|
else
|
|
return 1
|
|
fi
|
|
}
|
|
|
|
# 檢查死鎖
|
|
check_deadlocks() {
|
|
local deadlocks=$(exec_sql "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock'")
|
|
echo "$deadlocks" | tr -d ' '
|
|
}
|
|
|
|
# 檢查 Dead Tuples (需要 VACUUM)
|
|
check_dead_tuples() {
|
|
local dead_tuples=$(exec_sql "SELECT sum(n_dead_tup) FROM pg_stat_user_tables")
|
|
echo "$dead_tuples" | tr -d ' '
|
|
}
|
|
|
|
# 檢查膨脹比例
|
|
check_bloat() {
|
|
local bloat=$(exec_sql "
|
|
SELECT round(100 * sum(n_dead_tup)::numeric / NULLIF(sum(n_live_tup + n_dead_tup), 0), 2)
|
|
FROM pg_stat_user_tables
|
|
")
|
|
echo "$bloat" | tr -d ' '
|
|
}
|
|
|
|
# 執行 VACUUM ANALYZE
|
|
do_vacuum() {
|
|
log "執行 VACUUM ANALYZE..."
|
|
kubectl exec -n "$NAMESPACE" momo-postgres-0 -- \
|
|
psql -U "$DB_USER" -d "$DB_NAME" -c "VACUUM ANALYZE" 2>/dev/null
|
|
log "VACUUM ANALYZE 完成"
|
|
}
|
|
|
|
# 終止長時間查詢
|
|
kill_long_queries() {
|
|
local threshold_minutes=30
|
|
log "終止超過 ${threshold_minutes} 分鐘的查詢..."
|
|
|
|
local killed=$(exec_sql "
|
|
SELECT pg_terminate_backend(pid)
|
|
FROM pg_stat_activity
|
|
WHERE state = 'active'
|
|
AND query_start < now() - interval '${threshold_minutes} minutes'
|
|
AND pid <> pg_backend_pid()
|
|
" | grep -c "t" || echo "0")
|
|
|
|
log "已終止 $killed 個長時間查詢"
|
|
echo "$killed"
|
|
}
|
|
|
|
# 重建索引 (針對特定表)
|
|
reindex_table() {
|
|
local table=$1
|
|
log "重建索引: $table"
|
|
exec_sql "REINDEX TABLE $table"
|
|
}
|
|
|
|
# 備份資料庫
|
|
backup_database() {
|
|
local backup_file="${BACKUP_DIR}/momo_$(date +%Y%m%d_%H%M%S).sql.gz"
|
|
mkdir -p "$BACKUP_DIR"
|
|
|
|
log "備份資料庫到 $backup_file"
|
|
kubectl exec -n "$NAMESPACE" momo-postgres-0 -- \
|
|
pg_dump -U "$DB_USER" "$DB_NAME" | gzip > "$backup_file"
|
|
|
|
# 保留最近 7 個備份
|
|
ls -t "${BACKUP_DIR}"/*.sql.gz 2>/dev/null | tail -n +8 | xargs rm -f 2>/dev/null || true
|
|
|
|
log "備份完成: $backup_file"
|
|
echo "$backup_file"
|
|
}
|
|
|
|
# 重啟 PostgreSQL Pod
|
|
restart_postgres() {
|
|
log "重啟 PostgreSQL Pod..."
|
|
kubectl rollout restart statefulset/momo-postgres -n "$NAMESPACE"
|
|
kubectl rollout status statefulset/momo-postgres -n "$NAMESPACE" --timeout=180s
|
|
log "PostgreSQL 重啟完成"
|
|
}
|
|
|
|
# 主修復邏輯
|
|
main() {
|
|
log "===== 開始 PostgreSQL 健康檢查 ====="
|
|
|
|
local issues_found=0
|
|
local actions_taken=""
|
|
|
|
# 1. 檢查連線
|
|
if ! check_connection; then
|
|
log "錯誤:無法連線到資料庫"
|
|
|
|
# 嘗試重啟
|
|
restart_postgres
|
|
sleep 30
|
|
|
|
if check_connection; then
|
|
actions_taken="${actions_taken}• 重啟 PostgreSQL Pod\n"
|
|
issues_found=1
|
|
else
|
|
send_telegram "🔴 <b>PostgreSQL 連線失敗</b>
|
|
|
|
❌ 自動重啟後仍無法連線
|
|
📋 需要人工檢查
|
|
|
|
🔗 SSH: <code>ssh wooo@192.168.0.110</code>
|
|
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')"
|
|
exit 1
|
|
fi
|
|
fi
|
|
|
|
# 2. 檢查死鎖
|
|
deadlocks=$(check_deadlocks)
|
|
if [ "$deadlocks" -gt 0 ]; then
|
|
log "發現 $deadlocks 個死鎖"
|
|
killed=$(kill_long_queries)
|
|
actions_taken="${actions_taken}• 終止 $killed 個死鎖查詢\n"
|
|
issues_found=1
|
|
fi
|
|
|
|
# 3. 檢查 Dead Tuples
|
|
dead_tuples=$(check_dead_tuples)
|
|
if [ "${dead_tuples:-0}" -gt 100000 ]; then
|
|
log "Dead tuples 過多: $dead_tuples"
|
|
do_vacuum
|
|
actions_taken="${actions_taken}• 執行 VACUUM ANALYZE\n"
|
|
issues_found=1
|
|
fi
|
|
|
|
# 4. 檢查膨脹比例
|
|
bloat=$(check_bloat)
|
|
if [ "$(echo "$bloat > 30" | bc -l 2>/dev/null || echo 0)" -eq 1 ]; then
|
|
log "表膨脹過高: ${bloat}%"
|
|
do_vacuum
|
|
actions_taken="${actions_taken}• 表膨脹 ${bloat}%,已執行 VACUUM\n"
|
|
issues_found=1
|
|
fi
|
|
|
|
# 5. 每週自動備份 (週日)
|
|
if [ "$(date +%u)" -eq 7 ] && [ "$(date +%H)" -eq 2 ]; then
|
|
backup_file=$(backup_database)
|
|
log "週備份完成: $backup_file"
|
|
fi
|
|
|
|
# 發送通知 (如有問題)
|
|
if [ $issues_found -gt 0 ]; then
|
|
send_telegram "🔧 <b>PostgreSQL 自動維護完成</b>
|
|
|
|
📋 <b>執行動作:</b>
|
|
${actions_taken}
|
|
📊 目前狀態:
|
|
• Dead Tuples: ${dead_tuples:-0}
|
|
• 死鎖數: ${deadlocks:-0}
|
|
• 膨脹比例: ${bloat:-0}%
|
|
|
|
✅ 資料庫已恢復正常
|
|
|
|
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
|
|
🏷️ <i>MOMO Pro 自動修復系統</i>"
|
|
fi
|
|
|
|
log "===== PostgreSQL 健康檢查完成 ====="
|
|
}
|
|
|
|
main "$@"
|