Files
ewoooc/scripts/auto-repair/postgres-repair.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

203 lines
5.3 KiB
Bash
Executable File

#!/bin/bash
# PostgreSQL 自動修復腳本
# 功能:檢測資料庫問題,自動 VACUUM、重建索引、恢復連線
set -e
NAMESPACE="momo"
DB_HOST="momo-postgres"
DB_NAME="momo_analytics"
DB_USER="momo"
TELEGRAM_BOT="<TELEGRAM_BOT_TOKEN>"
TELEGRAM_CHAT="5619078117"
LOG_FILE="/var/log/postgres_repair.log"
BACKUP_DIR="/home/wooo/backups/postgres"
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
send_telegram() {
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT}/sendMessage" \
-d "chat_id=${TELEGRAM_CHAT}" \
-d "parse_mode=HTML" \
-d "text=$1" > /dev/null
}
# 在 Pod 內執行 SQL
exec_sql() {
kubectl exec -n "$NAMESPACE" momo-postgres-0 -- \
psql -U "$DB_USER" -d "$DB_NAME" -t -c "$1" 2>/dev/null
}
# 檢查資料庫連線
check_connection() {
if exec_sql "SELECT 1" | grep -q "1"; then
return 0
else
return 1
fi
}
# 檢查死鎖
check_deadlocks() {
local deadlocks=$(exec_sql "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock'")
echo "$deadlocks" | tr -d ' '
}
# 檢查 Dead Tuples (需要 VACUUM)
check_dead_tuples() {
local dead_tuples=$(exec_sql "SELECT sum(n_dead_tup) FROM pg_stat_user_tables")
echo "$dead_tuples" | tr -d ' '
}
# 檢查膨脹比例
check_bloat() {
local bloat=$(exec_sql "
SELECT round(100 * sum(n_dead_tup)::numeric / NULLIF(sum(n_live_tup + n_dead_tup), 0), 2)
FROM pg_stat_user_tables
")
echo "$bloat" | tr -d ' '
}
# 執行 VACUUM ANALYZE
do_vacuum() {
log "執行 VACUUM ANALYZE..."
kubectl exec -n "$NAMESPACE" momo-postgres-0 -- \
psql -U "$DB_USER" -d "$DB_NAME" -c "VACUUM ANALYZE" 2>/dev/null
log "VACUUM ANALYZE 完成"
}
# 終止長時間查詢
kill_long_queries() {
local threshold_minutes=30
log "終止超過 ${threshold_minutes} 分鐘的查詢..."
local killed=$(exec_sql "
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'active'
AND query_start < now() - interval '${threshold_minutes} minutes'
AND pid <> pg_backend_pid()
" | grep -c "t" || echo "0")
log "已終止 $killed 個長時間查詢"
echo "$killed"
}
# 重建索引 (針對特定表)
reindex_table() {
local table=$1
log "重建索引: $table"
exec_sql "REINDEX TABLE $table"
}
# 備份資料庫
backup_database() {
local backup_file="${BACKUP_DIR}/momo_$(date +%Y%m%d_%H%M%S).sql.gz"
mkdir -p "$BACKUP_DIR"
log "備份資料庫到 $backup_file"
kubectl exec -n "$NAMESPACE" momo-postgres-0 -- \
pg_dump -U "$DB_USER" "$DB_NAME" | gzip > "$backup_file"
# 保留最近 7 個備份
ls -t "${BACKUP_DIR}"/*.sql.gz 2>/dev/null | tail -n +8 | xargs rm -f 2>/dev/null || true
log "備份完成: $backup_file"
echo "$backup_file"
}
# 重啟 PostgreSQL Pod
restart_postgres() {
log "重啟 PostgreSQL Pod..."
kubectl rollout restart statefulset/momo-postgres -n "$NAMESPACE"
kubectl rollout status statefulset/momo-postgres -n "$NAMESPACE" --timeout=180s
log "PostgreSQL 重啟完成"
}
# 主修復邏輯
main() {
log "===== 開始 PostgreSQL 健康檢查 ====="
local issues_found=0
local actions_taken=""
# 1. 檢查連線
if ! check_connection; then
log "錯誤:無法連線到資料庫"
# 嘗試重啟
restart_postgres
sleep 30
if check_connection; then
actions_taken="${actions_taken}• 重啟 PostgreSQL Pod\n"
issues_found=1
else
send_telegram "🔴 <b>PostgreSQL 連線失敗</b>
❌ 自動重啟後仍無法連線
📋 需要人工檢查
🔗 SSH: <code>ssh wooo@192.168.0.110</code>
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')"
exit 1
fi
fi
# 2. 檢查死鎖
deadlocks=$(check_deadlocks)
if [ "$deadlocks" -gt 0 ]; then
log "發現 $deadlocks 個死鎖"
killed=$(kill_long_queries)
actions_taken="${actions_taken}• 終止 $killed 個死鎖查詢\n"
issues_found=1
fi
# 3. 檢查 Dead Tuples
dead_tuples=$(check_dead_tuples)
if [ "${dead_tuples:-0}" -gt 100000 ]; then
log "Dead tuples 過多: $dead_tuples"
do_vacuum
actions_taken="${actions_taken}• 執行 VACUUM ANALYZE\n"
issues_found=1
fi
# 4. 檢查膨脹比例
bloat=$(check_bloat)
if [ "$(echo "$bloat > 30" | bc -l 2>/dev/null || echo 0)" -eq 1 ]; then
log "表膨脹過高: ${bloat}%"
do_vacuum
actions_taken="${actions_taken}• 表膨脹 ${bloat}%,已執行 VACUUM\n"
issues_found=1
fi
# 5. 每週自動備份 (週日)
if [ "$(date +%u)" -eq 7 ] && [ "$(date +%H)" -eq 2 ]; then
backup_file=$(backup_database)
log "週備份完成: $backup_file"
fi
# 發送通知 (如有問題)
if [ $issues_found -gt 0 ]; then
send_telegram "🔧 <b>PostgreSQL 自動維護完成</b>
📋 <b>執行動作:</b>
${actions_taken}
📊 目前狀態:
• Dead Tuples: ${dead_tuples:-0}
• 死鎖數: ${deadlocks:-0}
• 膨脹比例: ${bloat:-0}%
✅ 資料庫已恢復正常
⏰ 時間: $(date '+%Y-%m-%d %H:%M:%S')
🏷️ <i>MOMO Pro 自動修復系統</i>"
fi
log "===== PostgreSQL 健康檢查完成 ====="
}
main "$@"