---
# PostgreSQL alerting rules.
# Monitors database connections, availability, query performance, database
# size, locks, replication and table bloat.
#
# NOTE(review): every rule copies the `kubernetes_namespace` series label into
# an `environment` alert label. This assumes the scrape/relabel config attaches
# that label to the exporter's series; if it does not, `environment` renders
# empty - confirm against the scrape configuration.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerting-rules
  namespace: monitoring
  labels:
    release: prometheus
    app: kube-prometheus-stack
spec:
  groups:
    - name: postgresql.alerts
      rules:
        # Connection count is high.
        # NOTE(review): the comparison is evaluated per series. If the exporter
        # emits one pg_stat_activity_count series per database/state, this does
        # not compare the server-wide total against 80 - confirm whether an
        # aggregation (sum) is intended.
        - alert: PostgresConnectionsHigh
          expr: pg_stat_activity_count > 80
          for: 5m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 連線數過高"
            description: "PostgreSQL 連線數達到 {{ $value }} (閾值 80)"
            runbook: "檢查是否有連線洩漏或需要調整 max_connections"

        # Connection count is close to the limit (same per-series caveat as
        # PostgresConnectionsHigh).
        - alert: PostgresConnectionsCritical
          expr: pg_stat_activity_count > 95
          for: 2m
          labels:
            severity: critical
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 連線數接近上限"
            description: "PostgreSQL 連線數達到 {{ $value }} (閾值 95),即將耗盡"
            runbook: "立即檢查並結束閒置連線"

        # Database is down.
        # Fires only while a pg_up series exists with value 0; if the series
        # disappears entirely the expression returns nothing and no alert fires.
        - alert: PostgresDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 資料庫停機"
            description: "無法連接到 PostgreSQL 資料庫"
            runbook: "立即檢查 momo-postgres Pod 狀態"

        # Long-running (slow) queries.
        # The metric is compared directly: the description reports $value as a
        # duration in seconds, and rate() is only meaningful on counters, so
        # wrapping this value in rate() would not yield a duration.
        - alert: PostgresSlowQueries
          expr: pg_stat_activity_max_tx_duration{state="active"} > 5
          for: 10m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 慢查詢過多"
            description: "過去 5 分鐘有持續的慢查詢,最長事務時間: {{ $value }}s"
            runbook: "檢查 pg_stat_activity 找出慢查詢並優化"

        # Database size above the warning threshold.
        # 5368709120 bytes = 5 * 1024^3 (5 GiB); humanize1024 renders the value
        # with binary prefixes so it lines up with the threshold.
        - alert: PostgresDatabaseSizeLarge
          expr: pg_database_size_bytes{datname="momo_analytics"} > 5368709120
          for: 5m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 資料庫大小超過 5GB"
            description: "資料庫 momo_analytics 大小: {{ humanize1024 $value }}B"
            runbook: "考慮清理舊資料或擴充磁碟空間"

        # Database size above the critical threshold.
        # 8589934592 bytes = 8 * 1024^3 (8 GiB).
        - alert: PostgresDatabaseSizeCritical
          expr: pg_database_size_bytes{datname="momo_analytics"} > 8589934592
          for: 5m
          labels:
            severity: critical
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 資料庫大小超過 8GB"
            description: "資料庫 momo_analytics 大小: {{ humanize1024 $value }}B,空間即將耗盡"
            runbook: "立即清理資料或擴充磁碟空間"

        # Too many exclusive locks.
        # The mode matcher is case-insensitive so it accepts both
        # "ExclusiveLock" and "exclusivelock".
        # NOTE(review): postgres_exporter is understood to emit lower-cased
        # mode values, in which case an exact "ExclusiveLock" match would never
        # fire - confirm against the exporter's actual label values.
        - alert: PostgresLockWaiting
          expr: pg_locks_count{mode=~"(?i)exclusivelock"} > 10
          for: 5m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 鎖定等待過多"
            description: "有 {{ $value }} 個排他鎖定,可能有阻塞"
            runbook: "檢查 pg_locks 找出阻塞的查詢"

        # Replication lag (only relevant when a replica exists).
        - alert: PostgresReplicationLag
          expr: pg_replication_lag > 60
          for: 5m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 複製延遲"
            description: "複製延遲: {{ $value }}s"
            runbook: "檢查網路和 Replica 狀態"

        # Deadlocks detected: the deadlock counter increased within the last
        # 5 minutes.
        - alert: PostgresDeadlocks
          expr: rate(pg_stat_database_deadlocks{datname="momo_analytics"}[5m]) > 0
          for: 1m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 發生死鎖"
            description: "資料庫 momo_analytics 發生死鎖"
            runbook: "檢查應用程式的交易邏輯"

        # Table bloat (too many dead tuples).
        - alert: PostgresTableBloat
          expr: pg_stat_user_tables_n_dead_tup > 100000
          for: 30m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 表膨脹"
            description: "表 {{ $labels.relname }} 有 {{ $value }} 個死亡元組"
            runbook: "執行 VACUUM ANALYZE {{ $labels.relname }}"