# =============================================================================
# AWOOOI Database Alerts
# =============================================================================
# Owner:   DevOps Commander
# Version: v1.0
# Date:    2026-03-29
# ADR:     ADR-037 Phase B (Database Exporters)
#
# Alert targets: PostgreSQL (192.168.0.188:5432) + Redis (192.168.0.188:6380)
#
# Every rule carries severity / service / team labels. Annotation strings are
# user-facing alert text and are kept in their original language.
# =============================================================================
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
    # =========================================================================
    # PostgreSQL alerts
    # =========================================================================
    - name: postgresql
      rules:
        # -----------------------------------------------------------------
        # Active connections approaching the limit (> 80 per database).
        # NOTE(review): the threshold is an absolute count of connections in
        # state="active", not a percentage. It only corresponds to "80%" if
        # max_connections is 100 -- TODO confirm against the server config.
        # -----------------------------------------------------------------
        - alert: PostgreSQLConnectionPoolNearLimit
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 80
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 活躍連接數過高"
            description: "Database {{ $labels.datname }} 活躍連接: {{ $value }}"
            runbook_url: "https://awoooi.internal/runbooks/postgres-connections"

        # -----------------------------------------------------------------
        # Active connections nearly exhausted (> 95 per database).
        # NOTE(review): absolute count, same caveat as the warning rule above.
        # -----------------------------------------------------------------
        - alert: PostgreSQLConnectionPoolExhausted
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 95
          for: 2m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 連接池即將耗盡"
            description: "Database {{ $labels.datname }} 活躍連接 > 95"

        # -----------------------------------------------------------------
        # Too many slow queries (more than 5 for 5 minutes).
        # -----------------------------------------------------------------
        - alert: PostgreSQLSlowQueries
          expr: pg_slow_queries > 5
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 慢查詢數量過多"
            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢 (> 1s)"

        # -----------------------------------------------------------------
        # Too many queries waiting on locks (total across all series > 10).
        # -----------------------------------------------------------------
        - alert: PostgreSQLLockWaiting
          expr: sum(pg_locks_waiting) > 10
          for: 2m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 鎖等待過多"
            description: "{{ $value }} 個查詢正在等待鎖"

        # -----------------------------------------------------------------
        # Table bloat (dead tuple ratio > 20, rendered as a percentage).
        # -----------------------------------------------------------------
        - alert: PostgreSQLTableBloat
          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
          for: 30m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 表膨脹嚴重"
            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"

        # -----------------------------------------------------------------
        # Database unreachable (exporter reports pg_up == 0).
        # -----------------------------------------------------------------
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 無法連線"
            description: "PostgreSQL Exporter 無法連接到資料庫"

        # -----------------------------------------------------------------
        # Long-running query (> 60s).
        # -----------------------------------------------------------------
        - alert: PostgreSQLLongRunningQuery
          expr: pg_longest_query_seconds > 60
          for: 1m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 長時間執行查詢"
            description: "User {{ $labels.usename }} 查詢已執行 {{ $value | humanizeDuration }}"

        # -----------------------------------------------------------------
        # High rollback rate (> 5, rendered as a percentage).
        # -----------------------------------------------------------------
        - alert: PostgreSQLHighRollbackRate
          expr: pg_stat_database_transactions_rollback_ratio > 5
          for: 15m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 事務回滾率過高"
            description: "Database {{ $labels.datname }} 回滾率: {{ $value }}%"

    # =========================================================================
    # Redis alerts
    # =========================================================================
    - name: redis
      rules:
        # -----------------------------------------------------------------
        # Memory usage high (> 85% of maxmemory).
        # The "> 0" filter on the divisor skips instances with no maxmemory
        # limit (value 0), which would otherwise divide to +Inf. Filtering
        # the divisor, rather than and-ing the guard in front, keeps the
        # used/max ratio as the alert value, which is what the description
        # formats with humanizePercentage.
        # -----------------------------------------------------------------
        - alert: RedisMemoryHigh
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體使用 > 85%"
            description: "Redis 記憶體使用: {{ $value | humanizePercentage }}"

        # -----------------------------------------------------------------
        # Memory nearly exhausted (> 95% of maxmemory).
        # Uses the same "> 0" divisor filter as RedisMemoryHigh so that an
        # instance without a maxmemory limit cannot trigger a critical alert
        # through division by zero.
        # -----------------------------------------------------------------
        - alert: RedisMemoryCritical
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
          for: 2m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體即將耗盡"
            description: "Redis 記憶體使用 > 95%"

        # -----------------------------------------------------------------
        # Cache hit rate low (< 80% over a 5m rate window).
        # With no keyspace traffic the ratio is 0/0 = NaN, which fails the
        # comparison, so idle instances do not alert.
        # -----------------------------------------------------------------
        - alert: RedisCacheHitRateLow
          expr: |
            rate(redis_keyspace_hits_total[5m])
              /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
              < 0.8
          for: 15m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 快取命中率過低"
            description: "命中率: {{ $value | humanizePercentage }}"

        # -----------------------------------------------------------------
        # Too many client connections (> 500).
        # -----------------------------------------------------------------
        - alert: RedisConnectionsHigh
          expr: redis_connected_clients > 500
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 連接數過高"
            description: "連接數: {{ $value }}"

        # -----------------------------------------------------------------
        # Frequent key evictions (> 100 keys/s over a 5m rate window).
        # -----------------------------------------------------------------
        - alert: RedisEvictedKeys
          expr: rate(redis_evicted_keys_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis Key 驅逐頻繁"
            description: "每秒驅逐 {{ $value }} 個 key"

        # -----------------------------------------------------------------
        # Redis unreachable (exporter reports redis_up == 0).
        # -----------------------------------------------------------------
        - alert: RedisDown
          expr: redis_up == 0
          for: 1m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 無法連線"
            description: "Redis Exporter 無法連接到 Redis"

        # -----------------------------------------------------------------
        # Command latency high (mean > 10ms over the last 5 minutes).
        # Both counters go through rate() so the quotient reflects recent
        # latency rather than the average since the counters started.
        # "sum without (cmd)" collapses the per-command series of the
        # duration counter so its labels line up with the processed-commands
        # counter; it is a no-op if the series carries no cmd label.
        # NOTE(review): confirm label sets against the deployed exporter.
        # -----------------------------------------------------------------
        - alert: RedisLatencyHigh
          expr: |
            sum without (cmd) (rate(redis_commands_duration_seconds_total[5m]))
              /
            rate(redis_commands_processed_total[5m])
              > 0.01
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 命令延遲過高"
            description: "平均命令延遲 > 10ms"

        # -----------------------------------------------------------------
        # Blocked clients (> 10 clients waiting on blocking commands).
        # -----------------------------------------------------------------
        - alert: RedisBlockedClients
          expr: redis_blocked_clients > 10
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 有阻塞的客戶端"
            description: "{{ $value }} 個客戶端被阻塞 (BLPOP/BRPOP)"