# Change note: added a `redis_memory_max_bytes > 0` precondition so that a Redis
# instance without maxmemory configured no longer divides by zero, produces
# +Inf and fires permanently. Affects: alerts-unified.yml + database-alerts.yaml
# Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
# (Viewer metadata: 265 lines, 9.9 KiB, YAML)
# =============================================================================
# AWOOOI Database Alerts
# =============================================================================
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 Phase B (Database Exporters)
#
# Alert targets: PostgreSQL (192.168.0.188:5432) + Redis (192.168.0.188:6380)
# =============================================================================

---
# PrometheusRule object carrying the database alert groups defined below.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  # NOTE(review): these labels are presumably what the Prometheus Operator's
  # rule selector matches on — confirm against the Prometheus resource.
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
# ========================= PostgreSQL alert group =========================
    # Rule group holding every PostgreSQL alert in this file.
    - name: postgresql
      rules:
# ---- PostgreSQLConnectionPoolNearLimit ----
        # Warning: a database has held more than 80 active connections for 5m.
        # NOTE(review): the threshold is an absolute connection count, so it
        # only equals "80% of the pool" if the connection limit is 100 —
        # confirm against the server's max_connections.
        - alert: PostgreSQLConnectionPoolNearLimit
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 80
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 活躍連接數過高"
            description: "Database {{ $labels.datname }} 活躍連接: {{ $value }}"
            runbook_url: "https://awoooi.internal/runbooks/postgres-connections"

# ---- PostgreSQLConnectionPoolExhausted ----
        # Critical: a database has held more than 95 active connections for 2m.
        # NOTE(review): 95 is an absolute connection count; it corresponds to
        # "95% of the pool" only if the connection limit is 100 — confirm.
        - alert: PostgreSQLConnectionPoolExhausted
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 95
          for: 2m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 連接池即將耗盡"
            description: "Database {{ $labels.datname }} 活躍連接 > 95"

# ---- PostgreSQLSlowQueries ----
        # Warning: pg_slow_queries has stayed above 5 for 5m.
        # The description reports the count per user (usename label) and
        # labels these as queries slower than 1s; the metric itself is defined
        # outside this file.
        - alert: PostgreSQLSlowQueries
          expr: pg_slow_queries > 5
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 慢查詢數量過多"
            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢 (> 1s)"

# ---- PostgreSQLLockWaiting ----
        # Warning: the total of pg_locks_waiting across all series has stayed
        # above 10 for 2m. The sum() drops every label, so the alert carries
        # only the static labels set below.
        - alert: PostgreSQLLockWaiting
          expr: sum(pg_locks_waiting) > 10
          for: 2m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 鎖等待過多"
            description: "{{ $value }} 個查詢正在等待鎖"

# ---- PostgreSQLTableBloat ----
        # Warning: a table's dead-tuple ratio has stayed above 20 for 30m.
        # The description renders the value with a trailing "%", i.e. the
        # metric is treated as a percentage on a 0-100 scale.
        - alert: PostgreSQLTableBloat
          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
          for: 30m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 表膨脹嚴重"
            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"

# ---- PostgreSQLDown ----
        # Critical: pg_up has been 0 for 1m, i.e. the exporter reports that it
        # cannot reach the database.
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 無法連線"
            description: "PostgreSQL Exporter 無法連接到資料庫"

# ---- PostgreSQLLongRunningQuery ----
        # Warning: the longest running query has exceeded 60 seconds for 1m.
        # The description formats the value with humanizeDuration and names
        # the owning user via the usename label.
        - alert: PostgreSQLLongRunningQuery
          expr: pg_longest_query_seconds > 60
          for: 1m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 長時間執行查詢"
            description: "User {{ $labels.usename }} 查詢已執行 {{ $value | humanizeDuration }}"

# ---- PostgreSQLHighRollbackRate ----
        # Warning: a database's transaction rollback ratio has stayed above 5
        # for 15m. The description appends "%", so the metric is treated as a
        # percentage on a 0-100 scale.
        - alert: PostgreSQLHighRollbackRate
          expr: pg_stat_database_transactions_rollback_ratio > 5
          for: 15m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 事務回滾率過高"
            description: "Database {{ $labels.datname }} 回滾率: {{ $value }}%"

# =========================== Redis alert group ===========================
    # Rule group holding every Redis alert in this file.
    - name: redis
      rules:
# ---- RedisMemoryHigh ----
        # Warning: used memory has exceeded 85% of the configured maxmemory
        # for 5m.
        #
        # The `> 0` filter is applied to the denominator so that instances
        # with no maxmemory limit (redis_memory_max_bytes == 0) drop out
        # instead of dividing by zero and yielding +Inf.
        # The ratio must remain the value of the expression: PromQL's
        # `A and B` returns the samples of A, so writing the guard as
        # `redis_memory_max_bytes > 0 and <ratio> > 0.85` makes $value the
        # maxmemory byte count and breaks the humanizePercentage annotation.
        - alert: RedisMemoryHigh
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體使用 > 85%"
            description: "Redis 記憶體使用: {{ $value | humanizePercentage }}"

# ---- RedisMemoryCritical ----
        # Critical: used memory has exceeded 95% of the configured maxmemory
        # for 2m.
        #
        # The `> 0` filter on the denominator removes instances that have no
        # maxmemory limit (redis_memory_max_bytes == 0). Without it the
        # division yields +Inf, which is always > 0.95, so the alert would
        # fire permanently on such instances.
        - alert: RedisMemoryCritical
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
          for: 2m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體即將耗盡"
            description: "Redis 記憶體使用 > 95%"

# ---- RedisCacheHitRateLow ----
        # Warning: the keyspace hit ratio, hits / (hits + misses) computed
        # from 5m rates, has stayed below 80% for 15m.
        # With no keyspace traffic the ratio is 0/0 = NaN, which does not
        # satisfy `< 0.8`, so an idle instance does not alert.
        - alert: RedisCacheHitRateLow
          expr: |
            rate(redis_keyspace_hits_total[5m])
            /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
            < 0.8
          for: 15m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 快取命中率過低"
            description: "命中率: {{ $value | humanizePercentage }}"

# ---- RedisConnectionsHigh ----
        # Warning: more than 500 clients have been connected for 5m.
        - alert: RedisConnectionsHigh
          expr: redis_connected_clients > 500
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 連接數過高"
            description: "連接數: {{ $value }}"

# ---- RedisEvictedKeys ----
        # Warning: keys have been evicted at more than 100 per second
        # (5m rate) for 5m.
        - alert: RedisEvictedKeys
          expr: rate(redis_evicted_keys_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis Key 驅逐頻繁"
            description: "每秒驅逐 {{ $value }} 個 key"

# ---- RedisDown ----
        # Critical: redis_up has been 0 for 1m, i.e. the exporter reports
        # that it cannot reach Redis.
        - alert: RedisDown
          expr: redis_up == 0
          for: 1m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 無法連線"
            description: "Redis Exporter 無法連接到 Redis"

# ---- RedisLatencyHigh ----
        # Warning: average command latency over the last 5 minutes has
        # exceeded 10ms (0.01s) for 5m.
        #
        # Both counters are wrapped in rate() so the quotient reflects recent
        # traffic; dividing the raw totals gives the average since the
        # counters started and barely moves on a long-running instance.
        # NOTE(review): redis_exporter presumably exposes the duration
        # counter per command (cmd label) while the processed counter has no
        # such label; `sum without (cmd)` aligns the label sets so the
        # division matches either way — confirm against the deployed
        # exporter version.
        - alert: RedisLatencyHigh
          expr: |
            sum without (cmd) (rate(redis_commands_duration_seconds_total[5m]))
            /
            rate(redis_commands_processed_total[5m])
            > 0.01
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 命令延遲過高"
            description: "平均命令延遲 > 10ms"

# ---- RedisBlockedClients ----
        # Warning: more than 10 clients have been blocked for 5m. The
        # description attributes this to blocking list commands (BLPOP/BRPOP).
        - alert: RedisBlockedClients
          expr: redis_blocked_clients > 10
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 有阻塞的客戶端"
            description: "{{ $value }} 個客戶端被阻塞 (BLPOP/BRPOP)"