Files
awoooi/k8s/monitoring/database-alerts.yaml
OG T a5a6bd3408 feat(monitoring): K8s alert rules + Grafana dashboards + ops 腳本
- k8s/monitoring/alert-chain-monitor.yaml
- k8s/monitoring/database-alerts.yaml
- ops/grafana/ Grafana dashboards
- ops/signoz/ SignOz 配置
- ops/scripts/ 維運腳本

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 16:04:14 +08:00

265 lines
9.9 KiB
YAML

# =============================================================================
# AWOOOI Database Alerts
# =============================================================================
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 Phase B (Database Exporters)
#
# Alert targets: PostgreSQL (192.168.0.188:5432) + Redis (192.168.0.188:6380)
# =============================================================================
# PrometheusRule resource carrying the database alert groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus Operator
    # ruleSelector matches on — confirm against the Prometheus resource.
    release: prometheus
    app: prometheus
spec:
  groups:
# =========================================================================
# PostgreSQL alerts
# =========================================================================
# Rule group "postgresql"; the alert rules that follow are the entries of
# its `rules` list.
- name: postgresql
  rules:
# Connection slots nearly used up (> 80% of max_connections).
# The threshold is a ratio of the server-wide limit instead of a fixed count
# of 80, so it stays meaningful when max_connections is not 100. All backend
# states are summed because idle sessions hold connection slots too.
# NOTE(review): assumes the exporter exposes pg_settings_max_connections
# (postgres_exporter's pg_settings collector) — confirm it is enabled.
- alert: PostgreSQLConnectionPoolNearLimit
  expr: |
    sum by (instance) (pg_stat_activity_count)
      / on (instance)
    pg_settings_max_connections
      > 0.8
  for: 5m
  labels:
    severity: warning
    service: postgres
    team: infra
  annotations:
    summary: "PostgreSQL 連接數接近上限"
    description: "Instance {{ $labels.instance }} 連接使用率: {{ $value | humanizePercentage }}"
    runbook_url: "https://awoooi.internal/runbooks/postgres-connections"
# Connection slots almost exhausted (> 95% of max_connections).
# Expressed as a ratio of the server-wide limit instead of a fixed count of
# 95, and summed over all backend states because idle sessions hold
# connection slots too.
# NOTE(review): assumes the exporter exposes pg_settings_max_connections
# (postgres_exporter's pg_settings collector) — confirm it is enabled.
- alert: PostgreSQLConnectionPoolExhausted
  expr: |
    sum by (instance) (pg_stat_activity_count)
      / on (instance)
    pg_settings_max_connections
      > 0.95
  for: 2m
  labels:
    severity: critical
    service: postgres
    team: infra
  annotations:
    summary: "PostgreSQL 連接池即將耗盡"
    description: "Instance {{ $labels.instance }} 連接使用率 > 95% (目前: {{ $value | humanizePercentage }})"
# Too many slow queries: fires when pg_slow_queries stays above 5 for 5 minutes.
# NOTE(review): pg_slow_queries looks like a custom exporter query (the
# description implies a 1s cutoff) — confirm it is exported.
- alert: PostgreSQLSlowQueries
  expr: 'pg_slow_queries > 5'
  for: 5m
  labels:
    service: postgres
    severity: warning
    team: backend
  annotations:
    summary: 'PostgreSQL 慢查詢數量過多'
    description: 'User {{ $labels.usename }} 有 {{ $value }} 個慢查詢 (> 1s)'
# Too many lock waits: fires when the total of pg_locks_waiting across all
# series stays above 10 for 2 minutes.
- alert: PostgreSQLLockWaiting
  expr: 'sum(pg_locks_waiting) > 10'
  for: 2m
  labels:
    service: postgres
    severity: warning
    team: backend
  annotations:
    summary: 'PostgreSQL 鎖等待過多'
    description: '{{ $value }} 個查詢正在等待鎖'
# Table bloat: fires when a table's dead-tuple ratio metric stays above 20
# for 30 minutes (the description renders the value as a percentage).
- alert: PostgreSQLTableBloat
  expr: 'pg_stat_user_tables_bloat_dead_tuple_ratio > 20'
  for: 30m
  labels:
    service: postgres
    severity: warning
    team: infra
  annotations:
    summary: 'PostgreSQL 表膨脹嚴重'
    description: 'Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%'
# Database unreachable.
# `pg_up == 0` only matches while the exporter is being scraped and reports a
# failed connection. `absent(pg_up)` also covers the case where no pg_up
# series exists at all (exporter down or not scraped), which would otherwise
# leave this critical alert silent.
- alert: PostgreSQLDown
  expr: pg_up == 0 or absent(pg_up)
  for: 1m
  labels:
    severity: critical
    service: postgres
    team: infra
  annotations:
    summary: "PostgreSQL 無法連線"
    description: "PostgreSQL Exporter 無法連接到資料庫,或未回報 pg_up 指標"
# Long-running query: fires when pg_longest_query_seconds stays above 60
# for 1 minute; the value is rendered as a duration in the description.
- alert: PostgreSQLLongRunningQuery
  expr: 'pg_longest_query_seconds > 60'
  for: 1m
  labels:
    service: postgres
    severity: warning
    team: backend
  annotations:
    summary: 'PostgreSQL 長時間執行查詢'
    description: 'User {{ $labels.usename }} 查詢已執行 {{ $value | humanizeDuration }}'
# High rollback rate: fires when the rollback ratio metric stays above 5
# for 15 minutes (the description renders the value as a percentage).
- alert: PostgreSQLHighRollbackRate
  expr: 'pg_stat_database_transactions_rollback_ratio > 5'
  for: 15m
  labels:
    service: postgres
    severity: warning
    team: backend
  annotations:
    summary: 'PostgreSQL 事務回滾率過高'
    description: 'Database {{ $labels.datname }} 回滾率: {{ $value }}%'
# =========================================================================
# Redis alerts
# =========================================================================
# Rule group "redis"; the alert rules that follow are the entries of its
# `rules` list.
- name: redis
  rules:
# Memory usage above 85% of the configured maxmemory.
# The `> 0` filter drops instances whose redis_memory_max_bytes is 0 (no
# maxmemory limit configured). Without it the division yields +Inf and the
# alert would fire permanently on such instances.
- alert: RedisMemoryHigh
  expr: |
    redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
  for: 5m
  labels:
    severity: warning
    service: redis
    team: infra
  annotations:
    summary: "Redis 記憶體使用 > 85%"
    description: "Redis 記憶體使用: {{ $value | humanizePercentage }}"
# Memory usage above 95% of the configured maxmemory.
# The `> 0` filter drops instances whose redis_memory_max_bytes is 0 (no
# maxmemory limit configured). Without it the division yields +Inf and this
# critical alert would fire permanently on such instances.
- alert: RedisMemoryCritical
  expr: |
    redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
  for: 2m
  labels:
    severity: critical
    service: redis
    team: infra
  annotations:
    summary: "Redis 記憶體即將耗盡"
    description: "Redis 記憶體使用 > 95%"
# Low cache hit rate: hits / (hits + misses), each as a 5-minute rate, must
# stay below 0.8 for 15 minutes before the alert fires.
- alert: RedisCacheHitRateLow
  for: 15m
  expr: |
    rate(redis_keyspace_hits_total[5m])
    /
    (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
    < 0.8
  labels:
    service: redis
    severity: warning
    team: backend
  annotations:
    summary: 'Redis 快取命中率過低'
    description: '命中率: {{ $value | humanizePercentage }}'
# Too many client connections: fires when redis_connected_clients stays
# above 500 for 5 minutes.
- alert: RedisConnectionsHigh
  expr: 'redis_connected_clients > 500'
  for: 5m
  labels:
    service: redis
    severity: warning
    team: infra
  annotations:
    summary: 'Redis 連接數過高'
    description: '連接數: {{ $value }}'
# Frequent key eviction: fires when the 5-minute per-second rate of
# redis_evicted_keys_total stays above 100 for 5 minutes.
- alert: RedisEvictedKeys
  expr: 'rate(redis_evicted_keys_total[5m]) > 100'
  for: 5m
  labels:
    service: redis
    severity: warning
    team: backend
  annotations:
    summary: 'Redis Key 驅逐頻繁'
    description: '每秒驅逐 {{ $value }} 個 key'
# Redis unreachable.
# `redis_up == 0` only matches while the exporter is being scraped and reports
# a failed connection. `absent(redis_up)` also covers the case where no
# redis_up series exists at all (exporter down or not scraped), which would
# otherwise leave this critical alert silent.
- alert: RedisDown
  expr: redis_up == 0 or absent(redis_up)
  for: 1m
  labels:
    severity: critical
    service: redis
    team: infra
  annotations:
    summary: "Redis 無法連線"
    description: "Redis Exporter 無法連接到 Redis,或未回報 redis_up 指標"
# High command latency (average > 10ms over the last 5 minutes).
# Both metrics are cumulative counters, so dividing their raw values gives the
# average since process start and hides recent regressions; rate() limits
# the average to the 5-minute window.
# Both sides are aggregated by instance so their label sets match.
# NOTE(review): redis_exporter appears to label the duration counter per
# command (`cmd`) while the processed counter has no such label — confirm.
- alert: RedisLatencyHigh
  expr: |
    sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
      /
    sum by (instance) (rate(redis_commands_processed_total[5m]))
      > 0.01
  for: 5m
  labels:
    severity: warning
    service: redis
    team: backend
  annotations:
    summary: "Redis 命令延遲過高"
    description: "平均命令延遲 > 10ms"
# Blocked clients: fires when redis_blocked_clients stays above 10 for
# 5 minutes (the description attributes blocking to BLPOP/BRPOP calls).
- alert: RedisBlockedClients
  expr: 'redis_blocked_clients > 10'
  for: 5m
  labels:
    service: redis
    severity: warning
    team: backend
  annotations:
    summary: 'Redis 有阻塞的客戶端'
    description: '{{ $value }} 個客戶端被阻塞 (BLPOP/BRPOP)'