diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index e07a315d..7177fa46 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -204,6 +204,107 @@ groups:
           summary: "Redis 記憶體使用過高"
           description: "Redis 記憶體使用率超過 80%"
 
+  # =========================================================================
+  # Sprint 5.2 Plan B: detailed database metric alerts (database_detail_alerts)
+  # Prerequisite: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
+  # 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
+  # =========================================================================
+  - name: database_detail_alerts
+    rules:
+      # ---- PostgreSQL detailed metrics ----
+      - alert: PostgreSQLSlowQueries
+        expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: postgres
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "PostgreSQL 有慢查詢 (>60s)"
+          description: "awoooi_prod 資料庫最長事務超過 60 秒"
+
+      - alert: PostgreSQLDeadlocks
+        expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
+        for: 1m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: postgres
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "PostgreSQL 死鎖發生"
+          description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"
+
+      # pg_stat_activity_count is exported per connection state, so sum the
+      # states to compare the database's total connection count to the limit.
+      - alert: PostgreSQLTooManyConnections
+        expr: sum by (instance, datname) (pg_stat_activity_count{datname="awoooi_prod"}) > 50
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: postgres
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "PostgreSQL 連接數過高 ({{ $value }})"
+          description: "awoooi_prod 連接數超過 50"
+
+      # ---- Redis detailed metrics ----
+      - alert: RedisKeyEviction
+        expr: increase(redis_evicted_keys_total[5m]) > 0
+        for: 1m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: redis
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "Redis 發生 Key 驅逐"
+          description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"
+
+      - alert: RedisConnectionsHigh
+        expr: redis_connected_clients > 100
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: redis
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "Redis 連接數過高 ({{ $value }})"
+          description: "Redis 連接數超過 100"
+
+      # Average latency over the last 5m: rate() avoids a lifetime average, and
+      # sum by (instance) drops the per-command `cmd` label that otherwise
+      # prevents the two sides of the division from matching.
+      - alert: RedisCommandLatencyHigh
+        expr: >-
+          sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
+          / sum by (instance) (rate(redis_commands_processed_total[5m]))
+          > 0.01
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: redis
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "Redis 命令平均延遲過高 (>10ms)"
+          description: "Redis 命令平均延遲超過 10ms"
+
   # =========================================================================
   # 服務可用性告警 (service_alerts) — 含 layer 標籤決定修復路徑
   # =========================================================================