From d9e0fab3fee71f91feb2f6f1a9db9fc3a9a38845 Mon Sep 17 00:00:00 2001
From: OG T
Date: Wed, 8 Apr 2026 18:19:03 +0800
Subject: [PATCH] =?UTF-8?q?feat(monitoring):=20Sprint=205.2=20Plan=20B=20?=
 =?UTF-8?q?=E2=80=94=20=E8=B3=87=E6=96=99=E5=BA=AB=E8=A9=B3=E7=B4=B0?=
 =?UTF-8?q?=E5=91=8A=E8=AD=A6=E8=A6=8F=E5=89=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

新增 database_detail_alerts 規則群組:

PostgreSQL:
- PostgreSQLSlowQueries: 慢查詢 >60s
- PostgreSQLDeadlocks: 死鎖發生
- PostgreSQLTooManyConnections: 連接數 >50

Redis:
- RedisKeyEviction: Key 驅逐
- RedisConnectionsHigh: 連接數 >100
- RedisCommandLatencyHigh: 命令延遲 >10ms

前置: postgres-exporter:9187 + redis-exporter:9121 已在 188 部署 ✅
Prometheus scrape 已更新 ✅

Co-Authored-By: Claude Sonnet 4.6
---
 ops/monitoring/alerts-unified.yml | 111 ++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index e07a315d..7177fa46 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -204,6 +204,117 @@ groups:
           summary: "Redis 記憶體使用過高"
           description: "Redis 記憶體使用率超過 80%"
 
+  # =========================================================================
+  # Sprint 5.2 Plan B: detailed database metric alerts (database_detail_alerts)
+  # Prerequisite: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
+  # 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
+  # =========================================================================
+  - name: database_detail_alerts
+    rules:
+      # ---- PostgreSQL detailed metrics ----
+      # Fires when pg_stat_activity_max_tx_duration for awoooi_prod stays above
+      # 60 for 5 minutes.
+      - alert: PostgreSQLSlowQueries
+        expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: postgres
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "PostgreSQL 有慢查詢 (>60s)"
+          description: "awoooi_prod 資料庫最長事務超過 60 秒"
+
+      # Fires when the awoooi_prod deadlock counter increased within the last
+      # 5 minutes and the condition holds for 1 minute.
+      - alert: PostgreSQLDeadlocks
+        expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
+        for: 1m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: postgres
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "PostgreSQL 死鎖發生"
+          description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"
+
+      # pg_stat_activity_count is exported as one series per connection state,
+      # so the series are summed to compare the total connection count of
+      # awoooi_prod against the threshold of 50.
+      - alert: PostgreSQLTooManyConnections
+        expr: sum by (instance, datname) (pg_stat_activity_count{datname="awoooi_prod"}) > 50
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: postgres
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "PostgreSQL 連接數過高 ({{ $value }})"
+          description: "awoooi_prod 連接數超過 50"
+
+      # ---- Redis detailed metrics ----
+      # Fires when the evicted-keys counter increased within the last 5 minutes.
+      - alert: RedisKeyEviction
+        expr: increase(redis_evicted_keys_total[5m]) > 0
+        for: 1m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: redis
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "Redis 發生 Key 驅逐"
+          description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"
+
+      # Fires when more than 100 clients stay connected for 5 minutes.
+      - alert: RedisConnectionsHigh
+        expr: redis_connected_clients > 100
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: redis
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "Redis 連接數過高 ({{ $value }})"
+          description: "Redis 連接數超過 100"
+
+      # Average command latency over the last 5 minutes, per Redis instance.
+      # Both counters are turned into rates and aggregated by instance before
+      # dividing: redis-exporter reports the duration counter per command
+      # (cmd label), so a raw division would not match any series, and raw
+      # counters would only give a lifetime average instead of recent latency.
+      - alert: RedisCommandLatencyHigh
+        expr: >-
+          sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
+          /
+          sum by (instance) (rate(redis_commands_processed_total[5m]))
+          > 0.01
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: redis
+          host: "188"
+          team: ops
+          auto_repair: "false"
+        annotations:
+          summary: "Redis 命令平均延遲過高 (>10ms)"
+          description: "Redis 命令平均延遲超過 10ms"
+
   # =========================================================================
   # 服務可用性告警 (service_alerts) — 含 layer 標籤決定修復路徑
   # =========================================================================