Files
ewoooc/k8s/monitoring/postgres-alerting-rules.yaml
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

134 lines
5.0 KiB
YAML

---
# PostgreSQL alerting rules.
# Covers database connections, performance, disk space and related signals.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerting-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably these are the labels the Prometheus Operator's
    # ruleSelector matches on - confirm against the Prometheus CR.
    release: prometheus
    app: kube-prometheus-stack
spec:
  groups:
    - name: postgresql.alerts
      # Every "- alert:" entry that follows is an item of this list and has to
      # be indented beneath this key; otherwise `rules` parses as null.
      rules:
# Warning: the total number of connections is high (above 80 for 5 minutes).
# pg_stat_activity_count is exposed as one series per database/state slice
# (assuming postgres_exporter's label set - TODO confirm), so the slices are
# summed per instance before the comparison. Without the sum, every slice is
# compared to the threshold on its own and the alert under-reports.
# `sum without (...)` keeps all remaining labels, including the
# kubernetes_namespace label used by the `environment` template below.
- alert: PostgresConnectionsHigh
  expr: >-
    sum without (datname, state, usename, application_name, backend_type,
    wait_event, wait_event_type) (pg_stat_activity_count) > 80
  for: 5m
  labels:
    severity: warning
    environment: "{{ $labels.kubernetes_namespace }}"
  annotations:
    summary: "PostgreSQL 連線數過高"
    description: "PostgreSQL 連線數達到 {{ $value }} (閾值 80)"
    runbook: "檢查是否有連線洩漏或需要調整 max_connections"
# Critical: the total number of connections is close to the limit
# (above 95 for 2 minutes).
# pg_stat_activity_count is exposed as one series per database/state slice
# (assuming postgres_exporter's label set - TODO confirm), so the slices are
# summed per instance before the comparison. Without the sum, a server can
# exhaust its connections while no single slice ever crosses the threshold.
# `sum without (...)` keeps all remaining labels, including the
# kubernetes_namespace label used by the `environment` template below.
- alert: PostgresConnectionsCritical
  expr: >-
    sum without (datname, state, usename, application_name, backend_type,
    wait_event, wait_event_type) (pg_stat_activity_count) > 95
  for: 2m
  labels:
    severity: critical
    environment: "{{ $labels.kubernetes_namespace }}"
  annotations:
    summary: "PostgreSQL 連線數接近上限"
    description: "PostgreSQL 連線數達到 {{ $value }} (閾值 95),即將耗盡"
    runbook: "立即檢查並結束閒置連線"
# Critical: the database is down - pg_up has reported 0 for one minute.
- alert: PostgresDown
  expr: 'pg_up == 0'
  for: 1m
  labels:
    environment: '{{ $labels.kubernetes_namespace }}'
    severity: critical
  annotations:
    summary: 'PostgreSQL 資料庫停機'
    description: '無法連接到 PostgreSQL 資料庫'
    runbook: '立即檢查 momo-postgres Pod 狀態'
# Warning: sustained slow queries.
# pg_stat_activity_max_tx_duration is a gauge (longest running transaction, in
# seconds), so rate() must not be applied to it: the per-second change of a
# gauge is not a duration, which made the old rule effectively never fire.
# max_over_time() yields the longest active transaction seen in the last five
# minutes, which is also the value the description prints as "{{ $value }}s".
# NOTE(review): the threshold 5 is carried over unchanged and is now read as
# seconds - confirm that this is the intended slow-query limit.
- alert: PostgresSlowQueries
  expr: >-
    max_over_time(pg_stat_activity_max_tx_duration{state="active"}[5m]) > 5
  for: 10m
  labels:
    severity: warning
    environment: "{{ $labels.kubernetes_namespace }}"
  annotations:
    summary: "PostgreSQL 慢查詢過多"
    description: "過去 5 分鐘有持續的慢查詢,最長事務時間: {{ $value }}s"
    runbook: "檢查 pg_stat_activity 找出慢查詢並優化"
# Warning: the momo_analytics database is larger than 5 GiB
# (5368709120 bytes = 5 * 1024^3) for 5 minutes.
# The size is rendered with humanize1024 so the value in the notification uses
# the same binary units as the threshold. Plain humanize uses SI prefixes and
# would print the 5 GiB threshold as "5.369G".
- alert: PostgresDatabaseSizeLarge
  expr: pg_database_size_bytes{datname="momo_analytics"} > 5368709120
  for: 5m
  labels:
    severity: warning
    environment: "{{ $labels.kubernetes_namespace }}"
  annotations:
    summary: "PostgreSQL 資料庫大小超過 5GB"
    description: "資料庫 momo_analytics 大小: {{ humanize1024 $value }}B"
    runbook: "考慮清理舊資料或擴充磁碟空間"
# Critical: the momo_analytics database is larger than 8 GiB
# (8589934592 bytes = 8 * 1024^3) for 5 minutes.
# The size is rendered with humanize1024 so the value in the notification uses
# the same binary units as the threshold. Plain humanize uses SI prefixes and
# would print the 8 GiB threshold as "8.59G".
- alert: PostgresDatabaseSizeCritical
  expr: pg_database_size_bytes{datname="momo_analytics"} > 8589934592
  for: 5m
  labels:
    severity: critical
    environment: "{{ $labels.kubernetes_namespace }}"
  annotations:
    summary: "PostgreSQL 資料庫大小超過 8GB"
    description: "資料庫 momo_analytics 大小: {{ humanize1024 $value }}B,空間即將耗盡"
    runbook: "立即清理資料或擴充磁碟空間"
# Warning: more than 10 exclusive locks for 5 minutes.
# The mode matcher is case-insensitive because postgres_exporter's built-in
# pg_locks query lower-cases the lock mode ("exclusivelock"); the exact-case
# matcher "ExclusiveLock" selected no series, so the alert could never fire.
# The (?i) form still matches exporters that keep PostgreSQL's original
# spelling.
# NOTE(review): pg_locks_count appears to count locks per mode, not sessions
# that are blocked waiting for a lock - confirm this is the signal the alert
# name refers to.
- alert: PostgresLockWaiting
  expr: pg_locks_count{mode=~"(?i)exclusivelock"} > 10
  for: 5m
  labels:
    severity: warning
    environment: "{{ $labels.kubernetes_namespace }}"
  annotations:
    summary: "PostgreSQL 鎖定等待過多"
    description: "有 {{ $value }} 個排他鎖定,可能有阻塞"
    runbook: "檢查 pg_locks 找出阻塞的查詢"
# Warning: replication lag above 60 for 5 minutes (relevant only when a
# replica exists).
# NOTE(review): the metric name differs between postgres_exporter versions
# (pg_replication_lag vs. pg_replication_lag_seconds) - confirm which one the
# deployed exporter exposes.
- alert: PostgresReplicationLag
  expr: 'pg_replication_lag > 60'
  for: 5m
  labels:
    environment: '{{ $labels.kubernetes_namespace }}'
    severity: warning
  annotations:
    summary: 'PostgreSQL 複製延遲'
    description: '複製延遲: {{ $value }}s'
    runbook: '檢查網路和 Replica 狀態'
# Warning: deadlocks occurred - the deadlock counter of momo_analytics has a
# non-zero 5-minute rate for at least one minute.
- alert: PostgresDeadlocks
  expr: >-
    rate(pg_stat_database_deadlocks{datname="momo_analytics"}[5m]) > 0
  for: 1m
  labels:
    environment: '{{ $labels.kubernetes_namespace }}'
    severity: warning
  annotations:
    summary: 'PostgreSQL 發生死鎖'
    description: '資料庫 momo_analytics 發生死鎖'
    runbook: '檢查應用程式的交易邏輯'
# Warning: table bloat - a user table has carried more than 100000 dead tuples
# for 30 minutes. The affected table is taken from the relname label.
- alert: PostgresTableBloat
  expr: 'pg_stat_user_tables_n_dead_tup > 100000'
  for: 30m
  labels:
    environment: '{{ $labels.kubernetes_namespace }}'
    severity: warning
  annotations:
    summary: 'PostgreSQL 表膨脹'
    description: '表 {{ $labels.relname }} 有 {{ $value }} 個死亡元組'
    runbook: '執行 VACUUM ANALYZE {{ $labels.relname }}'