Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
134 lines
5.0 KiB
YAML
134 lines
5.0 KiB
YAML
# PostgreSQL alerting rules.
# Monitors database connections, performance, disk space and related signals.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerting-rules
  namespace: monitoring
  labels:
    app: kube-prometheus-stack
    release: prometheus
spec:
  groups:
    - name: postgresql.alerts
      rules:
# Connection count is high (warning tier).
# The per-series pg_stat_activity_count values are summed before the
# comparison, so the threshold applies to the total connection count that the
# description reports rather than to each individual label combination.
# `sum without (...)` keeps every other label, including the
# kubernetes_namespace label used by the `environment` label below.
# NOTE(review): the label list assumes the prometheus-community
# postgres_exporter label set - confirm against the deployed exporter.
        - alert: PostgresConnectionsHigh
          expr: >-
            sum without (datname, state, usename, application_name,
            backend_type, wait_event, wait_event_type)
            (pg_stat_activity_count) > 80
          for: 5m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 連線數過高"
            description: "PostgreSQL 連線數達到 {{ $value }} (閾值 80)"
            runbook: "檢查是否有連線洩漏或需要調整 max_connections"
# Connection count is close to the limit (critical tier).
# The per-series pg_stat_activity_count values are summed before the
# comparison, so the threshold applies to the total connection count that the
# description reports rather than to each individual label combination.
# `sum without (...)` keeps every other label, including the
# kubernetes_namespace label used by the `environment` label below.
# NOTE(review): the label list assumes the prometheus-community
# postgres_exporter label set - confirm against the deployed exporter.
        - alert: PostgresConnectionsCritical
          expr: >-
            sum without (datname, state, usename, application_name,
            backend_type, wait_event, wait_event_type)
            (pg_stat_activity_count) > 95
          for: 2m
          labels:
            severity: critical
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 連線數接近上限"
            description: "PostgreSQL 連線數達到 {{ $value }} (閾值 95),即將耗盡"
            runbook: "立即檢查並結束閒置連線"
# Database is down: fires once pg_up has been 0 for one minute.
        - alert: PostgresDown
          for: 1m
          expr: >-
            pg_up == 0
          annotations:
            runbook: '立即檢查 momo-postgres Pod 狀態'
            description: '無法連接到 PostgreSQL 資料庫'
            summary: 'PostgreSQL 資料庫停機'
          labels:
            environment: '{{ $labels.kubernetes_namespace }}'
            severity: critical
# Too many slow queries.
# pg_stat_activity_max_tx_duration is compared directly against the
# threshold: the description prints $value as a duration in seconds, which
# only holds for the raw value, not for a rate() of it.
# NOTE(review): assumes the metric is a gauge holding the longest running
# transaction time in seconds, as in the prometheus-community
# postgres_exporter - confirm against the deployed exporter.
        - alert: PostgresSlowQueries
          expr: pg_stat_activity_max_tx_duration{state="active"} > 5
          for: 10m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 慢查詢過多"
            description: "過去 5 分鐘有持續的慢查詢,最長事務時間: {{ $value }}s"
            runbook: "檢查 pg_stat_activity 找出慢查詢並優化"
# Database size exceeds the warning threshold.
# 5368709120 bytes = 5 * 1024^3 (5 GiB).
# The description uses humanize1024 plus an explicit "B" suffix so the
# reported size uses the same binary units as the threshold; plain `humanize`
# renders SI (power-of-1000) prefixes without a unit.
        - alert: PostgresDatabaseSizeLarge
          expr: pg_database_size_bytes{datname="momo_analytics"} > 5368709120
          for: 5m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 資料庫大小超過 5GB"
            description: "資料庫 momo_analytics 大小: {{ humanize1024 $value }}B"
            runbook: "考慮清理舊資料或擴充磁碟空間"
# Database size exceeds the critical threshold.
# 8589934592 bytes = 8 * 1024^3 (8 GiB).
# The description uses humanize1024 plus an explicit "B" suffix so the
# reported size uses the same binary units as the threshold; plain `humanize`
# renders SI (power-of-1000) prefixes without a unit.
        - alert: PostgresDatabaseSizeCritical
          expr: pg_database_size_bytes{datname="momo_analytics"} > 8589934592
          for: 5m
          labels:
            severity: critical
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 資料庫大小超過 8GB"
            description: "資料庫 momo_analytics 大小: {{ humanize1024 $value }},空間即將耗盡"
            runbook: "立即清理資料或擴充磁碟空間"
# Too many exclusive locks.
# The mode matcher is a case-insensitive, fully anchored regex so it selects
# both "exclusivelock" and "ExclusiveLock" (and nothing else, e.g. not
# "rowexclusivelock").
# NOTE(review): the prometheus-community postgres_exporter is understood to
# export lock modes lower-cased, in which case the previous exact match on
# "ExclusiveLock" would never select a series - confirm against the exporter.
# NOTE(review): this counts locks per series rather than waiting sessions,
# despite the alert name - confirm that is the intended signal.
        - alert: PostgresLockWaiting
          expr: pg_locks_count{mode=~"(?i)exclusivelock"} > 10
          for: 5m
          labels:
            severity: warning
            environment: "{{ $labels.kubernetes_namespace }}"
          annotations:
            summary: "PostgreSQL 鎖定等待過多"
            description: "有 {{ $value }} 個排他鎖定,可能有阻塞"
            runbook: "檢查 pg_locks 找出阻塞的查詢"
# Replication lag (only relevant when a replica exists): fires once
# pg_replication_lag has stayed above 60 for five minutes.
        - alert: PostgresReplicationLag
          for: 5m
          expr: >-
            pg_replication_lag > 60
          annotations:
            runbook: '檢查網路和 Replica 狀態'
            description: '複製延遲: {{ $value }}s'
            summary: 'PostgreSQL 複製延遲'
          labels:
            environment: '{{ $labels.kubernetes_namespace }}'
            severity: warning
# Deadlocks detected: the 5-minute rate of the momo_analytics deadlock
# counter has been above zero for one minute.
        - alert: PostgresDeadlocks
          for: 1m
          expr: >-
            rate(pg_stat_database_deadlocks{datname="momo_analytics"}[5m]) > 0
          annotations:
            runbook: '檢查應用程式的交易邏輯'
            description: '資料庫 momo_analytics 發生死鎖'
            summary: 'PostgreSQL 發生死鎖'
          labels:
            environment: '{{ $labels.kubernetes_namespace }}'
            severity: warning
# Table bloat (too many dead tuples): fires once a table has reported more
# than 100000 dead tuples for thirty minutes.
        - alert: PostgresTableBloat
          for: 30m
          expr: >-
            pg_stat_user_tables_n_dead_tup > 100000
          annotations:
            runbook: '執行 VACUUM ANALYZE {{ $labels.relname }}'
            description: '表 {{ $labels.relname }} 有 {{ $value }} 個死亡元組'
            summary: 'PostgreSQL 表膨脹'
          labels:
            environment: '{{ $labels.kubernetes_namespace }}'
            severity: warning