Files
awoooi/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md
OG T 7b2f585244 docs: 完整監控實施步驟 (7 Phase 詳細文檔)
Phase A: AnomalyCounter 服務 (4h)
- Redis Sorted Set 滑動窗口計數
- 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX)
- Tier 決策邏輯整合

Phase B: Database Exporters (3h)
- pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控
- redis_exporter: 記憶體/命中率/驅逐監控
- 15+ 告警規則

Phase C: Incident 頻率欄位 (2h)
- IncidentFrequencyStats 模型
- 告警聚合邏輯 (10 分鐘窗口)
- 前端頻率顯示

Phase D: Sentry Comment 回寫 (1h)
- 完成 TODO 實作
- Sentry API Token 配置

Phase E: SigNoz 告警規則 (2h)
- Error Rate / Latency 告警
- Trace 異常檢測
- SigNoz Webhook Handler

Phase F: Alert Chain E2E (2h)
- Smoke Test 腳本
- CD Pipeline 整合
- 鏈路監控告警

Phase G: Learning Service (3h)
- 修復效果學習
- 成功率計算
- Playbook 自動更新

總工時: 17h (2-3 天)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 10:23:04 +08:00

14 KiB

資料庫 Exporter 部署實施步驟

優先級: P0 預估工時: 3h 目標: PostgreSQL 與 Redis 完整監控覆蓋


現狀分析

服務 當前監控 缺失指標
PostgreSQL 連接數、慢查詢、鎖等待、複製延遲
Redis 記憶體使用、命中率、命令延遲、驅逐率

Phase B-1: PostgreSQL 與 Redis Exporter 部署 (1.5h)

Step 1: 建立 Docker Compose 配置 (15min)

# ops/monitoring/docker-compose.exporters.yaml
# 2026-03-29 ogt: database monitoring exporters
# Deploy target: 192.168.0.188 (pg host)
#
# POSTGRES_PASSWORD and REDIS_PASSWORD are interpolated by Compose (for
# example from a .env file placed next to this file).
# NOTE(review): the hostnames "postgres" and "redis" must resolve on the
# external "monitoring" network - confirm the existing database containers
# are attached to it.

version: '3.8'

services:
  # ==========================================================================
  # PostgreSQL Exporter
  # ==========================================================================
  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:v0.15.0
    container_name: postgres-exporter
    restart: unless-stopped
    ports:
      - "9187:9187"
    environment:
      # Host, user and password are passed separately so the exporter builds
      # (and URL-encodes) the DSN itself; a password containing characters
      # such as "@", "/" or "#" would corrupt a hand-assembled URL.
      DATA_SOURCE_URI: "postgres:5432/awoooi?sslmode=disable"
      DATA_SOURCE_USER: "postgres"
      DATA_SOURCE_PASS: "${POSTGRES_PASSWORD}"
      PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml"
    volumes:
      - ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro
    networks:
      - monitoring
    # No depends_on: PostgreSQL is an existing service that is not defined in
    # this file, and Compose refuses to start a project whose depends_on
    # names an undefined service.
    labels:
      - "prometheus.scrape=true"
      - "prometheus.port=9187"

  # ==========================================================================
  # Redis Exporter
  # ==========================================================================
  redis-exporter:
    image: oliver006/redis_exporter:v1.58.0
    container_name: redis-exporter
    restart: unless-stopped
    ports:
      - "9121:9121"
    environment:
      REDIS_ADDR: "redis://redis:6379"
      REDIS_PASSWORD: "${REDIS_PASSWORD}"
    networks:
      - monitoring
    # No depends_on: Redis is likewise not defined in this file.
    labels:
      - "prometheus.scrape=true"
      - "prometheus.port=9121"

networks:
  monitoring:
    external: true

Step 2: 自訂 PostgreSQL 查詢 (15min)

# ops/monitoring/postgres-exporter-queries.yaml
# Custom queries that extend the exporter's default metrics

# --------------------------------------------------------------------------
# Connection pool monitoring: connections per database and state
# --------------------------------------------------------------------------
pg_stat_activity_count:
  query: |
    SELECT
      datname,
      state,
      count(*) as count
    FROM pg_stat_activity
    WHERE datname IS NOT NULL
    GROUP BY datname, state
  metrics:
    - datname: {usage: "LABEL", description: "Database name"}
    - state: {usage: "LABEL", description: "Connection state"}
    - count: {usage: "GAUGE", description: "Number of connections"}

# --------------------------------------------------------------------------
# Slow query monitoring: active queries running longer than 1 second,
# counted per database and user
# --------------------------------------------------------------------------
pg_slow_queries:
  query: |
    SELECT
      datname,
      usename,
      count(*) as slow_query_count
    FROM pg_stat_activity
    WHERE state = 'active'
      AND query_start < now() - interval '1 second'
      AND query NOT LIKE 'SELECT pg_%'
    GROUP BY datname, usename
  metrics:
    - datname: {usage: "LABEL", description: "Database name"}
    - usename: {usage: "LABEL", description: "User name"}
    - slow_query_count: {usage: "GAUGE", description: "Number of slow queries (> 1s)"}

# ==========================================================================
# Lock wait monitoring: ungranted lock requests per database and lock mode
# ==========================================================================
# pg_locks has no datname column; it only carries the database OID in
# "database" (NULL for locks that are not tied to a database), so the name is
# looked up in pg_database. The LEFT JOIN plus COALESCE keeps those NULL rows
# and gives them an empty datname label.
# The exporter names the series <namespace>_<column>, so the gauge is exposed
# as pg_locks_waiting_waiting_count.
pg_locks_waiting:
  query: |
    SELECT
      COALESCE(d.datname, '') as datname,
      l.mode,
      count(*) as waiting_count
    FROM pg_locks l
    LEFT JOIN pg_database d ON d.oid = l.database
    WHERE NOT l.granted
    GROUP BY d.datname, l.mode
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database name"
    - mode:
        usage: "LABEL"
        description: "Lock mode"
    - waiting_count:
        usage: "GAUGE"
        description: "Number of locks waiting"

# --------------------------------------------------------------------------
# Table bloat estimate based on dead tuples: the 20 user tables with the most
# dead tuples, limited to tables with more than 1000 live tuples
# --------------------------------------------------------------------------
pg_stat_user_tables_bloat:
  query: |
    SELECT
      schemaname,
      relname,
      n_dead_tup,
      n_live_tup,
      CASE WHEN n_live_tup > 0
        THEN round(100.0 * n_dead_tup / n_live_tup, 2)
        ELSE 0
      END as dead_tuple_ratio
    FROM pg_stat_user_tables
    WHERE n_live_tup > 1000
    ORDER BY n_dead_tup DESC
    LIMIT 20
  metrics:
    - schemaname: {usage: "LABEL", description: "Schema name"}
    - relname: {usage: "LABEL", description: "Table name"}
    - n_dead_tup: {usage: "GAUGE", description: "Dead tuples"}
    - n_live_tup: {usage: "GAUGE", description: "Live tuples"}
    - dead_tuple_ratio: {usage: "GAUGE", description: "Dead tuple percentage"}

# --------------------------------------------------------------------------
# Database size in bytes for every database except the two templates
# --------------------------------------------------------------------------
pg_database_size_bytes:
  query: |
    SELECT
      datname,
      pg_database_size(datname) as size_bytes
    FROM pg_database
    WHERE datname NOT IN ('template0', 'template1')
  metrics:
    - datname: {usage: "LABEL", description: "Database name"}
    - size_bytes: {usage: "GAUGE", description: "Database size in bytes"}

Step 3: 建立 Prometheus Scrape 配置 (15min)

# k8s/monitoring/prometheus-scrape-exporters.yaml
# Scrape jobs to be added to the Prometheus ConfigMap

# PostgreSQL exporter; the relabel rule overwrites the instance label with a
# fixed, readable name instead of the host:port address.
- job_name: "postgres-exporter"
  static_configs:
    - targets: ["192.168.0.188:9187"]
  relabel_configs:
    - {source_labels: [__address__], target_label: instance, replacement: "postgres-primary"}

# Redis exporter; same relabelling, fixed instance name "redis-primary".
- job_name: "redis-exporter"
  static_configs:
    - targets: ["192.168.0.188:9121"]
  relabel_configs:
    - {source_labels: [__address__], target_label: instance, replacement: "redis-primary"}

Step 4: 建立告警規則 (30min)

# k8s/monitoring/database-alerts.yaml
# Alert rules for the database exporters.
# Metrics that come from the custom exporter queries are exposed as
# <query name>_<value column>, e.g. pg_slow_queries_slow_query_count.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  labels:
    app: prometheus
spec:
  groups:
    # =========================================================================
    # PostgreSQL alerts
    # =========================================================================
    - name: postgresql
      rules:
        # Connection pool close to its limit.
        # The limit is read from the exporter's pg_settings_max_connections
        # gauge; PromQL cannot run SQL. "on (instance) group_left" matches the
        # per-database series against the single per-instance limit.
        - alert: PostgreSQLConnectionPoolNearLimit
          expr: |
            sum by (instance, datname) (pg_stat_activity_count{state="active"})
            / on (instance) group_left
            pg_settings_max_connections
            > 0.8
          for: 5m
          labels:
            severity: warning
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 連接池使用率 > 80%"
            description: "Database {{ $labels.datname }} 連接使用率: {{ $value | humanizePercentage }}"
            auto_repair: "analyze_connection_leak"

        # Connection pool exhausted.
        # Uses the real max_connections value instead of an assumed 100 so it
        # stays consistent with the warning rule above.
        - alert: PostgreSQLConnectionPoolExhausted
          expr: |
            sum by (instance, datname) (pg_stat_activity_count{state="active"})
            / on (instance) group_left
            pg_settings_max_connections
            > 0.95
          for: 2m
          labels:
            severity: critical
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 連接池即將耗盡"
            description: "Database {{ $labels.datname }} 連接使用率 > 95%"
            auto_repair: "restart_api_pods"

        # Too many slow queries (value column of the pg_slow_queries query).
        - alert: PostgreSQLSlowQueries
          expr: pg_slow_queries_slow_query_count > 5
          for: 5m
          labels:
            severity: warning
            service: postgres
            owner: backend-team
          annotations:
            summary: "PostgreSQL 慢查詢數量過多"
            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢"
            auto_repair: "analyze_slow_queries"

        # Too many lock waits (value column of the pg_locks_waiting query).
        - alert: PostgreSQLLockWaiting
          expr: sum(pg_locks_waiting_waiting_count) > 10
          for: 2m
          labels:
            severity: warning
            service: postgres
            owner: backend-team
          annotations:
            summary: "PostgreSQL 鎖等待過多"
            description: "{{ $value }} 個查詢正在等待鎖"

        # Table bloat: dead tuples above 20% of live tuples.
        - alert: PostgreSQLTableBloat
          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
          for: 30m
          labels:
            severity: warning
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 表膨脹"
            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"
            auto_repair: "schedule_vacuum"

        # PostgreSQL unreachable from the exporter.
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 無法連線"
            auto_repair: "restart_postgres_container"

    # =========================================================================
    # Redis alerts
    # =========================================================================
    - name: redis
      rules:
        # Memory usage high.
        # redis_memory_max_bytes is 0 when no maxmemory limit is configured;
        # dividing by it would yield +Inf and fire the alert permanently, so
        # the "> 0" filter drops the series in that case.
        - alert: RedisMemoryHigh
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 記憶體使用 > 85%"
            description: "Redis 記憶體: {{ $value | humanizePercentage }}"
            auto_repair: "analyze_redis_keys"

        # Memory almost exhausted (same zero-limit guard as above).
        - alert: RedisMemoryCritical
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
          for: 2m
          labels:
            severity: critical
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 記憶體即將耗盡"
            description: "Redis 記憶體使用 > 95%"
            auto_repair: "flush_expired_keys"

        # Cache hit rate too low: hits / (hits + misses) over 5 minutes.
        - alert: RedisCacheHitRateLow
          expr: |
            rate(redis_keyspace_hits_total[5m])
            /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
            < 0.8
          for: 15m
          labels:
            severity: warning
            service: redis
            owner: backend-team
          annotations:
            summary: "Redis 快取命中率 < 80%"
            description: "命中率: {{ $value | humanizePercentage }}"

        # Too many client connections.
        - alert: RedisConnectionsHigh
          expr: redis_connected_clients > 500
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 連接數過高"
            description: "連接數: {{ $value }}"

        # Frequent key eviction (per-second rate over 5 minutes).
        - alert: RedisEvictedKeys
          expr: rate(redis_evicted_keys_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: backend-team
          annotations:
            summary: "Redis Key 驅逐頻繁"
            description: "每秒驅逐 {{ $value }} 個 key"
            auto_repair: "increase_redis_memory"

        # Redis unreachable from the exporter.
        - alert: RedisDown
          expr: redis_up == 0
          for: 1m
          labels:
            severity: critical
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 無法連線"
            auto_repair: "restart_redis_container"

Step 5: 部署腳本 (15min)

#!/bin/bash
# ops/monitoring/deploy-exporters.sh
# Deploy the database exporters to 192.168.0.188.
#
# Copies the Compose file and the custom PostgreSQL queries to the host,
# starts the containers, smoke-tests both /metrics endpoints, then applies
# the Prometheus scrape config and alert rules and restarts Prometheus.
# All local paths are relative, so run this from the repository root.

set -euo pipefail

readonly HOST="192.168.0.188"
readonly DEPLOY_DIR="/opt/monitoring/exporters"

#######################################
# Poll an exporter's /metrics endpoint until it answers, then print the
# first five lines of the response.
# Globals:   HOST (read)
# Arguments: $1 - exporter port
# Outputs:   first five response lines to stdout; error message to stderr
# Returns:   0 once the endpoint answers, 1 if it never does
#######################################
verify_exporter() {
  local port=$1
  local url="http://${HOST}:${port}/metrics"
  local body attempt

  for ((attempt = 1; attempt <= 15; attempt++)); do
    # The body is captured before being trimmed: piping curl straight into
    # head lets head close the pipe early, curl then exits non-zero and
    # pipefail + errexit would abort the script. -f makes HTTP errors count
    # as failures.
    if body=$(curl -fsS --max-time 5 "$url" 2>/dev/null); then
      head -n 5 <<<"$body"
      return 0
    fi
    sleep 2
  done

  echo "錯誤: ${url} 無回應" >&2
  return 1
}

echo "=== 部署資料庫 Exporter ==="

# 1. Create the deploy directory
ssh "$HOST" "mkdir -p '${DEPLOY_DIR}'"

# 2. Copy the configuration
scp ops/monitoring/docker-compose.exporters.yaml "${HOST}:${DEPLOY_DIR}/docker-compose.yaml"
scp ops/monitoring/postgres-exporter-queries.yaml "${HOST}:${DEPLOY_DIR}/"

# 3. Start the containers. Compose interpolates POSTGRES_PASSWORD and
#    REDIS_PASSWORD, typically from ${DEPLOY_DIR}/.env on the host. This
#    script does not create that file, so warn when it is missing.
if ! ssh "$HOST" "test -f '${DEPLOY_DIR}/.env'"; then
  echo "警告: ${HOST}:${DEPLOY_DIR}/.env 不存在，POSTGRES_PASSWORD / REDIS_PASSWORD 可能為空" >&2
fi
ssh "$HOST" "cd '${DEPLOY_DIR}' && docker compose up -d"

# 4. Verify (polls instead of a fixed sleep)
echo "等待服務啟動..."

echo "驗證 PostgreSQL Exporter..."
verify_exporter 9187

echo "驗證 Redis Exporter..."
verify_exporter 9121

# 5. Update the Prometheus configuration
# NOTE(review): kubectl apply only accepts Kubernetes manifests. If this file
# is a bare list of scrape jobs it will be rejected and the script stops
# here; it would have to be wrapped in (or merged into) the ConfigMap that
# Prometheus actually loads - confirm.
echo "更新 Prometheus scrape 配置..."
kubectl apply -f k8s/monitoring/prometheus-scrape-exporters.yaml

# 6. Deploy the alert rules
echo "部署告警規則..."
kubectl apply -f k8s/monitoring/database-alerts.yaml

# 7. Restart Prometheus so it picks up the new configuration
kubectl rollout restart deployment/prometheus -n monitoring

echo "=== 部署完成 ==="
echo "PostgreSQL Exporter: http://${HOST}:9187/metrics"
echo "Redis Exporter: http://${HOST}:9121/metrics"

Phase B-2: 驗證清單 (30min)

驗證 Prometheus Targets

# List job name and health of every active target whose job contains "exporter"
curl --silent "http://192.168.0.120:30090/api/v1/targets" \
  | jq '.data.activeTargets[] | select(.labels.job | contains("exporter")) | {job: .labels.job, health: .health}'

預期輸出:

{"job": "postgres-exporter", "health": "up"}
{"job": "redis-exporter", "health": "up"}

驗證關鍵指標

# PostgreSQL connection counts
curl --silent "http://192.168.0.188:9187/metrics" | grep -F pg_stat_activity_count

# Redis memory usage
curl --silent "http://192.168.0.188:9121/metrics" | grep -F redis_memory_used_bytes

觸發測試告警

# Simulate connection-pool pressure.
# The warning alert needs MORE than 80% of max_connections active for its
# whole "for: 5m" window, so 80 clients for 60 seconds can never fire it.
# 90 clients for 6 minutes clears both the threshold and the window
# (assumes max_connections = 100 - confirm).
# NOTE(review): the default pgbench workload needs the tables created by
# "pgbench -i" - confirm they exist in the awoooi database.
pgbench -c 90 -j 5 -T 360 -h 192.168.0.188 -U postgres awoooi

交付物清單

檔案 狀態 說明
ops/monitoring/docker-compose.exporters.yaml 🆕 Exporter 容器配置
ops/monitoring/postgres-exporter-queries.yaml 🆕 自訂 PG 查詢
k8s/monitoring/prometheus-scrape-exporters.yaml 🆕 Scrape 配置
k8s/monitoring/database-alerts.yaml 🆕 告警規則
ops/monitoring/deploy-exporters.sh 🆕 部署腳本

預估總工時: 3h 部署位置: 192.168.0.188 依賴: Docker Compose, 現有 PostgreSQL/Redis