Phase A: AnomalyCounter 服務 (4h) - Redis Sorted Set 滑動窗口計數 - 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX) - Tier 決策邏輯整合 Phase B: Database Exporters (3h) - pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控 - redis_exporter: 記憶體/命中率/驅逐監控 - 15+ 告警規則 Phase C: Incident 頻率欄位 (2h) - IncidentFrequencyStats 模型 - 告警聚合邏輯 (10 分鐘窗口) - 前端頻率顯示 Phase D: Sentry Comment 回寫 (1h) - 完成 TODO 實作 - Sentry API Token 配置 Phase E: SigNoz 告警規則 (2h) - Error Rate / Latency 告警 - Trace 異常檢測 - SigNoz Webhook Handler Phase F: Alert Chain E2E (2h) - Smoke Test 腳本 - CD Pipeline 整合 - 鏈路監控告警 Phase G: Learning Service (3h) - 修復效果學習 - 成功率計算 - Playbook 自動更新 總工時: 17h (2-3 天) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
14 KiB
14 KiB
資料庫 Exporter 部署實施步驟
優先級: P0 預估工時: 3h 目標: PostgreSQL 與 Redis 完整監控覆蓋
現狀分析
| 服務 | 當前監控 | 缺失指標 |
|---|---|---|
| PostgreSQL | ❌ 零 | 連接數、慢查詢、鎖等待、複製延遲 |
| Redis | ❌ 零 | 記憶體使用、命中率、命令延遲、驅逐率 |
Phase B-1: PostgreSQL Exporter (1.5h)
Step 1: 建立 Docker Compose 配置 (15min)
# ops/monitoring/docker-compose.exporters.yaml
# 2026-03-29 ogt: database monitoring exporters
# Deployment target: 192.168.0.188 (PostgreSQL host)
#
# Runs the Prometheus exporters for PostgreSQL and Redis. The databases
# themselves are NOT defined in this file; the exporters reach them by the
# hostnames `postgres` and `redis` on the pre-existing external `monitoring`
# network. For that reason there is no `depends_on`: Compose rejects a
# dependency on a service that this project does not define.
# NOTE(review): confirm the database containers are attached to the
# `monitoring` network under those hostnames.
version: '3.8'

services:
  # ==========================================================================
  # PostgreSQL Exporter
  # ==========================================================================
  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:v0.15.0
    container_name: postgres-exporter
    restart: unless-stopped
    ports:
      - "9187:9187"
    environment:
      # URI / user / password are passed separately so the password is never
      # spliced into a URL by hand (characters such as `@`, `/` or `:` would
      # otherwise corrupt the DSN).
      DATA_SOURCE_URI: "postgres:5432/awoooi?sslmode=disable"
      DATA_SOURCE_USER: "postgres"
      # `:?` aborts `docker compose up` with a clear message when the variable
      # is unset or empty, instead of silently using a blank password.
      DATA_SOURCE_PASS: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}"
      PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml"
    volumes:
      - ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro
    networks:
      - monitoring
    labels:
      - "prometheus.scrape=true"
      - "prometheus.port=9187"

  # ==========================================================================
  # Redis Exporter
  # ==========================================================================
  redis-exporter:
    image: oliver006/redis_exporter:v1.58.0
    container_name: redis-exporter
    restart: unless-stopped
    ports:
      - "9121:9121"
    environment:
      REDIS_ADDR: "redis://redis:6379"
      # Left optional on purpose: an empty value is what an unauthenticated
      # Redis needs.
      REDIS_PASSWORD: "${REDIS_PASSWORD}"
    networks:
      - monitoring
    labels:
      - "prometheus.scrape=true"
      - "prometheus.port=9121"

networks:
  # Must already exist on the host; Compose will not create it.
  monitoring:
    external: true
Step 2: 自訂 PostgreSQL 查詢 (15min)
# ops/monitoring/postgres-exporter-queries.yaml
# 自訂查詢 - 擴展預設指標
# --------------------------------------------------------------------------
# Connection monitoring: number of backends per database and per state.
# Rows without a database name are excluded.
# NOTE(review): postgres_exporter appears to name custom metrics
# <query name>_<column>, which would expose this gauge as
# pg_stat_activity_count_count - confirm against the /metrics output.
# --------------------------------------------------------------------------
pg_stat_activity_count:
  query: |
    SELECT
      datname,
      state,
      count(*) as count
    FROM pg_stat_activity
    WHERE datname IS NOT NULL
    GROUP BY datname, state
  metrics:
    - datname:
        usage: 'LABEL'
        description: 'Database name'
    - state:
        usage: 'LABEL'
        description: 'Connection state'
    - count:
        usage: 'GAUGE'
        description: 'Number of connections'
# --------------------------------------------------------------------------
# Slow-query monitoring: sessions in state 'active' whose current statement
# started more than one second ago, grouped by database and user.
# Statements whose text begins with 'SELECT pg_' are left out.
# --------------------------------------------------------------------------
pg_slow_queries:
  query: |
    SELECT
      datname,
      usename,
      count(*) as slow_query_count
    FROM pg_stat_activity
    WHERE state = 'active'
      AND query_start < now() - interval '1 second'
      AND query NOT LIKE 'SELECT pg_%'
    GROUP BY datname, usename
  metrics:
    - datname:
        usage: 'LABEL'
        description: 'Database name'
    - usename:
        usage: 'LABEL'
        description: 'User name'
    - slow_query_count:
        usage: 'GAUGE'
        description: 'Number of slow queries (> 1s)'
# ==========================================================================
# Lock-wait monitoring: number of lock requests that have not been granted,
# grouped by database name and lock mode.
#
# pg_locks exposes the database only as an OID column (`database`), not as
# `datname`, so the name is resolved through pg_database. LEFT JOIN plus
# COALESCE keeps waiters whose lock is not tied to a database (e.g. waits on
# a transaction id, where `database` is NULL) under an empty datname instead
# of dropping them from the count.
# ==========================================================================
pg_locks_waiting:
  query: |
    SELECT
      COALESCE(d.datname, '') AS datname,
      l.mode,
      count(*) AS waiting_count
    FROM pg_locks l
    LEFT JOIN pg_database d ON d.oid = l.database
    WHERE NOT l.granted
    GROUP BY d.datname, l.mode
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database name"
    - mode:
        usage: "LABEL"
        description: "Lock mode"
    - waiting_count:
        usage: "GAUGE"
        description: "Number of locks waiting"
# --------------------------------------------------------------------------
# Table bloat estimate based on dead tuples.
# Considers only tables with more than 1000 live tuples and reports the 20
# tables with the most dead tuples; dead_tuple_ratio is
# 100 * n_dead_tup / n_live_tup rounded to two decimals.
# --------------------------------------------------------------------------
pg_stat_user_tables_bloat:
  query: |
    SELECT
      schemaname,
      relname,
      n_dead_tup,
      n_live_tup,
      CASE WHEN n_live_tup > 0
        THEN round(100.0 * n_dead_tup / n_live_tup, 2)
        ELSE 0
      END as dead_tuple_ratio
    FROM pg_stat_user_tables
    WHERE n_live_tup > 1000
    ORDER BY n_dead_tup DESC
    LIMIT 20
  metrics:
    - schemaname:
        usage: 'LABEL'
        description: 'Schema name'
    - relname:
        usage: 'LABEL'
        description: 'Table name'
    - n_dead_tup:
        usage: 'GAUGE'
        description: 'Dead tuples'
    - n_live_tup:
        usage: 'GAUGE'
        description: 'Live tuples'
    - dead_tuple_ratio:
        usage: 'GAUGE'
        description: 'Dead tuple percentage'
# --------------------------------------------------------------------------
# Database size in bytes for every database except the two templates.
# NOTE(review): with the exporter's <query name>_<column> naming this would
# surface as pg_database_size_bytes_size_bytes - confirm against /metrics.
# --------------------------------------------------------------------------
pg_database_size_bytes:
  query: |
    SELECT
      datname,
      pg_database_size(datname) as size_bytes
    FROM pg_database
    WHERE datname NOT IN ('template0', 'template1')
  metrics:
    - datname:
        usage: 'LABEL'
        description: 'Database name'
    - size_bytes:
        usage: 'GAUGE'
        description: 'Database size in bytes'
Step 3: 建立 Prometheus Scrape 配置 (15min)
# k8s/monitoring/prometheus-scrape-exporters.yaml
# Scrape jobs to be added to the Prometheus ConfigMap.
# Each job scrapes one static target and overwrites its `instance` label
# with a fixed, human-readable name.

# PostgreSQL exporter
- job_name: postgres-exporter
  static_configs:
    - targets:
        - '192.168.0.188:9187'
  relabel_configs:
    - source_labels:
        - __address__
      target_label: instance
      replacement: postgres-primary

# Redis exporter
- job_name: redis-exporter
  static_configs:
    - targets:
        - '192.168.0.188:9121'
  relabel_configs:
    - source_labels:
        - __address__
      target_label: instance
      replacement: redis-primary
Step 4: 建立告警規則 (30min)
# k8s/monitoring/database-alerts.yaml
# Alerting rules for the PostgreSQL and Redis exporters.
#
# Conventions used below:
# - Metrics produced by the custom queries in postgres-exporter-queries.yaml
#   are referenced as <query name>_<column>, the same form the table-bloat
#   rule already uses (pg_stat_user_tables_bloat_dead_tuple_ratio).
# - `auto_repair` annotations are copied through unchanged; their meaning is
#   defined by whatever consumes these alerts.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  labels:
    app: prometheus
spec:
  groups:
    # =========================================================================
    # PostgreSQL alerts
    # =========================================================================
    - name: postgresql
      rules:
        # Connection slots close to the server limit.
        # The limit is read from the exporter's pg_settings_max_connections
        # gauge; PromQL cannot run SQL, so the limit has to come from a metric.
        # group_left keeps the per-database `datname` label on the result.
        # NOTE(review): only state="active" sessions are counted, as before;
        # idle sessions also occupy connection slots - confirm this is intended.
        - alert: PostgreSQLConnectionPoolNearLimit
          expr: |
            sum by (instance, datname) (pg_stat_activity_count{state="active"})
              / on (instance) group_left
            pg_settings_max_connections
              > 0.8
          for: 5m
          labels:
            severity: warning
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 連接池使用率 > 80%"
            description: "Database {{ $labels.datname }} 連接使用率: {{ $value | humanizePercentage }}"
            auto_repair: "analyze_connection_leak"

        # Connection slots almost exhausted.
        # Uses the real max_connections value rather than an assumed 100.
        - alert: PostgreSQLConnectionPoolExhausted
          expr: |
            sum by (instance, datname) (pg_stat_activity_count{state="active"})
              / on (instance) group_left
            pg_settings_max_connections
              > 0.95
          for: 2m
          labels:
            severity: critical
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 連接池即將耗盡"
            description: "Database {{ $labels.datname }} 連接使用率 > 95%"
            auto_repair: "restart_api_pods"

        # Too many slow queries (custom query pg_slow_queries,
        # column slow_query_count).
        - alert: PostgreSQLSlowQueries
          expr: pg_slow_queries_slow_query_count > 5
          for: 5m
          labels:
            severity: warning
            service: postgres
            owner: backend-team
          annotations:
            summary: "PostgreSQL 慢查詢數量過多"
            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢"
            auto_repair: "analyze_slow_queries"

        # Too many lock waiters (custom query pg_locks_waiting,
        # column waiting_count), summed over all databases and lock modes.
        - alert: PostgreSQLLockWaiting
          expr: sum(pg_locks_waiting_waiting_count) > 10
          for: 2m
          labels:
            severity: warning
            service: postgres
            owner: backend-team
          annotations:
            summary: "PostgreSQL 鎖等待過多"
            description: "{{ $value }} 個查詢正在等待鎖"

        # Table bloat: dead-tuple percentage above 20 for 30 minutes.
        - alert: PostgreSQLTableBloat
          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
          for: 30m
          labels:
            severity: warning
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 表膨脹"
            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"
            auto_repair: "schedule_vacuum"

        # The exporter cannot reach PostgreSQL.
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 無法連線"
            auto_repair: "restart_postgres_container"

    # =========================================================================
    # Redis alerts
    # =========================================================================
    - name: redis
      rules:
        # Memory usage above 85% of maxmemory.
        # `(redis_memory_max_bytes > 0)` drops instances without a memory
        # limit; dividing by 0 would yield +Inf and fire the alert permanently.
        - alert: RedisMemoryHigh
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 記憶體使用 > 85%"
            description: "Redis 記憶體: {{ $value | humanizePercentage }}"
            auto_repair: "analyze_redis_keys"

        # Memory usage above 95% of maxmemory (same zero-limit guard).
        - alert: RedisMemoryCritical
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
          for: 2m
          labels:
            severity: critical
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 記憶體即將耗盡"
            description: "Redis 記憶體使用 > 95%"
            auto_repair: "flush_expired_keys"

        # Cache hit rate below 80% over the last 5 minutes.
        - alert: RedisCacheHitRateLow
          expr: |
            rate(redis_keyspace_hits_total[5m])
              /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
              < 0.8
          for: 15m
          labels:
            severity: warning
            service: redis
            owner: backend-team
          annotations:
            summary: "Redis 快取命中率 < 80%"
            description: "命中率: {{ $value | humanizePercentage }}"

        # Too many connected clients.
        - alert: RedisConnectionsHigh
          expr: redis_connected_clients > 500
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 連接數過高"
            description: "連接數: {{ $value }}"

        # Keys are being evicted at more than 100 per second.
        - alert: RedisEvictedKeys
          expr: rate(redis_evicted_keys_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: backend-team
          annotations:
            summary: "Redis Key 驅逐頻繁"
            description: "每秒驅逐 {{ $value }} 個 key"
            auto_repair: "increase_redis_memory"

        # The exporter cannot reach Redis.
        - alert: RedisDown
          expr: redis_up == 0
          for: 1m
          labels:
            severity: critical
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 無法連線"
            auto_repair: "restart_redis_container"
Step 5: 部署腳本 (15min)
#!/bin/bash
# ops/monitoring/deploy-exporters.sh
# Deploy the database exporters to 192.168.0.188, verify that both metrics
# endpoints answer, then apply the Prometheus scrape config and alert rules.
#
# Requirements visible from this script: ssh/scp access to $HOST, Docker
# Compose on $HOST, and a kubectl context that can write to the `monitoring`
# namespace. Run from the repository root (paths are relative).
set -euo pipefail

HOST="192.168.0.188"
DEPLOY_DIR="/opt/monitoring/exporters"

# Fetch a metrics URL and print its first five lines.
# - The body is captured first and `head` reads it from a here-string, so an
#   early `head` exit can never break a pipe and trip `pipefail`.
# - `-f` makes HTTP errors fail the script; the retry flags replace a fixed
#   sleep and wait for the exporter to start listening.
check_metrics() {
  local url="$1"
  local body
  body="$(curl -fsS --max-time 10 --retry 10 --retry-delay 2 --retry-connrefused "$url")"
  head -n 5 <<<"$body"
}

echo "=== 部署資料庫 Exporter ==="

# 1. Create the target directory
ssh "$HOST" "mkdir -p '$DEPLOY_DIR'"

# 2. Copy the configuration
scp ops/monitoring/docker-compose.exporters.yaml "$HOST:$DEPLOY_DIR/docker-compose.yaml"
scp ops/monitoring/postgres-exporter-queries.yaml "$HOST:$DEPLOY_DIR/"

# 3. Start the exporters. Compose reads variables from $DEPLOY_DIR/.env;
#    this script never uploads that file, so warn when it is absent.
if ! ssh "$HOST" "test -f '$DEPLOY_DIR/.env'"; then
  echo "WARNING: $HOST:$DEPLOY_DIR/.env not found; POSTGRES_PASSWORD / REDIS_PASSWORD must come from the remote environment" >&2
fi
ssh "$HOST" "cd '$DEPLOY_DIR' && docker compose up -d"

# 4. Verify both endpoints
echo "等待服務啟動..."
echo "驗證 PostgreSQL Exporter..."
check_metrics "http://$HOST:9187/metrics"
echo "驗證 Redis Exporter..."
check_metrics "http://$HOST:9121/metrics"

# 5. Update the Prometheus scrape configuration
# NOTE(review): kubectl apply only accepts Kubernetes manifests. If this file
# is a bare list of scrape jobs rather than a ConfigMap (or similar object),
# this step fails and `set -e` stops the script before the rules are applied
# - confirm the file's actual shape.
echo "更新 Prometheus scrape 配置..."
kubectl apply -f k8s/monitoring/prometheus-scrape-exporters.yaml

# 6. Deploy the alert rules
echo "部署告警規則..."
kubectl apply -f k8s/monitoring/database-alerts.yaml

# 7. Restart Prometheus so it picks up the new configuration
kubectl rollout restart deployment/prometheus -n monitoring

echo "=== 部署完成 ==="
echo "PostgreSQL Exporter: http://$HOST:9187/metrics"
echo "Redis Exporter: http://$HOST:9121/metrics"
Phase B-2: 驗證清單 (30min)
驗證 Prometheus Targets
# List every active Prometheus target whose job name contains "exporter",
# together with its health, to confirm the targets are UP.
curl -s http://192.168.0.120:30090/api/v1/targets \
  | jq '.data.activeTargets[]
        | select(.labels.job | contains("exporter"))
        | {job: .labels.job, health: .health}'
預期輸出:
{"job": "postgres-exporter", "health": "up"}
{"job": "redis-exporter", "health": "up"}
驗證關鍵指標
# PostgreSQL connection counts, read straight from the exporter
curl -s http://192.168.0.188:9187/metrics \
  | grep pg_stat_activity_count

# Redis memory usage, read straight from the exporter
curl -s http://192.168.0.188:9121/metrics \
  | grep redis_memory_used_bytes
觸發測試告警
# Simulate connection-pool pressure to trigger PostgreSQLConnectionPoolNearLimit.
# - -c 85: strictly above the 80% threshold, assuming max_connections = 100
#   (TODO confirm the server's actual setting and scale accordingly).
# - -T 420: the load must outlast the rule's `for: 5m`, otherwise the alert
#   only reaches "pending" and never fires.
# - -n -f ...: each client repeatedly runs SELECT pg_sleep(1), which keeps the
#   sessions active without needing the pgbench_* tables created by
#   `pgbench -i` and without writing benchmark data into the awoooi database.
pgbench -n -c 85 -j 4 -T 420 -f <(echo 'SELECT pg_sleep(1);') -h 192.168.0.188 -U postgres awoooi
交付物清單
| 檔案 | 狀態 | 說明 |
|---|---|---|
| `ops/monitoring/docker-compose.exporters.yaml` | 🆕 | Exporter 容器配置 |
| `ops/monitoring/postgres-exporter-queries.yaml` | 🆕 | 自訂 PG 查詢 |
| `k8s/monitoring/prometheus-scrape-exporters.yaml` | 🆕 | Scrape 配置 |
| `k8s/monitoring/database-alerts.yaml` | 🆕 | 告警規則 |
| `ops/monitoring/deploy-exporters.sh` | 🆕 | 部署腳本 |
預估總工時: 3h 部署位置: 192.168.0.188 依賴: Docker Compose, 現有 PostgreSQL/Redis