Phase A: AnomalyCounter 服務 (4h) - Redis Sorted Set 滑動窗口計數 - 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX) - Tier 決策邏輯整合 Phase B: Database Exporters (3h) - pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控 - redis_exporter: 記憶體/命中率/驅逐監控 - 15+ 告警規則 Phase C: Incident 頻率欄位 (2h) - IncidentFrequencyStats 模型 - 告警聚合邏輯 (10 分鐘窗口) - 前端頻率顯示 Phase D: Sentry Comment 回寫 (1h) - 完成 TODO 實作 - Sentry API Token 配置 Phase E: SignOz 告警規則 (2h) - Error Rate / Latency 告警 - Trace 異常檢測 - SignOz Webhook Handler Phase F: Alert Chain E2E (2h) - Smoke Test 腳本 - CD Pipeline 整合 - 鏈路監控告警 Phase G: Learning Service (3h) - 修復效果學習 - 成功率計算 - Playbook 自動更新 總工時: 17h (2-3 天) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
523 lines
14 KiB
Markdown
523 lines
14 KiB
Markdown
# 資料庫 Exporter 部署實施步驟
|
|
|
|
> **優先級**: P0
|
|
> **預估工時**: 3h
|
|
> **目標**: PostgreSQL 與 Redis 完整監控覆蓋
|
|
|
|
---
|
|
|
|
## 現狀分析
|
|
|
|
| 服務 | 當前監控 | 缺失指標 |
|
|
|------|---------|---------|
|
|
| PostgreSQL | ❌ 零 | 連接數、慢查詢、鎖等待、複製延遲 |
|
|
| Redis | ❌ 零 | 記憶體使用、命中率、命令延遲、驅逐率 |
|
|
|
|
---
|
|
|
|
## Phase B-1: PostgreSQL Exporter (1.5h)
|
|
|
|
### Step 1: 建立 Docker Compose 配置 (15min)
|
|
|
|
```yaml
|
|
# ops/monitoring/docker-compose.exporters.yaml
|
|
# 2026-03-29 ogt: 資料庫監控 Exporter
|
|
# 部署位置: 192.168.0.188 (pg 主機)
|
|
|
|
version: '3.8'
|
|
|
|
services:
|
|
# ==========================================================================
|
|
# PostgreSQL Exporter
|
|
# ==========================================================================
|
|
postgres-exporter:
|
|
image: prometheuscommunity/postgres-exporter:v0.15.0
|
|
container_name: postgres-exporter
|
|
restart: unless-stopped
|
|
ports:
|
|
- "9187:9187"
|
|
environment:
|
|
DATA_SOURCE_NAME: "postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/awoooi?sslmode=disable"
|
|
PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml"
|
|
volumes:
|
|
- ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro
|
|
networks:
|
|
- monitoring
|
|
depends_on:
|
|
- postgres
|
|
labels:
|
|
- "prometheus.scrape=true"
|
|
- "prometheus.port=9187"
|
|
|
|
# ==========================================================================
|
|
# Redis Exporter
|
|
# ==========================================================================
|
|
redis-exporter:
|
|
image: oliver006/redis_exporter:v1.58.0
|
|
container_name: redis-exporter
|
|
restart: unless-stopped
|
|
ports:
|
|
- "9121:9121"
|
|
environment:
|
|
REDIS_ADDR: "redis://redis:6379"
|
|
REDIS_PASSWORD: "${REDIS_PASSWORD}"
|
|
networks:
|
|
- monitoring
|
|
depends_on:
|
|
- redis
|
|
labels:
|
|
- "prometheus.scrape=true"
|
|
- "prometheus.port=9121"
|
|
|
|
networks:
|
|
monitoring:
|
|
external: true
|
|
```
|
|
|
|
### Step 2: 自訂 PostgreSQL 查詢 (15min)
|
|
|
|
```yaml
|
|
# ops/monitoring/postgres-exporter-queries.yaml
|
|
# 自訂查詢 - 擴展預設指標
|
|
|
|
# ==========================================================================
|
|
# 連接池監控
|
|
# ==========================================================================
|
|
pg_stat_activity_count:
|
|
query: |
|
|
SELECT
|
|
datname,
|
|
state,
|
|
count(*) as count
|
|
FROM pg_stat_activity
|
|
WHERE datname IS NOT NULL
|
|
GROUP BY datname, state
|
|
metrics:
|
|
- datname:
|
|
usage: "LABEL"
|
|
description: "Database name"
|
|
- state:
|
|
usage: "LABEL"
|
|
description: "Connection state"
|
|
- count:
|
|
usage: "GAUGE"
|
|
description: "Number of connections"
|
|
|
|
# ==========================================================================
|
|
# 慢查詢監控 (> 1 秒)
|
|
# ==========================================================================
|
|
pg_slow_queries:
|
|
query: |
|
|
SELECT
|
|
datname,
|
|
usename,
|
|
count(*) as slow_query_count
|
|
FROM pg_stat_activity
|
|
WHERE state = 'active'
|
|
AND query_start < now() - interval '1 second'
|
|
AND query NOT LIKE 'SELECT pg_%'
|
|
GROUP BY datname, usename
|
|
metrics:
|
|
- datname:
|
|
usage: "LABEL"
|
|
description: "Database name"
|
|
- usename:
|
|
usage: "LABEL"
|
|
description: "User name"
|
|
- slow_query_count:
|
|
usage: "GAUGE"
|
|
description: "Number of slow queries (> 1s)"
|
|
|
|
# ==========================================================================
|
|
# 鎖等待監控
|
|
# ==========================================================================
|
|
pg_locks_waiting:
|
|
query: |
|
|
SELECT
|
|
datname,
|
|
mode,
|
|
count(*) as waiting_count
|
|
FROM pg_locks
|
|
WHERE NOT granted
|
|
GROUP BY datname, mode
|
|
metrics:
|
|
- datname:
|
|
usage: "LABEL"
|
|
description: "Database name"
|
|
- mode:
|
|
usage: "LABEL"
|
|
description: "Lock mode"
|
|
- waiting_count:
|
|
usage: "GAUGE"
|
|
description: "Number of locks waiting"
|
|
|
|
# ==========================================================================
|
|
# 表膨脹估算 (Dead Tuples)
|
|
# ==========================================================================
|
|
pg_stat_user_tables_bloat:
|
|
query: |
|
|
SELECT
|
|
schemaname,
|
|
relname,
|
|
n_dead_tup,
|
|
n_live_tup,
|
|
CASE WHEN n_live_tup > 0
|
|
THEN round(100.0 * n_dead_tup / n_live_tup, 2)
|
|
ELSE 0
|
|
END as dead_tuple_ratio
|
|
FROM pg_stat_user_tables
|
|
WHERE n_live_tup > 1000
|
|
ORDER BY n_dead_tup DESC
|
|
LIMIT 20
|
|
metrics:
|
|
- schemaname:
|
|
usage: "LABEL"
|
|
description: "Schema name"
|
|
- relname:
|
|
usage: "LABEL"
|
|
description: "Table name"
|
|
- n_dead_tup:
|
|
usage: "GAUGE"
|
|
description: "Dead tuples"
|
|
- n_live_tup:
|
|
usage: "GAUGE"
|
|
description: "Live tuples"
|
|
- dead_tuple_ratio:
|
|
usage: "GAUGE"
|
|
description: "Dead tuple percentage"
|
|
|
|
# ==========================================================================
|
|
# 資料庫大小
|
|
# ==========================================================================
|
|
pg_database_size_bytes:
|
|
query: |
|
|
SELECT
|
|
datname,
|
|
pg_database_size(datname) as size_bytes
|
|
FROM pg_database
|
|
WHERE datname NOT IN ('template0', 'template1')
|
|
metrics:
|
|
- datname:
|
|
usage: "LABEL"
|
|
description: "Database name"
|
|
- size_bytes:
|
|
usage: "GAUGE"
|
|
description: "Database size in bytes"
|
|
```
|
|
|
|
### Step 3: 建立 Prometheus Scrape 配置 (15min)
|
|
|
|
```yaml
|
|
# k8s/monitoring/prometheus-scrape-exporters.yaml
|
|
# 新增到 Prometheus ConfigMap
|
|
|
|
# PostgreSQL Exporter
|
|
- job_name: 'postgres-exporter'
|
|
static_configs:
|
|
- targets: ['192.168.0.188:9187']
|
|
relabel_configs:
|
|
- source_labels: [__address__]
|
|
target_label: instance
|
|
replacement: 'postgres-primary'
|
|
|
|
# Redis Exporter
|
|
- job_name: 'redis-exporter'
|
|
static_configs:
|
|
- targets: ['192.168.0.188:9121']
|
|
relabel_configs:
|
|
- source_labels: [__address__]
|
|
target_label: instance
|
|
replacement: 'redis-primary'
|
|
```
|
|
|
|
### Step 4: 建立告警規則 (30min)
|
|
|
|
```yaml
|
|
# k8s/monitoring/database-alerts.yaml
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: PrometheusRule
|
|
metadata:
|
|
name: database-alerts
|
|
namespace: monitoring
|
|
labels:
|
|
app: prometheus
|
|
spec:
|
|
groups:
|
|
# =========================================================================
|
|
# PostgreSQL 告警
|
|
# =========================================================================
|
|
- name: postgresql
|
|
rules:
|
|
# 連接池即將耗盡
|
|
- alert: PostgreSQLConnectionPoolNearLimit
|
|
expr: |
|
|
sum(pg_stat_activity_count{state="active"}) by (datname)
|
|
/
|
|
(SELECT setting::int FROM pg_settings WHERE name = 'max_connections')
|
|
> 0.8
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: postgres
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "PostgreSQL 連接池使用率 > 80%"
|
|
description: "Database {{ $labels.datname }} 連接使用率: {{ $value | humanizePercentage }}"
|
|
auto_repair: "analyze_connection_leak"
|
|
|
|
# 連接池耗盡
|
|
- alert: PostgreSQLConnectionPoolExhausted
|
|
expr: |
|
|
sum(pg_stat_activity_count{state="active"}) by (datname)
|
|
/
|
|
100 # 假設 max_connections = 100
|
|
> 0.95
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
service: postgres
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "PostgreSQL 連接池即將耗盡"
|
|
description: "Database {{ $labels.datname }} 連接使用率 > 95%"
|
|
auto_repair: "restart_api_pods"
|
|
|
|
# 慢查詢告警
|
|
- alert: PostgreSQLSlowQueries
|
|
expr: pg_slow_queries > 5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: postgres
|
|
owner: backend-team
|
|
annotations:
|
|
summary: "PostgreSQL 慢查詢數量過多"
|
|
description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢"
|
|
auto_repair: "analyze_slow_queries"
|
|
|
|
# 鎖等待告警
|
|
- alert: PostgreSQLLockWaiting
|
|
expr: sum(pg_locks_waiting) > 10
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
service: postgres
|
|
owner: backend-team
|
|
annotations:
|
|
summary: "PostgreSQL 鎖等待過多"
|
|
description: "{{ $value }} 個查詢正在等待鎖"
|
|
|
|
# 表膨脹告警
|
|
- alert: PostgreSQLTableBloat
|
|
expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
|
|
for: 30m
|
|
labels:
|
|
severity: warning
|
|
service: postgres
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "PostgreSQL 表膨脹"
|
|
description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"
|
|
auto_repair: "schedule_vacuum"
|
|
|
|
# PostgreSQL Down
|
|
- alert: PostgreSQLDown
|
|
expr: pg_up == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
service: postgres
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "PostgreSQL 無法連線"
|
|
auto_repair: "restart_postgres_container"
|
|
|
|
# =========================================================================
|
|
# Redis 告警
|
|
# =========================================================================
|
|
- name: redis
|
|
rules:
|
|
# 記憶體使用過高
|
|
- alert: RedisMemoryHigh
|
|
expr: |
|
|
redis_memory_used_bytes / redis_memory_max_bytes > 0.85
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: redis
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "Redis 記憶體使用 > 85%"
|
|
description: "Redis 記憶體: {{ $value | humanizePercentage }}"
|
|
auto_repair: "analyze_redis_keys"
|
|
|
|
# 記憶體即將耗盡
|
|
- alert: RedisMemoryCritical
|
|
expr: |
|
|
redis_memory_used_bytes / redis_memory_max_bytes > 0.95
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
service: redis
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "Redis 記憶體即將耗盡"
|
|
description: "Redis 記憶體使用 > 95%"
|
|
auto_repair: "flush_expired_keys"
|
|
|
|
# 快取命中率過低
|
|
- alert: RedisCacheHitRateLow
|
|
expr: |
|
|
rate(redis_keyspace_hits_total[5m])
|
|
/
|
|
(rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
|
|
< 0.8
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
service: redis
|
|
owner: backend-team
|
|
annotations:
|
|
summary: "Redis 快取命中率 < 80%"
|
|
description: "命中率: {{ $value | humanizePercentage }}"
|
|
|
|
# 連接數過高
|
|
- alert: RedisConnectionsHigh
|
|
expr: redis_connected_clients > 500
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: redis
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "Redis 連接數過高"
|
|
description: "連接數: {{ $value }}"
|
|
|
|
# Key 驅逐告警
|
|
- alert: RedisEvictedKeys
|
|
expr: rate(redis_evicted_keys_total[5m]) > 100
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: redis
|
|
owner: backend-team
|
|
annotations:
|
|
summary: "Redis Key 驅逐頻繁"
|
|
description: "每秒驅逐 {{ $value }} 個 key"
|
|
auto_repair: "increase_redis_memory"
|
|
|
|
# Redis Down
|
|
- alert: RedisDown
|
|
expr: redis_up == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
service: redis
|
|
owner: infra-team
|
|
annotations:
|
|
summary: "Redis 無法連線"
|
|
auto_repair: "restart_redis_container"
|
|
```
|
|
|
|
### Step 5: 部署腳本 (15min)
|
|
|
|
```bash
|
|
#!/bin/bash
|
|
# ops/monitoring/deploy-exporters.sh
|
|
# 部署資料庫 Exporter 到 192.168.0.188
|
|
|
|
set -euo pipefail
|
|
|
|
HOST="192.168.0.188"
|
|
DEPLOY_DIR="/opt/monitoring/exporters"
|
|
|
|
echo "=== 部署資料庫 Exporter ==="
|
|
|
|
# 1. 建立目錄
|
|
ssh $HOST "mkdir -p $DEPLOY_DIR"
|
|
|
|
# 2. 複製配置
|
|
scp ops/monitoring/docker-compose.exporters.yaml $HOST:$DEPLOY_DIR/docker-compose.yaml
|
|
scp ops/monitoring/postgres-exporter-queries.yaml $HOST:$DEPLOY_DIR/
|
|
|
|
# 3. 載入環境變數 (從 .env)
|
|
ssh $HOST "cd $DEPLOY_DIR && docker compose up -d"
|
|
|
|
# 4. 驗證
|
|
echo "等待服務啟動..."
|
|
sleep 10
|
|
|
|
echo "驗證 PostgreSQL Exporter..."
|
|
curl -s http://$HOST:9187/metrics | head -5
|
|
|
|
echo "驗證 Redis Exporter..."
|
|
curl -s http://$HOST:9121/metrics | head -5
|
|
|
|
# 5. 更新 Prometheus 配置
|
|
echo "更新 Prometheus scrape 配置..."
|
|
kubectl apply -f k8s/monitoring/prometheus-scrape-exporters.yaml
|
|
|
|
# 6. 部署告警規則
|
|
echo "部署告警規則..."
|
|
kubectl apply -f k8s/monitoring/database-alerts.yaml
|
|
|
|
# 7. 重載 Prometheus
|
|
kubectl rollout restart deployment/prometheus -n monitoring
|
|
|
|
echo "=== 部署完成 ==="
|
|
echo "PostgreSQL Exporter: http://$HOST:9187/metrics"
|
|
echo "Redis Exporter: http://$HOST:9121/metrics"
|
|
```
|
|
|
|
---
|
|
|
|
## Phase B-2: 驗證清單 (30min)
|
|
|
|
### 驗證 Prometheus Targets
|
|
|
|
```bash
|
|
# 檢查 targets 是否 UP
|
|
curl -s http://192.168.0.120:30090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job | contains("exporter")) | {job: .labels.job, health: .health}'
|
|
```
|
|
|
|
預期輸出:
|
|
```json
|
|
{"job": "postgres-exporter", "health": "up"}
|
|
{"job": "redis-exporter", "health": "up"}
|
|
```
|
|
|
|
### 驗證關鍵指標
|
|
|
|
```bash
|
|
# PostgreSQL 連接數
|
|
curl -s http://192.168.0.188:9187/metrics | grep pg_stat_activity_count
|
|
|
|
# Redis 記憶體
|
|
curl -s http://192.168.0.188:9121/metrics | grep redis_memory_used_bytes
|
|
```
|
|
|
|
### 觸發測試告警
|
|
|
|
```bash
|
|
# 模擬連接池壓力測試
|
|
pgbench -c 80 -j 4 -T 60 -h 192.168.0.188 -U postgres awoooi
|
|
```
|
|
|
|
---
|
|
|
|
## 交付物清單
|
|
|
|
| 檔案 | 狀態 | 說明 |
|
|
|------|------|------|
|
|
| `ops/monitoring/docker-compose.exporters.yaml` | 🆕 | Exporter 容器配置 |
|
|
| `ops/monitoring/postgres-exporter-queries.yaml` | 🆕 | 自訂 PG 查詢 |
|
|
| `k8s/monitoring/prometheus-scrape-exporters.yaml` | 🆕 | Scrape 配置 |
|
|
| `k8s/monitoring/database-alerts.yaml` | 🆕 | 告警規則 |
|
|
| `ops/monitoring/deploy-exporters.sh` | 🆕 | 部署腳本 |
|
|
|
|
---
|
|
|
|
**預估總工時**: 3h
|
|
**部署位置**: 192.168.0.188
|
|
**依賴**: Docker Compose, 現有 PostgreSQL/Redis
|