遺漏文件導致 CD Monitoring Coverage 步驟失敗 新增: - generate_monitoring.py - 監控覆蓋率檢查 - coverage_report.py - 覆蓋率報告 - discover_docker.py - Docker 服務發現 - deploy-exporters.sh - Exporter 部署腳本 - postgres-exporter-queries.yaml - PostgreSQL 查詢配置 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
216 lines
6.2 KiB
YAML
216 lines
6.2 KiB
YAML
# =============================================================================
|
|
# PostgreSQL Exporter 自訂查詢
|
|
# =============================================================================
|
|
# 負責人: DevOps Commander
|
|
# 版本: v1.0
|
|
# 日期: 2026-03-29
|
|
# ADR: ADR-037 Phase B
|
|
#
|
|
# 用途: 擴展預設指標,監控 AWOOOI 特定需求
|
|
# =============================================================================
|
|
|
|
# ==========================================================================
|
|
# 連接池監控
|
|
# ==========================================================================
|
|
# Connection pool view: number of backends per database and connection state,
# taken from pg_stat_activity. Rows whose datname is NULL are excluded.
pg_stat_activity_count:
  query: |
    SELECT
      datname,
      state,
      COUNT(*) AS count
    FROM pg_stat_activity
    WHERE datname IS NOT NULL
    GROUP BY
      datname,
      state
  metrics:
    # Grouping columns are exposed as labels.
    - datname:
        usage: "LABEL"
        description: "Database name"
    - state:
        usage: "LABEL"
        description: "Connection state (active, idle, idle in transaction)"
    # One sample per (datname, state) pair.
    - count:
        usage: "GAUGE"
        description: "Number of connections in this state"
|
|
|
|
# ==========================================================================
|
|
# 慢查詢監控 (> 1 秒)
|
|
# ==========================================================================
|
|
# Slow query count: active backends whose current query started more than one
# second ago, grouped by database and user. Statements that begin with
# 'SELECT pg_' or that mention pg_stat anywhere are filtered out by the two
# LIKE predicates.
pg_slow_queries:
  query: |
    SELECT
      datname,
      usename,
      COUNT(*) AS slow_query_count
    FROM pg_stat_activity
    WHERE state = 'active'
      AND query_start < NOW() - INTERVAL '1 second'
      AND query NOT LIKE 'SELECT pg_%'
      AND query NOT LIKE '%pg_stat%'
    GROUP BY
      datname,
      usename
  metrics:
    # Grouping columns are exposed as labels.
    - datname:
        usage: "LABEL"
        description: "Database name"
    - usename:
        usage: "LABEL"
        description: "User name"
    # No row is returned for a (datname, usename) pair with no matching query.
    - slow_query_count:
        usage: "GAUGE"
        description: "Number of slow queries (> 1s)"
|
|
|
|
# ==========================================================================
|
|
# 鎖等待監控
|
|
# ==========================================================================
|
|
# Lock wait view: number of lock requests that have not been granted, grouped
# by database and lock mode. Locks whose database OID has no match in
# pg_database are reported under the datname 'unknown'.
pg_locks_waiting:
  query: |
    SELECT
      COALESCE(d.datname, 'unknown') AS datname,
      l.mode,
      COUNT(*) AS waiting_count
    FROM pg_locks AS l
    LEFT JOIN pg_database AS d
      ON d.oid = l.database
    WHERE NOT l.granted
    GROUP BY
      d.datname,
      l.mode
  metrics:
    # Grouping columns are exposed as labels.
    - datname:
        usage: "LABEL"
        description: "Database name"
    - mode:
        usage: "LABEL"
        description: "Lock mode"
    # One sample per (datname, mode) pair that has at least one waiter.
    - waiting_count:
        usage: "GAUGE"
        description: "Number of locks waiting"
|
|
|
|
# ==========================================================================
|
|
# 表膨脹估算 (Dead Tuples)
|
|
# ==========================================================================
|
|
# Table bloat estimate based on dead tuples. Only tables with more than 1000
# live tuples are considered, and only the 20 tables with the most dead tuples
# are returned. dead_tuple_ratio is dead tuples as a percentage of live tuples
# (100 * n_dead_tup / n_live_tup), rounded to two decimals.
pg_stat_user_tables_bloat:
  query: |
    SELECT
      schemaname,
      relname,
      n_dead_tup,
      n_live_tup,
      CASE
        WHEN n_live_tup > 0
          THEN ROUND(100.0 * n_dead_tup / n_live_tup, 2)
        ELSE 0
      END AS dead_tuple_ratio
    FROM pg_stat_user_tables
    WHERE n_live_tup > 1000
    ORDER BY n_dead_tup DESC
    LIMIT 20
  metrics:
    # Table identity is exposed as labels.
    - schemaname:
        usage: "LABEL"
        description: "Schema name"
    - relname:
        usage: "LABEL"
        description: "Table name"
    # Raw tuple counters as reported by pg_stat_user_tables.
    - n_dead_tup:
        usage: "GAUGE"
        description: "Dead tuples count"
    - n_live_tup:
        usage: "GAUGE"
        description: "Live tuples count"
    # Derived value computed in the query above.
    - dead_tuple_ratio:
        usage: "GAUGE"
        description: "Dead tuple percentage"
|
|
|
|
# ==========================================================================
|
|
# 資料庫大小
|
|
# ==========================================================================
|
|
# On-disk size of every database except the two template databases,
# computed with pg_database_size().
pg_database_size_bytes:
  query: |
    SELECT
      datname,
      pg_database_size(datname) AS size_bytes
    FROM pg_database
    WHERE datname <> 'template0'
      AND datname <> 'template1'
  metrics:
    # The database name is exposed as a label.
    - datname:
        usage: "LABEL"
        description: "Database name"
    # Unit: bytes.
    - size_bytes:
        usage: "GAUGE"
        description: "Database size in bytes"
|
|
|
|
# ==========================================================================
|
|
# 事務統計 (AWOOOI 特定)
|
|
# ==========================================================================
|
|
# Transaction statistics restricted to the databases named 'awoooi' and
# 'awoooi_prod'. rollback_ratio is rollbacks as a percentage of all finished
# transactions (commits + rollbacks), rounded to two decimals; it is 0 when
# no transaction has finished yet.
pg_stat_database_transactions:
  query: |
    SELECT
      datname,
      xact_commit,
      xact_rollback,
      CASE
        WHEN xact_commit + xact_rollback > 0
          THEN ROUND(100.0 * xact_rollback / (xact_commit + xact_rollback), 2)
        ELSE 0
      END AS rollback_ratio
    FROM pg_stat_database
    WHERE datname IN ('awoooi', 'awoooi_prod')
  metrics:
    # The database name is exposed as a label.
    - datname:
        usage: "LABEL"
        description: "Database name"
    # Cumulative totals from pg_stat_database, exported as counters.
    - xact_commit:
        usage: "COUNTER"
        description: "Committed transactions"
    - xact_rollback:
        usage: "COUNTER"
        description: "Rolled back transactions"
    # Derived value computed in the query above.
    - rollback_ratio:
        usage: "GAUGE"
        description: "Rollback percentage"
|
|
|
|
# ==========================================================================
|
|
# 複製延遲 (若有 Replica)
|
|
# ==========================================================================
|
|
# Replication lag per streaming standby, in seconds.
#
# Lag is read from replay_lag, the interval column of pg_stat_replication
# (available in PostgreSQL 10 and later). A WAL position such as sent_lsn
# (e.g. 0/3000060) cannot be cast to a timestamp -- PostgreSQL raises
# "invalid input syntax for type timestamp" -- so the lag must come from the
# interval columns rather than from LSN arithmetic.
#
# replay_lag is NULL when no recent lag measurement is available; that case
# is reported as 0 so the series stays present for every streaming standby.
# A NULL client_addr is reported under the label value 'local'.
pg_replication_lag:
  query: |
    SELECT
      COALESCE(client_addr::text, 'local') AS client_addr,
      application_name,
      COALESCE(EXTRACT(EPOCH FROM replay_lag), 0) AS lag_seconds
    FROM pg_stat_replication
    WHERE state = 'streaming'
  # NOTE(review): `master: true` presumably restricts this query to the
  # primary/master connection -- confirm against the exporter version in use.
  master: true
  metrics:
    # Standby identity is exposed as labels.
    - client_addr:
        usage: "LABEL"
        description: "Replica address"
    - application_name:
        usage: "LABEL"
        description: "Application name"
    # Unit: seconds.
    - lag_seconds:
        usage: "GAUGE"
        description: "Replication lag in seconds"
|
|
|
|
# ==========================================================================
|
|
# 最長執行查詢
|
|
# ==========================================================================
|
|
# Longest running query: the single active backend with the earliest
# query_start, together with how long its current query has been running.
# Statements that begin with 'SELECT pg_' are filtered out, as are rows
# without a query_start.
pg_longest_query_seconds:
  query: |
    SELECT
      datname,
      usename,
      EXTRACT(EPOCH FROM (NOW() - query_start)) AS duration_seconds
    FROM pg_stat_activity
    WHERE state = 'active'
      AND query NOT LIKE 'SELECT pg_%'
      AND query_start IS NOT NULL
    ORDER BY query_start ASC
    LIMIT 1
  metrics:
    # Owner of the longest running query is exposed as labels.
    - datname:
        usage: "LABEL"
        description: "Database name"
    - usename:
        usage: "LABEL"
        description: "User name"
    # Unit: seconds.
    - duration_seconds:
        usage: "GAUGE"
        description: "Duration of longest running query in seconds"
|