Files
awoooi/ops/monitoring/postgres-exporter-queries.yaml
OG T c7f9c119e7 fix(cd): 補提交 ops/monitoring 腳本
遺漏文件導致 CD Monitoring Coverage 步驟失敗

新增:
- generate_monitoring.py - 監控覆蓋率檢查
- coverage_report.py - 覆蓋率報告
- discover_docker.py - Docker 服務發現
- deploy-exporters.sh - Exporter 部署腳本
- postgres-exporter-queries.yaml - PostgreSQL 查詢配置

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 15:45:42 +08:00

216 lines
6.2 KiB
YAML

# =============================================================================
# PostgreSQL Exporter 自訂查詢
# =============================================================================
# 負責人: DevOps Commander
# 版本: v1.0
# 日期: 2026-03-29
# ADR: ADR-037 Phase B
#
# 用途: 擴展預設指標,監控 AWOOOI 特定需求
# =============================================================================
# ==========================================================================
# Connection pool monitoring
# ==========================================================================
# Emits one gauge per (database, connection state) pair, counted from
# pg_stat_activity. Rows without a database name are left out.
pg_stat_activity_count:
  query: |
    SELECT
        act.datname,
        act.state,
        count(*) AS count
    FROM pg_stat_activity AS act
    WHERE act.datname IS NOT NULL
    GROUP BY
        act.datname,
        act.state
  metrics:
    # Label columns identify the series; "count" carries the value.
    - datname:
        usage: "LABEL"
        description: "Database name"
    - state:
        usage: "LABEL"
        description: "Connection state (active, idle, idle in transaction)"
    - count:
        usage: "GAUGE"
        description: "Number of connections in this state"
# ==========================================================================
# Slow query monitoring (> 1 second)
# ==========================================================================
# Counts, per (database, user), the statements that are currently executing
# and started more than one second ago.
#
# Only regular client sessions are considered (backend_type, PostgreSQL 10+).
# pg_stat_activity also lists non-client processes; a WAL sender serving a
# streaming replica can stay in state 'active' with an old query_start and
# would otherwise be counted as a slow query for as long as replication runs.
#
# The two NOT LIKE filters drop statements that look like monitoring queries
# against the pg_* catalogs / pg_stat views.
pg_slow_queries:
  query: |
    SELECT
        datname,
        usename,
        count(*) AS slow_query_count
    FROM pg_stat_activity
    WHERE state = 'active'
      AND backend_type = 'client backend'
      AND query_start < now() - interval '1 second'
      AND query NOT LIKE 'SELECT pg_%'
      AND query NOT LIKE '%pg_stat%'
    GROUP BY datname, usename
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database name"
    - usename:
        usage: "LABEL"
        description: "User name"
    - slow_query_count:
        usage: "GAUGE"
        description: "Number of slow queries (> 1s)"
# ==========================================================================
# Lock wait monitoring
# ==========================================================================
# Counts lock requests that have not been granted yet, grouped by database
# and lock mode. Locks whose database OID has no match in pg_database are
# reported under the database name 'unknown'.
pg_locks_waiting:
  query: |
    SELECT
        COALESCE(db.datname, 'unknown') AS datname,
        lk.mode,
        count(*) AS waiting_count
    FROM pg_locks AS lk
    LEFT JOIN pg_database AS db
        ON db.oid = lk.database
    WHERE lk.granted IS NOT TRUE
    GROUP BY
        db.datname,
        lk.mode
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database name"
    - mode:
        usage: "LABEL"
        description: "Lock mode"
    - waiting_count:
        usage: "GAUGE"
        description: "Number of locks waiting"
# ==========================================================================
# Table bloat estimate (dead tuples)
# ==========================================================================
# Reports the 20 user tables with the most dead tuples, considering only
# tables that hold more than 1000 live tuples.
#
# dead_tuple_ratio is 100 * n_dead_tup / n_live_tup, rounded to two decimals,
# i.e. dead tuples relative to live tuples. The ELSE 0 branch is a division
# guard; the WHERE clause already guarantees n_live_tup > 0.
pg_stat_user_tables_bloat:
  query: |
    SELECT
        tbl.schemaname,
        tbl.relname,
        tbl.n_dead_tup,
        tbl.n_live_tup,
        CASE
            WHEN tbl.n_live_tup > 0
                THEN round(100.0 * tbl.n_dead_tup / tbl.n_live_tup, 2)
            ELSE 0
        END AS dead_tuple_ratio
    FROM pg_stat_user_tables AS tbl
    WHERE tbl.n_live_tup > 1000
    ORDER BY tbl.n_dead_tup DESC
    LIMIT 20
  metrics:
    - schemaname:
        usage: "LABEL"
        description: "Schema name"
    - relname:
        usage: "LABEL"
        description: "Table name"
    - n_dead_tup:
        usage: "GAUGE"
        description: "Dead tuples count"
    - n_live_tup:
        usage: "GAUGE"
        description: "Live tuples count"
    - dead_tuple_ratio:
        usage: "GAUGE"
        description: "Dead tuple percentage"
# ==========================================================================
# Database size
# ==========================================================================
# Reports the on-disk size in bytes of every database listed in pg_database,
# skipping the two template databases.
pg_database_size_bytes:
  query: |
    SELECT
        db.datname,
        pg_database_size(db.datname) AS size_bytes
    FROM pg_database AS db
    WHERE db.datname NOT IN ('template0', 'template1')
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database name"
    - size_bytes:
        usage: "GAUGE"
        description: "Database size in bytes"
# ==========================================================================
# Transaction statistics (AWOOOI specific)
# ==========================================================================
# Exposes the commit and rollback counters from pg_stat_database for the
# 'awoooi' and 'awoooi_prod' databases only.
#
# rollback_ratio is 100 * rollbacks / (commits + rollbacks), rounded to two
# decimals, and 0 when no transaction has been recorded yet. It is derived
# from the same cumulative counters that are exported alongside it.
pg_stat_database_transactions:
  query: |
    SELECT
        sd.datname,
        sd.xact_commit,
        sd.xact_rollback,
        CASE
            WHEN sd.xact_commit + sd.xact_rollback > 0
                THEN round(100.0 * sd.xact_rollback / (sd.xact_commit + sd.xact_rollback), 2)
            ELSE 0
        END AS rollback_ratio
    FROM pg_stat_database AS sd
    WHERE sd.datname IN ('awoooi', 'awoooi_prod')
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database name"
    - xact_commit:
        usage: "COUNTER"
        description: "Committed transactions"
    - xact_rollback:
        usage: "COUNTER"
        description: "Rolled back transactions"
    - rollback_ratio:
        usage: "GAUGE"
        description: "Rollback percentage"
# ==========================================================================
# Replication lag (only relevant when replicas exist)
# ==========================================================================
# Reports, for every replica currently streaming from this server, how far
# replay on the standby trails this server, in seconds.
#
# The value comes from pg_stat_replication.replay_lag, an interval column
# available since PostgreSQL 10. An LSN such as sent_lsn is a WAL byte
# position, not a point in time, so it cannot be cast to a timestamp; an
# expression that tries to do so makes the whole query fail at scrape time.
#
# replay_lag is NULL when the server has no recent lag measurement (for
# example an idle, fully caught-up standby); that case is exported as 0.
#
# client_addr is NULL for connections over a local Unix socket and is
# reported as 'local'.
#
# NOTE(review): "master: true" is an exporter-level option, presumably
# restricting this query to the primary connection - confirm against the
# postgres_exporter documentation.
pg_replication_lag:
  query: |
    SELECT
        COALESCE(client_addr::text, 'local') AS client_addr,
        application_name,
        COALESCE(EXTRACT(EPOCH FROM replay_lag), 0) AS lag_seconds
    FROM pg_stat_replication
    WHERE state = 'streaming'
  master: true
  metrics:
    - client_addr:
        usage: "LABEL"
        description: "Replica address"
    - application_name:
        usage: "LABEL"
        description: "Application name"
    - lag_seconds:
        usage: "GAUGE"
        description: "Replication lag in seconds"
# ==========================================================================
# Longest running query
# ==========================================================================
# Reports the age in seconds of the oldest statement that is still executing,
# together with the database and user it belongs to. Ordering by query_start
# ascending and keeping one row selects the statement that started earliest.
#
# Only regular client sessions are considered (backend_type, PostgreSQL 10+).
# pg_stat_activity also lists non-client processes; a WAL sender serving a
# streaming replica can stay in state 'active' with a very old query_start
# and would otherwise be reported as the longest query indefinitely.
#
# Statements starting with 'SELECT pg_' are skipped so that catalog-style
# monitoring queries do not show up here.
pg_longest_query_seconds:
  query: |
    SELECT
        datname,
        usename,
        EXTRACT(EPOCH FROM (now() - query_start)) AS duration_seconds
    FROM pg_stat_activity
    WHERE state = 'active'
      AND backend_type = 'client backend'
      AND query NOT LIKE 'SELECT pg_%'
      AND query_start IS NOT NULL
    ORDER BY query_start
    LIMIT 1
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database name"
    - usename:
        usage: "LABEL"
        description: "User name"
    - duration_seconds:
        usage: "GAUGE"
        description: "Duration of longest running query in seconds"