- alerts-unified.yml:
  - SentryClickHouseMemoryPressure: usage_bytes → working_set_bytes,0.8 → 0.85
  - GiteaMemoryPressure: 同步修正(同樣 page cache 虛高根因)
- ops/monitoring/tests/clickhouse_memory_test.yml: promtool 4 cases
- 04-awoooi-devops-commander.md v2.8: Prometheus 指標選擇規範 + Gitea HMAC Webhook 規範
- LOGBOOK: 記錄 T0 五大並行任務(A 按鈕 / B ClickHouse / C Gitea webhook / D ElephantAlpha / F Code review)

鐵證: 2026-04-23 23:13 sentry-clickhouse usage_bytes=88.5% vs working_set=7.8%
根因: container_memory_usage_bytes 含 OS page cache,OOM killer 不視為壓力
修法: 改用 K8s/cadvisor 認可的 working_set_bytes (RSS + active cache),閾值 0.85

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
87 lines
4.9 KiB
YAML
87 lines
4.9 KiB
YAML
# promtool unit tests for the SentryClickHouseMemoryPressure alert rule.
# 2026-04-25 ogt + Claude Opus 4.7

# Rule file that defines the alert under test.
rule_files:
- ../alerts-unified.yml

# Rules are evaluated once per minute of simulated time.
evaluation_interval: 1m

# One list entry per scenario.
tests:
# ---- Negative case 1: page cache is high while the working set is low ----
# This is the original false-positive scenario; the corrected rule must stay
# silent.
- interval: 1m
  name: "page cache spike must NOT alert (the original false-positive scenario)"
  input_series:
  # Working set: ~411 MiB of an 8 GiB limit (~5%), i.e. healthy.
  # 'x13' is promtool expanding notation: the value followed by 13 repeats,
  # giving 14 samples (0m through 13m).
  - series: 'container_memory_working_set_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '430917632x13'
  # Usage including page cache: 7.5 GiB of 8 GiB (93.75%). A rule written
  # against this metric would fire falsely.
  - series: 'container_memory_usage_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '8053063680x13'
  # Memory limit: 8 GiB.
  - series: 'container_spec_memory_limit_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '8589934592x13'
  alert_rule_test:
  - eval_time: 12m
    alertname: SentryClickHouseMemoryPressure
    # No alert is expected, so the list is left empty.
    exp_alerts: []
# ---- Negative case 2: working set elevated but still below 85% ----
# The rule must not fire.
- interval: 1m
  name: "working_set 80% must NOT alert (below 85% threshold)"
  input_series:
  # Working set: 6.4 GiB of an 8 GiB limit = 80%, under the 85% threshold.
  # 'x13' is promtool expanding notation: the value followed by 13 repeats,
  # giving 14 samples (0m through 13m).
  - series: 'container_memory_working_set_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '6871947673x13'
  # Usage mirrors the working set in this scenario.
  - series: 'container_memory_usage_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '6871947673x13'
  # Memory limit: 8 GiB.
  - series: 'container_spec_memory_limit_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '8589934592x13'
  alert_rule_test:
  - eval_time: 12m
    alertname: SentryClickHouseMemoryPressure
    # Nothing may be firing.
    exp_alerts: []
# ---- Positive case 1: working set above 85% for more than 10 minutes ----
# The alert must fire.
- interval: 1m
  # The percentage in the name matches the samples below (86.7%, just above
  # the 85% threshold) and the expected description annotation.
  name: "working_set 86.7% sustained 10m MUST alert (real memory pressure)"
  input_series:
  # Working set: 7449424589 bytes (~6.94 GiB) of an 8 GiB limit = 86.72%,
  # held constant for the whole run. 'x14' is promtool expanding notation:
  # the value followed by 14 repeats, giving 15 samples (0m through 14m).
  - series: 'container_memory_working_set_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '7449424589x14'
  # Usage mirrors the working set in this scenario.
  - series: 'container_memory_usage_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '7449424589x14'
  # Memory limit: 8 GiB.
  - series: 'container_spec_memory_limit_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '8589934592x14'
  alert_rule_test:
  # Evaluated at 12m, i.e. after the condition has held for over 10 minutes.
  - eval_time: 12m
    alertname: SentryClickHouseMemoryPressure
    exp_alerts:
    - exp_labels:
        alertname: SentryClickHouseMemoryPressure
        alert_category: infrastructure
        # Quoted so the label value stays the string "false", not a boolean.
        auto_repair: "false"
        component: sentry-clickhouse
        name: sentry-self-hosted-clickhouse-1
        notification_type: TYPE-1
        severity: warning
        team: platform
      exp_annotations:
        summary: "Sentry ClickHouse 工作集記憶體 > 85% limit"
        description: "sentry clickhouse working_set / mem_limit = 86.72% (排除 page cache)。"
        runbook: "檢查 Sentry 查詢壓力;確認非 page cache 假象;必要時調整 /opt/sentry/docker-compose.override.yml clickhouse mem_limit"
# ---- Negative case 3: spike shorter than 10 minutes ----
# The working set exceeds the threshold only briefly, so the rule's `for: 10m`
# hold must keep the alert from firing.
- interval: 1m
  # The percentage in the name matches the spike samples below (90%).
  name: "working_set 90% spike for only 5m must NOT alert (for:10m guard)"
  input_series:
  # 90% of the limit for the first five samples (0m-4m), then back to ~5%
  # for the remaining seven (5m-11m). 'AxN' is promtool expanding notation:
  # the value A followed by N repeats.
  - series: 'container_memory_working_set_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '7730941132x4 430917632x6'
  # Usage mirrors the working set in this scenario.
  - series: 'container_memory_usage_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '7730941132x4 430917632x6'
  # Memory limit: 8 GiB.
  - series: 'container_spec_memory_limit_bytes{name="sentry-self-hosted-clickhouse-1"}'
    values: '8589934592x12'
  alert_rule_test:
  # Mid-spike: the working set is above the threshold but has been so for
  # only 4 minutes, so the alert may be pending but must not be firing.
  # exp_alerts lists firing alerts only. Unlike the 11m check, this one fails
  # if the `for:` hold is removed or shortened to 4m or less.
  - eval_time: 4m
    alertname: SentryClickHouseMemoryPressure
    exp_alerts: []
  # After the working set has dropped back, nothing may be firing either.
  - eval_time: 11m
    alertname: SentryClickHouseMemoryPressure
    exp_alerts: []