From d08d1e49518fcf57abce7bda10554adbcba395cc Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 5 May 2026 15:01:31 +0800 Subject: [PATCH] fix(ops): alert on missing docker resource limits --- apps/api/alert_rules.yaml | 1 + docs/LOGBOOK.md | 3 ++- docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md | 12 +++++++----- ops/monitoring/alerts-unified.yml | 17 +++++++++++++++++ ops/monitoring/alerts.yml | 17 +++++++++++++++++ 5 files changed, 44 insertions(+), 6 deletions(-) diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index 1b20f44b..b00f2319 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -174,6 +174,7 @@ rules: - DockerContainerCpuSustainedHigh - DockerContainerCpuRunawayCritical - DockerContainerMemoryLimitPressure + - DockerContainerMissingResourceLimit - DockerContainerRestartSpike - DockerGiteaActionsJobStale response: diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 9be7d2ba..e35099ed 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1608,7 +1608,8 @@ psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks | `3b73cc7` | CD paths 收斂,workflow-only commits 不再觸發完整 image build/deploy | | `7d45f0c` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` | | `5e625f7` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation | -| `pending` | stale job cleanup policy thresholds aligned with workflow/job timeout buffers | +| `72d66e4` | stale job cleanup policy thresholds aligned with workflow/job timeout buffers | +| `pending` | `DockerContainerMissingResourceLimit` alert routing for Compose services missing CPU/memory guardrails | ### 下一步 diff --git a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md index 9c79d074..58cbd1c6 100644 --- a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md +++ b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md @@ -42,6 +42,7 @@ Use these thresholds for alerting and 
AI triage: | Docker container CPU | > 2 cores for 10m | Warning diagnosis; check limit, backlog, and workload type. | | Docker container CPU | > 4 cores for 15m | Critical diagnosis; never generic restart. | | Docker memory / limit | > 85% for 10m | Warning; raise memory or reduce workload, never lower the limit. | +| Docker CPU or memory limit | missing for 30m | Warning; add service-specific caps before the next spike. | | Docker restarts | > 5 in 15m | Critical; pull logs and fix crash signature. | | Systemd runner restarts | > 2 in 15m | Critical; inspect watchdog/drop-ins and active CI jobs. | | Systemd runner WatchdogSec | > 0 for 10m | Warning; GitHub Actions runner should not be killed by systemd watchdog. | @@ -57,11 +58,12 @@ Use these thresholds for alerting and AI triage: 5. For Kafka/Snuba, treat high CPU as backlog digestion unless lag stops decreasing. 6. For monitoring tools, caps are required, but every cap must be paired with self-monitoring. 7. Every Docker Compose host must emit `docker_container_cpu_cores`, `docker_container_memory_*`, and a restart counter via node-exporter textfile. -8. Disable node-exporter collectors that are slow or failing on each host; exporter scrape time is part of the resource baseline. -9. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services. -10. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes. -11. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change. -12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than its workflow/job policy threshold and has no recent useful logs. +8. 
Important Docker services must have both CPU and memory guardrails; missing limit alerts should drive service-specific compose fixes, not generic restarts. +9. Disable node-exporter collectors that are slow or failing on each host; exporter scrape time is part of the resource baseline. +10. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services. +11. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes. +12. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change. +13. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than its workflow/job policy threshold and has no recent useful logs. ## Next Safe Rollout Order diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 1f9f28d9..7f962afb 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -663,6 +663,23 @@ groups: auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'" runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。" + - alert: DockerContainerMissingResourceLimit + # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory. 
+ expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) + for: 30m + labels: + severity: warning + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit" + description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{ `{{.HostConfig.NanoCpus}}` }} Memory={{ `{{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'" # docker --format placeholders are wrapped in Go-template raw strings so Prometheus emits the braces literally instead of evaluating them + runbook: "先盤點 workload,再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafka;momo app/scheduler 可用 2 core/2GiB 起步。" + - alert: DockerGiteaActionsJobStale # 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot. expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200 diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 388dd581..928a00ec 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -669,6 +669,23 @@ groups: auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'" runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。" + - alert: DockerContainerMissingResourceLimit + # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
+ expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) + for: 30m + labels: + severity: warning + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit" + description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{ `{{.HostConfig.NanoCpus}}` }} Memory={{ `{{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'" # docker --format placeholders are wrapped in Go-template raw strings so Prometheus emits the braces literally instead of evaluating them + runbook: "先盤點 workload,再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafka;momo app/scheduler 可用 2 core/2GiB 起步。" + - alert: DockerGiteaActionsJobStale # 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot. expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200