fix(ops): alert on missing docker resource limits
Some checks failed
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Successful in 23s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 38s

This commit is contained in:
Your Name
2026-05-05 15:01:31 +08:00
parent e24c8ea051
commit d08d1e4951
5 changed files with 44 additions and 6 deletions

View File

@@ -663,6 +663,23 @@ groups:
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
runbook: "先抓 crash signature若是 config/DB/網路問題,修設定,不用無限 restart。"
- alert: DockerContainerMissingResourceLimit
  # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
  # Fires when a watched container (momo-*, litellm, gitea, and the listed
  # sentry-self-hosted services) has reported a CPU limit of 0 cores or a
  # memory limit of 0 bytes for 30 minutes straight.
  expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
  for: 30m
  labels:
    severity: warning
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
    description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail長時間尖峰可能拖垮 110/188。"
    # Prometheus expands annotations as Go templates. The braces meant for
    # `docker inspect --format` are therefore wrapped in raw-string actions
    # ({{ `...` }}) so they render literally; left bare, `.HostConfig` would be
    # evaluated against the alert data and the expansion would fail.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{ `{{.HostConfig.NanoCpus}}` }} Memory={{ `{{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'"
    runbook: "先盤點 workload再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafkamomo app/scheduler 可用 2 core/2GiB 起步。"
- alert: DockerGiteaActionsJobStale
# 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200

View File

@@ -669,6 +669,23 @@ groups:
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
runbook: "先抓 crash signature若是 config/DB/網路問題,修設定,不用無限 restart。"
- alert: DockerContainerMissingResourceLimit
  # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
  # Fires when a watched container (momo-*, litellm, gitea, and the listed
  # sentry-self-hosted services) has reported a CPU limit of 0 cores or a
  # memory limit of 0 bytes for 30 minutes straight.
  expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
  for: 30m
  labels:
    severity: warning
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
    description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail長時間尖峰可能拖垮 110/188。"
    # Prometheus expands annotations as Go templates. The braces meant for
    # `docker inspect --format` are therefore wrapped in raw-string actions
    # ({{ `...` }}) so they render literally; left bare, `.HostConfig` would be
    # evaluated against the alert data and the expansion would fail.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{ `{{.HostConfig.NanoCpus}}` }} Memory={{ `{{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'"
    runbook: "先盤點 workload再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafkamomo app/scheduler 可用 2 core/2GiB 起步。"
- alert: DockerGiteaActionsJobStale
# 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200