fix(ops): alert on missing docker resource limits
Some checks failed
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Successful in 23s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 38s
Some checks failed
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Successful in 23s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 38s
This commit is contained in:
@@ -663,6 +663,23 @@ groups:
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
|
||||
runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。"
|
||||
|
||||
- alert: DockerContainerMissingResourceLimit
|
||||
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
|
||||
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: docker
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "true"
|
||||
annotations:
|
||||
summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
|
||||
description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。"
|
||||
# Docker's own --format placeholders are wrapped in Go raw strings (backticks) so Prometheus emits them literally instead of trying to expand them as alert template fields.
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{ `{{.HostConfig.NanoCpus}}` }} Memory={{ `{{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'"
|
||||
runbook: "先盤點 workload,再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafka;momo app/scheduler 可用 2 core/2GiB 起步。"
|
||||
|
||||
- alert: DockerGiteaActionsJobStale
|
||||
# 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
|
||||
expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200
|
||||
|
||||
@@ -669,6 +669,23 @@ groups:
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
|
||||
runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。"
|
||||
|
||||
- alert: DockerContainerMissingResourceLimit
|
||||
# 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
|
||||
expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|process-spans).*"} == 0)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: docker
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "true"
|
||||
annotations:
|
||||
summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
|
||||
description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。"
|
||||
# Docker's own --format placeholders are wrapped in Go raw strings (backticks) so Prometheus emits them literally instead of trying to expand them as alert template fields.
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{ `{{.HostConfig.NanoCpus}}` }} Memory={{ `{{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'"
|
||||
runbook: "先盤點 workload,再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafka;momo app/scheduler 可用 2 core/2GiB 起步。"
|
||||
|
||||
- alert: DockerGiteaActionsJobStale
|
||||
# 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
|
||||
expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200
|
||||
|
||||
Reference in New Issue
Block a user