Files
awoooi/k8s/monitoring/docker-compose-110.yml
OG T 08db3580a7
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
fix(monitoring): 修復 110 主機 CPU 高負載
根因 1: cadvisor 持續掃描 overlay2 磁碟用量 (每次 1-4s × N 容器)
  → 加 --disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process
  → --housekeeping_interval=30s --docker_only=true
  → CPU 從 239% 降到 <1%

根因 2: node_exporter scrape_timeout 預設 10s,高 load 下超時→broken pipe→瘋狂重試
  → 加 scrape_interval: 30s / scrape_timeout: 25s
  → CPU 從 48% 降到 0%

整體 load average: 20 → 9

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 13:53:13 +08:00

139 lines
3.5 KiB
YAML

# Monitoring stack for host 110: cAdvisor, Prometheus, Grafana,
# blackbox-exporter, Alertmanager and the GitHub exporter. Every service is
# attached to the `monitoring` bridge network declared at the bottom.
#
# Required environment (shell or a `.env` file next to this compose file):
#   GRAFANA_ADMIN_PASSWORD  initial Grafana admin password (compose aborts if unset)
#   GITHUB_TOKEN            API token handed to the GitHub exporter
#
# NOTE(review): every image uses the floating `latest` tag, so a re-pull can
# change versions silently. Consider pinning once known-good versions are
# confirmed.
version: '3.8'

services:
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    command:
      - '--logtostderr'
      # Per the commit notes, cAdvisor's continuous overlay2 disk-usage scan was
      # the main cause of high CPU on this host, so the disk collectors (and
      # other collectors listed here) are turned off.
      - '--disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process'
      # Collect less often and only for Docker containers to keep load down.
      - '--housekeeping_interval=30s'
      - '--max_housekeeping_interval=60s'
      - '--docker_only=true'
    ports:
      # Host port 9180 -> cAdvisor's 8080 inside the container.
      - '9180:8080'
    volumes:
      # Read-only views of the host that cAdvisor inspects.
      - '/:/rootfs:ro'
      - '/var/run:/var/run:ro'
      - '/sys:/sys:ro'
      - '/var/lib/docker/:/var/lib/docker:ro'
      - '/dev/disk/:/dev/disk:ro'
      - '/etc/localtime:/etc/localtime:ro'
    environment:
      - TZ=Asia/Taipei
    privileged: true
    devices:
      - /dev/kmsg
    networks:
      - monitoring

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - '9090:9090'
    volumes:
      # Scrape config and alert rules are kept beside this file and mounted
      # read-only; the TSDB lives in a named volume.
      - './prometheus.yml:/etc/prometheus/prometheus.yml:ro'
      - './alerts.yml:/etc/prometheus/alerts.yml:ro'
      - 'prometheus_data:/prometheus'
      - '/etc/localtime:/etc/localtime:ro'
    environment:
      - TZ=Asia/Taipei
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      # Enables the HTTP reload/quit endpoints.
      - '--web.enable-lifecycle'
    extra_hosts:
      # Lets the container reach services on the Docker host by name.
      - 'host.docker.internal:host-gateway'
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      # Host port 3002 -> Grafana's 3000 inside the container.
      - '3002:3000'
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # The password is injected from the environment instead of being
      # committed in plain text; compose refuses to start when it is missing.
      # The value that was previously hard-coded here is in VCS history and
      # should be rotated.
      - 'GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?must be set}'
      - GF_SERVER_ROOT_URL=http://192.168.0.110:3002
      - TZ=Asia/Taipei
    volumes:
      - 'grafana_data:/var/lib/grafana'
      - '/etc/localtime:/etc/localtime:ro'
    networks:
      - monitoring
    depends_on:
      - prometheus

  blackbox-exporter:
    image: prom/blackbox-exporter:latest
    container_name: blackbox-exporter
    restart: unless-stopped
    ports:
      - '9115:9115'
    volumes:
      - './blackbox.yml:/etc/blackbox_exporter/config.yml:ro'
      - '/etc/localtime:/etc/localtime:ro'
    environment:
      - TZ=Asia/Taipei
    command:
      - '--config.file=/etc/blackbox_exporter/config.yml'
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - '9093:9093'
    volumes:
      - './alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro'
      - 'alertmanager_data:/alertmanager'
      - '/etc/localtime:/etc/localtime:ro'
    environment:
      - TZ=Asia/Taipei
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring

  # === Phase 5: GitHub Exporter (OPS.176) ===
  github-exporter:
    image: promhippie/github-exporter:latest
    container_name: github-exporter
    restart: unless-stopped
    ports:
      - '9504:9504'
    environment:
      # Token is taken from the caller's environment, never stored here.
      - 'GITHUB_EXPORTER_TOKEN=${GITHUB_TOKEN}'
      # NOTE(review): the upstream exporter may expect the singular
      # GITHUB_EXPORTER_REPO for this list - confirm against its documentation.
      - GITHUB_EXPORTER_REPOS=owenhytsai/wooo-aiops,owenhytsai/clawbot-v5
      - GITHUB_EXPORTER_LOG_LEVEL=info
    networks:
      - monitoring
    labels:
      - 'com.wooo.service=github-exporter'
      - 'com.wooo.phase=phase-5'
    logging:
      # Cap container logs at 3 files of 10 MB each.
      driver: json-file
      options:
        max-size: '10m'
        max-file: '3'

networks:
  monitoring:
    driver: bridge

# Named volumes holding each service's persistent state.
volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data: