Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
根因 1: cadvisor 持續掃描 overlay2 磁碟用量 (每次 1-4s × N 容器)
→ 加 --disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process
→ --housekeeping_interval=30s --docker_only=true
→ CPU 從 239% 降到 <1%

根因 2: node_exporter scrape_timeout 預設 10s,高 load 下超時 → broken pipe → 瘋狂重試
→ 加 scrape_interval: 30s / scrape_timeout: 25s
→ CPU 從 48% 降到 0%

整體 load average: 20 → 9

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
139 lines
3.5 KiB
YAML
139 lines
3.5 KiB
YAML
# Monitoring stack: cAdvisor, Prometheus, Grafana, blackbox-exporter,
# Alertmanager and a GitHub exporter, all attached to the `monitoring` network.
# NOTE(review): under the current Compose Specification the top-level
# `version` key is informational only; presumably kept for older
# docker-compose binaries — confirm before removing.
version: "3.8"

services:
# --- cAdvisor: container resource metrics ------------------------------------
  cadvisor:
    # NOTE(review): `latest` is a floating tag; consider pinning a release.
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    restart: unless-stopped
    # Runs privileged with the host kernel log device passed through.
    privileged: true
    devices:
      - /dev/kmsg
    # The disabled metric groups and the 30s housekeeping interval were
    # introduced to cut cAdvisor's own CPU usage; only Docker containers
    # are tracked.
    command:
      - "--logtostderr"
      - "--disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process"
      - "--housekeeping_interval=30s"
      - "--max_housekeeping_interval=60s"
      - "--docker_only=true"
    ports:
      # Host port 9180 -> container port 8080.
      - "9180:8080"
    # Host paths are mounted read-only.
    volumes:
      - "/:/rootfs:ro"
      - "/var/run:/var/run:ro"
      - "/sys:/sys:ro"
      - "/var/lib/docker/:/var/lib/docker:ro"
      - "/dev/disk/:/dev/disk:ro"
      - "/etc/localtime:/etc/localtime:ro"
    environment:
      TZ: Asia/Taipei
    networks:
      - monitoring

# --- Prometheus server -------------------------------------------------------
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      # TSDB lives on the `prometheus_data` volume; samples are kept for 30d.
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=30d"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
      # NOTE(review): this flag enables the HTTP reload/quit endpoints and
      # port 9090 is published on the host — confirm the network is trusted.
      - "--web.enable-lifecycle"
    ports:
      - "9090:9090"
    volumes:
      # Scrape config and alert rules come from files next to this compose
      # file and are mounted read-only.
      - "./prometheus.yml:/etc/prometheus/prometheus.yml:ro"
      - "./alerts.yml:/etc/prometheus/alerts.yml:ro"
      - "prometheus_data:/prometheus"
      - "/etc/localtime:/etc/localtime:ro"
    environment:
      TZ: Asia/Taipei
    extra_hosts:
      # Maps host.docker.internal to the Docker host gateway address.
      - "host.docker.internal:host-gateway"
    networks:
      - monitoring

# --- Grafana dashboards ------------------------------------------------------
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    # Started after the prometheus service.
    depends_on:
      - prometheus
    ports:
      # Host port 3002 -> container port 3000.
      - "3002:3000"
    environment:
      GF_SECURITY_ADMIN_USER: admin
      # The admin password is no longer stored in this file. Compose reads it
      # from GRAFANA_ADMIN_PASSWORD (shell environment or .env) and, because
      # of `:?`, aborts with the message below when it is unset or empty.
      # NOTE(review): the password that used to be committed here should be
      # rotated, since it remains in version-control history.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?Set GRAFANA_ADMIN_PASSWORD in the environment or .env}"
      # Matches the published host port above.
      GF_SERVER_ROOT_URL: "http://192.168.0.110:3002"
      TZ: Asia/Taipei
    volumes:
      - "grafana_data:/var/lib/grafana"
      - "/etc/localtime:/etc/localtime:ro"
    networks:
      - monitoring

# --- Blackbox exporter -------------------------------------------------------
  blackbox-exporter:
    image: prom/blackbox-exporter:latest
    container_name: blackbox-exporter
    restart: unless-stopped
    # Points the exporter at the config file mounted below.
    command:
      - "--config.file=/etc/blackbox_exporter/config.yml"
    ports:
      - "9115:9115"
    volumes:
      # Local blackbox.yml is mounted read-only as the exporter's config.yml.
      - "./blackbox.yml:/etc/blackbox_exporter/config.yml:ro"
      - "/etc/localtime:/etc/localtime:ro"
    environment:
      TZ: Asia/Taipei
    networks:
      - monitoring

# --- Alertmanager ------------------------------------------------------------
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      # State is written to the `alertmanager_data` volume mounted below.
      - "--storage.path=/alertmanager"
    ports:
      - "9093:9093"
    volumes:
      - "./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
      - "alertmanager_data:/alertmanager"
      - "/etc/localtime:/etc/localtime:ro"
    environment:
      TZ: Asia/Taipei
    networks:
      - monitoring

# === Phase 5: GitHub Exporter (OPS.176) ===
  github-exporter:
    image: promhippie/github-exporter:latest
    container_name: github-exporter
    restart: unless-stopped
    ports:
      - "9504:9504"
    environment:
      # Interpolated by Compose from GITHUB_TOKEN (shell environment or .env).
      GITHUB_EXPORTER_TOKEN: "${GITHUB_TOKEN}"
      # Comma-separated owner/repo list.
      # NOTE(review): upstream docs appear to name this variable
      # GITHUB_EXPORTER_REPO (singular) — confirm this spelling is honored.
      GITHUB_EXPORTER_REPOS: "owenhytsai/wooo-aiops,owenhytsai/clawbot-v5"
      GITHUB_EXPORTER_LOG_LEVEL: info
    labels:
      com.wooo.service: github-exporter
      com.wooo.phase: phase-5
    # json-file logging limited to 3 files of at most 10m each.
    logging:
      driver: json-file
      options:
        max-size: "10m"
        max-file: "3"
    networks:
      - monitoring

# Bridge network that the services in this file attach to.
networks:
  monitoring:
    driver: bridge

# Named volumes with default settings, referenced by the prometheus, grafana
# and alertmanager services respectively.
volumes:
  prometheus_data: {}
  grafana_data: {}
  alertmanager_data: {}