fix(monitoring): 修復 110 主機 CPU 高負載
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
根因 1: cadvisor 持續掃描 overlay2 磁碟用量 (每次 1-4s × N 容器)
→ 加 --disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process
→ 加 --housekeeping_interval=30s --docker_only=true
→ CPU 從 239% 降到 <1%

根因 2: node_exporter scrape_timeout 預設 10s,高 load 下超時 → broken pipe → 瘋狂重試
→ 加 scrape_interval: 30s / scrape_timeout: 25s
→ CPU 從 48% 降到 0%

整體 load average: 20 → 9

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
138
k8s/monitoring/docker-compose-110.yml
Normal file
138
k8s/monitoring/docker-compose-110.yml
Normal file
@@ -0,0 +1,138 @@
|
||||
# Compose file format version.
# NOTE(review): recent Docker Compose releases reportedly ignore this key and
# warn that it is obsolete — confirm the Compose version on the host before
# removing it.
version: '3.8'
|
||||
|
||||
# Monitoring stack services: cAdvisor, Prometheus, Grafana, Blackbox exporter,
# Alertmanager and the GitHub exporter. All of them join the `monitoring` network.
services:
|
||||
cadvisor:
  # Container metrics collector, published on host port 9180.
  image: 'gcr.io/cadvisor/cadvisor:latest'
  container_name: cadvisor
  restart: unless-stopped
  privileged: true
  # The flags below are the CPU-load fix described in the accompanying commit
  # message: cAdvisor was continuously scanning overlay2 disk usage, so the
  # expensive metric groups are disabled and housekeeping is slowed down.
  command:
    - '--logtostderr'
    - '--disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process'
    - '--housekeeping_interval=30s'
    - '--max_housekeeping_interval=60s'
    - '--docker_only=true'
  ports:
    - '9180:8080'
  # Host paths are mounted read-only.
  volumes:
    - '/:/rootfs:ro'
    - '/var/run:/var/run:ro'
    - '/sys:/sys:ro'
    - '/var/lib/docker/:/var/lib/docker:ro'
    - '/dev/disk/:/dev/disk:ro'
    - '/etc/localtime:/etc/localtime:ro'
  devices:
    - '/dev/kmsg'
  environment:
    TZ: 'Asia/Taipei'
  networks:
    - monitoring
|
||||
|
||||
prometheus:
  # Metrics store and rule evaluator, published on host port 9090.
  image: 'prom/prometheus:latest'
  container_name: prometheus
  restart: unless-stopped
  # NOTE(review): `--web.enable-lifecycle` is set while port 9090 is published
  # on the host — confirm that exposing the lifecycle API this way is intended.
  command:
    - --config.file=/etc/prometheus/prometheus.yml
    - --storage.tsdb.path=/prometheus
    # Keep 30 days of TSDB data in the prometheus_data volume.
    - --storage.tsdb.retention.time=30d
    - --web.console.libraries=/usr/share/prometheus/console_libraries
    - --web.console.templates=/usr/share/prometheus/consoles
    - --web.enable-lifecycle
  ports:
    - '9090:9090'
  volumes:
    # Scrape config and alert rules come from files next to this compose file.
    - './prometheus.yml:/etc/prometheus/prometheus.yml:ro'
    - './alerts.yml:/etc/prometheus/alerts.yml:ro'
    - 'prometheus_data:/prometheus'
    - '/etc/localtime:/etc/localtime:ro'
  environment:
    TZ: 'Asia/Taipei'
  # Lets the container resolve the Docker host as host.docker.internal.
  extra_hosts:
    - 'host.docker.internal:host-gateway'
  networks:
    - monitoring
|
||||
|
||||
grafana:
  # Dashboard UI, published on host port 3002; started after prometheus.
  image: grafana/grafana:latest
  container_name: grafana
  restart: unless-stopped
  ports:
    - "3002:3000"
  environment:
    GF_SECURITY_ADMIN_USER: admin
    # The admin password must not be committed to version control. It is read
    # from the GRAFANA_ADMIN_PASSWORD variable (shell environment or the .env
    # file beside this compose file); Compose aborts with the message below
    # when it is missing, instead of starting Grafana with an empty password.
    # NOTE(review): the password that was previously committed here should be
    # treated as leaked and rotated.
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set (e.g. in .env)}"
    GF_SERVER_ROOT_URL: "http://192.168.0.110:3002"
    TZ: Asia/Taipei
  volumes:
    - grafana_data:/var/lib/grafana
    - /etc/localtime:/etc/localtime:ro
  networks:
    - monitoring
  depends_on:
    - prometheus
|
||||
|
||||
blackbox-exporter:
  # Probe exporter, published on host port 9115.
  image: 'prom/blackbox-exporter:latest'
  container_name: blackbox-exporter
  restart: unless-stopped
  command:
    - --config.file=/etc/blackbox_exporter/config.yml
  ports:
    - '9115:9115'
  volumes:
    # Module definitions come from blackbox.yml next to this compose file.
    - './blackbox.yml:/etc/blackbox_exporter/config.yml:ro'
    - '/etc/localtime:/etc/localtime:ro'
  environment:
    TZ: 'Asia/Taipei'
  networks:
    - monitoring
|
||||
|
||||
alertmanager:
  # Alert routing service, published on host port 9093.
  image: 'prom/alertmanager:latest'
  container_name: alertmanager
  restart: unless-stopped
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
    # State is kept in the alertmanager_data volume mounted below.
    - --storage.path=/alertmanager
  ports:
    - '9093:9093'
  volumes:
    - './alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro'
    - 'alertmanager_data:/alertmanager'
    - '/etc/localtime:/etc/localtime:ro'
  environment:
    TZ: 'Asia/Taipei'
  networks:
    - monitoring
|
||||
|
||||
# === Phase 5: GitHub Exporter (OPS.176) ===
github-exporter:
  # GitHub repository metrics exporter, published on host port 9504.
  image: 'promhippie/github-exporter:latest'
  container_name: github-exporter
  restart: unless-stopped
  ports:
    - "9504:9504"
  environment:
    # Token is taken from GITHUB_TOKEN in the environment Compose runs in.
    GITHUB_EXPORTER_TOKEN: "${GITHUB_TOKEN}"
    # NOTE(review): upstream github_exporter docs appear to name this variable
    # GITHUB_EXPORTER_REPO (singular) — confirm that the exporter actually
    # reads GITHUB_EXPORTER_REPOS, otherwise no repositories are scraped.
    GITHUB_EXPORTER_REPOS: "owenhytsai/wooo-aiops,owenhytsai/clawbot-v5"
    GITHUB_EXPORTER_LOG_LEVEL: "info"
  networks:
    - monitoring
  labels:
    com.wooo.service: "github-exporter"
    com.wooo.phase: "phase-5"
  # Cap container logs at 3 rotated json-file logs of 10m each.
  logging:
    driver: json-file
    options:
      max-size: "10m"
      max-file: "3"
|
||||
|
||||
# Single bridge network shared by every service in this file.
networks:
  monitoring: {driver: bridge}
|
||||
|
||||
# Named volumes with default settings, holding Prometheus, Grafana and
# Alertmanager state.
volumes:
  prometheus_data: {}
  grafana_data: {}
  alertmanager_data: {}
|
||||
@@ -22,6 +22,8 @@ scrape_configs:
|
||||
|
||||
# === Node Exporters (5 台主機) ===
|
||||
- job_name: "node-exporter-110"
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 25s
|
||||
static_configs:
|
||||
- targets: ["192.168.0.110:9100"]
|
||||
labels:
|
||||
|
||||
Reference in New Issue
Block a user