From 08db3580a76ca3583ed73da93a79652ad99a4bf1 Mon Sep 17 00:00:00 2001
From: OG T
Date: Thu, 9 Apr 2026 13:53:13 +0800
Subject: [PATCH] =?UTF-8?q?fix(monitoring):=20=E4=BF=AE=E5=BE=A9=20110=20?=
 =?UTF-8?q?=E4=B8=BB=E6=A9=9F=20CPU=20=E9=AB=98=E8=B2=A0=E8=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

根因 1: cadvisor 持續掃描 overlay2 磁碟用量 (每次 1-4s × N 容器)
  → 加 --disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process
  → --housekeeping_interval=30s --docker_only=true
  → CPU 從 239% 降到 <1%

根因 2: node_exporter scrape_timeout 預設 10s,高 load 下超時→broken pipe→瘋狂重試
  → 加 scrape_interval: 30s / scrape_timeout: 25s
  → CPU 從 48% 降到 0%

整體 load average: 20 → 9

Co-Authored-By: Claude Sonnet 4.6
---
Reviewer note: the Grafana admin password is no longer committed. Export
GRAFANA_ADMIN_PASSWORD (or set it in .env) before `docker compose up`, and
rotate the password that was previously in this patch. The blob id on the
docker-compose-110.yml index line predates this edit.

 k8s/monitoring/docker-compose-110.yml | 147 ++++++++++++++++++++++++++
 k8s/monitoring/prometheus.yml         |   2 +
 2 files changed, 149 insertions(+)
 create mode 100644 k8s/monitoring/docker-compose-110.yml

diff --git a/k8s/monitoring/docker-compose-110.yml b/k8s/monitoring/docker-compose-110.yml
new file mode 100644
index 00000000..3116b227
--- /dev/null
+++ b/k8s/monitoring/docker-compose-110.yml
@@ -0,0 +1,147 @@
+version: '3.8'
+
+services:
+  # cAdvisor: container metrics, exposed on host port 9180.
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:latest
+    container_name: cadvisor
+    restart: unless-stopped
+    command:
+      - --logtostderr
+      # Collectors switched off to cut CPU; per the commit message the
+      # overlay2 disk-usage scan was the main cost.
+      - --disable_metrics=disk,diskIO,tcp,udp,percpu,sched,process
+      - --housekeeping_interval=30s
+      - --max_housekeeping_interval=60s
+      - --docker_only=true
+    ports:
+      - "9180:8080"
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+      - /etc/localtime:/etc/localtime:ro
+    environment:
+      - TZ=Asia/Taipei
+    privileged: true
+    devices:
+      - /dev/kmsg
+    networks:
+      - monitoring
+
+  # Prometheus: 30-day retention, data kept in the prometheus_data volume.
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    restart: unless-stopped
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./alerts.yml:/etc/prometheus/alerts.yml:ro
+      - prometheus_data:/prometheus
+      - /etc/localtime:/etc/localtime:ro
+    environment:
+      - TZ=Asia/Taipei
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=30d'
+      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+      - '--web.console.templates=/usr/share/prometheus/consoles'
+      - '--web.enable-lifecycle'
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    networks:
+      - monitoring
+
+  # Grafana: UI on host port 3002, started after prometheus.
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    restart: unless-stopped
+    ports:
+      - "3002:3000"
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      # Secret comes from the shell environment or .env, never from VCS;
+      # compose aborts with this message when the variable is unset or empty.
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}
+      - GF_SERVER_ROOT_URL=http://192.168.0.110:3002
+      - TZ=Asia/Taipei
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - /etc/localtime:/etc/localtime:ro
+    networks:
+      - monitoring
+    depends_on:
+      - prometheus
+
+  # Blackbox exporter: configuration is read from ./blackbox.yml.
+  blackbox-exporter:
+    image: prom/blackbox-exporter:latest
+    container_name: blackbox-exporter
+    restart: unless-stopped
+    ports:
+      - "9115:9115"
+    volumes:
+      - ./blackbox.yml:/etc/blackbox_exporter/config.yml:ro
+      - /etc/localtime:/etc/localtime:ro
+    environment:
+      - TZ=Asia/Taipei
+    command:
+      - '--config.file=/etc/blackbox_exporter/config.yml'
+    networks:
+      - monitoring
+
+  # Alertmanager: state kept in the alertmanager_data volume.
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: alertmanager
+    restart: unless-stopped
+    ports:
+      - "9093:9093"
+    volumes:
+      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager_data:/alertmanager
+      - /etc/localtime:/etc/localtime:ro
+    environment:
+      - TZ=Asia/Taipei
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    networks:
+      - monitoring
+
+  # === Phase 5: GitHub Exporter (OPS.176) ===
+  github-exporter:
+    image: promhippie/github-exporter:latest
+    container_name: github-exporter
+    restart: unless-stopped
+    ports:
+      - '9504:9504'
+    environment:
+      - GITHUB_EXPORTER_TOKEN=${GITHUB_TOKEN}
+      - GITHUB_EXPORTER_REPOS=owenhytsai/wooo-aiops,owenhytsai/clawbot-v5
+      - GITHUB_EXPORTER_LOG_LEVEL=info
+    networks:
+      - monitoring
+    labels:
+      - 'com.wooo.service=github-exporter'
+      - 'com.wooo.phase=phase-5'
+    logging:
+      driver: json-file
+      options:
+        max-size: '10m'
+        max-file: '3'
+
+networks:
+  monitoring:
+    driver: bridge
+
+volumes:
+  prometheus_data:
+  grafana_data:
+  alertmanager_data:
diff --git a/k8s/monitoring/prometheus.yml b/k8s/monitoring/prometheus.yml
index cffef73a..5f716740 100644
--- a/k8s/monitoring/prometheus.yml
+++ b/k8s/monitoring/prometheus.yml
@@ -22,6 +22,8 @@ scrape_configs:
 
   # === Node Exporters (5 台主機) ===
   - job_name: "node-exporter-110"
+    scrape_interval: 30s
+    scrape_timeout: 25s
     static_configs:
       - targets: ["192.168.0.110:9100"]
         labels: