From 5bd8a8a719d8faa2f32a9e8834cecc590a78b3c7 Mon Sep 17 00:00:00 2001
From: OG T
Date: Thu, 9 Apr 2026 12:20:19 +0800
Subject: [PATCH] =?UTF-8?q?fix(monitoring):=20=E8=A3=9C=E9=BD=8A=20blackbo?=
 =?UTF-8?q?x-tcp=20scrape=20targets=20(11=E2=86=9215)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SentryDown/HarborDown/SignOzDown 等告警引用的 instance 不在 scrape list
導致 absent metric = 0,告警持續 firing

新增缺少的 targets:
  192.168.0.125:6443/32334/32335 (K3s)
  192.168.0.110:9000/5000/3100 (Sentry/Harbor/Langfuse)
  192.168.0.188:3301/5432/6380/11434/8089 (SignOz/PG/Redis/Ollama/OpenClaw)

已在 110 主機 reload Prometheus,全部 15 targets UP

Co-Authored-By: Claude Sonnet 4.6
---
 k8s/monitoring/prometheus.yml | 175 ++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 k8s/monitoring/prometheus.yml

diff --git a/k8s/monitoring/prometheus.yml b/k8s/monitoring/prometheus.yml
new file mode 100644
index 00000000..cffef73a
--- /dev/null
+++ b/k8s/monitoring/prometheus.yml
@@ -0,0 +1,175 @@
+global:
+  scrape_interval: 15s  # default for jobs that do not override it
+  evaluation_interval: 15s
+  external_labels:
+    monitor: "wooo-aiops-monitor"
+    environment: "production"
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ["alertmanager:9093"]
+
+rule_files:
+  - "alerts.yml"
+
+scrape_configs:
+  - job_name: "prometheus"
+    static_configs:
+      - targets: ["localhost:9090"]
+        labels:
+          host: "110"
+
+  # === Node Exporters (noted as 5 hosts; only 3 defined here - TODO confirm) ===
+  - job_name: "node-exporter-110"
+    static_configs:
+      - targets: ["192.168.0.110:9100"]
+        labels:
+          host: "110"
+          role: "devops"
+
+  - job_name: "node-exporter-112"
+    static_configs:
+      - targets: ["192.168.0.112:9100"]
+        labels:
+          host: "112"
+          role: "kali-security"
+
+  - job_name: "node-exporter-188"
+    scrape_interval: 30s
+    scrape_timeout: 25s  # kept below this job's 30s scrape_interval
+    static_configs:
+      - targets: ["192.168.0.188:9100"]
+        labels:
+          host: "188"
+          role: "ai-web"
+
+  # === Blackbox HTTP probes ===
+  - job_name: "blackbox-http"
+    metrics_path: /probe
+    params:
+      module: [http_2xx]
+    static_configs:
+      - targets:
+          - "https://aiops.wooo.work"
+          - "https://mo.wooo.work"
+          - "http://192.168.0.110:3001"
+          - "http://192.168.0.120:31234"
+          - "http://192.168.0.120:31235"
+          - "https://www.tsenyang.com"
+          - "http://stock.wooo.work"
+          - "https://bitan.wooo.work"
+    relabel_configs:
+      - source_labels: [__address__]  # listed target becomes ?target=
+        target_label: __param_target
+      - source_labels: [__param_target]  # instance label = probed target
+        target_label: instance
+      - target_label: __address__  # scrape is sent to this address instead
+        replacement: "192.168.0.110:9115"
+
+  # === Blackbox TCP probes ===
+  - job_name: "blackbox-tcp"
+    metrics_path: /probe
+    params:
+      module: [tcp_connect]
+    static_configs:
+      - targets:
+          # K3s layer
+          - "192.168.0.125:6443"
+          - "192.168.0.125:32334"
+          - "192.168.0.125:32335"
+          # 110 services (service names per the commit message)
+          - "192.168.0.110:9090"
+          - "192.168.0.110:3001"
+          - "192.168.0.110:9000"  # Sentry
+          - "192.168.0.110:5000"  # Harbor
+          - "192.168.0.110:3100"  # Langfuse
+          # 188 services (service names per the commit message)
+          - "192.168.0.188:3301"  # SignOz
+          - "192.168.0.188:5432"  # PostgreSQL
+          - "192.168.0.188:6380"  # Redis
+          - "192.168.0.188:11434"  # Ollama
+          - "192.168.0.188:8089"  # OpenClaw
+          # K3s Worker
+          - "192.168.0.120:31234"
+          - "192.168.0.120:31235"
+    relabel_configs:
+      - source_labels: [__address__]  # listed target becomes ?target=
+        target_label: __param_target
+      - source_labels: [__param_target]  # instance label = probed target
+        target_label: instance
+      - target_label: __address__  # scrape is sent to this address instead
+        replacement: "192.168.0.110:9115"
+
+  # === cAdvisor (Docker container monitoring) ===
+  - job_name: "cadvisor-110"
+    scrape_interval: 60s
+    scrape_timeout: 55s  # kept below this job's 60s scrape_interval
+    static_configs:
+      - targets: ["192.168.0.110:9180"]
+        labels:
+          host: "110"
+
+  # === ClawBot ===
+  - job_name: "blackbox-clawbot"
+    metrics_path: /probe
+    params:
+      module: [http_2xx]
+    static_configs:
+      - targets:
+          - "http://192.168.0.188:8088/health"
+    relabel_configs:
+      - source_labels: [__address__]  # listed target becomes ?target=
+        target_label: __param_target
+      - source_labels: [__param_target]  # instance label = probed target
+        target_label: instance
+      - target_label: __address__  # scrape is sent to this address instead
+        replacement: "192.168.0.110:9115"
+
+  - job_name: "clawbot"
+    scrape_interval: 30s
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["192.168.0.188:8088"]
+        labels:
+          host: "188"
+          service: "clawbot"
+
+  # === GitHub Actions Exporter (Phase 5 OPS.176) ===
+  - job_name: "github-actions"
+    scrape_interval: 60s
+    static_configs:
+      - targets: ["github-exporter:9504"]
+        labels:
+          source: "github-actions"
+
+  # === AWOOOI API Metrics (Phase O Wave A.2, 2026-04-02) ===
+  - job_name: "awoooi-api"
+    scrape_interval: 30s
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["192.168.0.125:32334"]
+        labels:
+          host: "125"
+          service: "awoooi-api"
+          env: "prod"
+
+  # === Sprint 5.2 Plan B: PostgreSQL Exporter (2026-04-08 Claude Sonnet 4.6) ===
+  - job_name: "postgres-exporter"
+    scrape_interval: 30s
+    static_configs:
+      - targets: ["192.168.0.188:9187"]
+        labels:
+          host: "188"
+          service: "postgresql"
+          layer: "systemd-188"
+
+  # === Sprint 5.2 Plan B: Redis Exporter (2026-04-08 Claude Sonnet 4.6) ===
+  - job_name: "redis-exporter"
+    scrape_interval: 30s
+    static_configs:
+      - targets: ["192.168.0.188:9121"]
+        labels:
+          host: "188"
+          service: "redis"
+          layer: "systemd-188"