Files
awoooi/k8s/monitoring/prometheus.yml
OG T 16d682346a
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
feat(adr-074): M1 飛輪健康度 Exporter + M2 主機網路監控
ADR-074 M1:
  - FlywheelStatsService: 計算6項飛輪指標(Playbook數/成功率/KM向量化/alertname NULL/卡住數)
  - GET /api/v1/stats/flywheel — 六節點即時狀態(C1 前端用)
  - GET /api/v1/stats/summary — KPI 面板數據(C1 前端用)
  - GET /api/v1/stats/flywheel/metrics — Prometheus text format
  - flywheel-alerts.yaml: 5條告警規則(FlywheelPlaybookZero/ExecutionSuccessLow/KMVectorizationLow/AlertnameNullHigh/IncidentsStuck)
  - prometheus.yml: awoooi-flywheel scrape job(5分鐘間隔)

ADR-074 M2:
  - prometheus.yml: host-connectivity Blackbox TCP probe(110:22/188:22/120:6443/121:6443)
  - flywheel-alerts.yaml: HostNetworkPartition 告警規則

597 unit tests passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 15:31:01 +08:00

209 lines
5.4 KiB
YAML

# Server-wide defaults; individual scrape jobs below may override the interval.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels identifying this Prometheus server.
  external_labels:
    monitor: 'wooo-aiops-monitor'
    environment: 'production'
# Alertmanager endpoint(s) that alerts are sent to.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager:9093'
# Alerting/recording rule files loaded by this server.
# NOTE(review): only alerts.yml is listed here — confirm that any additional
# rule files (e.g. flywheel-alerts.yaml) are loaded through another mechanism.
rule_files:
- 'alerts.yml'
scrape_configs:
# Prometheus scraping its own metrics on localhost.
- job_name: 'prometheus'
  static_configs:
  - targets:
    - 'localhost:9090'
    labels:
      host: '110'
# === Node Exporters (5 hosts) ===
# NOTE(review): the heading says 5 hosts — confirm every host has its own job.
- job_name: 'node-exporter-110'
  # Per-job override of the global interval; timeout stays below the interval.
  scrape_interval: 30s
  scrape_timeout: 25s
  static_configs:
  - targets:
    - '192.168.0.110:9100'
    labels:
      host: '110'
      role: 'devops'
# Node exporter on host 112; no per-job interval, so the global one applies.
- job_name: 'node-exporter-112'
  static_configs:
  - targets:
    - '192.168.0.112:9100'
    labels:
      host: '112'
      role: 'kali-security'
# Node exporter on host 188.
- job_name: 'node-exporter-188'
  # Per-job override of the global interval; timeout stays below the interval.
  scrape_interval: 30s
  scrape_timeout: 25s
  static_configs:
  - targets:
    - '192.168.0.188:9100'
    labels:
      host: '188'
      role: 'ai-web'
# === Blackbox HTTP probes ===
- job_name: 'blackbox-http'
  metrics_path: /probe
  params:
    module:
    - http_2xx
  static_configs:
  - targets:
    - 'https://aiops.wooo.work'
    - 'https://mo.wooo.work'
    - 'http://192.168.0.110:3001'
    - 'http://192.168.0.120:31234'
    - 'http://192.168.0.120:31235'
    - 'https://www.tsenyang.com'
    - 'http://stock.wooo.work'
    - 'https://bitan.wooo.work'
  relabel_configs:
  # Hand the listed URL to the exporter as the `target` query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the `instance` label.
  - source_labels:
    - __param_target
    target_label: instance
  # Point the actual scrape at the Blackbox exporter.
  - target_label: __address__
    replacement: '192.168.0.110:9115'
# === Blackbox TCP probes ===
- job_name: 'blackbox-tcp'
  metrics_path: /probe
  params:
    module:
    - tcp_connect
  static_configs:
  - targets:
    # K3s layer
    - '192.168.0.125:6443'
    - '192.168.0.125:32334'
    - '192.168.0.125:32335'
    # Services on 110
    - '192.168.0.110:9090'
    - '192.168.0.110:3001'
    - '192.168.0.110:9000'
    - '192.168.0.110:5000'
    - '192.168.0.110:3100'
    # Services on 188
    - '192.168.0.188:3301'
    - '192.168.0.188:5432'
    - '192.168.0.188:6380'
    - '192.168.0.188:11434'
    - '192.168.0.188:8089'
    # K3s worker
    - '192.168.0.120:31234'
    - '192.168.0.120:31235'
  relabel_configs:
  # Hand the listed host:port to the exporter as the `target` query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed host:port as the `instance` label.
  - source_labels:
    - __param_target
    target_label: instance
  # Point the actual scrape at the Blackbox exporter.
  - target_label: __address__
    replacement: '192.168.0.110:9115'
# === cAdvisor (Docker container monitoring) ===
- job_name: 'cadvisor-110'
  # Per-job override of the global interval; timeout stays below the interval.
  scrape_interval: 60s
  scrape_timeout: 55s
  static_configs:
  - targets:
    - '192.168.0.110:9180'
    labels:
      host: '110'
# === ClawBot ===
# HTTP probe of the ClawBot /health endpoint.
- job_name: 'blackbox-clawbot'
  metrics_path: /probe
  params:
    module:
    - http_2xx
  static_configs:
  - targets:
    - 'http://192.168.0.188:8088/health'
  relabel_configs:
  # Hand the listed URL to the exporter as the `target` query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the `instance` label.
  - source_labels:
    - __param_target
    target_label: instance
  # Point the actual scrape at the Blackbox exporter.
  - target_label: __address__
    replacement: '192.168.0.110:9115'
# Direct scrape of ClawBot's /metrics endpoint.
- job_name: 'clawbot'
  scrape_interval: 30s
  metrics_path: /metrics
  static_configs:
  - targets:
    - '192.168.0.188:8088'
    labels:
      host: '188'
      service: 'clawbot'
# === GitHub Actions Exporter (Phase 5 OPS.176) ===
- job_name: "github-actions"
  scrape_interval: 60s
  static_configs:
  - targets:
    - "github-exporter:9504"
    labels:
      source: "github-actions"
# === AWOOOI API Metrics (Phase O Wave A.2, 2026-04-02) ===
- job_name: "awoooi-api"
  scrape_interval: 30s
  metrics_path: /metrics
  static_configs:
  - targets:
    - "192.168.0.125:32334"
    labels:
      host: "125"
      service: "awoooi-api"
      env: "prod"
# === ADR-074 M1: flywheel health metrics (2026-04-12 ogt) ===
- job_name: 'awoooi-flywheel'
  # Kept well below Prometheus' default 5m query lookback window: with a 5m
  # interval any scrape jitter or a single missed scrape makes these series
  # disappear from instant queries and alert-rule evaluations.
  scrape_interval: 2m
  metrics_path: /api/v1/stats/flywheel/metrics
  static_configs:
  - targets:
    - '192.168.0.125:32334'
    labels:
      host: '125'
      service: 'awoooi-flywheel'
      env: 'prod'
# === ADR-074 M2: inter-host network connectivity (2026-04-12 ogt) ===
- job_name: "host-connectivity"
  scrape_interval: 60s
  metrics_path: /probe
  params:
    module:
    - tcp_connect
  static_configs:
  - targets:
    - "192.168.0.110:22"
    - "192.168.0.188:22"
    - "192.168.0.120:6443"
    - "192.168.0.121:6443"
  relabel_configs:
  # Hand the listed host:port to the exporter as the `target` query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed host:port as the `instance` label.
  - source_labels:
    - __param_target
    target_label: instance
  # Point the actual scrape at the exporter on 188.
  # NOTE(review): the other probe jobs in this file use 192.168.0.110:9115 —
  # confirm that probing from 188 is intentional and an exporter listens there.
  - target_label: __address__
    replacement: "192.168.0.188:9115"
# === Sprint 5.2 Plan B: PostgreSQL Exporter (2026-04-08 Claude Sonnet 4.6) ===
- job_name: "postgres-exporter"
  scrape_interval: 30s
  static_configs:
  - targets:
    - "192.168.0.188:9187"
    labels:
      host: "188"
      service: "postgresql"
      layer: "systemd-188"
# === Sprint 5.2 Plan B: Redis Exporter (2026-04-08 Claude Sonnet 4.6) ===
- job_name: "redis-exporter"
  scrape_interval: 30s
  static_configs:
  - targets:
    - "192.168.0.188:9121"
    labels:
      host: "188"
      service: "redis"
      layer: "systemd-188"