# =============================================================================
# Prometheus Remote Write -> SigNoz - Phase O-3
# =============================================================================
# Author: Claude Code (chief architect)
# Date: 2026-04-02 (Taipei time)
#
# ❌ DEPRECATED (discovered during the actual deployment on 2026-04-02)
#    Reason: the SigNoz OTEL Collector does not support the Prometheus
#            remote_write format (Protobuf); the /api/v1/write endpoint
#            returns 404 Not Found.
#
# ✅ Replacement: let the SigNoz OTEL Collector Prometheus receiver scrape
#    the targets directly.
#    Config file: ops/signoz/otel-collector-config-phase-o.yaml
#    Deployed at: .188:/home/ollama/signoz/deploy/docker/otel-collector-config.yaml
#    Added jobs:  node-from-signoz (node-exporter) + kube-state-from-signoz
#
# =============================================================================

# ===== Add at the top level of prometheus.yml (sibling of scrape_configs) =====
remote_write:
  - url: http://localhost:24318/api/v1/write
    # Same host only (188 -> 188); no cross-network hop is needed.
    remote_timeout: 30s

    # Queue settings: sized for a small cluster.
    queue_config:
      capacity: 2500
      max_shards: 5
      min_shards: 1
      max_samples_per_send: 500
      batch_send_deadline: 5s

    # ===== Whitelist filter (forward key metrics only) =====
    # Original estimate: ~50 metric series, roughly 2-5 GB over 90 days.
    #
    # Relabel rules run in order and every `keep` rule drops the samples it
    # does not match, so several `keep` rules in a row act as AND, not OR.
    # The whitelist is therefore ONE `keep` rule with a single alternation.
    # The regex is fully anchored by Prometheus.
    #
    # Groups, in the order they appear in the regex:
    #   1. Node resource metrics
    #   2. Container resource metrics
    #   3. K8s state metrics (pod restarts are key for RCA!)
    #   4. API performance metrics
    #   5. AWOOOI custom business metrics
    #   6. Database metrics
    #   7. Probe health status
    #   8. `up` (liveness of every target)
    #
    # The trailing backslashes are YAML escaped line breaks: the line break
    # and the next line's leading indentation are removed, so the value is
    # one regex with no embedded whitespace. Keep the `|` before each `\`.
    write_relabel_configs:
      - source_labels: [__name__]
        regex: "node_cpu_seconds_total|node_memory_MemAvailable_bytes|\
          node_memory_MemTotal_bytes|node_filesystem_avail_bytes|\
          node_filesystem_size_bytes|node_load1|node_load5|node_load15|\
          node_network_receive_bytes_total|node_network_transmit_bytes_total|\
          container_cpu_usage_seconds_total|container_memory_working_set_bytes|\
          container_memory_rss|container_network_receive_bytes_total|\
          container_network_transmit_bytes_total|\
          kube_pod_container_status_restarts_total|kube_pod_status_phase|\
          kube_deployment_status_replicas_available|\
          kube_deployment_status_replicas_unavailable|\
          kube_node_status_condition|\
          http_request_duration_seconds.*|http_requests_total|\
          awoooi_.*|\
          pg_stat_activity_count|pg_stat_database_conflicts|\
          pg_stat_database_deadlocks|pg_up|\
          redis_connected_clients|redis_memory_used_bytes|\
          redis_keyspace_hits_total|redis_keyspace_misses_total|redis_up|\
          probe_success|probe_duration_seconds|probe_ssl_earliest_cert_expiry|\
          up"
        action: keep

# =============================================================================
# ⚠️ Notes
# =============================================================================
#
# 1. write_relabel_configs rules are applied sequentially. A sample must
#    survive EVERY `keep` rule, so multiple `keep` rules combine as AND and
#    would drop everything when their regexes are disjoint. To get OR
#    semantics, put all alternatives into a single regex (as done above).
#
# 2. If the SigNoz OTLP/HTTP endpoint does not support the Prometheus
#    remote_write format, a relay such as victoria-metrics or cortex is
#    needed instead.
#    Alternative: enable the prometheusremotewrite receiver in the SigNoz
#    OTEL Collector.
#
# 3. After adding/removing metrics, verify with:
#    curl -s "http://192.168.0.188:9090/api/v1/query?query=up" | jq '.data.result | length'
#