From 30b7b10f01e4a930ae54e528acccb31db2a5516f Mon Sep 17 00:00:00 2001 From: OG T Date: Fri, 3 Apr 2026 00:18:00 +0800 Subject: [PATCH] =?UTF-8?q?feat(grafana):=20Wave=20D=20=E2=80=94=20AI?= =?UTF-8?q?=E7=9B=A3=E6=8E=A7=20+=20=E5=9F=BA=E7=A4=8E=E8=A8=AD=E6=96=BD?= =?UTF-8?q?=20Dashboard=20(Grafana=20188:3002)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增 2 個 Dashboard,匯入既有 Nemotron Dashboard: 1. ai-monitoring.json — LLM + NVIDIA AI 監控 - LLM 呼叫速率 (req/min) - LLM P99/P50 延遲 - Nemotron Tool Calling P99/P50 延遲 - LLM Cache 命中率 % - LLM Fallback 次數 - Alert Chain 健康/最後成功時間 2. infra-monitoring.json — Node + K3s 基礎設施 - CPU/Memory 使用率 - K3s Pod 數量 (by namespace) - K3s Pod 重啟次數 - Prometheus Targets UP/DOWN - API 請求速率 3. nvidia-nemotron.json — 既有 18-panel Nemotron Dashboard (版控) 部署: 192.168.0.188:3002 (Grafana 12.4.1) Provisioning: monitoring/grafana/provisioning/dashboards/ Co-Authored-By: Claude Sonnet 4.6 --- ops/grafana/dashboards/ai-monitoring.json | 347 +++++++++++++++++++ ops/grafana/dashboards/infra-monitoring.json | 306 ++++++++++++++++ 2 files changed, 653 insertions(+) create mode 100644 ops/grafana/dashboards/ai-monitoring.json create mode 100644 ops/grafana/dashboards/infra-monitoring.json diff --git a/ops/grafana/dashboards/ai-monitoring.json b/ops/grafana/dashboards/ai-monitoring.json new file mode 100644 index 00000000..cea25eaf --- /dev/null +++ b/ops/grafana/dashboards/ai-monitoring.json @@ -0,0 +1,347 @@ +{ + "dashboard": { + "panels": [ + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "reqpm" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(llm_call_total[5m]) * 60", + "legendFormat": "{{provider}} LLM 呼叫/分", + "refId": "A" + } + ], + "title": "LLM 呼叫速率 (req/min)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5000 + }, + { + "color": "red", + "value": 30000 + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "targets": [ + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.99, rate(llm_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "P99 延遲 ms", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.50, rate(llm_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "P50 延遲 ms", + "refId": "B" + } + ], + "title": "LLM P99 延遲 (ms)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 15000 + }, + { + "color": "red", + "value": 45000 + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "targets": [ + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.99, rate(nvidia_tool_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "Nemotron P99 ms", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.50, rate(nvidia_tool_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "Nemotron P50 ms", + "refId": "B" + } + ], + "title": "NVIDIA Nemotron Tool Calling 延遲 (ms)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(wooo_llm_cache_hit_total[5m]) / (rate(wooo_llm_cache_hit_total[5m]) + rate(wooo_llm_cache_miss_total[5m])) * 100", + "legendFormat": "Cache 命中率 %", + "refId": "A" + } + ], + "title": "LLM Cache 命中率", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "orange", + "mode": "fixed" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(llm_fallback_total[5m]) * 60", + "legendFormat": "Fallback/分", + "refId": "A" + } + ], + "title": "LLM Fallback 次數", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "colorMode": "background", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "awoooi_alert_chain_healthy", + "legendFormat": "Alert Chain", + "refId": "A" + } + ], + "title": "Alert Chain 健康狀態", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 60 + }, + { + "color": "red", + "value": 720 + } + ] + }, + "unit": "m" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "background", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "(time() - awoooi_alert_chain_last_success_timestamp) / 60", + "legendFormat": "距上次成功 (分鐘)", + "refId": "A" + } + ], + "title": "Alert Chain 最後成功時間", + "type": "stat" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "awoooi", + "ai", + "nvidia", + "llm" + ], + "time": { + "from": "now-3h", + "to": "now" + }, + "timezone": "Asia/Taipei", + "title": "AWOOOI AI 監控 — LLM + NVIDIA", + "uid": "awoooi-ai-monitoring" + }, + "folderId": 0, + "overwrite": true +} diff --git a/ops/grafana/dashboards/infra-monitoring.json b/ops/grafana/dashboards/infra-monitoring.json new file mode 100644 index 00000000..40f537a3 --- /dev/null +++ b/ops/grafana/dashboards/infra-monitoring.json @@ -0,0 +1,306 @@ +{ + "dashboard": { + "panels": [ + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "targets": [ + { + "datasource": "Prometheus", + "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "{{instance}} CPU %", + "refId": "A" + } + ], + "title": "CPU 使用率 %", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "targets": [ + { + "datasource": "Prometheus", + "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "legendFormat": "{{instance}} Memory %", + "refId": "A" + } + ], + "title": "記憶體使用率 %", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "targets": [ + { + "datasource": "Prometheus", + "expr": "count by(namespace) (kube_pod_info)", + "legendFormat": "{{namespace}}", + "refId": "A" + } + ], + "title": "K3s Pod 數量 (by namespace)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "targets": [ + { + "datasource": "Prometheus", + "expr": "increase(kube_pod_container_status_restarts_total[1h])", + "legendFormat": "{{namespace}}/{{pod}}", + "refId": "A" + } + ], + "title": "K3s Pod 重啟次數", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "green", + "value": 14 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "count(up == 1)", + "legendFormat": "UP", + "refId": "A" + } + ], + "title": "Prometheus Targets UP 數量", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "count(up == 0) or vector(0)", + "legendFormat": "DOWN", + "refId": "A" + } + ], + "title": "Prometheus Targets DOWN", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "reqpm" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(http_requests_total{job=\"awoooi-api\"}[5m]) * 60", + "legendFormat": "{{method}} {{endpoint}}", + "refId": "A" + } + ], + "title": "API 請求速率 (req/min)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "awoooi", + "infra", + "node", + "k3s" + ], + "time": { + "from": "now-3h", + "to": "now" + }, + "timezone": "Asia/Taipei", + "title": "AWOOOI 基礎設施監控 — Node + K3s", + "uid": "awoooi-infra-monitoring" + }, + "folderId": 0, + "overwrite": true +}