feat(grafana): Wave D — AI 監控 + 基礎設施 Dashboard (Grafana 192.168.0.188:3002)

新增 2 個 Dashboard,匯入既有 Nemotron Dashboard:

1. ai-monitoring.json — LLM + NVIDIA AI 監控
   - LLM 呼叫速率 (req/min)
   - LLM P99/P50 延遲
   - Nemotron Tool Calling P99/P50 延遲
   - LLM Cache 命中率 %
   - LLM Fallback 次數
   - Alert Chain 健康/最後成功時間

2. infra-monitoring.json — Node + K3s 基礎設施
   - CPU/Memory 使用率
   - K3s Pod 數量 (by namespace)
   - K3s Pod 重啟次數
   - Prometheus Targets UP/DOWN
   - API 請求速率

3. nvidia-nemotron.json — 既有 18-panel Nemotron Dashboard (版控)

部署: 192.168.0.188:3002 (Grafana 12.4.1)
Provisioning: monitoring/grafana/provisioning/dashboards/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-03 00:18:00 +08:00
parent cb0f92557d
commit 30b7b10f01
2 changed files with 653 additions and 0 deletions

View File

@@ -0,0 +1,347 @@
{
"dashboard": {
"panels": [
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"unit": "reqpm"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"targets": [
{
"datasource": "Prometheus",
"expr": "rate(llm_call_total[5m]) * 60",
"legendFormat": "{{provider}} LLM 呼叫/分",
"refId": "A"
}
],
"title": "LLM 呼叫速率 (req/min)",
"type": "timeseries"
},
{
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "palette-classic"
      },
      "thresholds": {
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "yellow",
            "value": 5000
          },
          {
            "color": "red",
            "value": 30000
          }
        ]
      },
      "unit": "ms"
    }
  },
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 0
  },
  "id": 2,
  "targets": [
    {
      "datasource": "Prometheus",
      "expr": "histogram_quantile(0.99, sum by (le) (rate(llm_call_latency_seconds_bucket[5m]))) * 1000",
      "legendFormat": "P99 延遲 ms",
      "refId": "A"
    },
    {
      "datasource": "Prometheus",
      "expr": "histogram_quantile(0.50, sum by (le) (rate(llm_call_latency_seconds_bucket[5m]))) * 1000",
      "legendFormat": "P50 延遲 ms",
      "refId": "B"
    }
  ],
  "title": "LLM P99/P50 延遲 (ms)",
  "type": "timeseries"
},
{
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "palette-classic"
      },
      "thresholds": {
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "yellow",
            "value": 15000
          },
          {
            "color": "red",
            "value": 45000
          }
        ]
      },
      "unit": "ms"
    }
  },
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 8
  },
  "id": 3,
  "targets": [
    {
      "datasource": "Prometheus",
      "expr": "histogram_quantile(0.99, sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))) * 1000",
      "legendFormat": "Nemotron P99 ms",
      "refId": "A"
    },
    {
      "datasource": "Prometheus",
      "expr": "histogram_quantile(0.50, sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))) * 1000",
      "legendFormat": "Nemotron P50 ms",
      "refId": "B"
    }
  ],
  "title": "NVIDIA Nemotron Tool Calling 延遲 (ms)",
  "type": "timeseries"
},
{
  "fieldConfig": {
    "defaults": {
      "max": 100,
      "min": 0,
      "thresholds": {
        "steps": [
          {
            "color": "red",
            "value": null
          },
          {
            "color": "yellow",
            "value": 20
          },
          {
            "color": "green",
            "value": 50
          }
        ]
      },
      "unit": "percent"
    }
  },
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 8
  },
  "id": 4,
  "targets": [
    {
      "datasource": "Prometheus",
      "expr": "sum(rate(wooo_llm_cache_hit_total[5m])) / (sum(rate(wooo_llm_cache_hit_total[5m])) + sum(rate(wooo_llm_cache_miss_total[5m]))) * 100",
      "legendFormat": "Cache 命中率 %",
      "refId": "A"
    }
  ],
  "title": "LLM Cache 命中率",
  "type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "orange",
"mode": "fixed"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 0.1
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 5,
"targets": [
{
"datasource": "Prometheus",
"expr": "rate(llm_fallback_total[5m]) * 60",
"legendFormat": "Fallback/分",
"refId": "A"
}
],
"title": "LLM Fallback 次數",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"text": "DOWN"
},
"1": {
"color": "green",
"text": "UP"
}
},
"type": "value"
}
],
"thresholds": {
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 16
},
"id": 6,
"options": {
"colorMode": "background",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"textMode": "auto"
},
"targets": [
{
"datasource": "Prometheus",
"expr": "awoooi_alert_chain_healthy",
"legendFormat": "Alert Chain",
"refId": "A"
}
],
"title": "Alert Chain 健康狀態",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 60
},
{
"color": "red",
"value": 720
}
]
},
"unit": "m"
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 16
},
"id": 7,
"options": {
"colorMode": "background",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"textMode": "auto"
},
"targets": [
{
"datasource": "Prometheus",
"expr": "(time() - awoooi_alert_chain_last_success_timestamp) / 60",
"legendFormat": "距上次成功 (分鐘)",
"refId": "A"
}
],
"title": "Alert Chain 最後成功時間",
"type": "stat"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": [
"awoooi",
"ai",
"nvidia",
"llm"
],
"time": {
"from": "now-3h",
"to": "now"
},
"timezone": "Asia/Taipei",
"title": "AWOOOI AI 監控 — LLM + NVIDIA",
"uid": "awoooi-ai-monitoring"
},
"folderId": 0,
"overwrite": true
}

View File

@@ -0,0 +1,306 @@
{
"dashboard": {
"panels": [
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"max": 100,
"min": 0,
"thresholds": {
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"targets": [
{
"datasource": "Prometheus",
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}} CPU %",
"refId": "A"
}
],
"title": "CPU 使用率 %",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"max": 100,
"min": 0,
"thresholds": {
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 75
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"targets": [
{
"datasource": "Prometheus",
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "{{instance}} Memory %",
"refId": "A"
}
],
"title": "記憶體使用率 %",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"targets": [
{
"datasource": "Prometheus",
"expr": "count by(namespace) (kube_pod_info)",
"legendFormat": "{{namespace}}",
"refId": "A"
}
],
"title": "K3s Pod 數量 (by namespace)",
"type": "timeseries"
},
{
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "palette-classic"
      },
      "thresholds": {
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "red",
            "value": 1
          }
        ]
      },
      "unit": "short"
    }
  },
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 8
  },
  "id": 4,
  "targets": [
    {
      "datasource": "Prometheus",
      "expr": "sum by (namespace, pod) (increase(kube_pod_container_status_restarts_total[1h]))",
      "legendFormat": "{{namespace}}/{{pod}}",
      "refId": "A"
    }
  ],
  "title": "K3s Pod 重啟次數",
  "type": "timeseries"
},
{
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "thresholds": {
        "steps": [
          {
            "color": "red",
            "value": null
          },
          {
            "color": "yellow",
            "value": 10
          },
          {
            "color": "green",
            "value": 14
          }
        ]
      },
      "unit": "short"
    }
  },
  "gridPos": {
    "h": 6,
    "w": 6,
    "x": 0,
    "y": 16
  },
  "id": 5,
  "options": {
    "colorMode": "background",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ]
    },
    "textMode": "auto"
  },
  "targets": [
    {
      "datasource": "Prometheus",
      "expr": "count(up == 1) or vector(0)",
      "legendFormat": "UP",
      "refId": "A"
    }
  ],
  "title": "Prometheus Targets UP 數量",
  "type": "stat"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 16
},
"id": 6,
"options": {
"colorMode": "background",
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"textMode": "auto"
},
"targets": [
{
"datasource": "Prometheus",
"expr": "count(up == 0) or vector(0)",
"legendFormat": "DOWN",
"refId": "A"
}
],
"title": "Prometheus Targets DOWN",
"type": "stat"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"unit": "reqpm"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"targets": [
{
"datasource": "Prometheus",
"expr": "rate(http_requests_total{job=\"awoooi-api\"}[5m]) * 60",
"legendFormat": "{{method}} {{endpoint}}",
"refId": "A"
}
],
"title": "API 請求速率 (req/min)",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": [
"awoooi",
"infra",
"node",
"k3s"
],
"time": {
"from": "now-3h",
"to": "now"
},
"timezone": "Asia/Taipei",
"title": "AWOOOI 基礎設施監控 — Node + K3s",
"uid": "awoooi-infra-monitoring"
},
"folderId": 0,
"overwrite": true
}