diff --git a/ops/grafana/dashboards/ai-monitoring.json b/ops/grafana/dashboards/ai-monitoring.json new file mode 100644 index 00000000..cea25eaf --- /dev/null +++ b/ops/grafana/dashboards/ai-monitoring.json @@ -0,0 +1,347 @@ +{ + "dashboard": { + "panels": [ + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "reqpm" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(llm_call_total[5m]) * 60", + "legendFormat": "{{provider}} LLM 呼叫/分", + "refId": "A" + } + ], + "title": "LLM 呼叫速率 (req/min)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5000 + }, + { + "color": "red", + "value": 30000 + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "targets": [ + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.99, rate(llm_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "P99 延遲 ms", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.50, rate(llm_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "P50 延遲 ms", + "refId": "B" + } + ], + "title": "LLM 延遲 P50 / P99 (ms)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 15000 + }, + { + "color": "red", + "value": 45000 + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "targets": [ + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.99, rate(nvidia_tool_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "Nemotron P99 ms", + "refId": "A" + }, + 
{ + "datasource": "Prometheus", + "expr": "histogram_quantile(0.50, rate(nvidia_tool_call_latency_seconds_bucket[5m])) * 1000", + "legendFormat": "Nemotron P50 ms", + "refId": "B" + } + ], + "title": "NVIDIA Nemotron Tool Calling 延遲 (ms)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(wooo_llm_cache_hit_total[5m]) / (rate(wooo_llm_cache_hit_total[5m]) + rate(wooo_llm_cache_miss_total[5m])) * 100", + "legendFormat": "Cache 命中率 %", + "refId": "A" + } + ], + "title": "LLM Cache 命中率", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "orange", + "mode": "fixed" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(llm_fallback_total[5m]) * 60", + "legendFormat": "Fallback/分", + "refId": "A" + } + ], + "title": "LLM Fallback 速率 (次/分)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "colorMode": "background", + "orientation": "auto", + "reduceOptions": { + "calcs": 
[ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "awoooi_alert_chain_healthy", + "legendFormat": "Alert Chain", + "refId": "A" + } + ], + "title": "Alert Chain 健康狀態", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 60 + }, + { + "color": "red", + "value": 720 + } + ] + }, + "unit": "m" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "background", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "(time() - awoooi_alert_chain_last_success_timestamp) / 60", + "legendFormat": "距上次成功 (分鐘)", + "refId": "A" + } + ], + "title": "Alert Chain 最後成功時間", + "type": "stat" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "awoooi", + "ai", + "nvidia", + "llm" + ], + "time": { + "from": "now-3h", + "to": "now" + }, + "timezone": "Asia/Taipei", + "title": "AWOOOI AI 監控 — LLM + NVIDIA", + "uid": "awoooi-ai-monitoring" + }, + "folderId": 0, + "overwrite": true +} diff --git a/ops/grafana/dashboards/infra-monitoring.json b/ops/grafana/dashboards/infra-monitoring.json new file mode 100644 index 00000000..40f537a3 --- /dev/null +++ b/ops/grafana/dashboards/infra-monitoring.json @@ -0,0 +1,306 @@ +{ + "dashboard": { + "panels": [ + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "targets": [ + { + "datasource": "Prometheus", + "expr": "100 - 
(avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "{{instance}} CPU %", + "refId": "A" + } + ], + "title": "CPU 使用率 %", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "max": 100, + "min": 0, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "targets": [ + { + "datasource": "Prometheus", + "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "legendFormat": "{{instance}} Memory %", + "refId": "A" + } + ], + "title": "記憶體使用率 %", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "targets": [ + { + "datasource": "Prometheus", + "expr": "count by(namespace) (kube_pod_info)", + "legendFormat": "{{namespace}}", + "refId": "A" + } + ], + "title": "K3s Pod 數量 (by namespace)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "targets": [ + { + "datasource": "Prometheus", + "expr": "increase(kube_pod_container_status_restarts_total[1h])", + "legendFormat": "{{namespace}}/{{pod}}", + "refId": "A" + } + ], + "title": "K3s Pod 重啟次數", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": 
"green", + "value": 14 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "count(up == 1)", + "legendFormat": "UP", + "refId": "A" + } + ], + "title": "Prometheus Targets UP 數量", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "count(up == 0) or vector(0)", + "legendFormat": "DOWN", + "refId": "A" + } + ], + "title": "Prometheus Targets DOWN", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "reqpm" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(http_requests_total{job=\"awoooi-api\"}[5m]) * 60", + "legendFormat": "{{method}} {{endpoint}}", + "refId": "A" + } + ], + "title": "API 請求速率 (req/min)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "awoooi", + "infra", + "node", + "k3s" + ], + "time": { + "from": "now-3h", + "to": "now" + }, + "timezone": "Asia/Taipei", + "title": "AWOOOI 基礎設施監控 — Node + K3s", + "uid": "awoooi-infra-monitoring" + }, + "folderId": 0, + "overwrite": true +}