Files
awoooi/ops/monitoring/grafana/dashboards/ollama_failover.json
Your Name 4111ea4f9f
All checks were successful
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / tests (push) Successful in 1m13s
CD Pipeline / build-and-deploy (push) Successful in 3m36s
CD Pipeline / post-deploy-checks (push) Successful in 1m20s
fix(ai): remove 188 ollama provider
2026-05-06 14:34:48 +08:00

296 lines
9.0 KiB
JSON
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"__inputs": [],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
},
{
"type": "panel",
"id": "piechart",
"name": "Pie chart",
"version": ""
},
{
"type": "panel",
"id": "barchart",
"name": "Bar chart",
"version": ""
}
],
"annotations": {
"list": []
},
"description": "Ollama 容災監控 — 可用性、推理延遲、AI 路由分布、Failover/Recovery 觸發 | P2.3 2026-04-26 台北時區",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 50 },
{ "color": "green", "value": 100 }
]
},
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 0 },
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"title": "Ollama 可用性",
"description": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} × 100\n- 綠色 100% = 主機在線\n- 紅色 0% = 主機離線(容災應已觸發)\n\n資料來源: Prometheus scrape job ollama_gcp_a / ollama_gcp_b / ollama_local",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=~\"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111\"} * 100",
"legendFormat": "{{ job }}",
"refId": "A"
}
],
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "延遲 (秒)",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 10 },
{ "color": "red", "value": 30 }
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 0 },
"id": 2,
"options": {
"legend": {
"calcs": ["lastNotNull", "max"],
"displayMode": "table",
"placement": "bottom"
},
"tooltip": { "mode": "multi", "sort": "none" }
},
"title": "推理延遲 P50 / P99",
"description": "histogram_quantile(0.5/0.99, rate(ollama_inference_duration_seconds_bucket[5m]))\n- P50 > 10s = SLOW 門檻\n- P99 > 30s = DEGRADED 門檻,觸發 failover\n\n⚠ BACKLOG: ollama_inference_duration_seconds_bucket 尚未暴露,面板會顯示 No Data 直到 Part 3 backlog 補完",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "histogram_quantile(0.5, rate(ollama_inference_duration_seconds_bucket[5m]))",
"legendFormat": "P50",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "histogram_quantile(0.99, rate(ollama_inference_duration_seconds_bucket[5m]))",
"legendFormat": "P99",
"refId": "B"
}
],
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"hideFrom": { "legend": false, "tooltip": false, "viz": false }
},
"mappings": [],
"unit": "reqps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 6 },
"id": 3,
"options": {
"displayLabels": ["name", "percent"],
"legend": {
"displayMode": "table",
"placement": "right",
"values": ["value", "percent"]
},
"pieType": "pie",
"tooltip": { "mode": "single", "sort": "none" }
},
"title": "AI Provider 路由分布",
"description": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))\n- 正常狀態: ollama / ollama_gcp_a 佔大多數\n- failover 中: ollama_gcp_b / ollama_local / gemini 比例上升\n- 全走 gemini = Ollama provider pool 完全 offline\n\n資料來源: OLLAMA_FAILOVER_TRIGGERED_TOTAL + AI_ROUTER_PROVIDER_TOTAL (src/core/metrics.py)",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum by (provider) (rate(ai_router_selected_provider_total[5m]))",
"legendFormat": "{{ provider }}",
"refId": "A"
}
],
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisLabel": "次數/小時",
"fillOpacity": 80,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineWidth": 1
},
"mappings": [],
"unit": "short"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Failover" },
"properties": [
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "Recovery" },
"properties": [
{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }
]
}
]
},
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 6 },
"id": 4,
"options": {
"barRadius": 0.05,
"barWidth": 0.7,
"groupWidth": 0.7,
"legend": {
"calcs": ["sum"],
"displayMode": "list",
"placement": "bottom"
},
"orientation": "auto",
"tooltip": { "mode": "multi", "sort": "none" },
"xTickLabelRotation": 0
},
"title": "Failover / Recovery 觸發次數",
"description": "橘色 = ollama_failover_triggered_total (切離 111)\n綠色 = ollama_recovery_triggered_total (切回 111)\n\n正常狀態兩條都接近 0。\n橘升後緊跟綠升 = auto recovery 正常工作。\n橘升但綠不升 = OllamaRecoveryStuck看 alert。\n\n資料來源: src/core/metrics.py OLLAMA_FAILOVER_TRIGGERED_TOTAL / OLLAMA_RECOVERY_TRIGGERED_TOTAL",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(rate(ollama_failover_triggered_total[1h])) * 3600",
"legendFormat": "Failover",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(rate(ollama_recovery_triggered_total[1h])) * 3600",
"legendFormat": "Recovery",
"refId": "B"
}
],
"type": "barchart"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["ollama", "failover", "aiops", "p2.3"],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": { "from": "now-3h", "to": "now" },
"timepicker": {},
"timezone": "Asia/Taipei",
"title": "Ollama 容災監控",
"uid": "ollama-failover-p23",
"version": 1
}