diff --git a/ops/grafana/dashboards/ai-monitoring.json b/ops/grafana/dashboards/ai-monitoring.json
new file mode 100644
index 00000000..cea25eaf
--- /dev/null
+++ b/ops/grafana/dashboards/ai-monitoring.json
@@ -0,0 +1,347 @@
+{
+  "dashboard": {
+    "panels": [
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "unit": "reqpm"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 0
+        },
+        "id": 1,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "rate(llm_call_total[5m]) * 60",
+            "legendFormat": "{{provider}} LLM 呼叫/分",
+            "refId": "A"
+          }
+        ],
+        "title": "LLM 呼叫速率 (req/min)",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 5000
+                },
+                {
+                  "color": "red",
+                  "value": 30000
+                }
+              ]
+            },
+            "unit": "ms"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 0
+        },
+        "id": 2,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "histogram_quantile(0.99, rate(llm_call_latency_seconds_bucket[5m])) * 1000",
+            "legendFormat": "P99 延遲 ms",
+            "refId": "A"
+          },
+          {
+            "datasource": "Prometheus",
+            "expr": "histogram_quantile(0.50, rate(llm_call_latency_seconds_bucket[5m])) * 1000",
+            "legendFormat": "P50 延遲 ms",
+            "refId": "B"
+          }
+        ],
+        "title": "LLM 延遲 P50 / P99 (ms)",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 15000
+                },
+                {
+                  "color": "red",
+                  "value": 45000
+                }
+              ]
+            },
+            "unit": "ms"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 8
+        },
+        "id": 3,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "histogram_quantile(0.99, rate(nvidia_tool_call_latency_seconds_bucket[5m])) * 1000",
+            "legendFormat": "Nemotron P99 ms",
+            "refId": "A"
+          },
+          {
+            "datasource": "Prometheus",
+            "expr": "histogram_quantile(0.50, rate(nvidia_tool_call_latency_seconds_bucket[5m])) * 1000",
+            "legendFormat": "Nemotron P50 ms",
+            "refId": "B"
+          }
+        ],
+        "title": "NVIDIA Nemotron Tool Calling 延遲 (ms)",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "max": 100,
+            "min": 0,
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "red",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 20
+                },
+                {
+                  "color": "green",
+                  "value": 50
+                }
+              ]
+            },
+            "unit": "percent"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 8
+        },
+        "id": 4,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "rate(wooo_llm_cache_hit_total[5m]) / (rate(wooo_llm_cache_hit_total[5m]) + rate(wooo_llm_cache_miss_total[5m])) * 100",
+            "legendFormat": "Cache 命中率 %",
+            "refId": "A"
+          }
+        ],
+        "title": "LLM Cache 命中率",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "fixedColor": "orange",
+              "mode": "fixed"
+            },
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 0.1
+                }
+              ]
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 16
+        },
+        "id": 5,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "rate(llm_fallback_total[5m]) * 60",
+            "legendFormat": "Fallback/分",
+            "refId": "A"
+          }
+        ],
+        "title": "LLM Fallback 速率 (次/分)",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "mappings": [
+              {
+                "options": {
+                  "0": {
+                    "color": "red",
+                    "text": "DOWN"
+                  },
+                  "1": {
+                    "color": "green",
+                    "text": "UP"
+                  }
+                },
+                "type": "value"
+              }
+            ],
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "red",
+                  "value": null
+                },
+                {
+                  "color": "green",
+                  "value": 1
+                }
+              ]
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 6,
+          "x": 12,
+          "y": 16
+        },
+        "id": 6,
+        "options": {
+          "colorMode": "background",
+          "orientation": "auto",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ]
+          },
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "awoooi_alert_chain_healthy",
+            "legendFormat": "Alert Chain",
+            "refId": "A"
+          }
+        ],
+        "title": "Alert Chain 健康狀態",
+        "type": "stat"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 60
+                },
+                {
+                  "color": "red",
+                  "value": 720
+                }
+              ]
+            },
+            "unit": "m"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 6,
+          "x": 18,
+          "y": 16
+        },
+        "id": 7,
+        "options": {
+          "colorMode": "background",
+          "orientation": "auto",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ]
+          },
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "(time() - awoooi_alert_chain_last_success_timestamp) / 60",
+            "legendFormat": "距上次成功 (分鐘)",
+            "refId": "A"
+          }
+        ],
+        "title": "Alert Chain 距上次成功 (分鐘)",
+        "type": "stat"
+      }
+    ],
+    "refresh": "30s",
+    "schemaVersion": 39,
+    "tags": [
+      "awoooi",
+      "ai",
+      "nvidia",
+      "llm"
+    ],
+    "time": {
+      "from": "now-3h",
+      "to": "now"
+    },
+    "timezone": "Asia/Taipei",
+    "title": "AWOOOI AI 監控 — LLM + NVIDIA",
+    "uid": "awoooi-ai-monitoring"
+  },
+  "folderId": 0,
+  "overwrite": true
+}
diff --git a/ops/grafana/dashboards/infra-monitoring.json b/ops/grafana/dashboards/infra-monitoring.json
new file mode 100644
index 00000000..40f537a3
--- /dev/null
+++ b/ops/grafana/dashboards/infra-monitoring.json
@@ -0,0 +1,306 @@
+{
+  "dashboard": {
+    "panels": [
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "max": 100,
+            "min": 0,
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 70
+                },
+                {
+                  "color": "red",
+                  "value": 90
+                }
+              ]
+            },
+            "unit": "percent"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 0
+        },
+        "id": 1,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
+            "legendFormat": "{{instance}} CPU %",
+            "refId": "A"
+          }
+        ],
+        "title": "CPU 使用率 %",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "max": 100,
+            "min": 0,
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 75
+                },
+                {
+                  "color": "red",
+                  "value": 90
+                }
+              ]
+            },
+            "unit": "percent"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 0
+        },
+        "id": 2,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
+            "legendFormat": "{{instance}} Memory %",
+            "refId": "A"
+          }
+        ],
+        "title": "記憶體使用率 %",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 8
+        },
+        "id": 3,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "count by(namespace) (kube_pod_info)",
+            "legendFormat": "{{namespace}}",
+            "refId": "A"
+          }
+        ],
+        "title": "K3s Pod 數量 (by namespace)",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 1
+                }
+              ]
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 8
+        },
+        "id": 4,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "increase(kube_pod_container_status_restarts_total[1h])",
+            "legendFormat": "{{namespace}}/{{pod}}",
+            "refId": "A"
+          }
+        ],
+        "title": "K3s Pod 重啟次數 (近 1 小時)",
+        "type": "timeseries"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "red",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 10
+                },
+                {
+                  "color": "green",
+                  "value": 14
+                }
+              ]
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 6,
+          "w": 6,
+          "x": 0,
+          "y": 16
+        },
+        "id": 5,
+        "options": {
+          "colorMode": "background",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ]
+          },
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "count(up == 1)",
+            "legendFormat": "UP",
+            "refId": "A"
+          }
+        ],
+        "title": "Prometheus Targets UP 數量",
+        "type": "stat"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "thresholds"
+            },
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 1
+                }
+              ]
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 6,
+          "w": 6,
+          "x": 6,
+          "y": 16
+        },
+        "id": 6,
+        "options": {
+          "colorMode": "background",
+          "reduceOptions": {
+            "calcs": [
+              "lastNotNull"
+            ]
+          },
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "count(up == 0) or vector(0)",
+            "legendFormat": "DOWN",
+            "refId": "A"
+          }
+        ],
+        "title": "Prometheus Targets DOWN",
+        "type": "stat"
+      },
+      {
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "unit": "reqpm"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 16
+        },
+        "id": 7,
+        "targets": [
+          {
+            "datasource": "Prometheus",
+            "expr": "rate(http_requests_total{job=\"awoooi-api\"}[5m]) * 60",
+            "legendFormat": "{{method}} {{endpoint}}",
+            "refId": "A"
+          }
+        ],
+        "title": "API 請求速率 (req/min)",
+        "type": "timeseries"
+      }
+    ],
+    "refresh": "30s",
+    "schemaVersion": 39,
+    "tags": [
+      "awoooi",
+      "infra",
+      "node",
+      "k3s"
+    ],
+    "time": {
+      "from": "now-3h",
+      "to": "now"
+    },
+    "timezone": "Asia/Taipei",
+    "title": "AWOOOI 基礎設施監控 — Node + K3s",
+    "uid": "awoooi-infra-monitoring"
+  },
+  "folderId": 0,
+  "overwrite": true
+}