新增 AI 自動化 Grafana 儀表板

2026-04-30 00:06:24 +08:00
parent d5f4fd7198
commit 5a61c020e3
13 changed files with 866 additions and 15 deletions
--- a/CONSTITUTION.md
+++ b/CONSTITUTION.md
@@ -2,7 +2,7 @@

 > 本文件定義專案開發的核心準則與不可違反的規範
 > **建立日期**: 2026-01-12
-> **當前版本**: V10.9 (四 AI Agent 自動化 Smoke 每日摘要版)
+> **當前版本**: V10.10 (四 AI Agent 自動化 Grafana 觀測版)
 > **最後更新**: 2026-04-29

 ---
@@ -347,6 +347,7 @@
 - ✅ **正確**：EventRouter 是告警、降級、去重、通知 replay 與 L2 safe action 的入口
 - ✅ **正確**：AutoHeal 是自癒副作用入口，失敗時必須安全降級為 alert / log / file queue
 - ✅ **正確**：L2 safe action 必須可審計、可回放、低副作用
+- ✅ **正確**：AI 自動化觀測變更需同步 `/metrics`、Smoke dashboard 與 Grafana provisioning，避免告警閉環變成黑盒
 - ❌ **禁止**：自動 restart / stop / recreate `momo-db` 或 `momo-postgres`
 - ❌ **禁止**：AI 分析失敗導致 Telegram 告警完全不送出
 - **依據**：ADR-012、ADR-013、ADR-018
--- a/TODO_NEXT_STEPS.txt
+++ b/TODO_NEXT_STEPS.txt
@@ -7,17 +7,18 @@
   - ADR-018：四 AI Agent 自動化控制面立案。
   - Memory：新增 `docs/memory/ai_automation_closure_20260429.md`。
   - Guide/Skills 替代：新增 `docs/guides/ai_automation_session_sop.md`。
-   - SOT：更新 `docs/AI_INTELLIGENCE_MODULE_SOT.md` 至 V10.9 AI Automation Smoke Summary 架構。
+   - SOT：更新 `docs/AI_INTELLIGENCE_MODULE_SOT.md` 至 V10.10 AI Automation Grafana Observability 架構。
   - Codex 規則：更新 `AGENTS.md`、`CONSTITUTION.md`、ADR/memory 索引。
   - Prometheus 指標化：新增 EventRouter / AutoHeal / safe action / replay in-process metrics，並接入 `/metrics`。
   - 線上 smoke dashboard：新增 `/ai_automation_smoke` 與 `/api/ai-automation/smoke`，覆蓋 EventRouter、AutoHeal、NemoTron fallback、OpenClaw embedding queue、ElephantAlpha HITL。
   - Smoke 趨勢保存：`/api/ai-automation/smoke` 每次快檢追加 JSONL 精簡紀錄，dashboard 顯示最近趨勢。
   - Smoke 趨勢管理：新增 JSONL 匯出、清理與每日摘要。
   - Smoke 每日摘要：新增 Telegram 手動推播 API 與 momo-scheduler 每日 09:10 排程入口。
+   - Grafana 視覺化：新增 `MOMO AI Automation Overview` provisioning dashboard，覆蓋 EventRouter、safe action、replay、AutoHeal 指標。

 【下次待辦】
-   - Superset / Grafana 視覺化：`momo_ai_event_router_dispatch_total`、`momo_ai_event_router_latency_ms_*`、`momo_ai_autoheal_action_total`。
-   - Grafana/Superset panel 設定與 Smoke 摘要成效觀察。
+   - 將 Grafana provisioning dashboard 部署到 110/188 監控環境後，觀察 panel 是否都有資料。
+   - Superset panel 設定與 Smoke 摘要成效觀察。

 ================================================================================
                      品牌資產最終處理與維護 (Phase 7) [DONE]
--- a/app.py
+++ b/app.py
@@ -95,9 +95,8 @@ except Exception as e:
    sys_log.error(f"無法檢測磁碟空間: {e}")

 # 🚩 系統版本定義 (備份與顯示用)
-# 🚩 2026-04-29 V10.9: AI Smoke 每日摘要 — Telegram 手動推播 /
-#                  scheduler 09:10 排程入口
-SYSTEM_VERSION = "V10.9"
+# 🚩 2026-04-29 V10.10: AI 自動化 Grafana dashboard provisioning
+SYSTEM_VERSION = "V10.10"

 # ==========================================
 # 🔒 SQL Injection 防護函數
--- a/config.py
+++ b/config.py
@@ -253,7 +253,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
 # ==========================================
 # 系統版本與路徑
 # ==========================================
-SYSTEM_VERSION = "V10.9"
+SYSTEM_VERSION = "V10.10"
 LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
 public_url = PUBLIC_URL  # 用於模板顯示

--- a/docker/grafana/provisioning/dashboards/json/ai-automation-overview.json
+++ b/docker/grafana/provisioning/dashboards/json/ai-automation-overview.json
@@ -0,0 +1,779 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [
+    {
+      "icon": "external link",
+      "tags": [],
+      "targetBlank": true,
+      "title": "AI Automation Smoke Dashboard",
+      "tooltip": "App read-only smoke dashboard",
+      "type": "link",
+      "url": "/ai_automation_smoke"
+    }
+  ],
+  "liveNow": false,
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 100,
+      "panels": [],
+      "title": "AI 自動化總覽",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "最近 1 小時 EventRouter 接收並處理的事件量。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 0,
+        "y": 1
+      },
+      "id": 1,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(momo_ai_event_router_dispatch_total[1h]))",
+          "legendFormat": "dispatch / 1h",
+          "refId": "A"
+        }
+      ],
+      "title": "EventRouter 1h 事件量",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "最近 24 小時 L2 SAFE_ACTIONS 執行量，依 action/status 分組。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 6,
+        "y": 1
+      },
+      "id": 2,
+      "options": {
+        "displayMode": "gradient",
+        "maxVizHeight": 300,
+        "minVizHeight": 16,
+        "minVizWidth": 8,
+        "namePlacement": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showUnfilled": true,
+        "valueMode": "color"
+      },
+      "targets": [
+        {
+          "expr": "sum by (action, status) (increase(momo_ai_event_router_safe_action_total[24h]))",
+          "legendFormat": "{{action}} / {{status}}",
+          "refId": "A"
+        }
+      ],
+      "title": "L2 Safe Action 24h",
+      "type": "bargauge"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "最近 24 小時 Telegram queue replay 狀態。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 12,
+        "y": 1
+      },
+      "id": 3,
+      "options": {
+        "displayMode": "gradient",
+        "maxVizHeight": 300,
+        "minVizHeight": 16,
+        "minVizWidth": 8,
+        "namePlacement": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showUnfilled": true,
+        "valueMode": "color"
+      },
+      "targets": [
+        {
+          "expr": "sum by (status) (increase(momo_ai_event_router_replay_total[24h]))",
+          "legendFormat": "{{status}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Telegram Replay 24h",
+      "type": "bargauge"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "最近 24 小時 AutoHeal 動作結果，依 action/error_type/result 分組。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 18,
+        "y": 1
+      },
+      "id": 4,
+      "options": {
+        "displayMode": "gradient",
+        "maxVizHeight": 300,
+        "minVizHeight": 16,
+        "minVizWidth": 8,
+        "namePlacement": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showUnfilled": true,
+        "valueMode": "color"
+      },
+      "targets": [
+        {
+          "expr": "sum by (action, error_type, result) (increase(momo_ai_autoheal_action_total[24h]))",
+          "legendFormat": "{{action}} / {{error_type}} / {{result}}",
+          "refId": "A"
+        }
+      ],
+      "title": "AutoHeal Action 24h",
+      "type": "bargauge"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 6
+      },
+      "id": 101,
+      "panels": [],
+      "title": "EventRouter 路由與延遲",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "EventRouter 每 5 分鐘新增 dispatch 數，依 outcome 分組。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 15,
+            "gradientMode": "opacity",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "normal"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 7
+      },
+      "id": 5,
+      "options": {
+        "legend": {
+          "calcs": [
+            "lastNotNull",
+            "sum"
+          ],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "expr": "sum by (outcome) (increase(momo_ai_event_router_dispatch_total[5m]))",
+          "legendFormat": "{{outcome}}",
+          "refId": "A"
+        }
+      ],
+      "title": "EventRouter Dispatch by Outcome",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "EventRouter 平均延遲與最大延遲，依 tier/event_type 分組。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 3000
+              },
+              {
+                "color": "red",
+                "value": 10000
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 7
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "expr": "sum by (tier, event_type) (increase(momo_ai_event_router_latency_ms_sum[5m])) / clamp_min(sum by (tier, event_type) (increase(momo_ai_event_router_latency_ms_count[5m])), 1)",
+          "legendFormat": "avg {{tier}} / {{event_type}}",
+          "refId": "A"
+        },
+        {
+          "expr": "max by (tier, event_type) (momo_ai_event_router_latency_ms_max)",
+          "legendFormat": "max {{tier}} / {{event_type}}",
+          "refId": "B"
+        }
+      ],
+      "title": "EventRouter Latency",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "最近 1 小時 dispatch 明細表，方便追 L1/L2/L3 與事件類型。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "custom": {
+            "align": "auto",
+            "cellOptions": {
+              "type": "auto"
+            },
+            "inspect": false
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 15
+      },
+      "id": 7,
+      "options": {
+        "cellHeight": "sm",
+        "footer": {
+          "countRows": false,
+          "enablePagination": true,
+          "fields": "",
+          "reducer": [
+            "sum"
+          ],
+          "show": false
+        },
+        "showHeader": true
+      },
+      "targets": [
+        {
+          "expr": "sum by (tier, event_type, outcome) (increase(momo_ai_event_router_dispatch_total[1h]))",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "{{tier}} / {{event_type}} / {{outcome}}",
+          "refId": "A"
+        }
+      ],
+      "title": "EventRouter 1h 明細",
+      "type": "table"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 23
+      },
+      "id": 102,
+      "panels": [],
+      "title": "AutoHeal 自癒觀測",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "AutoHeal 每 5 分鐘平均耗時與最大耗時。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "smooth",
+            "lineWidth": 2,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 3000
+              },
+              {
+                "color": "red",
+                "value": 10000
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 24
+      },
+      "id": 8,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "targets": [
+        {
+          "expr": "sum by (action, error_type) (increase(momo_ai_autoheal_duration_ms_sum[5m])) / clamp_min(sum by (action, error_type) (increase(momo_ai_autoheal_duration_ms_count[5m])), 1)",
+          "legendFormat": "avg {{action}} / {{error_type}}",
+          "refId": "A"
+        },
+        {
+          "expr": "max by (action, error_type) (momo_ai_autoheal_duration_ms_max)",
+          "legendFormat": "max {{action}} / {{error_type}}",
+          "refId": "B"
+        }
+      ],
+      "title": "AutoHeal Duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "description": "AutoHeal 最近 24 小時 action 明細表，確認是否有高風險類型集中。",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "custom": {
+            "align": "auto",
+            "cellOptions": {
+              "type": "auto"
+            },
+            "inspect": false
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 24
+      },
+      "id": 9,
+      "options": {
+        "cellHeight": "sm",
+        "footer": {
+          "countRows": false,
+          "enablePagination": true,
+          "fields": "",
+          "reducer": [
+            "sum"
+          ],
+          "show": false
+        },
+        "showHeader": true
+      },
+      "targets": [
+        {
+          "expr": "sum by (action, error_type, result) (increase(momo_ai_autoheal_action_total[24h]))",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "{{action}} / {{error_type}} / {{result}}",
+          "refId": "A"
+        }
+      ],
+      "title": "AutoHeal 24h 明細",
+      "type": "table"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 32
+      },
+      "id": 103,
+      "panels": [],
+      "title": "操作說明",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {},
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 24,
+        "x": 0,
+        "y": 33
+      },
+      "id": 10,
+      "options": {
+        "code": {
+          "language": "plaintext",
+          "showLineNumbers": false,
+          "showMiniMap": false
+        },
+        "content": "資料來源：momo-app /metrics。此 dashboard 只讀 Prometheus，不觸發 AutoHeal、不清理 queue、不碰 momo-db。Smoke 狀態仍以 /ai_automation_smoke 與 /api/ai-automation/smoke 為準；Grafana 用來觀察 dispatch、latency、safe action、replay、AutoHeal 趨勢。",
+        "mode": "markdown"
+      },
+      "pluginVersion": "10.0.0",
+      "title": "AI 自動化觀測邊界",
+      "type": "text"
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "tags": [
+    "momo",
+    "ai-automation",
+    "event-router",
+    "autoheal"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "Asia/Taipei",
+  "title": "MOMO AI Automation Overview",
+  "uid": "momo-ai-automation-overview",
+  "version": 1,
+  "weekStart": ""
+}
--- a/docker/prometheus/prometheus.yml
+++ b/docker/prometheus/prometheus.yml
@@ -71,9 +71,7 @@ scrape_configs:
  # ===========================================================================

  # ---------------------------------------------------------------------------
-  # Momo Flask 應用 - 資料庫與應用指標
-  # ---------------------------------------------------------------------------
-  # Momo Flask 應用 - 健康檢查 (應用未提供 /metrics，改用 /health)
+  # Momo Flask 應用 - 資料庫、應用與 AI 自動化指標
  # ---------------------------------------------------------------------------
  - job_name: 'momo-app'
    static_configs:
--- a/docs/AI_INTELLIGENCE_MODULE_SOT.md
+++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md
@@ -1,8 +1,8 @@
 # MOMO PRO — AI 競價情報模組 Single Source of Truth

 > **最後更新**: 2026-04-29 (台北時間)
-> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary 具測試覆蓋
-> **適用版本**: V10.9 AI Automation Smoke Summary 架構
+> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning 具測試覆蓋
+> **適用版本**: V10.10 AI Automation Grafana Observability 架構

 ---

@@ -67,6 +67,7 @@ SQL漏斗(~300筆)
 - Smoke API 會將最近快檢結果保存到 JSONL，dashboard 顯示最近狀態趨勢。
 - Smoke history 支援 JSONL 匯出、清理與每日 OK / Warning / Critical 摘要。
 - Smoke 每日摘要支援手動 Telegram 推播，並由 `momo-scheduler` 每日 09:10 呼叫 `run_ai_smoke_daily_summary_task()`。
+- Grafana provisioning 新增 `docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`，觀測 EventRouter dispatch/latency、safe action、Telegram replay 與 AutoHeal action/duration。

 ---

--- a/docs/adr/ADR-012-agent-action-ladder.md
+++ b/docs/adr/ADR-012-agent-action-ladder.md
@@ -148,7 +148,8 @@ L1 Hermes 掛  → L0 模板直出 + 🟡 「AI 分析暫不可用」
 - 2026-04-29 已補 smoke 結果 JSONL 保存與 dashboard 趨勢視覺化。
 - 2026-04-29 已補 smoke history JSONL 匯出、清理與每日摘要。
 - 2026-04-29 已補 smoke 每日摘要 Telegram 手動推播與 momo-scheduler 09:10 排程入口。
-- 尚未完成：Grafana/Superset 視覺化面板與推播成效觀察。
+- 2026-04-29 已補 Grafana provisioning dashboard：`MOMO AI Automation Overview` 覆蓋 EventRouter、L2 safe action、Telegram replay 與 AutoHeal Prometheus 指標。
+- 尚未完成：Superset 視覺化面板、Grafana 線上部署後資料觀察與 Smoke 摘要推播成效觀察。

 ## References
 - `services/event_router.py` — 分流入口（Phase 1）
--- a/docs/guides/ai_automation_session_sop.md
+++ b/docs/guides/ai_automation_session_sop.md
@@ -26,6 +26,7 @@
 - Telegram 失敗必須可暫存與 replay。
 - EventRouter / AutoHeal 變更必須更新 `services/ai_automation_metrics.py` 指標或確認既有指標已覆蓋。
 - AI 自動化閉環變更必須確認 `/api/ai-automation/smoke` 與 `/ai_automation_smoke` 仍能反映新狀態。
+- AI 自動化 Prometheus 指標變更必須同步檢查 `docker/grafana/provisioning/dashboards/json/ai-automation-overview.json` 是否需要新增 panel 或查詢。
 - Smoke dashboard 會保存 JSONL 趨勢；若新增檢查項目，要確保 history compact record 仍保持小而可讀。
 - Smoke history 管理只能操作 `MOMO_AI_AUTOMATION_SMOKE_HISTORY` 指向的 JSONL，不得清理 DB 或 EventRouter queue。
 - Smoke 每日摘要推播只讀 history，不得重新執行 smoke，也不得把完整 details 寫進 Telegram。
@@ -41,6 +42,7 @@
 - 若有架構決策，新增 ADR 並更新 `docs/adr/README.md`。
 - 若有長期實況，更新 `docs/memory/*.md` 與 `docs/memory/README.md`。
 - 若 AI 架構事實改變，更新 `docs/AI_INTELLIGENCE_MODULE_SOT.md`。
+- 若 AI 自動化可觀測性改變，更新 Grafana provisioning JSON 與對應測試。
 - 若 Codex 工作規則改變，更新 `AGENTS.md`；若紅線改變，更新 `CONSTITUTION.md`。
 - 提交前跑 `git diff --check` 與相關 pytest。
 - 使用者要求推版時，commit 後 push 到遠端。
--- a/docs/memory/README.md
+++ b/docs/memory/README.md
@@ -13,7 +13,7 @@
 | 檔案 | 用途 | 何時閱讀 |
 |---|---|---|
 | `history_logs.md` | 重大里程碑與歷史脈絡 | 要理解演進背景、排查「為何變成這樣」時 |
-| `ai_automation_closure_20260429.md` | 四 AI Agent 自動化閉環與 2026-04-29 修復實況 | 接續 AI 自動化、EventRouter、AutoHeal、OpenClaw memory、ElephantAlpha 編排時 |
+| `ai_automation_closure_20260429.md` | 四 AI Agent 自動化閉環、Smoke、metrics 與 Grafana 觀測實況 | 接續 AI 自動化、EventRouter、AutoHeal、OpenClaw memory、ElephantAlpha 編排、可觀測性時 |
 | `credentials_passbook.md` | 伺服器、帳密、埠位對照 | 需要維運、部署、憑證核對時 |
 | `feedback_db_metadata_import.md` | SQLAlchemy metadata / `create_all()` 漏表鐵律 | 新增 model、修 schema、排查 fresh env 漏表時 |
 | `project_phase3f_cleanup_roadmap.md` | ADR-017 執行矩陣與階段紅線 | 正在做 3f 模組化收尾時 |
--- a/docs/memory/ai_automation_closure_20260429.md
+++ b/docs/memory/ai_automation_closure_20260429.md
@@ -14,6 +14,7 @@
 - Smoke API 會保存最近快檢 JSONL 趨勢，dashboard 顯示 OK / Warning / Critical 最近分布。
 - Smoke history 已支援 JSONL 匯出、清理與每日摘要；清理只影響 smoke history，不碰 DB 或 EventRouter queue。
 - Smoke 每日摘要已支援手動 Telegram 推播與 scheduler 09:10 排程入口；摘要只讀 JSONL history。
+- Grafana provisioning 已新增 `MOMO AI Automation Overview`，由 Prometheus `/metrics` 觀測 EventRouter、safe action、replay 與 AutoHeal 趨勢。

 ## 已落地範圍

@@ -30,6 +31,7 @@
 - Smoke history 只保存精簡紀錄，不保存完整 details，避免長期檔案膨脹與敏感資訊堆積。
 - Export API 回傳 `application/x-ndjson`，clear API 只刪除 `MOMO_AI_AUTOMATION_SMOKE_HISTORY` 指向檔案。
 - Daily summary API：`POST /api/ai-automation/smoke/daily-summary/send`。
+- Grafana dashboard 檔案：`docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`；provider 會載入 JSON 目錄，不需要修改 dashboard provider。

 ## 驗證紀錄

@@ -38,6 +40,8 @@
 - 2026-04-29 AI smoke trend 批次：`5 passed`（smoke + metrics）。
 - 2026-04-29 AI smoke management 批次：`7 passed`（smoke + metrics）。
 - 2026-04-29 AI smoke summary 批次：`9 passed`（smoke + metrics）。
+- 2026-04-29 AI Grafana observability 批次：`3 passed`（Grafana dashboard JSON 結構與必要 metric）。
+- 2026-04-29 AI Grafana observability + AI core 回歸：`36 passed`，collect-only：`36 tests collected`。
 - 2026-04-29 L2 安全記憶批次：`24 passed`。
 - collect-only：`48 tests collected`。
 - `git diff --check` 已通過。
@@ -54,6 +58,11 @@
 - `5b25f55` 補齊 EventRouter 失敗通知回放
 - `162a76b` 落地 L2 安全記憶動作
 - `d58e4d0` 同步四 Agent AI 自動化治理紀錄
+- `e6a1c9d` 補齊 AI 自動化可觀測性指標
+- `cde8b0c` 新增 AI 自動化 Smoke Dashboard
+- `81159b5` 保存 AI Smoke 趨勢紀錄
+- `10bbd55` 補齊 AI Smoke 趨勢管理
+- `d5f4fd7` 加入 AI Smoke 每日摘要推播

 ## 下次進場先看

--- a/docs/memory/history_logs.md
+++ b/docs/memory/history_logs.md
@@ -29,6 +29,7 @@
 - **Smoke 趨勢保存**: Smoke API 追加 JSONL 精簡紀錄，dashboard 顯示最近 OK / Warning / Critical 趨勢。
 - **Smoke 趨勢管理**: Dashboard 增加 JSONL 匯出、清理與每日摘要，清理範圍限定 smoke history 檔。
 - **Smoke 每日摘要推播**: 新增 Telegram 手動推播 API 與 momo-scheduler 每日 09:10 摘要任務，只讀 smoke history。
+- **Grafana AI 觀測**: 新增 `MOMO AI Automation Overview` provisioning dashboard，覆蓋 EventRouter、safe action、replay、AutoHeal Prometheus 指標。

 ### 2026-04-28~29：Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除
 - **app.py 縮減 -10.8%**: 7,386 → 6,590 行，11 commits 全綠零 502。
--- a/tests/test_grafana_ai_automation_dashboard.py
+++ b/tests/test_grafana_ai_automation_dashboard.py
@@ -0,0 +1,59 @@
+import json
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]  # grandparent of this file, i.e. the directory that contains tests/
+DASHBOARD_PATH = ROOT / "docker/grafana/provisioning/dashboards/json/ai-automation-overview.json"  # dashboard JSON file read by every test below
+
+
+def _panel_targets(dashboard: dict):  # Generator: yields each entry of every top-level panel's "targets" list.
+    for panel in dashboard.get("panels", []):  # NOTE(review): no recursion -- panels nested inside a row's own "panels" list are not visited; confirm that is acceptable
+        yield from panel.get("targets", [])  # panels without a "targets" key (e.g. the row panels) contribute nothing
+
+
+def test_ai_automation_grafana_dashboard_is_valid_json():  # The dashboard file must parse as JSON and carry the expected identity fields.
+    dashboard = json.loads(DASHBOARD_PATH.read_text(encoding="utf-8"))  # a malformed file makes json.loads raise, which fails the test
+
+    assert dashboard["uid"] == "momo-ai-automation-overview"  # fixed uid expected for this dashboard
+    assert dashboard["title"] == "MOMO AI Automation Overview"  # title as referenced in the project docs
+    assert "ai-automation" in dashboard["tags"]  # at least this tag must be present; other tags are not checked
+    assert dashboard["timezone"] == "Asia/Taipei"  # dashboard-level timezone setting
+
+
+def test_ai_automation_grafana_dashboard_tracks_required_metrics():  # Every required metric name must appear in at least one panel query.
+    dashboard = json.loads(DASHBOARD_PATH.read_text(encoding="utf-8"))
+    expressions = "\n".join(  # one newline-separated string holding every query expression
+        target["expr"]
+        for target in _panel_targets(dashboard)
+        if target.get("expr")  # skip targets whose "expr" is missing or empty
+    )
+
+    for metric in [  # metric names the dashboard is required to query
+        "momo_ai_event_router_dispatch_total",
+        "momo_ai_event_router_latency_ms_count",
+        "momo_ai_event_router_latency_ms_sum",
+        "momo_ai_event_router_latency_ms_max",
+        "momo_ai_event_router_safe_action_total",
+        "momo_ai_event_router_replay_total",
+        "momo_ai_autoheal_action_total",
+        "momo_ai_autoheal_duration_ms_count",
+        "momo_ai_autoheal_duration_ms_sum",
+        "momo_ai_autoheal_duration_ms_max",
+    ]:
+        assert metric in expressions  # plain substring match: a longer name containing this one would also satisfy it
+
+
+def test_ai_automation_grafana_dashboard_uses_prometheus_datasource():  # Every top-level panel that has queries must point at the "prometheus" datasource uid.
+    dashboard = json.loads(DASHBOARD_PATH.read_text(encoding="utf-8"))
+
+    metric_panels = [  # top-level panels with a non-empty "targets" list
+        panel
+        for panel in dashboard["panels"]
+        if panel.get("targets")  # drops panels with no queries, such as rows and the text panel
+    ]
+
+    assert metric_panels  # guard: all() over an empty list would pass vacuously
+    assert all(
+        panel.get("datasource", {}).get("uid") == "prometheus"  # only the panel-level datasource is inspected, not per-target overrides
+        for panel in metric_panels
+    )