From 5a61c020e3bdcd369019f0b8caca34e50e4ce496 Mon Sep 17 00:00:00 2001 From: OoO Date: Thu, 30 Apr 2026 00:06:24 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=20AI=20=E8=87=AA=E5=8B=95?= =?UTF-8?q?=E5=8C=96=20Grafana=20=E5=84=80=E8=A1=A8=E6=9D=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CONSTITUTION.md | 3 +- TODO_NEXT_STEPS.txt | 7 +- app.py | 5 +- config.py | 2 +- .../json/ai-automation-overview.json | 779 ++++++++++++++++++ docker/prometheus/prometheus.yml | 4 +- docs/AI_INTELLIGENCE_MODULE_SOT.md | 5 +- docs/adr/ADR-012-agent-action-ladder.md | 3 +- docs/guides/ai_automation_session_sop.md | 2 + docs/memory/README.md | 2 +- docs/memory/ai_automation_closure_20260429.md | 9 + docs/memory/history_logs.md | 1 + tests/test_grafana_ai_automation_dashboard.py | 59 ++ 13 files changed, 866 insertions(+), 15 deletions(-) create mode 100644 docker/grafana/provisioning/dashboards/json/ai-automation-overview.json create mode 100644 tests/test_grafana_ai_automation_dashboard.py diff --git a/CONSTITUTION.md b/CONSTITUTION.md index e073b3c..2b9290d 100644 --- a/CONSTITUTION.md +++ b/CONSTITUTION.md @@ -2,7 +2,7 @@ > 本文件定義專案開發的核心準則與不可違反的規範 > **建立日期**: 2026-01-12 -> **當前版本**: V10.9 (四 AI Agent 自動化 Smoke 每日摘要版) +> **當前版本**: V10.10 (四 AI Agent 自動化 Grafana 觀測版) > **最後更新**: 2026-04-29 --- @@ -347,6 +347,7 @@ - ✅ **正確**:EventRouter 是告警、降級、去重、通知 replay 與 L2 safe action 的入口 - ✅ **正確**:AutoHeal 是自癒副作用入口,失敗時必須安全降級為 alert / log / file queue - ✅ **正確**:L2 safe action 必須可審計、可回放、低副作用 +- ✅ **正確**:AI 自動化觀測變更需同步 `/metrics`、Smoke dashboard 與 Grafana provisioning,避免告警閉環變成黑盒 - ❌ **禁止**:自動 restart / stop / recreate `momo-db` 或 `momo-postgres` - ❌ **禁止**:AI 分析失敗導致 Telegram 告警完全不送出 - **依據**:ADR-012、ADR-013、ADR-018 diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index dbcd5cc..be7f7ed 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -7,17 +7,18 @@ - ADR-018:四 AI Agent 自動化控制面立案。 - Memory:新增 `docs/memory/ai_automation_closure_20260429.md`。 - Guide/Skills 替代:新增 `docs/guides/ai_automation_session_sop.md`。 - - SOT:更新 `docs/AI_INTELLIGENCE_MODULE_SOT.md` 至 V10.9 AI Automation Smoke Summary 架構。 + - SOT:更新 `docs/AI_INTELLIGENCE_MODULE_SOT.md` 至 V10.10 AI Automation Grafana Observability 架構。 - Codex 規則:更新 `AGENTS.md`、`CONSTITUTION.md`、ADR/memory 索引。 - Prometheus 指標化:新增 EventRouter / AutoHeal / safe action / replay in-process metrics,並接入 `/metrics`。 - 線上 smoke dashboard:新增 `/ai_automation_smoke` 與 `/api/ai-automation/smoke`,覆蓋 EventRouter、AutoHeal、NemoTron fallback、OpenClaw embedding queue、ElephantAlpha HITL。 - Smoke 趨勢保存:`/api/ai-automation/smoke` 每次快檢追加 JSONL 精簡紀錄,dashboard 顯示最近趨勢。 - Smoke 趨勢管理:新增 JSONL 匯出、清理與每日摘要。 - Smoke 每日摘要:新增 Telegram 手動推播 API 與 momo-scheduler 每日 09:10 排程入口。 + - Grafana 視覺化:新增 `MOMO AI Automation Overview` provisioning dashboard,覆蓋 EventRouter、safe action、replay、AutoHeal 指標。 【下次待辦】 - - Superset / Grafana 視覺化:`momo_ai_event_router_dispatch_total`、`momo_ai_event_router_latency_ms_*`、`momo_ai_autoheal_action_total`。 - - Grafana/Superset panel 設定與 Smoke 摘要成效觀察。 + - 將 Grafana provisioning dashboard 部署到 110/188 監控環境後,觀察 panel 是否都有資料。 + - Superset panel 設定與 Smoke 摘要成效觀察。 ================================================================================ 品牌資產最終處理與維護 (Phase 7) [DONE] diff --git a/app.py b/app.py index f65340d..e832b9e 100644 --- a/app.py +++ b/app.py @@ -95,9 +95,8 @@ except Exception as e: sys_log.error(f"無法檢測磁碟空間: {e}") # 🚩 系統版本定義 (備份與顯示用) -# 🚩 2026-04-29 V10.9: AI Smoke 每日摘要 — Telegram 手動推播 / -# scheduler 09:10 排程入口 -SYSTEM_VERSION = "V10.9" +# 🚩 2026-04-29 V10.10: AI 自動化 Grafana dashboard provisioning +SYSTEM_VERSION = "V10.10" # ========================================== # 🔒 SQL Injection 防護函數 diff --git a/config.py b/config.py index 5fbf508..33d9ee5 100644 --- a/config.py +++ b/config.py @@ -253,7 +253,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.9" +SYSTEM_VERSION = "V10.10" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docker/grafana/provisioning/dashboards/json/ai-automation-overview.json b/docker/grafana/provisioning/dashboards/json/ai-automation-overview.json new file mode 100644 index 0000000..d143cc1 --- /dev/null +++ b/docker/grafana/provisioning/dashboards/json/ai-automation-overview.json @@ -0,0 +1,779 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "AI Automation Smoke Dashboard", + "tooltip": "App read-only smoke dashboard", + "type": "link", + "url": "/ai_automation_smoke" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "AI 自動化總覽", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "最近 1 小時 EventRouter 接收並處理的事件量。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "sum(increase(momo_ai_event_router_dispatch_total[1h]))", + "legendFormat": "dispatch / 1h", + "refId": "A" + } + ], + "title": "EventRouter 1h 事件量", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "最近 24 小時 L2 SAFE_ACTIONS 執行量,依 action/status 分組。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 2, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "targets": [ + { + "expr": "sum by (action, status) (increase(momo_ai_event_router_safe_action_total[24h]))", + "legendFormat": "{{action}} / {{status}}", + "refId": "A" + } + ], + "title": "L2 Safe Action 24h", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "最近 24 小時 Telegram queue replay 狀態。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "targets": [ + { + "expr": "sum by (status) (increase(momo_ai_event_router_replay_total[24h]))", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Telegram Replay 24h", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "最近 24 小時 AutoHeal 動作結果,依 action/error_type/result 分組。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 4, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "targets": [ + { + "expr": "sum by (action, error_type, result) (increase(momo_ai_autoheal_action_total[24h]))", + "legendFormat": "{{action}} / {{error_type}} / {{result}}", + "refId": "A" + } + ], + "title": "AutoHeal Action 24h", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 101, + "panels": [], + "title": "EventRouter 路由與延遲", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "EventRouter 每 5 分鐘新增 dispatch 數,依 outcome 分組。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "sum" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum by (outcome) (increase(momo_ai_event_router_dispatch_total[5m]))", + "legendFormat": "{{outcome}}", + "refId": "A" + } + ], + "title": "EventRouter Dispatch by Outcome", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "EventRouter 平均延遲與最大延遲,依 tier/event_type 分組。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3000 + }, + { + "color": "red", + "value": 10000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum by (tier, event_type) (increase(momo_ai_event_router_latency_ms_sum[5m])) / clamp_min(sum by (tier, event_type) (increase(momo_ai_event_router_latency_ms_count[5m])), 1)", + "legendFormat": "avg {{tier}} / {{event_type}}", + "refId": "A" + }, + { + "expr": "max by (tier, event_type) (momo_ai_event_router_latency_ms_max)", + "legendFormat": "max {{tier}} / {{event_type}}", + "refId": "B" + } + ], + "title": "EventRouter Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "最近 1 小時 dispatch 明細表,方便追 L1/L2/L3 與事件類型。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 7, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "targets": [ + { + "expr": "sum by (tier, event_type, outcome) (increase(momo_ai_event_router_dispatch_total[1h]))", + "format": "table", + "instant": true, + "legendFormat": "{{tier}} / {{event_type}} / {{outcome}}", + "refId": "A" + } + ], + "title": "EventRouter 1h 明細", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 102, + "panels": [], + "title": "AutoHeal 自癒觀測", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "AutoHeal 每 5 分鐘平均耗時與最大耗時。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3000 + }, + { + "color": "red", + "value": 10000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum by (action, error_type) (increase(momo_ai_autoheal_duration_ms_sum[5m])) / clamp_min(sum by (action, error_type) (increase(momo_ai_autoheal_duration_ms_count[5m])), 1)", + "legendFormat": "avg {{action}} / {{error_type}}", + "refId": "A" + }, + { + "expr": "max by (action, error_type) (momo_ai_autoheal_duration_ms_max)", + "legendFormat": "max {{action}} / {{error_type}}", + "refId": "B" + } + ], + "title": "AutoHeal Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "AutoHeal 最近 24 小時 action 明細表,確認是否有高風險類型集中。", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 9, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "targets": [ + { + "expr": "sum by (action, error_type, result) (increase(momo_ai_autoheal_action_total[24h]))", + "format": "table", + "instant": true, + "legendFormat": "{{action}} / {{error_type}} / {{result}}", + "refId": "A" + } + ], + "title": "AutoHeal 24h 明細", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 103, + "panels": [], + "title": "操作說明", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 10, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "資料來源:momo-app /metrics。此 dashboard 只讀 Prometheus,不觸發 AutoHeal、不清理 queue、不碰 momo-db。Smoke 狀態仍以 /ai_automation_smoke 與 /api/ai-automation/smoke 為準;Grafana 用來觀察 dispatch、latency、safe action、replay、AutoHeal 趨勢。", + "mode": "markdown" + }, + "pluginVersion": "10.0.0", + "title": "AI 自動化觀測邊界", + "type": "text" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": [ + "momo", + "ai-automation", + "event-router", + "autoheal" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "Asia/Taipei", + "title": "MOMO AI Automation Overview", + "uid": "momo-ai-automation-overview", + "version": 1, + "weekStart": "" +} diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 0efc042..4078c45 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -71,9 +71,7 @@ scrape_configs: # =========================================================================== # --------------------------------------------------------------------------- - # Momo Flask 應用 - 資料庫與應用指標 - # --------------------------------------------------------------------------- - # Momo Flask 應用 - 健康檢查 (應用未提供 /metrics,改用 /health) + # Momo Flask 應用 - 資料庫、應用與 AI 自動化指標 # --------------------------------------------------------------------------- - job_name: 'momo-app' static_configs: diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 4c89d96..3d40ca4 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -1,8 +1,8 @@ # MOMO PRO — AI 競價情報模組 Single Source of Truth > **最後更新**: 2026-04-29 (台北時間) -> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary 具測試覆蓋 -> **適用版本**: V10.9 AI Automation Smoke Summary 架構 +> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning 具測試覆蓋 +> **適用版本**: V10.10 AI Automation Grafana Observability 架構 --- @@ -67,6 +67,7 @@ SQL漏斗(~300筆) - Smoke API 會將最近快檢結果保存到 JSONL,dashboard 顯示最近狀態趨勢。 - Smoke history 支援 JSONL 匯出、清理與每日 OK / Warning / Critical 摘要。 - Smoke 每日摘要支援手動 Telegram 推播,並由 `momo-scheduler` 每日 09:10 呼叫 `run_ai_smoke_daily_summary_task()`。 +- Grafana provisioning 新增 `docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`,觀測 EventRouter dispatch/latency、safe action、Telegram replay 與 AutoHeal action/duration。 --- diff --git a/docs/adr/ADR-012-agent-action-ladder.md b/docs/adr/ADR-012-agent-action-ladder.md index a0a735d..e983d0b 100644 --- a/docs/adr/ADR-012-agent-action-ladder.md +++ b/docs/adr/ADR-012-agent-action-ladder.md @@ -148,7 +148,8 @@ L1 Hermes 掛 → L0 模板直出 + 🟡 「AI 分析暫不可用」 - 2026-04-29 已補 smoke 結果 JSONL 保存與 dashboard 趨勢視覺化。 - 2026-04-29 已補 smoke history JSONL 匯出、清理與每日摘要。 - 2026-04-29 已補 smoke 每日摘要 Telegram 手動推播與 momo-scheduler 09:10 排程入口。 -- 尚未完成:Grafana/Superset 視覺化面板與推播成效觀察。 +- 2026-04-29 已補 Grafana provisioning dashboard:`MOMO AI Automation Overview` 覆蓋 EventRouter、L2 safe action、Telegram replay 與 AutoHeal Prometheus 指標。 +- 尚未完成:Superset 視覺化面板、Grafana 線上部署後資料觀察與 Smoke 摘要推播成效觀察。 ## References - `services/event_router.py` — 分流入口(Phase 1) diff --git a/docs/guides/ai_automation_session_sop.md b/docs/guides/ai_automation_session_sop.md index c19e333..51c8630 100644 --- a/docs/guides/ai_automation_session_sop.md +++ b/docs/guides/ai_automation_session_sop.md @@ -26,6 +26,7 @@ - Telegram 失敗必須可暫存與 replay。 - EventRouter / AutoHeal 變更必須更新 `services/ai_automation_metrics.py` 指標或確認既有指標已覆蓋。 - AI 自動化閉環變更必須確認 `/api/ai-automation/smoke` 與 `/ai_automation_smoke` 仍能反映新狀態。 +- AI 自動化 Prometheus 指標變更必須同步檢查 `docker/grafana/provisioning/dashboards/json/ai-automation-overview.json` 是否需要新增 panel 或查詢。 - Smoke dashboard 會保存 JSONL 趨勢;若新增檢查項目,要確保 history compact record 仍保持小而可讀。 - Smoke history 管理只能操作 `MOMO_AI_AUTOMATION_SMOKE_HISTORY` 指向的 JSONL,不得清理 DB 或 EventRouter queue。 - Smoke 每日摘要推播只讀 history,不得重新執行 smoke,也不得把完整 details 寫進 Telegram。 @@ -41,6 +42,7 @@ - 若有架構決策,新增 ADR 並更新 `docs/adr/README.md`。 - 若有長期實況,更新 `docs/memory/*.md` 與 `docs/memory/README.md`。 - 若 AI 架構事實改變,更新 `docs/AI_INTELLIGENCE_MODULE_SOT.md`。 +- 若 AI 自動化可觀測性改變,更新 Grafana provisioning JSON 與對應測試。 - 若 Codex 工作規則改變,更新 `AGENTS.md`;若紅線改變,更新 `CONSTITUTION.md`。 - 提交前跑 `git diff --check` 與相關 pytest。 - 使用者要求推版時,commit 後 push 到遠端。 diff --git a/docs/memory/README.md b/docs/memory/README.md index f9ef2ca..8d3bcd4 100644 --- a/docs/memory/README.md +++ b/docs/memory/README.md @@ -13,7 +13,7 @@ | 檔案 | 用途 | 何時閱讀 | |---|---|---| | `history_logs.md` | 重大里程碑與歷史脈絡 | 要理解演進背景、排查「為何變成這樣」時 | -| `ai_automation_closure_20260429.md` | 四 AI Agent 自動化閉環與 2026-04-29 修復實況 | 接續 AI 自動化、EventRouter、AutoHeal、OpenClaw memory、ElephantAlpha 編排時 | +| `ai_automation_closure_20260429.md` | 四 AI Agent 自動化閉環、Smoke、metrics 與 Grafana 觀測實況 | 接續 AI 自動化、EventRouter、AutoHeal、OpenClaw memory、ElephantAlpha 編排、可觀測性時 | | `credentials_passbook.md` | 伺服器、帳密、埠位對照 | 需要維運、部署、憑證核對時 | | `feedback_db_metadata_import.md` | SQLAlchemy metadata / `create_all()` 漏表鐵律 | 新增 model、修 schema、排查 fresh env 漏表時 | | `project_phase3f_cleanup_roadmap.md` | ADR-017 執行矩陣與階段紅線 | 正在做 3f 模組化收尾時 | diff --git a/docs/memory/ai_automation_closure_20260429.md b/docs/memory/ai_automation_closure_20260429.md index 2bc82f7..506173c 100644 --- a/docs/memory/ai_automation_closure_20260429.md +++ b/docs/memory/ai_automation_closure_20260429.md @@ -14,6 +14,7 @@ - Smoke API 會保存最近快檢 JSONL 趨勢,dashboard 顯示 OK / Warning / Critical 最近分布。 - Smoke history 已支援 JSONL 匯出、清理與每日摘要;清理只影響 smoke history,不碰 DB 或 EventRouter queue。 - Smoke 每日摘要已支援手動 Telegram 推播與 scheduler 09:10 排程入口;摘要只讀 JSONL history。 +- Grafana provisioning 已新增 `MOMO AI Automation Overview`,由 Prometheus `/metrics` 觀測 EventRouter、safe action、replay 與 AutoHeal 趨勢。 ## 已落地範圍 @@ -30,6 +31,7 @@ - Smoke history 只保存精簡紀錄,不保存完整 details,避免長期檔案膨脹與敏感資訊堆積。 - Export API 回傳 `application/x-ndjson`,clear API 只刪除 `MOMO_AI_AUTOMATION_SMOKE_HISTORY` 指向檔案。 - Daily summary API:`POST /api/ai-automation/smoke/daily-summary/send`。 +- Grafana dashboard 檔案:`docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`;provider 會載入 JSON 目錄,不需要修改 dashboard provider。 ## 驗證紀錄 @@ -38,6 +40,8 @@ - 2026-04-29 AI smoke trend 批次:`5 passed`(smoke + metrics)。 - 2026-04-29 AI smoke management 批次:`7 passed`(smoke + metrics)。 - 2026-04-29 AI smoke summary 批次:`9 passed`(smoke + metrics)。 +- 2026-04-29 AI Grafana observability 批次:`3 passed`(Grafana dashboard JSON 結構與必要 metric)。 +- 2026-04-29 AI Grafana observability + AI core 回歸:`36 passed`,collect-only:`36 tests collected`。 - 2026-04-29 L2 安全記憶批次:`24 passed`。 - collect-only:`48 tests collected`。 - `git diff --check` 已通過。 @@ -54,6 +58,11 @@ - `5b25f55` 補齊 EventRouter 失敗通知回放 - `162a76b` 落地 L2 安全記憶動作 - `d58e4d0` 同步四 Agent AI 自動化治理紀錄 +- `e6a1c9d` 補齊 AI 自動化可觀測性指標 +- `cde8b0c` 新增 AI 自動化 Smoke Dashboard +- `81159b5` 保存 AI Smoke 趨勢紀錄 +- `10bbd55` 補齊 AI Smoke 趨勢管理 +- `d5f4fd7` 加入 AI Smoke 每日摘要推播 ## 下次進場先看 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 359f4ad..1d3d50f 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -29,6 +29,7 @@ - **Smoke 趨勢保存**: Smoke API 追加 JSONL 精簡紀錄,dashboard 顯示最近 OK / Warning / Critical 趨勢。 - **Smoke 趨勢管理**: Dashboard 增加 JSONL 匯出、清理與每日摘要,清理範圍限定 smoke history 檔。 - **Smoke 每日摘要推播**: 新增 Telegram 手動推播 API 與 momo-scheduler 每日 09:10 摘要任務,只讀 smoke history。 +- **Grafana AI 觀測**: 新增 `MOMO AI Automation Overview` provisioning dashboard,覆蓋 EventRouter、safe action、replay、AutoHeal Prometheus 指標。 ### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除 - **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。 diff --git a/tests/test_grafana_ai_automation_dashboard.py b/tests/test_grafana_ai_automation_dashboard.py new file mode 100644 index 0000000..9ae076a --- /dev/null +++ b/tests/test_grafana_ai_automation_dashboard.py @@ -0,0 +1,59 @@ +import json +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_PATH = ROOT / "docker/grafana/provisioning/dashboards/json/ai-automation-overview.json" + + +def _panel_targets(dashboard: dict): + for panel in dashboard.get("panels", []): + yield from panel.get("targets", []) + + +def test_ai_automation_grafana_dashboard_is_valid_json(): + dashboard = json.loads(DASHBOARD_PATH.read_text(encoding="utf-8")) + + assert dashboard["uid"] == "momo-ai-automation-overview" + assert dashboard["title"] == "MOMO AI Automation Overview" + assert "ai-automation" in dashboard["tags"] + assert dashboard["timezone"] == "Asia/Taipei" + + +def test_ai_automation_grafana_dashboard_tracks_required_metrics(): + dashboard = json.loads(DASHBOARD_PATH.read_text(encoding="utf-8")) + expressions = "\n".join( + target["expr"] + for target in _panel_targets(dashboard) + if target.get("expr") + ) + + for metric in [ + "momo_ai_event_router_dispatch_total", + "momo_ai_event_router_latency_ms_count", + "momo_ai_event_router_latency_ms_sum", + "momo_ai_event_router_latency_ms_max", + "momo_ai_event_router_safe_action_total", + "momo_ai_event_router_replay_total", + "momo_ai_autoheal_action_total", + "momo_ai_autoheal_duration_ms_count", + "momo_ai_autoheal_duration_ms_sum", + "momo_ai_autoheal_duration_ms_max", + ]: + assert metric in expressions + + +def test_ai_automation_grafana_dashboard_uses_prometheus_datasource(): + dashboard = json.loads(DASHBOARD_PATH.read_text(encoding="utf-8")) + + metric_panels = [ + panel + for panel in dashboard["panels"] + if panel.get("targets") + ] + + assert metric_panels + assert all( + panel.get("datasource", {}).get("uid") == "prometheus" + for panel in metric_panels + )