fix(metrics): 輸出 AI 自動化 baseline 指標

2026-04-30 10:32:43 +08:00
parent e73cd6e6a3
commit d8f93df4ec
9 changed files with 80 additions and 8 deletions
--- a/CONSTITUTION.md
+++ b/CONSTITUTION.md
@@ -2,7 +2,7 @@

 > 本文件定義專案開發的核心準則與不可違反的規範
 > **建立日期**: 2026-01-12
-> **當前版本**: V10.18 (Scheduler 例外記錄強化版)
+> **當前版本**: V10.19 (AI metrics baseline 觀測版)
 > **最後更新**: 2026-04-30

 ---
--- a/TODO_NEXT_STEPS.txt
+++ b/TODO_NEXT_STEPS.txt
@@ -27,9 +27,10 @@
   - DatabaseManager 連線池收斂：PostgreSQL 每 worker pool 調整為 `pool_size=2/max_overflow=3`，避免多 route 重複 new manager 時吃滿連線。
   - Ollama embedding 強化：改為優先 `/api/embed`，舊節點才 fallback `/api/embeddings`，並新增 `EMBEDDING_TIMEOUT`。
   - Scheduler 例外記錄強化：清除 `scheduler.py` 靜默 `except/pass`，資源清理、EDM 可選欄位、備份 insight/通知失敗全改為可診斷 log。
+   - AI metrics baseline 觀測：`/metrics` 在尚無 AI 自動化事件時仍輸出 `momo_ai_*` zero-baseline series，避免 app 重啟後 Grafana/Prometheus 看不到 metric names。

 【下次待辦】
-   - 觀察 Prometheus scrape 後 `momo_ai_*` 是否在事件發生後產生時間序列。
+   - 觀察 Prometheus scrape 後 `momo_ai_*` baseline 與非 baseline 事件序列是否持續穩定。
   - Superset panel 設定與 Smoke 摘要成效觀察。

 ================================================================================
--- a/app.py
+++ b/app.py
@@ -95,8 +95,8 @@ except Exception as e:
    sys_log.error(f"無法檢測磁碟空間: {e}")

 # 🚩 系統版本定義 (備份與顯示用)
-# 🚩 2026-04-30 V10.18: Scheduler exception logging hardening
-SYSTEM_VERSION = "V10.18"
+# 🚩 2026-04-30 V10.19: AI metrics zero-baseline export
+SYSTEM_VERSION = "V10.19"

 # ==========================================
 # 🔒 SQL Injection 防護函數
--- a/config.py
+++ b/config.py
@@ -254,7 +254,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
 # ==========================================
 # 系統版本與路徑
 # ==========================================
-SYSTEM_VERSION = "V10.18"
+SYSTEM_VERSION = "V10.19"
 LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
 public_url = PUBLIC_URL  # 用於模板顯示

--- a/docs/AI_INTELLIGENCE_MODULE_SOT.md
+++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md
@@ -2,7 +2,7 @@

 > **最後更新**: 2026-04-30 (台北時間)
 > **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning / Prometheus scrape / CD Gunicorn 掛載具測試覆蓋
-> **適用版本**: V10.18 Scheduler 例外記錄強化版
+> **適用版本**: V10.19 AI metrics baseline 觀測版

 ---

@@ -62,6 +62,7 @@ SQL漏斗(~300筆)
 - `/metrics` 匯出 `momo_ai_event_router_safe_action_total`。
 - `/metrics` 匯出 `momo_ai_event_router_replay_total`。
 - `/metrics` 匯出 `momo_ai_autoheal_action_total` 與 `momo_ai_autoheal_duration_ms_count/sum/max`。
+- `/metrics` 在尚無事件時仍輸出 `momo_ai_*` zero-baseline series，讓 app 重啟後 Prometheus/Grafana 仍可立即看到 metric names。
 - `/ai_automation_smoke` 提供登入後 smoke dashboard。
 - `/api/ai-automation/smoke` 提供 read-only JSON 狀態，不做外部網路呼叫。
 - Smoke API 會將最近快檢結果保存到 JSONL，dashboard 顯示最近狀態趨勢。
--- a/docs/memory/ai_automation_closure_20260429.md
+++ b/docs/memory/ai_automation_closure_20260429.md
@@ -27,6 +27,7 @@
 - 2026-04-30 `DatabaseManager()` 多 route 重複建立曾有吃滿 PostgreSQL clients 風險；已重用 engine/session 並將每 worker pool 收斂為 `pool_size=2/max_overflow=3`。
 - 2026-04-30 OpenClaw embedding worker 曾在舊 `/api/embeddings` 路徑遇到 Hermes timeout；Ollama client 已改為優先 `/api/embed`，舊節點才 fallback `/api/embeddings`。
 - 2026-04-30 `scheduler.py` 殘留靜默 `except/pass`；已改為 warning/debug log，備份 insight 與 Telegram 通知失敗不再靜默。
+- 2026-04-30 `/metrics` 已補 `momo_ai_*` zero-baseline series；app 重啟後即使尚無 EventRouter / AutoHeal 事件，Prometheus/Grafana 也能先看到 metric names。

 ## 已落地範圍

@@ -66,6 +67,7 @@
 - 2026-04-30 DatabaseManager pool convergence：`tests/test_database_manager_cache.py` 覆蓋 pool size/overflow 與 engine reuse。
 - 2026-04-30 Ollama embedding API migration：新增 `tests/test_ollama_embedding.py`。
 - 2026-04-30 Phase 3f cleanup contracts：`tests/test_phase3f_cleanup_contracts.py` 覆蓋 orphan services、env 範例、scheduler 靜默例外。
+- 2026-04-30 AI metrics baseline：`tests/test_ai_automation_metrics.py` 覆蓋無事件 snapshot 仍匯出 `momo_ai_*` baseline。
 - 2026-04-29 L2 安全記憶批次：`24 passed`。
 - collect-only：`48 tests collected`。
 - `git diff --check` 已通過。
--- a/docs/memory/history_logs.md
+++ b/docs/memory/history_logs.md
@@ -40,6 +40,7 @@
 - **DatabaseManager 連線池收斂**: PostgreSQL 每 worker pool 收斂為 `pool_size=2/max_overflow=3`，並以 cache 重用 engine/session。
 - **Ollama embedding API 遷移**: embedding client 優先使用官方 `/api/embed`，舊節點才 fallback `/api/embeddings`，降低 deprecated endpoint 與 timeout 風險。
 - **Scheduler 例外記錄強化**: 清除 `scheduler.py` 靜默 `except/pass`，Chrome 清理、EDM optional 欄位、備份 insight/Telegram 失敗均保留 log。
+- **AI metrics baseline 觀測**: `/metrics` 在尚無 AI 自動化事件時仍輸出 `momo_ai_*` zero-baseline series，避免 app 重啟後 Grafana/Prometheus 看不到 metric names。

 ### 2026-04-28~29：Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除
 - **app.py 縮減 -10.8%**: 7,386 → 6,590 行，11 commits 全綠零 502。
--- a/routes/system_public_routes.py
+++ b/routes/system_public_routes.py
@@ -152,7 +152,28 @@ def _register_ai_automation_metrics(registry, gauge_cls, metrics_snapshot):
        ),
    }

-    for (metric, labels), value in metrics_snapshot.get("counters", {}).items():
+    counter_samples = {
+        ("event_router_dispatch_total", (
+            ("event_type", "baseline"),
+            ("outcome", "none"),
+            ("tier", "baseline"),
+        )): 0,
+        ("event_router_safe_action_total", (
+            ("action", "baseline"),
+            ("status", "none"),
+        )): 0,
+        ("event_router_replay_total", (
+            ("status", "none"),
+        )): 0,
+        ("autoheal_action_total", (
+            ("action", "baseline"),
+            ("error_type", "none"),
+            ("result", "none"),
+        )): 0,
+    }
+    counter_samples.update(metrics_snapshot.get("counters", {}))
+
+    for (metric, labels), value in counter_samples.items():
        if metric not in definitions:
            continue
        name, help_text, label_names = definitions[metric]
@@ -173,7 +194,19 @@ def _register_ai_automation_metrics(registry, gauge_cls, metrics_snapshot):
        ),
    }

-    for (metric, labels), values in metrics_snapshot.get("latency", {}).items():
+    latency_samples = {
+        ("event_router_latency_ms", (
+            ("event_type", "baseline"),
+            ("tier", "baseline"),
+        )): {"count": 0, "sum": 0, "max": 0},
+        ("autoheal_duration_ms", (
+            ("action", "baseline"),
+            ("error_type", "none"),
+        )): {"count": 0, "sum": 0, "max": 0},
+    }
+    latency_samples.update(metrics_snapshot.get("latency", {}))
+
+    for (metric, labels), values in latency_samples.items():
        if metric not in latency_defs:
            continue
        name, help_text, label_names = latency_defs[metric]
--- a/tests/test_ai_automation_metrics.py
+++ b/tests/test_ai_automation_metrics.py
@@ -63,6 +63,40 @@ def test_system_metrics_exports_ai_automation_metrics():
    assert "momo_ai_event_router_latency_ms_count" in output


+def test_system_metrics_exports_ai_automation_zero_baseline():
+    from prometheus_client import CollectorRegistry, Gauge, generate_latest
+    from routes.system_public_routes import _register_ai_automation_metrics
+    from services import ai_automation_metrics as metrics
+
+    metrics.reset_for_tests()
+    registry = CollectorRegistry()
+
+    _register_ai_automation_metrics(registry, Gauge, metrics.snapshot())
+
+    output = generate_latest(registry).decode("utf-8")
+    assert (
+        'momo_ai_event_router_dispatch_total{event_type="baseline",outcome="none",tier="baseline"} 0.0'
+        in output
+    )
+    assert (
+        'momo_ai_event_router_safe_action_total{action="baseline",status="none"} 0.0'
+        in output
+    )
+    assert 'momo_ai_event_router_replay_total{status="none"} 0.0' in output
+    assert (
+        'momo_ai_autoheal_action_total{action="baseline",error_type="none",result="none"} 0.0'
+        in output
+    )
+    assert (
+        'momo_ai_event_router_latency_ms_count{event_type="baseline",tier="baseline"} 0.0'
+        in output
+    )
+    assert (
+        'momo_ai_autoheal_duration_ms_count{action="baseline",error_type="none"} 0.0'
+        in output
+    )
+
+
 def test_system_metrics_counts_sales_records_with_raw_count_query():
    from prometheus_client import CollectorRegistry, Gauge, generate_latest
    from routes.system_public_routes import _set_database_record_counts