diff --git a/CONSTITUTION.md b/CONSTITUTION.md index 4c1b09a..5cd65a7 100644 --- a/CONSTITUTION.md +++ b/CONSTITUTION.md @@ -2,7 +2,7 @@ > 本文件定義專案開發的核心準則與不可違反的規範 > **建立日期**: 2026-01-12 -> **當前版本**: V10.18 (Scheduler 例外記錄強化版) +> **當前版本**: V10.19 (AI metrics baseline 觀測版) > **最後更新**: 2026-04-30 --- diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index 721cd45..1060b6f 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -27,9 +27,10 @@ - DatabaseManager 連線池收斂:PostgreSQL 每 worker pool 調整為 `pool_size=2/max_overflow=3`,避免多 route 重複 new manager 時吃滿連線。 - Ollama embedding 強化:改為優先 `/api/embed`,舊節點才 fallback `/api/embeddings`,並新增 `EMBEDDING_TIMEOUT`。 - Scheduler 例外記錄強化:清除 `scheduler.py` 靜默 `except/pass`,資源清理、EDM 可選欄位、備份 insight/通知失敗全改為可診斷 log。 + - AI metrics baseline 觀測:`/metrics` 在尚無 AI 自動化事件時仍輸出 `momo_ai_*` zero-baseline series,避免重啟後 Grafana/Prometheus 看不到 metric names。 【下次待辦】 - - 觀察 Prometheus scrape 後 `momo_ai_*` 是否在事件發生後產生時間序列。 + - 觀察 Prometheus scrape 後 `momo_ai_*` baseline 與非 baseline 事件序列是否持續穩定。 - Superset panel 設定與 Smoke 摘要成效觀察。 ================================================================================ diff --git a/app.py b/app.py index 1c170a0..13fe8c1 100644 --- a/app.py +++ b/app.py @@ -95,8 +95,8 @@ except Exception as e: sys_log.error(f"無法檢測磁碟空間: {e}") # 🚩 系統版本定義 (備份與顯示用) -# 🚩 2026-04-30 V10.18: Scheduler exception logging hardening -SYSTEM_VERSION = "V10.18" +# 🚩 2026-04-30 V10.19: AI metrics zero-baseline export +SYSTEM_VERSION = "V10.19" # ========================================== # 🔒 SQL Injection 防護函數 diff --git a/config.py b/config.py index f856d5b..09053e8 100644 --- a/config.py +++ b/config.py @@ -254,7 +254,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.18" +SYSTEM_VERSION = "V10.19" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 8cb5ff0..7393405 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -2,7 +2,7 @@ > **最後更新**: 2026-04-30 (台北時間) > **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning / Prometheus scrape / CD Gunicorn 掛載具測試覆蓋 -> **適用版本**: V10.18 Scheduler 例外記錄強化版 +> **適用版本**: V10.19 AI metrics baseline 觀測版 --- @@ -62,6 +62,7 @@ SQL漏斗(~300筆) - `/metrics` 匯出 `momo_ai_event_router_safe_action_total`。 - `/metrics` 匯出 `momo_ai_event_router_replay_total`。 - `/metrics` 匯出 `momo_ai_autoheal_action_total` 與 `momo_ai_autoheal_duration_ms_count/sum/max`。 +- `/metrics` 在尚無事件時仍輸出 `momo_ai_*` zero-baseline series,讓 Prometheus/Grafana 重啟後可立即看到 metric names。 - `/ai_automation_smoke` 提供登入後 smoke dashboard。 - `/api/ai-automation/smoke` 提供 read-only JSON 狀態,不做外部網路呼叫。 - Smoke API 會將最近快檢結果保存到 JSONL,dashboard 顯示最近狀態趨勢。 diff --git a/docs/memory/ai_automation_closure_20260429.md b/docs/memory/ai_automation_closure_20260429.md index 3fb3283..0af04f0 100644 --- a/docs/memory/ai_automation_closure_20260429.md +++ b/docs/memory/ai_automation_closure_20260429.md @@ -27,6 +27,7 @@ - 2026-04-30 `DatabaseManager()` 多 route 重複建立曾有吃滿 PostgreSQL clients 風險;已重用 engine/session 並將每 worker pool 收斂為 `pool_size=2/max_overflow=3`。 - 2026-04-30 OpenClaw embedding worker 曾在舊 `/api/embeddings` 路徑遇到 Hermes timeout;Ollama client 已改為優先 `/api/embed`,舊節點才 fallback `/api/embeddings`。 - 2026-04-30 `scheduler.py` 殘留靜默 `except/pass`;已改為 warning/debug log,備份 insight 與 Telegram 通知失敗不再靜默。 +- 2026-04-30 `/metrics` 已補 `momo_ai_*` zero-baseline series;app 重啟後即使尚無 EventRouter / AutoHeal 事件,Prometheus/Grafana 也能先看到 metric names。 ## 已落地範圍 @@ -66,6 +67,7 @@ - 2026-04-30 DatabaseManager pool convergence:`tests/test_database_manager_cache.py` 覆蓋 pool size/overflow 與 engine reuse。 - 2026-04-30 Ollama embedding API migration:新增 `tests/test_ollama_embedding.py`。 - 2026-04-30 Phase 3f cleanup contracts:`tests/test_phase3f_cleanup_contracts.py` 覆蓋 orphan services、env 範例、scheduler 靜默例外。 +- 2026-04-30 AI metrics baseline:`tests/test_ai_automation_metrics.py` 覆蓋無事件 snapshot 仍匯出 `momo_ai_*` baseline。 - 2026-04-29 L2 安全記憶批次:`24 passed`。 - collect-only:`48 tests collected`。 - `git diff --check` 已通過。 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 1ccd9f4..f964bed 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -40,6 +40,7 @@ - **DatabaseManager 連線池收斂**: PostgreSQL 每 worker pool 收斂為 `pool_size=2/max_overflow=3`,並以 cache 重用 engine/session。 - **Ollama embedding API 遷移**: embedding client 優先使用官方 `/api/embed`,舊節點才 fallback `/api/embeddings`,降低 deprecated endpoint 與 timeout 風險。 - **Scheduler 例外記錄強化**: 清除 `scheduler.py` 靜默 `except/pass`,Chrome 清理、EDM optional 欄位、備份 insight/Telegram 失敗均保留 log。 +- **AI metrics baseline 觀測**: `/metrics` 在尚無 AI 自動化事件時仍輸出 `momo_ai_*` zero-baseline series,避免 app 重啟後 Grafana/Prometheus 看不到 metric names。 ### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除 - **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。 diff --git a/routes/system_public_routes.py b/routes/system_public_routes.py index 299a80b..1e8592c 100644 --- a/routes/system_public_routes.py +++ b/routes/system_public_routes.py @@ -152,7 +152,28 @@ def _register_ai_automation_metrics(registry, gauge_cls, metrics_snapshot): ), } - for (metric, labels), value in metrics_snapshot.get("counters", {}).items(): + counter_samples = { + ("event_router_dispatch_total", ( + ("event_type", "baseline"), + ("outcome", "none"), + ("tier", "baseline"), + )): 0, + ("event_router_safe_action_total", ( + ("action", "baseline"), + ("status", "none"), + )): 0, + ("event_router_replay_total", ( + ("status", "none"), + )): 0, + ("autoheal_action_total", ( + ("action", "baseline"), + ("error_type", "none"), + ("result", "none"), + )): 0, + } + counter_samples.update(metrics_snapshot.get("counters", {})) + + for (metric, labels), value in counter_samples.items(): if metric not in definitions: continue name, help_text, label_names = definitions[metric] @@ -173,7 +194,19 @@ def _register_ai_automation_metrics(registry, gauge_cls, metrics_snapshot): ), } - for (metric, labels), values in metrics_snapshot.get("latency", {}).items(): + latency_samples = { + ("event_router_latency_ms", ( + ("event_type", "baseline"), + ("tier", "baseline"), + )): {"count": 0, "sum": 0, "max": 0}, + ("autoheal_duration_ms", ( + ("action", "baseline"), + ("error_type", "none"), + )): {"count": 0, "sum": 0, "max": 0}, + } + latency_samples.update(metrics_snapshot.get("latency", {})) + + for (metric, labels), values in latency_samples.items(): if metric not in latency_defs: continue name, help_text, label_names = latency_defs[metric] diff --git a/tests/test_ai_automation_metrics.py b/tests/test_ai_automation_metrics.py index 477072c..1e832c1 100644 --- a/tests/test_ai_automation_metrics.py +++ b/tests/test_ai_automation_metrics.py @@ -63,6 +63,40 @@ def test_system_metrics_exports_ai_automation_metrics(): assert "momo_ai_event_router_latency_ms_count" in output +def test_system_metrics_exports_ai_automation_zero_baseline(): + from prometheus_client import CollectorRegistry, Gauge, generate_latest + from routes.system_public_routes import _register_ai_automation_metrics + from services import ai_automation_metrics as metrics + + metrics.reset_for_tests() + registry = CollectorRegistry() + + _register_ai_automation_metrics(registry, Gauge, metrics.snapshot()) + + output = generate_latest(registry).decode("utf-8") + assert ( + 'momo_ai_event_router_dispatch_total{event_type="baseline",outcome="none",tier="baseline"} 0.0' + in output + ) + assert ( + 'momo_ai_event_router_safe_action_total{action="baseline",status="none"} 0.0' + in output + ) + assert 'momo_ai_event_router_replay_total{status="none"} 0.0' in output + assert ( + 'momo_ai_autoheal_action_total{action="baseline",error_type="none",result="none"} 0.0' + in output + ) + assert ( + 'momo_ai_event_router_latency_ms_count{event_type="baseline",tier="baseline"} 0.0' + in output + ) + assert ( + 'momo_ai_autoheal_duration_ms_count{action="baseline",error_type="none"} 0.0' + in output + ) + + def test_system_metrics_counts_sales_records_with_raw_count_query(): from prometheus_client import CollectorRegistry, Gauge, generate_latest from routes.system_public_routes import _set_database_record_counts