From 73561efa7a2a6fe71e208366ca1b4aab75c2abc3 Mon Sep 17 00:00:00 2001 From: OoO Date: Thu, 30 Apr 2026 09:13:36 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=B7=E5=8C=96=20CD=20Gunicorn=20=E6=8E=9B?= =?UTF-8?q?=E8=BC=89=E8=88=87=20metrics=20=E9=99=8D=E5=99=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AGENTS.md | 3 +- CONSTITUTION.md | 4 +- TODO_NEXT_STEPS.txt | 2 + app.py | 4 +- config.py | 2 +- docker-compose.yml | 1 + docs/AI_INTELLIGENCE_MODULE_SOT.md | 8 ++-- docs/guides/devops_handbook.md | 9 ++++ docs/memory/README.md | 1 + docs/memory/ai_automation_closure_20260429.md | 5 +++ .../db_connection_pool_singleton_20260430.md | 15 +++++++ docs/memory/history_logs.md | 2 + routes/system_public_routes.py | 31 ++++++++------ tests/test_ai_automation_metrics.py | 42 +++++++++++++++++++ tests/test_docker_compose_runtime_mounts.py | 12 ++++++ 15 files changed, 120 insertions(+), 21 deletions(-) create mode 100644 docs/memory/db_connection_pool_singleton_20260430.md create mode 100644 tests/test_docker_compose_runtime_mounts.py diff --git a/AGENTS.md b/AGENTS.md index 26663e2..2b3b0e5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,6 +1,6 @@ # EwoooC (MOMO Pro System) — Codex 專案工作規則 -> 版本: V13.1 +> 版本: V13.2 > 目標: 把專案知識整理成 Codex 可低成本讀取、可持續維護、可安全落地的單一工作入口。 ## 1. 入口原則 @@ -124,6 +124,7 @@ - 禁止影響 `momo-db` 的資料與容器生命週期。 - 跨專案資源邊界以 ADR-011 為準。 - 部署、容器、SSH 類操作先看 `docs/adr/ADR-011-cross-project-resource-isolation.md`。 +- `gunicorn.conf.py` 必須透過 `docker-compose.yml` bind mount 進 `momo-app`;除救急外,不以 `docker cp` 當常態部署方式。 ## 8. 常用入口 diff --git a/CONSTITUTION.md b/CONSTITUTION.md index c17a740..8bc52bb 100644 --- a/CONSTITUTION.md +++ b/CONSTITUTION.md @@ -2,8 +2,8 @@ > 本文件定義專案開發的核心準則與不可違反的規範 > **建立日期**: 2026-01-12 -> **當前版本**: V10.12 (CD 健康檢查強化版) -> **最後更新**: 2026-04-29 +> **當前版本**: V10.13 (CD Gunicorn 掛載強化版) +> **最後更新**: 2026-04-30 --- diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index 0e3c514..ac6d77b 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -20,6 +20,8 @@ - Gunicorn preload 修復:`post_fork` 略過 Flask/Werkzeug request-bound LocalProxy,避免 worker boot fail。 - CD 健康檢查強化:改為 internal container health + external `mo.wooo.work` 雙檢查,重試窗延長到約 3 分鐘。 - CD Sync reload 修復:rsync 後明確 `docker compose restart momo-app scheduler telegram-bot`,避免檔案已同步但 process 仍跑舊版。 + - CD Gunicorn 掛載強化:`momo-app` 明確掛載 `./gunicorn.conf.py:/app/gunicorn.conf.py:ro`,避免重啟後吃到 image 內舊版設定。 + - Metrics schema drift 降噪:`realtime_sales_monthly` 總筆數改用 raw `COUNT(*)`,避免 ORM 欄位 drift 造成 Prometheus scrape warning。 【下次待辦】 - 觀察 Prometheus scrape 後 `momo_ai_*` 是否在事件發生後產生時間序列。 diff --git a/app.py b/app.py index 2326cc2..c783a2a 100644 --- a/app.py +++ b/app.py @@ -95,8 +95,8 @@ except Exception as e: sys_log.error(f"無法檢測磁碟空間: {e}") # 🚩 系統版本定義 (備份與顯示用) -# 🚩 2026-04-30 V10.12: CD health check internal/external hardening -SYSTEM_VERSION = "V10.12" +# 🚩 2026-04-30 V10.13: CD Gunicorn bind mount + metrics schema drift hardening +SYSTEM_VERSION = "V10.13" # ========================================== # 🔒 SQL Injection 防護函數 diff --git a/config.py b/config.py index b604c05..ac278a1 100644 --- a/config.py +++ b/config.py @@ -253,7 +253,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.12" +SYSTEM_VERSION = "V10.13" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docker-compose.yml b/docker-compose.yml index c4a94b9..dece50a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -59,6 +59,7 @@ services: - ./config.py:/app/config.py:ro - ./app.py:/app/app.py:ro - ./auth.py:/app/auth.py:ro + - ./gunicorn.conf.py:/app/gunicorn.conf.py:ro - ./scheduler.py:/app/scheduler.py:ro - ./services:/app/services:ro - ./routes:/app/routes:ro diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 4864782..f7e5f3c 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -1,8 +1,8 @@ # MOMO PRO — AI 競價情報模組 Single Source of Truth -> **最後更新**: 2026-04-29 (台北時間) -> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning / Prometheus scrape 具測試覆蓋 -> **適用版本**: V10.11 AI Automation Metrics Scrape 架構 +> **最後更新**: 2026-04-30 (台北時間) +> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning / Prometheus scrape / CD Gunicorn 掛載具測試覆蓋 +> **適用版本**: V10.13 CD Gunicorn 掛載強化版 --- @@ -69,6 +69,8 @@ SQL漏斗(~300筆) - Smoke 每日摘要支援手動 Telegram 推播,並由 `momo-scheduler` 每日 09:10 呼叫 `run_ai_smoke_daily_summary_task()`。 - Grafana provisioning 新增 `docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`,觀測 EventRouter dispatch/latency、safe action、Telegram replay 與 AutoHeal action/duration。 - Active monitoring stack 使用 `monitoring/prometheus.yml` 的 `momo-app` job scrape `momo-pro-system:80/metrics`;Prometheus container 需加入 `momo-network`。 +- `/metrics` 對 `realtime_sales_monthly` 只用 raw `SELECT COUNT(*)` 取得總筆數,避免 ORM schema drift 讓 Prometheus scrape 產生 warning。 +- `momo-app` 必須 bind mount `./gunicorn.conf.py:/app/gunicorn.conf.py:ro`,讓 CD sync/rebuild 後的 Gunicorn runtime 設定與 repo 保持一致。 --- diff --git a/docs/guides/devops_handbook.md b/docs/guides/devops_handbook.md index 195a393..574542f 100644 --- a/docs/guides/devops_handbook.md +++ b/docs/guides/devops_handbook.md @@ -77,3 +77,12 @@ - **原因**: `docker-compose.yml` 遺漏了 `/app/routes` 的 Volume 掛載。 - **檢查**: `docker inspect momo-telegram-bot | jq '.[0].Mounts'`。 - **修復**: 確保 `volumes` 段落包含 `- ./routes:/app/routes:ro`。 + +### 5. Gunicorn 設定更新後仍吃舊版 image 內容 +- **原因**: `gunicorn.conf.py` 若沒有 bind mount,容器 restart 會使用 image 內建檔案;host 上新版設定不會自動進入 `/app/gunicorn.conf.py`。 +- **檢查**: `docker inspect momo-pro-system | jq '.[0].Mounts'`,確認有 `/app/gunicorn.conf.py`。 +- **修復**: `docker-compose.yml` 的 `momo-app.volumes` 必須包含 `- ./gunicorn.conf.py:/app/gunicorn.conf.py:ro`,再走 CD rebuild 或精準 recreate app/scheduler/bot。 + +### 6. `/metrics` 持續出現 realtime_sales_monthly 欄位不存在 warning +- **原因**: 線上表 schema 可能比 ORM 窄,ORM count 會包子查詢並引用不存在欄位。 +- **修復**: metrics 只需要總筆數時使用 `SELECT COUNT(*) FROM realtime_sales_monthly`,不要透過 `session.query(RealtimeSalesMonthly).count()`。 diff --git a/docs/memory/README.md b/docs/memory/README.md index 8d3bcd4..f0a92b5 100644 --- a/docs/memory/README.md +++ b/docs/memory/README.md @@ -16,6 +16,7 @@ | `ai_automation_closure_20260429.md` | 四 AI Agent 自動化閉環、Smoke、metrics 與 Grafana 觀測實況 | 接續 AI 自動化、EventRouter、AutoHeal、OpenClaw memory、ElephantAlpha 編排、可觀測性時 | | `credentials_passbook.md` | 伺服器、帳密、埠位對照 | 需要維運、部署、憑證核對時 | | `feedback_db_metadata_import.md` | SQLAlchemy metadata / `create_all()` 漏表鐵律 | 新增 model、修 schema、排查 fresh env 漏表時 | +| `db_connection_pool_singleton_20260430.md` | PostgreSQL `too many clients` 連線池放大事故與 DatabaseManager singleton 修正 | 排查 DB 連線數暴增、route 內反覆初始化 DatabaseManager、SQLAlchemy engine/pool 行為時 | | `project_phase3f_cleanup_roadmap.md` | ADR-017 執行矩陣與階段紅線 | 正在做 3f 模組化收尾時 | | `schema_inventory_baseline.md` | DB 表分類與 drift 基線 | 要收斂 migration / ORM / raw SQL 真相時 | diff --git a/docs/memory/ai_automation_closure_20260429.md b/docs/memory/ai_automation_closure_20260429.md index eab92c1..37b40a8 100644 --- a/docs/memory/ai_automation_closure_20260429.md +++ b/docs/memory/ai_automation_closure_20260429.md @@ -20,6 +20,8 @@ - 2026-04-30 發現並修復 `gunicorn.conf.py` `post_fork` 掃到 Flask/Werkzeug LocalProxy 導致 worker boot fail 的問題。 - 2026-04-30 CD 健康檢查曾因 rebuild 後短暫 502 太早失敗;已改為 internal `docker exec momo-pro-system /health` + external `https://mo.wooo.work/health` 雙檢查,重試約 3 分鐘。 - 2026-04-30 CD Sync 模式曾只 rsync + `docker compose up -d`,導致 host 檔案已是新版但 gunicorn process 仍跑舊版;已補 `docker compose restart momo-app scheduler telegram-bot`。 +- 2026-04-30 `gunicorn.conf.py` 不是 app container bind mount,曾導致手動 restart 後回吃 image 內舊設定;`momo-app` 已補 `./gunicorn.conf.py:/app/gunicorn.conf.py:ro`。 +- 2026-04-30 `/metrics` 對 `realtime_sales_monthly` 改用 raw `SELECT COUNT(*)`,避免 ORM 欄位與線上表 schema drift 時每次 Prometheus scrape 都產生 warning。 ## 已落地範圍 @@ -38,6 +40,7 @@ - Daily summary API:`POST /api/ai-automation/smoke/daily-summary/send`。 - Grafana dashboard 檔案:`docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`;provider 會載入 JSON 目錄,不需要修改 dashboard provider。 - Active monitoring 使用 `monitoring/prometheus.yml`,不是 `docker/prometheus/prometheus.yml`;若線上 panel 無資料,先查 Prometheus 是否有 `momo-app` target。 +- App container 的 runtime `gunicorn.conf.py` 由 `docker-compose.yml` bind mount;若未來改 gunicorn 設定,不應再手動 `docker cp` 作為常態流程。 ## 驗證紀錄 @@ -51,6 +54,8 @@ - 2026-04-30 Gunicorn LocalProxy 修復:新增 `tests/test_gunicorn_config.py`。 - 2026-04-30 Prometheus scrape 修復:新增 `tests/test_prometheus_ai_automation_scrape.py`。 - 2026-04-30 CD health check hardening:新增 `tests/test_cd_health_check.py`。 +- 2026-04-30 CD Gunicorn mount hardening:新增 `tests/test_docker_compose_runtime_mounts.py`。 +- 2026-04-30 Metrics schema drift 降噪:`tests/test_ai_automation_metrics.py` 覆蓋 raw sales count query。 - 2026-04-29 L2 安全記憶批次:`24 passed`。 - collect-only:`48 tests collected`。 - `git diff --check` 已通過。 diff --git a/docs/memory/db_connection_pool_singleton_20260430.md b/docs/memory/db_connection_pool_singleton_20260430.md new file mode 100644 index 0000000..95bf97e --- /dev/null +++ b/docs/memory/db_connection_pool_singleton_20260430.md @@ -0,0 +1,15 @@ +# DB Connection Pool Singleton Hotfix (2026-04-30) + +## 背景 + +`/daily_sales` 出現 PostgreSQL `FATAL: sorry, too many clients already`。根因不是單一查詢過重,而是多個 route 會在 request 內直接呼叫 `DatabaseManager()`,舊實作每次初始化都 `create_engine()`,導致同一個 worker process 內反覆建立新的 SQLAlchemy connection pool。 + +## 修正 + +`database/manager.py` 的 `DatabaseManager` 以 `(DATABASE_TYPE, effective_db_path)` 作為 key 快取 `engine` 與 `Session` factory。後續直接 `DatabaseManager()` 會共用同一組連線池,不再持續放大 PostgreSQL client 數。 + +## 維運提醒 + +- 若再次遇到 `too many clients already`,先檢查是否有新模組繞過 `database.manager.get_db_manager()` 或直接 `create_engine()`。 +- 熱修後需重啟 `momo-pro-system`,讓舊 process 釋放既有連線池。 +- 不需要重啟或重建 `momo-db`。 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index b204cdd..2618975 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -33,6 +33,8 @@ - **Grafana 線上載入與 scrape 修復**: 188 active Grafana 載入 4 dashboards;active Prometheus 補 `momo-app` scrape job,並修復 gunicorn preload LocalProxy boot crash。 - **CD 健康檢查強化**: Gitea Actions health check 改為 internal container health + external URL 雙檢查,降低 rebuild 後短暫 502 誤判。 - **CD Sync reload 修復**: rsync 後明確 restart 三容器,避免 bind-mounted Python 檔案更新但 gunicorn/scheduler/bot process 未 reload。 +- **CD Gunicorn 掛載強化**: `momo-app` 補 `./gunicorn.conf.py:/app/gunicorn.conf.py:ro`,避免容器 restart 後回吃 image 內舊 gunicorn 設定。 +- **Metrics schema drift 降噪**: `/metrics` 的 `realtime_sales_monthly` 總筆數改用 raw `COUNT(*)`,避免 ORM 欄位 drift 造成 Prometheus scrape warning。 ### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除 - **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。 diff --git a/routes/system_public_routes.py b/routes/system_public_routes.py index 6b02de9..299a80b 100644 --- a/routes/system_public_routes.py +++ b/routes/system_public_routes.py @@ -70,23 +70,16 @@ def prometheus_metrics(): db_status.set(0) app_health.set(0) + session = None try: db = DatabaseManager() session = db.get_session() - - product_count = Gauge('momo_products_total', '商品總數', registry=registry) - product_count.set(session.query(Product).count()) - - price_record_count = Gauge('momo_price_records_total', '價格記錄總數', registry=registry) - price_record_count.set(session.query(PriceRecord).count()) - - from database.realtime_sales_models import RealtimeSalesMonthly - sales_count = Gauge('momo_sales_records_total', '業績資料總數', registry=registry) - sales_count.set(session.query(RealtimeSalesMonthly).count()) - - session.close() + _set_database_record_counts(registry, Gauge, session) except Exception as e: sys_log.warning(f"[Metrics] 無法取得資料庫統計: {e}") + finally: + if session is not None: + session.close() try: _register_ai_automation_metrics(registry, Gauge, ai_metrics_snapshot()) @@ -113,6 +106,20 @@ def _labels_to_dict(labels): return dict(labels) +def _set_database_record_counts(registry, gauge_cls, session): + """Register DB counts without selecting drift-prone sales columns.""" + product_count = gauge_cls('momo_products_total', '商品總數', registry=registry) + product_count.set(session.query(Product).count()) + + price_record_count = gauge_cls('momo_price_records_total', '價格記錄總數', registry=registry) + price_record_count.set(session.query(PriceRecord).count()) + + # V-Fix: realtime_sales_monthly 線上欄位曾與 ORM 不同步,metrics 只需要總筆數。 + sales_count = gauge_cls('momo_sales_records_total', '業績資料總數', registry=registry) + sales_total = session.execute(text("SELECT COUNT(*) FROM realtime_sales_monthly")).scalar() or 0 + sales_count.set(sales_total) + + def _register_ai_automation_metrics(registry, gauge_cls, metrics_snapshot): """Export dependency-free AI metrics into a per-request Prometheus registry.""" gauges = {} diff --git a/tests/test_ai_automation_metrics.py b/tests/test_ai_automation_metrics.py index 9f5e1c4..477072c 100644 --- a/tests/test_ai_automation_metrics.py +++ b/tests/test_ai_automation_metrics.py @@ -61,3 +61,45 @@ def test_system_metrics_exports_ai_automation_metrics(): assert 'event_type="crawler_timeout"' in output assert 'outcome="delivered"' in output assert "momo_ai_event_router_latency_ms_count" in output + + +def test_system_metrics_counts_sales_records_with_raw_count_query(): + from prometheus_client import CollectorRegistry, Gauge, generate_latest + from routes.system_public_routes import _set_database_record_counts + + class FakeQuery: + def __init__(self, value): + self.value = value + + def count(self): + return self.value + + class FakeResult: + def scalar(self): + return 789 + + class FakeSession: + def __init__(self): + self.executed_sql = [] + + def query(self, model): + if model.__name__ == "Product": + return FakeQuery(123) + if model.__name__ == "PriceRecord": + return FakeQuery(456) + raise AssertionError(f"不應透過 ORM 查詢 {model.__name__}") + + def execute(self, statement): + self.executed_sql.append(str(statement)) + return FakeResult() + + session = FakeSession() + registry = CollectorRegistry() + + _set_database_record_counts(registry, Gauge, session) + + output = generate_latest(registry).decode("utf-8") + assert "momo_products_total 123.0" in output + assert "momo_price_records_total 456.0" in output + assert "momo_sales_records_total 789.0" in output + assert session.executed_sql == ["SELECT COUNT(*) FROM realtime_sales_monthly"] diff --git a/tests/test_docker_compose_runtime_mounts.py b/tests/test_docker_compose_runtime_mounts.py new file mode 100644 index 0000000..b3871fe --- /dev/null +++ b/tests/test_docker_compose_runtime_mounts.py @@ -0,0 +1,12 @@ +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +DOCKER_COMPOSE = ROOT / "docker-compose.yml" + + +def test_momo_app_mounts_gunicorn_config_for_sync_deploys(): + compose = DOCKER_COMPOSE.read_text(encoding="utf-8") + + assert 'command: ["gunicorn", "--config", "gunicorn.conf.py", "app:app"]' in compose + assert "- ./gunicorn.conf.py:/app/gunicorn.conf.py:ro" in compose