fix(db): 收斂 DatabaseManager PostgreSQL 連線池
All checks were successful
CD Pipeline / deploy (push) Successful in 1m35s
All checks were successful
CD Pipeline / deploy (push) Successful in 1m35s
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
|
||||
> 本文件定義專案開發的核心準則與不可違反的規範
|
||||
> **建立日期**: 2026-01-12
|
||||
> **當前版本**: V10.15 (ElephantAlpha NIM fallback 強化版)
|
||||
> **當前版本**: V10.16 (DatabaseManager 連線池收斂版)
|
||||
> **最後更新**: 2026-04-30
|
||||
|
||||
---
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
- Metrics schema drift 降噪:`realtime_sales_monthly` 總筆數改用 raw `COUNT(*)`,避免 ORM 欄位 drift 造成 Prometheus scrape warning。
|
||||
- CD Rebuild 切換強化:rebuild 模式改為先 `docker compose build --no-cache momo-app` 成功,再 stop/rm/recreate 三應用容器,避免長時間 502。
|
||||
- ElephantAlpha NIM fallback 強化:預設改用 production 可呼叫的 `nvidia/llama-3.3-nemotron-super-49b-v1.5`,Ultra 253B 權限 404 時自動 fallback。
|
||||
- DatabaseManager 連線池收斂:PostgreSQL 每 worker pool 調整為 `pool_size=2/max_overflow=3`,避免多 route 重複 new manager 時吃滿連線。
|
||||
|
||||
【下次待辦】
|
||||
- 觀察 Prometheus scrape 後 `momo_ai_*` 是否在事件發生後產生時間序列。
|
||||
|
||||
4
app.py
4
app.py
@@ -95,8 +95,8 @@ except Exception as e:
|
||||
sys_log.error(f"無法檢測磁碟空間: {e}")
|
||||
|
||||
# 🚩 系統版本定義 (備份與顯示用)
|
||||
# 🚩 2026-04-30 V10.15: ElephantAlpha NIM model fallback hardening
|
||||
SYSTEM_VERSION = "V10.15"
|
||||
# 🚩 2026-04-30 V10.16: DatabaseManager PostgreSQL pool convergence
|
||||
SYSTEM_VERSION = "V10.16"
|
||||
|
||||
# ==========================================
|
||||
# 🔒 SQL Injection 防護函數
|
||||
|
||||
@@ -253,7 +253,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
|
||||
# ==========================================
|
||||
# 系統版本與路徑
|
||||
# ==========================================
|
||||
SYSTEM_VERSION = "V10.15"
|
||||
SYSTEM_VERSION = "V10.16"
|
||||
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
|
||||
public_url = PUBLIC_URL # 用於模板顯示
|
||||
|
||||
|
||||
@@ -101,53 +101,53 @@ class DatabaseManager:
|
||||
self.Session = cached['Session']
|
||||
return
|
||||
|
||||
if DATABASE_TYPE == 'postgresql':
|
||||
# PostgreSQL 模式 - 使用 config.py 的連線字串
|
||||
# 連線池配置以提升穩定性
|
||||
self.engine = create_engine(
|
||||
effective_db_path,
|
||||
echo=False,
|
||||
pool_pre_ping=True, # 自動檢測斷線連線
|
||||
pool_size=5, # 連線池大小
|
||||
max_overflow=10, # 額外連線數
|
||||
pool_recycle=1800, # 30分鐘回收連線
|
||||
pool_timeout=30, # 獲取連線超時
|
||||
connect_args={
|
||||
'connect_timeout': 10, # 連線超時 10 秒
|
||||
'options': '-c statement_timeout=60000' # SQL 超時 60 秒
|
||||
if DATABASE_TYPE == 'postgresql':
|
||||
# PostgreSQL 模式 - 使用 config.py 的連線字串
|
||||
# V-Fix: gunicorn 多 worker 下需控制每個 process 的最大連線數。
|
||||
self.engine = create_engine(
|
||||
effective_db_path,
|
||||
echo=False,
|
||||
pool_pre_ping=True, # 自動檢測斷線連線
|
||||
pool_size=2, # 每個 worker 保留少量常駐連線
|
||||
max_overflow=3, # 突發上限,避免吃滿 PostgreSQL clients
|
||||
pool_recycle=1800, # 30分鐘回收連線
|
||||
pool_timeout=30, # 獲取連線超時
|
||||
connect_args={
|
||||
'connect_timeout': 10, # 連線超時 10 秒
|
||||
'options': '-c statement_timeout=60000' # SQL 超時 60 秒
|
||||
}
|
||||
)
|
||||
ensure_metadata_initialized(self.engine, use_postgres_lock=True)
|
||||
self.Session = sessionmaker(bind=self.engine)
|
||||
self._instance_cache[cache_key] = {
|
||||
'engine': self.engine,
|
||||
'Session': self.Session,
|
||||
}
|
||||
)
|
||||
ensure_metadata_initialized(self.engine, use_postgres_lock=True)
|
||||
self.Session = sessionmaker(bind=self.engine)
|
||||
self._instance_cache[cache_key] = {
|
||||
'engine': self.engine,
|
||||
'Session': self.Session,
|
||||
}
|
||||
sys_log.info(f"[Database] ✅ 使用 PostgreSQL 資料庫 (連線池已優化)")
|
||||
# ADR-013: 確保 AIOps 自動修復表存在並植入種子 PlayBook
|
||||
self._init_autoheal_tables()
|
||||
else:
|
||||
# SQLite 模式 - 向後相容
|
||||
if db_path is None:
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
effective_db_path = os.path.join(base_dir, 'data', 'momo_database.db')
|
||||
|
||||
if str(effective_db_path).startswith('sqlite://'):
|
||||
sqlite_db_file = make_url(effective_db_path).database
|
||||
if sqlite_db_file:
|
||||
os.makedirs(os.path.dirname(sqlite_db_file), exist_ok=True)
|
||||
self.engine = create_engine(effective_db_path, echo=False)
|
||||
sys_log.info(f"[Database] ✅ 使用 PostgreSQL 資料庫 (連線池已收斂)")
|
||||
# ADR-013: 確保 AIOps 自動修復表存在並植入種子 PlayBook
|
||||
self._init_autoheal_tables()
|
||||
else:
|
||||
os.makedirs(os.path.dirname(effective_db_path), exist_ok=True)
|
||||
self.engine = create_engine(f'sqlite:///{effective_db_path}', echo=False)
|
||||
Base.metadata.create_all(self.engine)
|
||||
self.Session = sessionmaker(bind=self.engine)
|
||||
self._instance_cache[cache_key] = {
|
||||
'engine': self.engine,
|
||||
'Session': self.Session,
|
||||
}
|
||||
self._check_and_fix_schema()
|
||||
sys_log.info(f"[Database] 使用 SQLite 資料庫: {effective_db_path}")
|
||||
# SQLite 模式 - 向後相容
|
||||
if db_path is None:
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
effective_db_path = os.path.join(base_dir, 'data', 'momo_database.db')
|
||||
|
||||
if str(effective_db_path).startswith('sqlite://'):
|
||||
sqlite_db_file = make_url(effective_db_path).database
|
||||
if sqlite_db_file:
|
||||
os.makedirs(os.path.dirname(sqlite_db_file), exist_ok=True)
|
||||
self.engine = create_engine(effective_db_path, echo=False)
|
||||
else:
|
||||
os.makedirs(os.path.dirname(effective_db_path), exist_ok=True)
|
||||
self.engine = create_engine(f'sqlite:///{effective_db_path}', echo=False)
|
||||
Base.metadata.create_all(self.engine)
|
||||
self.Session = sessionmaker(bind=self.engine)
|
||||
self._instance_cache[cache_key] = {
|
||||
'engine': self.engine,
|
||||
'Session': self.Session,
|
||||
}
|
||||
self._check_and_fix_schema()
|
||||
sys_log.info(f"[Database] 使用 SQLite 資料庫: {effective_db_path}")
|
||||
|
||||
def _check_and_fix_schema(self):
|
||||
"""自動檢查並修復資料庫結構 (僅限 SQLite)"""
|
||||
|
||||
@@ -96,3 +96,8 @@
|
||||
- **原因**: NVIDIA `/v1/models` 可能列出模型,但帳號未必可呼叫該 hosted function;production 曾對 Ultra 253B 回 `Function ... Not found for account`。
|
||||
- **檢查**: 在容器內用 `NVIDIA_API_KEY` 呼叫 `https://integrate.api.nvidia.com/v1/models`,再用低 `max_tokens` 測試目標 model。
|
||||
- **修復**: `ELEPHANT_ALPHA_MODEL` 預設使用 `nvidia/llama-3.3-nemotron-super-49b-v1.5`,並保留 `ELEPHANT_ALPHA_FALLBACK_MODELS`。
|
||||
|
||||
### 9. PostgreSQL 連線數快速升高
|
||||
- **原因**: Flask routes 會頻繁建立 `DatabaseManager()`,若每次都產生新 engine/pool,Gunicorn 多 worker 會快速吃滿 PostgreSQL clients。
|
||||
- **修復**: `DatabaseManager` 以 `(DATABASE_TYPE, DATABASE_PATH)` 重用 engine/session,且 PostgreSQL pool 收斂為 `pool_size=2/max_overflow=3`。
|
||||
- **檢查**: app log 應出現 `使用 PostgreSQL 資料庫 (連線池已收斂)`,Gunicorn `post_fork` 仍需 dispose inherited engines。
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
- 2026-04-30 `/metrics` 對 `realtime_sales_monthly` 改用 raw `SELECT COUNT(*)`,避免 ORM 欄位與線上表 schema drift 時每次 Prometheus scrape 都產生 warning。
|
||||
- 2026-04-30 CD Rebuild 模式曾先停三應用容器再 no-cache build,造成 build 時間全變成 502;已改為 build 成功後才短暫 stop/rm/recreate。
|
||||
- 2026-04-30 production `NVIDIA_API_KEY` 可列出 Ultra 253B 但呼叫 `nvidia/llama-3.1-nemotron-ultra-253b-v1` 會 404;ElephantAlpha 預設改用 `nvidia/llama-3.3-nemotron-super-49b-v1.5` 並加入 fallback models。
|
||||
- 2026-04-30 `DatabaseManager()` 多 route 重複建立曾有吃滿 PostgreSQL clients 風險;已重用 engine/session 並將每 worker pool 收斂為 `pool_size=2/max_overflow=3`。
|
||||
|
||||
## 已落地範圍
|
||||
|
||||
@@ -60,6 +61,7 @@
|
||||
- 2026-04-30 Metrics schema drift 降噪:`tests/test_ai_automation_metrics.py` 覆蓋 raw sales count query。
|
||||
- 2026-04-30 CD rebuild cutover hardening:`tests/test_cd_health_check.py` 覆蓋 build-before-stop 順序。
|
||||
- 2026-04-30 ElephantAlpha NIM fallback hardening:新增 `tests/test_elephant_service.py`。
|
||||
- 2026-04-30 DatabaseManager pool convergence:`tests/test_database_manager_cache.py` 覆蓋 pool size/overflow 與 engine reuse。
|
||||
- 2026-04-29 L2 安全記憶批次:`24 passed`。
|
||||
- collect-only:`48 tests collected`。
|
||||
- `git diff --check` 已通過。
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
- **Metrics schema drift 降噪**: `/metrics` 的 `realtime_sales_monthly` 總筆數改用 raw `COUNT(*)`,避免 ORM 欄位 drift 造成 Prometheus scrape warning。
|
||||
- **CD Rebuild 切換強化**: rebuild 模式改成先 build 成功、再短暫 stop/rm/recreate 三應用容器,避免 no-cache build 長時間 502。
|
||||
- **ElephantAlpha NIM fallback 強化**: production 帳號呼叫 Ultra 253B 會 404,預設改用可呼叫的 Nemotron Super 49B v1.5,並加入 70B / 8B fallback。
|
||||
- **DatabaseManager 連線池收斂**: PostgreSQL 每 worker pool 收斂為 `pool_size=2/max_overflow=3`,並以 cache 重用 engine/session。
|
||||
|
||||
### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除
|
||||
- **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。
|
||||
|
||||
@@ -14,3 +14,48 @@ def test_database_manager_reuses_engine_for_same_sqlite_path(tmp_path):
|
||||
finally:
|
||||
DatabaseManager._instance_cache.clear()
|
||||
first.engine.dispose()
|
||||
|
||||
|
||||
def test_database_manager_uses_bounded_postgres_pool(monkeypatch):
    """Check the keyword arguments DatabaseManager passes to create_engine for PostgreSQL.

    ``create_engine`` and ``sessionmaker`` in ``database.manager`` are replaced
    with fakes that record their arguments, so no real database is contacted.
    Metadata initialization and ``_init_autoheal_tables`` are stubbed out for
    the same reason.
    """
    import config
    import database.manager as manager

    # Filled in by the fakes below: the URL, the engine kwargs, and the
    # engine object that sessionmaker was bound to.
    captured = {}

    class FakeEngine:
        def dispose(self):
            pass

    class FakeSession:
        pass

    def fake_create_engine(url, **kwargs):
        captured["url"] = url
        captured["kwargs"] = kwargs
        return FakeEngine()

    def fake_sessionmaker(bind):
        captured["session_bind"] = bind
        return FakeSession

    # Start from an empty cache so the constructor does not short-circuit on
    # an engine cached by an earlier test.
    DatabaseManager._instance_cache.clear()
    monkeypatch.setattr(config, "DATABASE_TYPE", "postgresql")
    monkeypatch.setattr(config, "DATABASE_PATH", "postgresql://example/db")
    monkeypatch.setattr(manager, "create_engine", fake_create_engine)
    monkeypatch.setattr(manager, "sessionmaker", fake_sessionmaker)
    monkeypatch.setattr(manager, "ensure_metadata_initialized", lambda *args, **kwargs: None)
    monkeypatch.setattr(DatabaseManager, "_init_autoheal_tables", lambda self: None)

    try:
        db = DatabaseManager()

        engine_kwargs = captured["kwargs"]
        connect_args = engine_kwargs["connect_args"]

        assert captured["url"] == "postgresql://example/db"
        assert engine_kwargs["pool_pre_ping"] is True
        assert engine_kwargs["pool_size"] == 2
        assert engine_kwargs["max_overflow"] == 3
        assert engine_kwargs["pool_recycle"] == 1800
        assert engine_kwargs["pool_timeout"] == 30
        assert connect_args["connect_timeout"] == 10
        assert "statement_timeout=60000" in connect_args["options"]
        assert db.engine is captured["session_bind"]
    finally:
        # Always drop the cached fake engine/session, even when an assertion
        # fails, so later tests never reuse FakeEngine from the class cache.
        DatabaseManager._instance_cache.clear()
|
||||
|
||||
Reference in New Issue
Block a user