From 952c10955b428dd916392ed3f066bda59bf1ce96 Mon Sep 17 00:00:00 2001 From: OG T Date: Wed, 15 Apr 2026 15:38:43 +0800 Subject: [PATCH] =?UTF-8?q?fix(db):=20=E5=A4=9A=20replica=20=E4=B8=A6?= =?UTF-8?q?=E8=A1=8C=E5=95=9F=E5=8B=95=E7=AB=B6=E7=88=AD=20=E2=80=94=20?= =?UTF-8?q?=E6=AF=8F=20table=20=E7=8D=A8=E7=AB=8B=20tx=20+=20DROP=20INDEX?= =?UTF-8?q?=20IF=20EXISTS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因:單一大 transaction 內兩個 pod 同時建同一個 table, 其中一個 CREATE INDEX 失敗 → 整個 transaction ROLLBACK → table 也消失 → 下次重啟同樣情況 → 無限 CrashLoop。 修法三層: 1. 每個 table 用獨立 transaction 建立(失敗不影響其他) 2. 建 table 前先 DROP INDEX IF EXISTS 清殘留孤兒 index 3. 捕捉 "already exists" 讓並行 pod 優雅跳過(不 crash) Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/db/base.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/apps/api/src/db/base.py b/apps/api/src/db/base.py index ede5270f..0990bd93 100644 --- a/apps/api/src/db/base.py +++ b/apps/api/src/db/base.py @@ -143,21 +143,33 @@ async def init_db() -> None: Call this at application startup. """ engine = get_engine() + + # 2026-04-15 ogt: 多 replica 並行啟動競爭修復 + # 問題:單一大 transaction 裡兩個 pod 同時建 table → 其中一個 CREATE INDEX 失敗 + # PostgreSQL 中 transaction 內任何錯誤導致整個 transaction ROLLBACK + # → table + index 全消失 → 下次重啟同樣問題 → 無限 CrashLoop + # 修法:每個 table 獨立 transaction;先 DROP INDEX IF EXISTS 清殘留孤兒 index; + # 捕捉 "already exists" 讓並行 pod 優雅跳過 + async with engine.connect() as probe_conn: + existing = set(await probe_conn.run_sync( + lambda c: __import__('sqlalchemy', fromlist=['inspect']).inspect(c).get_table_names() + )) + + for table in Base.metadata.sorted_tables: + if table.name not in existing: + try: + async with engine.begin() as conn: + # 先清殘留孤兒 index(前次 CrashLoop 留下的部分狀態) + for index in table.indexes: + await conn.execute(text(f'DROP INDEX IF EXISTS "{index.name}"')) + await conn.run_sync(table.create) + except Exception as exc: + if "already exists" in str(exc).lower(): + pass # 並行 pod 已建好,忽略 + else: + raise + async with engine.begin() as conn: - # SQLAlchemy 2.0 問題:create_all(checkfirst=True) 跳過 CREATE TABLE, - # 但仍對 __table_args__ Index 物件發出獨立 CREATE INDEX → CrashLoopBackOff - # 修法:先 inspect 取得現有 tables,只對不存在的 table 呼叫 table.create() - # 這樣 index 只隨新 table 一起建立,永遠不會 duplicate - # 2026-04-15 Claude Sonnet 4.6(亞太)Phase 3 修復 - def _create_missing_tables(sync_conn): - from sqlalchemy import inspect as sa_inspect - existing = set(sa_inspect(sync_conn).get_table_names()) - for table in Base.metadata.sorted_tables: - if table.name not in existing: - table.create(sync_conn) - - await conn.run_sync(_create_missing_tables) - # 2026-04-02 Claude Code: 確保 risklevel enum 包含 'high' 值 # Phase 23 新增,避免舊 DB 缺少此值導致 InvalidTextRepresentation await conn.execute(