From 1ceab9ad34f6aed53e829fdb2504676199656052 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Jul 2026 14:11:34 +0800 Subject: [PATCH] fix(api): skip bootstrap ddl statement timeout --- apps/api/src/db/base.py | 31 +++++++++++++++++++ .../tests/test_runtime_bootstrap_guards.py | 24 ++++++++++++++ docs/LOGBOOK.md | 15 +++++++++ 3 files changed, 70 insertions(+) diff --git a/apps/api/src/db/base.py b/apps/api/src/db/base.py index 9e63f017..1d24bb61 100644 --- a/apps/api/src/db/base.py +++ b/apps/api/src/db/base.py @@ -216,6 +216,29 @@ def _is_database_connection_budget_error(exc: BaseException) -> bool: return False +def _is_database_bootstrap_ddl_timeout(exc: BaseException) -> bool: + """Return True only for optional bootstrap DDL canceled by DB timeout.""" + seen: set[int] = set() + current: BaseException | None = exc + timeout_markers = ( + "querycancelederror", + "canceling statement due to statement timeout", + "statement timeout", + ) + + while current is not None and id(current) not in seen: + seen.add(id(current)) + message = f"{type(current).__name__}: {current}".lower() + if any(marker in message for marker in timeout_markers): + return True + current = ( + getattr(current, "orig", None) + or getattr(current, "__cause__", None) + or getattr(current, "__context__", None) + ) + return False + + async def init_db() -> None: """ Initialize database tables @@ -251,6 +274,14 @@ async def init_db() -> None: timeout_seconds=_DB_BOOTSTRAP_DDL_WAIT_SECONDS, lock_name=_DB_BOOTSTRAP_LOCK_NAME, ) + except DBAPIError as exc: + if not _is_database_bootstrap_ddl_timeout(exc): + raise + logger.warning( + "database_bootstrap_statement_timeout_skipped", + error_type=type(exc).__name__, + lock_name=_DB_BOOTSTRAP_LOCK_NAME, + ) finally: await lock_conn.execute( text("SELECT pg_advisory_unlock(hashtext(:lock_name))"), diff --git a/apps/api/tests/test_runtime_bootstrap_guards.py b/apps/api/tests/test_runtime_bootstrap_guards.py index 93db3215..317824ae 100644 --- 
@pytest.mark.asyncio
async def test_init_db_skips_bootstrap_when_postgres_statement_timeout(monkeypatch):
    """init_db() must not propagate a statement-timeout DBAPIError from DDL.

    The DDL runner is replaced with a stub that raises a ``DBAPIError``
    wrapping "canceling statement due to statement timeout". The call is
    expected to return normally, and the fake lock connection must have
    recorded the advisory lock attempt first, an advisory unlock, and a
    ``COMMIT``.
    """
    from src.db import base as db_base

    engine_stub = _FakeEngine()

    async def fake_run_init_db_ddl(_engine: object) -> None:
        statement = (
            "ALTER TABLE approval_records ADD COLUMN IF NOT EXISTS telegram_message_id"
        )
        driver_error = Exception("canceling statement due to statement timeout")
        raise DBAPIError(statement, {}, driver_error)

    monkeypatch.setattr(db_base, "_run_init_db_ddl", fake_run_init_db_ddl)
    monkeypatch.setattr(db_base, "get_engine", lambda: engine_stub)

    # Must complete without raising even though the DDL stub fails.
    await db_base.init_db()

    recorded = engine_stub.lock_conn.statements
    assert "pg_try_advisory_lock" in recorded[0]
    assert any("pg_advisory_unlock" in stmt for stmt in recorded)
    assert "COMMIT" in recorded
container exit 3。 +- `apps/api/src/db/base.py` 新增 `_is_database_bootstrap_ddl_timeout()`,只針對 optional bootstrap DDL 的 `QueryCanceledError` / `canceling statement due to statement timeout` fail-visible skip;其他 DBAPIError 仍 raise,避免吞真正 migration / SQL 錯誤。 +- `apps/api/tests/test_runtime_bootstrap_guards.py` 新增 regression:模擬 `DBAPIError(... statement timeout ...)` 時,`init_db()` 必須釋放 advisory lock、commit unlock,且不讓 API startup crash。 + +**驗證**: +- `python3.11 -m py_compile apps/api/src/db/base.py` +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost/test python3.11 -m pytest apps/api/tests/test_runtime_bootstrap_guards.py -q`:`9 passed`。 + +**仍維持**: +- 未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未重啟主機 / Docker / Nginx / K3s / DB / firewall;K8s 查詢為 read-only。 + ## 2026-07-02 — 13:55 Telegram 告警 receipt / AI route coverage 缺口讀回 **完成內容**: