fix(api): serialize startup bootstrap ddl
This commit is contained in:
@@ -157,6 +157,9 @@ async def get_db_context(project_id: str | None = None) -> AsyncGenerator[AsyncS
|
||||
# Initialization
|
||||
# =============================================================================
|
||||
|
||||
_DB_BOOTSTRAP_LOCK_NAME = "awoooi:init_db:ddl"
|
||||
|
||||
|
||||
async def init_db() -> None:
|
||||
"""
|
||||
Initialize database tables
|
||||
@@ -165,6 +168,28 @@ async def init_db() -> None:
|
||||
"""
|
||||
engine = get_engine()
|
||||
|
||||
async with engine.connect() as lock_conn:
|
||||
# 2026-05-24 ogt + Codex: 兩個 API replica 同時啟動時,PostgreSQL 會在
|
||||
# ALTER TABLE ... IF NOT EXISTS 上互相等待並 deadlock。整段 bootstrap
|
||||
# DDL 必須序列化,避免 rollout 因一個 pod CrashLoop 變成 1/2 ready。
|
||||
await lock_conn.execute(
|
||||
text("SELECT pg_advisory_lock(hashtext(:lock_name))"),
|
||||
{"lock_name": _DB_BOOTSTRAP_LOCK_NAME},
|
||||
)
|
||||
try:
|
||||
await _run_init_db_ddl(engine)
|
||||
finally:
|
||||
await lock_conn.execute(
|
||||
text("SELECT pg_advisory_unlock(hashtext(:lock_name))"),
|
||||
{"lock_name": _DB_BOOTSTRAP_LOCK_NAME},
|
||||
)
|
||||
|
||||
|
||||
async def _run_init_db_ddl(engine: AsyncEngine) -> None:
|
||||
"""
|
||||
Run idempotent DB bootstrap DDL while caller holds the bootstrap advisory lock.
|
||||
"""
|
||||
|
||||
# 2026-04-15 ogt: 多 replica 並行啟動競爭修復
|
||||
# 問題:單一大 transaction 裡兩個 pod 同時建 table → 其中一個 CREATE INDEX 失敗
|
||||
# PostgreSQL 中 transaction 內任何錯誤導致整個 transaction ROLLBACK
|
||||
|
||||
@@ -31,6 +31,7 @@ from fastapi.responses import JSONResponse, Response
|
||||
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
|
||||
from sentry_sdk.integrations.fastapi import FastApiIntegration
|
||||
from sentry_sdk.integrations.starlette import StarletteIntegration
|
||||
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
|
||||
|
||||
from src.api.v1 import agents as agents_v1 # Phase 9.5: Agent Teams API
|
||||
from src.api.v1 import ai as ai_v1
|
||||
@@ -84,7 +85,11 @@ from src.api.v1 import webhooks as webhooks_v1
|
||||
from src.core.config import settings
|
||||
from src.core.http_client import close_all_http_clients, init_all_http_clients
|
||||
from src.core.logging import get_logger, setup_logging
|
||||
from src.core.redis_client import close_redis_pool, init_redis_pool
|
||||
from src.core.redis_client import (
|
||||
close_redis_pool,
|
||||
close_worker_redis_pool,
|
||||
init_redis_pool,
|
||||
)
|
||||
from src.core.sse import get_publisher
|
||||
from src.core.telemetry import setup_telemetry, shutdown_telemetry
|
||||
|
||||
@@ -783,6 +788,7 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
|
||||
# Phase 6.1: 關閉 Signal Worker (先關閉 Consumer)
|
||||
await close_signal_worker()
|
||||
await close_worker_redis_pool()
|
||||
await publisher.stop()
|
||||
await close_executor()
|
||||
await close_openclaw()
|
||||
@@ -835,12 +841,8 @@ else:
|
||||
# Middleware
|
||||
# =============================================================================
|
||||
|
||||
# 2026-04-03 ogt: Nginx 反向代理修正 — 讓 FastAPI 信任 X-Forwarded-Proto
|
||||
# 解決問題: /api/v1/knowledge (無結尾斜線) 307 redirect 產生 http:// Location
|
||||
# 原因: FastAPI 不知道自己在 HTTPS 後面,redirect 回 http://
|
||||
# 效果: 有了此中間件,307 Location 會是 https://
|
||||
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
|
||||
|
||||
# 2026-04-03 ogt: Nginx 反向代理修正 — 讓 FastAPI 信任 X-Forwarded-Proto。
|
||||
# 避免 /api/v1/knowledge 等 redirect 在 HTTPS 反向代理後產生 http:// Location。
|
||||
app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")
|
||||
|
||||
# CORS - Strict Whitelist (Iron Law #2)
|
||||
|
||||
@@ -33,7 +33,7 @@ from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
from src.core.redis_client import get_redis, get_worker_redis
|
||||
from src.core.redis_client import get_redis, get_worker_redis, init_worker_redis_pool
|
||||
from src.core.telemetry import restore_trace_context
|
||||
from src.services.incident_engine import get_incident_engine
|
||||
|
||||
@@ -131,6 +131,7 @@ class SignalWorker:
|
||||
return
|
||||
|
||||
await self._ensure_consumer_group()
|
||||
await init_worker_redis_pool()
|
||||
|
||||
self._running = True
|
||||
self._task = asyncio.create_task(self._consume_loop())
|
||||
|
||||
Reference in New Issue
Block a user