""" batch_vectorize_km.py — ADR-073 Phase 1 Step 9 批次向量化所有 embedding IS NULL 的 KnowledgeEntry。 執行方式(在 K8s Pod 內): # 透過 AWOOOI API embed-all endpoint python3 scripts/batch_vectorize_km.py --via-api # 直接呼叫 Service(需在 Pod 內) python3 scripts/batch_vectorize_km.py 2026-04-12 Claude Sonnet 4.6 (ADR-073 Phase 1) """ import asyncio import asyncpg import os import sys import httpx VIA_API = "--via-api" in sys.argv DRY_RUN = "--dry-run" in sys.argv DATABASE_URL = os.environ.get("DATABASE_URL", "").replace("postgresql+asyncpg://", "postgresql://") API_BASE = os.environ.get("API_BASE", "http://localhost:8000") PROJECT_ID = os.environ.get("AWOOOP_PROJECT_ID", "awoooi") async def check_status(): """確認需要向量化的數量""" conn = await asyncpg.connect(DATABASE_URL) try: await conn.execute("SELECT set_config('app.project_id', $1, FALSE)", PROJECT_ID) total = await conn.fetchval("SELECT count(*) FROM knowledge_entries") null_emb = await conn.fetchval("SELECT count(*) FROM knowledge_entries WHERE embedding IS NULL") return total, null_emb finally: await conn.close() async def via_api(): """呼叫 /api/v1/knowledge/embed-all endpoint""" print(f"呼叫 {API_BASE}/api/v1/knowledge/embed-all ...") async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.post(f"{API_BASE}/api/v1/knowledge/embed-all") if resp.status_code == 200: result = resp.json() print(f"✅ API 回應: {result}") else: print(f"❌ API 錯誤 {resp.status_code}: {resp.text}") async def via_service(): """直接呼叫 KnowledgeService.embed_all_entries()""" sys.path.insert(0, "/app") from src.services.knowledge_service import get_knowledge_service service = get_knowledge_service() result = await service.embed_all_entries() print(f"✅ 向量化結果: {result}") async def main(): if not DATABASE_URL and not VIA_API: print("ERROR: DATABASE_URL 未設定,請加 --via-api 使用 API 模式") sys.exit(1) if not VIA_API and DATABASE_URL: total, null_emb = await check_status() print(f"knowledge_entries: total={total}, embedding IS NULL={null_emb}") if null_emb == 0: print("✅ 所有條目已向量化,無需執行") return if DRY_RUN: print(f"[DRY RUN] 將向量化 {null_emb} 筆 KM 條目") print("使用 --via-api 透過 API 執行,或不加參數直接執行 Service") return if VIA_API: await via_api() else: await via_service() # 完成後確認 if DATABASE_URL: total, null_emb = await check_status() print(f"\n完成後狀態: total={total}, embedding IS NULL={null_emb}, vectorized={total - null_emb}") if __name__ == "__main__": asyncio.run(main())