From b4055c5915bfcf8f6a37f9656be291ebe7fc526c Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 4 May 2026 11:18:20 +0800
Subject: [PATCH] =?UTF-8?q?feat(embedding):=20ADR-110=20=E5=8D=87=E7=B4=9A?=
 =?UTF-8?q?=20bge-m3:latest=201024=20=E7=B6=AD=E5=90=91=E9=87=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GCP-A (34.143.170.20) 無 nomic-embed-text,改用 bge-m3:latest(專用
多語言 embedding 模型),產生 1024 維向量。

變更:
- embedding_service.py: 加入 bge-m3:latest=1024 維到 MODEL_DIMENSIONS,
  預設模型改為 bge-m3:latest,更新文件說明
- playbook_embedding_repository.py + interfaces.py: 更新維度說明
- migrations/embedding_bge_m3_1024.sql: pgvector schema 遷移
  rag_chunks + playbook_embeddings vector(768) → vector(1024)
- scripts/reembed_bge_m3.py: 遷移後重新嵌入現有資料的 script

遷移步驟:
1. 執行 embedding_bge_m3_1024.sql(清空現有 768 維向量,變更維度)
2. 執行 python scripts/reembed_bge_m3.py 重新嵌入

2026-05-04 ogt + Claude Sonnet 4.6

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/migrations/embedding_bge_m3_1024.sql |  96 ++++++
 apps/api/scripts/reembed_bge_m3.py            | 319 ++++++++++++++++++
 apps/api/src/repositories/interfaces.py       |   2 +-
 .../playbook_embedding_repository.py          |   4 +-
 apps/api/src/services/embedding_service.py    |  23 +-
 5 files changed, 432 insertions(+), 12 deletions(-)
 create mode 100644 apps/api/migrations/embedding_bge_m3_1024.sql
 create mode 100644 apps/api/scripts/reembed_bge_m3.py

diff --git a/apps/api/migrations/embedding_bge_m3_1024.sql b/apps/api/migrations/embedding_bge_m3_1024.sql
new file mode 100644
index 00000000..935841db
--- /dev/null
+++ b/apps/api/migrations/embedding_bge_m3_1024.sql
@@ -0,0 +1,96 @@
+-- ADR-110 GCP-A primary embedding upgrade: nomic-embed-text 768 -> bge-m3 1024 dims
+-- 2026-05-04 ogt + Claude Sonnet 4.6
+--
+-- Background:
+--   GCP-A (34.143.170.20) does not serve nomic-embed-text, so it uses bge-m3:latest
+--   (a dedicated embedding model). bge-m3 produces 1024-dim vectors, which the
+--   existing vector(768) columns reject at INSERT time.
+--
+-- Affected columns:
+--   1. rag_chunks.embedding           vector(768) -> vector(1024)
+--   2. playbook_embeddings.embedding  vector(768) -> vector(1024)
+--
+-- Strategy: clear the existing vectors, switch the dimension, then let the
+--   re-embed script (scripts/reembed_bge_m3.py) embed every row again.
+--   Old-dimension vectors cannot be converted; dump them first if the nomic-format
+--   data has to be kept.
+--
+-- Prerequisites:
+--   1. pgvector >= 0.5.0 (already satisfied)
+--   2. Decide whether the existing vectors need a backup (recommended for important playbooks)
+--   3. The embedding service has been switched to bge-m3 (models.json v1.4.0)
+--
+-- NOTE: ivfflat derives its list centroids from the rows present when the index is
+--   built. The indexes below are created while every embedding is NULL, so they must
+--   be rebuilt (REINDEX) after re-embedding; scripts/reembed_bge_m3.py does that.
+--
+-- Rollback: run embedding_rollback_768.sql (requires re-embedding with nomic-embed-text)
+
+BEGIN;
+
+-- 1. rag_chunks: clear the vectors and change the column dimension.
+--    The ivfflat index must be dropped before ALTER COLUMN.
+DROP INDEX IF EXISTS idx_rag_chunks_embedding;
+
+ALTER TABLE rag_chunks
+    ALTER COLUMN embedding TYPE vector(1024)
+    USING NULL::vector(1024);  -- 768-dim vectors cannot be converted, so clear them
+
+-- Recreate the ivfflat index (lists = 100 suits up to roughly 10k rows).
+CREATE INDEX IF NOT EXISTS idx_rag_chunks_embedding
+    ON rag_chunks
+    USING ivfflat (embedding vector_cosine_ops)
+    WITH (lists = 100);
+
+COMMENT ON COLUMN rag_chunks.embedding IS
+    'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
+
+
+-- 2. playbook_embeddings: clear the vectors and change the column dimension.
+DROP INDEX IF EXISTS ix_playbook_embeddings_vec;
+
+ALTER TABLE playbook_embeddings
+    ALTER COLUMN embedding TYPE vector(1024)
+    USING NULL::vector(1024);  -- clear the existing 768-dim vectors
+
+CREATE INDEX IF NOT EXISTS ix_playbook_embeddings_vec
+    ON playbook_embeddings
+    USING ivfflat (embedding vector_cosine_ops)
+    WITH (lists = 100);
+
+COMMENT ON COLUMN playbook_embeddings.embedding IS
+    'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
+
+COMMENT ON TABLE playbook_embeddings IS
+    'Playbook 向量索引 — ADR-110 GCP-A bge-m3 1024 維 (2026-05-04)';
+
+
+-- 3. Verify the migration.
+DO $$
+DECLARE
+    v_rag_dim integer;
+    v_pb_dim integer;
+BEGIN
+    -- ::regclass resolves the table through search_path, so a same-named table
+    -- in another schema cannot be matched by accident.
+    SELECT atttypmod INTO v_rag_dim
+    FROM pg_attribute
+    WHERE attrelid = 'rag_chunks'::regclass AND attname = 'embedding';
+
+    SELECT atttypmod INTO v_pb_dim
+    FROM pg_attribute
+    WHERE attrelid = 'playbook_embeddings'::regclass AND attname = 'embedding';
+
+    -- pgvector stores the declared dimension as-is in atttypmod: vector(1024) -> 1024.
+    -- IS DISTINCT FROM also rejects a NULL result (column not found).
+    IF v_rag_dim IS DISTINCT FROM 1024 THEN
+        RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1024, got %', v_rag_dim;
+    END IF;
+    IF v_pb_dim IS DISTINCT FROM 1024 THEN
+        RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1024, got %', v_pb_dim;
+    END IF;
+
+    RAISE NOTICE '✅ embedding 遷移驗證通過:rag_chunks 和 playbook_embeddings 均為 vector(1024)';
+END $$;
+
+COMMIT;
diff --git a/apps/api/scripts/reembed_bge_m3.py b/apps/api/scripts/reembed_bge_m3.py
new file mode 100644
index 00000000..15d7688d
--- /dev/null
+++ b/apps/api/scripts/reembed_bge_m3.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""
+Re-embed script: re-embed existing rows with bge-m3:latest (1024 dims)
+======================================================================
+Run after migrations/embedding_bge_m3_1024.sql. Re-embeds, page by page:
+  1. rag_chunks rows whose embedding IS NULL
+  2. playbook_embeddings rows whose embedding IS NULL
+and then rebuilds the ivfflat indexes, which the migration created while the
+embedding columns were still empty.
+
+Usage:
+    cd apps/api
+    python scripts/reembed_bge_m3.py [--dry-run] [--batch 50]
+
+Prerequisites:
+  1. embedding_bge_m3_1024.sql has been applied (columns are vector(1024))
+  2. GCP-A Ollama (34.143.170.20:11434) is reachable and serves bge-m3:latest
+  3. DATABASE_URL is set in the environment (or apps/api/.env exists)
+
+2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP-A primary embedding upgrade
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+import sys
+from pathlib import Path
+
+# Make sure src is on the import path.
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+import asyncpg
+import httpx
+import structlog
+
+# Named "logger" so the stdlib module name "logging" is not shadowed.
+logger = structlog.get_logger(__name__)
+
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://34.143.170.20:11434")
+EMBEDDING_MODEL = "bge-m3:latest"
+EXPECTED_DIM = 1024
+# ivfflat indexes created by embedding_bge_m3_1024.sql.
+IVFFLAT_INDEXES = ("idx_rag_chunks_embedding", "ix_playbook_embeddings_vec")
+
+
+async def embed_text(client: httpx.AsyncClient, text: str) -> list[float]:
+    """Embed one text with bge-m3 through the Ollama embeddings endpoint.
+
+    Args:
+        client: HTTP client used for the request.
+        text: Text to embed.
+
+    Returns:
+        The embedding, exactly EXPECTED_DIM floats long.
+
+    Raises:
+        httpx.HTTPStatusError: Ollama answered with an error status.
+        ValueError: The embedding does not have EXPECTED_DIM entries.
+    """
+    resp = await client.post(
+        f"{OLLAMA_URL}/api/embeddings",
+        json={"model": EMBEDDING_MODEL, "prompt": text},
+        timeout=60.0,
+    )
+    resp.raise_for_status()
+    # "or []" also covers a JSON null in the response.
+    embedding = resp.json().get("embedding") or []
+    if len(embedding) != EXPECTED_DIM:
+        raise ValueError(f"bge-m3 維度錯誤: got {len(embedding)}, expected {EXPECTED_DIM}")
+    return embedding
+
+
+def _to_vector_literal(vec: list[float]) -> str:
+    """Format vec as a pgvector text literal, e.g. [0.10000000,0.20000000]."""
+    return "[" + ",".join(f"{v:.8f}" for v in vec) + "]"
+
+
+def _playbook_text(row: asyncpg.Record) -> str:
+    """Join a playbook's title, description and steps into the text to embed."""
+    parts = [row["title"] or "", row["description"] or ""]
+    steps = row["steps"]
+    if steps:
+        if isinstance(steps, list):
+            parts.extend(str(step) for step in steps)
+        else:
+            parts.append(str(steps))
+    return "\n".join(part for part in parts if part)
+
+
+async def reembed_rag_chunks(
+    conn: asyncpg.Connection,
+    client: httpx.AsyncClient,
+    batch_size: int,
+    dry_run: bool,
+) -> int:
+    """Re-embed every rag_chunks row whose embedding is NULL.
+
+    Rows are read in pages of batch_size with keyset pagination on id, so a row
+    that failed (or was left untouched by a dry run) is never fetched twice and
+    the loop always terminates.
+
+    Args:
+        conn: Open database connection.
+        client: HTTP client passed to embed_text.
+        batch_size: Rows fetched per page.
+        dry_run: When true, embeddings are computed but not written.
+
+    Returns:
+        Number of rows embedded successfully (also counted during a dry run).
+    """
+    total = await conn.fetchval("SELECT COUNT(*) FROM rag_chunks WHERE embedding IS NULL")
+    if not total:
+        logger.info("rag_chunks_all_embedded")
+        return 0
+
+    base_sql = "SELECT id, content FROM rag_chunks WHERE embedding IS NULL"
+    done = 0
+    last_id = None
+    while True:
+        if last_id is None:
+            rows = await conn.fetch(base_sql + " ORDER BY id LIMIT $1", batch_size)
+        else:
+            rows = await conn.fetch(
+                base_sql + " AND id > $1 ORDER BY id LIMIT $2",
+                last_id, batch_size,
+            )
+        if not rows:
+            break
+        last_id = rows[-1]["id"]
+
+        for row in rows:
+            if not row["content"]:
+                # Nothing to embed: skip instead of sending an empty prompt.
+                logger.warning("rag_chunk_empty_content_skipped", id=row["id"])
+                continue
+            try:
+                vec = await embed_text(client, row["content"])
+                if not dry_run:
+                    await conn.execute(
+                        "UPDATE rag_chunks SET embedding = $1::vector WHERE id = $2",
+                        _to_vector_literal(vec), row["id"],
+                    )
+                done += 1
+                if done % 10 == 0:
+                    logger.info("rag_chunks_progress", done=done, total=total)
+            except Exception as e:
+                # Best effort: one bad row must not abort the whole run.
+                logger.error("rag_chunk_embed_failed", id=row["id"], error=str(e))
+
+    return done
+
+
+async def reembed_playbook_embeddings(
+    conn: asyncpg.Connection,
+    client: httpx.AsyncClient,
+    batch_size: int,
+    dry_run: bool,
+) -> int:
+    """Re-embed every playbook_embeddings row whose embedding is NULL.
+
+    The text to embed is rebuilt from the joined playbooks row. Rows are read in
+    pages of batch_size with keyset pagination on playbook_id.
+
+    Args:
+        conn: Open database connection.
+        client: HTTP client passed to embed_text.
+        batch_size: Rows fetched per page.
+        dry_run: When true, embeddings are computed but not written.
+
+    Returns:
+        Number of rows embedded successfully (also counted during a dry run).
+    """
+    total = await conn.fetchval(
+        "SELECT COUNT(*) FROM playbook_embeddings WHERE embedding IS NULL"
+    )
+    if not total:
+        logger.info("playbook_embeddings_all_embedded")
+        return 0
+
+    # Join playbooks to get the source text for each embedding row.
+    base_sql = (
+        "SELECT pe.playbook_id, p.title, p.description, p.steps"
+        " FROM playbook_embeddings pe"
+        " JOIN playbooks p ON pe.playbook_id = p.id"
+        " WHERE pe.embedding IS NULL"
+    )
+    done = 0
+    last_id = None
+    while True:
+        if last_id is None:
+            rows = await conn.fetch(
+                base_sql + " ORDER BY pe.playbook_id LIMIT $1",
+                batch_size,
+            )
+        else:
+            rows = await conn.fetch(
+                base_sql + " AND pe.playbook_id > $1 ORDER BY pe.playbook_id LIMIT $2",
+                last_id, batch_size,
+            )
+        if not rows:
+            break
+        last_id = rows[-1]["playbook_id"]
+
+        for row in rows:
+            text = _playbook_text(row)
+            if not text:
+                # Nothing to embed: skip instead of sending an empty prompt.
+                logger.warning("playbook_empty_text_skipped", playbook_id=row["playbook_id"])
+                continue
+            try:
+                vec = await embed_text(client, text)
+                if not dry_run:
+                    await conn.execute(
+                        "UPDATE playbook_embeddings SET embedding = $1::vector WHERE playbook_id = $2",
+                        _to_vector_literal(vec), row["playbook_id"],
+                    )
+                done += 1
+                if done % 10 == 0:
+                    logger.info("playbook_embed_progress", done=done, total=total)
+            except Exception as e:
+                # Best effort: one bad row must not abort the whole run.
+                logger.error("playbook_embed_failed", playbook_id=row["playbook_id"], error=str(e))
+
+    return done
+
+
+async def rebuild_vector_indexes(conn: asyncpg.Connection) -> None:
+    """Rebuild the ivfflat indexes once the tables hold real vectors.
+
+    ivfflat derives its list centroids from the rows present at build time, and
+    the migration built both indexes while every embedding was NULL. A failed
+    rebuild is logged and does not stop the remaining indexes.
+    """
+    for index_name in IVFFLAT_INDEXES:
+        try:
+            # index_name comes from the constant above, never from user input.
+            await conn.execute(f"REINDEX INDEX {index_name}")
+            logger.info("vector_index_rebuilt", index=index_name)
+        except asyncpg.PostgresError as e:
+            logger.error("vector_index_rebuild_failed", index=index_name, error=str(e))
+
+
+def _load_database_url() -> str | None:
+    """Return DATABASE_URL from the environment, falling back to apps/api/.env."""
+    database_url = os.getenv("DATABASE_URL")
+    if database_url:
+        return database_url
+    env_file = Path(__file__).parent.parent / ".env"
+    if env_file.exists():
+        for line in env_file.read_text(encoding="utf-8").splitlines():
+            if line.startswith("DATABASE_URL="):
+                return line.split("=", 1)[1].strip().strip('"\'')
+    return None
+
+
+async def main(dry_run: bool, batch_size: int) -> None:
+    """Verify bge-m3, re-embed both tables, then rebuild the vector indexes.
+
+    Args:
+        dry_run: When true, nothing is written to the database.
+        batch_size: Rows fetched per page by the re-embed functions.
+
+    Exits with status 1 when DATABASE_URL is missing or bge-m3 is unusable.
+    """
+    database_url = _load_database_url()
+    if not database_url:
+        print("❌ DATABASE_URL 未設定,請設定環境變數或 .env 檔案", file=sys.stderr)
+        sys.exit(1)
+
+    if dry_run:
+        print("🔍 DRY RUN 模式 — 不會實際更新 DB")
+
+    async with httpx.AsyncClient() as http_client:
+        # Fail fast if bge-m3 is unreachable or returns the wrong dimension.
+        print(f"🔗 驗證 GCP-A Ollama ({OLLAMA_URL}) bge-m3 連線...")
+        try:
+            test_vec = await embed_text(http_client, "連線測試")
+        except Exception as e:
+            print(f"❌ bge-m3 連線失敗: {e}", file=sys.stderr)
+            sys.exit(1)
+        print(f"✅ bge-m3 可用,維度 = {len(test_vec)}")
+
+        conn = await asyncpg.connect(database_url)
+        try:
+            # Count the rows that still need an embedding.
+            rag_null = await conn.fetchval("SELECT COUNT(*) FROM rag_chunks WHERE embedding IS NULL")
+            pb_null = await conn.fetchval("SELECT COUNT(*) FROM playbook_embeddings WHERE embedding IS NULL")
+            print(f"📊 待嵌入:rag_chunks={rag_null} 筆,playbook_embeddings={pb_null} 筆")
+
+            if rag_null == 0 and pb_null == 0:
+                print("✅ 所有向量已嵌入,無需重新處理")
+                return
+
+            rag_done = await reembed_rag_chunks(conn, http_client, batch_size, dry_run)
+            pb_done = await reembed_playbook_embeddings(conn, http_client, batch_size, dry_run)
+
+            if not dry_run and (rag_done or pb_done):
+                await rebuild_vector_indexes(conn)
+
+            print(f"{'[DRY RUN] ' if dry_run else ''}✅ 完成: rag_chunks={rag_done}, playbook_embeddings={pb_done}")
+        finally:
+            await conn.close()
+
+
+def _positive_int(value: str) -> int:
+    """argparse type that accepts only integers >= 1."""
+    number = int(value)
+    if number < 1:
+        raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
+    return number
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Re-embed script for bge-m3 1024 維遷移")
+    parser.add_argument("--dry-run", action="store_true", help="只統計,不寫 DB")
+    parser.add_argument("--batch", type=_positive_int, default=50, help="每批次處理筆數")
+    args = parser.parse_args()
+    asyncio.run(main(dry_run=args.dry_run, batch_size=args.batch))
diff --git a/apps/api/src/repositories/interfaces.py b/apps/api/src/repositories/interfaces.py
index 9d9291c3..2976b20a 100644
--- a/apps/api/src/repositories/interfaces.py
+++ b/apps/api/src/repositories/interfaces.py
@@ -274,7 +274,7 @@ class IKnowledgeRepository(Protocol):
         ...
 
     async def save_embedding(self, entry_id: str, embedding: list[float]) -> bool:
-        """儲存向量 embedding (768 維, pgvector)"""
+        """儲存向量 embedding (1024 維, pgvector, bge-m3:latest)"""
         ...
 
     async def semantic_search(
diff --git a/apps/api/src/repositories/playbook_embedding_repository.py b/apps/api/src/repositories/playbook_embedding_repository.py
index 34e19847..452d8d22 100644
--- a/apps/api/src/repositories/playbook_embedding_repository.py
+++ b/apps/api/src/repositories/playbook_embedding_repository.py
@@ -23,7 +23,7 @@ class PlaybookEmbeddingRepository:
     Playbook Embedding Repository
 
     職責: playbook_embeddings 表 CRUD
-    使用 pgvector 儲存 nomic-embed-text 768 維向量
+    使用 pgvector 儲存 bge-m3:latest 1024 維向量(ADR-110 2026-05-04 升級自 768 維)
 
     Args:
         db: SQLAlchemy AsyncSession (DI 注入)
@@ -47,7 +47,7 @@ class PlaybookEmbeddingRepository:
 
         Args:
             playbook_id: Playbook ID
-            embedding: 768 維浮點向量 (list[float])
+            embedding: 1024 維浮點向量 (list[float]),bge-m3:latest
             alert_names: 索引時的 alert_names 快照
             keywords: 索引時的 keywords 快照
 
diff --git a/apps/api/src/services/embedding_service.py b/apps/api/src/services/embedding_service.py
index 7c854889..60e8d1c3 100644
--- a/apps/api/src/services/embedding_service.py
+++ b/apps/api/src/services/embedding_service.py
@@ -1,17 +1,18 @@
 """
-Embedding Service - Ollama BGE-M3 替代方案
-==========================================
+Embedding Service - Ollama bge-m3:latest 專用向量化
+===================================================
 
-使用 Ollama qwen2.5:7b-instruct 提供文本向量化功能。
-雖非專用 embedding 模型,但支援多語言 (繁中/英文)。
+使用 Ollama bge-m3:latest 提供文本向量化功能(1024 維)。
+bge-m3 為專用多語言 embedding 模型,支援繁中/英文語義搜尋。
 
 Phase 13.2 #84 - RAG Tool 基礎設施
+ADR-110 2026-05-04: GCP-A Primary 升級 bge-m3(768→1024 維遷移)
 
-版本: v1.1
+版本: v1.2
 建立日期: 2026-03-26 20:30 (台北時區)
-更新日期: 2026-03-29 20:50 (台北時區)
+更新日期: 2026-05-04 (台北時區) — ADR-110 bge-m3 升級
 建立者: Claude Code
-更新者: Claude Code (P1 修復: 維度配置化)
+更新者: ogt + Claude Sonnet 4.6 (ADR-110 GCP-A Primary)
 """
 
 import asyncio
@@ -58,7 +59,7 @@ class OllamaEmbeddingService:
     Ollama Embedding Service
 
     使用 Ollama API 進行文本向量化。
-    預設使用 qwen2.5:7b-instruct (3584 維向量)。
+    預設使用 bge-m3:latest (1024 維向量),來自 GCP-A (34.143.170.20)。
 
     Usage:
         service = OllamaEmbeddingService()
@@ -71,12 +72,16 @@ class OllamaEmbeddingService:
         "qwen2.5:3b-instruct": 2048,
         "llama3.2:3b": 3072,
         "nomic-embed-text": 768,
+        # 2026-05-04 ogt + Claude Sonnet 4.6: ADR-110 GCP-A Primary — bge-m3 專用 embedding 模型
+        # bge-m3 產生 1024 維向量;pgvector schema 已遷移至 vector(1024)(見 embedding_bge_m3_1024.sql)
+        "bge-m3:latest": 1024,
+        "bge-m3": 1024,
     }
     DEFAULT_DIMENSION = 3584  # 未知模型的預設值
 
     def __init__(
         self,
-        model: str = "qwen2.5:7b-instruct",
+        model: str = "bge-m3:latest",
         ollama_url: str | None = None,
         timeout: float = 30.0,
         default_dimension: int | None = None,