Files
awoooi/apps/api/migrations/embedding_bge_m3_1024.sql
Your Name 86bd6432ee
Some checks failed
Code Review / ai-code-review (push) Successful in 9s
run-migration / migrate (push) Successful in 7s
CD Pipeline / tests (push) Successful in 2m8s
CD Pipeline / build-and-deploy (push) Failing after 9s
CD Pipeline / post-deploy-checks (push) Has been skipped
fix(ops): make bge-m3 migration idempotent
2026-05-05 22:21:47 +08:00

174 lines
6.0 KiB
PL/PgSQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- ADR-110 GCP-A Primary Embedding 升級:nomic-embed-text 768 → bge-m3 1024 維
-- 2026-05-04 ogt + Claude Sonnet 4.6
--
-- 背景:
-- GCP-A (34.143.170.20) 無 nomic-embed-text,改用 bge-m3:latest(專用 embedding 模型)
-- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗
--
-- 影響範圍:
-- 1. knowledge_entries.embedding vector(768) → vector(1024)
-- 2. rag_chunks.embedding vector(768) → vector(1024)
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
--
-- 遷移策略:僅在欄位不是 vector(1024) 時清空現有向量資料,切換維度後由 re-embed script 重新嵌入
-- 已經是 vector(1024) 的環境重跑本 migration 時,必須保留既有向量資料。
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
--
-- 執行前置條件:
-- 1. pgvector >= 0.5.0 (已滿足)
-- 2. 確認現有向量資料是否需要備份(重要 playbook 建議先備份)
-- 3. embedding service 已切換到 bge-m3(models.json v1.4.0)
--
-- 回滾方式:執行 embedding_rollback_768.sql(需重新嵌入至 nomic-embed-text 格式)
BEGIN;

-- 1. knowledge_entries: back up the old vectors, then clear them while
--    switching the embedding column to vector(1024).
--
-- When the column is already vector(1024) nothing is altered, so re-running
-- this migration keeps the existing embeddings.
DO $$
DECLARE
    v_dim integer;
BEGIN
    -- Resolve the table through search_path (regclass cast) so the dimension
    -- check inspects the same relation the unqualified ALTER TABLE below acts
    -- on. Matching pg_class.relname alone could pick a same-named table from
    -- another schema. A missing table raises "relation does not exist" here.
    -- pgvector keeps the configured dimension in atttypmod.
    SELECT a.atttypmod
      INTO v_dim
      FROM pg_attribute AS a
     WHERE a.attrelid = 'knowledge_entries'::regclass
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    IF v_dim IS DISTINCT FROM 1024 THEN
        -- Create the backup table only if it is missing (structure only) ...
        EXECUTE $sql$
            CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
            SELECT
                id,
                embedding::text AS embedding_768,
                NOW() AS backed_up_at
            FROM knowledge_entries
            WITH NO DATA
        $sql$;

        -- ... and always copy the current vectors into it. With a plain
        -- CREATE TABLE IF NOT EXISTS ... AS SELECT the copy is skipped when
        -- the backup table already exists, yet the column would still be
        -- cleared below.
        EXECUTE $sql$
            INSERT INTO knowledge_entries_embedding_backup_20260505
                (id, embedding_768, backed_up_at)
            SELECT
                id,
                embedding::text,
                NOW()
            FROM knowledge_entries
            WHERE embedding IS NOT NULL
        $sql$;

        -- Old-dimension vectors cannot be converted; USING NULL clears them.
        EXECUTE $sql$
            ALTER TABLE knowledge_entries
                ALTER COLUMN embedding TYPE vector(1024)
                USING NULL
        $sql$;

        RAISE NOTICE 'knowledge_entries.embedding migrated from vector(%) to vector(1024); old embeddings were backed up and cleared', v_dim;
    ELSE
        RAISE NOTICE 'knowledge_entries.embedding already vector(1024); existing embeddings preserved';
    END IF;
END $$;

COMMENT ON COLUMN knowledge_entries.embedding IS
    'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
-- 2. rag_chunks: clear the vector data and switch the embedding column to
--    vector(1024). The ivfflat index must be dropped before ALTER COLUMN.
--
-- When the column is already vector(1024) nothing is dropped or altered, so
-- re-running this migration keeps the existing embeddings and index.
DO $$
DECLARE
    v_dim integer;
BEGIN
    -- Resolve the table through search_path (regclass cast) so the dimension
    -- check inspects the same relation the unqualified ALTER TABLE below acts
    -- on. Matching pg_class.relname alone could pick a same-named table from
    -- another schema. pgvector keeps the configured dimension in atttypmod.
    SELECT a.atttypmod
      INTO v_dim
      FROM pg_attribute AS a
     WHERE a.attrelid = 'rag_chunks'::regclass
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    IF v_dim IS DISTINCT FROM 1024 THEN
        EXECUTE 'DROP INDEX IF EXISTS idx_rag_chunks_embedding';

        -- Old-dimension vectors cannot be converted; USING NULL clears them.
        EXECUTE $sql$
            ALTER TABLE rag_chunks
                ALTER COLUMN embedding TYPE vector(1024)
                USING NULL
        $sql$;

        RAISE NOTICE 'rag_chunks.embedding migrated from vector(%) to vector(1024); old embeddings were cleared', v_dim;
    ELSE
        RAISE NOTICE 'rag_chunks.embedding already vector(1024); existing embeddings preserved';
    END IF;
END $$;

-- Rebuild the ivfflat index (lists = 100 suits datasets of up to ~10k rows).
-- NOTE(review): right after a dimension change the column holds only NULLs,
-- so the index is built without data; presumably it should be rebuilt once
-- the re-embed script has run — confirm against pgvector guidance.
CREATE INDEX IF NOT EXISTS idx_rag_chunks_embedding
    ON rag_chunks
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

COMMENT ON COLUMN rag_chunks.embedding IS
    'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
-- 3. playbook_embeddings: clear the vector data and switch the embedding
--    column to vector(1024). The vector index is dropped first and recreated
--    afterwards.
--
-- When the column is already vector(1024) nothing is dropped or altered, so
-- re-running this migration keeps the existing embeddings and index.
DO $$
DECLARE
    v_dim integer;
BEGIN
    -- Resolve the table through search_path (regclass cast) so the dimension
    -- check inspects the same relation the unqualified ALTER TABLE below acts
    -- on. Matching pg_class.relname alone could pick a same-named table from
    -- another schema. pgvector keeps the configured dimension in atttypmod.
    SELECT a.atttypmod
      INTO v_dim
      FROM pg_attribute AS a
     WHERE a.attrelid = 'playbook_embeddings'::regclass
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    IF v_dim IS DISTINCT FROM 1024 THEN
        EXECUTE 'DROP INDEX IF EXISTS ix_playbook_embeddings_vec';

        -- Old-dimension vectors cannot be converted; USING NULL clears them.
        EXECUTE $sql$
            ALTER TABLE playbook_embeddings
                ALTER COLUMN embedding TYPE vector(1024)
                USING NULL
        $sql$;

        RAISE NOTICE 'playbook_embeddings.embedding migrated from vector(%) to vector(1024); old embeddings were cleared', v_dim;
    ELSE
        RAISE NOTICE 'playbook_embeddings.embedding already vector(1024); existing embeddings preserved';
    END IF;
END $$;

-- Recreate the ivfflat cosine index (no-op when it already exists).
CREATE INDEX IF NOT EXISTS ix_playbook_embeddings_vec
    ON playbook_embeddings
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

COMMENT ON COLUMN playbook_embeddings.embedding IS
    'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';

COMMENT ON TABLE playbook_embeddings IS
    'Playbook 向量索引 — ADR-110 GCP-A bge-m3 1024 維 (2026-05-04)';
-- 4. Verify the migration result.
--
-- Reads the declared dimension of each embedding column and raises an
-- exception (aborting the transaction before COMMIT) unless all three are
-- vector(1024).
DO $$
DECLARE
    v_km_dim  integer;
    v_rag_dim integer;
    v_pb_dim  integer;
BEGIN
    -- Tables are resolved through search_path (regclass cast) so the check
    -- looks at the same relations the unqualified ALTER TABLE statements of
    -- this migration act on.
    SELECT a.atttypmod
      INTO v_km_dim
      FROM pg_attribute AS a
     WHERE a.attrelid = 'knowledge_entries'::regclass
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    SELECT a.atttypmod
      INTO v_rag_dim
      FROM pg_attribute AS a
     WHERE a.attrelid = 'rag_chunks'::regclass
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    SELECT a.atttypmod
      INTO v_pb_dim
      FROM pg_attribute AS a
     WHERE a.attrelid = 'playbook_embeddings'::regclass
       AND a.attname = 'embedding'
       AND NOT a.attisdropped;

    -- pgvector atttypmod stores the configured dimension.
    -- IS DISTINCT FROM is used instead of != so that a NULL result (no
    -- matching column row) fails verification; NULL != 1024 evaluates to
    -- NULL, which an IF treats as false and would let the check pass.
    IF v_km_dim IS DISTINCT FROM 1024 THEN
        RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗expected 1024, got %', v_km_dim;
    END IF;

    IF v_rag_dim IS DISTINCT FROM 1024 THEN
        RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗expected 1024, got %', v_rag_dim;
    END IF;

    IF v_pb_dim IS DISTINCT FROM 1024 THEN
        RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗expected 1024, got %', v_pb_dim;
    END IF;

    RAISE NOTICE '✅ embedding 遷移驗證通過knowledge_entries、rag_chunks、playbook_embeddings 均為 vector(1024)';
END $$;

COMMIT;