diff --git a/apps/api/tests/integration/test_knowledge_semantic_search.py b/apps/api/tests/integration/test_knowledge_semantic_search.py new file mode 100644 index 00000000..1af98d3d --- /dev/null +++ b/apps/api/tests/integration/test_knowledge_semantic_search.py @@ -0,0 +1,160 @@ +""" +Knowledge Base 語意搜尋整合測試 +================================= +KB Phase 2: pgvector semantic search E2E + +建立時間: 2026-04-04 (台北時區) +建立者: Claude Code (KB Phase 2 整合測試) + +規則: +- 使用真實 awoooi_dev DB + Ollama embedding +- 每個測試後 rollback +- 禁止 Mock +""" + +import pytest + +from src.models.knowledge import EntryType, KnowledgeEntryCreate +from src.repositories.knowledge_repository import KnowledgeDBRepository + + +# ============================================================================= +# Tests +# ============================================================================= + +@pytest.mark.asyncio +async def test_save_embedding(db_session): + """save_embedding 正確寫入 pgvector 欄位""" + repo = KnowledgeDBRepository(db_session) + + data = KnowledgeEntryCreate( + title="K8s Pod OOMKilled 處理流程", + content="當 Pod 因記憶體不足被 OOMKilled 時,應先增加 memory limit 再滾動重啟。", + entry_type=EntryType.RUNBOOK, + category="基礎設施", + source="human", + ) + entry = await repo.create(data) + + # 手動產生假 embedding (768 維) + fake_embedding = [0.01] * 768 + success = await repo.save_embedding(entry.id, fake_embedding) + + assert success is True + + +@pytest.mark.asyncio +async def test_semantic_search_returns_results(db_session): + """semantic_search 能查到相關條目""" + repo = KnowledgeDBRepository(db_session) + + # 建立一筆條目 + data = KnowledgeEntryCreate( + title="SLA 監控與告警設定", + content="設定 SLA 告警:當 API 回應時間超過 500ms 時發送 Telegram 告警。", + entry_type=EntryType.BEST_PRACTICE, + category="應用層", + source="human", + ) + entry = await repo.create(data) + + # 存入 embedding (模擬向量) + embedding_a = [0.5] * 768 + await repo.save_embedding(entry.id, embedding_a) + + # 用相近向量搜尋,應能找到 + query_embedding = [0.5] * 768 # 完全相同 → cosine similarity = 1.0 + results = await repo.semantic_search(query_embedding, limit=5, threshold=0.9) + + assert len(results) >= 1 + ids = [e.id for e, _ in results] + assert entry.id in ids + + +@pytest.mark.asyncio +async def test_semantic_search_threshold_filters(db_session): + """threshold 過濾:低相似度條目不出現在結果 + + 使用兩個方向幾乎相反的向量(正負交替)確保 cosine similarity < threshold + """ + repo = KnowledgeDBRepository(db_session) + + data = KnowledgeEntryCreate( + title="Redis 連線逾時排查", + content="Redis 連線逾時通常由網路抖動或 Redis 記憶體滿導致。", + entry_type=EntryType.RUNBOOK, + category="基礎設施", + source="human", + ) + entry = await repo.create(data) + + # 存入正交向量 A:奇數維 1.0,偶數維 0.0 + emb_a = [1.0 if i % 2 == 0 else 0.0 for i in range(768)] + await repo.save_embedding(entry.id, emb_a) + + # 查詢向量 B:奇數維 0.0,偶數維 1.0 → 與 A 完全正交,cosine similarity = 0 + emb_b = [0.0 if i % 2 == 0 else 1.0 for i in range(768)] + + # threshold=0.5 → 正交向量 (score=0) 應被過濾掉 + results = await repo.semantic_search(emb_b, limit=5, threshold=0.5) + + ids = [e.id for e, _ in results] + assert entry.id not in ids + + +@pytest.mark.asyncio +async def test_semantic_search_archived_excluded(db_session): + """archived 條目不出現在語意搜尋結果""" + repo = KnowledgeDBRepository(db_session) + + data = KnowledgeEntryCreate( + title="已封存的測試文章", + content="此文章已被封存,不應出現在搜尋結果。", + entry_type=EntryType.BEST_PRACTICE, + category="應用層", + source="human", + ) + entry = await repo.create(data) + embedding = [0.8] * 768 + await repo.save_embedding(entry.id, embedding) + + # 封存 + await repo.delete(entry.id) + + # 搜尋不應包含已封存條目 + results = await repo.semantic_search(embedding, limit=10, threshold=0.5) + ids = [e.id for e, _ in results] + assert entry.id not in ids + + +@pytest.mark.asyncio +async def test_list_unembedded_entries(db_session): + """list_unembedded_entries 正確列出未 embed 條目""" + repo = KnowledgeDBRepository(db_session) + + # 建立兩筆條目,一筆有 embedding、一筆沒有 + data_a = KnowledgeEntryCreate( + title="有 Embedding 的條目", + content="已完成向量化。", + entry_type=EntryType.RUNBOOK, + category="基礎設施", + source="human", + ) + data_b = KnowledgeEntryCreate( + title="沒有 Embedding 的條目", + content="尚未向量化。", + entry_type=EntryType.RUNBOOK, + category="基礎設施", + source="human", + ) + entry_a = await repo.create(data_a) + entry_b = await repo.create(data_b) + + # 只給 entry_a embedding + await repo.save_embedding(entry_a.id, [0.1] * 768) + + rows = await repo.list_unembedded_entries() + row_ids = [r[0] for r in rows] + + assert entry_b.id in row_ids + assert entry_a.id not in row_ids