test(knowledge): pgvector 語意搜尋整合測試 (5 tests)
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
- test_save_embedding: CAST AS vector 語法驗證 - test_semantic_search_returns_results: cosine similarity 查詢 - test_semantic_search_threshold_filters: 正交向量被 threshold 過濾 - test_semantic_search_archived_excluded: archived 不出現 - test_list_unembedded_entries: 未 embed 條目列舉 全部 5/5 PASSED (awoooi_dev PostgreSQL + pgvector) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
160
apps/api/tests/integration/test_knowledge_semantic_search.py
Normal file
160
apps/api/tests/integration/test_knowledge_semantic_search.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
Knowledge Base 語意搜尋整合測試
|
||||
=================================
|
||||
KB Phase 2: pgvector semantic search E2E
|
||||
|
||||
建立時間: 2026-04-04 (台北時區)
|
||||
建立者: Claude Code (KB Phase 2 整合測試)
|
||||
|
||||
規則:
|
||||
- 使用真實 awoooi_dev DB + Ollama embedding
|
||||
- 每個測試後 rollback
|
||||
- 禁止 Mock
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from src.models.knowledge import EntryType, KnowledgeEntryCreate
|
||||
from src.repositories.knowledge_repository import KnowledgeDBRepository
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests
|
||||
# =============================================================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_save_embedding(db_session):
|
||||
"""save_embedding 正確寫入 pgvector 欄位"""
|
||||
repo = KnowledgeDBRepository(db_session)
|
||||
|
||||
data = KnowledgeEntryCreate(
|
||||
title="K8s Pod OOMKilled 處理流程",
|
||||
content="當 Pod 因記憶體不足被 OOMKilled 時,應先增加 memory limit 再滾動重啟。",
|
||||
entry_type=EntryType.RUNBOOK,
|
||||
category="基礎設施",
|
||||
source="human",
|
||||
)
|
||||
entry = await repo.create(data)
|
||||
|
||||
# 手動產生假 embedding (768 維)
|
||||
fake_embedding = [0.01] * 768
|
||||
success = await repo.save_embedding(entry.id, fake_embedding)
|
||||
|
||||
assert success is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_semantic_search_returns_results(db_session):
|
||||
"""semantic_search 能查到相關條目"""
|
||||
repo = KnowledgeDBRepository(db_session)
|
||||
|
||||
# 建立一筆條目
|
||||
data = KnowledgeEntryCreate(
|
||||
title="SLA 監控與告警設定",
|
||||
content="設定 SLA 告警:當 API 回應時間超過 500ms 時發送 Telegram 告警。",
|
||||
entry_type=EntryType.BEST_PRACTICE,
|
||||
category="應用層",
|
||||
source="human",
|
||||
)
|
||||
entry = await repo.create(data)
|
||||
|
||||
# 存入 embedding (模擬向量)
|
||||
embedding_a = [0.5] * 768
|
||||
await repo.save_embedding(entry.id, embedding_a)
|
||||
|
||||
# 用相近向量搜尋,應能找到
|
||||
query_embedding = [0.5] * 768 # 完全相同 → cosine similarity = 1.0
|
||||
results = await repo.semantic_search(query_embedding, limit=5, threshold=0.9)
|
||||
|
||||
assert len(results) >= 1
|
||||
ids = [e.id for e, _ in results]
|
||||
assert entry.id in ids
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_semantic_search_threshold_filters(db_session):
|
||||
"""threshold 過濾:低相似度條目不出現在結果
|
||||
|
||||
使用兩個方向幾乎相反的向量(正負交替)確保 cosine similarity < threshold
|
||||
"""
|
||||
repo = KnowledgeDBRepository(db_session)
|
||||
|
||||
data = KnowledgeEntryCreate(
|
||||
title="Redis 連線逾時排查",
|
||||
content="Redis 連線逾時通常由網路抖動或 Redis 記憶體滿導致。",
|
||||
entry_type=EntryType.RUNBOOK,
|
||||
category="基礎設施",
|
||||
source="human",
|
||||
)
|
||||
entry = await repo.create(data)
|
||||
|
||||
# 存入正交向量 A:奇數維 1.0,偶數維 0.0
|
||||
emb_a = [1.0 if i % 2 == 0 else 0.0 for i in range(768)]
|
||||
await repo.save_embedding(entry.id, emb_a)
|
||||
|
||||
# 查詢向量 B:奇數維 0.0,偶數維 1.0 → 與 A 完全正交,cosine similarity = 0
|
||||
emb_b = [0.0 if i % 2 == 0 else 1.0 for i in range(768)]
|
||||
|
||||
# threshold=0.5 → 正交向量 (score=0) 應被過濾掉
|
||||
results = await repo.semantic_search(emb_b, limit=5, threshold=0.5)
|
||||
|
||||
ids = [e.id for e, _ in results]
|
||||
assert entry.id not in ids
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_semantic_search_archived_excluded(db_session):
|
||||
"""archived 條目不出現在語意搜尋結果"""
|
||||
repo = KnowledgeDBRepository(db_session)
|
||||
|
||||
data = KnowledgeEntryCreate(
|
||||
title="已封存的測試文章",
|
||||
content="此文章已被封存,不應出現在搜尋結果。",
|
||||
entry_type=EntryType.BEST_PRACTICE,
|
||||
category="應用層",
|
||||
source="human",
|
||||
)
|
||||
entry = await repo.create(data)
|
||||
embedding = [0.8] * 768
|
||||
await repo.save_embedding(entry.id, embedding)
|
||||
|
||||
# 封存
|
||||
await repo.delete(entry.id)
|
||||
|
||||
# 搜尋不應包含已封存條目
|
||||
results = await repo.semantic_search(embedding, limit=10, threshold=0.5)
|
||||
ids = [e.id for e, _ in results]
|
||||
assert entry.id not in ids
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_list_unembedded_entries(db_session):
|
||||
"""list_unembedded_entries 正確列出未 embed 條目"""
|
||||
repo = KnowledgeDBRepository(db_session)
|
||||
|
||||
# 建立兩筆條目,一筆有 embedding、一筆沒有
|
||||
data_a = KnowledgeEntryCreate(
|
||||
title="有 Embedding 的條目",
|
||||
content="已完成向量化。",
|
||||
entry_type=EntryType.RUNBOOK,
|
||||
category="基礎設施",
|
||||
source="human",
|
||||
)
|
||||
data_b = KnowledgeEntryCreate(
|
||||
title="沒有 Embedding 的條目",
|
||||
content="尚未向量化。",
|
||||
entry_type=EntryType.RUNBOOK,
|
||||
category="基礎設施",
|
||||
source="human",
|
||||
)
|
||||
entry_a = await repo.create(data_a)
|
||||
entry_b = await repo.create(data_b)
|
||||
|
||||
# 只給 entry_a embedding
|
||||
await repo.save_embedding(entry_a.id, [0.1] * 768)
|
||||
|
||||
rows = await repo.list_unembedded_entries()
|
||||
row_ids = [r[0] for r in rows]
|
||||
|
||||
assert entry_b.id in row_ids
|
||||
assert entry_a.id not in row_ids
|
||||
Reference in New Issue
Block a user