test(knowledge): pgvector 語意搜尋整合測試 (5 tests)
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

- test_save_embedding: CAST AS vector 語法驗證
- test_semantic_search_returns_results: cosine similarity 查詢
- test_semantic_search_threshold_filters: 正交向量被 threshold 過濾
- test_semantic_search_archived_excluded: archived 不出現
- test_list_unembedded_entries: 未 embed 條目列舉

全部 5/5 PASSED (awoooi_dev PostgreSQL + pgvector)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-04 11:55:09 +08:00
parent 72d7536ead
commit f6567751a9

View File

@@ -0,0 +1,160 @@
"""
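Knowledge Base semantic search integration tests
================================================
KB Phase 2: pgvector semantic search E2E
Created: 2026-04-04 (Taipei time zone)
Author: Claude Code (KB Phase 2 integration tests)
Rules:
- Run against the real awoooi_dev database; the tests in this file store
  hand-built 768-element vectors rather than calling Ollama for embeddings
- Roll back after every test
- No mocks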
"""
import pytest
from src.models.knowledge import EntryType, KnowledgeEntryCreate
from src.repositories.knowledge_repository import KnowledgeDBRepository
# =============================================================================
# Tests
# =============================================================================
@pytest.mark.asyncio
async def test_save_embedding(db_session):
    """Check that ``save_embedding`` reports success for a new entry.

    A runbook entry is created through the repository, a hand-built
    768-element vector is stored for it, and the call is expected to
    return exactly ``True``.
    """
    repository = KnowledgeDBRepository(db_session)
    created = await repository.create(
        KnowledgeEntryCreate(
            title="K8s Pod OOMKilled 處理流程",
            content="當 Pod 因記憶體不足被 OOMKilled 時,應先增加 memory limit 再滾動重啟。",
            entry_type=EntryType.RUNBOOK,
            category="基礎設施",
            source="human",
        )
    )

    # Stand-in embedding: 768 identical components, built by hand.
    placeholder_vector = [0.01 for _ in range(768)]
    stored = await repository.save_embedding(created.id, placeholder_vector)

    assert stored is True
@pytest.mark.asyncio
async def test_semantic_search_returns_results(db_session):
    """Check that ``semantic_search`` finds an entry matching the query vector.

    One entry is created and given a hand-built embedding. The search is
    then run with an identical vector (so the cosine similarity of the two
    is 1.0) and a threshold of 0.9; the new entry must be among the hits.
    """
    repository = KnowledgeDBRepository(db_session)

    payload = KnowledgeEntryCreate(
        title="SLA 監控與告警設定",
        content="設定 SLA 告警:當 API 回應時間超過 500ms 時發送 Telegram 告警。",
        entry_type=EntryType.BEST_PRACTICE,
        category="應用層",
        source="human",
    )
    created = await repository.create(payload)

    # Simulated embedding for the stored entry.
    stored_vector = [0.5] * 768
    await repository.save_embedding(created.id, stored_vector)

    # The probe has the same components as the stored vector.
    probe_vector = [0.5] * 768
    hits = await repository.semantic_search(
        probe_vector,
        limit=5,
        threshold=0.9,
    )

    assert len(hits) > 0
    # Each hit is unpacked as a pair; only the first element's id matters here.
    hit_ids = [hit_entry.id for hit_entry, _ in hits]
    assert created.id in hit_ids
@pytest.mark.asyncio
async def test_semantic_search_threshold_filters(db_session):
    """Check that ``threshold`` drops entries whose similarity is too low.

    The stored vector and the query vector are orthogonal: their non-zero
    components never share an index, so their dot product (and therefore
    their cosine similarity) is 0, below the 0.5 threshold used here.

    A control query using the stored vector itself runs first, so the final
    "not in results" assertion cannot pass merely because the search
    returned nothing for some unrelated reason.
    """
    repo = KnowledgeDBRepository(db_session)
    data = KnowledgeEntryCreate(
        title="Redis 連線逾時排查",
        content="Redis 連線逾時通常由網路抖動或 Redis 記憶體滿導致。",
        entry_type=EntryType.RUNBOOK,
        category="基礎設施",
        source="human",
    )
    entry = await repo.create(data)

    # Vector A: 1.0 at even indices, 0.0 at odd indices.
    emb_a = [1.0 if i % 2 == 0 else 0.0 for i in range(768)]
    await repo.save_embedding(entry.id, emb_a)

    # Positive control: the identical vector must clear the same threshold.
    control = await repo.semantic_search(emb_a, limit=5, threshold=0.5)
    assert entry.id in [e.id for e, _ in control]

    # Vector B: the complement of A (0.0 at even indices, 1.0 at odd indices),
    # hence orthogonal to A.
    emb_b = [0.0 if i % 2 == 0 else 1.0 for i in range(768)]

    # With threshold=0.5 the orthogonal entry (similarity 0) must be filtered.
    results = await repo.semantic_search(emb_b, limit=5, threshold=0.5)
    ids = [e.id for e, _ in results]
    assert entry.id not in ids
@pytest.mark.asyncio
async def test_semantic_search_archived_excluded(db_session):
    """Check that an archived entry no longer appears in semantic search.

    The entry is first confirmed to be searchable, then removed through
    ``repo.delete``, and the same query is repeated. Without the first
    check the final assertion could pass even if the entry had never been
    searchable at all.
    """
    repo = KnowledgeDBRepository(db_session)
    data = KnowledgeEntryCreate(
        title="已封存的測試文章",
        content="此文章已被封存,不應出現在搜尋結果。",
        entry_type=EntryType.BEST_PRACTICE,
        category="應用層",
        source="human",
    )
    entry = await repo.create(data)
    embedding = [0.8] * 768
    await repo.save_embedding(entry.id, embedding)

    # Control: before archiving, the identical query vector must find the entry.
    before = await repo.semantic_search(embedding, limit=10, threshold=0.5)
    assert entry.id in [e.id for e, _ in before]

    # Archive the entry.
    # NOTE(review): assumes repo.delete archives (soft-deletes) rather than
    # hard-deletes, as the original comment implies; confirm in the repository.
    await repo.delete(entry.id)

    # The same search must no longer include the archived entry.
    results = await repo.semantic_search(embedding, limit=10, threshold=0.5)
    ids = [e.id for e, _ in results]
    assert entry.id not in ids
@pytest.mark.asyncio
async def test_list_unembedded_entries(db_session):
    """Check that ``list_unembedded_entries`` lists only entries lacking an embedding.

    Two runbook entries are created; only the first one receives an
    embedding. The listing must contain the second entry's id and must not
    contain the first entry's id.
    """
    repository = KnowledgeDBRepository(db_session)

    def build_payload(title, content):
        # Both entries differ only in title and content.
        return KnowledgeEntryCreate(
            title=title,
            content=content,
            entry_type=EntryType.RUNBOOK,
            category="基礎設施",
            source="human",
        )

    embedded_payload = build_payload("有 Embedding 的條目", "已完成向量化。")
    pending_payload = build_payload("沒有 Embedding 的條目", "尚未向量化。")

    embedded_entry = await repository.create(embedded_payload)
    pending_entry = await repository.create(pending_payload)

    # Only the first entry gets a vector.
    await repository.save_embedding(embedded_entry.id, [0.1] * 768)

    unembedded_rows = await repository.list_unembedded_entries()
    # The first column of each returned row is compared against entry ids.
    listed_ids = [row[0] for row in unembedded_rows]

    assert pending_entry.id in listed_ids
    assert embedded_entry.id not in listed_ids