Files
awoooi/apps/api/tests/integration/test_knowledge_semantic_search.py
OG T f6567751a9
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
test(knowledge): pgvector 語意搜尋整合測試 (5 tests)
- test_save_embedding: CAST AS vector 語法驗證
- test_semantic_search_returns_results: cosine similarity 查詢
- test_semantic_search_threshold_filters: 正交向量被 threshold 過濾
- test_semantic_search_archived_excluded: archived 不出現
- test_list_unembedded_entries: 未 embed 條目列舉

全部 5/5 PASSED (awoooi_dev PostgreSQL + pgvector)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-04 11:55:09 +08:00

161 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
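Knowledge Base semantic search integration tests
=================================================
KB Phase 2: pgvector semantic search E2E
Created: 2026-04-04 (Taipei time zone)
Author: Claude Code (KB Phase 2 integration tests)
Rules:
- Use the real awoooi_dev DB + Ollama embedding
  (NOTE(review): the tests in this file store hand-built 768-dimension
  vectors instead of calling Ollama; confirm which one is intended)
- Roll back after every test
- No mocks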
"""
import pytest
from src.models.knowledge import EntryType, KnowledgeEntryCreate
from src.repositories.knowledge_repository import KnowledgeDBRepository
# =============================================================================
# Tests
# =============================================================================
@pytest.mark.asyncio
async def test_save_embedding(db_session):
    """Check that save_embedding reports success for a freshly created entry.

    Creates one runbook entry through the repository, stores a constant
    768-element vector for it, and expects ``save_embedding`` to return
    exactly ``True``.
    """
    repository = KnowledgeDBRepository(db_session)
    created = await repository.create(
        KnowledgeEntryCreate(
            title="K8s Pod OOMKilled 處理流程",
            content="當 Pod 因記憶體不足被 OOMKilled 時,應先增加 memory limit 再滾動重啟。",
            entry_type=EntryType.RUNBOOK,
            category="基礎設施",
            source="human",
        )
    )

    # Hand-built placeholder embedding (768 elements); no embedding model
    # is involved in this test.
    placeholder_vector = [0.01 for _ in range(768)]

    stored = await repository.save_embedding(created.id, placeholder_vector)
    assert stored is True
@pytest.mark.asyncio
async def test_semantic_search_returns_results(db_session):
    """Check that semantic_search returns an entry whose vector matches the query.

    The stored vector and the query vector are element-wise identical, so
    the entry is expected to pass the 0.9 threshold and appear in the
    results.
    """
    repository = KnowledgeDBRepository(db_session)

    # Create the single entry this test will look for.
    created = await repository.create(
        KnowledgeEntryCreate(
            title="SLA 監控與告警設定",
            content="設定 SLA 告警:當 API 回應時間超過 500ms 時發送 Telegram 告警。",
            entry_type=EntryType.BEST_PRACTICE,
            category="應用層",
            source="human",
        )
    )

    # Store a simulated embedding for the entry.
    stored_vector = [0.5 for _ in range(768)]
    await repository.save_embedding(created.id, stored_vector)

    # Query with a separate but identical vector; identical direction means
    # the cosine similarity between the two is 1.0.
    query_vector = [0.5 for _ in range(768)]
    matches = await repository.semantic_search(
        query_vector,
        limit=5,
        threshold=0.9,
    )

    assert len(matches) >= 1
    # Each result is unpacked as a pair; only the entry's id is inspected.
    matched_ids = [hit.id for hit, _ in matches]
    assert created.id in matched_ids
@pytest.mark.asyncio
async def test_semantic_search_threshold_filters(db_session):
    """Check that entries below the similarity threshold are filtered out.

    The stored vector and the query vector are orthogonal (their non-zero
    positions never overlap), so their cosine similarity is 0 and the entry
    must not be returned when ``threshold=0.5``.

    The save step is asserted explicitly: if the embedding were never
    stored, the entry would be absent from the results for the wrong
    reason and this test would pass without exercising the threshold.
    """
    repo = KnowledgeDBRepository(db_session)
    data = KnowledgeEntryCreate(
        title="Redis 連線逾時排查",
        content="Redis 連線逾時通常由網路抖動或 Redis 記憶體滿導致。",
        entry_type=EntryType.RUNBOOK,
        category="基礎設施",
        source="human",
    )
    entry = await repo.create(data)

    # Stored vector A: 1.0 at even indices (0, 2, 4, ...), 0.0 at odd indices.
    emb_a = [1.0 if i % 2 == 0 else 0.0 for i in range(768)]
    saved = await repo.save_embedding(entry.id, emb_a)
    # Guard against a vacuous pass: the entry must actually carry a vector.
    assert saved is True

    # Query vector B: 0.0 at even indices, 1.0 at odd indices. The dot
    # product with A is 0, i.e. the two vectors are exactly orthogonal.
    emb_b = [0.0 if i % 2 == 0 else 1.0 for i in range(768)]

    # With threshold=0.5 an entry scoring 0 should be dropped.
    results = await repo.semantic_search(emb_b, limit=5, threshold=0.5)
    ids = [e.id for e, _ in results]
    assert entry.id not in ids
@pytest.mark.asyncio
async def test_semantic_search_archived_excluded(db_session):
    """Check that an archived entry does not appear in semantic search results.

    The entry is given an embedding, then removed via ``repo.delete`` (which
    the original author's note describes as archiving), and finally searched
    for with the very same vector. It must not be returned.

    The save step is asserted explicitly: an entry without a stored
    embedding would be missing from the results regardless of its archived
    state, which would let this test pass without checking anything.
    """
    repo = KnowledgeDBRepository(db_session)
    data = KnowledgeEntryCreate(
        title="已封存的測試文章",
        content="此文章已被封存,不應出現在搜尋結果。",
        entry_type=EntryType.BEST_PRACTICE,
        category="應用層",
        source="human",
    )
    entry = await repo.create(data)

    embedding = [0.8] * 768
    saved = await repo.save_embedding(entry.id, embedding)
    # Guard against a vacuous pass: the entry must actually carry a vector.
    assert saved is True

    # Archive the entry (presumably a soft delete; see KnowledgeDBRepository.delete).
    await repo.delete(entry.id)

    # Searching with the identical vector must not surface the archived entry.
    results = await repo.semantic_search(embedding, limit=10, threshold=0.5)
    ids = [e.id for e, _ in results]
    assert entry.id not in ids
@pytest.mark.asyncio
async def test_list_unembedded_entries(db_session):
    """Check that list_unembedded_entries reports only entries lacking a vector.

    Two entries are created; only the first receives an embedding. The
    second must be listed and the first must not.
    """
    repository = KnowledgeDBRepository(db_session)

    # Build both payloads up front: one entry will get an embedding, the
    # other will be left without one.
    payload_with_vector = KnowledgeEntryCreate(
        title="有 Embedding 的條目",
        content="已完成向量化。",
        entry_type=EntryType.RUNBOOK,
        category="基礎設施",
        source="human",
    )
    payload_without_vector = KnowledgeEntryCreate(
        title="沒有 Embedding 的條目",
        content="尚未向量化。",
        entry_type=EntryType.RUNBOOK,
        category="基礎設施",
        source="human",
    )
    embedded_entry = await repository.create(payload_with_vector)
    bare_entry = await repository.create(payload_without_vector)

    # Only the first entry receives an embedding.
    await repository.save_embedding(embedded_entry.id, [0.1 for _ in range(768)])

    listed = await repository.list_unembedded_entries()
    # The first element of each returned row is compared against entry ids.
    listed_ids = [row[0] for row in listed]

    assert bare_entry.id in listed_ids
    assert embedded_entry.id not in listed_ids