Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
- test_save_embedding: CAST AS vector 語法驗證 - test_semantic_search_returns_results: cosine similarity 查詢 - test_semantic_search_threshold_filters: 正交向量被 threshold 過濾 - test_semantic_search_archived_excluded: archived 不出現 - test_list_unembedded_entries: 未 embed 條目列舉 全部 5/5 PASSED (awoooi_dev PostgreSQL + pgvector) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
161 lines
5.0 KiB
Python
161 lines
5.0 KiB
Python
"""
|
||
Knowledge Base 語意搜尋整合測試
|
||
=================================
|
||
KB Phase 2: pgvector semantic search E2E
|
||
|
||
建立時間: 2026-04-04 (台北時區)
|
||
建立者: Claude Code (KB Phase 2 整合測試)
|
||
|
||
規則:
|
||
- 使用真實 awoooi_dev DB + Ollama embedding
|
||
- 每個測試後 rollback
|
||
- 禁止 Mock
|
||
"""
|
||
|
||
import pytest
|
||
|
||
from src.models.knowledge import EntryType, KnowledgeEntryCreate
|
||
from src.repositories.knowledge_repository import KnowledgeDBRepository
|
||
|
||
|
||
# =============================================================================
|
||
# Tests
|
||
# =============================================================================
|
||
|
||
@pytest.mark.asyncio
async def test_save_embedding(db_session):
    """Check that ``save_embedding`` reports success for a newly created entry.

    The vector is a hand-built 768-component placeholder, so the test only
    exercises the repository write path and asserts on its boolean result.
    """
    repository = KnowledgeDBRepository(db_session)

    created = await repository.create(
        KnowledgeEntryCreate(
            title="K8s Pod OOMKilled 處理流程",
            content="當 Pod 因記憶體不足被 OOMKilled 時,應先增加 memory limit 再滾動重啟。",
            entry_type=EntryType.RUNBOOK,
            category="基礎設施",
            source="human",
        )
    )

    # Placeholder embedding: 768 identical components.
    placeholder_vector = [0.01 for _ in range(768)]
    saved = await repository.save_embedding(created.id, placeholder_vector)

    assert saved is True
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_semantic_search_returns_results(db_session):
    """Check that ``semantic_search`` returns an entry whose embedding matches the query.

    The query vector is identical to the stored one, so the entry is expected
    to clear the 0.9 threshold and appear among the (entry, score) results.
    """
    repository = KnowledgeDBRepository(db_session)

    # Create the single entry this test searches for.
    created = await repository.create(
        KnowledgeEntryCreate(
            title="SLA 監控與告警設定",
            content="設定 SLA 告警:當 API 回應時間超過 500ms 時發送 Telegram 告警。",
            entry_type=EntryType.BEST_PRACTICE,
            category="應用層",
            source="human",
        )
    )

    # Store a simulated embedding for the entry.
    stored_vector = [0.5 for _ in range(768)]
    await repository.save_embedding(created.id, stored_vector)

    # Query with an identical vector (cosine similarity of 1.0).
    query_vector = list(stored_vector)
    hits = await repository.semantic_search(query_vector, limit=5, threshold=0.9)

    assert len(hits) >= 1
    hit_ids = [hit_entry.id for hit_entry, _ in hits]
    assert created.id in hit_ids
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_semantic_search_threshold_filters(db_session):
    """Check that ``semantic_search`` omits entries scoring below ``threshold``.

    The stored and query vectors are orthogonal: each is non-zero only where
    the other is zero, so their cosine similarity is 0, which is below the
    0.5 threshold used here.

    The test also asserts that the embedding was saved and that the entry is
    found when queried with its own vector. Without those checks the final
    negative assertion would pass even if nothing had been stored.
    """
    repo = KnowledgeDBRepository(db_session)

    data = KnowledgeEntryCreate(
        title="Redis 連線逾時排查",
        content="Redis 連線逾時通常由網路抖動或 Redis 記憶體滿導致。",
        entry_type=EntryType.RUNBOOK,
        category="基礎設施",
        source="human",
    )
    entry = await repo.create(data)

    # Stored vector A: 1.0 at even indices, 0.0 at odd indices.
    emb_a = [1.0 if i % 2 == 0 else 0.0 for i in range(768)]
    # A failed save would make the "not in results" check below meaningless.
    assert await repo.save_embedding(entry.id, emb_a) is True

    # Positive control: querying with A itself must return the entry.
    control = await repo.semantic_search(emb_a, limit=5, threshold=0.5)
    assert entry.id in [e.id for e, _ in control]

    # Query vector B: 0.0 at even indices, 1.0 at odd indices, so A . B == 0.
    emb_b = [0.0 if i % 2 == 0 else 1.0 for i in range(768)]

    # With threshold=0.5 the orthogonal entry (score 0) must be filtered out.
    results = await repo.semantic_search(emb_b, limit=5, threshold=0.5)

    ids = [e.id for e, _ in results]
    assert entry.id not in ids
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_semantic_search_archived_excluded(db_session):
    """Check that an archived entry no longer appears in ``semantic_search`` results.

    The entry is first confirmed to be searchable with its own embedding, then
    removed via ``repo.delete`` and searched for again. The initial check keeps
    the final negative assertion from passing when the embedding was never
    stored in the first place.
    """
    repo = KnowledgeDBRepository(db_session)

    data = KnowledgeEntryCreate(
        title="已封存的測試文章",
        content="此文章已被封存,不應出現在搜尋結果。",
        entry_type=EntryType.BEST_PRACTICE,
        category="應用層",
        source="human",
    )
    entry = await repo.create(data)
    embedding = [0.8] * 768
    # A failed save would make the "not in results" check below meaningless.
    assert await repo.save_embedding(entry.id, embedding) is True

    # Positive control: before archiving, the entry must be found.
    before = await repo.semantic_search(embedding, limit=10, threshold=0.5)
    assert entry.id in [e.id for e, _ in before]

    # Archive the entry (this test treats repo.delete as the archive operation).
    await repo.delete(entry.id)

    # The same query must no longer return the archived entry.
    results = await repo.semantic_search(embedding, limit=10, threshold=0.5)
    ids = [e.id for e, _ in results]
    assert entry.id not in ids
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_list_unembedded_entries(db_session):
    """Check that ``list_unembedded_entries`` lists only entries without an embedding.

    Two entries are created and only the first receives an embedding. The
    first element of each returned row is compared against the entry ids.
    """
    repository = KnowledgeDBRepository(db_session)

    # Both entries share everything except title and content.
    shared_fields = {
        "entry_type": EntryType.RUNBOOK,
        "category": "基礎設施",
        "source": "human",
    }
    with_vector_data = KnowledgeEntryCreate(
        title="有 Embedding 的條目",
        content="已完成向量化。",
        **shared_fields,
    )
    without_vector_data = KnowledgeEntryCreate(
        title="沒有 Embedding 的條目",
        content="尚未向量化。",
        **shared_fields,
    )
    with_vector = await repository.create(with_vector_data)
    without_vector = await repository.create(without_vector_data)

    # Only the first entry gets an embedding.
    await repository.save_embedding(with_vector.id, [0.1] * 768)

    pending_rows = await repository.list_unembedded_entries()
    pending_ids = [row[0] for row in pending_rows]

    assert without_vector.id in pending_ids
    assert with_vector.id not in pending_ids
|