# awoooi/apps/api/tests/integration/test_knowledge_semantic_search.py

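"""
Knowledge Base semantic search integration tests
================================================
KB Phase 2: pgvector semantic search E2E

Created: 2026-04-04 (Taipei time zone)
Author: Claude Code (KB Phase 2 integration tests)

Rules:
- Use the real awoooi_dev DB + Ollama embedding
  (NOTE: the repository tests below pass hand-built 768-dimensional vectors
  straight to the repository rather than calling an embedding model)
- Roll back after every test
- No mocks
"""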

import pytest

from src.models.knowledge import EntryType, KnowledgeEntryCreate
from src.repositories.knowledge_repository import KnowledgeDBRepository


# =============================================================================
# Tests
# =============================================================================

@pytest.mark.asyncio
async def test_save_embedding(db_session):
    """Check that save_embedding reports success for a newly created entry.

    Creates a runbook entry through the repository, stores a hand-built
    768-component vector for it, and asserts that the call returns ``True``.
    """
    vector_dim = 768
    repository = KnowledgeDBRepository(db_session)

    created = await repository.create(
        KnowledgeEntryCreate(
            title="K8s Pod OOMKilled 處理流程",
            content="當 Pod 因記憶體不足被 OOMKilled 時，應先增加 memory limit 再滾動重啟。",
            entry_type=EntryType.RUNBOOK,
            category="基礎設施",
            source="human",
        )
    )

    # Synthetic embedding: every component is 0.01 (no embedding model call).
    saved = await repository.save_embedding(created.id, [0.01] * vector_dim)

    assert saved is True


@pytest.mark.asyncio
async def test_semantic_search_returns_results(db_session):
    """Check that semantic_search returns an entry queried by its own vector.

    An entry is created and given a constant 768-component embedding; the
    search is then run with a vector of identical values and the entry's id
    must appear among the returned ``(entry, score)`` pairs.
    """
    repository = KnowledgeDBRepository(db_session)

    created = await repository.create(
        KnowledgeEntryCreate(
            title="SLA 監控與告警設定",
            content="設定 SLA 告警：當 API 回應時間超過 500ms 時發送 Telegram 告警。",
            entry_type=EntryType.BEST_PRACTICE,
            category="應用層",
            source="human",
        )
    )

    # Store a synthetic vector for the entry.
    stored_vector = [0.5] * 768
    await repository.save_embedding(created.id, stored_vector)

    # Query with identical values; the original author expects a cosine
    # similarity of 1.0 here, which clears the 0.9 threshold.
    query_vector = [0.5] * 768
    hits = await repository.semantic_search(query_vector, limit=5, threshold=0.9)

    assert len(hits) >= 1
    hit_ids = [hit.id for hit, _score in hits]
    assert created.id in hit_ids


@pytest.mark.asyncio
async def test_semantic_search_threshold_filters(db_session):
    """Check that entries scoring below ``threshold`` are filtered out.

    The stored vector is non-zero only at even indices and the query vector
    only at odd indices, so their dot product is 0 and the two vectors are
    orthogonal. Assuming semantic_search scores by cosine similarity (as the
    original author's notes state; confirm against the repository), the
    score is 0 and falls below the 0.5 threshold.

    A positive control searches with the stored vector itself first, so the
    negative assertion cannot pass merely because the entry was never
    searchable.
    """
    repo = KnowledgeDBRepository(db_session)

    data = KnowledgeEntryCreate(
        title="Redis 連線逾時排查",
        content="Redis 連線逾時通常由網路抖動或 Redis 記憶體滿導致。",
        entry_type=EntryType.RUNBOOK,
        category="基礎設施",
        source="human",
    )
    entry = await repo.create(data)

    # Stored vector A: 1.0 at even indices, 0.0 at odd indices.
    emb_a = [1.0 if i % 2 == 0 else 0.0 for i in range(768)]
    await repo.save_embedding(entry.id, emb_a)

    # Positive control: the entry must be found when queried with its own
    # vector at the same threshold and limit used below.
    control = await repo.semantic_search(emb_a, limit=5, threshold=0.5)
    assert entry.id in [e.id for e, _ in control]

    # Query vector B: 0.0 at even indices, 1.0 at odd indices (orthogonal to A).
    emb_b = [0.0 if i % 2 == 0 else 1.0 for i in range(768)]

    # With threshold=0.5 the orthogonal entry is expected to be dropped.
    results = await repo.semantic_search(emb_b, limit=5, threshold=0.5)

    ids = [e.id for e, _ in results]
    assert entry.id not in ids


@pytest.mark.asyncio
async def test_semantic_search_archived_excluded(db_session):
    """Check that an entry removed via ``repo.delete`` is absent from search.

    The entry is created, given an embedding, then passed to ``delete``
    (described by the original author as archiving; the exact semantics live
    in the repository). A search with the same vector must not return it.
    """
    repository = KnowledgeDBRepository(db_session)

    created = await repository.create(
        KnowledgeEntryCreate(
            title="已封存的測試文章",
            content="此文章已被封存，不應出現在搜尋結果。",
            entry_type=EntryType.BEST_PRACTICE,
            category="應用層",
            source="human",
        )
    )
    vector = [0.8] * 768
    await repository.save_embedding(created.id, vector)

    # Archive the entry before searching.
    await repository.delete(created.id)

    # The same vector that was stored is used as the query.
    hits = await repository.semantic_search(vector, limit=10, threshold=0.5)
    hit_ids = [hit.id for hit, _score in hits]
    assert created.id not in hit_ids


@pytest.mark.asyncio
async def test_list_unembedded_entries(db_session):
    """Check that list_unembedded_entries reports only entries lacking a vector.

    Two runbook entries are created and only the first receives an
    embedding. The first element of each returned row is compared against
    the entry ids: the second entry must be listed, the first must not.
    """
    repository = KnowledgeDBRepository(db_session)

    # Fields shared by both entries.
    shared_fields = {
        "entry_type": EntryType.RUNBOOK,
        "category": "基礎設施",
        "source": "human",
    }
    payload_embedded = KnowledgeEntryCreate(
        title="有 Embedding 的條目",
        content="已完成向量化。",
        **shared_fields,
    )
    payload_plain = KnowledgeEntryCreate(
        title="沒有 Embedding 的條目",
        content="尚未向量化。",
        **shared_fields,
    )
    embedded_entry = await repository.create(payload_embedded)
    plain_entry = await repository.create(payload_plain)

    # Only the first entry gets an embedding.
    await repository.save_embedding(embedded_entry.id, [0.1] * 768)

    pending_rows = await repository.list_unembedded_entries()
    pending_ids = [row[0] for row in pending_rows]

    assert plain_entry.id in pending_ids
    assert embedded_entry.id not in pending_ids