# awoooi/apps/api/src/repositories/knowledge_repository.py

"""
Knowledge Repository - PostgreSQL 實作
=======================================
Knowledge Base Phase 1: CRUD + 搜尋

建立時間: 2026-04-02 (台北時區)
建立者: Claude Code (Knowledge Base Phase 1)

遵循 leWOOOgo 積木化原則:
- 實作 IKnowledgeRepository Protocol
- 只做資料存取，業務邏輯在 Service 層
"""

import json

import structlog
from sqlalchemy import String, func, or_, select, update
from sqlalchemy.ext.asyncio import AsyncSession

from src.db.models import KnowledgeEntryRecord
from src.models.knowledge import (
    EntryStatus,
    EntryType,
    KnowledgeEntry,
    KnowledgeEntryCreate,
)

logger = structlog.get_logger(__name__)


class KnowledgeDBRepository:
    """PostgreSQL-backed repository for knowledge-base entries.

    Implements the ``IKnowledgeRepository`` protocol. This class only performs
    data access; business logic belongs in the service layer.

    No method here commits: statements are executed (and ORM writes flushed)
    on the injected session, and transaction control is left to the caller.
    """

    # Escape character declared in every ILIKE built from user-supplied text.
    _LIKE_ESCAPE = "/"

    def __init__(self, db: AsyncSession):
        """Store the async session that every query runs on."""
        self.db = db

    @staticmethod
    def _contains_pattern(term: str) -> str:
        """Return a ``%term%`` LIKE pattern that matches *term* literally.

        ``%`` and ``_`` inside *term* (and the escape character itself) are
        prefixed with ``_LIKE_ESCAPE`` so user input cannot act as a wildcard.
        The resulting pattern must be used with ``escape=_LIKE_ESCAPE``.
        """
        esc = KnowledgeDBRepository._LIKE_ESCAPE
        # The escape character has to be doubled first, otherwise the escapes
        # added for "%" and "_" would themselves get escaped.
        escaped = (
            term.replace(esc, esc + esc)
            .replace("%", esc + "%")
            .replace("_", esc + "_")
        )
        return f"%{escaped}%"

    @staticmethod
    def _tag_containment_json(tag: str) -> str:
        """Return the JSON array literal ``["<tag>"]`` for an ``@>`` filter.

        ``json.dumps`` takes care of quotes and backslashes inside the tag;
        ``ensure_ascii=False`` keeps non-ASCII tags unchanged in the literal.
        """
        return json.dumps([tag], ensure_ascii=False)

    async def create(self, data: KnowledgeEntryCreate) -> KnowledgeEntry:
        """Insert a new knowledge entry and return it as a domain model.

        The record is added to the session and flushed before it is logged
        and converted.
        """
        record = KnowledgeEntryRecord(
            title=data.title,
            content=data.content,
            entry_type=data.entry_type,
            category=data.category,
            tags=data.tags,
            source=data.source,
            # 2026-04-04 ogt: Phase 25 P1 — allow an explicit status
            # (ANTI_PATTERN entries go straight to PUBLISHED).
            status=data.status,
            related_incident_id=data.related_incident_id,
            related_playbook_id=data.related_playbook_id,
            # 2026-04-04 ogt: Phase 25 P1 — symptoms hash used by the
            # Anti-Pattern closed loop.
            symptoms_hash=data.symptoms_hash,
            created_by=data.created_by,
        )
        self.db.add(record)
        await self.db.flush()
        logger.info("knowledge_entry_created", entry_id=record.id, title=record.title)
        return self._to_model(record)

    async def get_by_id(self, entry_id: str) -> KnowledgeEntry | None:
        """Return the entry with the given id, or ``None``.

        Archived entries are excluded, so they are reported as not found.
        """
        result = await self.db.execute(
            select(KnowledgeEntryRecord).where(
                KnowledgeEntryRecord.id == entry_id,
                KnowledgeEntryRecord.status != EntryStatus.ARCHIVED,
            )
        )
        record = result.scalar_one_or_none()
        return self._to_model(record) if record else None

    async def update(self, entry_id: str, data: dict) -> KnowledgeEntry | None:
        """Apply a partial update to an entry.

        Args:
            entry_id: Id of the entry to update.
            data: Mapping of attribute name to new value. Keys whose value is
                ``None`` are skipped (so a field cannot be reset to ``None``
                through this method), and keys that are not attributes of the
                record are ignored.

        Returns:
            The updated entry, or ``None`` when no row has that id. Unlike
            ``get_by_id``, the lookup here does not filter out archived rows.
        """
        result = await self.db.execute(
            select(KnowledgeEntryRecord).where(KnowledgeEntryRecord.id == entry_id)
        )
        record = result.scalar_one_or_none()
        if not record:
            return None

        # NOTE(review): ``hasattr`` accepts any attribute of the record, not
        # only mapped columns — callers are presumably passing a validated
        # field set; confirm before exposing this to untrusted input.
        for key, value in data.items():
            if value is not None and hasattr(record, key):
                setattr(record, key, value)

        await self.db.flush()
        logger.info("knowledge_entry_updated", entry_id=entry_id)
        return self._to_model(record)

    async def delete(self, entry_id: str) -> bool:
        """Soft-delete an entry by setting its status to ``ARCHIVED``.

        Returns:
            ``True`` when the UPDATE matched at least one row. The current
            status is not checked, so an already archived entry also yields
            ``True``.
        """
        result = await self.db.execute(
            update(KnowledgeEntryRecord)
            .where(KnowledgeEntryRecord.id == entry_id)
            .values(status=EntryStatus.ARCHIVED)
        )
        return result.rowcount > 0

    async def list_entries(
        self,
        category: str | None = None,
        entry_type: EntryType | None = None,
        status: EntryStatus | None = None,
        tags: list[str] | None = None,
        q: str | None = None,
        limit: int = 20,
        offset: int = 0,
    ) -> tuple[list[KnowledgeEntry], int]:
        """List non-archived entries with optional filters and pagination.

        Args:
            category: Exact category to match.
            entry_type: Exact entry type to match.
            status: Exact status to match (archived rows are always excluded).
            tags: Every listed tag must be contained in the entry's tags.
            q: Case-insensitive substring matched literally against title
                and content.
            limit: Maximum number of entries to return.
            offset: Number of entries to skip.

        Returns:
            ``(entries, total)`` where ``total`` is the number of rows that
            match the filters before ``limit``/``offset`` are applied.
        """
        # Build the filter list once so the page query and the count query
        # can never drift apart.
        conditions = [KnowledgeEntryRecord.status != EntryStatus.ARCHIVED]
        if category:
            conditions.append(KnowledgeEntryRecord.category == category)
        if entry_type:
            conditions.append(KnowledgeEntryRecord.entry_type == entry_type)
        if status:
            conditions.append(KnowledgeEntryRecord.status == status)
        if tags:
            # One containment test per tag; the literal is produced by
            # json.dumps so quotes/backslashes in a tag stay valid JSON.
            conditions.extend(
                KnowledgeEntryRecord.tags.op("@>")(self._tag_containment_json(tag))
                for tag in tags
            )
        if q:
            pattern = self._contains_pattern(q)
            conditions.append(
                or_(
                    KnowledgeEntryRecord.title.ilike(pattern, escape=self._LIKE_ESCAPE),
                    KnowledgeEntryRecord.content.ilike(pattern, escape=self._LIKE_ESCAPE),
                )
            )

        count_stmt = (
            select(func.count()).select_from(KnowledgeEntryRecord).where(*conditions)
        )
        total = (await self.db.execute(count_stmt)).scalar() or 0

        page_stmt = (
            select(KnowledgeEntryRecord)
            .where(*conditions)
            # ``id`` breaks ties between rows sharing the same ``updated_at``
            # so consecutive pages neither repeat nor skip entries.
            .order_by(KnowledgeEntryRecord.updated_at.desc(), KnowledgeEntryRecord.id)
            .limit(limit)
            .offset(offset)
        )
        records = (await self.db.execute(page_stmt)).scalars().all()

        return [self._to_model(r) for r in records], total

    async def get_categories(self) -> list[tuple[str, int]]:
        """Return ``(category, count)`` pairs for non-archived entries.

        Pairs are ordered by count, largest first.
        """
        result = await self.db.execute(
            select(
                KnowledgeEntryRecord.category,
                func.count().label("cnt"),
            )
            .where(KnowledgeEntryRecord.status != EntryStatus.ARCHIVED)
            .group_by(KnowledgeEntryRecord.category)
            .order_by(func.count().desc())
        )
        return [(row.category, row.cnt) for row in result.all()]

    async def search(self, query: str, limit: int = 20) -> list[KnowledgeEntry]:
        """Keyword search over title, content and tags.

        The keyword is matched as a literal, case-insensitive substring; tags
        are searched through their string cast. Archived entries are excluded
        and results are ordered by ``view_count``, highest first.
        """
        pattern = self._contains_pattern(query)
        esc = self._LIKE_ESCAPE
        result = await self.db.execute(
            select(KnowledgeEntryRecord)
            .where(
                KnowledgeEntryRecord.status != EntryStatus.ARCHIVED,
                or_(
                    KnowledgeEntryRecord.title.ilike(pattern, escape=esc),
                    KnowledgeEntryRecord.content.ilike(pattern, escape=esc),
                    KnowledgeEntryRecord.tags.cast(String).ilike(pattern, escape=esc),
                ),
            )
            .order_by(KnowledgeEntryRecord.view_count.desc())
            .limit(limit)
        )
        records = result.scalars().all()
        return [self._to_model(r) for r in records]

    async def increment_view_count(self, entry_id: str) -> bool:
        """Increment ``view_count`` by one with a single UPDATE statement.

        Returns:
            ``True`` when a row with that id was updated.
        """
        result = await self.db.execute(
            update(KnowledgeEntryRecord)
            .where(KnowledgeEntryRecord.id == entry_id)
            .values(view_count=KnowledgeEntryRecord.view_count + 1)
        )
        return result.rowcount > 0

    async def list_unembedded_entries(self) -> list[tuple[str, str, str]]:
        """List entries that have no embedding yet as ``(id, title, content)``."""
        from sqlalchemy import text as sa_text

        # NOTE(review): this raw SQL compares status with the literal
        # 'ARCHIVED' while the ORM queries use EntryStatus.ARCHIVED — confirm
        # the stored representation matches.
        result = await self.db.execute(
            sa_text(
                "SELECT id, title, content FROM knowledge_entries "
                "WHERE embedding IS NULL AND status != 'ARCHIVED'"
            )
        )
        return [(row.id, row.title, row.content) for row in result.fetchall()]

    async def save_embedding(self, entry_id: str, embedding: list[float]) -> bool:
        """Store the vector embedding (768 dimensions) for an entry.

        Note: asyncpg does not support the ``:param::type`` syntax, so the
        statement has to use ``CAST(:param AS vector)``. The embedding is
        passed as its ``str()`` form and cast on the database side.

        Returns:
            ``True`` when a row with that id was updated.
        """
        from sqlalchemy import text as sa_text

        result = await self.db.execute(
            sa_text(
                "UPDATE knowledge_entries SET embedding = CAST(:emb AS vector) WHERE id = :id"
            ),
            {"emb": str(embedding), "id": entry_id},
        )
        return result.rowcount > 0

    async def semantic_search(
        self,
        query_embedding: list[float],
        limit: int = 10,
        threshold: float = 0.5,
    ) -> list[tuple[KnowledgeEntry, float]]:
        """Semantic search by cosine similarity (pgvector).

        Args:
            query_embedding: Embedding of the search query.
            limit: Maximum number of hits.
            threshold: Minimum score, computed as ``1 - (embedding <=> query)``,
                that a row must reach to be returned.

        Returns:
            ``(entry, similarity_score)`` pairs, already sorted by score in
            descending order.
        """
        from sqlalchemy import text as sa_text

        # NOTE(review): same literal 'ARCHIVED' comparison as in
        # list_unembedded_entries — confirm it matches the stored value.
        sql = sa_text("""
            SELECT id, 1 - (embedding <=> CAST(:emb AS vector)) AS score
            FROM knowledge_entries
            WHERE status != 'ARCHIVED'
              AND embedding IS NOT NULL
              AND 1 - (embedding <=> CAST(:emb AS vector)) >= :threshold
            ORDER BY embedding <=> CAST(:emb AS vector)
            LIMIT :limit
        """)
        hit_result = await self.db.execute(
            sql,
            {"emb": str(query_embedding), "threshold": threshold, "limit": limit},
        )
        hits = hit_result.fetchall()

        if not hits:
            return []

        # Fetch the full records in one batch; ``ids`` keeps the ranking
        # order produced by the similarity query.
        ids = [hit[0] for hit in hits]
        scores = {hit[0]: float(hit[1]) for hit in hits}

        result = await self.db.execute(
            select(KnowledgeEntryRecord).where(KnowledgeEntryRecord.id.in_(ids))
        )
        records = {r.id: r for r in result.scalars().all()}

        # Ids missing from the batch fetch are dropped rather than raising.
        return [
            (self._to_model(records[entry_id]), scores[entry_id])
            for entry_id in ids
            if entry_id in records
        ]

    def _to_model(self, record: KnowledgeEntryRecord) -> KnowledgeEntry:
        """Convert an ORM record into the ``KnowledgeEntry`` model.

        Empty or missing tags become ``[]``; ``symptoms_hash`` falls back to
        ``None`` when the record has no such attribute.
        """
        return KnowledgeEntry(
            id=record.id,
            title=record.title,
            content=record.content,
            entry_type=record.entry_type,
            category=record.category,
            tags=record.tags or [],
            source=record.source,
            status=record.status,
            related_incident_id=record.related_incident_id,
            related_playbook_id=record.related_playbook_id,
            symptoms_hash=getattr(record, "symptoms_hash", None),
            view_count=record.view_count,
            created_by=record.created_by,
            created_at=record.created_at,
            updated_at=record.updated_at,
        )