awoooi/scripts/rag_index_docs.py

#!/usr/bin/env python3
"""
RAG 初始索引腳本
================
將 docs/adr/ 和 docs/runbooks/ 批次索引到 pgvector rag_chunks

用法:
  python scripts/rag_index_docs.py [--api-url http://localhost:8000]

2026-04-10 Claude Sonnet 4.6 Asia/Taipei
"""
from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path

import httpx

# Repository root, two levels above this script (<root>/scripts/<this file>).
# resolve() makes the path absolute first: with a relative __file__ such as
# "rag_index_docs.py", .parent.parent would collapse to "." (the current
# directory) instead of the real repository root.
ROOT = Path(__file__).resolve().parent.parent

# (source label, directory) pairs; every "*.md" directly inside each directory
# is indexed under that label.
DIRS = [
    ("adr", ROOT / "docs" / "adr"),
    ("runbook", ROOT / "docs" / "runbooks"),
]

# Default API base URL, used when --api-url is not given.
API_BASE = "http://localhost:8000"


async def index_file(
    client: httpx.AsyncClient,
    source: str,
    path: Path,
    api_url: str,
) -> bool:
    """Index a single file through the RAG index endpoint.

    Reads ``path`` as UTF-8 (undecodable bytes are replaced) and POSTs its
    content to ``{api_url}/api/v1/knowledge/rag/index``. The file stem is
    sent as both ``source_id`` and ``title``; the path relative to ``ROOT``
    is sent as ``metadata.file``.

    Args:
        client: HTTP client used to send the request.
        source: Source label placed in the payload (e.g. ``"adr"``).
        path: File to index.
        api_url: API base URL that the endpoint path is appended to.

    Returns:
        True only when the API replies with HTTP 200 and a JSON object whose
        ``ok`` field is truthy. False when the file is unreadable or blank,
        on any other HTTP status, on a request/JSON error, or when ``ok`` is
        missing or falsy.

    Raises:
        ValueError: If ``path`` is not located under ``ROOT``.
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        # One unreadable file must fail only itself, not abort the whole batch.
        print(f"  ✗ [{source}] {path.name} — {e}", file=sys.stderr)
        return False

    if not text.strip():
        # Say why the file is counted as failed instead of dropping it silently.
        print(f"  ✗ [{source}] {path.name} — 空白文件", file=sys.stderr)
        return False

    payload = {
        "source": source,
        "source_id": path.stem,
        "title": path.stem,
        "text": text,
        "metadata": {"file": str(path.relative_to(ROOT))},
    }
    try:
        resp = await client.post(
            f"{api_url}/api/v1/knowledge/rag/index",
            json=payload,
            timeout=120.0,
        )
        if resp.status_code != 200:
            print(f"  ✗ [{source}] {path.name} — HTTP {resp.status_code}", file=sys.stderr)
            return False

        data = resp.json()
        # The "ok" field of the body, not the HTTP status alone, decides
        # success; only print the success marker when the two agree.
        ok = isinstance(data, dict) and bool(data.get("ok", False))
        if ok:
            print(f"  ✓ [{source}] {path.name}")
        else:
            print(f"  ✗ [{source}] {path.name} — API 回應 ok=false", file=sys.stderr)
        return ok
    except Exception as e:
        # Best-effort per file: report the error and let the caller continue.
        print(f"  ✗ [{source}] {path.name} — {e}", file=sys.stderr)
        return False


async def main(api_url: str) -> None:
    """Index every ``*.md`` file in the configured directories and print a summary.

    Steps:
      1. Probe ``{api_url}/health``. The probe is advisory: a failure is
         reported on stderr and indexing is attempted anyway.
      2. For each ``(source, directory)`` pair in ``DIRS``, index the
         directory's ``*.md`` files in sorted order via ``index_file``.
         Missing directories are reported and skipped.
      3. Print success/failure counts, then fetch and print
         ``{api_url}/api/v1/knowledge/rag/stats``; a failure to obtain the
         stats is reported on stderr and never raised.

    Args:
        api_url: API base URL; any trailing slashes are ignored.
    """
    # A trailing slash would otherwise produce "//" in every endpoint URL.
    api_url = api_url.rstrip("/")

    total = success = 0

    # One client is reused for the health probe, the indexing calls and the stats call.
    async with httpx.AsyncClient() as client:
        # Health check
        try:
            r = await client.get(f"{api_url}/health", timeout=5.0)
            print(f"API 連線: {api_url} → HTTP {r.status_code}")
        except Exception as e:
            print(f"⚠️  API 不可達: {e}\n繼續嘗試索引...", file=sys.stderr)

        for source, directory in DIRS:
            if not directory.exists():
                print(f"⚠️  目錄不存在: {directory}")
                continue

            files = sorted(directory.glob("*.md"))
            print(f"\n📂 {directory.name}/ ({len(files)} 個文件)")

            for path in files:
                total += 1
                if await index_file(client, source, path, api_url):
                    success += 1
                # Throttle requests to avoid overloading the Ollama embedding step
                await asyncio.sleep(0.5)

        failed = total - success
        print(f"\n{'='*50}")
        print(f"完成: {success}/{total} 成功, {failed} 失敗")

        # Show statistics. Still best-effort, but failures are now reported
        # instead of being swallowed silently.
        try:
            r = await client.get(f"{api_url}/api/v1/knowledge/rag/stats", timeout=10.0)
            if r.status_code == 200:
                stats = r.json()
                print(f"RAG 統計: {stats['total_chunks']} chunks, {stats['sources']} sources")
            else:
                print(f"⚠️  無法取得 RAG 統計: HTTP {r.status_code}", file=sys.stderr)
        except Exception as e:
            print(f"⚠️  無法取得 RAG 統計: {e!r}", file=sys.stderr)


if __name__ == "__main__":
    # Script entry point: parse the optional --api-url flag (defaults to
    # API_BASE) and run the async indexing routine to completion.
    cli = argparse.ArgumentParser(description="RAG 初始索引")
    cli.add_argument(
        "--api-url",
        default=API_BASE,
        help=f"API base URL (預設: {API_BASE})",
    )
    options = cli.parse_args()
    asyncio.run(main(options.api_url))