#!/usr/bin/env python3 """ RAG 初始索引腳本 ================ 將 docs/adr/ 和 docs/runbooks/ 批次索引到 pgvector rag_chunks 用法: python scripts/rag_index_docs.py [--api-url http://localhost:8000] 2026-04-10 Claude Sonnet 4.6 Asia/Taipei """ from __future__ import annotations import argparse import asyncio import sys from pathlib import Path import httpx ROOT = Path(__file__).parent.parent DIRS = [ ("adr", ROOT / "docs" / "adr"), ("runbook", ROOT / "docs" / "runbooks"), ] API_BASE = "http://localhost:8000" async def index_file( client: httpx.AsyncClient, source: str, path: Path, api_url: str, ) -> bool: text = path.read_text(encoding="utf-8", errors="replace") if not text.strip(): return False payload = { "source": source, "source_id": path.stem, "title": path.stem, "text": text, "metadata": {"file": str(path.relative_to(ROOT))}, } try: resp = await client.post( f"{api_url}/api/v1/knowledge/rag/index", json=payload, timeout=120.0, ) if resp.status_code == 200: data = resp.json() print(f" ✓ [{source}] {path.name}") return data.get("ok", False) else: print(f" ✗ [{source}] {path.name} — HTTP {resp.status_code}", file=sys.stderr) return False except Exception as e: print(f" ✗ [{source}] {path.name} — {e}", file=sys.stderr) return False async def main(api_url: str) -> None: # 健康檢查 async with httpx.AsyncClient() as hc: try: r = await hc.get(f"{api_url}/health", timeout=5.0) print(f"API 連線: {api_url} → HTTP {r.status_code}") except Exception as e: print(f"⚠️ API 不可達: {e}\n繼續嘗試索引...", file=sys.stderr) total = success = failed = 0 async with httpx.AsyncClient() as client: for source, directory in DIRS: if not directory.exists(): print(f"⚠️ 目錄不存在: {directory}") continue files = sorted(directory.glob("*.md")) print(f"\n📂 {directory.name}/ ({len(files)} 個文件)") for path in files: total += 1 ok = await index_file(client, source, path, api_url) if ok: success += 1 else: failed += 1 # 避免 Ollama embed 過載 await asyncio.sleep(0.5) print(f"\n{'='*50}") print(f"完成: {success}/{total} 成功, {failed} 失敗") # 顯示統計 async with httpx.AsyncClient() as hc: try: r = await hc.get(f"{api_url}/api/v1/knowledge/rag/stats", timeout=10.0) if r.status_code == 200: stats = r.json() print(f"RAG 統計: {stats['total_chunks']} chunks, {stats['sources']} sources") except Exception: pass if __name__ == "__main__": parser = argparse.ArgumentParser(description="RAG 初始索引") parser.add_argument("--api-url", default=API_BASE, help=f"API base URL (預設: {API_BASE})") args = parser.parse_args() asyncio.run(main(args.api_url))