Files
awoooi/scripts/rag_index_docs.py
OG T 07570c3b85 feat(rag): 初始索引腳本 — ADR+Runbook 批次餵入 pgvector
scripts/rag_index_docs.py: 批次呼叫 /knowledge/rag/index
支援 --api-url 參數,含 0.5s 節流避免 Ollama 過載

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 09:59:13 +08:00

113 lines
3.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
RAG 初始索引腳本
================
將 docs/adr/ 和 docs/runbooks/ 批次索引到 pgvector rag_chunks
用法:
python scripts/rag_index_docs.py [--api-url http://localhost:8000]
2026-04-10 Claude Sonnet 4.6 Asia/Taipei
"""
from __future__ import annotations
import argparse
import asyncio
import sys
from pathlib import Path
import httpx
# Repository root: this file's directory is one level below it.
ROOT = Path(__file__).parent.parent

# (source label, directory) pairs scanned for Markdown documents.
DIRS = [
    ("adr", ROOT.joinpath("docs", "adr")),
    ("runbook", ROOT.joinpath("docs", "runbooks")),
]

# Default API base URL, used when --api-url is not given.
API_BASE = "http://localhost:8000"
async def index_file(
    client: httpx.AsyncClient,
    source: str,
    path: Path,
    api_url: str,
) -> bool:
    """Index a single document through the RAG index endpoint.

    Reads ``path`` as UTF-8 (undecodable bytes are replaced) and POSTs it to
    ``{api_url}/api/v1/knowledge/rag/index``. The file stem is sent as both
    ``source_id`` and ``title``.

    Args:
        client: HTTP client used to send the request.
        source: Source label sent with the document (e.g. ``"adr"``).
        path: Document file to index.
        api_url: API base URL; a trailing slash is tolerated.

    Returns:
        True only when the API answers HTTP 200 with a truthy ``ok`` field.
        False for unreadable or blank files, non-200 responses, a falsy
        ``ok`` field, and any request or decoding error. Failures are
        reported on stderr instead of being raised, so one bad document
        does not abort a batch.
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        # An unreadable file is a per-document failure, not a fatal error.
        print(f" ✗ [{source}] {path.name} — {e}", file=sys.stderr)
        return False
    if not text.strip():
        # Nothing to index in an empty / whitespace-only file.
        return False

    try:
        rel_path = str(path.relative_to(ROOT))
    except ValueError:
        # Path is not under ROOT; fall back to the path as given.
        rel_path = str(path)

    payload = {
        "source": source,
        "source_id": path.stem,
        "title": path.stem,
        "text": text,
        "metadata": {"file": rel_path},
    }
    # Avoid a double slash when the base URL ends with "/".
    endpoint = f"{api_url.rstrip('/')}/api/v1/knowledge/rag/index"

    try:
        resp = await client.post(endpoint, json=payload, timeout=120.0)
        if resp.status_code != 200:
            print(f" ✗ [{source}] {path.name} — HTTP {resp.status_code}", file=sys.stderr)
            return False
        ok = bool(resp.json().get("ok", False))
    except Exception as e:
        # Per-document boundary: transport errors and malformed response
        # bodies are reported and counted as a failure.
        print(f" ✗ [{source}] {path.name} — {e}", file=sys.stderr)
        return False

    # Print the success mark only when the API confirmed the indexing, so
    # the console output agrees with the returned result.
    if ok:
        print(f" ✓ [{source}] {path.name}")
    else:
        print(f" ✗ [{source}] {path.name} — API 回報 ok=false", file=sys.stderr)
    return ok
async def main(api_url: str) -> None:
    """Index every Markdown file under the configured directories.

    Steps:
      1. Probe ``{api_url}/health``; an unreachable API only produces a
         warning and indexing is attempted anyway.
      2. For each ``(source, directory)`` pair in ``DIRS``, index every
         ``*.md`` file in sorted order via ``index_file``, pausing 0.5 s
         after each file.
      3. Print a success/failure summary.
      4. Fetch and print ``/api/v1/knowledge/rag/stats`` on a best-effort
         basis; a failure here is reported on stderr but never raised.

    Args:
        api_url: API base URL; a trailing slash is tolerated.
    """
    # Avoid double slashes when the base URL ends with "/".
    base_url = api_url.rstrip("/")
    total = success = 0

    # A single client is reused for the health check, indexing and stats.
    async with httpx.AsyncClient() as client:
        # Health check (informational only).
        try:
            r = await client.get(f"{base_url}/health", timeout=5.0)
            print(f"API 連線: {api_url} → HTTP {r.status_code}")
        except Exception as e:
            print(f"⚠️ API 不可達: {e}\n繼續嘗試索引...", file=sys.stderr)

        for source, directory in DIRS:
            if not directory.exists():
                print(f"⚠️ 目錄不存在: {directory}")
                continue
            files = sorted(directory.glob("*.md"))
            print(f"\n📂 {directory.name}/ ({len(files)} 個文件)")
            for path in files:
                total += 1
                if await index_file(client, source, path, base_url):
                    success += 1
                # Throttle requests to avoid overloading the Ollama embedder.
                await asyncio.sleep(0.5)

        failed = total - success
        print(f"\n{'='*50}")
        print(f"完成: {success}/{total} 成功, {failed} 失敗")

        # Show index statistics. Best effort: report problems instead of
        # silently swallowing them, but never fail the run because of them.
        try:
            r = await client.get(f"{base_url}/api/v1/knowledge/rag/stats", timeout=10.0)
            if r.status_code == 200:
                stats = r.json()
                print(f"RAG 統計: {stats['total_chunks']} chunks, {stats['sources']} sources")
            else:
                print(f"⚠️ 無法取得 RAG 統計 — HTTP {r.status_code}", file=sys.stderr)
        except Exception as e:
            print(f"⚠️ 無法取得 RAG 統計: {e!r}", file=sys.stderr)
if __name__ == "__main__":
    # CLI entry point: parse the optional API base URL and run the indexer.
    cli = argparse.ArgumentParser(description="RAG 初始索引")
    cli.add_argument(
        "--api-url",
        default=API_BASE,
        help=f"API base URL (預設: {API_BASE})",
    )
    options = cli.parse_args()
    asyncio.run(main(options.api_url))