scripts/rag_index_docs.py: 批次呼叫 /knowledge/rag/index,支援 --api-url 參數,含 0.5s 節流以避免 Ollama 過載。 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
113 lines
3.2 KiB
Python
Executable File
113 lines
3.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
RAG 初始索引腳本
|
|
================
|
|
將 docs/adr/ 和 docs/runbooks/ 批次索引到 pgvector rag_chunks
|
|
|
|
用法:
|
|
python scripts/rag_index_docs.py [--api-url http://localhost:8000]
|
|
|
|
2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
|
|
# Repository root: the directory two levels above this script file.
ROOT = Path(__file__).parent.parent

# (source label, directory) pairs; every *.md file directly inside each
# directory is indexed under that source label.
DIRS = [
    ("adr", ROOT.joinpath("docs", "adr")),
    ("runbook", ROOT.joinpath("docs", "runbooks")),
]

# Default API base URL, used when --api-url is not given.
API_BASE = "http://localhost:8000"
|
|
|
|
|
|
async def index_file(
    client: httpx.AsyncClient,
    source: str,
    path: Path,
    api_url: str,
) -> bool:
    """Index one document by POSTing its text to the RAG index endpoint.

    Args:
        client: HTTP client used to send the request.
        source: Label sent as the ``source`` field of the payload.
        path: File to index; its stem is sent as both ``source_id`` and
            ``title``.
        api_url: API base URL; a trailing slash is tolerated.

    Returns:
        True only when the server answers HTTP 200 and the JSON body carries
        a truthy ``ok`` field. False for empty or unreadable files, non-200
        responses, responses that do not report ``ok``, and request errors.
        Every False outcome is explained on stderr.
    """
    label = f"[{source}] {path.name}"

    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        # One unreadable file must not abort the whole batch.
        print(f" ✗ {label} — {e}", file=sys.stderr)
        return False

    if not text.strip():
        # The caller counts False as a failure, so say why nothing was sent.
        print(f" ✗ {label} — 內容為空,已略過", file=sys.stderr)
        return False

    try:
        file_ref = str(path.relative_to(ROOT))
    except ValueError:
        # Path lies outside the repository root; fall back to the path as given.
        file_ref = str(path)

    payload = {
        "source": source,
        "source_id": path.stem,
        "title": path.stem,
        "text": text,
        "metadata": {"file": file_ref},
    }
    # Strip a trailing slash so the joined URL never contains "//api".
    endpoint = f"{api_url.rstrip('/')}/api/v1/knowledge/rag/index"

    try:
        resp = await client.post(endpoint, json=payload, timeout=120.0)
        if resp.status_code != 200:
            print(f" ✗ {label} — HTTP {resp.status_code}", file=sys.stderr)
            return False
        data = resp.json()
    except Exception as e:
        # Per-file boundary: report transport or JSON-decoding errors and let
        # the batch continue with the next file.
        print(f" ✗ {label} — {e}", file=sys.stderr)
        return False

    # Only show the success mark when the server actually reports success;
    # an HTTP 200 whose body lacks a truthy "ok" is still a failure.
    if not (isinstance(data, dict) and data.get("ok", False)):
        print(f" ✗ {label} — API 回應 ok=false", file=sys.stderr)
        return False

    print(f" ✓ {label}")
    return True
|
|
|
|
|
|
async def main(api_url: str) -> None:
    """Index every Markdown file under the configured directories.

    Steps: probe ``{api_url}/health`` (a failed probe is only a warning),
    send each ``*.md`` file found directly in the directories listed in
    ``DIRS`` through ``index_file``, print a success/failure summary, then
    print the RAG statistics reported by the API.

    Args:
        api_url: API base URL; a trailing slash is tolerated.
    """
    # Strip a trailing slash so joined URLs never contain "//".
    base = api_url.rstrip("/")
    total = success = failed = 0

    # One client serves the health probe, all index calls and the stats call.
    async with httpx.AsyncClient() as client:
        # Health check: best effort, indexing is attempted regardless.
        try:
            r = await client.get(f"{base}/health", timeout=5.0)
            print(f"API 連線: {api_url} → HTTP {r.status_code}")
        except Exception as e:
            print(f"⚠️ API 不可達: {e}\n繼續嘗試索引...", file=sys.stderr)

        for source, directory in DIRS:
            if not directory.exists():
                print(f"⚠️ 目錄不存在: {directory}")
                continue

            files = sorted(directory.glob("*.md"))
            print(f"\n📂 {directory.name}/ ({len(files)} 個文件)")

            for path in files:
                total += 1
                if await index_file(client, source, path, base):
                    success += 1
                else:
                    failed += 1
                # Throttle requests to avoid overloading Ollama embedding.
                await asyncio.sleep(0.5)

        print(f"\n{'='*50}")
        print(f"完成: {success}/{total} 成功, {failed} 失敗")

        # Show statistics. Still best effort, but a failure is reported
        # instead of being silently discarded.
        try:
            r = await client.get(f"{base}/api/v1/knowledge/rag/stats", timeout=10.0)
            if r.status_code == 200:
                stats = r.json()
                print(f"RAG 統計: {stats['total_chunks']} chunks, {stats['sources']} sources")
            else:
                print(f"⚠️ 無法取得 RAG 統計: HTTP {r.status_code}", file=sys.stderr)
        except Exception as e:
            print(f"⚠️ 無法取得 RAG 統計: {type(e).__name__}: {e}", file=sys.stderr)
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the optional --api-url flag and run the
    # asynchronous indexing routine against that base URL.
    cli = argparse.ArgumentParser(description="RAG 初始索引")
    cli.add_argument(
        "--api-url",
        default=API_BASE,
        help=f"API base URL (預設: {API_BASE})",
    )
    options = cli.parse_args()
    asyncio.run(main(options.api_url))
|