feat(rag): 初始索引腳本 — ADR+Runbook 批次餵入 pgvector
scripts/rag_index_docs.py: 批次呼叫 /knowledge/rag/index,支援 --api-url 參數,含 0.5s 節流避免 Ollama 過載

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
112
scripts/rag_index_docs.py
Executable file
112
scripts/rag_index_docs.py
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RAG 初始索引腳本
|
||||
================
|
||||
將 docs/adr/ 和 docs/runbooks/ 批次索引到 pgvector rag_chunks
|
||||
|
||||
用法:
|
||||
python scripts/rag_index_docs.py [--api-url http://localhost:8000]
|
||||
|
||||
2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
# Repository root: two levels up from this script file.
ROOT = Path(__file__).parent.parent

# (source label, directory) pairs to index; each directory is ROOT/docs/<folder>.
DIRS = [
    (label, ROOT / "docs" / folder)
    for label, folder in (
        ("adr", "adr"),
        ("runbook", "runbooks"),
    )
]

# Default API base URL, used when --api-url is not supplied.
API_BASE = "http://localhost:8000"
|
||||
|
||||
|
||||
async def index_file(
    client: httpx.AsyncClient,
    source: str,
    path: Path,
    api_url: str,
) -> bool:
    """Send one document to the RAG index endpoint and report the outcome.

    Args:
        client: Open HTTP client used to issue the POST request.
        source: Source label sent in the payload (e.g. ``"adr"``).
        path: Document to read; its stem is sent as both ``source_id`` and
            ``title``, and its path relative to ``ROOT`` as ``metadata.file``.
        api_url: API base URL; a trailing slash is tolerated.

    Returns:
        ``True`` only when the endpoint answers HTTP 200 with a JSON object
        whose ``ok`` field is truthy. Unreadable or blank files, non-200
        responses, rejected documents and request errors all return ``False``
        after printing a line to stderr.

    Raises:
        ValueError: If ``path`` is not located under ``ROOT``.
    """
    label = f"[{source}] {path.name}"

    # A single unreadable file must not abort the whole batch.
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        print(f" ✗ {label} — {e}", file=sys.stderr)
        return False

    if not text.strip():
        # main() counts a False result as a failure, so say why.
        print(f" ✗ {label} — 空白文件,略過", file=sys.stderr)
        return False

    payload = {
        "source": source,
        "source_id": path.stem,
        "title": path.stem,
        "text": text,
        "metadata": {"file": str(path.relative_to(ROOT))},
    }

    try:
        resp = await client.post(
            f"{api_url.rstrip('/')}/api/v1/knowledge/rag/index",
            json=payload,
            timeout=120.0,
        )
        if resp.status_code != 200:
            print(f" ✗ {label} — HTTP {resp.status_code}", file=sys.stderr)
            return False
        data = resp.json()
    except Exception as e:
        # Per-file boundary: transport errors and invalid JSON are reported
        # and turned into a failed result instead of stopping the run.
        print(f" ✗ {label} — {e}", file=sys.stderr)
        return False

    # Only claim success once the body confirms it; an HTTP 200 alone is not
    # enough because the result is taken from the "ok" field.
    if not (isinstance(data, dict) and data.get("ok", False)):
        print(f" ✗ {label} — HTTP 200 但回應 ok 不為真", file=sys.stderr)
        return False

    print(f" ✓ {label}")
    return True
|
||||
|
||||
|
||||
async def main(api_url: str) -> None:
    """Index every ``*.md`` file under the directories in ``DIRS`` and print a summary.

    Steps:
        1. Probe ``<api_url>/health``. The probe is informational only:
           indexing is still attempted when it fails.
        2. For each existing directory in ``DIRS``, post its Markdown files
           (sorted by path) through ``index_file``, pausing 0.5 s after each.
        3. Print the success/failure totals.
        4. Fetch and print the RAG statistics. A failure here is reported on
           stderr but does not raise.

    Args:
        api_url: API base URL; a trailing slash is tolerated.
    """
    base = api_url.rstrip("/")
    total = success = 0

    # One client for the whole run so connections are reused across requests.
    async with httpx.AsyncClient() as client:
        # Health check
        try:
            r = await client.get(f"{base}/health", timeout=5.0)
            print(f"API 連線: {api_url} → HTTP {r.status_code}")
        except Exception as e:
            print(f"⚠️ API 不可達: {e}\n繼續嘗試索引...", file=sys.stderr)

        for source, directory in DIRS:
            if not directory.exists():
                print(f"⚠️ 目錄不存在: {directory}")
                continue

            files = sorted(directory.glob("*.md"))
            print(f"\n📂 {directory.name}/ ({len(files)} 個文件)")

            for path in files:
                total += 1
                if await index_file(client, source, path, base):
                    success += 1
                # Throttle so the Ollama embedding backend is not overloaded
                await asyncio.sleep(0.5)

        failed = total - success
        print(f"\n{'='*50}")
        print(f"完成: {success}/{total} 成功, {failed} 失敗")

        # Show statistics; best-effort, but failures are reported, not hidden.
        try:
            r = await client.get(f"{base}/api/v1/knowledge/rag/stats", timeout=10.0)
            if r.status_code == 200:
                stats = r.json()
                print(f"RAG 統計: {stats['total_chunks']} chunks, {stats['sources']} sources")
            else:
                print(f"⚠️ 無法取得 RAG 統計: HTTP {r.status_code}", file=sys.stderr)
        except Exception as e:
            # repr() keeps the exception type visible, e.g. KeyError('total_chunks')
            print(f"⚠️ 無法取得 RAG 統計: {e!r}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the optional --api-url flag and run the async
    # indexing routine against that base URL.
    cli = argparse.ArgumentParser(description="RAG 初始索引")
    cli.add_argument(
        "--api-url",
        default=API_BASE,
        help=f"API base URL (預設: {API_BASE})",
    )
    options = cli.parse_args()
    asyncio.run(main(options.api_url))
|
||||
Reference in New Issue
Block a user