34 lines
1.2 KiB
Python
34 lines
1.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Add all PDFs under a folder to the RAG vector store. Run from project root."""
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from local_rag import LocalRAG
|
|
|
|
# Root directory that is searched recursively for PDF files.
DATA_ROOT: Path = Path("/Users/Philipp/Desktop/workspace/python/gpt_publikationen/data_vs")

# Location of the vector store; keep in sync with server.py so the chat loads the same index.
VECTORSTORE_PATH: str = "./vectorstore"

# True  -> remove the existing vector store folder, then index all PDFs from scratch.
# False -> load the existing index (if any) and merge new chunks from these PDFs into it.
CLEAR_VECTORSTORE_FIRST: bool = True
|
|
|
|
|
|
def find_pdfs(root):
    """Return all PDF files below *root* (searched recursively), sorted by path.

    The suffix check is case-insensitive, so ``.PDF`` is accepted as well.
    Only regular files are returned; a directory whose name ends in ``.pdf``
    is skipped.
    """
    return sorted(
        p
        for p in Path(root).rglob("*")
        if p.suffix.lower() == ".pdf" and p.is_file()
    )


def main():
    """Index every PDF under DATA_ROOT into the vector store at VECTORSTORE_PATH.

    Raises:
        SystemExit: if no PDF is found. In that case the existing vector
            store is left untouched.
    """
    # Look for input *before* touching the store: if nothing is found we exit
    # here and must not have deleted a working index on the way.
    pdfs = find_pdfs(DATA_ROOT)
    print(f"Found {len(pdfs)} PDF(s) under {DATA_ROOT}")
    if not pdfs:
        raise SystemExit("No PDFs found.")

    if not CLEAR_VECTORSTORE_FIRST:
        print(f"Appending to existing vector store (if any): {VECTORSTORE_PATH}")
    elif Path(VECTORSTORE_PATH).exists():
        # Rebuild from scratch: drop the old store folder entirely.
        shutil.rmtree(VECTORSTORE_PATH)
        print(f"Cleared existing vector store: {VECTORSTORE_PATH}")

    rag = LocalRAG(vectorstore_path=VECTORSTORE_PATH)
    rag.add_documents([str(p) for p in pdfs])
    print("Done.")


if __name__ == "__main__":
    main()
|