localRAG/add_pdfs.py

34 lines
1.2 KiB
Python

#!/usr/bin/env python3
"""Add all PDFs under a folder to the RAG vector store. Run from project root."""
import shutil
from pathlib import Path
from local_rag import LocalRAG
# Folder to scan for PDFs (recursively).
# NOTE(review): absolute, machine-specific path — adjust it before running
# this script on another machine.
DATA_ROOT: Path = Path("/Users/Philipp/Desktop/workspace/python/gpt_publikationen/data_vs")
# Location of the vector store; handed to LocalRAG as ``vectorstore_path``.
# Must match server.py so the chat loads the same index.
# The path is relative, so it resolves against the current working directory
# (hence "run from project root").
VECTORSTORE_PATH: str = "./vectorstore"
# If True: delete the existing vector store folder, then index all PDFs from scratch.
# If False: load the existing index (if any) and merge new chunks from these PDFs into it.
CLEAR_VECTORSTORE_FIRST: bool = True
def find_pdfs(root: Path) -> "list[Path]":
    """Return every PDF file below *root*, sorted by path.

    The search is recursive and the ``.pdf`` suffix is matched
    case-insensitively. Only regular files are returned, so a directory that
    happens to be named ``something.pdf`` is skipped (its contents are still
    searched).

    Args:
        root: Folder to scan.

    Returns:
        Sorted list of paths to the PDF files found; empty if there are none.
    """
    return sorted(
        candidate
        for candidate in Path(root).rglob("*")
        if candidate.suffix.lower() == ".pdf" and candidate.is_file()
    )


def main() -> None:
    """Index all PDFs under ``DATA_ROOT`` into the vector store.

    Raises:
        SystemExit: If ``DATA_ROOT`` is not a directory or contains no PDFs.
            In both cases the existing vector store is left untouched.
    """
    # Validate the inputs BEFORE touching the existing index: a wrong or empty
    # DATA_ROOT must not wipe a working vector store.
    if not DATA_ROOT.is_dir():
        raise SystemExit(f"Data folder not found: {DATA_ROOT}")
    pdfs = find_pdfs(DATA_ROOT)
    print(f"Found {len(pdfs)} PDF(s) under {DATA_ROOT}")
    if not pdfs:
        raise SystemExit("No PDFs found.")

    if CLEAR_VECTORSTORE_FIRST:
        if Path(VECTORSTORE_PATH).exists():
            shutil.rmtree(VECTORSTORE_PATH)
            print(f"Cleared existing vector store: {VECTORSTORE_PATH}")
    else:
        print(f"Appending to existing vector store (if any): {VECTORSTORE_PATH}")

    rag = LocalRAG(vectorstore_path=VECTORSTORE_PATH)
    # add_documents is given plain string paths rather than Path objects.
    rag.add_documents([str(pdf) for pdf in pdfs])
    print("Done.")


if __name__ == "__main__":
    main()