From 7d70169ee3c0832172f48dd5496f7519053b9f9b Mon Sep 17 00:00:00 2001 From: Philipp Mock Date: Wed, 25 Mar 2026 08:57:10 +0100 Subject: [PATCH] reworked document adding and most of the project --- README.md | 203 ++++++++++++++++++++++++++++++++++++++++----------- add_pdfs.py | 12 ++- local_rag.py | 46 +++++++++--- server.py | 2 +- 4 files changed, 205 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 0ac7d33..7f52c0b 100644 --- a/README.md +++ b/README.md @@ -1,90 +1,205 @@ # Local RAG Setup -Minimal RAG implementation with LangChain, FAISS, and support for either Ollama or OpenAI (API-key needed). +Minimal RAG implementation with LangChain, FAISS, and either **Ollama** (local) or **OpenAI** (API key). A web chat UI is included. -## Dependencies +--- -- `langchain` - Core framework -- `langchain-community` - Loaders, vectorstores -- `langchain-ollama` - Ollama integration -- `langchain-openai` - OpenAI integration -- `langchain-text-splitters` - Text splitting -- `langchain-huggingface` - HuggingFace embeddings -- `faiss-cpu` - Vector search -- `sentence-transformers` - Embeddings -- `pypdf` - PDF loading -- `fastapi` - Web server -- `uvicorn` - ASGI server +## What you need (before you start) -## Installation +- **Python 3.10 or newer** ([python.org](https://www.python.org/downloads/)) +- **Git** (optional, only if you clone the project) +- Either: + - **Ollama** installed and running ([ollama.com](https://ollama.com)), with at least one model pulled, **or** + - An **OpenAI API key** (if you use OpenAI in the chat) + +--- + +## Install dependencies (step by step) + +Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) or [Anaconda](https://www.anaconda.com/) if you do not have Conda yet. + +All commands below assume your terminal is open **in the project folder** (the folder that contains `requirements.txt`). 
```bash conda create -n local_rag python=3.10 -y conda activate local_rag +pip install --upgrade pip pip install -r requirements.txt ``` -## Setup +Use `conda activate local_rag` in every new terminal session before running `python` or `uvicorn` for this project. -### Ollama (optional) +### OpenAI (only if you use the OpenAI provider in the chat) + +In the same terminal **before** starting the server: + +```bash +export OPENAI_API_KEY="your-key-here" +``` + +On Windows (Command Prompt): `set OPENAI_API_KEY=your-key-here` + +--- + +## Run Ollama (only if you use Ollama) + +In a **separate** terminal: ```bash ollama serve -ollama pull mistral ``` -### OpenAI (optional) - -Set the API key when using OpenAI: +In another terminal, pull a model once (example): ```bash -export OPENAI_API_KEY="your-key" +ollama pull gpt-oss:20b ``` -## Add Documents +The model name must match what you configure in `server.py` (see [Configuration reference](#configuration-reference-what-to-edit)). -**Option 1:** Add PDFs from a folder via script. Edit `DATA_ROOT` in [add_pdfs.py](add_pdfs.py) to point at your folder, then run: +--- + +## Build the vector store from a folder of PDFs + +The project includes [add_pdfs.py](add_pdfs.py). It finds every **`.pdf`** file under a folder you choose (including subfolders), then chunks, embeds, and saves to FAISS. + +**Two modes** (set in the script): + +| Setting | Behavior | +|---------|----------| +| `CLEAR_VECTORSTORE_FIRST = True` | Deletes the existing vector store folder, then builds a **new** index from the PDFs under `DATA_ROOT`. Use this for a full rebuild. | +| `CLEAR_VECTORSTORE_FIRST = False` | Keeps the current index (if it exists) and **merges** chunks from the PDFs under `DATA_ROOT` into it. Use this to add another batch of PDFs without wiping what you already indexed. | + +**Steps:** + +1. Open [add_pdfs.py](add_pdfs.py) in a text editor. +2. 
Set **`DATA_ROOT`** to the folder that contains your PDFs (an absolute path, or a path relative to the directory you run the script from). +3. Set **`CLEAR_VECTORSTORE_FIRST`** to `True` (fresh index) or `False` (append to existing store). +4. Optionally set **`VECTORSTORE_PATH`** (default: `./vectorstore`). It must match **`VECTORSTORE_PATH`** in [server.py](server.py) so the chat loads the same index. +5. From the project folder, with `conda activate local_rag` (or your chosen env name): ```bash python add_pdfs.py ``` -The script clears the existing vector store and indexes all PDFs recursively. Supports `.pdf`, `.txt`, `.md`. +Indexing can take a long time for many large PDFs. When it finishes, you should see `Vector store saved to ...`. -**Option 2:** Use `local_rag.py` programmatically: +**Note:** This script only indexes **PDF** files. To add `.txt` or `.md` files, use the Python snippet below or call `add_documents` yourself. + +--- + +## Add more documents later (alternative to add_pdfs) + +You can also merge files by hand with a short script (any mix of supported types): ```python from local_rag import LocalRAG -rag = LocalRAG() -rag.add_documents(["path/to/doc1.pdf", "path/to/doc2.txt"]) + +rag = LocalRAG(vectorstore_path="./vectorstore") # same path as server.py +rag.add_documents([ + "path/to/new1.pdf", + "path/to/notes.txt", +]) ``` -## Chat GUI +`add_documents` merges new chunks into the existing FAISS store and saves it again—the same behavior as [add_pdfs.py](add_pdfs.py) with `CLEAR_VECTORSTORE_FIRST = False`. -Start the server: +--- + +## Swap or experiment with different vector stores + +The vector index is stored on disk under the folder given by **`VECTORSTORE_PATH`** (default `./vectorstore`). That folder contains files such as `index.faiss` and `index.pkl`. + +**To use a different index:** + +1. Set **`VECTORSTORE_PATH`** in both [server.py](server.py) and any script you use to build the index (e.g. [add_pdfs.py](add_pdfs.py)) to the **same** path, e.g. 
`./vectorstore_experiment`. +2. Rebuild the index (run `add_pdfs.py` or `add_documents`) so that folder is created. +3. **Restart** the web server so it loads the new path at startup. + +**Tips:** + +- Keep multiple copies of the folder (e.g. `vectorstore_backup`, `vectorstore_papers_only`) and swap `VECTORSTORE_PATH` to switch between them. +- If you change **chunk size**, **embedding model**, or **FAISS** usage in code, treat the old index as incompatible: use a new `VECTORSTORE_PATH` or delete the old folder and rebuild. + +--- + +## Run the chat web app + +With the Conda environment activated (`conda activate local_rag`) and (if needed) `OPENAI_API_KEY` set: ```bash uvicorn server:app --reload ``` -Open [http://localhost:8000](http://localhost:8000). The chat UI provides: +Open [http://127.0.0.1:8000](http://127.0.0.1:8000) or [http://localhost:8000](http://localhost:8000). -- **Provider switch** – Toggle between Ollama and OpenAI without restart (OpenAI requires `OPENAI_API_KEY`) -- **Conversation history** – Multi-turn chat with context -- **Markdown** – Assistant replies rendered as markdown (headings, code, lists, links) +- Use the **LLM provider** dropdown: **Ollama** or **OpenAI** (OpenAI only works if the server was started with a valid `OPENAI_API_KEY`). +- You need a **non-empty vector store** (see above) for answers to work. -Ensure the vector store is populated and at least one provider (Ollama or OpenAI) is configured. 
+--- -## API +## API (short reference) -- `POST /api/chat` – `{ "message": "...", "history": [...], "llm_provider": "ollama"|"openai" }` -- `GET /api/providers` – `{ "ollama": true, "openai": true|false }` -- `GET /api/health` – Health and vectorstore status +| Endpoint | Purpose | +|----------|---------| +| `POST /api/chat` | Body: `message`, optional `history`, optional `llm_provider` (`ollama` or `openai`) | +| `GET /api/providers` | Which providers are available (`openai` false if no API key at startup) | +| `GET /api/health` | Server and whether a vector store is loaded | -## How it works +--- -1. **Load documents** – PDFs or text via PyPDFLoader / TextLoader -2. **Chunk** – RecursiveCharacterTextSplitter (2000 chars, 400 overlap) -3. **Embed** – sentence-transformers/all-MiniLM-L6-v2 -4. **Store** – FAISS vector store (similarity search with scores) -5. **Query** – Retrieve chunks, optionally rephrase with conversation history, generate answer with selected LLM +## How it works (high level) + +1. **Load documents** – PDFs via `PyPDFLoader`, text via `TextLoader`. +2. **Chunk** – `RecursiveCharacterTextSplitter` (defaults in [local_rag.py](local_rag.py)). +3. **Embed** – Hugging Face `sentence-transformers/all-MiniLM-L6-v2`. +4. **Store** – FAISS; retrieval uses `similarity_search_with_score`. +5. **Query** – Optional rephrase with chat history, retrieval, then answer from the LLM. + +--- + +## Configuration reference (what to edit) + +These are the main places to change behavior without restructuring the app. + +### [server.py](server.py) + +| What | Where | +|------|--------| +| Ollama model name | `OLLAMA_MODEL = "..."` | +| OpenAI model name | `OPENAI_MODEL = "..."` | +| Where the FAISS index is loaded from | `VECTORSTORE_PATH = "./vectorstore"` (must match your indexing script) | + +### [local_rag.py](local_rag.py) – `LocalRAG.__init__` + +| What | Where (approx.) 
| +|------|------------------| +| Default vector store folder | Parameter `vectorstore_path="./vectorstore"` | +| Embedding model | `HuggingFaceEmbeddings(model_name="sentence-transformers/...")` | +| Chunk size and overlap | Module-level `CHUNK_SIZE` and `CHUNK_OVERLAP` (used by `RecursiveCharacterTextSplitter` when adding documents) | +| Default Ollama / OpenAI model strings | Parameters `ollama_model`, `openai_model`, `ollama_base_url` | + +Changing the embedding model or chunk settings requires **rebuilding** the vector store (old index is not compatible). + +### [local_rag.py](local_rag.py) – `query_with_history` + +| What | Where | +|------|--------| +| Default number of chunks retrieved (`k`) | Module-level `RETRIEVAL_K` (overrides: pass `k=` to `query` / `query_with_history`) | +| Extra text appended only to the **FAISS query** (biases retrieval, not the final answer phrasing) | `QUERY_ADDITIONAL_INSTRUCTIONS` (concatenated to the search query before embedding) | +| **Rephrase** prompt (standalone question when there is chat history) | String `rephrase_prompt = f"""..."""` inside `query_with_history` | +| **Answer** prompt – opening instructions only | Module-level `ANSWER_PROMPT` (edit the role / style lines). The block from chat history through `Answer:` is built in `query_with_history` | + +### [add_pdfs.py](add_pdfs.py) + +| What | Where | +|------|--------| +| Folder to scan for PDFs | `DATA_ROOT = Path("...")` | +| Output vector store folder | `VECTORSTORE_PATH = "./vectorstore"` (keep in sync with `server.py`) | +| Wipe index vs merge | `CLEAR_VECTORSTORE_FIRST = True` (delete and rebuild) or `False` (append to existing index) | + +--- + +## Dependencies (for developers) + +See [requirements.txt](requirements.txt) for the full list (LangChain, FAISS, sentence-transformers, FastAPI, uvicorn, etc.). 
diff --git a/add_pdfs.py b/add_pdfs.py index c1bed00..b118040 100644 --- a/add_pdfs.py +++ b/add_pdfs.py @@ -5,13 +5,23 @@ from pathlib import Path from local_rag import LocalRAG +# Folder to scan for PDFs (recursively). DATA_ROOT = Path("/Users/Philipp/Desktop/workspace/python/gpt_publikationen/data_vs") + +# Must match server.py so the chat loads the same index. VECTORSTORE_PATH = "./vectorstore" +# If True: delete the existing vector store folder, then index all PDFs from scratch. +# If False: load the existing index (if any) and merge new chunks from these PDFs into it. +CLEAR_VECTORSTORE_FIRST = True + + if __name__ == "__main__": - if Path(VECTORSTORE_PATH).exists(): + if CLEAR_VECTORSTORE_FIRST and Path(VECTORSTORE_PATH).exists(): shutil.rmtree(VECTORSTORE_PATH) print(f"Cleared existing vector store: {VECTORSTORE_PATH}") + elif not CLEAR_VECTORSTORE_FIRST: + print(f"Appending to existing vector store (if any): {VECTORSTORE_PATH}") pdfs = sorted(p for p in DATA_ROOT.rglob("*") if p.suffix.lower() == ".pdf") print(f"Found {len(pdfs)} PDF(s) under {DATA_ROOT}") diff --git a/local_rag.py b/local_rag.py index 128749b..386eca4 100644 --- a/local_rag.py +++ b/local_rag.py @@ -12,6 +12,23 @@ from langchain_ollama import ChatOllama from langchain_openai import ChatOpenAI from langchain_text_splitters import RecursiveCharacterTextSplitter +# Pipeline tuning: used when adding documents (rebuild index after changing). +CHUNK_SIZE = 2000 +CHUNK_OVERLAP = 400 + +# Retrieval / query tuning +RETRIEVAL_K = 5 + +# Appended to the embedding query for FAISS (does not affect the answer-generation prompt). +QUERY_ADDITIONAL_INSTRUCTIONS = ( + "Do not return a list of references but prioritize meaningful text from abstracts, results and discussion sections." +) + +# Opening instructions for the final answer LLM call only. The rest (history, context, question) +# is assembled in query_with_history. +ANSWER_PROMPT = """You are an assistant for question-answering. 
Use the chat history (if any) and the retrieved context below to answer the current question. +If you don't know the answer, say so. Keep the conversation coherent.""" + class LocalRAG: def __init__( @@ -19,7 +36,7 @@ class LocalRAG: vectorstore_path="./vectorstore", llm_provider="ollama", ollama_model="gpt-oss:20b", - openai_model="gpt-5.2", + openai_model="gpt-5-mini", ollama_base_url="http://localhost:11434", ): """Initialize local RAG system. llm_provider: 'ollama' or 'openai'.""" @@ -32,11 +49,12 @@ class LocalRAG: model_name="sentence-transformers/all-MiniLM-L6-v2" ) - # Text splitter + # Text splitter (used when adding documents; tune CHUNK_SIZE / CHUNK_OVERLAP above) self.text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, - chunk_overlap=400 + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, ) + print(f"Text splitter: chunk_size={CHUNK_SIZE}, chunk_overlap={CHUNK_OVERLAP}") # LLM (Ollama or OpenAI) if llm_provider == "openai": @@ -186,15 +204,19 @@ class LocalRAG: for doc, score in docs_with_scores ] - def query(self, question, k=8): + def query(self, question, k=None): """Query the RAG system (no conversation history). Returns dict with 'answer' and 'retrieved'.""" + if k is None: + k = RETRIEVAL_K return self.query_with_history(question, chat_history=[], k=k) - def query_with_history(self, question, chat_history=None, k=8): + def query_with_history(self, question, chat_history=None, k=None): """Query the RAG with conversation history: rephrase question using history for retrieval, then answer with full conversation + retrieved context in the prompt. Returns dict with 'answer' and 'retrieved' (list of chunks with content, source, page). """ + if k is None: + k = RETRIEVAL_K if self.vectorstore is None: return { "answer": "Error: No documents loaded. 
Please add documents first.", @@ -203,9 +225,6 @@ class LocalRAG: history_str = self._format_history(chat_history) search_query = question - rag_query_instruction = ( - "Do not return a list of references but prioritize meaningful text from abstracts, results and discussion sections." - ) print(f"[RAG] User question: {question!r}") @@ -224,7 +243,7 @@ Standalone question:""" search_query = (rephrase_response.content if hasattr(rephrase_response, "content") else str(rephrase_response)).strip() or question print(f"[RAG] Standalone search query (rephrased): {search_query!r}") - retrieval_query = f"{search_query}\n\n{rag_query_instruction}" + retrieval_query = f"{search_query}\n\n{QUERY_ADDITIONAL_INSTRUCTIONS}" print(f"[RAG] Search query: {search_query!r}") print(f"[RAG] Retrieval query sent to vector store: {retrieval_query!r}") @@ -239,8 +258,9 @@ Standalone question:""" # 3) Answer using conversation history + retrieved context history_block = f"Chat history:\n{history_str}\n\n" if history_str else "" - answer_prompt = f"""You are an assistant for question-answering. Use the chat history (if any) and the retrieved context below to answer the current question. - If you don't know the answer, say so. Keep the conversation coherent. + answer_prompt = ( + ANSWER_PROMPT + + f""" {history_block}Relevant context from documents: @@ -249,6 +269,8 @@ Standalone question:""" Current question: {question} Answer:""" + ) + print(f"[RAG] Composed answer prompt:\n{answer_prompt}") response = self.llm.invoke(answer_prompt) answer = response.content if hasattr(response, "content") else str(response) diff --git a/server.py b/server.py index bdcb0da..c93aee9 100644 --- a/server.py +++ b/server.py @@ -12,7 +12,7 @@ from pydantic import BaseModel from local_rag import LocalRAG OLLAMA_MODEL = "gpt-oss:20b" -OPENAI_MODEL = "gpt-5.2" +OPENAI_MODEL = "gpt-5-mini" VECTORSTORE_PATH = "./vectorstore" # Dual RAG instances for on-the-fly provider switching