# localRAG/local_rag.py

"""
Local RAG setup with LangChain, Ollama, and FAISS
Minimal dependencies, simple code
"""
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import ChatOllama


class LocalRAG:
    """Retrieval-augmented generation over local files.

    Document chunks are embedded with a HuggingFace sentence-transformer
    model, indexed in a FAISS vector store persisted under
    ``vectorstore_path``, and questions are answered by a chat model served
    by Ollama.
    """

    def __init__(
        self,
        vectorstore_path="./vectorstore",
        ollama_model="mistral:7b",
        *,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
        base_url="http://localhost:11434",
        chunk_size=1000,
        chunk_overlap=200,
    ):
        """Initialize local RAG system.

        Args:
            vectorstore_path: Directory the FAISS index is loaded from and
                saved to.
            ollama_model: Name of the Ollama model used to generate answers.
            embedding_model: HuggingFace model name used for embeddings.
            base_url: URL of the Ollama server.
            chunk_size: Maximum size of each text chunk, as passed to the
                splitter.
            chunk_overlap: Overlap between consecutive chunks, as passed to
                the splitter.
        """
        self.vectorstore_path = vectorstore_path
        self.ollama_model = ollama_model

        # Embeddings
        print("Loading embeddings model...")
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        # Ollama LLM
        print(f"Connecting to Ollama (model: {ollama_model})...")
        self.llm = ChatOllama(model=ollama_model, base_url=base_url)

        # Vector store (load if exists, otherwise None)
        self.vectorstore = None
        self._load_vectorstore()

    def _load_vectorstore(self):
        """Load existing vector store if available.

        Sets ``self.vectorstore`` to the loaded store when ``index.faiss``
        exists under ``vectorstore_path`` and loads cleanly; otherwise the
        attribute is left as / reset to ``None``.
        """
        index_file = os.path.join(self.vectorstore_path, "index.faiss")
        if not os.path.exists(index_file):
            return

        try:
            # SECURITY: allow_dangerous_deserialization=True lets the loader
            # unpickle the stored metadata. Only point vectorstore_path at
            # directories this application wrote itself.
            self.vectorstore = FAISS.load_local(
                self.vectorstore_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            print(f"Loaded existing vector store from {self.vectorstore_path}")
        except Exception as e:
            # Best effort: report and continue without a store. Note that a
            # later add_documents() will then build a fresh index and save it
            # to the same path.
            print(f"Could not load vector store: {e}")
            self.vectorstore = None

    def add_documents(self, file_paths):
        """Add documents to the vector store.

        Each existing ``.pdf``, ``.txt`` or ``.md`` file is loaded, split into
        chunks and embedded; the resulting index is merged into the current
        store (or becomes the store) and saved to ``vectorstore_path``.
        Missing, unsupported or unreadable files are skipped with a warning.

        Args:
            file_paths: An iterable of paths, or a single path given as
                ``str`` / ``os.PathLike``.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]
        else:
            file_paths = list(file_paths)

        print(f"\nLoading {len(file_paths)} document(s)...")
        all_docs = []

        for file_path in file_paths:
            path = Path(file_path)
            if not path.exists():
                print(f"Warning: {file_path} not found, skipping")
                continue

            # Pick a loader by file extension
            suffix = path.suffix.lower()
            if suffix == '.pdf':
                loader = PyPDFLoader(str(path))
            elif suffix in ('.txt', '.md'):
                loader = TextLoader(str(path))
            else:
                print(f"Warning: Unsupported file type {path.suffix}, skipping")
                continue

            # One unreadable file should not abort the whole batch; this
            # mirrors the skip-and-warn handling used for the cases above.
            try:
                docs = loader.load()
            except Exception as e:
                print(f"Warning: could not load {file_path}: {e}, skipping")
                continue

            chunks = self.text_splitter.split_documents(docs)
            all_docs.extend(chunks)
            print(f"  - {path.name}: {len(chunks)} chunks")

        if not all_docs:
            print("No documents loaded!")
            return

        # Create or update vector store
        print(f"\nCreating embeddings for {len(all_docs)} chunks...")
        new_store = FAISS.from_documents(all_docs, self.embeddings)
        if self.vectorstore is None:
            self.vectorstore = new_store
        else:
            self.vectorstore.merge_from(new_store)

        # Save
        os.makedirs(self.vectorstore_path, exist_ok=True)
        self.vectorstore.save_local(self.vectorstore_path)
        print(f"Vector store saved to {self.vectorstore_path}")

    def list_documents(self):
        """List all documents in the vector store.

        Prints a summary and returns one dict per unique ``source`` metadata
        value, sorted by source. Each dict has the keys ``'source'``,
        ``'chunks'`` (number of chunks with that source) and ``'page'`` (the
        ``page`` metadata of the first chunk seen for that source, or
        ``None``).

        Returns:
            The list of per-source dicts; empty when no store is loaded.
        """
        if self.vectorstore is None:
            print("No documents in vector store.")
            return []

        # Walk the index -> docstore mapping directly instead of running a
        # similarity search with a huge k: no embedding call is needed and
        # the result is not capped at an arbitrary number of chunks.
        store = self.vectorstore
        documents = {}
        for doc_id in store.index_to_docstore_id.values():
            doc = store.docstore.search(doc_id)
            metadata = getattr(doc, "metadata", None)
            if metadata is None:
                # The lookup did not yield a document object; nothing to count.
                continue
            source = metadata.get('source', 'Unknown')
            entry = documents.setdefault(
                source,
                {'source': source, 'chunks': 0, 'page': metadata.get('page', None)},
            )
            entry['chunks'] += 1

        doc_list = sorted(documents.values(), key=lambda x: x['source'])

        print(f"\nDocuments in vector store ({len(doc_list)} unique documents):")
        print("-" * 60)
        for doc_info in doc_list:
            print(f"  - {doc_info['source']}")
            print(f"    Chunks: {doc_info['chunks']}")
            if doc_info['page'] is not None:
                print(f"    Page: {doc_info['page']}")

        return doc_list

    def query(self, question, k=4):
        """Query the RAG system.

        Retrieves the ``k`` chunks most similar to ``question``, passes their
        text to the LLM as context and returns the generated answer.

        Args:
            question: The question to answer.
            k: Number of chunks to retrieve from the vector store.

        Returns:
            A dict with ``'answer'`` (the LLM's reply, or an error message
            when no vector store is loaded) and ``'retrieved'`` (a list of
            dicts with ``'content'``, ``'source'`` and ``'page'`` for each
            retrieved chunk).
        """
        if self.vectorstore is None:
            return {
                "answer": "Error: No documents loaded. Please add documents first.",
                "retrieved": [],
            }

        docs = self.vectorstore.similarity_search(question, k=k)
        retrieved = [
            {
                "content": doc.page_content,
                "source": doc.metadata.get("source", ""),
                "page": doc.metadata.get("page"),
            }
            for doc in docs
        ]

        # Combine context from documents
        context = "\n\n".join(chunk["content"] for chunk in retrieved)

        prompt = f"""Use the following context to answer the question.
If you don't know the answer, say that you don't know instead of making up an answer.

Context:
{context}

Question: {question}

Answer:"""

        response = self.llm.invoke(prompt)
        # Fall back to str() for responses that do not expose `.content`.
        answer = response.content if hasattr(response, "content") else str(response)

        return {"answer": answer, "retrieved": retrieved}


def main():
    """Run a small demo: build the RAG system, list indexed files, ask one question."""
    banner = "=" * 60
    print(banner)
    print("Local RAG with LangChain, Ollama, and FAISS")
    print(banner)

    # Set up the pipeline with the default vector store location
    rag = LocalRAG(ollama_model="mistral:7b")

    # To index files, uncomment the call below and fill in your own paths:
    # rag.add_documents([
    #     "data/dok1.pdf",
    #     "data/dok2.pdf",
    #     "data/dok3.pdf"
    # ])

    # Show what is currently indexed
    rag.list_documents()

    # Ask a sample question and report the outcome
    question = "What do the documents say about modality for perceived message perception?"
    result = rag.query(question)
    print(f"\nQuestion: {question}")
    answer = result['answer']
    print(f"Answer: {answer}")
    retrieved = result.get("retrieved")
    if retrieved:
        print(f"Retrieved {len(retrieved)} chunks")


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()