# rag_system.py
"""Minimal RAG pipeline: index a PDF into Chroma, answer questions with a Groq LLM.

Flow: load PDF -> chunk -> embed -> upsert into a persistent Chroma collection,
then retrieve top-k chunks for a question and ask the LLM to extract a verbatim
answer from that context.
"""

import os
from typing import Dict, List

import chromadb
from dotenv import load_dotenv
from langchain_groq import ChatGroq

from embeddings import embed_texts
from optimal_chunker import OptimalChunker
from pdf_loader import load_pdf

load_dotenv()


class RAGPipeline:
    """PDF -> chunks -> embeddings -> Chroma, plus retrieval-augmented Q&A."""

    def __init__(self, persist_dir: str = "./chroma_db", collection_name: str = "pdf_docs"):
        """Open (or create) a persistent Chroma collection and build the LLM client.

        Args:
            persist_dir: Directory where Chroma persists the vector store.
            collection_name: Name of the collection to get or create.
        """
        # Vector DB (Chroma 1.x PersistentClient API).
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.col = self.client.get_or_create_collection(name=collection_name)

        # Chunker (strategy is selected by fit_on_text at index time).
        self.chunker = OptimalChunker()

        # LLM (Groq). temperature=0 keeps the extraction deterministic.
        self.llm = ChatGroq(
            model="llama3-8b-8192",
            temperature=0.0,
            api_key=os.getenv("GROQ_API_KEY"),
        )

    def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
        """Load a PDF, chunk it, embed the chunks, and upsert them into Chroma.

        Args:
            pdf_path: Path to the PDF file to index.
            doc_id_prefix: Prefix for the generated chunk ids.
                NOTE(review): ids are f"{prefix}-{i}", so indexing a second
                document with the same prefix overwrites the first document's
                entries — pass a unique prefix per document.

        Returns:
            Dict with the number of chunks indexed and the chunker's chosen
            strategy summary.
        """
        docs = load_pdf(pdf_path)
        text = "\n\n".join(d.page_content for d in docs)

        summary = self.chunker.fit_on_text(text)
        chunks = self.chunker.transform()

        # embeddings: list[list[float]] — the same model MUST be used at
        # query time (see query()), otherwise the vector spaces differ.
        vectors = embed_texts(chunks)

        ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]
        self.col.add(
            ids=ids,
            documents=chunks,
            embeddings=vectors,
            metadatas=[{"source": pdf_path, "chunk": i} for i in range(len(chunks))],
        )
        return {"chunks_indexed": len(chunks), "best_strategy": summary}

    def query(self, question: str, k: int = 4) -> Dict:
        """Retrieve the top-k chunks for `question` and extract a verbatim answer.

        Args:
            question: Natural-language question to answer from the index.
            k: Number of chunks to retrieve as context.

        Returns:
            Dict with the answer, the number of chunks used, and a context preview.
        """
        # BUG FIX: the index was built with our own embed_texts() vectors, but
        # query_texts=[...] would make Chroma embed the question with the
        # collection's *default* embedding function — a different embedding
        # space (and usually a different dimensionality, which Chroma rejects).
        # Embed the question with the same model and pass query_embeddings.
        q_vectors = embed_texts([question])
        results = self.col.query(query_embeddings=q_vectors, n_results=k)

        chunks: List[str] = results["documents"][0] if results.get("documents") else []
        context = "\n\n".join(chunks)

        prompt = f"""You are an extraction assistant. Use ONLY the Context to answer.
Rules:
- If the answer is explicitly present in Context, return that substring EXACTLY.
- Do not paraphrase. Do not add words. Return a verbatim span from Context.
- If the answer is not in Context, reply exactly: I don't know

Question: {question}

Context:
{context}

Answer (verbatim from Context):"""

        resp = self.llm.invoke(prompt)
        answer = resp.content.strip()

        # Fallback if the model still hedges despite having non-empty context:
        # surface the top-ranked chunk rather than an unhelpful "I don't know".
        if (not answer or answer.lower().startswith("i don't know")) and context.strip():
            answer = chunks[0] if chunks else "I don't know"

        return {
            "answer": answer,
            "used_chunks": len(chunks),
            "context_preview": context[:500],
        }


if __name__ == "__main__":
    rag = RAGPipeline()
    info = rag.index_document("sample.pdf")  # ensure sample.pdf exists
    print("Indexed:", info)

    out = rag.query("What text does the PDF contain?")
    print("Answer:", out["answer"])
    print("Used chunks:", out["used_chunks"])