Upload 7 files
- app.py +28 -0
- embedder_light.py +15 -0
- optimal_chunker.py +9 -0
- pdf_loader.py +6 -0
- rag_system.py +43 -0
- requirements.txt +8 -0
- vector_store.py +8 -0
app.py
ADDED
@@ -0,0 +1,28 @@
+import gradio as gr
+from rag_system import RAGPipeline
+
+rag = RAGPipeline()
+
+def chat_with_pdf(pdf_file, question):
+    if pdf_file is None or question.strip() == "":
+        return "Please upload a PDF and enter a question."
+
+    # Index the PDF
+    rag.index_document(pdf_file.name)
+
+    # Query the indexed document
+    return rag.query(question)
+
+interface = gr.Interface(
+    fn=chat_with_pdf,
+    inputs=[
+        gr.File(label="Upload PDF", file_types=[".pdf"]),
+        gr.Textbox(label="Ask a question")
+    ],
+    outputs=gr.Textbox(label="Answer"),
+    title="Chat with your PDF",
+    description="Upload a PDF and ask questions about its content"
+)
+
+if __name__ == "__main__":
+    interface.launch()
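Two assumptions in this wiring worth flagging: `pdf_file.name` presumes Gradio hands the function a tempfile-like object (recent Gradio versions pass a plain filepath string by default), and the PDF is re-indexed on every question. A minimal sketch guarding both, meant as a drop-in variant inside app.py reusing its `rag` instance; the module-level `_last_indexed` cache is my own addition, not part of the upload:

_last_indexed = None  # hypothetical cache of the last indexed path (not in the original)

def chat_with_pdf(pdf_file, question):
    global _last_indexed
    if pdf_file is None or question.strip() == "":
        return "Please upload a PDF and enter a question."
    # Recent Gradio versions pass a filepath string; older ones pass a tempfile object.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    # Index only when the uploaded file changes, not on every question.
    if path != _last_indexed:
        rag.index_document(path)
        _last_indexed = path
    return rag.query(question)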
embedder_light.py
ADDED
@@ -0,0 +1,15 @@
+from transformers import AutoTokenizer, AutoModel
+import torch
+
+def get_embedder():
+    model_name = "microsoft/MiniLM-L12-H384-uncased"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+    return tokenizer, model
+
+def embed_text(texts, tokenizer, model):
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+    with torch.no_grad():
+        model_output = model(**encoded_input)
+    embeddings = model_output.last_hidden_state.mean(dim=1)
+    return embeddings.numpy().tolist()
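One detail of `embed_text`: the mean over `last_hidden_state` includes padding positions, so shorter texts in a padded batch get slightly diluted vectors. A common alternative is attention-mask-weighted pooling; a sketch of that variant (`embed_text_masked` is my own name, not part of the upload):

import torch

def embed_text_masked(texts, tokenizer, model):
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)
    # Zero out padding positions before averaging so pad tokens
    # do not contribute to the sentence embedding.
    mask = encoded["attention_mask"].unsqueeze(-1).float()
    summed = (output.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).numpy().tolist()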
optimal_chunker.py
ADDED
@@ -0,0 +1,9 @@
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+def chunk_documents(docs, chunk_size=500, chunk_overlap=50):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap
+    )
+    chunks = text_splitter.split_documents(docs)
+    return chunks
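With the defaults, each chunk is at most 500 characters and consecutive chunks share 50 characters, so text straddling a boundary survives in both neighbors. A quick sanity check on raw strings, using the splitter's `create_documents` to wrap plain text in `Document` objects:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.create_documents(["lorem ipsum " * 500])  # placeholder text
print([len(d.page_content) for d in docs])  # every length should be <= 500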
pdf_loader.py
ADDED
@@ -0,0 +1,6 @@
+from langchain_community.document_loaders import PyPDFLoader
+
+def load_pdf(file_path):
+    loader = PyPDFLoader(file_path)
+    pages = loader.load()
+    return pages
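`PyPDFLoader.load()` returns one `Document` per page, with the source path and page index in its metadata, which is the shape `chunk_documents` consumes downstream. For example (the path is a placeholder):

pages = load_pdf("example.pdf")     # placeholder path
print(len(pages))                   # one Document per page
print(pages[0].metadata)            # e.g. source path and page number
print(pages[0].page_content[:200])  # first 200 characters of page one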
rag_system.py
ADDED
@@ -0,0 +1,41 @@
+from pdf_loader import load_pdf
+from optimal_chunker import chunk_documents
+from embedder_light import get_embedder, embed_text
+from vector_store import get_chroma_client, create_collection
+
+class RAGPipeline:
+    def __init__(self):
+        self.tokenizer, self.model = get_embedder()
+        self.db_client = get_chroma_client()
+        self.collection = create_collection(self.db_client)
+
+    def index_document(self, pdf_path):
+        print(f"📄 Loading: {pdf_path}")
+        docs = load_pdf(pdf_path)
+
+        print("✂️ Chunking...")
+        chunks = chunk_documents(docs)
+
+        print("🔢 Creating embeddings...")
+        texts = [chunk.page_content for chunk in chunks]
+        vectors = embed_text(texts, self.tokenizer, self.model)
+
+        print("🧠 Adding to ChromaDB...")
+        ids = [f"doc_{i}" for i in range(len(texts))]
+        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
+
+        print(f"✅ Indexed {len(texts)} chunks.")
+
+    def query(self, question):
+        print(f"❓ Question: {question}")
+        question_vec = embed_text([question], self.tokenizer, self.model)[0]
+
+        results = self.collection.query(
+            query_embeddings=[question_vec],
+            n_results=3
+        )
+
+        print("\n📄 Top Documents:")
+        for i, doc in enumerate(results["documents"][0]):
+            print(f"{i+1}. {doc[:200]}...\n")
+        return results["documents"][0][0]
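Note that `query` returns the single closest chunk verbatim; there is no LLM generation step, so this pipeline is retrieval-only. Exercised outside Gradio, it looks like this (the PDF path is a placeholder):

from rag_system import RAGPipeline

rag = RAGPipeline()
rag.index_document("example.pdf")  # placeholder path
answer = rag.query("What is this document about?")
print(answer)  # the top retrieved chunk, not a generated answer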
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+gradio
+langchain-community
+chromadb
+transformers
+torch
+tiktoken
+pypdf
+numpy
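One dependency worth double-checking: optimal_chunker.py imports from `langchain.text_splitter`, and installing `langchain-community` alone does not necessarily pull in the `langchain` package itself, so `langchain` (or the newer `langchain-text-splitters`, with the import adjusted) may need to be listed here as well.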
vector_store.py
ADDED
@@ -0,0 +1,8 @@
+import chromadb
+
+def get_chroma_client():
+    client = chromadb.PersistentClient(path="./chroma_db")
+    return client
+
+def create_collection(client, name="pdf_docs"):
+    return client.get_or_create_collection(name)
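Because the client is persistent, `./chroma_db` survives restarts and `get_or_create_collection` reattaches to the existing `pdf_docs` collection. One consequence for the pipeline above: `index_document` always writes ids `doc_0 ... doc_N`, so indexing a second PDF collides with the first. A sketch of per-file id namespacing (`make_ids` is my own helper, not part of the upload):

import os

def make_ids(pdf_path, n_chunks):
    # Prefix ids with the file stem so chunks from different PDFs
    # get distinct ids in the shared collection.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    return [f"{stem}_{i}" for i in range(n_chunks)]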