HamidOmarov committed on
Commit
e02136d
·
verified ·
1 Parent(s): e218376

Upload 7 files

Browse files
Files changed (7) hide show
  1. app.py +28 -0
  2. embedder_light.py +15 -0
  3. optimal_chunker.py +9 -0
  4. pdf_loader.py +6 -0
  5. rag_system.py +43 -0
  6. requirements.txt +8 -0
  7. vector_store.py +8 -0
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from rag_system import RAGPipeline
3
+
4
+ rag = RAGPipeline()  # Module-level pipeline instance, built once at import; chat_with_pdf() calls index_document() and query() on it.
5
+
6
def chat_with_pdf(pdf_file, question):
    """Answer ``question`` using the uploaded PDF.

    Args:
        pdf_file: Value of the "Upload PDF" input. Either an object that
            exposes the on-disk path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.
        question: Text of the "Ask a question" input. ``None`` and
            blank/whitespace-only values are treated as "no question".

    Returns:
        The string returned by ``rag.query(question)``, or a prompt asking
        for both inputs when either one is missing.
    """
    # ``question or ""`` keeps a None question from raising AttributeError.
    if pdf_file is None or not (question or "").strip():
        return "Please upload a PDF and enter a question."

    # Accept both file-like uploads (path stored in ``.name``) and bare
    # path strings, so the handler works whichever form the UI passes in.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Index the PDF, then query the indexed document.
    rag.index_document(pdf_path)
    return rag.query(question)
15
+
16
# Gradio UI: a PDF upload and a question box feed chat_with_pdf; the
# returned string is shown in the "Answer" textbox.
interface = gr.Interface(
    title="Chat with your PDF",
    description="Upload a PDF and ask questions about its content",
    fn=chat_with_pdf,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="Ask a question"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
embedder_light.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+
4
def get_embedder(model_name="microsoft/MiniLM-L12-H384-uncased"):
    """Load the tokenizer and model used to embed text.

    Args:
        model_name: Identifier passed to both ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``. Defaults to the checkpoint
            that was previously hard-coded, so zero-argument calls behave
            exactly as before.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model
9
+
10
def embed_text(texts, tokenizer, model, batch_size=32):
    """Embed texts by mean-pooling the model's last hidden state.

    Padding positions are excluded from the mean (weighted by the
    tokenizer's ``attention_mask``), so a text gets the same vector whether
    it is embedded alone or padded alongside longer texts in a batch.

    Args:
        texts: A list of strings, or a single string (treated as a
            one-element list).
        tokenizer: Callable invoked as ``tokenizer(batch, padding=True,
            truncation=True, return_tensors="pt")``; its result must
            provide ``"attention_mask"`` and be usable as ``model(**result)``.
        model: Callable whose result exposes ``last_hidden_state``.
        batch_size: Maximum number of texts per forward pass, which bounds
            peak memory when many chunks are embedded.

    Returns:
        A list with one embedding (list of floats) per input text, in input
        order; ``[]`` when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string must not be sliced into characters by the batching below.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded_input = tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            )
            hidden = model(**encoded_input).last_hidden_state

            # Zero out padded positions, then divide by the number of real
            # tokens instead of the padded sequence length.
            mask = encoded_input["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1)

            embeddings.extend((summed / token_counts).tolist())
    return embeddings
optimal_chunker.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+
3
def chunk_documents(docs, chunk_size=500, chunk_overlap=50):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
pdf_loader.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+
3
def load_pdf(file_path):
    """Load ``file_path`` with ``PyPDFLoader`` and return what ``load()`` yields."""
    return PyPDFLoader(file_path).load()
rag_system.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdf_loader import load_pdf
2
+ from optimal_chunker import chunk_documents
3
+ from embedder_light import get_embedder, embed_text
4
+ from vector_store import get_chroma_client, create_collection
5
+
6
class RAGPipeline:
    """Index PDFs into a Chroma collection and retrieve chunks for a question.

    Built from the module's imported helpers: ``load_pdf``,
    ``chunk_documents``, ``get_embedder``/``embed_text`` and
    ``get_chroma_client``/``create_collection``.
    """

    def __init__(self):
        """Load the embedder and open the vector-store collection."""
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)
        # Content fingerprint of the most recently indexed PDF. None until
        # index_document() has run; query() uses it to scope its search.
        self._active_doc = None

    @staticmethod
    def _fingerprint(pdf_path):
        """Return a short SHA-256 hex digest of the file's bytes."""
        import hashlib  # local import: only needed here

        digest = hashlib.sha256()
        with open(pdf_path, "rb") as handle:
            for block in iter(lambda: handle.read(1 << 20), b""):
                digest.update(block)
        return digest.hexdigest()[:16]

    def index_document(self, pdf_path):
        """Load, chunk, embed and store the PDF at ``pdf_path``.

        Chunk ids are derived from a hash of the file's contents, so chunks
        of different PDFs never share an id, and indexing the same file
        again is skipped instead of re-embedding it. The indexed PDF
        becomes the one that ``query`` searches.

        Args:
            pdf_path: Filesystem path of the PDF to index.
        """
        print(f"📄 Loading: {pdf_path}")
        docs = load_pdf(pdf_path)

        doc_key = self._fingerprint(pdf_path)
        self._active_doc = doc_key

        # Chunk 0 is written together with all other chunks in one upsert,
        # so its presence means this exact file is already stored.
        if self.collection.get(ids=[f"{doc_key}_0"])["ids"]:
            print("✅ Already indexed; reusing stored chunks.")
            return

        print("✂️ Chunking...")
        chunks = chunk_documents(docs)

        print("🔒 Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        if not texts:
            # Nothing to embed or store (e.g. no extractable text).
            print(f"✅ Indexed {len(texts)} chunks.")
            return
        vectors = embed_text(texts, self.tokenizer, self.model)

        print("🧠 Adding to ChromaDB...")
        ids = [f"{doc_key}_{i}" for i, _ in enumerate(texts)]
        metadatas = [{"doc": doc_key} for _ in texts]
        self.collection.upsert(
            documents=texts,
            embeddings=vectors,
            ids=ids,
            metadatas=metadatas,
        )

        print(f"✅ Indexed {len(texts)} chunks.")

    def query(self, question):
        """Return the stored chunk closest to ``question``.

        When a PDF has been indexed through this instance, the search is
        limited to that PDF's chunks; otherwise the whole collection is
        searched.

        Args:
            question: The user's question text.

        Returns:
            The text of the best-matching chunk, or a short message when
            the search returns no chunks.
        """
        print(f"❓ Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]

        search = {"query_embeddings": [question_vec], "n_results": 3}
        if self._active_doc is not None:
            search["where"] = {"doc": self._active_doc}
        results = self.collection.query(**search)

        # One inner list per query embedding; it is empty when nothing matched.
        hits = (results.get("documents") or [[]])[0]
        if not hits:
            return "No relevant content was found in the indexed document."

        print("\n🔍 Top Documents:")
        for i, doc in enumerate(hits):
            print(f"{i+1}. {doc[:200]}...\n")
        return hits[0]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain-community
3
+ chromadb
4
+ transformers
5
+ torch
6
+ tiktoken
7
+ pypdf
8
+ numpy
vector_store.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+
3
def get_chroma_client(path="./chroma_db"):
    """Create a persistent Chroma client.

    Args:
        path: Directory passed to ``chromadb.PersistentClient``. Defaults
            to the previously hard-coded ``./chroma_db``, so zero-argument
            calls behave exactly as before.

    Returns:
        The ``chromadb.PersistentClient`` instance.
    """
    return chromadb.PersistentClient(path=path)
6
+
7
def create_collection(client, name="pdf_docs"):
    """Return ``client.get_or_create_collection(name)``.

    Args:
        client: Object providing ``get_or_create_collection``.
        name: Collection name; defaults to ``"pdf_docs"``.
    """
    collection = client.get_or_create_collection(name)
    return collection