Upload 7 files
- app.py +28 -0
- embedder_light.py +15 -0
- optimal_chunker.py +9 -0
- pdf_loader.py +6 -0
- rag_system.py +43 -0
- requirements.txt +8 -0
- vector_store.py +8 -0
app.py
ADDED
@@ -0,0 +1,28 @@
+import gradio as gr
+from rag_system import RAGPipeline
+
+rag = RAGPipeline()
+
+def chat_with_pdf(pdf_file, question):
+    if pdf_file is None or question.strip() == "":
+        return "Please upload a PDF and enter a question."
+
+    # Index the PDF
+    rag.index_document(pdf_file.name)
+
+    # Query the indexed document
+    return rag.query(question)
+
+interface = gr.Interface(
+    fn=chat_with_pdf,
+    inputs=[
+        gr.File(label="Upload PDF", file_types=[".pdf"]),
+        gr.Textbox(label="Ask a question")
+    ],
+    outputs=gr.Textbox(label="Answer"),
+    title="Chat with your PDF",
+    description="Upload a PDF and ask questions about its content"
+)
+
+if __name__ == "__main__":
+    interface.launch()
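Two assumptions in this wiring worth flagging: `pdf_file.name` presumes Gradio hands the function a tempfile-like object (recent Gradio versions pass a plain filepath string by default), and the PDF is re-indexed on every question. A minimal sketch guarding both, meant as a drop-in variant inside app.py reusing its `rag` instance; the module-level `_last_indexed` cache is my own addition, not part of the upload:

_last_indexed = None  # hypothetical cache of the last indexed path (not in the original)

def chat_with_pdf(pdf_file, question):
    global _last_indexed
    if pdf_file is None or question.strip() == "":
        return "Please upload a PDF and enter a question."
    # Recent Gradio versions pass a filepath string; older ones pass a tempfile object.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    # Index only when the uploaded file changes, not on every question.
    if path != _last_indexed:
        rag.index_document(path)
        _last_indexed = path
    return rag.query(question)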
embedder_light.py
ADDED
@@ -0,0 +1,15 @@
+from transformers import AutoTokenizer, AutoModel
+import torch
+
+def get_embedder():
+    model_name = "microsoft/MiniLM-L12-H384-uncased"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+    return tokenizer, model
+
+def embed_text(texts, tokenizer, model):
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+    with torch.no_grad():
+        model_output = model(**encoded_input)
+    embeddings = model_output.last_hidden_state.mean(dim=1)
+    return embeddings.numpy().tolist()
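One detail of `embed_text`: the mean over `last_hidden_state` includes padding positions, so shorter texts in a padded batch get slightly diluted vectors. A common alternative is attention-mask-weighted pooling; a sketch of that variant (`embed_text_masked` is my own name, not part of the upload):

import torch

def embed_text_masked(texts, tokenizer, model):
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)
    # Zero out padding positions before averaging so pad tokens
    # do not contribute to the sentence embedding.
    mask = encoded["attention_mask"].unsqueeze(-1).float()
    summed = (output.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).numpy().tolist()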
optimal_chunker.py
ADDED
@@ -0,0 +1,9 @@
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+def chunk_documents(docs, chunk_size=500, chunk_overlap=50):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap
+    )
+    chunks = text_splitter.split_documents(docs)
+    return chunks
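With the defaults, each chunk is at most 500 characters and consecutive chunks share 50 characters, so text straddling a boundary survives in both neighbors. A quick sanity check on raw strings, using the splitter's `create_documents` to wrap plain text in `Document` objects:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.create_documents(["lorem ipsum " * 500])  # placeholder text
print([len(d.page_content) for d in docs])  # every length should be <= 500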
pdf_loader.py
ADDED
@@ -0,0 +1,6 @@
+from langchain_community.document_loaders import PyPDFLoader
+
+def load_pdf(file_path):
+    loader = PyPDFLoader(file_path)
+    pages = loader.load()
+    return pages
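`PyPDFLoader.load()` returns one `Document` per page, with the source path and page index in its metadata, which is the shape `chunk_documents` consumes downstream. For example (the path is a placeholder):

pages = load_pdf("example.pdf")     # placeholder path
print(len(pages))                   # one Document per page
print(pages[0].metadata)            # e.g. source path and page number
print(pages[0].page_content[:200])  # first 200 characters of page one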
rag_system.py
ADDED
@@ -0,0 +1,41 @@
+from pdf_loader import load_pdf
+from optimal_chunker import chunk_documents
+from embedder_light import get_embedder, embed_text
+from vector_store import get_chroma_client, create_collection
+
+class RAGPipeline:
+    def __init__(self):
+        self.tokenizer, self.model = get_embedder()
+        self.db_client = get_chroma_client()
+        self.collection = create_collection(self.db_client)
+
+    def index_document(self, pdf_path):
+        print(f"📄 Loading: {pdf_path}")
+        docs = load_pdf(pdf_path)
+
+        print("✂️ Chunking...")
+        chunks = chunk_documents(docs)
+
+        print("🔢 Creating embeddings...")
+        texts = [chunk.page_content for chunk in chunks]
+        vectors = embed_text(texts, self.tokenizer, self.model)
+
+        print("🧠 Adding to ChromaDB...")
+        ids = [f"doc_{i}" for i in range(len(texts))]
+        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
+
+        print(f"✅ Indexed {len(texts)} chunks.")
+
+    def query(self, question):
+        print(f"❓ Question: {question}")
+        question_vec = embed_text([question], self.tokenizer, self.model)[0]
+
+        results = self.collection.query(
+            query_embeddings=[question_vec],
+            n_results=3
+        )
+
+        print("\n📄 Top Documents:")
+        for i, doc in enumerate(results["documents"][0]):
+            print(f"{i+1}. {doc[:200]}...\n")
+        return results["documents"][0][0]
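Note that `query` returns the single closest chunk verbatim; there is no LLM generation step, so this pipeline is retrieval-only. Exercised outside Gradio, it looks like this (the PDF path is a placeholder):

from rag_system import RAGPipeline

rag = RAGPipeline()
rag.index_document("example.pdf")  # placeholder path
answer = rag.query("What is this document about?")
print(answer)  # the top retrieved chunk, not a generated answer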
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+gradio
+langchain-community
+chromadb
+transformers
+torch
+tiktoken
+pypdf
+numpy
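One dependency worth double-checking: optimal_chunker.py imports from `langchain.text_splitter`, and installing `langchain-community` alone does not necessarily pull in the `langchain` package itself, so `langchain` (or the newer `langchain-text-splitters`, with the import adjusted) may need to be listed here as well.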
vector_store.py
ADDED
@@ -0,0 +1,8 @@
+import chromadb
+
+def get_chroma_client():
+    client = chromadb.PersistentClient(path="./chroma_db")
+    return client
+
+def create_collection(client, name="pdf_docs"):
+    return client.get_or_create_collection(name)
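Because the client is persistent, `./chroma_db` survives restarts and `get_or_create_collection` reattaches to the existing `pdf_docs` collection. One consequence for the pipeline above: `index_document` always writes ids `doc_0 ... doc_N`, so indexing a second PDF collides with the first. A sketch of per-file id namespacing (`make_ids` is my own helper, not part of the upload):

import os

def make_ids(pdf_path, n_chunks):
    # Prefix ids with the file stem so chunks from different PDFs
    # get distinct ids in the shared collection.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    return [f"{stem}_{i}" for i in range(n_chunks)]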