Spaces:
Sleeping
Sleeping
File size: 2,824 Bytes
6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 21206fd 6674899 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import os
import gradio as gr
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceHub
import zipfile
# Extract PDFs from zip file
def extract_pdfs_from_zip(zip_path="data.zip", extract_to="data"):
    """Unpack the archive at ``zip_path`` into the ``extract_to`` directory.

    Args:
        zip_path: Path of the zip archive to unpack.
        extract_to: Destination directory; created (including parents) when
            it is missing.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
    """
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file '{zip_path}' not found.")
    # exist_ok=True replaces a separate exists() check, which is race-prone
    # (the directory could appear between the check and the creation).
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
def load_pdfs(directory="data"):
    """Load every PDF found directly inside ``directory``.

    Only the top level of ``directory`` is scanned; sub-folders are not
    searched.

    Args:
        directory: Folder to scan for PDF files.

    Returns:
        A flat list holding the results of ``PyMuPDFLoader(...).load()`` for
        each PDF, concatenated in sorted file-name order. Empty when the
        folder contains no PDF files.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    if not os.path.exists(directory):
        raise FileNotFoundError(f"The directory '{directory}' does not exist.")
    raw_documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting document order reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        # Compare case-insensitively so files such as "REPORT.PDF" are kept.
        if filename.lower().endswith(".pdf"):
            loader = PyMuPDFLoader(os.path.join(directory, filename))
            raw_documents.extend(loader.load())
    return raw_documents
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split ``documents`` into chunks with ``CharacterTextSplitter``.

    Args:
        documents: Documents to split, passed straight to
            ``CharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 200, the value previously hard-coded here.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for the
        given documents.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
def initialize_qa_system():
    """Build the retrieval QA chain over the PDFs shipped in ``data.zip``.

    Steps, in order: extract the archive, load the PDFs, split them into
    chunks, embed the chunks into a FAISS store, create the LLM and wire both
    into a ``RetrievalQA`` chain. Progress is printed to stdout.

    Returns:
        The ``RetrievalQA`` chain created by ``RetrievalQA.from_chain_type``.

    Raises:
        ValueError: If no PDF documents were loaded, or if splitting them
            produced no chunks to index.
    """
    print("📦 Extracting PDFs from zip...")
    extract_pdfs_from_zip()

    print("📄 Loading PDFs...")
    raw_docs = load_pdfs()
    print(f"✅ Loaded {len(raw_docs)} raw documents.")
    if not raw_docs:
        raise ValueError("No PDF documents found in the 'data' directory.")

    print("🪓 Splitting documents into chunks...")
    docs = split_documents(raw_docs)
    print(f"✅ Split into {len(docs)} chunks.")
    # Fail with a clear message here rather than handing an empty list to the
    # vector-store constructor.
    if not docs:
        raise ValueError("No text chunks could be produced from the PDF documents.")

    print("🧠 Generating embeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    print("📦 Creating FAISS vector store...")
    db = FAISS.from_documents(docs, embeddings)
    print("✅ Vector store created successfully!")

    print("🤖 Initializing LLM...")
    # NOTE(review): HuggingFaceHub may not be exported by langchain_huggingface
    # (its documented LLM class is HuggingFaceEndpoint) -- confirm that the
    # file-level import resolves in the deployed environment.
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )

    # "stuff" chain type with a retriever configured for k=3.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3}),
    )
    return qa
# Initialize the QA system once, at import time. chat_response() reads this
# module-level chain, so it must be assigned before any chat request arrives.
qa_system = initialize_qa_system()
def chat_response(message, history):
    """Answer one chat message using the module-level ``qa_system`` chain.

    Args:
        message: The user's question text.
        history: Earlier conversation turns supplied by the chat UI; not used.

    Returns:
        The ``"result"`` entry of the chain's output, or a short prompt asking
        for a question when ``message`` is empty or only whitespace.
    """
    # Skip the retrieval and LLM round-trip when there is nothing to answer.
    if not message or not message.strip():
        return "Please enter a question."
    # .invoke() replaces calling the chain object directly, which LangChain
    # has deprecated.
    response = qa_system.invoke({"query": message})
    return response["result"]
# Create Gradio interface: a chat UI that routes each message to chat_response.
demo = gr.ChatInterface(
    fn=chat_response,
    title="PDF Knowledge Chatbot",
    description="Ask questions about the content in your PDF documents",
)

# Launch the UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()