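"""PDF Knowledge Chatbot.

A Gradio chat interface that answers questions about PDF documents using a
FAISS vector store and a LangChain RetrievalQA chain.
"""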
import os
import zipfile

import gradio as gr
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub  # Updated import path
from langchain_community.vectorstores import FAISS
# Extract PDFs from zip file
def extract_pdfs_from_zip(zip_path="data.zip", extract_to="data"):
    """Unpack the archive at ``zip_path`` into the ``extract_to`` directory.

    Args:
        zip_path: Path of the zip archive to unpack.
        extract_to: Directory that receives the archive contents. It is
            created, including missing parents, when it does not exist.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
    """
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file '{zip_path}' not found.")
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and makes repeated calls safe.
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
def load_pdfs(directory="data"):
    """Load every PDF found directly inside ``directory``.

    Files are selected by a case-insensitive ``.pdf`` suffix and visited in
    sorted name order so the resulting document order is reproducible.

    Args:
        directory: Folder to scan; sub-folders are not descended into.

    Returns:
        A flat list holding whatever ``PyMuPDFLoader(...).load()`` returned
        for each PDF, in file-name order. Empty when no PDF is present.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    if not os.path.exists(directory):
        raise FileNotFoundError(f"The directory '{directory}' does not exist.")
    raw_documents = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory)):
        # Lower-case the name so ".PDF" / ".Pdf" files are not skipped.
        if filename.lower().endswith(".pdf"):
            loader = PyMuPDFLoader(os.path.join(directory, filename))
            raw_documents.extend(loader.load())
    return raw_documents
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into smaller chunks.

    Args:
        documents: Documents to split, as returned by the PDF loader.
        chunk_size: Forwarded to ``CharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``CharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``CharacterTextSplitter.split_documents(documents)``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
def initialize_qa_system():
    """Build the retrieval QA chain over the PDFs packaged in the zip archive.

    Steps, in order: extract the archive, load the PDFs, split them into
    chunks, embed the chunks into a FAISS vector store, and create a
    ``RetrievalQA`` chain of type "stuff" whose retriever is configured with
    ``k=3`` and whose LLM is ``google/flan-t5-xxl`` via ``HuggingFaceHub``.

    Returns:
        The configured ``RetrievalQA`` chain.

    Raises:
        FileNotFoundError: Propagated from the extraction/loading helpers
            when the archive or the data directory is missing.
        ValueError: If no PDF documents were loaded.
    """
    # Progress messages are plain ASCII so they print correctly regardless
    # of the console encoding.
    print("Extracting PDFs from zip...")
    extract_pdfs_from_zip()

    print("Loading PDFs...")
    raw_docs = load_pdfs()
    print(f"Loaded {len(raw_docs)} raw documents.")
    if not raw_docs:
        raise ValueError("No PDF documents found in the 'data' directory.")

    print("Splitting documents into chunks...")
    docs = split_documents(raw_docs)
    print(f"Split into {len(docs)} chunks.")

    print("Generating embeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    print("Creating FAISS vector store...")
    db = FAISS.from_documents(docs, embeddings)
    print("Vector store created successfully!")

    print("Initializing LLM...")
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3}),
    )
    return qa
# Initialize the QA system once, at import time; chat_response() reads this
# module-level chain for every incoming message.
qa_system = initialize_qa_system()
def chat_response(message, history):
    """Answer one chat message using the module-level QA chain.

    Args:
        message: The user's latest message; sent to the chain as the query.
        history: Conversation history passed by the chat interface. It is
            not used: each question is answered on its own.

    Returns:
        The ``"result"`` entry of the chain's output.
    """
    # Calling a chain directly (chain({...})) is deprecated in the LangChain
    # releases that provide langchain_community; invoke() is the supported
    # entry point and returns the same output mapping.
    response = qa_system.invoke({"query": message})
    return response["result"]
# Create Gradio interface: a chat UI whose replies come from chat_response().
demo = gr.ChatInterface(
    fn=chat_response,
    title="PDF Knowledge Chatbot",
    description="Ask questions about the content in your PDF documents",
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()