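"""PDF Knowledge Chatbot.

Extracts PDFs from data.zip, splits them into chunks, embeds them with a
sentence-transformers MiniLM model, stores the vectors in a FAISS index, and
answers questions with google/flan-t5-xxl via the Hugging Face Hub, served
through a Gradio chat interface.
"""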
import os
import zipfile

import gradio as gr
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub

# Extract PDFs from zip file
def extract_pdfs_from_zip(zip_path="data.zip", extract_to="data"):
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file '{zip_path}' not found.")
    if not os.path.exists(extract_to):
        os.makedirs(extract_to)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Load every PDF in the extracted directory (one Document per page)
def load_pdfs(directory="data"):
    if not os.path.exists(directory):
        raise FileNotFoundError(f"The directory '{directory}' does not exist.")
    raw_documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            loader = PyMuPDFLoader(os.path.join(directory, filename))
            docs = loader.load()
            raw_documents.extend(docs)
    return raw_documents

# Split documents into overlapping chunks for embedding and retrieval
def split_documents(documents):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return text_splitter.split_documents(documents)

def initialize_qa_system():
    print("📦 Extracting PDFs from zip...")
    extract_pdfs_from_zip()

    print("🔄 Loading PDFs...")
    raw_docs = load_pdfs()
    print(f"✅ Loaded {len(raw_docs)} raw documents.")
    if len(raw_docs) == 0:
        raise ValueError("No PDF documents found in the 'data' directory.")

    print("🪓 Splitting documents into chunks...")
    docs = split_documents(raw_docs)
    print(f"✅ Split into {len(docs)} chunks.")

    print("🧠 Generating embeddings...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("📦 Creating FAISS vector store...")
    db = FAISS.from_documents(docs, embeddings)
    print("✅ Vector store created successfully!")

    print("🤖 Initializing LLM...")
    # HuggingFaceHub calls the hosted Inference API and needs a token
    # (HUGGINGFACEHUB_API_TOKEN env var or the huggingfacehub_api_token argument).
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512}
    )

    # "stuff" chain: the k retrieved chunks are inserted into a single prompt
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3})
    )
    return qa
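
# The QA pipeline below is built at import time, so PDF extraction, embedding,
# and FAISS indexing all happen once before the Gradio UI starts serving.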
# Initialize the QA system
qa_system = initialize_qa_system()

def chat_response(message, history):
    # `history` is supplied by gr.ChatInterface but unused; each query is
    # answered independently against the retriever.
    response = qa_system({"query": message})
    return response["result"]

# Create Gradio interface
demo = gr.ChatInterface(
    fn=chat_response,
    title="PDF Knowledge Chatbot",
    description="Ask questions about the content in your PDF documents"
)
if __name__ == "__main__":
    demo.launch()
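
# Likely runtime dependencies (assumed from the imports above, not pinned here):
#   gradio, langchain, langchain-community, faiss-cpu, pymupdf,
#   sentence-transformers, huggingface_hub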