File size: 2,909 Bytes
6674899
21206fd
6674899
8a6b9ad
6674899
 
 
8a6b9ad
6674899
21206fd
8a6b9ad
 
6674899
 
 
 
 
 
 
 
 
 
21206fd
6674899
 
 
 
 
 
 
 
 
 
 
21206fd
6674899
 
 
21206fd
6674899
 
 
 
 
 
 
21206fd
6674899
 
21206fd
6674899
 
 
21206fd
6674899
 
21206fd
6674899
 
 
21206fd
6674899
 
 
 
 
 
 
 
 
 
 
 
21206fd
6674899
 
 
 
 
 
 
 
21206fd
6674899
 
 
21206fd
 
 
6674899
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import gradio as gr
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub  # Updated import path
import zipfile

# Helpers for unpacking, loading, splitting, and indexing the PDF corpus.

# Extract PDFs from zip file
def extract_pdfs_from_zip(zip_path="data.zip", extract_to="data"):
    """Unpack the archive at *zip_path* into the *extract_to* directory.

    Every member of the archive is extracted, not only PDF files.

    Args:
        zip_path: Path of the zip archive to unpack.
        extract_to: Destination directory; created (including parents) when
            it does not exist yet.

    Raises:
        FileNotFoundError: If *zip_path* does not exist.
    """
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file '{zip_path}' not found.")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(extract_to)

def load_pdfs(directory="data"):
    """Load every PDF file located directly inside *directory*.

    Only the top level of *directory* is scanned; subdirectories are not
    visited. The extension match is case-insensitive, so ``report.PDF`` is
    picked up as well as ``report.pdf``.

    Args:
        directory: Folder to scan for PDF files.

    Returns:
        A flat list with the documents produced by ``PyMuPDFLoader.load()``
        for each file, in file-name order. Empty when no PDF is present.

    Raises:
        FileNotFoundError: If *directory* does not exist.
    """
    if not os.path.exists(directory):
        raise FileNotFoundError(f"The directory '{directory}' does not exist.")

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting document order reproducible between runs and machines.
    pdf_names = sorted(
        name for name in os.listdir(directory) if name.lower().endswith(".pdf")
    )

    raw_documents = []
    for name in pdf_names:
        loader = PyMuPDFLoader(os.path.join(directory, name))
        raw_documents.extend(loader.load())
    return raw_documents

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split *documents* into smaller chunks with ``CharacterTextSplitter``.

    Args:
        documents: Documents to split, as returned by a document loader.
        chunk_size: Value forwarded to the splitter's ``chunk_size`` option.
        chunk_overlap: Value forwarded to the splitter's ``chunk_overlap``
            option.

    Returns:
        The list of chunked documents produced by
        ``CharacterTextSplitter.split_documents``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

def initialize_qa_system():
    """Build the retrieval question-answering chain over the bundled PDFs.

    Steps, in order: unpack the default zip archive, load the PDFs from the
    default data directory, split them into chunks, embed the chunks, index
    them in an in-memory FAISS store, and wire a ``RetrievalQA`` chain that
    feeds the three closest chunks to the LLM. Progress is printed to stdout.

    Returns:
        The ``RetrievalQA`` chain created by ``RetrievalQA.from_chain_type``.

    Raises:
        FileNotFoundError: Propagated from the extract/load helpers when the
            archive or the data directory is missing.
        ValueError: If no PDF documents were loaded.
    """
    print("📦 Extracting PDFs from zip...")
    extract_pdfs_from_zip()

    print("🔄 Loading PDFs...")
    raw_docs = load_pdfs()
    print(f"✅ Loaded {len(raw_docs)} raw documents.")

    # Fail early: an empty corpus cannot be embedded or indexed meaningfully.
    if not raw_docs:
        raise ValueError("No PDF documents found in the 'data' directory.")

    print("🪓 Splitting documents into chunks...")
    docs = split_documents(raw_docs)
    print(f"✅ Split into {len(docs)} chunks.")

    print("🧠 Generating embeddings...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("📦 Creating FAISS vector store...")
    db = FAISS.from_documents(docs, embeddings)
    print("✅ Vector store created successfully!")

    print("🤖 Initializing LLM...")
    # NOTE(review): HuggingFaceHub presumably needs an API token supplied via
    # the environment -- confirm against the deployment configuration.
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512}
    )

    # "stuff" places all retrieved chunks into a single prompt; k=3 limits
    # the retriever to three chunks per question.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3})
    )
    return qa

# Initialize the QA system
# Built eagerly at module level: importing this file runs the whole
# extract/load/split/embed pipeline, and chat_response relies on the result.
qa_system = initialize_qa_system()

def chat_response(message, history):
    """Answer one chat message using the module-level QA chain.

    Args:
        message: The user's latest message text.
        history: Earlier conversation turns passed in by the chat UI. It is
            not used, so every question is answered independently.

    Returns:
        The text stored under the chain output's ``"result"`` key, or a short
        prompt asking for input when *message* is empty or only whitespace.
    """
    # Skip retrieval and the LLM call entirely for blank submissions.
    if not message or not message.strip():
        return "Please enter a question."

    # invoke() replaces calling the chain object directly, which LangChain
    # has deprecated.
    response = qa_system.invoke({"query": message})
    return response["result"]

# Create Gradio interface
# chat_response is registered as the callback that produces each reply.
demo = gr.ChatInterface(
    fn=chat_response,
    title="PDF Knowledge Chatbot",
    description="Ask questions about the content in your PDF documents"
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()