File size: 4,826 Bytes
7b666bb
 
 
9d72b0b
7b666bb
 
 
 
 
9d72b0b
7b666bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d72b0b
7b666bb
9d72b0b
 
 
 
 
 
 
 
7b666bb
 
 
9d72b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b666bb
 
 
9d72b0b
 
 
 
 
 
 
7b666bb
 
 
 
 
 
 
 
 
 
 
 
 
 
9d72b0b
 
 
 
7b666bb
9d72b0b
 
 
 
 
 
 
 
 
 
 
7b666bb
 
9d72b0b
7b666bb
 
 
 
 
 
 
9d72b0b
7b666bb
 
9d72b0b
7b666bb
 
 
 
 
9d72b0b
7b666bb
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings  # Updated import
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
import torch
from transformers import pipeline
from langdetect import detect

# Load a smaller LLM (e.g., Zephyr-7B or Mistral-7B)
def load_llm():
    """Return the text-generation LLM, loading it at most once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    building the pipeline directly would reload the 7B checkpoint on each
    rerun. The build is therefore wrapped in ``st.cache_resource`` and later
    calls reuse the already-loaded object.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``transformers`` text-generation
        pipeline for ``HuggingFaceH4/zephyr-7b-alpha`` (float16,
        ``device_map="auto"``).
    """

    # Defined inside the function so ``load_llm`` itself stays a plain callable.
    # The builder's module, qualified name and source are the same on every
    # call, which is what Streamlit derives the cache key from, so every call
    # and every rerun maps to the same cached entry.
    @st.cache_resource
    def _build_llm():
        model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
        pipe = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        return HuggingFacePipeline(pipeline=pipe)

    return _build_llm()

# Extract text from PDF
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Whatever ``PdfReader`` accepts; in this app either an uploaded
            file object or a filesystem path string.

    Returns:
        The text of all pages joined in page order with no separator. Pages
        that yield no text contribute nothing, so the result is ``""`` when
        nothing could be extracted.
    """
    reader = PdfReader(file)
    # Defensive: treat a falsy extraction result (None or "") as "no text" so
    # a single image-only page cannot raise TypeError during concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The full document text.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

# Create embeddings and vector store
def create_vector_store(chunks, indexing_method="multi-representation"):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Args:
        chunks: Non-empty list of text chunks to index.
        indexing_method: One of ``"multi-representation"``, ``"raptors"`` or
            ``"colbert"``.

    Returns:
        A ``FAISS`` vector store built from ``chunks`` with
        ``all-MiniLM-L6-v2`` embeddings.

    Raises:
        ValueError: If ``indexing_method`` is not supported or ``chunks`` is
            empty.
    """
    supported_methods = ("multi-representation", "raptors", "colbert")
    # Validate before loading the embedding model: an unknown method used to
    # fall through every branch and fail with UnboundLocalError at the return.
    if indexing_method not in supported_methods:
        raise ValueError(
            f"Unsupported indexing method {indexing_method!r}; "
            f"expected one of {', '.join(supported_methods)}."
        )
    if not chunks:
        raise ValueError("Cannot build a vector store from an empty list of chunks.")

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    # TODO: "raptors" (hierarchical chunking) and "colbert" (contextualized
    # embeddings) are not implemented yet; every method currently builds the
    # same flat FAISS index.
    return FAISS.from_texts(chunks, embeddings)

# Query the PDF
def query_pdf(vector_store, query, llm, query_method="multi-query"):
    """Answer ``query`` from the indexed PDF with a retrieval QA chain.

    Args:
        vector_store: Vector store exposing ``as_retriever()``.
        query: The user's question.
        llm: Language model passed to ``RetrievalQA.from_chain_type``.
        query_method: One of ``"multi-query"``, ``"rag-fusion"``,
            ``"decomposition"``, ``"step-back"`` or ``"hyde"``.

    Returns:
        The value returned by the chain's ``run(query)`` call, i.e. the
        answer text.

    Raises:
        ValueError: If ``query_method`` is not supported.
    """
    supported_methods = ("multi-query", "rag-fusion", "decomposition", "step-back", "hyde")
    # An unknown method used to fall through every branch and fail with
    # UnboundLocalError on ``qa``; reject it explicitly instead.
    if query_method not in supported_methods:
        raise ValueError(
            f"Unsupported query method {query_method!r}; "
            f"expected one of {', '.join(supported_methods)}."
        )

    # TODO: the query-translation strategies are not implemented yet; every
    # method currently runs the same "stuff" chain over the default retriever.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return qa.run(query)

# Detect language of the text
def detect_language(text):
    """Best-effort language detection for ``text``.

    Args:
        text: The text to classify; may be empty or ``None``.

    Returns:
        The code returned by ``detect``, or ``"en"`` when the text is empty
        or detection fails.
    """
    # Nothing to analyse: skip the detector and use the default.
    if not text or not text.strip():
        return "en"
    try:
        return detect(text)
    except Exception:
        # Deliberately broad so detection never breaks the app, but unlike a
        # bare ``except`` it lets KeyboardInterrupt/SystemExit propagate.
        return "en"  # Default to English if detection fails

# Streamlit App
def main():
    """Render the Streamlit "Chat with PDF" page.

    Flow: pick a PDF (uploaded, or ``default.pdf`` as a fallback), extract and
    chunk its text, index the chunks, then answer the user's question and
    show the passages retrieved for it. Stops early with an on-page error
    when no PDF is available or the PDF has no extractable text.
    """
    st.title("Chat with PDF")
    st.write("Upload a PDF and ask questions about it!")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        st.info("Using default PDF.")
        uploaded_file = "default.pdf"  # Add a default PDF

    # Extract text; a missing default file must not crash the whole page.
    try:
        text = extract_text_from_pdf(uploaded_file)
    except FileNotFoundError:
        st.error("No PDF was uploaded and the default PDF (default.pdf) was not found.")
        return

    # Scanned/image-only PDFs yield no text, and an empty index cannot be built.
    if not text.strip():
        st.error("No extractable text was found in this PDF.")
        return

    # Detect language
    language = detect_language(text)
    st.write(f"Detected Language: {language}")

    # Split text into chunks
    chunk_size = st.slider("Chunk Size", 500, 2000, 1000)
    chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200)
    chunks = split_text(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Indexing options
    indexing_method = st.selectbox(
        "Indexing Method",
        ["multi-representation", "raptors", "colbert"],
        help="Choose how to index the PDF text."
    )
    st.write(f"**Indexing Method:** {indexing_method}")

    # Create vector store
    # NOTE(review): this re-embeds the document on every rerun; consider
    # caching it keyed on the file and chunking parameters.
    vector_store = create_vector_store(chunks, indexing_method=indexing_method)

    # Query translation options
    query_method = st.selectbox(
        "Query Translation Method",
        ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
        help="Choose a method to improve query retrieval."
    )
    st.write(f"**Query Translation Method:** {query_method}")

    # User input
    query = st.text_input("Ask a question about the PDF:")
    if query:
        # Load the LLM only once there is a question to answer.
        llm = load_llm()

        # query_pdf returns the answer as plain text, so display it directly;
        # indexing it with "answer"/"source_text" keys raised TypeError.
        result = query_pdf(vector_store, query, llm, query_method=query_method)
        st.write("**Answer:**", result)

        # Show supporting passages by repeating a similarity search with the
        # store's default settings (presumably the same hits the chain's
        # default retriever used — TODO confirm).
        sources = vector_store.similarity_search(query)
        source_text = "\n\n".join(doc.page_content for doc in sources)
        st.write("**Source Text:**", source_text)

# Run the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()