# Page header captured from the Hugging Face Spaces file viewer (not program code).
# Space status: Sleeping. File size: 4,826 bytes.
# Blame gutter: lines alternate between commits 7b666bb and 9d72b0b.
# Line-number gutter: 1-127.
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings # Updated import
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
import torch
from transformers import pipeline
from langdetect import detect
# Load a smaller LLM (e.g., Zephyr-7B or Mistral-7B)
def load_llm(model_name="HuggingFaceH4/zephyr-7b-alpha"):
    """Build a LangChain LLM backed by a Hugging Face text-generation pipeline.

    Args:
        model_name: Hugging Face Hub model id to load. Defaults to the model
            this app was originally hard-coded to, so a bare ``load_llm()``
            call behaves exactly as before.

    Returns:
        A ``HuggingFacePipeline`` wrapping the ``transformers`` pipeline.
    """
    pipe = pipeline(
        "text-generation",
        model=model_name,
        # Half-precision weights; device placement is delegated to the
        # library via device_map="auto".
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return HuggingFacePipeline(pipeline=pipe)
# Extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Passed straight to ``PdfReader``; the app calls this with either
            a Streamlit upload object or a filesystem path string.

    Returns:
        str: The page texts joined with no separator. Pages that yield no
        text contribute nothing, so the result may be an empty string.
    """
    reader = PdfReader(file)
    # Defensive: treat a None/empty extraction result (e.g. a page without a
    # text layer) as "" so one such page cannot abort the whole document.
    # Joining once also avoids rebuilding the string on every page.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Cut *text* into overlapping chunks for embedding.

    Args:
        text: The full document text.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        *text* (the chunk strings).
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)
# Create embeddings and vector store
def create_vector_store(chunks, indexing_method="multi-representation"):
    """Embed text chunks and index them in a FAISS vector store.

    Args:
        chunks: Non-empty list of text chunks to index.
        indexing_method: One of ``"multi-representation"``, ``"raptors"`` or
            ``"colbert"``.

    Returns:
        A FAISS vector store built from *chunks* with the
        ``all-MiniLM-L6-v2`` sentence-embedding model.

    Raises:
        ValueError: If *indexing_method* is not a supported name, or if
            *chunks* is empty.
    """
    supported_methods = ("multi-representation", "raptors", "colbert")
    # Validate before loading the embedding model so bad input fails fast
    # (previously an unknown method surfaced as an UnboundLocalError).
    if indexing_method not in supported_methods:
        raise ValueError(
            f"Unknown indexing_method {indexing_method!r}; "
            f"expected one of {', '.join(supported_methods)}."
        )
    if not chunks:
        raise ValueError("Cannot build a vector store from an empty list of chunks.")

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    # TODO: "raptors" (hierarchical chunking) and "colbert" (contextualized
    # embeddings) are not implemented yet; every method currently builds the
    # same plain FAISS index.
    return FAISS.from_texts(chunks, embeddings)
# Query the PDF
def query_pdf(vector_store, query, llm, query_method="multi-query"):
    """Answer *query* with a retrieval-augmented QA chain over the vector store.

    Args:
        vector_store: Vector store whose ``as_retriever()`` supplies context.
        query: The user's question.
        llm: LangChain LLM used to generate the answer.
        query_method: One of ``"multi-query"``, ``"rag-fusion"``,
            ``"decomposition"``, ``"step-back"`` or ``"hyde"``.

    Returns:
        The value of ``RetrievalQA.run(query)`` (the generated answer).

    Raises:
        ValueError: If *query_method* is not a supported name.
    """
    supported_methods = (
        "multi-query",
        "rag-fusion",
        "decomposition",
        "step-back",
        "hyde",
    )
    # Previously an unknown method left the chain unassigned and crashed with
    # an UnboundLocalError; reject it explicitly instead.
    if query_method not in supported_methods:
        raise ValueError(
            f"Unknown query_method {query_method!r}; "
            f"expected one of {', '.join(supported_methods)}."
        )

    # TODO: the query-translation strategies are not implemented yet; every
    # method currently runs the same "stuff" RetrievalQA chain.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return qa.run(query)
# Detect language of the text
def detect_language(text):
    """Best-effort language detection for *text*.

    Args:
        text: The text to classify; may be empty or ``None``.

    Returns:
        The language code reported by ``langdetect.detect``, or ``"en"`` when
        the text is empty or detection fails.
    """
    # Nothing to analyse: skip the detector and use the default.
    if not text or not text.strip():
        return "en"
    try:
        return detect(text)
    except Exception:
        # Deliberate best-effort fallback. Narrowed from a bare ``except`` so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return "en"  # Default to English if detection fails
# Streamlit App
def main():
    """Run the Streamlit "Chat with PDF" app.

    Flow: take an uploaded PDF (or fall back to ``default.pdf``), extract and
    chunk its text, index the chunks in a vector store, load the LLM, then
    answer the user's question and show the chunks most similar to it.
    Streamlit re-executes this function on every widget interaction.
    """
    st.title("Chat with PDF")
    st.write("Upload a PDF and ask questions about it!")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        st.info("Using default PDF.")
        uploaded_file = "default.pdf"  # Add a default PDF

    # Extract text
    try:
        text = extract_text_from_pdf(uploaded_file)
    except FileNotFoundError:
        # Only the path-based fallback can hit this; report it in the UI
        # instead of crashing with a traceback.
        st.error("The PDF file could not be found. Please upload a PDF.")
        return
    if not text or not text.strip():
        # Without text there is nothing to index or query.
        st.error("No extractable text was found in this PDF.")
        return

    # Detect language
    language = detect_language(text)
    st.write(f"Detected Language: {language}")

    # Split text into chunks
    chunk_size = st.slider("Chunk Size", 500, 2000, 1000)
    chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200)
    chunks = split_text(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Indexing options
    indexing_method = st.selectbox(
        "Indexing Method",
        ["multi-representation", "raptors", "colbert"],
        help="Choose how to index the PDF text.",
    )
    st.write(f"**Indexing Method:** {indexing_method}")

    # Create vector store
    vector_store = create_vector_store(chunks, indexing_method=indexing_method)

    # Load LLM. Cached as a shared resource so the model is not reloaded on
    # every rerun of the script.
    llm = st.cache_resource(load_llm)()

    # Query translation options
    query_method = st.selectbox(
        "Query Translation Method",
        ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
        help="Choose a method to improve query retrieval.",
    )
    st.write(f"**Query Translation Method:** {query_method}")

    # User input
    query = st.text_input("Ask a question about the PDF:")
    if not query:
        return

    # query_pdf returns the answer itself (a string), not a dict, so it is
    # displayed directly rather than indexed by key.
    answer = query_pdf(vector_store, query, llm, query_method=query_method)
    st.write("**Answer:**", answer)

    # The QA chain does not hand back its context, so look up the chunks most
    # similar to the question to show as source text.
    source_docs = vector_store.similarity_search(query)
    source_text = "\n\n".join(doc.page_content for doc in source_docs)
    st.write("**Source Text:**", source_text)
# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()