import streamlit as st
import os
import requests
import re
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
# Load the Hugging Face token from environment variables
huggingface_token = os.environ.get("Key2")

# Embedding model, cached so it loads once rather than on every Streamlit rerun
@st.cache_resource
def load_sentence_model():
    return SentenceTransformer('all-MiniLM-L6-v2')
sentence_model = load_sentence_model()
# Cache PDF extraction
@st.cache_data
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        text = page.extract_text() or ""  # extract_text() can return None
        text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
        text_data.append({
            "page": page_num + 1,
            "content": text,
        })
    return text_data
# Enhanced text chunking with overlap
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    words = text.split()
    step = max(1, chunk_size - overlap)  # Guard against a non-positive stride
    chunks = []
    for i in range(0, len(words), step):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks
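# Illustrative example (not executed by the app): with chunk_size=5 and
# overlap=2 the window advances 3 words at a time, so consecutive chunks
# share 2 words at their boundary:
#   split_text_into_chunks("a b c d e f g h", chunk_size=5, overlap=2)
#   -> ["a b c d e", "d e f g h", "g h"]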
# Enhanced semantic search using sentence transformers
def semantic_search(query, chunks, threshold=0.3):
    query_embedding = sentence_model.encode([query])
    chunk_embeddings = sentence_model.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    results = [(chunks[i], similarities[i]) for i in np.argsort(similarities)[::-1]]
    return [res for res in results if res[1] > threshold]
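# Illustrative example: scores are cosine similarities sorted best-first, and
# only chunks above the threshold survive (chunk text and score hypothetical):
#   semantic_search("refund policy", chunks)
#   -> [("Refunds are issued within 30 days ...", 0.71), ...]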
# Query the Hugging Face Inference API with basic error handling
def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    model_name = "HuggingFaceH4/zephyr-7b-alpha"
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {huggingface_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "return_full_text": False,  # Return only the completion, not the echoed prompt
        },
    }
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
        if response.status_code == 200:
            return response.json()[0]["generated_text"]
        st.error(f"API Error: {response.status_code}")
        return None
    except Exception as e:
        st.error(f"Connection Error: {str(e)}")
        return None
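# Illustrative usage: returns the generated text, or None on API/connection
# errors (which are surfaced to the UI via st.error):
#   answer = query_huggingface_model("What is retrieval-augmented generation?")
#   if answer:
#       st.write(answer)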
# Enhanced indexing strategies
def create_index(text_chunks, method="Multi-Representation"):
    if method == "Multi-Representation":
        return TfidfVectorizer().fit_transform(text_chunks)
    elif method == "Raptors":
        embeddings = sentence_model.encode(text_chunks)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(np.asarray(embeddings, dtype="float32"))  # FAISS expects float32
        return index
    elif method == "ColBERT":
        return sentence_model.encode(text_chunks)
    return None
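# Illustrative round trip (not executed by the app): the FAISS index built by
# the "Raptors" branch can be queried directly with an encoded query vector:
#   index = create_index(["alpha", "beta"], method="Raptors")
#   q = np.asarray(sentence_model.encode(["alpha"]), dtype="float32")
#   distances, ids = index.search(q, 1)   # ids[0][0] should be 0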
# Improved similarity search with multiple methods
def similarity_search(query, chunks, method="Cosine", index=None, k=5, threshold=0.3):
    if method == "Cosine":
        return semantic_search(query, chunks, threshold)
    elif method == "KNN":
        if isinstance(index, faiss.IndexFlatL2):
            query_embedding = np.asarray(sentence_model.encode([query]), dtype="float32")
            distances, indices = index.search(query_embedding, k)
            # Rough conversion of L2 distance to a similarity-style score
            return [(chunks[i], 1 - distances[0][j]) for j, i in enumerate(indices[0])]
    return []
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down
the given user question into simpler sub-questions. Provide these sub-questions separated
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
document that would be relevant to the given user question. Original question: {question}""",
}
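# Each template takes the user's question via str.format, e.g. (illustrative):
#   DEFAULT_SYSTEM_PROMPTS["Decomposition"].format(question="How does RAG work?")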
# Streamlit App
def main():
    st.title("Enhanced RAG Model with Advanced Features")

    # Sidebar configurations
    st.sidebar.title("Configuration")
    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)

    # Main interface
    prompt = st.text_input("Enter your query:")
    if prompt:
        with st.spinner("Processing..."):
            # Query translation
            translated_prompt = query_huggingface_model(
                DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
            )
            if pdf_file:
                # Process PDF
                text_data = extract_text_from_pdf(pdf_file)
                full_text = " ".join([p["content"] for p in text_data])
                chunks = split_text_into_chunks(full_text)
                # Create index
                index = create_index(chunks, indexing_method)
                # Perform search
                if query_translation == "HyDE":
                    # Fall back to the raw query if translation failed
                    hypothetical_answer = translated_prompt or prompt
                    results = semantic_search(hypothetical_answer, chunks, similarity_threshold)
                else:
                    results = similarity_search(prompt, chunks, similarity_method, index,
                                                threshold=similarity_threshold)
                # Display results
                if results:
                    st.subheader("Top Results:")
                    for i, (chunk, score) in enumerate(results[:3]):
                        st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
                        st.write(chunk)
                    # Generate response
                    context = "\n".join([chunk for chunk, _ in results[:3]])
                    response = query_huggingface_model(
                        f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
                    )
                    st.subheader("Generated Response:")
                    st.write(response)
                else:
                    st.warning("No relevant documents found matching the query.")
            else:
                st.error("Please upload a PDF document first.")
if __name__ == "__main__":
    main()