# AllAboutRAG / app.py
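"""Streamlit demo of a simple RAG pipeline over an uploaded PDF.

The app translates the user's query with an LLM via the Hugging Face
Inference API, keyword-searches the PDF, chunks the matching lines, ranks
the chunks with TF-IDF cosine similarity or KNN, and finally generates an
answer grounded in the retrieved context.
"""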
import streamlit as st
import os
import requests
from langdetect import detect
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
# Load the Hugging Face token from environment variables
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")  # Set HUGGINGFACE_TOKEN in your environment
# Function to query the Hugging Face Inference API
def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {huggingface_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    response = requests.post(api_url, headers=headers, json=payload, timeout=60)
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    else:
        st.error(f"Error: {response.status_code} - {response.text}")
        return None
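# The Inference API for text-generation models typically returns a list of
# dicts like [{"generated_text": "..."}], with the prompt echoed before the
# continuation, which is why the first element's "generated_text" is read above.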
# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "en"  # Default to English if detection fails
# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        # extract_text() can return None for pages without extractable text
        lines = (page.extract_text() or "").split('\n')
        for line_num, line in enumerate(lines):
            text_data.append({
                "page": page_num + 1,
                "line": line_num + 1,
                "content": line
            })
    return text_data
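# Illustrative shape of the returned data (values are hypothetical):
# [{"page": 1, "line": 1, "content": "Introduction"},
#  {"page": 1, "line": 2, "content": "Retrieval-augmented generation ..."}]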
# Function to search for query in PDF content
def search_pdf_content(pdf_text_data, query):
    results = []
    for entry in pdf_text_data:
        if query.lower() in entry["content"].lower():
            results.append(entry)
    return results
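# This case-insensitive keyword filter is the first-stage retrieval step: it
# keeps only the PDF lines containing the literal query string, before any
# similarity ranking happens.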
# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
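# Example: split_text_into_chunks("a b c d e", chunk_size=2)
# returns ["a b", "c d", "e"]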
# Function to compute cosine similarity between query and document chunks
def compute_cosine_similarity(query, chunks):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return cosine_similarities
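# The vectorizer is fit on the query plus all chunks together, so row 0 of
# the TF-IDF matrix is the query and rows 1..n are the chunks; comparing
# row 0 against the rest yields one similarity score per chunk.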
# Function to find KNN-based similar documents
def find_knn_similar_documents(query, chunks, k=5):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
    # NearestNeighbors raises if n_neighbors exceeds the number of samples
    k = min(k, len(chunks))
    knn = NearestNeighbors(n_neighbors=k, metric="cosine")
    knn.fit(tfidf_matrix[1:])
    distances, indices = knn.kneighbors(tfidf_matrix[0:1])
    return indices.flatten(), distances.flatten()
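# With metric="cosine", scikit-learn reports cosine distances
# (1 - cosine similarity), so smaller values mean more similar chunks.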
# Default system prompts for each query translation method
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down
the given user question into simpler sub-questions. Provide these sub-questions separated
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
document that would be relevant to the given user question. Original question: {question}""",
}
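# Each template keeps a {question} placeholder; main() fills it with the
# user's prompt via str.format(question=...) before calling the model.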
# Streamlit App
def main():
    st.title("RAG Model with Advanced Query Translation and Indexing")
    st.write("Enter a prompt and get a response from the model.")

    # Sidebar for options
    st.sidebar.title("Options")

    # PDF Upload
    st.sidebar.header("Upload PDF")
    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")

    # Query Translation Options
    st.sidebar.header("Query Translation")
    query_translation = st.sidebar.selectbox(
        "Select Query Translation Method",
        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
    )
    # Indexing Options
    st.sidebar.header("Indexing")
    indexing_method = st.sidebar.selectbox(
        "Select Indexing Method",
        ["Multi-Representation", "RAPTOR", "ColBERT"]
    )

    # Similarity Search Options
    st.sidebar.header("Similarity Search")
    similarity_method = st.sidebar.selectbox(
        "Select Similarity Search Method",
        ["Cosine Similarity", "KNN"]
    )
    if similarity_method == "KNN":
        k_value = st.sidebar.slider("Select K Value", 1, 10, 5)
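    # Note: k_value exists only when KNN is selected; it is read only inside
    # the KNN branch further down, so the two stay consistent.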
    # LLM Parameters
    st.sidebar.header("LLM Parameters")
    max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
    top_k = st.sidebar.slider("Top K", 1, 100, 50)

    # System Prompt
    st.sidebar.header("System Prompt")
    default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
    system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)
    # Main Content
    st.header("Input Prompt")
    prompt = st.text_input("Enter your prompt:")
    if prompt:
        st.write("**Prompt:**", prompt)

        # Detect Language
        language = detect_language(prompt)
        st.write(f"**Detected Language:** {language}")

        # Query Translation
        if st.button("Apply Query Translation"):
            st.write(f"**Applied Query Translation Method:** {query_translation}")
            # Format the system prompt with the user's question
            formatted_prompt = system_prompt.format(question=prompt)
            st.write("**Formatted System Prompt:**", formatted_prompt)
            # Query the Hugging Face model for query translation
            translated_queries = query_huggingface_model(formatted_prompt, max_new_tokens, temperature, top_k)
            if translated_queries:
                st.write("**Translated Queries:**")
                # Heuristic: show only the last line of the generated text,
                # since the API echoes the prompt before the new queries
                st.write(translated_queries.split("\n")[-1])
        # Indexing
        if st.button("Apply Indexing"):
            st.write(f"**Applied Indexing Method:** {indexing_method}")
            if pdf_file is not None:
                # Extract and search PDF content
                pdf_text_data = extract_text_from_pdf(pdf_file)
                search_results = search_pdf_content(pdf_text_data, prompt)
                if search_results:
                    st.write("**Relevant Content from PDF:**")
                    for result in search_results:
                        st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
                    # Split text into chunks
                    chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
                    st.write("**Chunks Obtained from PDF:**")
                    for i, chunk in enumerate(chunks):
                        st.write(f"**Chunk {i + 1}:** {chunk}")
                    # Perform similarity search
                    if similarity_method == "Cosine Similarity":
                        st.write("**Cosine Similarity Results:**")
                        cosine_similarities = compute_cosine_similarity(prompt, chunks)
                        for i, similarity in enumerate(cosine_similarities):
                            st.write(f"**Chunk {i + 1} Similarity:** {similarity:.4f}")
                    elif similarity_method == "KNN":
                        st.write(f"**KNN Results (k={k_value}):**")
                        indices, distances = find_knn_similar_documents(prompt, chunks, k_value)
                        for index, distance in zip(indices, distances):
                            st.write(f"**Chunk {index + 1} Distance:** {distance:.4f}")
                else:
                    st.write("**No relevant content found in the PDF.**")
            else:
                st.write("**No PDF uploaded.**")
        # Generate Response
        if st.button("Generate Response"):
            if pdf_file is not None:
                # Extract and search PDF content
                pdf_text_data = extract_text_from_pdf(pdf_file)
                search_results = search_pdf_content(pdf_text_data, prompt)
                if search_results:
                    st.write("**Relevant Content from PDF:**")
                    for result in search_results:
                        st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")
                    # Generate response based on PDF content
                    pdf_context = "\n".join([result["content"] for result in search_results])
                    response = query_huggingface_model(
                        f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}",
                        max_new_tokens, temperature, top_k
                    )
                else:
                    st.write("**No relevant content found in the PDF. Generating response without PDF context.**")
                    response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)
            else:
                st.write("**No PDF uploaded. Generating response without PDF context.**")
                response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)
            if response:
                st.write("**Response:**", response)
if __name__ == "__main__":
    main()
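# To try the app locally: streamlit run app.py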