import streamlit as st
import os
import requests
from langdetect import detect
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
# Load the Hugging Face token from environment variables
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")  # Set HUGGINGFACE_TOKEN in your environment

# Function to query the Hugging Face Inference API
def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {huggingface_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    response = requests.post(api_url, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    else:
        st.error(f"Error: {response.status_code} - {response.text}")
        return None
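
# Example usage (illustrative only; assumes a valid HUGGINGFACE_TOKEN and that
# the hosted model is loaded — a cold endpoint may first return a 503 error):
#   answer = query_huggingface_model("Summarize RAG in one sentence.", max_new_tokens=64)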

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "en"  # Default to English if detection fails

# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        # extract_text() can return None for pages with no extractable text
        lines = (page.extract_text() or "").split('\n')
        for line_num, line in enumerate(lines):
            text_data.append({
                "page": page_num + 1,
                "line": line_num + 1,
                "content": line
            })
    return text_data
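
# Each entry is a dict like {"page": 1, "line": 3, "content": "..."}, so search
# hits can later be reported with their exact location in the document.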

# Function to search for query in PDF content
def search_pdf_content(pdf_text_data, query):
    results = []
    for entry in pdf_text_data:
        if query.lower() in entry["content"].lower():
            results.append(entry)
    return results
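
# Note: this is a plain case-insensitive substring match, so a multi-word query
# only matches lines containing the full phrase, e.g.:
#   search_pdf_content(text_data, "vector database")  # hits "...Vector Database..."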

# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
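
# Example: a 1,200-word text with chunk_size=500 yields three chunks of
# 500, 500, and 200 words.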

# Function to compute cosine similarity between query and document chunks
def compute_cosine_similarity(query, chunks):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return cosine_similarities
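
# Returns one TF-IDF cosine score per chunk in [0, 1]; higher means the chunk's
# wording is closer to the query. For example, compute_cosine_similarity(prompt,
# chunks)[0] scores the first chunk against the prompt.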

# Function to find KNN-based similar documents
def find_knn_similar_documents(query, chunks, k=5):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
    # NearestNeighbors raises an error if k exceeds the number of chunks
    knn = NearestNeighbors(n_neighbors=min(k, len(chunks)), metric="cosine")
    knn.fit(tfidf_matrix[1:])
    distances, indices = knn.kneighbors(tfidf_matrix[0:1])
    return indices.flatten(), distances.flatten()
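
# Returns the indices of the k nearest chunks and their cosine distances
# (distance = 1 - similarity, so lower means more relevant), e.g.:
#   indices, distances = find_knn_similar_documents(prompt, chunks, k=3)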

# Default system prompts for each query translation method
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down
the given user question into simpler sub-questions. Provide these sub-questions separated
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
document that would be relevant to the given user question. Original question: {question}""",
}
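
# Each template is filled in with str.format, e.g.:
#   DEFAULT_SYSTEM_PROMPTS["HyDE"].format(question="What is RAG?")
# A user-edited prompt must keep the {question} placeholder and contain no other
# literal braces, or the format() call below will raise KeyError/ValueError.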

# Streamlit App
def main():
    st.title("RAG Model with Advanced Query Translation and Indexing")
    st.write("Enter a prompt and get a response from the model.")

    # Sidebar for options
    st.sidebar.title("Options")

    # PDF Upload
    st.sidebar.header("Upload PDF")
    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")

    # Query Translation Options
    st.sidebar.header("Query Translation")
    query_translation = st.sidebar.selectbox(
        "Select Query Translation Method",
        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
    )

    # Indexing Options
    st.sidebar.header("Indexing")
    indexing_method = st.sidebar.selectbox(
        "Select Indexing Method",
        ["Multi-Representation", "Raptors", "ColBERT"]
    )

    # Similarity Search Options
    st.sidebar.header("Similarity Search")
    similarity_method = st.sidebar.selectbox(
        "Select Similarity Search Method",
        ["Cosine Similarity", "KNN"]
    )
    if similarity_method == "KNN":
        k_value = st.sidebar.slider("Select K Value", 1, 10, 5)

    # LLM Parameters
    st.sidebar.header("LLM Parameters")
    max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
    top_k = st.sidebar.slider("Top K", 1, 100, 50)

    # System Prompt
    st.sidebar.header("System Prompt")
    default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
    system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)
    # Main Content
    st.header("Input Prompt")
    prompt = st.text_input("Enter your prompt:")

    if prompt:
        st.write("**Prompt:**", prompt)

        # Detect Language
        language = detect_language(prompt)
        st.write(f"**Detected Language:** {language}")

        # Query Translation
        if st.button("Apply Query Translation"):
            st.write(f"**Applied Query Translation Method:** {query_translation}")
            # Format the system prompt with the user's question
            formatted_prompt = system_prompt.format(question=prompt)
            st.write("**Formatted System Prompt:**", formatted_prompt)
            # Query the Hugging Face model for query translation
            translated_queries = query_huggingface_model(formatted_prompt, max_new_tokens, temperature, top_k)
            if translated_queries:
                st.write("**Translated Queries:**")
                # The API echoes the prompt by default; show only the newly generated part
                st.write(translated_queries[len(formatted_prompt):].strip())

        # Indexing
        if st.button("Apply Indexing"):
            st.write(f"**Applied Indexing Method:** {indexing_method}")
            if pdf_file is not None:
                # Extract and search PDF content
                pdf_text_data = extract_text_from_pdf(pdf_file)
                search_results = search_pdf_content(pdf_text_data, prompt)
                if search_results:
                    st.write("**Relevant Content from PDF:**")
                    for result in search_results:
                        st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
                    # Split text into chunks
                    chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
                    st.write("**Chunks Obtained from PDF:**")
                    for i, chunk in enumerate(chunks):
                        st.write(f"**Chunk {i + 1}:** {chunk}")
                    # Perform similarity search
                    if similarity_method == "Cosine Similarity":
                        st.write("**Cosine Similarity Results:**")
                        cosine_similarities = compute_cosine_similarity(prompt, chunks)
                        for i, similarity in enumerate(cosine_similarities):
                            st.write(f"**Chunk {i + 1} Similarity:** {similarity:.4f}")
                    elif similarity_method == "KNN":
                        st.write(f"**KNN Results (k={k_value}):**")
                        indices, distances = find_knn_similar_documents(prompt, chunks, k_value)
                        for index, distance in zip(indices, distances):
                            st.write(f"**Chunk {index + 1} Distance:** {distance:.4f}")
                else:
                    st.write("**No relevant content found in the PDF.**")
            else:
                st.write("**No PDF uploaded.**")

        # Generate Response
        if st.button("Generate Response"):
            if pdf_file is not None:
                # Extract and search PDF content
                pdf_text_data = extract_text_from_pdf(pdf_file)
                search_results = search_pdf_content(pdf_text_data, prompt)
                if search_results:
                    st.write("**Relevant Content from PDF:**")
                    for result in search_results:
                        st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")
                    # Generate response based on PDF content
                    pdf_context = "\n".join([result["content"] for result in search_results])
                    response = query_huggingface_model(
                        f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}",
                        max_new_tokens, temperature, top_k
                    )
                else:
                    st.write("**No relevant content found in the PDF. Generating response without PDF context.**")
                    response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)
            else:
                st.write("**No PDF uploaded. Generating response without PDF context.**")
                response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)
            if response:
                st.write("**Response:**", response)

if __name__ == "__main__":
    main()
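
# To try the app locally (assuming this file is saved as app.py):
#   HUGGINGFACE_TOKEN=hf_... streamlit run app.py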