import os
import re

import numpy as np
import requests
import streamlit as st
import faiss
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# Load the Hugging Face API token from the environment
huggingface_token = os.environ.get("Key2")

# Load the Sentence Transformer model once per session; without st.cache_resource,
# Streamlit would reload the model on every widget interaction (each script rerun)
@st.cache_resource
def load_sentence_model():
    return SentenceTransformer('all-MiniLM-L6-v2')

sentence_model = load_sentence_model()
# Extract text from each page of the uploaded PDF
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        text = page.extract_text() or ""  # extract_text() can return None on image-only pages
        text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
        text_data.append({
            "page": page_num + 1,
            "content": text,
        })
    return text_data
# Split text into overlapping word chunks so context isn't lost at chunk boundaries
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    words = text.split()
    chunks = []
    step = chunk_size - overlap  # Each window advances by chunk_size - overlap words
    for i in range(0, len(words), step):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks
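# Example: with the defaults (chunk_size=500, overlap=100) the windows cover words
# 0-499, 400-899, 800-1299, ... so adjacent chunks share 100 words and a sentence
# that straddles a boundary still appears whole in at least one chunk.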
# Semantic search: rank chunks by cosine similarity of their sentence embeddings
def semantic_search(query, chunks, threshold=0.3):
    query_embedding = sentence_model.encode([query])
    chunk_embeddings = sentence_model.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    # Sort chunks from most to least similar, then keep those above the threshold
    results = [(chunks[i], similarities[i]) for i in np.argsort(similarities)[::-1]]
    return [res for res in results if res[1] > threshold]
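# Note on scores: cosine similarity lies in [-1, 1], and for MiniLM sentence
# embeddings it typically lands in roughly [0, 1] in practice, so the default
# threshold of 0.3 keeps loosely related chunks while dropping clear misses.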
# Call the Hugging Face Inference API with basic error handling
def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    model_name = "HuggingFaceH4/zephyr-7b-alpha"
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {huggingface_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "return_full_text": False,  # Return only the completion, not the echoed prompt
        },
    }
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
        if response.status_code == 200:
            return response.json()[0]["generated_text"]
        else:
            st.error(f"API Error: {response.status_code}")
            return None
    except requests.RequestException as e:
        st.error(f"Connection Error: {str(e)}")
        return None
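# Note: the hosted Inference API answers 503 while a cold model is still loading;
# that surfaces here as "API Error: 503", and simply retrying usually succeeds.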
# Indexing strategies selectable from the sidebar
def create_index(text_chunks, method="Multi-Representation"):
    if method == "Multi-Representation":
        # Sparse TF-IDF matrix over the chunks
        # (currently unused by the Cosine search path, which re-embeds the chunks)
        return TfidfVectorizer().fit_transform(text_chunks)
    elif method == "Raptors":
        # Dense FAISS index over sentence embeddings (exact L2 search)
        embeddings = np.asarray(sentence_model.encode(text_chunks), dtype="float32")
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        return index
    elif method == "ColBERT":
        # Plain embedding matrix; scored later with cosine similarity
        return sentence_model.encode(text_chunks)
    return None
# Similarity search dispatch: cosine over embeddings or KNN over the FAISS index
def similarity_search(query, chunks, method="Cosine", index=None, k=5, threshold=0.3):
    if method == "Cosine":
        return semantic_search(query, chunks, threshold)
    elif method == "KNN":
        if isinstance(index, faiss.IndexFlatL2):
            query_embedding = np.asarray(sentence_model.encode([query]), dtype="float32")
            k = min(k, index.ntotal)  # FAISS returns -1 indices when k exceeds the index size
            distances, indices = index.search(query_embedding, k)
            # Convert L2 distance to a rough similarity score (smaller distance -> higher score)
            return [(chunks[i], 1 - distances[0][j]) for j, i in enumerate(indices[0])]
    return []
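# The 1 - distance conversion is only a display heuristic: L2 distances are not
# bounded by 1, so KNN "scores" can go negative. The ranking itself is still
# correct, because FAISS returns neighbors sorted by ascending distance.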
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down
the given user question into simpler sub-questions. Provide these sub-questions separated
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
document that would be relevant to the given user question. Original question: {question}""",
}
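# Each template is filled via .format(question=...) before being sent to the LLM,
# so the literal "{question}" placeholder must survive any edits to the prompts.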
# Streamlit app
def main():
    st.title("Enhanced RAG Model with Advanced Features")

    # Sidebar configuration
    st.sidebar.title("Configuration")
    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)

    # Main interface
    prompt = st.text_input("Enter your query:")
    if prompt:
        if not pdf_file:
            # Check before spending an API call on query translation
            st.error("Please upload a PDF document first.")
            return
        with st.spinner("Processing..."):
            # Query translation
            translated_prompt = query_huggingface_model(
                DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
            )

            # Process the PDF
            text_data = extract_text_from_pdf(pdf_file)
            full_text = " ".join([p["content"] for p in text_data])
            chunks = split_text_into_chunks(full_text)

            # Build the index
            index = create_index(chunks, indexing_method)

            # Retrieve: HyDE searches with the hypothetical document (if translation
            # succeeded), every other strategy searches with the raw query
            if query_translation == "HyDE" and translated_prompt:
                results = semantic_search(translated_prompt, chunks, similarity_threshold)
            else:
                results = similarity_search(prompt, chunks, similarity_method, index,
                                            threshold=similarity_threshold)

            # Display results
            if results:
                st.subheader("Top Results:")
                for i, (chunk, score) in enumerate(results[:3]):
                    st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
                    st.write(chunk)

                # Generate the final answer from the retrieved context
                context = "\n".join([chunk for chunk, _ in results[:3]])
                response = query_huggingface_model(
                    f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
                )
                st.subheader("Generated Response:")
                st.write(response)
            else:
                st.warning("No relevant documents found matching the query.")
if __name__ == "__main__":
    main()
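# Run locally with: streamlit run app.py
# (expects the Hugging Face token in the "Key2" environment variable, as above)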