# NOTE(review): the lines below appear to be page chrome captured from a web
# file viewer (status, file size, commit hashes, line-number gutter), not
# source code. They are kept verbatim as comments so the module can be parsed.
# Spaces:
# Sleeping
# Sleeping
# File size: 7,482 Bytes
# 7b666bb c0a164f 5f45885 4f13fd4 d2c0564 5e06280 5f45885 a0f23a4 4f13fd4 a0f23a4 5f45885 4f13fd4 a0f23a4 4f13fd4 5f45885 4f13fd4 5f45885 a0f23a4 d2c0564 4f13fd4 5f45885 4f13fd4 751f053 5f45885 4f13fd4 5f45885 4f13fd4 5f45885 4f13fd4 5f45885 4f13fd4 5f45885 4f13fd4 5f45885 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import streamlit as st
import os
import requests
import re
from langdetect import detect
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import hashlib
# Hugging Face API token, read from the "Key2" environment variable
# (None when the variable is not set).
huggingface_token = os.getenv("Key2")

# Sentence-transformer model shared by every function in this module that
# needs text embeddings; it is loaded once, when the module is imported.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# Per-page PDF text extraction, wrapped in Streamlit's data cache
@st.cache_data
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF with whitespace collapsed.

    Args:
        pdf_file: The PDF source, handed straight to ``PdfReader``.

    Returns:
        A list with one dict per page, in document order. Each dict has the
        keys ``"page"`` (1-based page number) and ``"content"`` (the page's
        extracted text with every whitespace run replaced by one space).
    """
    reader = PdfReader(pdf_file)
    return [
        {
            "page": number,
            # Collapse newlines/tabs/multiple spaces into single spaces.
            "content": re.sub(r'\s+', ' ', page.extract_text()),
        }
        for number, page in enumerate(reader.pages, start=1)
    ]
# Word-based text chunking with overlap between consecutive chunks
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Source text; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The
        list is empty when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A step of zero makes range() raise an opaque error and a negative step
    # silently yields no chunks at all, so reject both up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # lie entirely inside this one and only add a redundant chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
# Embedding-based semantic search using the shared sentence-transformer model
def semantic_search(query, chunks, threshold=0.3):
    """Rank *chunks* by cosine similarity to *query* and drop weak matches.

    Args:
        query: Query text. ``None`` or an empty string yields no results.
        chunks: List of candidate text chunks.
        threshold: Only chunks whose similarity is strictly greater than
            this value are returned.

    Returns:
        A list of ``(chunk, similarity)`` tuples ordered from most to least
        similar. Empty when there is nothing to search or nothing scores
        above *threshold*.
    """
    # Encoding an empty chunk list (or a missing query) would fail further
    # down instead of producing "no results", so short-circuit here.
    if not query or not chunks:
        return []

    query_embedding = sentence_model.encode([query])
    chunk_embeddings = sentence_model.encode(chunks)
    # cosine_similarity returns a (1, len(chunks)) matrix; take its only row.
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    best_first = np.argsort(similarities)[::-1]
    return [
        (chunks[i], similarities[i])
        for i in best_first
        if similarities[i] > threshold
    ]
# Text generation through the Hugging Face Inference API; failures are
# reported in the Streamlit UI instead of being raised.
def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    """Send *prompt* to the hosted zephyr-7b-alpha model and return its output.

    Args:
        prompt: Text sent as the request's ``inputs`` field.
        max_new_tokens: Forwarded as the ``max_new_tokens`` parameter.
        temperature: Forwarded as the ``temperature`` parameter.
        top_k: Forwarded as the ``top_k`` parameter.

    Returns:
        The ``generated_text`` field of the first item in the JSON response,
        or ``None`` when the request fails, the status code is not 200, or
        the payload does not have that shape. Every failure is also shown to
        the user through ``st.error``.
    """
    model_name = "HuggingFaceH4/zephyr-7b-alpha"
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    # Only send credentials when a token is configured; otherwise the header
    # would literally read "Bearer None".
    headers = {"Authorization": f"Bearer {huggingface_token}"} if huggingface_token else {}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }

    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
    except requests.RequestException as e:
        st.error(f"Connection Error: {str(e)}")
        return None

    if response.status_code != 200:
        st.error(f"API Error: {response.status_code}")
        return None

    # A 200 response can still carry an unexpected body (invalid JSON, an
    # error object, an empty list); report that as a payload problem rather
    # than mislabelling it as a connection error.
    try:
        return response.json()[0]["generated_text"]
    except (ValueError, LookupError, TypeError) as e:
        st.error(f"Unexpected API response: {e!r}")
        return None
# Index construction for the supported indexing strategies
def create_index(text_chunks, method="Multi-Representation"):
    """Build a search structure over *text_chunks* for the chosen strategy.

    Args:
        text_chunks: List of chunk strings to index.
        method: One of ``"Multi-Representation"``, ``"Raptors"`` or
            ``"ColBERT"``.

    Returns:
        * ``"Multi-Representation"``: the TF-IDF matrix produced by fitting a
          new ``TfidfVectorizer`` on the chunks.
        * ``"Raptors"``: a ``faiss.IndexFlatL2`` holding the chunk embeddings.
        * ``"ColBERT"``: the chunk embeddings as returned by the sentence
          model.
        * any other value: ``None``.
    """
    if method == "Multi-Representation":
        vectorizer = TfidfVectorizer()
        return vectorizer.fit_transform(text_chunks)

    if method not in ("Raptors", "ColBERT"):
        return None

    # Both remaining strategies start from the same dense embeddings.
    embeddings = sentence_model.encode(text_chunks)
    if method == "ColBERT":
        return embeddings

    # "Raptors": a flat L2 index sized to the embedding dimension.
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index
# Similarity search dispatching between cosine ranking and FAISS k-NN
def similarity_search(query, chunks, method="Cosine", index=None, k=5):
    """Find the chunks most similar to *query* using the selected method.

    Args:
        query: Query text. ``None`` or an empty string yields no results.
        chunks: List of chunk strings. For ``"KNN"`` these must be the chunks
            *index* was built from, in the same order.
        method: ``"Cosine"`` delegates to ``semantic_search``; ``"KNN"``
            queries *index*.
        index: A ``faiss.IndexFlatL2``; only used (and required) for
            ``"KNN"``.
        k: Maximum number of neighbours to return for ``"KNN"``.

    Returns:
        A list of ``(chunk, score)`` tuples. Empty when there is nothing to
        search, the method is unknown, or ``"KNN"`` is requested without a
        ``faiss.IndexFlatL2`` index.
    """
    if not query or not chunks:
        return []

    if method == "Cosine":
        return semantic_search(query, chunks)

    if method == "KNN" and isinstance(index, faiss.IndexFlatL2):
        # Never ask for more neighbours than exist: FAISS pads the missing
        # slots with index -1, which chunks[-1] would silently turn into the
        # last chunk.
        k = min(k, index.ntotal, len(chunks))
        if k <= 0:
            return []
        query_embedding = sentence_model.encode([query])
        distances, indices = index.search(query_embedding, k)
        # NOTE(review): the score is 1 minus the distance reported by the L2
        # index, so it is not a bounded similarity and can be negative.
        return [
            (chunks[i], 1 - distances[0][j])
            for j, i in enumerate(indices[0])
            if 0 <= i < len(chunks)  # drop any padding / out-of-range ids
        ]

    return []
# Prompt templates for the query-translation strategies, keyed by the
# strategy name. Every template ends with a `{question}` placeholder that is
# filled in with `str.format(question=...)` before being sent to the model.
# The continuation lines of each triple-quoted string are part of the prompt
# text, so their exact layout (including line breaks) reaches the model.
DEFAULT_SYSTEM_PROMPTS = {
"Multi-Query": """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}""",
"RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
"Decomposition": """You are an AI language model assistant. Your task is to break down
the given user question into simpler sub-questions. Provide these sub-questions separated
by newlines. Original question: {question}""",
"Step Back": """You are an AI language model assistant. Your task is to refine the given
user question by taking a step back and asking a more general question. Original question: {question}""",
"HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
document that would be relevant to the given user question. Original question: {question}""",
}
# Streamlit App
def main():
    """Render the Streamlit UI and run the retrieve-then-generate pipeline.

    The sidebar collects the PDF and the strategy choices. Once a query is
    entered the function:

    1. sends the query through the selected query-translation prompt,
    2. extracts and chunks the PDF text and builds the selected index,
    3. retrieves matching chunks (for "HyDE" the generated hypothetical
       document is searched instead of the raw query),
    4. shows the top three chunks and asks the model to answer the query
       with those chunks as context.

    Returns:
        None. All output is rendered through Streamlit.
    """
    st.title("Enhanced RAG Model with Advanced Features")

    # Sidebar configurations
    st.sidebar.title("Configuration")
    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)

    # Main interface
    prompt = st.text_input("Enter your query:")
    if not prompt:
        return

    with st.spinner("Processing..."):
        # Query Translation
        # NOTE(review): the translated prompt is only consumed by the "HyDE"
        # branch below; the other strategies currently search with the raw
        # query.
        translated_prompt = query_huggingface_model(
            DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
        )

        if not pdf_file:
            st.error("Please upload a PDF document first.")
            return

        # Process PDF
        text_data = extract_text_from_pdf(pdf_file)
        full_text = " ".join(p["content"] for p in text_data)
        chunks = split_text_into_chunks(full_text)
        if not chunks:
            # Nothing to index or embed; stop before the search helpers are
            # handed an empty corpus.
            st.warning("No extractable text was found in the uploaded PDF.")
            return

        # Create index
        index = create_index(chunks, indexing_method)

        # Perform search
        if query_translation == "HyDE":
            # The generation call returns None on failure; fall back to the
            # raw query so retrieval still works.
            hypothetical_answer = translated_prompt or prompt
            results = semantic_search(hypothetical_answer, chunks, similarity_threshold)
        elif similarity_method == "Cosine":
            # Call semantic_search directly so the sidebar threshold is
            # honoured rather than its built-in default.
            results = semantic_search(prompt, chunks, similarity_threshold)
        else:
            # NOTE(review): "KNN" only returns results when the index is a
            # faiss.IndexFlatL2, i.e. when the indexing method is "Raptors".
            results = similarity_search(prompt, chunks, similarity_method, index)

        # Display results
        if not results:
            st.warning("No relevant documents found matching the query.")
            return

        top_results = results[:3]
        st.subheader("Top Results:")
        for i, (chunk, score) in enumerate(top_results):
            st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
            st.write(chunk)

        # Generate response
        context = "\n".join(chunk for chunk, _ in top_results)
        response = query_huggingface_model(
            f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
        )
        # On failure the helper has already shown an error; avoid rendering
        # a literal "None" under the heading.
        if response is not None:
            st.subheader("Generated Response:")
            st.write(response)
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()