Spaces:

bainskarman
/

AllAboutRAG

Sleeping

App Files Files Community

bainskarman committed on Mar 13

Commit

911335e

verified ·

1 Parent(s): 751f053

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -144

app.py CHANGED Viewed

@@ -1,54 +1,37 @@
 import streamlit as st
 import os
 import requests
-import re
-from langdetect import detect
-from PyPDF2 import PdfReader
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.neighbors import NearestNeighbors
 import numpy as np
 from sentence_transformers import SentenceTransformer
-import faiss
-import hashlib
-# Load the Hugging Face token from environment variables
 huggingface_token = os.environ.get("Key2")
-# Initialize Sentence Transformer model for better embeddings
-sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
-# Cache PDF extraction
-@st.cache_data
-def extract_text_from_pdf(pdf_file):
-    pdf_reader = PdfReader(pdf_file)
-    text_data = []
-    for page_num, page in enumerate(pdf_reader.pages):
-        text = page.extract_text()
-        text = re.sub(r'\s+', ' ', text)  # Clean extra whitespace
-        text_data.append({
-            "page": page_num + 1,
-            "content": text
-        })
-    return text_data
-# Enhanced text chunking with overlap
-def split_text_into_chunks(text, chunk_size=500, overlap=100):
-    words = text.split()
-    chunks = []
-    for i in range(0, len(words), chunk_size - overlap):
-        chunks.append(" ".join(words[i:i + chunk_size]))
-    return chunks
-# Enhanced semantic search using sentence transformers
-def semantic_search(query, chunks, threshold=0.3):
-    query_embedding = sentence_model.encode([query])
-    chunk_embeddings = sentence_model.encode(chunks)
-    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
-    results = [(chunks[i], similarities[i]) for i in np.argsort(similarities)[::-1]]
-    return [res for res in results if res[1] > threshold]
-# Improved query translation with error handling
 def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
     model_name = "HuggingFaceH4/zephyr-7b-alpha"
     api_url = f"https://api-inference.huggingface.co/models/{model_name}"
@@ -61,113 +44,108 @@ def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=
             "top_k": top_k,
         },
     }
-    try:
-        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
-        if response.status_code == 200:
-            return response.json()[0]["generated_text"]
-        else:
-            st.error(f"API Error: {response.status_code}")
-            return None
-    except Exception as e:
-        st.error(f"Connection Error: {str(e)}")
         return None
-# Enhanced indexing strategies
-def create_index(text_chunks, method="Multi-Representation"):
-    if method == "Multi-Representation":
-        return TfidfVectorizer().fit_transform(text_chunks)
-    elif method == "Raptors":
-        embeddings = sentence_model.encode(text_chunks)
-        index = faiss.IndexFlatL2(embeddings.shape[1])
-        index.add(embeddings)
-        return index
-    elif method == "ColBERT":
-        return sentence_model.encode(text_chunks)
-# Improved similarity search with multiple methods
-def similarity_search(query, chunks, method="Cosine", index=None, k=5):
-    if method == "Cosine":
-        return semantic_search(query, chunks)
-    elif method == "KNN":
-        if isinstance(index, faiss.IndexFlatL2):
-            query_embedding = sentence_model.encode([query])
-            distances, indices = index.search(query_embedding, k)
-            return [(chunks[i], 1 - distances[0][j]) for j, i in enumerate(indices[0])]
-    return []
-DEFAULT_SYSTEM_PROMPTS = {
-    "Multi-Query": """You are an AI language model assistant. Your task is to generate five
-different versions of the given user question to retrieve relevant documents from a vector
-database. By generating multiple perspectives on the user question, your goal is to help
-the user overcome some of the limitations of the distance-based similarity search.
-Provide these alternative questions separated by newlines. Original question: {question}""",
-    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple
-queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
-    "Decomposition": """You are an AI language model assistant. Your task is to break down
-the given user question into simpler sub-questions. Provide these sub-questions separated
-by newlines. Original question: {question}""",
-    "Step Back": """You are an AI language model assistant. Your task is to refine the given
-user question by taking a step back and asking a more general question. Original question: {question}""",
-    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical
-document that would be relevant to the given user question. Original question: {question}""",
-}
 # Streamlit App
 def main():
-    st.title("Enhanced RAG Model with Advanced Features")
-    # Sidebar configurations
-    st.sidebar.title("Configuration")
-    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
-    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
-    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
-    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
-    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)
-    # Main interface
     prompt = st.text_input("Enter your query:")
-    if prompt:
-        with st.spinner("Processing..."):
-            # Query Translation
-            translated_prompt = query_huggingface_model(
-                DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
-            )
-            if pdf_file:
-                # Process PDF
-                text_data = extract_text_from_pdf(pdf_file)
-                full_text = " ".join([p["content"] for p in text_data])
-                chunks = split_text_into_chunks(full_text)
-                # Create index
-                index = create_index(chunks, indexing_method)
-                # Perform search
-                if query_translation == "HyDE":
-                    hypothetical_answer = translated_prompt
-                    results = semantic_search(hypothetical_answer, chunks, similarity_threshold)
-                else:
-                    results = similarity_search(prompt, chunks, similarity_method, index)
-                # Display results
-                if results:
-                    st.subheader("Top Results:")
-                    for i, (chunk, score) in enumerate(results[:3]):
-                        st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
-                        st.write(chunk)
-                    # Generate response
-                    context = "\n".join([chunk for chunk, _ in results[:3]])
-                    response = query_huggingface_model(
-                        f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
-                    )
-                    st.subheader("Generated Response:")
-                    st.write(response)
-                else:
-                    st.warning("No relevant documents found matching the query.")
-            else:
-                st.error("Please upload a PDF document first.")
 if __name__ == "__main__":
-    main()

 import streamlit as st
 import os
 import requests
+import faiss
 import numpy as np
+from pdfminer.high_level import extract_text
 from sentence_transformers import SentenceTransformer
+from langdetect import detect
+# Load the Hugging Face token
 huggingface_token = os.environ.get("Key2")
+# Load Sentence Transformer Model
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+# Default system prompts for each query translation method.
+# Every value is a single logical line: a trailing backslash joins the next
+# source line without inserting any character, so each continued line must end
+# with a space before the backslash to keep the neighbouring words apart.
+# "{question}" is filled in later with str.format(question=...).
+DEFAULT_SYSTEM_PROMPTS = {
+    "Multi-Query": """You are an AI language model assistant. Your task is to generate five \
+different versions of the given user question to retrieve relevant documents from a vector \
+database. By generating multiple perspectives on the user question, your goal is to help \
+the user overcome some of the limitations of the distance-based similarity search. \
+Provide these alternative questions separated by newlines. Original question: {question}""",
+    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple \
+queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
+    "Decomposition": """You are an AI language model assistant. Your task is to break down \
+the given user question into simpler sub-questions. Provide these sub-questions separated \
+by newlines. Original question: {question}""",
+    "Step Back": """You are an AI language model assistant. Your task is to refine the given \
+user question by taking a step back and asking a more general question. Original question: {question}""",
+    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical \
+document that would be relevant to the given user question. Original question: {question}""",
+}
+# Function to query the Hugging Face model
 def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
     model_name = "HuggingFaceH4/zephyr-7b-alpha"
     api_url = f"https://api-inference.huggingface.co/models/{model_name}"
             "top_k": top_k,
         },
     }
+    response = requests.post(api_url, headers=headers, json=payload)
+    if response.status_code == 200:
+        return response.json()[0]["generated_text"]
+    else:
+        st.error(f"Error: {response.status_code} - {response.text}")
         return None
+# Function to detect language
+def detect_language(text):
+    """Return the language code that ``detect`` reports for *text*.
+
+    Detection is best-effort: if ``detect`` raises, the function falls back
+    to "en" so the caller always receives a string.
+    """
+    try:
+        return detect(text)
+    except Exception:
+        # Previously a bare ``except:``, which also swallowed
+        # KeyboardInterrupt and SystemExit; only ordinary errors should
+        # trigger the English fallback.
+        return "en"
+# Extract text from a PDF as a list of lines
+def extract_text_from_pdf(pdf_file):
+    """Return the text extracted from *pdf_file* as a list of lines.
+
+    The whole document is extracted in one call and then split on newline
+    characters; no page or line numbers are attached to the result.
+    """
+    raw_text = extract_text(pdf_file)
+    lines = raw_text.split("\n")
+    return lines
+# Chunk text into smaller segments
+def split_text_into_chunks(text_lines, chunk_size=500):
+    """Group the words of *text_lines* into chunks of *chunk_size* words.
+
+    The lines are joined with single spaces and re-split on whitespace, so
+    blank lines and repeated spaces disappear. Chunks do not overlap; the
+    last one may hold fewer than *chunk_size* words.
+    """
+    tokens = " ".join(text_lines).split()
+    chunks = []
+    for start in range(0, len(tokens), chunk_size):
+        window = tokens[start:start + chunk_size]
+        chunks.append(" ".join(window))
+    return chunks
+# Build FAISS Index
+def build_faiss_index(embeddings):
+    """Create a flat L2 FAISS index and load *embeddings* into it.
+
+    The vector dimension is taken from the second axis of *embeddings*, so
+    the argument must be two-dimensional (one row per vector).
+    NOTE(review): presumably the rows must be float32 as FAISS expects --
+    confirm against the encoder output.
+    """
+    vector_size = embeddings.shape[1]
+    faiss_index = faiss.IndexFlatL2(vector_size)
+    faiss_index.add(embeddings)
+    return faiss_index
+# Search in FAISS Index
+def search_faiss_index(query_embedding, index, top_k=5):
+    """Return the neighbour ids and distances for the first query row.
+
+    Args:
+        query_embedding: 2-D array of query vectors; only the results for
+            row 0 are returned.
+        index: object exposing ``search(queries, k)`` that returns
+            ``(distances, indices)`` arrays, as a FAISS index does.
+        top_k: maximum number of neighbours to request.
+
+    Returns:
+        ``(indices, distances)`` for row 0, with placeholder entries removed.
+        Fewer than *top_k* items are returned when the index cannot supply
+        that many neighbours.
+    """
+    distances, indices = index.search(query_embedding, top_k)
+    row_ids, row_distances = indices[0], distances[0]
+    # FAISS fills missing neighbours with id -1; used as a list index that
+    # would silently select the last element, so drop those slots here.
+    found = row_ids >= 0
+    return row_ids[found], row_distances[found]
 # Streamlit App
 def main():
+    """Render the Streamlit UI and answer a query against an uploaded PDF.
+
+    The sidebar collects the PDF, the query-translation method, the
+    similarity-search method and the LLM sampling parameters. Once both a
+    PDF and a query are present, the PDF text is chunked, embedded and
+    indexed with FAISS; the closest chunks are sent as context to the
+    Hugging Face model and its reply is displayed.
+    """
+    st.title("Enhanced RAG Model with FAISS Indexing")
+    # Sidebar for options
+    st.sidebar.header("Upload PDF")
+    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
+    st.sidebar.header("Query Translation")
+    query_translation = st.sidebar.selectbox(
+        "Select Query Translation Method",
+        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
+    )
+    st.sidebar.header("Similarity Search")
+    similarity_method = st.sidebar.selectbox("Select Similarity Search Method", ["Cosine Similarity", "KNN"])
+    # Number of chunks to retrieve. The K slider only exists for KNN; the
+    # other method keeps the previously hard-coded value of 5.
+    k_value = 5
+    if similarity_method == "KNN":
+        k_value = st.sidebar.slider("Select K Value", 1, 10, 5)
+    # LLM Parameters
+    max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 500)
+    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
+    top_k = st.sidebar.slider("Top K", 1, 100, 50)
+    # Input Prompt
     prompt = st.text_input("Enter your query:")
+    if not (pdf_file and prompt):
+        # Nothing to do until both a document and a question are supplied
+        return
+    # Extract text from PDF and chunk it
+    text_lines = extract_text_from_pdf(pdf_file)
+    chunks = split_text_into_chunks(text_lines)
+    if not chunks:
+        # An image-only or empty PDF yields no words; building an index
+        # from zero embeddings would fail, so stop with a clear message.
+        st.warning("No extractable text was found in the uploaded PDF.")
+        return
+    # Detect Language
+    lang = detect_language(" ".join(text_lines))
+    st.write(f"**Detected Language:** {lang}")
+    # Encode chunks
+    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=False)
+    # Build FAISS index
+    index = build_faiss_index(np.array(chunk_embeddings))
+    # Embed the query
+    query_embedding = embedder.encode([prompt], convert_to_tensor=False)
+    # Never request more neighbours than there are chunks: FAISS pads the
+    # missing results with id -1, and chunks[-1] would silently pick the
+    # last chunk instead of failing.
+    neighbours = min(k_value, len(chunks))
+    top_k_indices, _ = search_faiss_index(np.array(query_embedding), index, top_k=neighbours)
+    # Retrieve relevant chunks, ignoring any remaining placeholder ids
+    relevant_chunks = [chunks[i] for i in top_k_indices if i >= 0]
+    # Combine the context
+    context = "\n".join(relevant_chunks)
+    # Format the system prompt
+    formatted_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
+    # Query LLM
+    llm_input = f"{formatted_prompt}\n\nContext: {context}\n\nAnswer this question: {prompt}"
+    response = query_huggingface_model(llm_input, max_new_tokens, temperature, top_k)
+    if response is None:
+        # query_huggingface_model has already reported the failure
+        return
+    # Display the result
+    st.subheader("Response:")
+    st.write(response)
 if __name__ == "__main__":
+    main()