import streamlit as st
import os
import requests
import re
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Load the Hugging Face token from environment variables
huggingface_token = os.environ.get("Key2")

# Initialize Sentence Transformer model for better embeddings
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
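# all-MiniLM-L6-v2 maps text to 384-dimensional vectors and is a common
# speed/quality trade-off for CPU-only semantic search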

# Cache PDF extraction
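# (st.cache_data keys the cache on the function's arguments, so re-running the
# script with the same uploaded file skips re-parsing the PDF)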
@st.cache_data
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        # extract_text() can return None on image-only pages; fall back to ""
        text = page.extract_text() or ""
        text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
        text_data.append({
            "page": page_num + 1,
            "content": text
        })
    return text_data

# Text chunking with overlap so context is not lost at chunk boundaries
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    words = text.split()
    step = max(1, chunk_size - overlap)  # Guard against overlap >= chunk_size
    chunks = []
    for i in range(0, len(words), step):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks
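
# Example with the default parameters: a step of 400 words means a
# 1,000-word document yields chunks starting at words 0, 400 and 800,
# each sharing 100 words of context with its predecessor.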

# Semantic search over chunks using sentence-transformer embeddings
def semantic_search(query, chunks, threshold=0.3):
    # Chunk embeddings are recomputed on every call; caching them would avoid
    # redundant work for repeated queries over the same document
    query_embedding = sentence_model.encode([query])
    chunk_embeddings = sentence_model.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    results = [(chunks[i], float(similarities[i])) for i in np.argsort(similarities)[::-1]]
    return [res for res in results if res[1] > threshold]

# Query the Hugging Face Inference API for text generation (used for both
# query translation and answer generation)
def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    model_name = "HuggingFaceH4/zephyr-7b-alpha"
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {huggingface_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "return_full_text": False,  # Return only the completion, not the echoed prompt
        },
    }
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
        if response.status_code == 200:
            return response.json()[0]["generated_text"]
        else:
            st.error(f"API Error: {response.status_code}")
            return None
    except Exception as e:
        st.error(f"Connection Error: {str(e)}")
        return None
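
# For text-generation models the inference API returns a JSON list of
# {"generated_text": ...} candidates; only the first one is used above.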

# Build a retrieval index; the option names mirror the UI and are simplified
# stand-ins for the techniques they are named after
def create_index(text_chunks, method="Multi-Representation"):
    if method == "Multi-Representation":
        return TfidfVectorizer().fit_transform(text_chunks)
    elif method == "Raptors":
        # FAISS expects float32 input
        embeddings = np.asarray(sentence_model.encode(text_chunks), dtype="float32")
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        return index
    elif method == "ColBERT":
        # Plain single-vector embeddings, not true ColBERT late interaction
        return sentence_model.encode(text_chunks)
    return None
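
# IndexFlatL2 performs exact (brute-force) nearest-neighbour search, which is
# fine at this scale; approximate indexes such as IVF or HNSW only pay off on
# much larger corpora.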

# Similarity search dispatch: cosine over embeddings, or KNN via the FAISS index
def similarity_search(query, chunks, method="Cosine", index=None, k=5, threshold=0.3):
    if method == "Cosine":
        return semantic_search(query, chunks, threshold)
    elif method == "KNN":
        # Only works with the FAISS index built by the "Raptors" option;
        # other index types fail this check and yield no results
        if isinstance(index, faiss.IndexFlatL2):
            query_embedding = np.asarray(sentence_model.encode([query]), dtype="float32")
            distances, indices = index.search(query_embedding, k)
            # Map L2 distance to a (0, 1] score; unlike 1 - distance,
            # this cannot go negative for far-away chunks
            return [(chunks[i], 1.0 / (1.0 + distances[0][j])) for j, i in enumerate(indices[0])]
    return []

DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple 
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down 
the given user question into simpler sub-questions. Provide these sub-questions separated 
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given 
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical 
document that would be relevant to the given user question. Original question: {question}""",
}
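
# These templates implement common RAG query-translation patterns: multi-query
# expansion, fusion, decomposition, step-back prompting, and hypothetical
# document embeddings (HyDE).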

# Streamlit App
def main():
    st.title("Enhanced RAG Model with Advanced Features")
    
    # Sidebar configurations
    st.sidebar.title("Configuration")
    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)
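    # Note: the "KNN" search option expects the FAISS index built by the
    # "Raptors" indexing method; other combinations return no KNN results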
    
    # Main interface
    prompt = st.text_input("Enter your query:")
    
    if prompt:
        with st.spinner("Processing..."):
            # Query Translation
            translated_prompt = query_huggingface_model(
                DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
            )
            
            if pdf_file:
                # Process PDF
                text_data = extract_text_from_pdf(pdf_file)
                full_text = " ".join([p["content"] for p in text_data])
                chunks = split_text_into_chunks(full_text)
                
                # Create index
                index = create_index(chunks, indexing_method)
                
                # Perform search; HyDE searches with the model-generated
                # hypothetical answer instead of the raw query
                if query_translation == "HyDE" and translated_prompt:
                    results = semantic_search(translated_prompt, chunks, similarity_threshold)
                else:
                    results = similarity_search(
                        prompt, chunks, similarity_method, index,
                        threshold=similarity_threshold,
                    )
                
                # Display results
                if results:
                    st.subheader("Top Results:")
                    for i, (chunk, score) in enumerate(results[:3]):
                        st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
                        st.write(chunk)
                    
                    # Generate a grounded answer from the top retrieved chunks
                    context = "\n".join([chunk for chunk, _ in results[:3]])
                    response = query_huggingface_model(
                        f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
                    )
                    if response:
                        st.subheader("Generated Response:")
                        st.write(response)
                else:
                    st.warning("No relevant documents found matching the query.")
            else:
                st.error("Please upload a PDF document first.")

if __name__ == "__main__":
    main()