File size: 10,423 Bytes
7b666bb
c0a164f
5f45885
d2c0564
5e06280
5f45885
 
 
a0f23a4
 
5f45885
 
a0f23a4
5f45885
 
 
 
 
a0f23a4
 
 
 
 
 
 
 
5f45885
a0f23a4
5f45885
a0f23a4
5f45885
a0f23a4
7b666bb
d2c0564
 
 
 
5f45885
d2c0564
 
a1fd273
5e06280
 
a1fd273
 
5f45885
 
 
 
 
 
 
a1fd273
5e06280
5f45885
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import streamlit as st
import os
import requests
from langdetect import detect
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Read the Hugging Face API token from the HUGGINGFACE_TOKEN environment
# variable. os.environ.get returns None when the variable is not set.
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")  # Set HUGGINGFACE_TOKEN in the environment; do not hard-code the token here

# Function to query the Hugging Face API
def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    """Send ``prompt`` to the Hugging Face Inference API and return the generated text.

    Args:
        prompt: Text sent as the ``inputs`` field of the request payload.
        max_new_tokens: Forwarded as the ``max_new_tokens`` generation parameter.
        temperature: Forwarded as the ``temperature`` generation parameter.
        top_k: Forwarded as the ``top_k`` generation parameter.

    Returns:
        The ``generated_text`` field of the first item in the JSON response,
        or ``None`` when the request fails, the server answers with a non-200
        status, or the payload does not have the expected shape. In every
        failure case an error is shown through ``st.error``.
    """
    model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    headers = {"Authorization": f"Bearer {huggingface_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }

    # Without a timeout requests waits indefinitely, which would freeze the
    # Streamlit script run if the endpoint never answers.
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=120)
    except requests.exceptions.RequestException as exc:
        st.error(f"Error: request to the Hugging Face API failed - {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Error: {response.status_code} - {response.text}")
        return None

    # A 200 answer is expected to be a list of dicts; guard against anything
    # else (invalid JSON, empty list, missing key) instead of crashing.
    try:
        return response.json()[0]["generated_text"]
    except (ValueError, LookupError, TypeError):
        st.error(f"Error: unexpected response format - {response.text}")
        return None

# Function to detect language
def detect_language(text):
    """Return the language code detected for ``text``, defaulting to ``"en"``.

    Args:
        text: The string handed to ``detect``.

    Returns:
        Whatever ``detect(text)`` returns, or ``"en"`` when detection raises.
    """
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate normally.
        return "en"  # Default to English if detection fails

# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    """Read a PDF and return one record per line of extracted text.

    Args:
        pdf_file: Object passed straight to ``PdfReader``.

    Returns:
        A list of dicts, in page order then line order, each holding the
        1-based ``"page"`` number, the 1-based ``"line"`` number within that
        page, and the line's ``"content"``.
    """
    reader = PdfReader(pdf_file)
    records = []
    for page_no, pdf_page in enumerate(reader.pages, start=1):
        page_lines = pdf_page.extract_text().split('\n')
        records.extend(
            {"page": page_no, "line": line_no, "content": line_text}
            for line_no, line_text in enumerate(page_lines, start=1)
        )
    return records

# Function to search for query in PDF content
def search_pdf_content(pdf_text_data, query):
    """Return the entries whose content contains ``query``, ignoring case.

    Args:
        pdf_text_data: Iterable of dicts that each have a ``"content"`` string.
        query: Substring to look for.

    Returns:
        A list of the matching entries (the same dict objects), in their
        original order.
    """
    # Lower-case the query once instead of once per entry.
    needle = query.lower()
    return [entry for entry in pdf_text_data if needle in entry["content"].lower()]

# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=500):
    """Break ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are obtained by splitting on whitespace and re-joined with single
    spaces, so the original spacing and line breaks are not preserved.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per piece.

    Returns:
        A list of strings; the last piece may hold fewer words than the rest.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Function to compute cosine similarity between query and document chunks
def compute_cosine_similarity(query, chunks):
    """Score each chunk against ``query`` using TF-IDF cosine similarity.

    The query and all chunks are vectorized together with one
    ``TfidfVectorizer``, and the query row is then compared to every chunk row.

    Args:
        query: The query string.
        chunks: The chunk strings to score.

    Returns:
        A 1-D array with one similarity per chunk, in chunk order. An empty
        array is returned when ``chunks`` is empty.
    """
    chunks = list(chunks)
    if not chunks:
        # cosine_similarity rejects a matrix with zero rows, so answer the
        # "nothing to compare" case directly.
        return np.zeros(0)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
    # Row 0 is the query; the remaining rows are the chunks.
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

# Function to find KNN-based similar documents
def find_knn_similar_documents(query, chunks, k=5):
    """Find the chunks nearest to ``query`` by cosine distance over TF-IDF vectors.

    Args:
        query: The query string.
        chunks: The chunk strings to search.
        k: Requested number of neighbours. It is capped at the number of
            chunks, because asking for more neighbours than there are fitted
            samples makes ``kneighbors`` raise.

    Returns:
        A ``(indices, distances)`` pair of 1-D arrays: the 0-based positions of
        the selected chunks and their cosine distances to the query. Both
        arrays are empty when ``chunks`` is empty.
    """
    chunks = list(chunks)
    if not chunks:
        return np.array([], dtype=int), np.array([], dtype=float)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([query] + chunks)
    # Row 0 is the query; only the chunk rows are indexed for the search.
    knn = NearestNeighbors(n_neighbors=min(k, len(chunks)), metric="cosine")
    knn.fit(tfidf_matrix[1:])
    distances, indices = knn.kneighbors(tfidf_matrix[0:1])
    return indices.flatten(), distances.flatten()

# Default system prompts for each query translation method.
# Keys match the options of the "Select Query Translation Method" selectbox in
# main(). Each template contains a {question} placeholder that main() fills in
# with str.format(question=...), so any other literal braces added to a
# template would have to be doubled.
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}""",
    "RAG Fusion": """You are an AI language model assistant. Your task is to combine multiple 
queries into a single, refined query to improve retrieval accuracy. Original question: {question}""",
    "Decomposition": """You are an AI language model assistant. Your task is to break down 
the given user question into simpler sub-questions. Provide these sub-questions separated 
by newlines. Original question: {question}""",
    "Step Back": """You are an AI language model assistant. Your task is to refine the given 
user question by taking a step back and asking a more general question. Original question: {question}""",
    "HyDE": """You are an AI language model assistant. Your task is to generate a hypothetical 
document that would be relevant to the given user question. Original question: {question}""",
}

# Streamlit App
def main():
    """Render the Streamlit page and handle its three actions.

    The sidebar collects an optional PDF upload, the query translation method,
    the indexing method, the similarity search method (plus K for KNN), the
    LLM generation parameters and an editable system prompt. Once a prompt is
    entered, three buttons are offered:

    * "Apply Query Translation" formats the system prompt with the user's
      question and sends it to the model.
    * "Apply Indexing" searches the uploaded PDF for lines containing the
      prompt, chunks the matches and scores the chunks against the prompt.
    * "Generate Response" asks the model to answer the prompt, using matching
      PDF lines as context when there are any.
    """
    st.title("RAG Model with Advanced Query Translation and Indexing")
    st.write("Enter a prompt and get a response from the model.")

    # Sidebar for options
    st.sidebar.title("Options")

    # PDF Upload
    st.sidebar.header("Upload PDF")
    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")

    # Query Translation Options
    st.sidebar.header("Query Translation")
    query_translation = st.sidebar.selectbox(
        "Select Query Translation Method",
        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
    )

    # Indexing Options
    st.sidebar.header("Indexing")
    indexing_method = st.sidebar.selectbox(
        "Select Indexing Method",
        ["Multi-Representation", "Raptors", "ColBERT"]
    )

    # Similarity Search Options
    st.sidebar.header("Similarity Search")
    similarity_method = st.sidebar.selectbox(
        "Select Similarity Search Method",
        ["Cosine Similarity", "KNN"]
    )
    # k_value is only defined (and only read below) when KNN is selected.
    if similarity_method == "KNN":
        k_value = st.sidebar.slider("Select K Value", 1, 10, 5)

    # LLM Parameters
    st.sidebar.header("LLM Parameters")
    max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 1000)
    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
    top_k = st.sidebar.slider("Top K", 1, 100, 50)

    # System Prompt
    st.sidebar.header("System Prompt")
    default_system_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation]
    system_prompt = st.sidebar.text_area("System Prompt", default_system_prompt)

    # Main Content
    st.header("Input Prompt")
    prompt = st.text_input("Enter your prompt:")
    if prompt:
        st.write("**Prompt:**", prompt)

        # Detect Language
        language = detect_language(prompt)
        st.write(f"**Detected Language:** {language}")

        # Query Translation
        if st.button("Apply Query Translation"):
            st.write(f"**Applied Query Translation Method:** {query_translation}")
            # Format the system prompt with the user's question. The template
            # is user-editable, so stray or unknown braces must not crash the app.
            try:
                formatted_prompt = system_prompt.format(question=prompt)
            except (KeyError, IndexError, ValueError) as exc:
                st.error(
                    f"Invalid system prompt template ({exc!r}). Use {{question}} as the "
                    "only placeholder and double any other literal braces."
                )
                formatted_prompt = None

            if formatted_prompt is not None:
                st.write("**Formatted System Prompt:**", formatted_prompt)

                # Query the Hugging Face model for query translation
                translated_queries = query_huggingface_model(formatted_prompt, max_new_tokens, temperature, top_k)
                if translated_queries:
                    st.write("**Translated Queries:**")
                    st.write(translated_queries.split("\n")[-1])  # Print only the updated question part

        # Indexing
        if st.button("Apply Indexing"):
            # The selected indexing method is only displayed; the steps below
            # are the same for every choice.
            st.write(f"**Applied Indexing Method:** {indexing_method}")
            if pdf_file is not None:
                # Extract and search PDF content
                pdf_text_data = extract_text_from_pdf(pdf_file)
                search_results = search_pdf_content(pdf_text_data, prompt)

                if search_results:
                    st.write("**Relevant Content from PDF:**")
                    for result in search_results:
                        st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")

                    # Split text into chunks
                    chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
                    st.write("**Chunks Obtained from PDF:**")
                    for i, chunk in enumerate(chunks):
                        st.write(f"**Chunk {i + 1}:** {chunk}")

                    # Perform similarity search
                    if similarity_method == "Cosine Similarity":
                        st.write("**Cosine Similarity Results:**")
                        cosine_similarities = compute_cosine_similarity(prompt, chunks)
                        for i, similarity in enumerate(cosine_similarities):
                            st.write(f"**Chunk {i + 1} Similarity:** {similarity:.4f}")
                    elif similarity_method == "KNN":
                        st.write(f"**KNN Results (k={k_value}):**")
                        indices, distances = find_knn_similar_documents(prompt, chunks, k_value)
                        for index, distance in zip(indices, distances):
                            st.write(f"**Chunk {index + 1} Distance:** {distance:.4f}")
                else:
                    st.write("**No relevant content found in the PDF.**")
            else:
                st.write("**No PDF uploaded.**")

        # Generate Response
        if st.button("Generate Response"):
            if pdf_file is not None:
                # Extract and search PDF content
                pdf_text_data = extract_text_from_pdf(pdf_file)
                search_results = search_pdf_content(pdf_text_data, prompt)

                if search_results:
                    st.write("**Relevant Content from PDF:**")
                    for result in search_results:
                        st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")

                    # Generate response based on PDF content
                    pdf_context = "\n".join([result["content"] for result in search_results])
                    response = query_huggingface_model(f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}", max_new_tokens, temperature, top_k)
                else:
                    st.write("**No relevant content found in the PDF. Generating response without PDF context.**")
                    response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)
            else:
                st.write("**No PDF uploaded. Generating response without PDF context.**")
                response = query_huggingface_model(prompt, max_new_tokens, temperature, top_k)

            if response:
                st.write("**Response:**", response)

# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()