bainskarman committed on
Commit
911335e
·
verified ·
1 Parent(s): 751f053

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -144
app.py CHANGED
@@ -1,54 +1,37 @@
1
  import streamlit as st
2
  import os
3
  import requests
4
- import re
5
- from langdetect import detect
6
- from PyPDF2 import PdfReader
7
- from sklearn.feature_extraction.text import TfidfVectorizer
8
- from sklearn.metrics.pairwise import cosine_similarity
9
- from sklearn.neighbors import NearestNeighbors
10
  import numpy as np
 
11
  from sentence_transformers import SentenceTransformer
12
- import faiss
13
- import hashlib
14
 
15
- # Load the Hugging Face token from environment variables
16
  huggingface_token = os.environ.get("Key2")
17
 
18
- # Initialize Sentence Transformer model for better embeddings
19
- sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
20
 
21
- # Cache PDF extraction
22
@st.cache_data
def extract_text_from_pdf(pdf_file):
    """Read every page of a PDF and return its whitespace-normalised text.

    Args:
        pdf_file: The PDF source handed directly to ``PdfReader`` (the app
            passes the Streamlit upload object).

    Returns:
        A list with one ``{"page": int, "content": str}`` dict per page;
        page numbers start at 1 and runs of whitespace in the content are
        collapsed to a single space.
    """
    pages = PdfReader(pdf_file).pages
    return [
        {
            "page": number,
            "content": re.sub(r'\s+', ' ', page.extract_text()),
        }
        for number, page in enumerate(pages, start=1)
    ]
34
-
35
- # Enhanced text chunking with overlap
36
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-based chunks.

    Args:
        text: The text to split; it is tokenised on whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is out of
            range (a non-positive stride would otherwise either crash in
            ``range`` or silently return no chunks).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any later window would
        # only repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
42
-
43
- # Enhanced semantic search using sentence transformers
44
def semantic_search(query, chunks, threshold=0.3):
    """Rank *chunks* by cosine similarity to *query* using the sentence model.

    Args:
        query: The search text to embed.
        chunks: List of text chunks to score.
        threshold: Only chunks whose similarity is strictly greater than this
            value are returned.

    Returns:
        A list of ``(chunk, similarity)`` tuples sorted from most to least
        similar. Returns an empty list when there are no chunks.
    """
    # Nothing to score: avoid handing an empty matrix to cosine_similarity.
    if not chunks:
        return []

    query_embedding = sentence_model.encode([query])
    chunk_embeddings = sentence_model.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]

    # argsort is ascending, so reverse it to get best matches first.
    ranked = np.argsort(similarities)[::-1]
    return [
        (chunks[i], similarities[i])
        for i in ranked
        if similarities[i] > threshold
    ]
50
-
51
- # Improved query translation with error handling
52
  def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
53
  model_name = "HuggingFaceH4/zephyr-7b-alpha"
54
  api_url = f"https://api-inference.huggingface.co/models/{model_name}"
@@ -61,113 +44,108 @@ def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=
61
  "top_k": top_k,
62
  },
63
  }
64
- try:
65
- response = requests.post(api_url, headers=headers, json=payload, timeout=30)
66
- if response.status_code == 200:
67
- return response.json()[0]["generated_text"]
68
- else:
69
- st.error(f"API Error: {response.status_code}")
70
- return None
71
- except Exception as e:
72
- st.error(f"Connection Error: {str(e)}")
73
  return None
74
 
75
- # Enhanced indexing strategies
76
def create_index(text_chunks, method="Multi-Representation"):
    """Build a retrieval index over *text_chunks* using the selected method.

    Args:
        text_chunks: List of text chunks to index.
        method: One of ``"Multi-Representation"``, ``"Raptors"`` or
            ``"ColBERT"``.

    Returns:
        * ``"Multi-Representation"``: the TF-IDF matrix produced by
          ``TfidfVectorizer().fit_transform``.
        * ``"Raptors"``: a ``faiss.IndexFlatL2`` holding the sentence-model
          embeddings of the chunks.
        * ``"ColBERT"``: the sentence-model embeddings themselves.
        * any other value: ``None``.
    """
    if method == "Multi-Representation":
        vectorizer = TfidfVectorizer()
        return vectorizer.fit_transform(text_chunks)

    if method == "ColBERT":
        return sentence_model.encode(text_chunks)

    if method != "Raptors":
        return None

    vectors = sentence_model.encode(text_chunks)
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index
86
-
87
- # Improved similarity search with multiple methods
88
def similarity_search(query, chunks, method="Cosine", index=None, k=5):
    """Retrieve chunks relevant to *query* with the chosen search method.

    Args:
        query: The search text.
        chunks: List of text chunks that were indexed.
        method: ``"Cosine"`` delegates to ``semantic_search``; ``"KNN"``
            queries a FAISS index.
        index: Index object for the ``"KNN"`` method; only a
            ``faiss.IndexFlatL2`` is searched.
        k: Number of neighbours requested from the FAISS index.

    Returns:
        A list of ``(chunk, score)`` tuples. For ``"KNN"`` the score is
        ``1 - distance`` as reported by the index. Returns an empty list when
        the method is unknown or the index is not a ``faiss.IndexFlatL2``.
    """
    if method == "Cosine":
        return semantic_search(query, chunks)

    if method == "KNN" and isinstance(index, faiss.IndexFlatL2):
        query_embedding = sentence_model.encode([query])
        distances, indices = index.search(query_embedding, k)
        # FAISS pads the result with index -1 when fewer than k vectors are
        # stored; skip those so chunks[-1] is never returned by accident.
        return [
            (chunks[i], 1 - distance)
            for distance, i in zip(distances[0], indices[0])
            if i >= 0
        ]

    return []
97
-
98
# Prompt templates keyed by query-translation method. Each template takes a
# single ``{question}`` placeholder. The text is assembled from adjacent string
# literals so the prompt sent to the model does not depend on how the source
# lines happen to be wrapped or indented.
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": (
        "You are an AI language model assistant. Your task is to generate five "
        "different versions of the given user question to retrieve relevant documents from a vector "
        "database. By generating multiple perspectives on the user question, your goal is to help "
        "the user overcome some of the limitations of the distance-based similarity search. "
        "Provide these alternative questions separated by newlines. Original question: {question}"
    ),
    "RAG Fusion": (
        "You are an AI language model assistant. Your task is to combine multiple "
        "queries into a single, refined query to improve retrieval accuracy. Original question: {question}"
    ),
    "Decomposition": (
        "You are an AI language model assistant. Your task is to break down "
        "the given user question into simpler sub-questions. Provide these sub-questions separated "
        "by newlines. Original question: {question}"
    ),
    "Step Back": (
        "You are an AI language model assistant. Your task is to refine the given "
        "user question by taking a step back and asking a more general question. Original question: {question}"
    ),
    "HyDE": (
        "You are an AI language model assistant. Your task is to generate a hypothetical "
        "document that would be relevant to the given user question. Original question: {question}"
    ),
}
114
 
115
  # Streamlit App
116
def main():
    """Render the Streamlit RAG interface and answer a query over an uploaded PDF.

    The sidebar collects the PDF, the query-translation method, the indexing
    method, the similarity-search method and a similarity threshold. Once a
    query and a PDF are both present, the PDF is chunked and indexed, the most
    relevant chunks are retrieved, the top three are shown, and they are sent
    as context to the hosted language model whose answer is displayed.
    """
    st.title("Enhanced RAG Model with Advanced Features")

    # Sidebar configurations
    st.sidebar.title("Configuration")
    pdf_file = st.sidebar.file_uploader("Upload PDF", type="pdf")
    query_translation = st.sidebar.selectbox("Query Translation", list(DEFAULT_SYSTEM_PROMPTS.keys()))
    indexing_method = st.sidebar.selectbox("Indexing Method", ["Multi-Representation", "Raptors", "ColBERT"])
    similarity_method = st.sidebar.selectbox("Similarity Search", ["Cosine", "KNN"])
    similarity_threshold = st.sidebar.slider("Similarity Threshold", 0.0, 1.0, 0.3)

    # Main interface
    prompt = st.text_input("Enter your query:")
    if not prompt:
        return

    # Check for the document before spending a remote model call on the query.
    if not pdf_file:
        st.error("Please upload a PDF document first.")
        return

    with st.spinner("Processing..."):
        # Process PDF
        text_data = extract_text_from_pdf(pdf_file)
        full_text = " ".join(p["content"] for p in text_data)
        chunks = split_text_into_chunks(full_text)
        if not chunks:
            st.warning("No relevant documents found matching the query.")
            return

        # Create index
        index = create_index(chunks, indexing_method)

        # Query translation: the translated text is only consumed by the HyDE
        # path, so the model is only called when HyDE is selected. If the call
        # fails (returns None) fall back to the user's own query.
        search_query = prompt
        if query_translation == "HyDE":
            hypothetical_answer = query_huggingface_model(
                DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)
            )
            if hypothetical_answer:
                search_query = hypothetical_answer

        # Perform search. The Cosine path goes straight to semantic_search so
        # the sidebar threshold is honoured instead of the function default.
        if query_translation == "HyDE" or similarity_method == "Cosine":
            results = semantic_search(search_query, chunks, similarity_threshold)
        else:
            results = similarity_search(prompt, chunks, similarity_method, index)

        if not results:
            st.warning("No relevant documents found matching the query.")
            return

        # Display results
        top_results = results[:3]
        st.subheader("Top Results:")
        for i, (chunk, score) in enumerate(top_results):
            st.markdown(f"**Result {i+1}** (Score: {score:.2f}):")
            st.write(chunk)

        # Generate response
        context = "\n".join(chunk for chunk, _ in top_results)
        response = query_huggingface_model(
            f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
        )
        # query_huggingface_model reports its own errors and returns None, so
        # only render the answer section when there is an answer.
        if response is not None:
            st.subheader("Generated Response:")
            st.write(response)
171
 
172
  if __name__ == "__main__":
173
- main()
 
1
  import streamlit as st
2
  import os
3
  import requests
4
+ import faiss
 
 
 
 
 
5
  import numpy as np
6
+ from pdfminer.high_level import extract_text
7
  from sentence_transformers import SentenceTransformer
8
+ from langdetect import detect
 
9
 
10
+ # Load the Hugging Face token
11
  huggingface_token = os.environ.get("Key2")
12
 
13
+ # Load Sentence Transformer Model
14
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
15
 
16
+ # Default system prompts for each query translation method
17
# Prompt templates keyed by query-translation method. Each template takes a
# single ``{question}`` placeholder. Adjacent string literals are used instead
# of backslash continuations so every line break in the source is an explicit,
# visible space in the prompt (the continuations previously fused
# "help" + "the user" and "search." + "Provide").
DEFAULT_SYSTEM_PROMPTS = {
    "Multi-Query": (
        "You are an AI language model assistant. Your task is to generate five "
        "different versions of the given user question to retrieve relevant documents from a vector "
        "database. By generating multiple perspectives on the user question, your goal is to help "
        "the user overcome some of the limitations of the distance-based similarity search. "
        "Provide these alternative questions separated by newlines. Original question: {question}"
    ),
    "RAG Fusion": (
        "You are an AI language model assistant. Your task is to combine multiple "
        "queries into a single, refined query to improve retrieval accuracy. Original question: {question}"
    ),
    "Decomposition": (
        "You are an AI language model assistant. Your task is to break down "
        "the given user question into simpler sub-questions. Provide these sub-questions separated "
        "by newlines. Original question: {question}"
    ),
    "Step Back": (
        "You are an AI language model assistant. Your task is to refine the given "
        "user question by taking a step back and asking a more general question. Original question: {question}"
    ),
    "HyDE": (
        "You are an AI language model assistant. Your task is to generate a hypothetical "
        "document that would be relevant to the given user question. Original question: {question}"
    ),
}
33
+
34
+ # Function to query the Hugging Face model
 
 
 
 
 
 
 
 
 
 
 
 
35
  def query_huggingface_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
36
  model_name = "HuggingFaceH4/zephyr-7b-alpha"
37
  api_url = f"https://api-inference.huggingface.co/models/{model_name}"
 
44
  "top_k": top_k,
45
  },
46
  }
47
+ response = requests.post(api_url, headers=headers, json=payload)
48
+ if response.status_code == 200:
49
+ return response.json()[0]["generated_text"]
50
+ else:
51
+ st.error(f"Error: {response.status_code} - {response.text}")
 
 
 
 
52
  return None
53
 
54
+ # Function to detect language
55
def detect_language(text):
    """Return the language code detected for *text*, defaulting to ``"en"``.

    Args:
        text: The text to classify.

    Returns:
        Whatever ``langdetect.detect`` returns for the text, or ``"en"`` if
        detection raises.
    """
    try:
        return detect(text)
    except Exception:
        # Detection is best-effort (it can fail on, e.g., empty text), so fall
        # back to English. Catch Exception rather than using a bare except so
        # KeyboardInterrupt and SystemExit still propagate.
        return "en"
60
+
61
+ # Extract the text of a PDF as a list of lines
62
def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF and return it as a list of lines.

    Args:
        pdf_file: The PDF source handed directly to pdfminer's
            ``extract_text`` (the app passes the Streamlit upload object).

    Returns:
        The extracted text split on ``"\\n"``; no page or line numbers are
        attached.
    """
    return extract_text(pdf_file).split("\n")
65
+
66
+ # Chunk text into smaller segments
67
def split_text_into_chunks(text_lines, chunk_size=500):
    """Group the words of *text_lines* into consecutive, non-overlapping chunks.

    Args:
        text_lines: Iterable of text lines; they are joined with spaces and
            re-tokenised on whitespace.
        chunk_size: Number of words per chunk (the last chunk may be shorter).

    Returns:
        A list of chunk strings with words separated by single spaces.
    """
    tokens = " ".join(text_lines).split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        chunks.append(" ".join(window))
    return chunks
70
+
71
+ # Build FAISS Index
72
def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index over a matrix of embeddings.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of *embeddings*.

    Raises:
        ValueError: If *embeddings* is not two-dimensional (for example an
            empty array produced from a document with no text).
    """
    # FAISS works on contiguous float32 matrices; normalise the input up front
    # so float64 or non-contiguous arrays are accepted as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dimension), got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
77
+
78
+ # Search in FAISS Index
79
def search_faiss_index(query_embedding, index, top_k=5):
    """Return the nearest stored vectors for a single query embedding.

    Args:
        query_embedding: Query matrix passed to ``index.search``; only the
            first row of the result is used.
        index: An object exposing ``search(queries, k)`` that returns
            ``(distances, indices)`` arrays, such as a FAISS index.
        top_k: Number of neighbours to request.

    Returns:
        A ``(indices, distances)`` pair of 1-D arrays for the first query.
        Entries FAISS could not fill are removed, so fewer than *top_k*
        results are returned when the index holds fewer vectors.
    """
    distances, indices = index.search(query_embedding, top_k)
    # FAISS marks missing neighbours with index -1; using that to index a
    # Python list would silently select the last element, so drop them here.
    found = indices[0] >= 0
    return indices[0][found], distances[0][found]
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  # Streamlit App
84
def main():
    """Render the Streamlit RAG interface and answer a query over an uploaded PDF.

    The sidebar collects the PDF, the query-translation prompt, the similarity
    search options and the generation parameters. Once both a PDF and a query
    are present, the PDF text is chunked, embedded and indexed with FAISS; the
    nearest chunks to the query are combined into a context that is sent,
    together with the selected system prompt, to the hosted language model.
    """
    st.title("Enhanced RAG Model with FAISS Indexing")

    # Sidebar for options
    st.sidebar.header("Upload PDF")
    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")

    st.sidebar.header("Query Translation")
    query_translation = st.sidebar.selectbox(
        "Select Query Translation Method",
        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"]
    )

    st.sidebar.header("Similarity Search")
    similarity_method = st.sidebar.selectbox("Select Similarity Search Method", ["Cosine Similarity", "KNN"])
    # NOTE(review): both similarity options search the same L2 index below; the
    # choice only controls whether the neighbour count is configurable.
    k_value = 5
    if similarity_method == "KNN":
        k_value = st.sidebar.slider("Select K Value", 1, 10, 5)

    # LLM Parameters
    max_new_tokens = st.sidebar.slider("Max New Tokens", 10, 1000, 500)
    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7)
    top_k = st.sidebar.slider("Top K", 1, 100, 50)

    # Input Prompt
    prompt = st.text_input("Enter your query:")

    if not (pdf_file and prompt):
        return

    # Extract text from PDF
    text_lines = extract_text_from_pdf(pdf_file)

    # Detect Language
    lang = detect_language(" ".join(text_lines))
    st.write(f"**Detected Language:** {lang}")

    # Chunk the text
    chunks = split_text_into_chunks(text_lines)
    if not chunks:
        # A PDF without extractable text has nothing to embed or index.
        st.warning("No text could be extracted from the uploaded PDF.")
        return

    # Encode chunks and build the FAISS index
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=False)
    index = build_faiss_index(np.array(chunk_embeddings))

    # Embed the query
    query_embedding = embedder.encode([prompt], convert_to_tensor=False)

    # Search for relevant chunks. Use the sidebar K value (previously ignored)
    # and never ask for more neighbours than there are chunks.
    neighbour_count = min(k_value, len(chunks))
    top_k_indices, _ = search_faiss_index(np.array(query_embedding), index, top_k=neighbour_count)

    # Retrieve relevant chunks, skipping the -1 placeholders FAISS uses for
    # neighbours it could not find.
    relevant_chunks = [chunks[i] for i in top_k_indices if i >= 0]

    # Combine the context
    context = "\n".join(relevant_chunks)

    # Format the system prompt
    formatted_prompt = DEFAULT_SYSTEM_PROMPTS[query_translation].format(question=prompt)

    # Query LLM (top_k here is the sampling parameter, not the neighbour count)
    llm_input = f"{formatted_prompt}\n\nContext: {context}\n\nAnswer this question: {prompt}"
    response = query_huggingface_model(llm_input, max_new_tokens, temperature, top_k)

    # Display the result. On failure query_huggingface_model has already shown
    # an error and returned None, so there is nothing further to render.
    if response is not None:
        st.subheader("Response:")
        st.write(response)
 
 
149
 
150
  if __name__ == "__main__":
151
+ main()