Spaces: Running
Update app.py
app.py CHANGED
@@ -8,18 +8,33 @@ from typing import List, Dict, Any
 import time
 import requests
 import re
+import math
+from collections import defaultdict, Counter
 
 # Configure device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 
+class HybridSearchRAGBot:
     def __init__(self):
         self.embedder = None
         self.knowledge_base = []
         self.embeddings = []
+
+        # BM25 components
+        self.term_frequencies = {}    # doc_id -> {term: frequency}
+        self.document_frequency = {}  # term -> number of docs containing term
+        self.document_lengths = {}    # doc_id -> document length
+        self.average_doc_length = 0
+        self.total_documents = 0
+
+        # BM25 parameters
+        self.k1 = 1.5  # Controls term frequency saturation
+        self.b = 0.75  # Controls document length normalization
+
         self.initialize_models()
         self.load_markdown_knowledge_base()
+        self.build_bm25_index()
 
     def initialize_models(self):
         """Initialize the embedding model"""
@@ -80,6 +95,7 @@ class RAGtimBot:
         # Fallback to zero embedding
         self.embeddings.append(np.zeros(384))
 
+        self.total_documents = len(self.knowledge_base)
         print(f"✅ Knowledge base loaded with {len(self.knowledge_base)} documents")
 
     def process_markdown_file(self, content: str, filename: str):
@@ -142,77 +158,252 @@ class RAGtimBot:
 
         return sections
 
+    def tokenize(self, text: str) -> List[str]:
+        """Tokenize text for BM25"""
+        # Convert to lowercase and remove punctuation
+        text = re.sub(r'[^\w\s]', ' ', text.lower())
+        # Split into words and filter out short words and stop words
+        words = [word for word in text.split() if len(word) > 2 and not self.is_stop_word(word)]
+        return words
+
+    def is_stop_word(self, word: str) -> bool:
+        """Check if word is a stop word"""
+        stop_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
+            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
+            'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
+            'from', 'up', 'out', 'down', 'off', 'over', 'under', 'again', 'further', 'then', 'once'
+        }
+        return word in stop_words
+
+
def build_bm25_index(self):
|
180 |
+
"""Build BM25 index for all documents"""
|
181 |
+
print("Building BM25 index...")
|
182 |
+
|
183 |
+
# Reset indexes
|
184 |
+
self.term_frequencies = {}
|
185 |
+
self.document_frequency = defaultdict(int)
|
186 |
+
self.document_lengths = {}
|
187 |
+
|
188 |
+
total_length = 0
|
189 |
+
|
190 |
+
# First pass: calculate term frequencies and document lengths
|
191 |
+
for doc in self.knowledge_base:
|
192 |
+
doc_id = doc['id']
|
193 |
+
terms = self.tokenize(doc['content'])
|
194 |
+
|
195 |
+
# Calculate term frequencies for this document
|
196 |
+
term_freq = Counter(terms)
|
197 |
+
self.term_frequencies[doc_id] = dict(term_freq)
|
198 |
+
|
199 |
+
# Store document length
|
200 |
+
doc_length = len(terms)
|
201 |
+
self.document_lengths[doc_id] = doc_length
|
202 |
+
total_length += doc_length
|
203 |
+
|
204 |
+
# Update document frequencies
|
205 |
+
unique_terms = set(terms)
|
206 |
+
for term in unique_terms:
|
207 |
+
self.document_frequency[term] += 1
|
208 |
+
|
209 |
+
# Calculate average document length
|
210 |
+
self.average_doc_length = total_length / self.total_documents if self.total_documents > 0 else 0
|
211 |
+
|
212 |
+
print(f"β
BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
|
213 |
+
|
214 |
+
def calculate_bm25_score(self, term: str, doc_id: str) -> float:
|
215 |
+
"""Calculate BM25 score for a term in a document"""
|
216 |
+
# Get term frequency in document
|
217 |
+
tf = self.term_frequencies.get(doc_id, {}).get(term, 0)
|
218 |
+
if tf == 0:
|
219 |
+
return 0.0
|
220 |
+
|
221 |
+
# Get document frequency and document length
|
222 |
+
df = self.document_frequency.get(term, 1)
|
223 |
+
doc_length = self.document_lengths.get(doc_id, 0)
|
224 |
+
|
225 |
+
# Calculate IDF: log((N - df + 0.5) / (df + 0.5))
|
226 |
+
idf = math.log((self.total_documents - df + 0.5) / (df + 0.5))
|
227 |
+
|
228 |
+
# Calculate BM25 score
|
229 |
+
numerator = tf * (self.k1 + 1)
|
230 |
+
denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.average_doc_length))
|
231 |
+
|
232 |
+
return idf * (numerator / denominator)
|
233 |
+
|
234 |
+
def bm25_search(self, query: str, top_k: int = 10) -> List[Dict]:
|
235 |
+
"""Perform BM25 search"""
|
236 |
+
query_terms = self.tokenize(query)
|
237 |
+
if not query_terms:
|
238 |
+
return []
|
239 |
+
|
240 |
+
scores = {}
|
241 |
+
|
242 |
+
# Calculate BM25 score for each document
|
243 |
+
for doc in self.knowledge_base:
|
244 |
+
doc_id = doc['id']
|
245 |
+
score = 0.0
|
246 |
+
|
247 |
+
for term in query_terms:
|
248 |
+
score += self.calculate_bm25_score(term, doc_id)
|
249 |
+
|
250 |
+
if score > 0:
|
251 |
+
# Apply priority boost
|
252 |
+
priority_boost = 1 + (doc['metadata']['priority'] / 50)
|
253 |
+
final_score = score * priority_boost
|
254 |
+
|
255 |
+
scores[doc_id] = {
|
256 |
+
'document': doc,
|
257 |
+
'score': final_score,
|
258 |
+
'search_type': 'bm25'
|
259 |
+
}
|
260 |
+
|
261 |
+
# Sort by score and return top_k
|
262 |
+
sorted_results = sorted(scores.values(), key=lambda x: x['score'], reverse=True)
|
263 |
+
return sorted_results[:top_k]
|
264 |
+
|
     def cosine_similarity(self, a, b):
         """Calculate cosine similarity between two vectors"""
         return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
 
+    def vector_search(self, query: str, top_k: int = 10) -> List[Dict]:
+        """Perform vector similarity search"""
         try:
             # Generate query embedding
+            query_embedding = self.embedder(query[:500], return_tensors="pt")  # Truncate query
             query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()
 
             # Calculate similarities
             similarities = []
             for i, doc_embedding in enumerate(self.embeddings):
+                if doc_embedding is not None and len(doc_embedding) > 0:
+                    similarity = self.cosine_similarity(query_vector, doc_embedding)
+
+                    # Apply priority boost
+                    priority_boost = 1 + (self.knowledge_base[i]['metadata']['priority'] / 100)
+                    final_score = similarity * priority_boost
+
+                    similarities.append({
+                        'document': self.knowledge_base[i],
+                        'score': float(final_score),
+                        'search_type': 'vector'
+                    })
 
+            # Sort by similarity and return top_k
+            similarities.sort(key=lambda x: x['score'], reverse=True)
             return similarities[:top_k]
 
         except Exception as e:
+            print(f"Error in vector search: {e}")
+            return []
 
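
One caveat in the loop above: embeddings that fell back to np.zeros(384) earlier still pass the len() guard, and a zero vector drives cosine_similarity into 0/0, so those documents surface with NaN scores instead of being skipped. A defensive variant (a sketch, not what this diff ships):

    def safe_cosine_similarity(self, a, b):
        """Cosine similarity that returns 0.0 for zero-norm vectors instead of NaN"""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom > 0 else 0.0
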
+    def hybrid_search(self, query: str, top_k: int = 10, vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict]:
+        """Perform hybrid search combining vector and BM25 results"""
+        try:
+            # Get results from both search methods
+            vector_results = self.vector_search(query, top_k * 2)  # Get more results for better fusion
+            bm25_results = self.bm25_search(query, top_k * 2)
 
+            # Normalize scores to [0, 1] range
+            if vector_results:
+                max_vector_score = max(r['score'] for r in vector_results)
+                if max_vector_score > 0:
+                    for result in vector_results:
+                        result['normalized_score'] = result['score'] / max_vector_score
+                else:
+                    for result in vector_results:
+                        result['normalized_score'] = 0
 
+            if bm25_results:
+                max_bm25_score = max(r['score'] for r in bm25_results)
+                if max_bm25_score > 0:
+                    for result in bm25_results:
+                        result['normalized_score'] = result['score'] / max_bm25_score
+                else:
+                    for result in bm25_results:
+                        result['normalized_score'] = 0
+
+            # Combine results
+            combined_scores = {}
+
+            # Add vector results
+            for result in vector_results:
+                doc_id = result['document']['id']
+                combined_scores[doc_id] = {
+                    'document': result['document'],
+                    'vector_score': result['normalized_score'],
+                    'bm25_score': 0.0,
+                    'search_type': 'vector'
+                }
+
+            # Add BM25 results
+            for result in bm25_results:
+                doc_id = result['document']['id']
+                if doc_id in combined_scores:
+                    combined_scores[doc_id]['bm25_score'] = result['normalized_score']
+                    combined_scores[doc_id]['search_type'] = 'hybrid'
+                else:
+                    combined_scores[doc_id] = {
+                        'document': result['document'],
+                        'vector_score': 0.0,
+                        'bm25_score': result['normalized_score'],
+                        'search_type': 'bm25'
+                    }
+
+            # Calculate final hybrid scores
+            final_results = []
+            for doc_id, data in combined_scores.items():
+                hybrid_score = (vector_weight * data['vector_score']) + (bm25_weight * data['bm25_score'])
+                final_results.append({
+                    'document': data['document'],
+                    'score': hybrid_score,
+                    'vector_score': data['vector_score'],
+                    'bm25_score': data['bm25_score'],
+                    'search_type': data['search_type']
                 })
+
+            # Sort by hybrid score and return top_k
+            final_results.sort(key=lambda x: x['score'], reverse=True)
+            return final_results[:top_k]
+
+        except Exception as e:
+            print(f"Error in hybrid search: {e}")
+            # Fallback to vector search only
+            return self.vector_search(query, top_k)
+
+    def search_knowledge_base(self, query: str, top_k: int = 5, search_type: str = "hybrid") -> List[Dict]:
+        """Search the knowledge base using specified method"""
+        if search_type == "vector":
+            return self.vector_search(query, top_k)
+        elif search_type == "bm25":
+            return self.bm25_search(query, top_k)
+        else:  # hybrid
+            return self.hybrid_search(query, top_k)
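
The fusion step is easiest to see with toy numbers (made up for illustration). Suppose document A tops the vector list at 0.82 with no BM25 hit, while document B scores 0.41 on vectors but tops BM25 at 6.3. After per-list max-normalization A holds (1.0, 0.0) and B holds (0.5, 1.0), so with the default weights A ends at 0.6 * 1.0 = 0.60 and B at 0.6 * 0.5 + 0.4 * 1.0 = 0.70: the keyword evidence is enough to move B above A.
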
 
 # Initialize the bot
+print("Initializing Hybrid Search RAGtim Bot...")
+bot = HybridSearchRAGBot()
 
+def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_weight=0.4):
+    """API endpoint for hybrid search functionality"""
     try:
+        if search_type == "hybrid":
+            results = bot.hybrid_search(query, top_k, vector_weight, bm25_weight)
+        else:
+            results = bot.search_knowledge_base(query, top_k, search_type)
+
         return {
             "results": results,
             "query": query,
             "top_k": top_k,
+            "search_type": search_type,
+            "total_documents": len(bot.knowledge_base),
+            "search_parameters": {
+                "vector_weight": vector_weight if search_type == "hybrid" else None,
+                "bm25_weight": bm25_weight if search_type == "hybrid" else None,
+                "bm25_k1": bot.k1,
+                "bm25_b": bot.b
+            }
         }
     except Exception as e:
         print(f"Error in search API: {e}")
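
A minimal in-process sketch of exercising search_api once the Space is up (hypothetical query; the field names come from the return dict above):

    resp = search_api("BioFusionNet survival analysis", top_k=3, search_type="hybrid")
    for r in resp["results"]:
        print(f"{r['score']:.3f}", r["search_type"], r["document"]["metadata"]["section"])
    print(resp["search_parameters"])  # e.g. {'vector_weight': 0.6, 'bm25_weight': 0.4, 'bm25_k1': 1.5, 'bm25_b': 0.75}
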
@@ -237,46 +428,70 @@ def get_stats_api():
         "sections_by_file": sections_by_file,
         "model_name": "sentence-transformers/all-MiniLM-L6-v2",
         "embedding_dimension": 384,
+        "search_capabilities": [
+            "Hybrid Search (Vector + BM25)",
+            "Semantic Vector Search",
+            "BM25 Keyword Search",
+            "GPU Accelerated",
+            "Transformer Embeddings"
+        ],
+        "bm25_parameters": {
+            "k1": bot.k1,
+            "b": bot.b,
+            "unique_terms": len(bot.document_frequency),
+            "average_doc_length": bot.average_doc_length
+        },
+        "backend_type": "Hugging Face Space with Hybrid Search",
         "knowledge_sources": list(sections_by_file.keys())
     }
 
 def chat_interface(message, history):
+    """Chat interface with hybrid search"""
     if not message.strip():
+        return "Please ask me something about Raktim Mondol! I use hybrid search combining semantic similarity and keyword matching for the best results."
 
     try:
+        # Use hybrid search by default
+        search_results = bot.hybrid_search(message, top_k=6)
 
         if search_results:
             # Build comprehensive response
             response_parts = []
+            response_parts.append(f"🔍 **Hybrid Search Results** (Vector + BM25 combination, found {len(search_results)} relevant sections):\n")
 
             # Use the best match as primary response
             best_match = search_results[0]
+            response_parts.append(f"**Primary Answer** (Hybrid Score: {best_match['score']:.3f}):")
+            response_parts.append(f"📄 Source: {best_match['document']['metadata']['source']} - {best_match['document']['metadata']['section']}")
+            response_parts.append(f"🔍 Search Type: {best_match['search_type'].upper()}")
+
+            # Show score breakdown for hybrid results
+            if 'vector_score' in best_match and 'bm25_score' in best_match:
+                response_parts.append(f"📊 Vector Score: {best_match['vector_score']:.3f} | BM25 Score: {best_match['bm25_score']:.3f}")
+
+            response_parts.append(f"\n{best_match['document']['content']}\n")
 
             # Add additional context if available
             if len(search_results) > 1:
                 response_parts.append("**Additional Context:**")
                 for i, result in enumerate(search_results[1:3], 1):  # Show up to 2 additional results
+                    section_info = f"{result['document']['metadata']['source']} - {result['document']['metadata']['section']}"
+                    search_info = f"({result['search_type'].upper()}, Score: {result['score']:.3f})"
+                    response_parts.append(f"{i}. {section_info} {search_info}")
+
                     # Add a brief excerpt
+                    excerpt = result['document']['content'][:200] + "..." if len(result['document']['content']) > 200 else result['document']['content']
                     response_parts.append(f" {excerpt}\n")
 
+            response_parts.append("\n🤖 **Hybrid Search Technology:**")
+            response_parts.append("• **Vector Search**: Semantic similarity using transformer embeddings")
+            response_parts.append("• **BM25 Search**: Advanced keyword ranking with TF-IDF")
+            response_parts.append("• **Fusion**: Weighted combination for optimal relevance")
+            response_parts.append("\n[Note: This demonstrates hybrid search results. In production, these would be passed to an LLM for natural response generation.]")
 
             return "\n".join(response_parts)
         else:
+            return "I don't have specific information about that topic in my knowledge base. Could you please ask something else about Raktim Mondol?"
 
     except Exception as e:
         print(f"Error in chat interface: {e}")
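
Calling the handler directly shows the shape of the reply (hypothetical query; history is accepted for the Gradio signature but not used by the handler itself):

    print(chat_interface("What is BioFusionNet?", history=[]))
    # -> a markdown string starting "🔍 **Hybrid Search Results** ...", with the
    #    primary answer, a vector/BM25 score breakdown, and up to two excerpts
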
@@ -290,16 +505,21 @@ css = """
 .gradio-container {
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 }
+.search-type-radio .wrap {
+    display: flex;
+    gap: 10px;
+}
+.search-weights {
+    background: #f0f0f0;
     padding: 10px;
+    border-radius: 5px;
+    margin: 10px 0;
 }
 """
 
+# Create the main chat interface
 with gr.Blocks(
+    title="🔥 Hybrid Search RAGtim Bot",
     css=css,
     theme=gr.themes.Soft(
         primary_hue="green",
@@ -308,43 +528,37 @@ with gr.Blocks(
     )
 ) as chat_demo:
     gr.Markdown(f"""
+    # 🔥 Hybrid Search RAGtim Bot - Advanced Search Technology
+
+    **🚀 Hybrid Search System**: This Space implements **true hybrid search** combining:
+    - 🧠 **Semantic Vector Search**: Transformer embeddings for conceptual similarity
+    - 🔍 **BM25 Keyword Search**: Advanced TF-IDF ranking for exact term matching
+    - ⚖️ **Intelligent Fusion**: Weighted combination for optimal relevance
+
+    **📚 Knowledge Base**: **{len(bot.knowledge_base)} sections** from comprehensive markdown files:
+    - 📄 **about.md** - Personal info, contact, professional summary
+    - 🔬 **research_details.md** - Research projects, methodologies, innovations
+    - 📚 **publications_detailed.md** - Publications with technical details
+    - 💻 **skills_expertise.md** - Technical skills, LLM expertise, tools
+    - 💼 **experience_detailed.md** - Professional experience, teaching
+    - 📊 **statistics.md** - Statistical methods, biostatistics expertise
+
+    **🔧 Search Parameters**:
+    - **BM25 Parameters**: k1={bot.k1}, b={bot.b}
+    - **Vocabulary**: {len(bot.document_frequency)} unique terms
+    - **Average Document Length**: {bot.average_doc_length:.1f} words
+    - **Embedding Model**: sentence-transformers/all-MiniLM-L6-v2 (384-dim)
+
+    **💡 Try Different Search Types**:
+    - **Hybrid** (Recommended): Best of both semantic and keyword search
+    - **Vector**: Pure semantic similarity for conceptual queries
+    - **BM25**: Pure keyword matching for specific terms
+
+    **Ask me anything about Raktim Mondol's research, expertise, and background!**
     """)
 
     chatbot = gr.Chatbot(
+        height=500,
         show_label=False,
         container=True,
         type="messages"
@@ -352,20 +566,20 @@
 
     with gr.Row():
         msg = gr.Textbox(
+            placeholder="Ask about Raktim's research, LLM expertise, publications, statistical methods...",
             container=False,
             scale=7,
             show_label=False
         )
+        submit_btn = gr.Button("🔍 Hybrid Search", scale=1)
 
     # Example buttons
     with gr.Row():
         examples = [
+            "What is Raktim's LLM and RAG research?",
+            "Tell me about BioFusionNet statistical methods",
+            "What are his multimodal AI capabilities?",
+            "Describe his biostatistics expertise"
         ]
         for example in examples:
             gr.Button(example, size="sm").click(
@@ -390,40 +604,99 @@ with gr.Blocks(
     submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
     msg.submit(respond, [msg, chatbot], [chatbot, msg])
 
+# Create advanced search interface
+with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
+    gr.Markdown("# 🔧 Advanced Hybrid Search Configuration")
+    gr.Markdown("Fine-tune the hybrid search parameters and compare different search methods")
 
     with gr.Row():
+        with gr.Column(scale=2):
+            search_input = gr.Textbox(
+                label="Search Query",
+                placeholder="Enter your search query about Raktim Mondol..."
+            )
+
+            with gr.Row():
+                search_type = gr.Radio(
+                    choices=["hybrid", "vector", "bm25"],
+                    value="hybrid",
+                    label="Search Method",
+                    elem_classes=["search-type-radio"]
+                )
+                top_k_slider = gr.Slider(
+                    minimum=1,
+                    maximum=15,
+                    value=5,
+                    step=1,
+                    label="Top K Results"
+                )
+
+            # Hybrid search weights (only shown when hybrid is selected)
+            with gr.Group(visible=True) as weight_group:
+                gr.Markdown("**Hybrid Search Weights**")
+                vector_weight = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.6,
+                    step=0.1,
+                    label="Vector Weight (Semantic)"
+                )
+                bm25_weight = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.4,
+                    step=0.1,
+                    label="BM25 Weight (Keyword)"
+                )
+
+        with gr.Column(scale=1):
+            gr.Markdown("**Search Method Guide:**")
+            gr.Markdown("""
+            **🔥 Hybrid**: Combines semantic + keyword
+            - Best for most queries
+            - Balances meaning and exact terms
+
+            **🧠 Vector**: Pure semantic similarity
+            - Good for conceptual questions
+            - Finds related concepts
+
+            **🔍 BM25**: Pure keyword matching
+            - Good for specific terms
+            - Traditional search ranking
+            """)
+
+    search_output = gr.JSON(label="Hybrid Search Results", height=400)
+    search_btn = gr.Button("🔍 Search with Custom Parameters", variant="primary")
 
+    def update_weights_visibility(search_type):
+        return gr.Group(visible=(search_type == "hybrid"))
+
+    search_type.change(update_weights_visibility, inputs=[search_type], outputs=[weight_group])
+
+    def normalize_weights(vector_w, bm25_w):
+        total = vector_w + bm25_w
+        if total > 0:
+            return vector_w / total, bm25_w / total
+        return 0.6, 0.4
+
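
A quick check on the helper above with made-up slider positions: normalize_weights(0.9, 0.3) returns (0.75, 0.25), since 0.9 / 1.2 = 0.75, while normalize_weights(0.0, 0.0) falls back to the (0.6, 0.4) defaults instead of dividing by zero.
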
+    def advanced_search(query, search_type, top_k, vector_w, bm25_w):
+        # Normalize weights
+        vector_weight, bm25_weight = normalize_weights(vector_w, bm25_w)
+        return search_api(query, top_k, search_type, vector_weight, bm25_weight)
 
     search_btn.click(
+        advanced_search,
+        inputs=[search_input, search_type, top_k_slider, vector_weight, bm25_weight],
         outputs=search_output
     )
 
 # Create stats interface
+with gr.Blocks(title="📊 System Statistics") as stats_demo:
+    gr.Markdown("# 📊 Hybrid Search System Statistics")
+    gr.Markdown("Detailed information about the knowledge base and search capabilities")
 
+    stats_output = gr.JSON(label="System Statistics", height=500)
+    stats_btn = gr.Button("📊 Get System Statistics", variant="primary")
 
     stats_btn.click(
         get_stats_api,
@@ -434,13 +707,17 @@ with gr.Blocks(title="📊 Stats API") as stats_demo:
 # Combine interfaces using TabbedInterface
 demo = gr.TabbedInterface(
     [chat_demo, search_demo, stats_demo],
+    ["💬 Hybrid Chat", "🔧 Advanced Search", "📊 Statistics"],
+    title="🔥 Hybrid Search RAGtim Bot - Vector + BM25 Fusion"
 )
 
 if __name__ == "__main__":
+    print("🚀 Launching Hybrid Search RAGtim Bot...")
     print(f"📚 Loaded {len(bot.knowledge_base)} sections from markdown files")
+    print(f"🔍 BM25 index: {len(bot.document_frequency)} unique terms")
+    print(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
+    print("🔥 Hybrid search ready: Semantic + Keyword fusion!")
+
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,