raktimhugging committed on
Commit
e04dc80
·
verified ·
1 Parent(s): 8ecbae1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +409 -132
app.py CHANGED
@@ -8,18 +8,33 @@ from typing import List, Dict, Any
8
  import time
9
  import requests
10
  import re
 
 
11
 
12
  # Configure device
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
  print(f"Using device: {device}")
15
 
16
- class RAGtimBot:
17
  def __init__(self):
18
  self.embedder = None
19
  self.knowledge_base = []
20
  self.embeddings = []
 
 
 
 
 
 
 
 
 
 
 
 
21
  self.initialize_models()
22
  self.load_markdown_knowledge_base()
 
23
 
24
  def initialize_models(self):
25
  """Initialize the embedding model"""
@@ -80,6 +95,7 @@ class RAGtimBot:
80
  # Fallback to zero embedding
81
  self.embeddings.append(np.zeros(384))
82
 
 
83
  print(f"βœ… Knowledge base loaded with {len(self.knowledge_base)} documents")
84
 
85
  def process_markdown_file(self, content: str, filename: str):
@@ -142,77 +158,252 @@ class RAGtimBot:
142
 
143
  return sections
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  def cosine_similarity(self, a, b):
146
  """Calculate cosine similarity between two vectors"""
147
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
148
 
149
- def search_knowledge_base(self, query: str, top_k: int = 5) -> List[Dict]:
150
- """Search the knowledge base using semantic similarity"""
151
  try:
152
  # Generate query embedding
153
- query_embedding = self.embedder(query[:500], return_tensors="pt") # Truncate query too
154
  query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()
155
 
156
  # Calculate similarities
157
  similarities = []
158
  for i, doc_embedding in enumerate(self.embeddings):
159
- similarity = self.cosine_similarity(query_vector, doc_embedding)
160
- similarities.append({
161
- "id": self.knowledge_base[i]["id"],
162
- "content": self.knowledge_base[i]["content"],
163
- "metadata": self.knowledge_base[i]["metadata"],
164
- "score": float(similarity),
165
- "index": i
166
- })
 
 
 
 
167
 
168
- # Sort by similarity and priority
169
- similarities.sort(key=lambda x: (x["score"], x["metadata"]["priority"]), reverse=True)
170
  return similarities[:top_k]
171
 
172
  except Exception as e:
173
- print(f"Error in search: {e}")
174
- # Fallback to keyword search
175
- return self.keyword_search(query, top_k)
176
 
177
- def keyword_search(self, query: str, top_k: int = 5) -> List[Dict]:
178
- """Fallback keyword search"""
179
- query_terms = query.lower().split()
180
- results = []
181
-
182
- for i, doc in enumerate(self.knowledge_base):
183
- content_lower = doc["content"].lower()
184
- score = sum(content_lower.count(term) for term in query_terms)
185
 
186
- # Add priority boost
187
- priority_boost = doc["metadata"]["priority"] / 10
188
- final_score = score + priority_boost
 
 
 
 
 
 
189
 
190
- if score > 0:
191
- results.append({
192
- "id": doc["id"],
193
- "content": doc["content"],
194
- "metadata": doc["metadata"],
195
- "score": final_score,
196
- "index": i
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  })
198
-
199
- results.sort(key=lambda x: x["score"], reverse=True)
200
- return results[:top_k]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  # Initialize the bot
203
- print("Initializing RAGtim Bot with markdown knowledge base...")
204
- bot = RAGtimBot()
205
 
206
- def search_only_api(query, top_k=5):
207
- """API endpoint for search-only functionality"""
208
  try:
209
- results = bot.search_knowledge_base(query, top_k)
 
 
 
 
210
  return {
211
  "results": results,
212
  "query": query,
213
  "top_k": top_k,
214
- "search_type": "semantic",
215
- "total_documents": len(bot.knowledge_base)
 
 
 
 
 
 
216
  }
217
  except Exception as e:
218
  print(f"Error in search API: {e}")
@@ -237,46 +428,70 @@ def get_stats_api():
237
  "sections_by_file": sections_by_file,
238
  "model_name": "sentence-transformers/all-MiniLM-L6-v2",
239
  "embedding_dimension": 384,
240
- "search_capabilities": ["Semantic Search", "GPU Accelerated", "Transformer Embeddings", "Markdown Knowledge Base"],
241
- "backend_type": "Hugging Face Space",
 
 
 
 
 
 
 
 
 
 
 
 
242
  "knowledge_sources": list(sections_by_file.keys())
243
  }
244
 
245
  def chat_interface(message, history):
246
- """Chat interface with markdown knowledge base"""
247
  if not message.strip():
248
- return "Please ask me something about Raktim Mondol! I have comprehensive information loaded from his complete portfolio markdown files."
249
 
250
  try:
251
- # Search knowledge base
252
- search_results = bot.search_knowledge_base(message, top_k=6)
253
 
254
  if search_results:
255
  # Build comprehensive response
256
  response_parts = []
257
- response_parts.append(f"Based on my markdown knowledge base (found {len(search_results)} relevant sections):\n")
258
 
259
  # Use the best match as primary response
260
  best_match = search_results[0]
261
- response_parts.append(f"**Primary Answer** (Relevance: {best_match['score']:.2f}):")
262
- response_parts.append(f"Source: {best_match['metadata']['source']} - {best_match['metadata']['section']}")
263
- response_parts.append(f"{best_match['content']}\n")
 
 
 
 
 
 
264
 
265
  # Add additional context if available
266
  if len(search_results) > 1:
267
  response_parts.append("**Additional Context:**")
268
  for i, result in enumerate(search_results[1:3], 1): # Show up to 2 additional results
269
- section_info = f"{result['metadata']['source']} - {result['metadata']['section']}"
270
- response_parts.append(f"{i}. {section_info} (Relevance: {result['score']:.2f})")
 
 
271
  # Add a brief excerpt
272
- excerpt = result['content'][:200] + "..." if len(result['content']) > 200 else result['content']
273
  response_parts.append(f" {excerpt}\n")
274
 
275
- response_parts.append("\n[Note: This response is generated from your complete markdown knowledge base. In hybrid mode, DeepSeek LLM would generate more natural responses using this context.]")
 
 
 
 
276
 
277
  return "\n".join(response_parts)
278
  else:
279
- return "I don't have specific information about that topic in my markdown knowledge base. Could you please ask something else about Raktim Mondol?"
280
 
281
  except Exception as e:
282
  print(f"Error in chat interface: {e}")
@@ -290,16 +505,21 @@ css = """
290
  .gradio-container {
291
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
292
  }
293
- .chat-message {
 
 
 
 
 
294
  padding: 10px;
295
- margin: 5px 0;
296
- border-radius: 10px;
297
  }
298
  """
299
 
300
- # Create the main chat interface - UPDATED FOR GRADIO 5.34.0
301
  with gr.Blocks(
302
- title="πŸ€– RAGtim Bot - Markdown Knowledge Base",
303
  css=css,
304
  theme=gr.themes.Soft(
305
  primary_hue="green",
@@ -308,43 +528,37 @@ with gr.Blocks(
308
  )
309
  ) as chat_demo:
310
  gr.Markdown(f"""
311
- # πŸ€– RAGtim Bot - Markdown Knowledge Base
312
-
313
- **Complete Markdown Knowledge Base**: This Hugging Face Space loads all markdown files from Raktim Mondol's portfolio with **{len(bot.knowledge_base)} knowledge sections**.
314
-
315
- **Loaded Markdown Files:**
316
- - πŸ“„ **about.md** - Personal information, contact details, professional summary
317
- - πŸ”¬ **research_details.md** - Detailed research projects, methodologies, current work
318
- - πŸ“š **publications_detailed.md** - Complete publication details, technical contributions
319
- - πŸ’» **skills_expertise.md** - Comprehensive technical skills, tools, frameworks
320
- - πŸ’Ό **experience_detailed.md** - Professional experience, teaching, research roles
321
- - πŸ“Š **statistics.md** - Statistical methods, biostatistics expertise, methodologies
322
-
323
- **Search Capabilities:**
324
- - πŸ” Semantic similarity search using transformers
325
- - πŸš€ GPU-accelerated embeddings with priority ranking
326
- - πŸ“Š Relevance scoring across all markdown content
327
- - 🎯 Section-level granular search within each file
328
-
329
- **API Endpoints:**
330
- - `/api/search` - Search across complete markdown knowledge base
331
- - `/api/stats` - Detailed statistics about loaded content
332
-
333
- **Ask me anything about Raktim Mondol:**
334
- - Research projects, methodologies, and innovations
335
- - Publications with technical details and impact
336
- - Technical skills, programming expertise, and tools
337
- - Educational background and academic achievements
338
- - Professional experience and teaching roles
339
- - Statistical methods and biostatistics applications
340
- - Awards, recognition, and professional development
341
- - Contact information and collaboration opportunities
342
-
343
- **Note**: This demo shows search results from the complete markdown knowledge base. In hybrid mode, these results are passed to DeepSeek LLM for natural response generation.
344
  """)
345
 
346
  chatbot = gr.Chatbot(
347
- height=600,
348
  show_label=False,
349
  container=True,
350
  type="messages"
@@ -352,20 +566,20 @@ with gr.Blocks(
352
 
353
  with gr.Row():
354
  msg = gr.Textbox(
355
- placeholder="Ask me anything about Raktim Mondol's research, skills, experience, publications...",
356
  container=False,
357
  scale=7,
358
  show_label=False
359
  )
360
- submit_btn = gr.Button("Search Knowledge Base", scale=1)
361
 
362
  # Example buttons
363
  with gr.Row():
364
  examples = [
365
- "What is Raktim's research about?",
366
- "Tell me about BioFusionNet in detail",
367
- "What are his LLM and RAG expertise?",
368
- "Describe his statistical methods and biostatistics work"
369
  ]
370
  for example in examples:
371
  gr.Button(example, size="sm").click(
@@ -390,40 +604,99 @@ with gr.Blocks(
390
  submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
391
  msg.submit(respond, [msg, chatbot], [chatbot, msg])
392
 
393
- # Create API interface for search-only functionality
394
- with gr.Blocks(title="πŸ” Search API") as search_demo:
395
- gr.Markdown("# πŸ” Markdown Knowledge Base Search API")
396
- gr.Markdown("Direct access to semantic search across all loaded markdown files")
397
 
398
  with gr.Row():
399
- search_input = gr.Textbox(
400
- label="Search Query",
401
- placeholder="Enter your search query about Raktim Mondol..."
402
- )
403
- top_k_slider = gr.Slider(
404
- minimum=1,
405
- maximum=15,
406
- value=5,
407
- step=1,
408
- label="Top K Results"
409
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
- search_output = gr.JSON(label="Markdown Knowledge Base Search Results")
412
- search_btn = gr.Button("Search")
 
 
 
 
 
 
 
 
 
 
 
 
 
413
 
414
  search_btn.click(
415
- search_only_api,
416
- inputs=[search_input, top_k_slider],
417
  outputs=search_output
418
  )
419
 
420
  # Create stats interface
421
- with gr.Blocks(title="πŸ“Š Stats API") as stats_demo:
422
- gr.Markdown("# πŸ“Š Knowledge Base Stats")
423
- gr.Markdown("Detailed statistics about the loaded markdown knowledge base")
424
 
425
- stats_output = gr.JSON(label="Markdown Knowledge Base Statistics")
426
- stats_btn = gr.Button("Get Statistics")
427
 
428
  stats_btn.click(
429
  get_stats_api,
@@ -434,13 +707,17 @@ with gr.Blocks(title="πŸ“Š Stats API") as stats_demo:
434
  # Combine interfaces using TabbedInterface
435
  demo = gr.TabbedInterface(
436
  [chat_demo, search_demo, stats_demo],
437
- ["πŸ’¬ Markdown Chat", "πŸ” Search API", "πŸ“Š Stats API"],
438
- title="πŸ€– RAGtim Bot - Complete Markdown Knowledge Base"
439
  )
440
 
441
  if __name__ == "__main__":
442
- print("πŸš€ Launching RAGtim Bot with Markdown Knowledge Base...")
443
  print(f"πŸ“š Loaded {len(bot.knowledge_base)} sections from markdown files")
 
 
 
 
444
  demo.launch(
445
  server_name="0.0.0.0",
446
  server_port=7860,
 
8
  import time
9
  import requests
10
  import re
11
+ import math
12
+ from collections import defaultdict, Counter
13
 
14
  # Configure device
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  print(f"Using device: {device}")
17
 
18
+ class HybridSearchRAGBot:
19
  def __init__(self):
20
  self.embedder = None
21
  self.knowledge_base = []
22
  self.embeddings = []
23
+
24
+ # BM25 components
25
+ self.term_frequencies = {} # doc_id -> {term: frequency}
26
+ self.document_frequency = {} # term -> number of docs containing term
27
+ self.document_lengths = {} # doc_id -> document length
28
+ self.average_doc_length = 0
29
+ self.total_documents = 0
30
+
31
+ # BM25 parameters
32
+ self.k1 = 1.5 # Controls term frequency saturation
33
+ self.b = 0.75 # Controls document length normalization
34
+
35
  self.initialize_models()
36
  self.load_markdown_knowledge_base()
37
+ self.build_bm25_index()
38
 
39
  def initialize_models(self):
40
  """Initialize the embedding model"""
 
95
  # Fallback to zero embedding
96
  self.embeddings.append(np.zeros(384))
97
 
98
+ self.total_documents = len(self.knowledge_base)
99
  print(f"βœ… Knowledge base loaded with {len(self.knowledge_base)} documents")
100
 
101
  def process_markdown_file(self, content: str, filename: str):
 
158
 
159
  return sections
160
 
161
def tokenize(self, text: str) -> List[str]:
    """Split text into the lowercase terms used by the BM25 index.

    Punctuation is replaced by spaces, the text is split on whitespace, and
    tokens of one or two characters as well as stop words (as decided by
    ``self.is_stop_word``) are discarded. Order and duplicates are kept.
    """
    # Anything that is neither a word character nor whitespace becomes a space
    cleaned = re.sub(r'[^\w\s]', ' ', text.lower())

    tokens = []
    for candidate in cleaned.split():
        if len(candidate) <= 2:
            continue
        if self.is_stop_word(candidate):
            continue
        tokens.append(candidate)
    return tokens
168
+
169
def is_stop_word(self, word: str) -> bool:
    """Return True when ``word`` is one of the hard-coded English stop words.

    The comparison is case-sensitive; all entries are lowercase.
    """
    # Testing membership directly against a literal of constants lets CPython
    # store the set once with the compiled function, instead of building a
    # new set on every call (this method is invoked once per token).
    return word in {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
        'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
        'from', 'up', 'out', 'down', 'off', 'over', 'under', 'again', 'further', 'then', 'once'
    }
178
+
179
def build_bm25_index(self):
    """Rebuild every BM25 statistic from ``self.knowledge_base``.

    Populates ``term_frequencies`` (doc id -> {term: count}),
    ``document_frequency`` (term -> number of documents containing it),
    ``document_lengths`` (doc id -> token count), ``total_documents`` and
    ``average_doc_length``. Each document must provide ``'id'`` and
    ``'content'``; the content is split with ``self.tokenize``.
    """
    print("Building BM25 index...")

    # Reset indexes
    self.term_frequencies = {}
    self.document_frequency = defaultdict(int)
    self.document_lengths = {}

    # Take the corpus size from the documents actually being indexed, so the
    # average length and IDF stay correct even when this method is called
    # without the loader having refreshed the counter beforehand.
    self.total_documents = len(self.knowledge_base)

    total_length = 0
    for doc in self.knowledge_base:
        doc_id = doc['id']
        term_counts = Counter(self.tokenize(doc['content']))

        self.term_frequencies[doc_id] = dict(term_counts)

        # The document length is its number of tokens, duplicates included
        doc_length = sum(term_counts.values())
        self.document_lengths[doc_id] = doc_length
        total_length += doc_length

        # Every distinct term counts once per document
        for term in term_counts:
            self.document_frequency[term] += 1

    # An empty corpus keeps an average length of 0
    self.average_doc_length = total_length / self.total_documents if self.total_documents else 0

    print(f"βœ… BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
213
+
214
def calculate_bm25_score(self, term: str, doc_id: str) -> float:
    """Return the BM25 contribution of one term for one document.

    Args:
        term: An already tokenized query term.
        doc_id: Identifier of the document, as used in the BM25 index.

    Returns:
        0.0 when the term does not occur in the document, otherwise a
        positive score ``idf * tf * (k1 + 1) / (tf + k1 * length_norm)``.
    """
    # Get term frequency in document
    tf = self.term_frequencies.get(doc_id, {}).get(term, 0)
    if tf == 0:
        return 0.0

    # Get document frequency and document length
    df = self.document_frequency.get(term, 1)
    doc_length = self.document_lengths.get(doc_id, 0)

    # IDF: log(1 + (N - df + 0.5) / (df + 0.5)).
    # The "1 +" keeps the value positive. Without it a term that occurs in
    # more than half of the documents gets a negative IDF, so a matching
    # document is penalised (and then filtered out by callers that keep only
    # positive scores), and the logarithm can even receive a non-positive
    # argument and raise.
    idf = math.log(1 + (self.total_documents - df + 0.5) / (df + 0.5))

    # Length normalisation relative to the corpus average; fall back to a
    # neutral ratio when no average is available, to avoid dividing by zero.
    if self.average_doc_length > 0:
        length_ratio = doc_length / self.average_doc_length
    else:
        length_ratio = 1.0

    # Calculate BM25 score
    numerator = tf * (self.k1 + 1)
    denominator = tf + self.k1 * (1 - self.b + self.b * length_ratio)

    return idf * (numerator / denominator)
233
+
234
def bm25_search(self, query: str, top_k: int = 10) -> List[Dict]:
    """Rank knowledge-base documents against ``query`` with BM25.

    The query is tokenized with ``self.tokenize``; each document's score is
    the sum of ``self.calculate_bm25_score`` over the query terms, scaled by
    ``1 + priority / 50`` taken from the document metadata. Only documents
    with a positive summed score are kept.

    Returns:
        At most ``top_k`` dicts with keys ``'document'``, ``'score'`` and
        ``'search_type'`` (always ``'bm25'``), best score first. An empty
        list when the query yields no terms.
    """
    terms = self.tokenize(query)
    if not terms:
        return []

    hits = {}
    for entry in self.knowledge_base:
        entry_id = entry['id']

        raw_score = 0.0
        for term in terms:
            raw_score += self.calculate_bm25_score(term, entry_id)

        # Keep only documents whose summed score is strictly positive
        if not raw_score > 0:
            continue

        # Apply priority boost
        hits[entry_id] = {
            'document': entry,
            'score': raw_score * (1 + (entry['metadata']['priority'] / 50)),
            'search_type': 'bm25'
        }

    ranked = list(hits.values())
    ranked.sort(key=lambda hit: hit['score'], reverse=True)
    return ranked[:top_k]
264
+
265
  def cosine_similarity(self, a, b):
266
  """Calculate cosine similarity between two vectors"""
267
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
268
 
269
def vector_search(self, query: str, top_k: int = 10) -> List[Dict]:
    """Rank knowledge-base documents by embedding similarity to ``query``.

    The query (truncated to 500 characters) is embedded with
    ``self.embedder`` and compared to every stored document embedding via
    ``self.cosine_similarity``; each similarity is scaled by
    ``1 + priority / 100`` from the document metadata.

    Returns:
        At most ``top_k`` dicts with keys ``'document'``, ``'score'`` and
        ``'search_type'`` (always ``'vector'``), best score first. An empty
        list if anything raises while searching.
    """
    try:
        # Generate query embedding
        # NOTE(review): assumes the embedder output, indexed at [0], is a
        # tensor of per-token vectors that is mean-pooled here - confirm
        # against initialize_models.
        query_embedding = self.embedder(query[:500], return_tensors="pt")  # Truncate query
        query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()

        # Calculate similarities
        similarities = []
        for i, doc_embedding in enumerate(self.embeddings):
            if doc_embedding is None or len(doc_embedding) == 0:
                continue

            similarity = self.cosine_similarity(query_vector, doc_embedding)

            # A NaN/inf similarity (e.g. from an all-zero fallback embedding)
            # cannot be ranked: it makes the sort order arbitrary and poisons
            # max()-based normalisation downstream, so skip the document.
            if not np.isfinite(similarity):
                continue

            # Apply priority boost
            doc = self.knowledge_base[i]
            priority_boost = 1 + (doc['metadata']['priority'] / 100)

            similarities.append({
                'document': doc,
                'score': float(similarity * priority_boost),
                'search_type': 'vector'
            })

        # Sort by similarity and return top_k
        similarities.sort(key=lambda x: x['score'], reverse=True)
        return similarities[:top_k]

    except Exception as e:
        # Deliberate best-effort: callers treat an empty list as "no vector hits"
        print(f"Error in vector search: {e}")
        return []
 
299
 
300
def hybrid_search(self, query: str, top_k: int = 10, vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict]:
    """Fuse vector and BM25 rankings into one weighted ranking.

    Both searches are asked for ``top_k * 2`` candidates. Each result list is
    scaled by its own maximum score (all zeros when that maximum is not
    positive), the lists are merged by document id, and the final score is
    ``vector_weight * vector_score + bm25_weight * bm25_score``.

    Returns:
        At most ``top_k`` dicts with keys ``'document'``, ``'score'``,
        ``'vector_score'``, ``'bm25_score'`` and ``'search_type'``
        (``'vector'``, ``'bm25'`` or ``'hybrid'`` depending on which
        searches found the document), best score first. If anything raises,
        the result of a plain ``vector_search(query, top_k)`` is returned.
    """
    def _attach_normalized(hits):
        # Store a 'normalized_score' on every hit, relative to the best hit
        if not hits:
            return
        peak = max(hit['score'] for hit in hits)
        for hit in hits:
            hit['normalized_score'] = hit['score'] / peak if peak > 0 else 0

    try:
        # Over-fetch from both methods so the fusion has more candidates
        pool_size = top_k * 2
        vector_results = self.vector_search(query, pool_size)
        bm25_results = self.bm25_search(query, pool_size)

        _attach_normalized(vector_results)
        _attach_normalized(bm25_results)

        # Merge by document id, starting from the vector hits
        merged = {}
        for hit in vector_results:
            merged[hit['document']['id']] = {
                'document': hit['document'],
                'vector_score': hit['normalized_score'],
                'bm25_score': 0.0,
                'search_type': 'vector'
            }

        for hit in bm25_results:
            key = hit['document']['id']
            if key not in merged:
                merged[key] = {
                    'document': hit['document'],
                    'vector_score': 0.0,
                    'bm25_score': hit['normalized_score'],
                    'search_type': 'bm25'
                }
                continue
            # Found by both methods
            merged[key]['bm25_score'] = hit['normalized_score']
            merged[key]['search_type'] = 'hybrid'

        # Weighted fusion of the two normalized scores
        final_results = [
            {
                'document': data['document'],
                'score': (vector_weight * data['vector_score']) + (bm25_weight * data['bm25_score']),
                'vector_score': data['vector_score'],
                'bm25_score': data['bm25_score'],
                'search_type': data['search_type']
            }
            for data in merged.values()
        ]

        final_results.sort(key=lambda item: item['score'], reverse=True)
        return final_results[:top_k]

    except Exception as e:
        print(f"Error in hybrid search: {e}")
        # Fallback to vector search only
        return self.vector_search(query, top_k)
373
+
374
def search_knowledge_base(self, query: str, top_k: int = 5, search_type: str = "hybrid") -> List[Dict]:
    """Run the search method selected by ``search_type``.

    ``"vector"`` and ``"bm25"`` select the corresponding single-method
    search; any other value falls through to hybrid search with its default
    weights.
    """
    # Anything that is not an explicit single-method request is treated as hybrid
    if search_type not in ("vector", "bm25"):
        return self.hybrid_search(query, top_k)

    run_search = self.vector_search if search_type == "vector" else self.bm25_search
    return run_search(query, top_k)
382
 
383
  # Initialize the bot
384
+ print("Initializing Hybrid Search RAGtim Bot...")
385
+ bot = HybridSearchRAGBot()
386
 
387
+ def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_weight=0.4):
388
+ """API endpoint for hybrid search functionality"""
389
  try:
390
+ if search_type == "hybrid":
391
+ results = bot.hybrid_search(query, top_k, vector_weight, bm25_weight)
392
+ else:
393
+ results = bot.search_knowledge_base(query, top_k, search_type)
394
+
395
  return {
396
  "results": results,
397
  "query": query,
398
  "top_k": top_k,
399
+ "search_type": search_type,
400
+ "total_documents": len(bot.knowledge_base),
401
+ "search_parameters": {
402
+ "vector_weight": vector_weight if search_type == "hybrid" else None,
403
+ "bm25_weight": bm25_weight if search_type == "hybrid" else None,
404
+ "bm25_k1": bot.k1,
405
+ "bm25_b": bot.b
406
+ }
407
  }
408
  except Exception as e:
409
  print(f"Error in search API: {e}")
 
428
  "sections_by_file": sections_by_file,
429
  "model_name": "sentence-transformers/all-MiniLM-L6-v2",
430
  "embedding_dimension": 384,
431
+ "search_capabilities": [
432
+ "Hybrid Search (Vector + BM25)",
433
+ "Semantic Vector Search",
434
+ "BM25 Keyword Search",
435
+ "GPU Accelerated",
436
+ "Transformer Embeddings"
437
+ ],
438
+ "bm25_parameters": {
439
+ "k1": bot.k1,
440
+ "b": bot.b,
441
+ "unique_terms": len(bot.document_frequency),
442
+ "average_doc_length": bot.average_doc_length
443
+ },
444
+ "backend_type": "Hugging Face Space with Hybrid Search",
445
  "knowledge_sources": list(sections_by_file.keys())
446
  }
447
 
448
  def chat_interface(message, history):
449
+ """Chat interface with hybrid search"""
450
  if not message.strip():
451
+ return "Please ask me something about Raktim Mondol! I use hybrid search combining semantic similarity and keyword matching for the best results."
452
 
453
  try:
454
+ # Use hybrid search by default
455
+ search_results = bot.hybrid_search(message, top_k=6)
456
 
457
  if search_results:
458
  # Build comprehensive response
459
  response_parts = []
460
+ response_parts.append(f"πŸ” **Hybrid Search Results** (Vector + BM25 combination, found {len(search_results)} relevant sections):\n")
461
 
462
  # Use the best match as primary response
463
  best_match = search_results[0]
464
+ response_parts.append(f"**Primary Answer** (Hybrid Score: {best_match['score']:.3f}):")
465
+ response_parts.append(f"πŸ“„ Source: {best_match['document']['metadata']['source']} - {best_match['document']['metadata']['section']}")
466
+ response_parts.append(f"πŸ” Search Type: {best_match['search_type'].upper()}")
467
+
468
+ # Show score breakdown for hybrid results
469
+ if 'vector_score' in best_match and 'bm25_score' in best_match:
470
+ response_parts.append(f"πŸ“Š Vector Score: {best_match['vector_score']:.3f} | BM25 Score: {best_match['bm25_score']:.3f}")
471
+
472
+ response_parts.append(f"\n{best_match['document']['content']}\n")
473
 
474
  # Add additional context if available
475
  if len(search_results) > 1:
476
  response_parts.append("**Additional Context:**")
477
  for i, result in enumerate(search_results[1:3], 1): # Show up to 2 additional results
478
+ section_info = f"{result['document']['metadata']['source']} - {result['document']['metadata']['section']}"
479
+ search_info = f"({result['search_type'].upper()}, Score: {result['score']:.3f})"
480
+ response_parts.append(f"{i}. {section_info} {search_info}")
481
+
482
  # Add a brief excerpt
483
+ excerpt = result['document']['content'][:200] + "..." if len(result['document']['content']) > 200 else result['document']['content']
484
  response_parts.append(f" {excerpt}\n")
485
 
486
+ response_parts.append("\nπŸ€– **Hybrid Search Technology:**")
487
+ response_parts.append("β€’ **Vector Search**: Semantic similarity using transformer embeddings")
488
+ response_parts.append("β€’ **BM25 Search**: Advanced keyword ranking with TF-IDF")
489
+ response_parts.append("β€’ **Fusion**: Weighted combination for optimal relevance")
490
+ response_parts.append("\n[Note: This demonstrates hybrid search results. In production, these would be passed to an LLM for natural response generation.]")
491
 
492
  return "\n".join(response_parts)
493
  else:
494
+ return "I don't have specific information about that topic in my knowledge base. Could you please ask something else about Raktim Mondol?"
495
 
496
  except Exception as e:
497
  print(f"Error in chat interface: {e}")
 
505
  .gradio-container {
506
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
507
  }
508
+ .search-type-radio .wrap {
509
+ display: flex;
510
+ gap: 10px;
511
+ }
512
+ .search-weights {
513
+ background: #f0f0f0;
514
  padding: 10px;
515
+ border-radius: 5px;
516
+ margin: 10px 0;
517
  }
518
  """
519
 
520
+ # Create the main chat interface
521
  with gr.Blocks(
522
+ title="πŸ”₯ Hybrid Search RAGtim Bot",
523
  css=css,
524
  theme=gr.themes.Soft(
525
  primary_hue="green",
 
528
  )
529
  ) as chat_demo:
530
  gr.Markdown(f"""
531
+ # πŸ”₯ Hybrid Search RAGtim Bot - Advanced Search Technology
532
+
533
+ **πŸš€ Hybrid Search System**: This Space implements **true hybrid search** combining:
534
+ - 🧠 **Semantic Vector Search**: Transformer embeddings for conceptual similarity
535
+ - πŸ” **BM25 Keyword Search**: Advanced TF-IDF ranking for exact term matching
536
+ - βš–οΈ **Intelligent Fusion**: Weighted combination for optimal relevance
537
+
538
+ **πŸ“š Knowledge Base**: **{len(bot.knowledge_base)} sections** from comprehensive markdown files:
539
+ - πŸ“„ **about.md** - Personal info, contact, professional summary
540
+ - πŸ”¬ **research_details.md** - Research projects, methodologies, innovations
541
+ - πŸ“š **publications_detailed.md** - Publications with technical details
542
+ - πŸ’» **skills_expertise.md** - Technical skills, LLM expertise, tools
543
+ - πŸ’Ό **experience_detailed.md** - Professional experience, teaching
544
+ - πŸ“Š **statistics.md** - Statistical methods, biostatistics expertise
545
+
546
+ **πŸ”§ Search Parameters**:
547
+ - **BM25 Parameters**: k1={bot.k1}, b={bot.b}
548
+ - **Vocabulary**: {len(bot.document_frequency)} unique terms
549
+ - **Average Document Length**: {bot.average_doc_length:.1f} words
550
+ - **Embedding Model**: sentence-transformers/all-MiniLM-L6-v2 (384-dim)
551
+
552
+ **πŸ’‘ Try Different Search Types**:
553
+ - **Hybrid** (Recommended): Best of both semantic and keyword search
554
+ - **Vector**: Pure semantic similarity for conceptual queries
555
+ - **BM25**: Pure keyword matching for specific terms
556
+
557
+ **Ask me anything about Raktim Mondol's research, expertise, and background!**
 
 
 
 
 
 
558
  """)
559
 
560
  chatbot = gr.Chatbot(
561
+ height=500,
562
  show_label=False,
563
  container=True,
564
  type="messages"
 
566
 
567
  with gr.Row():
568
  msg = gr.Textbox(
569
+ placeholder="Ask about Raktim's research, LLM expertise, publications, statistical methods...",
570
  container=False,
571
  scale=7,
572
  show_label=False
573
  )
574
+ submit_btn = gr.Button("πŸ” Hybrid Search", scale=1)
575
 
576
  # Example buttons
577
  with gr.Row():
578
  examples = [
579
+ "What is Raktim's LLM and RAG research?",
580
+ "Tell me about BioFusionNet statistical methods",
581
+ "What are his multimodal AI capabilities?",
582
+ "Describe his biostatistics expertise"
583
  ]
584
  for example in examples:
585
  gr.Button(example, size="sm").click(
 
604
  submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
605
  msg.submit(respond, [msg, chatbot], [chatbot, msg])
606
 
607
+ # Create advanced search interface
608
+ with gr.Blocks(title="πŸ”§ Advanced Hybrid Search") as search_demo:
609
+ gr.Markdown("# πŸ”§ Advanced Hybrid Search Configuration")
610
+ gr.Markdown("Fine-tune the hybrid search parameters and compare different search methods")
611
 
612
  with gr.Row():
613
+ with gr.Column(scale=2):
614
+ search_input = gr.Textbox(
615
+ label="Search Query",
616
+ placeholder="Enter your search query about Raktim Mondol..."
617
+ )
618
+
619
+ with gr.Row():
620
+ search_type = gr.Radio(
621
+ choices=["hybrid", "vector", "bm25"],
622
+ value="hybrid",
623
+ label="Search Method",
624
+ elem_classes=["search-type-radio"]
625
+ )
626
+ top_k_slider = gr.Slider(
627
+ minimum=1,
628
+ maximum=15,
629
+ value=5,
630
+ step=1,
631
+ label="Top K Results"
632
+ )
633
+
634
+ # Hybrid search weights (only shown when hybrid is selected)
635
+ with gr.Group(visible=True) as weight_group:
636
+ gr.Markdown("**Hybrid Search Weights**")
637
+ vector_weight = gr.Slider(
638
+ minimum=0.0,
639
+ maximum=1.0,
640
+ value=0.6,
641
+ step=0.1,
642
+ label="Vector Weight (Semantic)"
643
+ )
644
+ bm25_weight = gr.Slider(
645
+ minimum=0.0,
646
+ maximum=1.0,
647
+ value=0.4,
648
+ step=0.1,
649
+ label="BM25 Weight (Keyword)"
650
+ )
651
+
652
+ with gr.Column(scale=1):
653
+ gr.Markdown("**Search Method Guide:**")
654
+ gr.Markdown("""
655
+ **πŸ”₯ Hybrid**: Combines semantic + keyword
656
+ - Best for most queries
657
+ - Balances meaning and exact terms
658
+
659
+ **🧠 Vector**: Pure semantic similarity
660
+ - Good for conceptual questions
661
+ - Finds related concepts
662
+
663
+ **πŸ” BM25**: Pure keyword matching
664
+ - Good for specific terms
665
+ - Traditional search ranking
666
+ """)
667
+
668
+ search_output = gr.JSON(label="Hybrid Search Results", height=400)
669
+ search_btn = gr.Button("πŸ” Search with Custom Parameters", variant="primary")
670
 
671
+ def update_weights_visibility(search_type):
672
+ return gr.Group(visible=(search_type == "hybrid"))
673
+
674
+ search_type.change(update_weights_visibility, inputs=[search_type], outputs=[weight_group])
675
+
676
+ def normalize_weights(vector_w, bm25_w):
677
+ total = vector_w + bm25_w
678
+ if total > 0:
679
+ return vector_w / total, bm25_w / total
680
+ return 0.6, 0.4
681
+
682
+ def advanced_search(query, search_type, top_k, vector_w, bm25_w):
683
+ # Normalize weights
684
+ vector_weight, bm25_weight = normalize_weights(vector_w, bm25_w)
685
+ return search_api(query, top_k, search_type, vector_weight, bm25_weight)
686
 
687
  search_btn.click(
688
+ advanced_search,
689
+ inputs=[search_input, search_type, top_k_slider, vector_weight, bm25_weight],
690
  outputs=search_output
691
  )
692
 
693
  # Create stats interface
694
+ with gr.Blocks(title="πŸ“Š System Statistics") as stats_demo:
695
+ gr.Markdown("# πŸ“Š Hybrid Search System Statistics")
696
+ gr.Markdown("Detailed information about the knowledge base and search capabilities")
697
 
698
+ stats_output = gr.JSON(label="System Statistics", height=500)
699
+ stats_btn = gr.Button("πŸ“Š Get System Statistics", variant="primary")
700
 
701
  stats_btn.click(
702
  get_stats_api,
 
707
  # Combine interfaces using TabbedInterface
708
  demo = gr.TabbedInterface(
709
  [chat_demo, search_demo, stats_demo],
710
+ ["πŸ’¬ Hybrid Chat", "πŸ”§ Advanced Search", "πŸ“Š Statistics"],
711
+ title="πŸ”₯ Hybrid Search RAGtim Bot - Vector + BM25 Fusion"
712
  )
713
 
714
  if __name__ == "__main__":
715
+ print("πŸš€ Launching Hybrid Search RAGtim Bot...")
716
  print(f"πŸ“š Loaded {len(bot.knowledge_base)} sections from markdown files")
717
+ print(f"πŸ” BM25 index: {len(bot.document_frequency)} unique terms")
718
+ print(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
719
+ print("πŸ”₯ Hybrid search ready: Semantic + Keyword fusion!")
720
+
721
  demo.launch(
722
  server_name="0.0.0.0",
723
  server_port=7860,