raktimhugging committed on
Commit a1769da · verified · 1 Parent(s): 9b2b303

Update app.py

Files changed (1): app.py (+143, −232)
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import json
 import numpy as np
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer, AutoModel
 import torch
 import os
 from typing import List, Dict, Any
@@ -10,18 +10,10 @@ import requests
 import re
 import math
 from collections import defaultdict, Counter
-import logging
-
-# Import configuration
-from config import *
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 
 # Configure device
-device = get_device()
-logger.info(f"Using device: {device}")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 
 class HybridSearchRAGBot:
     def __init__(self):
@@ -30,15 +22,15 @@ class HybridSearchRAGBot:
         self.embeddings = []
 
         # BM25 components
-        self.term_frequencies = {}
-        self.document_frequency = {}
-        self.document_lengths = {}
+        self.term_frequencies = {}  # doc_id -> {term: frequency}
+        self.document_frequency = {}  # term -> number of docs containing term
+        self.document_lengths = {}  # doc_id -> document length
         self.average_doc_length = 0
         self.total_documents = 0
 
         # BM25 parameters
-        self.k1 = BM25_K1
-        self.b = BM25_B
+        self.k1 = 1.5  # Controls term frequency saturation
+        self.b = 0.75  # Controls document length normalization
 
         self.initialize_models()
         self.load_markdown_knowledge_base()
@@ -47,64 +39,84 @@ class HybridSearchRAGBot:
     def initialize_models(self):
         """Initialize the embedding model"""
         try:
-            logger.info("Loading embedding model...")
+            print("Loading embedding model...")
             self.embedder = pipeline(
                 'feature-extraction',
-                EMBEDDING_MODEL,
+                'sentence-transformers/all-MiniLM-L6-v2',
                 device=0 if device == "cuda" else -1
             )
-            logger.info("✅ Embedding model loaded successfully")
+            print("✅ Embedding model loaded successfully")
         except Exception as e:
-            logger.error(f"❌ Error loading embedding model: {e}")
+            print(f"❌ Error loading embedding model: {e}")
             raise e
 
     def load_markdown_knowledge_base(self):
         """Load knowledge base from markdown files"""
-        logger.info("Loading knowledge base from markdown files...")
+        print("Loading knowledge base from markdown files...")
 
         # Reset knowledge base
         self.knowledge_base = []
 
-        for filename in KNOWLEDGE_BASE_FILES:
+        # Load all markdown files
+        markdown_files = [
+            'about.md',
+            'research_details.md',
+            'publications_detailed.md',
+            'skills_expertise.md',
+            'experience_detailed.md',
+            'statistics.md'
+        ]
+
+        for filename in markdown_files:
             try:
                 if os.path.exists(filename):
                     with open(filename, 'r', encoding='utf-8') as f:
                         content = f.read()
-                    self.process_markdown_file(content, os.path.basename(filename))
-                    logger.info(f"✅ Loaded {filename}")
+                    self.process_markdown_file(content, filename)
+                    print(f"✅ Loaded {filename}")
                 else:
-                    logger.warning(f"⚠️ File not found: {filename}")
+                    print(f"⚠️ File not found: {filename}")
             except Exception as e:
-                logger.error(f"❌ Error loading {filename}: {e}")
+                print(f"❌ Error loading {filename}: {e}")
 
         # Generate embeddings for knowledge base
-        logger.info("Generating embeddings for knowledge base...")
+        print("Generating embeddings for knowledge base...")
         self.embeddings = []
         for i, doc in enumerate(self.knowledge_base):
            try:
                 # Truncate content to avoid token limit issues
-                content = doc["content"][:500]
+                content = doc["content"][:500]  # Limit to 500 characters
                 embedding = self.embedder(content, return_tensors="pt")
                 # Convert to numpy and flatten
                 embedding_np = embedding[0].mean(dim=0).detach().cpu().numpy()
                 self.embeddings.append(embedding_np)
            except Exception as e:
-                logger.error(f"Error generating embedding for doc {doc['id']}: {e}")
+                print(f"Error generating embedding for doc {doc['id']}: {e}")
                 # Fallback to zero embedding
-                self.embeddings.append(np.zeros(EMBEDDING_DIM))
+                self.embeddings.append(np.zeros(384))
 
         self.total_documents = len(self.knowledge_base)
-        logger.info(f"✅ Knowledge base loaded with {len(self.knowledge_base)} documents")
+        print(f"✅ Knowledge base loaded with {len(self.knowledge_base)} documents")
 
     def process_markdown_file(self, content: str, filename: str):
         """Process a markdown file and extract sections"""
-        file_type, priority = FILE_TYPE_MAP.get(filename, ('general', 5))
+        # Determine file type and priority
+        file_type_map = {
+            'about.md': ('about', 10),
+            'research_details.md': ('research', 9),
+            'publications_detailed.md': ('publications', 8),
+            'skills_expertise.md': ('skills', 7),
+            'experience_detailed.md': ('experience', 8),
+            'statistics.md': ('statistics', 9)
+        }
+
+        file_type, priority = file_type_map.get(filename, ('general', 5))
 
         # Split content into sections
         sections = self.split_markdown_into_sections(content)
 
         for section in sections:
-            if len(section['content'].strip()) > 100:
+            if len(section['content'].strip()) > 100:  # Only process substantial content
                 doc = {
                     "id": f"{filename}_{section['title']}_{len(self.knowledge_base)}",
                     "content": section['content'],
@@ -124,10 +136,14 @@ class HybridSearchRAGBot:
         current_section = {'title': 'Introduction', 'content': ''}
 
         for line in lines:
+            # Check if line is a header
             if line.startswith('#'):
+                # Save previous section if it has content
                 if current_section['content'].strip():
                     sections.append(current_section.copy())
 
+                # Start new section
+                header_level = len(line) - len(line.lstrip('#'))
                 title = line.lstrip('#').strip()
                 current_section = {
                     'title': title,
@@ -136,6 +152,7 @@ class HybridSearchRAGBot:
             else:
                 current_section['content'] += line + '\n'
 
+        # Add the last section
         if current_section['content'].strip():
             sections.append(current_section)
 
@@ -143,7 +160,9 @@ class HybridSearchRAGBot:
 
     def tokenize(self, text: str) -> List[str]:
         """Tokenize text for BM25"""
+        # Convert to lowercase and remove punctuation
         text = re.sub(r'[^\w\s]', ' ', text.lower())
+        # Split into words and filter out short words and stop words
         words = [word for word in text.split() if len(word) > 2 and not self.is_stop_word(word)]
         return words
 
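The tokenizer in this hunk lowercases the text, replaces punctuation with spaces, and keeps only words longer than two characters that are not stop words. A quick standalone check (the stop-word set here is a stand-in for the app's own `is_stop_word` list):

```python
import re

STOP_WORDS = {"the", "and", "for"}  # stand-in; the app maintains its own list

def tokenize(text: str) -> list:
    # Lowercase, strip punctuation, drop short words and stop words
    text = re.sub(r"[^\w\s]", " ", text.lower())
    return [w for w in text.split() if len(w) > 2 and w not in STOP_WORDS]

print(tokenize("Hybrid search: BM25 + vectors, for the win!"))
# ['hybrid', 'search', 'bm25', 'vectors', 'win']
```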
@@ -159,44 +178,54 @@ class HybridSearchRAGBot:
 
     def build_bm25_index(self):
         """Build BM25 index for all documents"""
-        logger.info("Building BM25 index...")
+        print("Building BM25 index...")
 
+        # Reset indexes
         self.term_frequencies = {}
         self.document_frequency = defaultdict(int)
         self.document_lengths = {}
 
         total_length = 0
 
+        # First pass: calculate term frequencies and document lengths
         for doc in self.knowledge_base:
             doc_id = doc['id']
             terms = self.tokenize(doc['content'])
 
+            # Calculate term frequencies for this document
             term_freq = Counter(terms)
             self.term_frequencies[doc_id] = dict(term_freq)
 
+            # Store document length
             doc_length = len(terms)
             self.document_lengths[doc_id] = doc_length
             total_length += doc_length
 
+            # Update document frequencies
             unique_terms = set(terms)
             for term in unique_terms:
                 self.document_frequency[term] += 1
 
+        # Calculate average document length
         self.average_doc_length = total_length / self.total_documents if self.total_documents > 0 else 0
 
-        logger.info(f"✅ BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
+        print(f"✅ BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
 
     def calculate_bm25_score(self, term: str, doc_id: str) -> float:
         """Calculate BM25 score for a term in a document"""
+        # Get term frequency in document
         tf = self.term_frequencies.get(doc_id, {}).get(term, 0)
         if tf == 0:
             return 0.0
 
+        # Get document frequency and document length
         df = self.document_frequency.get(term, 1)
         doc_length = self.document_lengths.get(doc_id, 0)
 
+        # Calculate IDF: log((N - df + 0.5) / (df + 0.5))
         idf = math.log((self.total_documents - df + 0.5) / (df + 0.5))
 
+        # Calculate BM25 score
         numerator = tf * (self.k1 + 1)
         denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.average_doc_length))
 
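For reference, `calculate_bm25_score` implements standard Okapi BM25: each query term contributes IDF(t) · tf · (k1 + 1) / (tf + k1 · (1 − b + b · |d| / avgdl)). A self-contained sanity check with toy counts (illustrative values, not the app's data):

```python
import math

N = 50             # total documents
df = 5             # documents containing the term
tf = 3             # term frequency in this document
doc_len = 120      # tokens in this document
avg_len = 100.0    # average document length
k1, b = 1.5, 0.75  # same defaults the diff hard-codes

idf = math.log((N - df + 0.5) / (df + 0.5))
score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_len)))
print(f"BM25 contribution: {score:.2f}")  # ≈ 3.35
```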
@@ -210,6 +239,7 @@ class HybridSearchRAGBot:
 
         scores = {}
 
+        # Calculate BM25 score for each document
         for doc in self.knowledge_base:
             doc_id = doc['id']
             score = 0.0
@@ -218,6 +248,7 @@ class HybridSearchRAGBot:
                 score += self.calculate_bm25_score(term, doc_id)
 
             if score > 0:
+                # Apply priority boost
                 priority_boost = 1 + (doc['metadata']['priority'] / 50)
                 final_score = score * priority_boost
 
@@ -227,6 +258,7 @@ class HybridSearchRAGBot:
                     'search_type': 'bm25'
                 }
 
+        # Sort by score and return top_k
         sorted_results = sorted(scores.values(), key=lambda x: x['score'], reverse=True)
         return sorted_results[:top_k]
 
@@ -237,14 +269,17 @@ class HybridSearchRAGBot:
     def vector_search(self, query: str, top_k: int = 10) -> List[Dict]:
         """Perform vector similarity search"""
         try:
-            query_embedding = self.embedder(query[:500], return_tensors="pt")
+            # Generate query embedding
+            query_embedding = self.embedder(query[:500], return_tensors="pt")  # Truncate query
             query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()
 
+            # Calculate similarities
             similarities = []
             for i, doc_embedding in enumerate(self.embeddings):
                 if doc_embedding is not None and len(doc_embedding) > 0:
                     similarity = self.cosine_similarity(query_vector, doc_embedding)
 
+                    # Apply priority boost
                     priority_boost = 1 + (self.knowledge_base[i]['metadata']['priority'] / 100)
                     final_score = similarity * priority_boost
 
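The `cosine_similarity` helper called above is not shown in this diff; it is the usual dot product over norms. A minimal sketch of what such a helper typically looks like:

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # cos(a, b) = a·b / (||a|| · ||b||), guarding against zero vectors
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(np.dot(a, b)) / denom if denom > 0 else 0.0

print(cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0])))  # ≈ 0.707
```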
@@ -254,20 +289,22 @@ class HybridSearchRAGBot:
                         'search_type': 'vector'
                     })
 
+            # Sort by similarity and return top_k
             similarities.sort(key=lambda x: x['score'], reverse=True)
             return similarities[:top_k]
 
         except Exception as e:
-            logger.error(f"Error in vector search: {e}")
+            print(f"Error in vector search: {e}")
             return []
 
     def hybrid_search(self, query: str, top_k: int = 10, vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict]:
         """Perform hybrid search combining vector and BM25 results"""
         try:
-            vector_results = self.vector_search(query, top_k * 2)
+            # Get results from both search methods
+            vector_results = self.vector_search(query, top_k * 2)  # Get more results for better fusion
             bm25_results = self.bm25_search(query, top_k * 2)
 
-            # Normalize scores
+            # Normalize scores to [0, 1] range
             if vector_results:
                 max_vector_score = max(r['score'] for r in vector_results)
                 if max_vector_score > 0:
@@ -289,6 +326,7 @@ class HybridSearchRAGBot:
             # Combine results
             combined_scores = {}
 
+            # Add vector results
             for result in vector_results:
                 doc_id = result['document']['id']
                 combined_scores[doc_id] = {
@@ -298,6 +336,7 @@ class HybridSearchRAGBot:
                     'search_type': 'vector'
                 }
 
+            # Add BM25 results
             for result in bm25_results:
                 doc_id = result['document']['id']
                 if doc_id in combined_scores:
@@ -323,11 +362,13 @@ class HybridSearchRAGBot:
                     'search_type': data['search_type']
                 })
 
+            # Sort by hybrid score and return top_k
             final_results.sort(key=lambda x: x['score'], reverse=True)
             return final_results[:top_k]
 
         except Exception as e:
-            logger.error(f"Error in hybrid search: {e}")
+            print(f"Error in hybrid search: {e}")
+            # Fallback to vector search only
             return self.vector_search(query, top_k)
 
     def search_knowledge_base(self, query: str, top_k: int = 5, search_type: str = "hybrid") -> List[Dict]:
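Taken together, the fusion hunks above max-normalize each result list to [0, 1] and then score every document as vector_weight · v + bm25_weight · k, so a document found by both methods outranks one found by only one. A compact sketch of that weighting with toy scores and the diff's default weights:

```python
# Toy normalized scores per doc_id (illustrative values only).
vector_scores = {"doc_a": 1.0, "doc_b": 0.4}
bm25_scores = {"doc_a": 0.2, "doc_c": 1.0}
vector_weight, bm25_weight = 0.6, 0.4

combined = {}
for doc_id in set(vector_scores) | set(bm25_scores):
    v = vector_scores.get(doc_id, 0.0)
    k = bm25_scores.get(doc_id, 0.0)
    combined[doc_id] = vector_weight * v + bm25_weight * k

print(sorted(combined.items(), key=lambda x: x[1], reverse=True))
# doc_a ≈ 0.68, doc_c = 0.40, doc_b = 0.24
```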
@@ -336,15 +377,15 @@ class HybridSearchRAGBot:
             return self.vector_search(query, top_k)
         elif search_type == "bm25":
             return self.bm25_search(query, top_k)
-        else:
+        else:  # hybrid
             return self.hybrid_search(query, top_k)
 
 # Initialize the bot
-logger.info("Initializing Hybrid Search RAGtim Bot...")
+print("Initializing Hybrid Search RAGtim Bot...")
 bot = HybridSearchRAGBot()
 
-# API Functions
-def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_weight=0.4):
+# API Functions for Gradio Client
+def search_api(query: str, top_k: int = 5, search_type: str = "hybrid", vector_weight: float = 0.6, bm25_weight: float = 0.4):
     """API endpoint for hybrid search functionality"""
     try:
         if search_type == "hybrid":
@@ -366,12 +407,13 @@ def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_wei
             }
         }
     except Exception as e:
-        logger.error(f"Error in search API: {e}")
+        print(f"Error in search API: {e}")
         return {"error": str(e), "results": []}
 
 def get_stats_api():
     """API endpoint for knowledge base statistics"""
     try:
+        # Calculate document distribution by type
         doc_types = {}
         sections_by_file = {}
 
@@ -386,8 +428,8 @@ def get_stats_api():
             "total_documents": len(bot.knowledge_base),
             "document_types": doc_types,
             "sections_by_file": sections_by_file,
-            "model_name": EMBEDDING_MODEL,
-            "embedding_dimension": EMBEDDING_DIM,
+            "model_name": "sentence-transformers/all-MiniLM-L6-v2",
+            "embedding_dimension": 384,
             "search_capabilities": [
                 "Hybrid Search (Vector + BM25)",
                 "Semantic Vector Search",
@@ -406,7 +448,7 @@ def get_stats_api():
             "status": "healthy"
         }
     except Exception as e:
-        logger.error(f"Error in get_stats_api: {e}")
+        print(f"Error in get_stats_api: {e}")
         return {
             "error": str(e),
             "status": "error",
@@ -420,29 +462,35 @@ def chat_interface(message, history):
         return "Please ask me something about Raktim Mondol! I use hybrid search combining semantic similarity and keyword matching for the best results."
 
     try:
+        # Use hybrid search by default
         search_results = bot.hybrid_search(message, top_k=6)
 
         if search_results:
+            # Build comprehensive response
             response_parts = []
             response_parts.append(f"🔍 **Hybrid Search Results** (Vector + BM25 combination, found {len(search_results)} relevant sections):\n")
 
+            # Use the best match as primary response
             best_match = search_results[0]
             response_parts.append(f"**Primary Answer** (Hybrid Score: {best_match['score']:.3f}):")
             response_parts.append(f"📄 Source: {best_match['document']['metadata']['source']} - {best_match['document']['metadata']['section']}")
             response_parts.append(f"🔍 Search Type: {best_match['search_type'].upper()}")
 
+            # Show score breakdown for hybrid results
             if 'vector_score' in best_match and 'bm25_score' in best_match:
                 response_parts.append(f"📊 Vector Score: {best_match['vector_score']:.3f} | BM25 Score: {best_match['bm25_score']:.3f}")
 
             response_parts.append(f"\n{best_match['document']['content']}\n")
 
+            # Add additional context if available
             if len(search_results) > 1:
                 response_parts.append("**Additional Context:**")
-                for i, result in enumerate(search_results[1:3], 1):
+                for i, result in enumerate(search_results[1:3], 1):  # Show up to 2 additional results
                     section_info = f"{result['document']['metadata']['source']} - {result['document']['metadata']['section']}"
                     search_info = f"({result['search_type'].upper()}, Score: {result['score']:.3f})"
                     response_parts.append(f"{i}. {section_info} {search_info}")
 
+                    # Add a brief excerpt
                     excerpt = result['document']['content'][:200] + "..." if len(result['document']['content']) > 200 else result['document']['content']
                     response_parts.append(f" {excerpt}\n")
 
@@ -457,203 +505,66 @@ def chat_interface(message, history):
             return "I don't have specific information about that topic in my knowledge base. Could you please ask something else about Raktim Mondol?"
 
     except Exception as e:
-        logger.error(f"Error in chat interface: {e}")
+        print(f"Error in chat interface: {e}")
         return "I'm sorry, I encountered an error while processing your question. Please try again."
 
-# Gradio Interface
-css = """
-.gradio-container {
-    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
-}
-.search-type-radio .wrap {
-    display: flex;
-    gap: 10px;
-}
-.search-weights {
-    background: #f0f0f0;
-    padding: 10px;
-    border-radius: 5px;
-    margin: 10px 0;
-}
-"""
+# Create Gradio interfaces with proper API names
+print("Creating Gradio interface...")
 
-# Create the main chat interface
-with gr.Blocks(
+# Main chat interface
+chat_demo = gr.Interface(
+    fn=chat_interface,
+    inputs=[
+        gr.Textbox(label="Ask about Raktim Mondol", placeholder="What would you like to know about Raktim's research, skills, or experience?"),
+        gr.State([])  # For conversation history
+    ],
+    outputs=gr.Textbox(label="Response"),
     title="🔥 Hybrid Search RAGtim Bot",
-    css=css,
-    theme=gr.themes.Soft(
-        primary_hue="green",
-        secondary_hue="blue",
-        neutral_hue="slate"
-    )
-) as chat_demo:
-    gr.Markdown(f"""
-    # 🔥 Hybrid Search RAGtim Bot - Advanced Search Technology
-
-    **🚀 Hybrid Search System**: This Space implements **true hybrid search** combining:
-    - 🧠 **Semantic Vector Search**: Transformer embeddings for conceptual similarity
-    - 🔍 **BM25 Keyword Search**: Advanced TF-IDF ranking for exact term matching
-    - ⚖️ **Intelligent Fusion**: Weighted combination for optimal relevance
-
-    **📚 Knowledge Base**: **{len(bot.knowledge_base)} sections** from comprehensive markdown files
-
-    **🔧 Search Parameters**:
-    - **BM25 Parameters**: k1={bot.k1}, b={bot.b}
-    - **Vocabulary**: {len(bot.document_frequency)} unique terms
-    - **Average Document Length**: {bot.average_doc_length:.1f} words
-    - **Embedding Model**: {EMBEDDING_MODEL} ({EMBEDDING_DIM}-dim)
-
-    **Ask me anything about Raktim Mondol's research, expertise, and background!**
-    """)
-
-    chatbot = gr.Chatbot(
-        height=500,
-        show_label=False,
-        container=True,
-        type="messages"
-    )
-
-    with gr.Row():
-        msg = gr.Textbox(
-            placeholder="Ask about Raktim's research, LLM expertise, publications, statistical methods...",
-            container=False,
-            scale=7,
-            show_label=False
-        )
-        submit_btn = gr.Button("🔍 Hybrid Search", scale=1)
-
-    # Example buttons
-    with gr.Row():
-        examples = [
-            "What is Raktim's LLM and RAG research?",
-            "Tell me about BioFusionNet statistical methods",
-            "What are his multimodal AI capabilities?",
-            "Describe his biostatistics expertise"
-        ]
-        for example in examples:
-            gr.Button(example, size="sm").click(
-                lambda x=example: x, outputs=msg
-            )
-
-    def respond(message, history):
-        if not message.strip():
-            return history, ""
-
-        history.append({"role": "user", "content": message})
-        bot_response = chat_interface(message, history)
-        history.append({"role": "assistant", "content": bot_response})
-
-        return history, ""
-
-    submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
-    msg.submit(respond, [msg, chatbot], [chatbot, msg])
+    description="Ask me anything about Raktim Mondol! I use advanced hybrid search combining semantic similarity and keyword matching.",
+    api_name="chat"
+)
 
-# Advanced search interface
-with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
-    gr.Markdown("# 🔧 Advanced Hybrid Search Configuration")
-
-    with gr.Row():
-        with gr.Column(scale=2):
-            search_input = gr.Textbox(
-                label="Search Query",
-                placeholder="Enter your search query about Raktim Mondol..."
-            )
-
-            with gr.Row():
-                search_type = gr.Radio(
-                    choices=["hybrid", "vector", "bm25"],
-                    value="hybrid",
-                    label="Search Method"
-                )
-                top_k_slider = gr.Slider(
-                    minimum=1,
-                    maximum=15,
-                    value=5,
-                    step=1,
-                    label="Top K Results"
-                )
-
-            with gr.Group(visible=True) as weight_group:
-                gr.Markdown("**Hybrid Search Weights**")
-                vector_weight = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.6,
-                    step=0.1,
-                    label="Vector Weight (Semantic)"
-                )
-                bm25_weight = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.4,
-                    step=0.1,
-                    label="BM25 Weight (Keyword)"
-                )
-
-        with gr.Column(scale=1):
-            gr.Markdown("**Search Method Guide:**")
-            gr.Markdown("""
-            **🔥 Hybrid**: Combines semantic + keyword
-            - Best for most queries
-            - Balances meaning and exact terms
-
-            **🧠 Vector**: Pure semantic similarity
-            - Good for conceptual questions
-            - Finds related concepts
-
-            **🔍 BM25**: Pure keyword matching
-            - Good for specific terms
-            - Traditional search ranking
-            """)
-
-    search_output = gr.JSON(label="Hybrid Search Results", height=400)
-    search_btn = gr.Button("🔍 Search with Custom Parameters", variant="primary")
-
-    def update_weights_visibility(search_type):
-        return gr.Group(visible=(search_type == "hybrid"))
-
-    search_type.change(update_weights_visibility, inputs=[search_type], outputs=[weight_group])
-
-    def normalize_weights(vector_w, bm25_w):
-        total = vector_w + bm25_w
-        if total > 0:
-            return vector_w / total, bm25_w / total
-        return 0.6, 0.4
-
-    def advanced_search(query, search_type, top_k, vector_w, bm25_w):
-        vector_weight, bm25_weight = normalize_weights(vector_w, bm25_w)
-        return search_api(query, top_k, search_type, vector_weight, bm25_weight)
-
-    search_btn.click(
-        advanced_search,
-        inputs=[search_input, search_type, top_k_slider, vector_weight, bm25_weight],
-        outputs=search_output
-    )
+# Search API interface
+search_demo = gr.Interface(
+    fn=search_api,
+    inputs=[
+        gr.Textbox(label="Search Query", placeholder="Enter your search query"),
+        gr.Number(label="Top K Results", value=5, minimum=1, maximum=20),
+        gr.Radio(choices=["hybrid", "vector", "bm25"], value="hybrid", label="Search Type"),
+        gr.Slider(minimum=0.0, maximum=1.0, value=0.6, label="Vector Weight"),
+        gr.Slider(minimum=0.0, maximum=1.0, value=0.4, label="BM25 Weight")
+    ],
+    outputs=gr.JSON(label="Search Results"),
+    title="🔍 Hybrid Search API",
+    description="Direct access to the hybrid search functionality",
+    api_name="search"
+)
 
-# Stats interface
-with gr.Blocks(title="📊 System Statistics") as stats_demo:
-    gr.Markdown("# 📊 Hybrid Search System Statistics")
-
-    stats_output = gr.JSON(label="System Statistics", height=500)
-    stats_btn = gr.Button("📊 Get System Statistics", variant="primary")
-
-    stats_btn.click(get_stats_api, inputs=[], outputs=stats_output)
+# Stats API interface
+stats_demo = gr.Interface(
+    fn=get_stats_api,
+    inputs=[],
+    outputs=gr.JSON(label="System Statistics"),
+    title="📊 System Statistics",
+    description="Knowledge base and system information",
+    api_name="stats"
+)
 
-# Main demo with tabs
+# Combine interfaces
 demo = gr.TabbedInterface(
     [chat_demo, search_demo, stats_demo],
-    ["💬 Hybrid Chat", "🔧 Advanced Search", "📊 Statistics"],
+    ["💬 Chat", "🔍 Search API", "📊 Stats API"],
     title="🔥 Hybrid Search RAGtim Bot - Vector + BM25 Fusion"
 )
 
-# Launch the application
 if __name__ == "__main__":
-    logger.info("🚀 Launching Hybrid Search RAGtim Bot...")
-    logger.info(f"📚 Loaded {len(bot.knowledge_base)} sections from markdown files")
-    logger.info(f"🔍 BM25 index: {len(bot.document_frequency)} unique terms")
-    logger.info(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
-    logger.info("🔥 Hybrid search ready: Semantic + Keyword fusion!")
+    print("🚀 Launching Hybrid Search RAGtim Bot...")
+    print(f"📚 Loaded {len(bot.knowledge_base)} sections from markdown files")
+    print(f"🔍 BM25 index: {len(bot.document_frequency)} unique terms")
+    print(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
+    print("🔥 Hybrid search ready: Semantic + Keyword fusion!")
 
+    # Launch the main demo with API access
     demo.launch(
         server_name="0.0.0.0",
        server_port=7860,
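Because the rewritten app exposes named endpoints (api_name="chat", "search", "stats") via gr.Interface, the Space can also be called programmatically. A hedged sketch using gradio_client (the Space id here is assumed for illustration; substitute the real one):

```python
from gradio_client import Client

client = Client("raktimhugging/ragtim-bot")  # assumed Space id

result = client.predict(
    "What is Raktim's LLM and RAG research?",  # query
    5,          # top_k
    "hybrid",   # search_type
    0.6,        # vector_weight
    0.4,        # bm25_weight
    api_name="/search",
)
print(result)  # dict with a "results" list, per search_api above
```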
 