raktimhugging committed
Commit 39dacf3 · verified · 1 Parent(s): 843035b

Update app.py

Files changed (1)
  app.py +64 -185
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import json
 import numpy as np
-from transformers import pipeline, AutoTokenizer, AutoModel
+from transformers import pipeline
 import torch
 import os
 from typing import List, Dict, Any
@@ -10,10 +10,18 @@ import requests
 import re
 import math
 from collections import defaultdict, Counter
+import logging
+
+# Import configuration
+from config import *
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 # Configure device
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
+device = get_device()
+logger.info(f"Using device: {device}")
 
 class HybridSearchRAGBot:
     def __init__(self):
@@ -22,15 +30,15 @@ class HybridSearchRAGBot:
         self.embeddings = []
 
         # BM25 components
-        self.term_frequencies = {}    # doc_id -> {term: frequency}
-        self.document_frequency = {}  # term -> number of docs containing term
-        self.document_lengths = {}    # doc_id -> document length
+        self.term_frequencies = {}
+        self.document_frequency = {}
+        self.document_lengths = {}
         self.average_doc_length = 0
         self.total_documents = 0
 
         # BM25 parameters
-        self.k1 = 1.5   # Controls term frequency saturation
-        self.b = 0.75   # Controls document length normalization
+        self.k1 = BM25_K1
+        self.b = BM25_B
 
         self.initialize_models()
         self.load_markdown_knowledge_base()
@@ -39,84 +47,64 @@ class HybridSearchRAGBot:
     def initialize_models(self):
         """Initialize the embedding model"""
         try:
-            print("Loading embedding model...")
+            logger.info("Loading embedding model...")
             self.embedder = pipeline(
                 'feature-extraction',
-                'sentence-transformers/all-MiniLM-L6-v2',
+                EMBEDDING_MODEL,
                 device=0 if device == "cuda" else -1
             )
-            print("✅ Embedding model loaded successfully")
+            logger.info("✅ Embedding model loaded successfully")
         except Exception as e:
-            print(f"❌ Error loading embedding model: {e}")
+            logger.error(f"❌ Error loading embedding model: {e}")
             raise e
 
     def load_markdown_knowledge_base(self):
         """Load knowledge base from markdown files"""
-        print("Loading knowledge base from markdown files...")
+        logger.info("Loading knowledge base from markdown files...")
 
         # Reset knowledge base
         self.knowledge_base = []
 
-        # Load all markdown files
-        markdown_files = [
-            'about.md',
-            'research_details.md',
-            'publications_detailed.md',
-            'skills_expertise.md',
-            'experience_detailed.md',
-            'statistics.md'
-        ]
-
-        for filename in markdown_files:
+        for filename in KNOWLEDGE_BASE_FILES:
             try:
                 if os.path.exists(filename):
                     with open(filename, 'r', encoding='utf-8') as f:
                         content = f.read()
-                    self.process_markdown_file(content, filename)
-                    print(f"✅ Loaded {filename}")
+                    self.process_markdown_file(content, os.path.basename(filename))
+                    logger.info(f"✅ Loaded {filename}")
                 else:
-                    print(f"⚠️ File not found: {filename}")
+                    logger.warning(f"⚠️ File not found: {filename}")
             except Exception as e:
-                print(f"❌ Error loading {filename}: {e}")
+                logger.error(f"❌ Error loading {filename}: {e}")
 
         # Generate embeddings for knowledge base
-        print("Generating embeddings for knowledge base...")
+        logger.info("Generating embeddings for knowledge base...")
         self.embeddings = []
         for i, doc in enumerate(self.knowledge_base):
             try:
                 # Truncate content to avoid token limit issues
-                content = doc["content"][:500]  # Limit to 500 characters
+                content = doc["content"][:500]
                 embedding = self.embedder(content, return_tensors="pt")
                 # Convert to numpy and flatten
                 embedding_np = embedding[0].mean(dim=0).detach().cpu().numpy()
                 self.embeddings.append(embedding_np)
             except Exception as e:
-                print(f"Error generating embedding for doc {doc['id']}: {e}")
+                logger.error(f"Error generating embedding for doc {doc['id']}: {e}")
                 # Fallback to zero embedding
-                self.embeddings.append(np.zeros(384))
+                self.embeddings.append(np.zeros(EMBEDDING_DIM))
 
         self.total_documents = len(self.knowledge_base)
-        print(f"✅ Knowledge base loaded with {len(self.knowledge_base)} documents")
+        logger.info(f"✅ Knowledge base loaded with {len(self.knowledge_base)} documents")
 
     def process_markdown_file(self, content: str, filename: str):
         """Process a markdown file and extract sections"""
-        # Determine file type and priority
-        file_type_map = {
-            'about.md': ('about', 10),
-            'research_details.md': ('research', 9),
-            'publications_detailed.md': ('publications', 8),
-            'skills_expertise.md': ('skills', 7),
-            'experience_detailed.md': ('experience', 8),
-            'statistics.md': ('statistics', 9)
-        }
-
-        file_type, priority = file_type_map.get(filename, ('general', 5))
+        file_type, priority = FILE_TYPE_MAP.get(filename, ('general', 5))
 
         # Split content into sections
         sections = self.split_markdown_into_sections(content)
 
         for section in sections:
-            if len(section['content'].strip()) > 100:  # Only process substantial content
+            if len(section['content'].strip()) > 100:
                 doc = {
                     "id": f"{filename}_{section['title']}_{len(self.knowledge_base)}",
                     "content": section['content'],
@@ -136,14 +124,10 @@ class HybridSearchRAGBot:
         current_section = {'title': 'Introduction', 'content': ''}
 
         for line in lines:
-            # Check if line is a header
             if line.startswith('#'):
-                # Save previous section if it has content
                 if current_section['content'].strip():
                     sections.append(current_section.copy())
 
-                # Start new section
-                header_level = len(line) - len(line.lstrip('#'))
                 title = line.lstrip('#').strip()
                 current_section = {
                     'title': title,
@@ -152,7 +136,6 @@ class HybridSearchRAGBot:
             else:
                 current_section['content'] += line + '\n'
 
-        # Add the last section
         if current_section['content'].strip():
             sections.append(current_section)
 
@@ -160,9 +143,7 @@ class HybridSearchRAGBot:
 
     def tokenize(self, text: str) -> List[str]:
         """Tokenize text for BM25"""
-        # Convert to lowercase and remove punctuation
         text = re.sub(r'[^\w\s]', ' ', text.lower())
-        # Split into words and filter out short words and stop words
         words = [word for word in text.split() if len(word) > 2 and not self.is_stop_word(word)]
         return words
 
@@ -178,54 +159,44 @@ class HybridSearchRAGBot:
 
     def build_bm25_index(self):
         """Build BM25 index for all documents"""
-        print("Building BM25 index...")
+        logger.info("Building BM25 index...")
 
-        # Reset indexes
         self.term_frequencies = {}
         self.document_frequency = defaultdict(int)
         self.document_lengths = {}
 
         total_length = 0
 
-        # First pass: calculate term frequencies and document lengths
         for doc in self.knowledge_base:
             doc_id = doc['id']
             terms = self.tokenize(doc['content'])
 
-            # Calculate term frequencies for this document
             term_freq = Counter(terms)
             self.term_frequencies[doc_id] = dict(term_freq)
 
-            # Store document length
             doc_length = len(terms)
             self.document_lengths[doc_id] = doc_length
             total_length += doc_length
 
-            # Update document frequencies
             unique_terms = set(terms)
             for term in unique_terms:
                 self.document_frequency[term] += 1
 
-        # Calculate average document length
         self.average_doc_length = total_length / self.total_documents if self.total_documents > 0 else 0
 
-        print(f"✅ BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
+        logger.info(f"✅ BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
 
     def calculate_bm25_score(self, term: str, doc_id: str) -> float:
         """Calculate BM25 score for a term in a document"""
-        # Get term frequency in document
         tf = self.term_frequencies.get(doc_id, {}).get(term, 0)
         if tf == 0:
             return 0.0
 
-        # Get document frequency and document length
         df = self.document_frequency.get(term, 1)
         doc_length = self.document_lengths.get(doc_id, 0)
 
-        # Calculate IDF: log((N - df + 0.5) / (df + 0.5))
         idf = math.log((self.total_documents - df + 0.5) / (df + 0.5))
 
-        # Calculate BM25 score
         numerator = tf * (self.k1 + 1)
         denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.average_doc_length))
 
@@ -239,7 +210,6 @@ class HybridSearchRAGBot:
 
         scores = {}
 
-        # Calculate BM25 score for each document
         for doc in self.knowledge_base:
             doc_id = doc['id']
             score = 0.0
@@ -248,7 +218,6 @@ class HybridSearchRAGBot:
                 score += self.calculate_bm25_score(term, doc_id)
 
             if score > 0:
-                # Apply priority boost
                 priority_boost = 1 + (doc['metadata']['priority'] / 50)
                 final_score = score * priority_boost
 
@@ -258,7 +227,6 @@ class HybridSearchRAGBot:
                     'search_type': 'bm25'
                 }
 
-        # Sort by score and return top_k
         sorted_results = sorted(scores.values(), key=lambda x: x['score'], reverse=True)
         return sorted_results[:top_k]
 
@@ -269,17 +237,14 @@ class HybridSearchRAGBot:
     def vector_search(self, query: str, top_k: int = 10) -> List[Dict]:
         """Perform vector similarity search"""
         try:
-            # Generate query embedding
-            query_embedding = self.embedder(query[:500], return_tensors="pt")  # Truncate query
+            query_embedding = self.embedder(query[:500], return_tensors="pt")
             query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()
 
-            # Calculate similarities
             similarities = []
             for i, doc_embedding in enumerate(self.embeddings):
                 if doc_embedding is not None and len(doc_embedding) > 0:
                     similarity = self.cosine_similarity(query_vector, doc_embedding)
 
-                    # Apply priority boost
                     priority_boost = 1 + (self.knowledge_base[i]['metadata']['priority'] / 100)
                     final_score = similarity * priority_boost
 
@@ -289,22 +254,20 @@ class HybridSearchRAGBot:
                         'search_type': 'vector'
                     })
 
-            # Sort by similarity and return top_k
             similarities.sort(key=lambda x: x['score'], reverse=True)
             return similarities[:top_k]
 
         except Exception as e:
-            print(f"Error in vector search: {e}")
+            logger.error(f"Error in vector search: {e}")
             return []
 
     def hybrid_search(self, query: str, top_k: int = 10, vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict]:
         """Perform hybrid search combining vector and BM25 results"""
         try:
-            # Get results from both search methods
-            vector_results = self.vector_search(query, top_k * 2)  # Get more results for better fusion
+            vector_results = self.vector_search(query, top_k * 2)
             bm25_results = self.bm25_search(query, top_k * 2)
 
-            # Normalize scores to [0, 1] range
+            # Normalize scores
             if vector_results:
                 max_vector_score = max(r['score'] for r in vector_results)
                 if max_vector_score > 0:
@@ -326,7 +289,6 @@ class HybridSearchRAGBot:
             # Combine results
             combined_scores = {}
 
-            # Add vector results
             for result in vector_results:
                 doc_id = result['document']['id']
                 combined_scores[doc_id] = {
@@ -336,7 +298,6 @@ class HybridSearchRAGBot:
                     'search_type': 'vector'
                 }
 
-            # Add BM25 results
             for result in bm25_results:
                 doc_id = result['document']['id']
                 if doc_id in combined_scores:
@@ -362,13 +323,11 @@ class HybridSearchRAGBot:
                     'search_type': data['search_type']
                 })
 
-            # Sort by hybrid score and return top_k
             final_results.sort(key=lambda x: x['score'], reverse=True)
             return final_results[:top_k]
 
         except Exception as e:
-            print(f"Error in hybrid search: {e}")
-            # Fallback to vector search only
+            logger.error(f"Error in hybrid search: {e}")
             return self.vector_search(query, top_k)
 
     def search_knowledge_base(self, query: str, top_k: int = 5, search_type: str = "hybrid") -> List[Dict]:
@@ -377,13 +336,14 @@ class HybridSearchRAGBot:
             return self.vector_search(query, top_k)
         elif search_type == "bm25":
             return self.bm25_search(query, top_k)
-        else:  # hybrid
+        else:
            return self.hybrid_search(query, top_k)
 
 # Initialize the bot
-print("Initializing Hybrid Search RAGtim Bot...")
+logger.info("Initializing Hybrid Search RAGtim Bot...")
 bot = HybridSearchRAGBot()
 
+# API Functions
 def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_weight=0.4):
     """API endpoint for hybrid search functionality"""
     try:
@@ -406,13 +366,12 @@ def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_wei
             }
         }
     except Exception as e:
-        print(f"Error in search API: {e}")
+        logger.error(f"Error in search API: {e}")
         return {"error": str(e), "results": []}
 
 def get_stats_api():
     """API endpoint for knowledge base statistics"""
     try:
-        # Calculate document distribution by type
         doc_types = {}
         sections_by_file = {}
 
@@ -427,8 +386,8 @@ def get_stats_api():
             "total_documents": len(bot.knowledge_base),
             "document_types": doc_types,
             "sections_by_file": sections_by_file,
-            "model_name": "sentence-transformers/all-MiniLM-L6-v2",
-            "embedding_dimension": 384,
+            "model_name": EMBEDDING_MODEL,
+            "embedding_dimension": EMBEDDING_DIM,
             "search_capabilities": [
                 "Hybrid Search (Vector + BM25)",
                 "Semantic Vector Search",
@@ -447,7 +406,7 @@ def get_stats_api():
             "status": "healthy"
         }
     except Exception as e:
-        print(f"Error in get_stats_api: {e}")
+        logger.error(f"Error in get_stats_api: {e}")
         return {
             "error": str(e),
             "status": "error",
@@ -461,35 +420,29 @@ def chat_interface(message, history):
         return "Please ask me something about Raktim Mondol! I use hybrid search combining semantic similarity and keyword matching for the best results."
 
     try:
-        # Use hybrid search by default
         search_results = bot.hybrid_search(message, top_k=6)
 
         if search_results:
-            # Build comprehensive response
             response_parts = []
             response_parts.append(f"🔍 **Hybrid Search Results** (Vector + BM25 combination, found {len(search_results)} relevant sections):\n")
 
-            # Use the best match as primary response
             best_match = search_results[0]
             response_parts.append(f"**Primary Answer** (Hybrid Score: {best_match['score']:.3f}):")
             response_parts.append(f"📄 Source: {best_match['document']['metadata']['source']} - {best_match['document']['metadata']['section']}")
             response_parts.append(f"🔍 Search Type: {best_match['search_type'].upper()}")
 
-            # Show score breakdown for hybrid results
             if 'vector_score' in best_match and 'bm25_score' in best_match:
                 response_parts.append(f"📊 Vector Score: {best_match['vector_score']:.3f} | BM25 Score: {best_match['bm25_score']:.3f}")
 
             response_parts.append(f"\n{best_match['document']['content']}\n")
 
-            # Add additional context if available
             if len(search_results) > 1:
                 response_parts.append("**Additional Context:**")
-                for i, result in enumerate(search_results[1:3], 1):  # Show up to 2 additional results
+                for i, result in enumerate(search_results[1:3], 1):
                     section_info = f"{result['document']['metadata']['source']} - {result['document']['metadata']['section']}"
                     search_info = f"({result['search_type'].upper()}, Score: {result['score']:.3f})"
                     response_parts.append(f"{i}. {section_info} {search_info}")
 
-                    # Add a brief excerpt
                     excerpt = result['document']['content'][:200] + "..." if len(result['document']['content']) > 200 else result['document']['content']
                     response_parts.append(f"   {excerpt}\n")
 
@@ -504,13 +457,10 @@ def chat_interface(message, history):
         return "I don't have specific information about that topic in my knowledge base. Could you please ask something else about Raktim Mondol?"
 
     except Exception as e:
-        print(f"Error in chat interface: {e}")
+        logger.error(f"Error in chat interface: {e}")
         return "I'm sorry, I encountered an error while processing your question. Please try again."
 
-# Create Gradio interface
-print("Creating Gradio interface...")
-
-# Custom CSS for better styling
+# Gradio Interface
 css = """
 .gradio-container {
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
@@ -545,24 +495,13 @@ with gr.Blocks(
     - 🔍 **BM25 Keyword Search**: Advanced TF-IDF ranking for exact term matching
     - ⚖️ **Intelligent Fusion**: Weighted combination for optimal relevance
 
-    **📚 Knowledge Base**: **{len(bot.knowledge_base)} sections** from comprehensive markdown files:
-    - 📄 **about.md** - Personal info, contact, professional summary
-    - 🔬 **research_details.md** - Research projects, methodologies, innovations
-    - 📚 **publications_detailed.md** - Publications with technical details
-    - 💻 **skills_expertise.md** - Technical skills, LLM expertise, tools
-    - 💼 **experience_detailed.md** - Professional experience, teaching
-    - 📊 **statistics.md** - Statistical methods, biostatistics expertise
+    **📚 Knowledge Base**: **{len(bot.knowledge_base)} sections** from comprehensive markdown files
 
     **🔧 Search Parameters**:
     - **BM25 Parameters**: k1={bot.k1}, b={bot.b}
     - **Vocabulary**: {len(bot.document_frequency)} unique terms
     - **Average Document Length**: {bot.average_doc_length:.1f} words
-    - **Embedding Model**: sentence-transformers/all-MiniLM-L6-v2 (384-dim)
-
-    **💡 Try Different Search Types**:
-    - **Hybrid** (Recommended): Best of both semantic and keyword search
-    - **Vector**: Pure semantic similarity for conceptual queries
-    - **BM25**: Pure keyword matching for specific terms
+    - **Embedding Model**: {EMBEDDING_MODEL} ({EMBEDDING_DIM}-dim)
 
     **Ask me anything about Raktim Mondol's research, expertise, and background!**
     """)
@@ -600,13 +539,8 @@ with gr.Blocks(
         if not message.strip():
             return history, ""
 
-        # Add user message to history
         history.append({"role": "user", "content": message})
-
-        # Get bot response
         bot_response = chat_interface(message, history)
-
-        # Add bot response to history
         history.append({"role": "assistant", "content": bot_response})
 
         return history, ""
@@ -614,10 +548,9 @@
     submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
     msg.submit(respond, [msg, chatbot], [chatbot, msg])
 
-# Create advanced search interface
+# Advanced search interface
 with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
     gr.Markdown("# 🔧 Advanced Hybrid Search Configuration")
-    gr.Markdown("Fine-tune the hybrid search parameters and compare different search methods")
 
     with gr.Row():
         with gr.Column(scale=2):
@@ -630,8 +563,7 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
             search_type = gr.Radio(
                 choices=["hybrid", "vector", "bm25"],
                 value="hybrid",
-                label="Search Method",
-                elem_classes=["search-type-radio"]
+                label="Search Method"
             )
             top_k_slider = gr.Slider(
                 minimum=1,
@@ -641,7 +573,6 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
                 label="Top K Results"
             )
 
-            # Hybrid search weights (only shown when hybrid is selected)
             with gr.Group(visible=True) as weight_group:
                 gr.Markdown("**Hybrid Search Weights**")
                 vector_weight = gr.Slider(
@@ -690,7 +621,6 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
         return 0.6, 0.4
 
     def advanced_search(query, search_type, top_k, vector_w, bm25_w):
-        # Normalize weights
         vector_weight, bm25_weight = normalize_weights(vector_w, bm25_w)
         return search_api(query, top_k, search_type, vector_weight, bm25_weight)
 
@@ -700,84 +630,33 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
         outputs=search_output
     )
 
-# Create stats interface
+# Stats interface
 with gr.Blocks(title="📊 System Statistics") as stats_demo:
     gr.Markdown("# 📊 Hybrid Search System Statistics")
-    gr.Markdown("Detailed information about the knowledge base and search capabilities")
 
     stats_output = gr.JSON(label="System Statistics", height=500)
     stats_btn = gr.Button("📊 Get System Statistics", variant="primary")
 
-    stats_btn.click(
-        get_stats_api,
-        inputs=[],
-        outputs=stats_output
-    )
+    stats_btn.click(get_stats_api, inputs=[], outputs=stats_output)
 
-# Combine interfaces using TabbedInterface
+# Main demo with tabs
 demo = gr.TabbedInterface(
     [chat_demo, search_demo, stats_demo],
     ["💬 Hybrid Chat", "🔧 Advanced Search", "📊 Statistics"],
     title="🔥 Hybrid Search RAGtim Bot - Vector + BM25 Fusion"
 )
 
-# Create API functions for external access
-def api_search_function(query: str, top_k: int = 5, search_type: str = "hybrid", vector_weight: float = 0.6, bm25_weight: float = 0.4):
-    """API function for search - accessible via Gradio API"""
-    try:
-        if not query or not query.strip():
-            return {"error": "Query parameter is required"}
-
-        return search_api(query.strip(), top_k, search_type, vector_weight, bm25_weight)
-    except Exception as e:
-        return {"error": str(e)}
-
-def api_stats_function():
-    """API function for stats - accessible via Gradio API"""
-    try:
-        return get_stats_api()
-    except Exception as e:
-        return {"error": str(e)}
-
-# Create separate API interfaces that can be accessed via HTTP
-search_api_interface = gr.Interface(
-    fn=api_search_function,
-    inputs=[
-        gr.Textbox(label="query", placeholder="Enter search query"),
-        gr.Number(label="top_k", value=5, minimum=1, maximum=20),
-        gr.Dropdown(label="search_type", choices=["hybrid", "vector", "bm25"], value="hybrid"),
-        gr.Number(label="vector_weight", value=0.6, minimum=0.0, maximum=1.0),
-        gr.Number(label="bm25_weight", value=0.4, minimum=0.0, maximum=1.0)
-    ],
-    outputs=gr.JSON(label="Search Results"),
-    title="Search API",
-    description="Hybrid search API endpoint"
-)
-
-stats_api_interface = gr.Interface(
-    fn=api_stats_function,
-    inputs=[],
-    outputs=gr.JSON(label="Statistics"),
-    title="Stats API",
-    description="Knowledge base statistics API endpoint"
-)
-
+# Launch the application
 if __name__ == "__main__":
-    print("🚀 Launching Hybrid Search RAGtim Bot...")
-    print(f"📚 Loaded {len(bot.knowledge_base)} sections from markdown files")
-    print(f"🔍 BM25 index: {len(bot.document_frequency)} unique terms")
-    print(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
-    print("🔥 Hybrid search ready: Semantic + Keyword fusion!")
+    logger.info("🚀 Launching Hybrid Search RAGtim Bot...")
+    logger.info(f"📚 Loaded {len(bot.knowledge_base)} sections from markdown files")
+    logger.info(f"🔍 BM25 index: {len(bot.document_frequency)} unique terms")
+    logger.info(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
+    logger.info("🔥 Hybrid search ready: Semantic + Keyword fusion!")
 
-    # Launch the main demo
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
        show_error=True
-    )
-
-    # Note: The API interfaces are available at:
-    # - Main interface: https://your-space-url.hf.space
-    # - Search API: https://your-space-url.hf.space/api/search (via the main interface)
-    # - Stats API: https://your-space-url.hf.space/api/stats (via the main interface)
+    )
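Note: the rewritten app.py reads its settings from a config module that is not included in this commit. Below is a minimal sketch of a config.py that would satisfy the names the new code imports (get_device, EMBEDDING_MODEL, EMBEDDING_DIM, BM25_K1, BM25_B, KNOWLEDGE_BASE_FILES, FILE_TYPE_MAP); every value here is inferred from the literals this commit deletes, so the real file may differ.

# config.py (hypothetical reconstruction; values mirror the deleted literals in app.py)
import torch

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # previously hard-coded in initialize_models()
EMBEDDING_DIM = 384                                         # previously the np.zeros(384) fallback

BM25_K1 = 1.5   # controls term frequency saturation (old default)
BM25_B = 0.75   # controls document length normalization (old default)

KNOWLEDGE_BASE_FILES = [
    'about.md',
    'research_details.md',
    'publications_detailed.md',
    'skills_expertise.md',
    'experience_detailed.md',
    'statistics.md',
]

# (file_type, priority) per file, as in the deleted file_type_map
FILE_TYPE_MAP = {
    'about.md': ('about', 10),
    'research_details.md': ('research', 9),
    'publications_detailed.md': ('publications', 8),
    'skills_expertise.md': ('skills', 7),
    'experience_detailed.md': ('experience', 8),
    'statistics.md': ('statistics', 9),
}

def get_device() -> str:
    """Replicates the device check this commit removed from app.py."""
    return "cuda" if torch.cuda.is_available() else "cpu"

Note also that the loader now passes os.path.basename(filename) to process_markdown_file, which suggests KNOWLEDGE_BASE_FILES entries may be paths while FILE_TYPE_MAP stays keyed by bare filenames.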
 
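Note: calculate_bm25_score implements standard Okapi BM25, and this commit deletes the inline comment that spelled out the IDF term. For reference, in the code's notation (N = total documents, df(t) = number of documents containing term t, tf(t, d) = frequency of t in document d, |d| = length of d, avgdl = average document length), the score works out to:

\mathrm{score}(t, d) = \log\frac{N - \mathrm{df}(t) + 0.5}{\mathrm{df}(t) + 0.5} \cdot \frac{\mathrm{tf}(t, d)\,(k_1 + 1)}{\mathrm{tf}(t, d) + k_1\left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)}

with k1 = BM25_K1 and b = BM25_B. This IDF variant can go negative for terms that appear in more than half the documents; no clamping is visible in the diff.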
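Note: the score-fusion step inside hybrid_search is only partly visible in the hunks above (the normalization arithmetic between the two visible blocks is elided). The sketch below condenses the logic the surrounding lines imply: max-normalize each result list to [0, 1] so vector and BM25 scores are comparable, then combine them as a weighted sum per document. The fuse() helper and its dict shapes are illustrative, not the committed code.

def fuse(vector_results, bm25_results, vector_weight=0.6, bm25_weight=0.4):
    """Illustrative sketch of hybrid fusion: max-normalize, then weighted sum."""
    # Max-normalize each list to [0, 1] so the two score scales are comparable
    for results in (vector_results, bm25_results):
        if results:
            top = max(r['score'] for r in results)
            if top > 0:
                for r in results:
                    r['score'] /= top

    # Merge by document id, keeping each method's normalized score
    combined = {}
    for r in vector_results:
        combined[r['document']['id']] = {
            'document': r['document'],
            'vector_score': r['score'],
            'bm25_score': 0.0,
        }
    for r in bm25_results:
        entry = combined.setdefault(r['document']['id'], {
            'document': r['document'],
            'vector_score': 0.0,
            'bm25_score': 0.0,
        })
        entry['bm25_score'] = r['score']

    # Weighted sum produces the final hybrid score
    for entry in combined.values():
        entry['score'] = vector_weight * entry['vector_score'] + bm25_weight * entry['bm25_score']
    return sorted(combined.values(), key=lambda x: x['score'], reverse=True)

One consequence of this design: a document retrieved by both methods can score up to vector_weight + bm25_weight = 1.0, so overlap between the semantic and keyword result sets is rewarded.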