raktim-mondol committed on
Commit 09f419b · 1 Parent(s): 4e2d884

refactored_update

Files changed (1)
  1. app_refactored.py +0 -623
app_refactored.py DELETED
@@ -1,623 +0,0 @@
- import gradio as gr
- import json
- import numpy as np
- from transformers import pipeline
- import torch
- import os
- from typing import List, Dict, Any, Optional
- import re
- import math
- from collections import defaultdict, Counter
- from pathlib import Path
- import logging
-
- # Configure logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Configure device
- device = "cuda" if torch.cuda.is_available() else "cpu"
- logger.info(f"Using device: {device}")
-
- class DocumentProcessor:
-     """Handles document processing and text extraction from markdown files."""
-
-     def __init__(self, knowledge_base_dir: str = "knowledge_base"):
-         self.knowledge_base_dir = Path(knowledge_base_dir)
-
-     def load_markdown_files(self) -> List[Dict[str, Any]]:
-         """Load and process all markdown files in the knowledge base directory."""
-         documents = []
-
-         file_priorities = {
-             'about.md': 10,
-             'research_details.md': 9,
-             'publications_detailed.md': 8,
-             'skills_expertise.md': 7,
-             'experience_detailed.md': 8,
-             'statistics.md': 9
-         }
-
-         for file_path in self.knowledge_base_dir.glob("*.md"):
-             try:
-                 with open(file_path, 'r', encoding='utf-8') as f:
-                     content = f.read()
-
-                 file_type = file_path.stem
-                 priority = file_priorities.get(file_path.name, 5)
-
-                 sections = self._split_markdown_into_sections(content)
-
-                 for section in sections:
-                     if len(section['content'].strip()) > 100:
-                         doc = {
-                             "id": f"{file_path.name}_{section['title']}_{len(documents)}",
-                             "content": section['content'],
-                             "metadata": {
-                                 "type": file_type,
-                                 "priority": priority,
-                                 "section": section['title'],
-                                 "source": file_path.name
-                             }
-                         }
-                         documents.append(doc)
-
-                 logger.info(f"✅ Loaded {file_path.name}")
-
-             except Exception as e:
-                 logger.error(f"❌ Error loading {file_path.name}: {e}")
-
-         return documents
-
-     def _split_markdown_into_sections(self, content: str) -> List[Dict[str, str]]:
-         """Split markdown content into sections based on headers."""
-         sections = []
-         lines = content.split('\n')
-         current_section = {'title': 'Introduction', 'content': ''}
-
-         for line in lines:
-             if line.startswith('#'):
-                 if current_section['content'].strip():
-                     sections.append(current_section.copy())
-
-                 title = line.lstrip('#').strip()
-                 current_section = {
-                     'title': title,
-                     'content': line + '\n'
-                 }
-             else:
-                 current_section['content'] += line + '\n'
-
-         if current_section['content'].strip():
-             sections.append(current_section)
-
-         return sections
-
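The splitter above starts a new section at every markdown header (any `#` level), collects any preamble under an implicit "Introduction" section, and only sections longer than 100 characters become documents. A minimal standalone sketch of that splitting logic, runnable on its own (the function name split_sections is illustrative, not from the module):

# Standalone sketch of the header-based splitting performed by
# DocumentProcessor._split_markdown_into_sections above.
def split_sections(markdown: str) -> list:
    sections = []
    current = {"title": "Introduction", "content": ""}
    for line in markdown.split("\n"):
        if line.startswith("#"):  # any header level opens a new section
            if current["content"].strip():
                sections.append(current)
            current = {"title": line.lstrip("#").strip(), "content": line + "\n"}
        else:
            current["content"] += line + "\n"
    if current["content"].strip():
        sections.append(current)
    return sections

sample = "Preamble.\n# Research\nRAG and LLMs.\n## Methods\nBM25 plus vectors."
print([s["title"] for s in split_sections(sample)])
# -> ['Introduction', 'Research', 'Methods']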
- class BM25Searcher:
-     """Implements BM25 search algorithm for keyword-based document retrieval."""
-
-     def __init__(self, k1: float = 1.5, b: float = 0.75):
-         self.k1 = k1
-         self.b = b
-         self.term_frequencies = {}
-         self.document_frequency = defaultdict(int)
-         self.document_lengths = {}
-         self.average_doc_length = 0
-         self.total_documents = 0
-
-     def build_index(self, documents: List[Dict[str, Any]]):
-         """Build BM25 index from documents."""
-         logger.info("Building BM25 index...")
-
-         self.term_frequencies = {}
-         self.document_frequency = defaultdict(int)
-         self.document_lengths = {}
-
-         total_length = 0
-
-         for doc in documents:
-             doc_id = doc['id']
-             terms = self._tokenize(doc['content'])
-
-             term_freq = Counter(terms)
-             self.term_frequencies[doc_id] = dict(term_freq)
-
-             doc_length = len(terms)
-             self.document_lengths[doc_id] = doc_length
-             total_length += doc_length
-
-             unique_terms = set(terms)
-             for term in unique_terms:
-                 self.document_frequency[term] += 1
-
-         self.total_documents = len(documents)
-         self.average_doc_length = total_length / self.total_documents if self.total_documents > 0 else 0
-
-         logger.info(f"✅ BM25 index built: {len(self.document_frequency)} unique terms")
-
-     def search(self, query: str, documents: List[Dict[str, Any]], top_k: int = 10) -> List[Dict[str, Any]]:
-         """Perform BM25 search."""
-         query_terms = self._tokenize(query)
-         if not query_terms:
-             return []
-
-         scores = {}
-
-         for doc in documents:
-             doc_id = doc['id']
-             score = 0.0
-
-             for term in query_terms:
-                 score += self._calculate_bm25_score(term, doc_id)
-
-             if score > 0:
-                 priority_boost = 1 + (doc['metadata']['priority'] / 50)
-                 final_score = score * priority_boost
-
-                 scores[doc_id] = {
-                     'document': doc,
-                     'score': final_score,
-                     'search_type': 'bm25'
-                 }
-
-         sorted_results = sorted(scores.values(), key=lambda x: x['score'], reverse=True)
-         return sorted_results[:top_k]
-
-     def _tokenize(self, text: str) -> List[str]:
-         """Tokenize text for BM25."""
-         text = re.sub(r'[^\w\s]', ' ', text.lower())
-         words = [word for word in text.split() if len(word) > 2 and not self._is_stop_word(word)]
-         return words
-
-     def _is_stop_word(self, word: str) -> bool:
-         """Check if word is a stop word."""
-         stop_words = {
-             'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
-             'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
-             'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those'
-         }
-         return word in stop_words
-
-     def _calculate_bm25_score(self, term: str, doc_id: str) -> float:
-         """Calculate BM25 score for a term in a document."""
-         tf = self.term_frequencies.get(doc_id, {}).get(term, 0)
-         if tf == 0:
-             return 0.0
-
-         df = self.document_frequency.get(term, 1)
-         doc_length = self.document_lengths.get(doc_id, 0)
-
-         idf = math.log((self.total_documents - df + 0.5) / (df + 0.5))
-
-         numerator = tf * (self.k1 + 1)
-         denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.average_doc_length))
-
-         return idf * (numerator / denominator)
-
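_calculate_bm25_score is the standard Okapi BM25 per-term score: idf(t) · tf·(k1+1) / (tf + k1·(1 − b + b·|d|/avgdl)). Note that this idf variant goes negative once a term appears in more than half the documents; the score > 0 filter in search quietly drops such matches. A self-contained numeric check, with all counts invented for illustration:

import math

# Toy statistics (invented, not from the real knowledge base).
N, df = 3, 1            # 3 indexed documents, term occurs in 1 of them
tf = 2                  # term frequency inside the scored document
doc_len, avg_len = 8, 10.0
k1, b = 1.5, 0.75       # the BM25Searcher defaults above

idf = math.log((N - df + 0.5) / (df + 0.5))       # ln(2.5 / 1.5) ≈ 0.511
norm = tf + k1 * (1 - b + b * doc_len / avg_len)  # 2 + 1.5 * 0.85 = 3.275
score = idf * tf * (k1 + 1) / norm                # ≈ 0.511 * 5 / 3.275 ≈ 0.78
print(round(score, 2))                            # 0.78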
- class VectorSearcher:
-     """Implements vector-based semantic search using transformer embeddings."""
-
-     def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
-         self.model_name = model_name
-         self.embedder = None
-         self.embeddings = []
-
-     def initialize_model(self):
-         """Initialize the embedding model."""
-         try:
-             logger.info("Loading embedding model...")
-             self.embedder = pipeline(
-                 'feature-extraction',
-                 self.model_name,
-                 device=0 if device == "cuda" else -1
-             )
-             logger.info("✅ Embedding model loaded successfully")
-         except Exception as e:
-             logger.error(f"❌ Error loading embedding model: {e}")
-             raise e
-
-     def build_embeddings(self, documents: List[Dict[str, Any]]):
-         """Build embeddings for all documents."""
-         logger.info("Generating embeddings for knowledge base...")
-         self.embeddings = []
-
-         for i, doc in enumerate(documents):
-             try:
-                 content = doc["content"][:500]  # Limit to 500 characters
-                 embedding = self.embedder(content, return_tensors="pt")
-                 embedding_np = embedding[0].mean(dim=0).detach().cpu().numpy()
-                 self.embeddings.append(embedding_np)
-             except Exception as e:
-                 logger.error(f"Error generating embedding for doc {doc['id']}: {e}")
-                 self.embeddings.append(np.zeros(384))
-
-         logger.info(f"✅ Generated {len(self.embeddings)} embeddings")
-
-     def search(self, query: str, documents: List[Dict[str, Any]], top_k: int = 10) -> List[Dict[str, Any]]:
-         """Perform vector similarity search."""
-         try:
-             query_embedding = self.embedder(query[:500], return_tensors="pt")
-             query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()
-
-             similarities = []
-             for i, doc_embedding in enumerate(self.embeddings):
-                 if doc_embedding is not None and len(doc_embedding) > 0:
-                     similarity = self._cosine_similarity(query_vector, doc_embedding)
-
-                     priority_boost = 1 + (documents[i]['metadata']['priority'] / 100)
-                     final_score = similarity * priority_boost
-
-                     similarities.append({
-                         'document': documents[i],
-                         'score': float(final_score),
-                         'search_type': 'vector'
-                     })
-
-             similarities.sort(key=lambda x: x['score'], reverse=True)
-             return similarities[:top_k]
-
-         except Exception as e:
-             logger.error(f"Error in vector search: {e}")
-             return []
-
-     def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
-         """Calculate cosine similarity between two vectors."""
-         return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
-
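build_embeddings mean-pools the token vectors returned by the feature-extraction pipeline into a single 384-dimensional vector per document (all-MiniLM-L6-v2's hidden size), and search ranks documents by cosine similarity against the pooled query vector. A numpy-only sketch of just that pooling-and-similarity step, with random arrays standing in for real model output:

import numpy as np

rng = np.random.default_rng(0)
doc_tokens = rng.normal(size=(12, 384))    # stand-in for (seq_len, hidden) token embeddings
query_tokens = rng.normal(size=(5, 384))

doc_vec = doc_tokens.mean(axis=0)          # mean-pool tokens into one document vector
query_vec = query_tokens.mean(axis=0)

# Cosine similarity; the zero-norm guard matters for the np.zeros(384)
# fallback above, which _cosine_similarity would turn into a 0/0.
denom = np.linalg.norm(doc_vec) * np.linalg.norm(query_vec)
print(float(np.dot(doc_vec, query_vec) / denom) if denom else 0.0)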
- class HybridSearchSystem:
-     """Main hybrid search system combining BM25 and vector search."""
-
-     def __init__(self):
-         self.doc_processor = DocumentProcessor()
-         self.bm25_searcher = BM25Searcher()
-         self.vector_searcher = VectorSearcher()
-         self.documents = []
-
-     def initialize(self):
-         """Initialize the entire search system."""
-         logger.info("Initializing Hybrid Search RAGtim Bot...")
-
-         # Load documents
-         self.documents = self.doc_processor.load_markdown_files()
-
-         # Initialize models and build indices
-         self.vector_searcher.initialize_model()
-         self.vector_searcher.build_embeddings(self.documents)
-         self.bm25_searcher.build_index(self.documents)
-
-         logger.info(f"✅ System initialized with {len(self.documents)} documents")
-
-     def search(self, query: str, search_type: str = "hybrid", top_k: int = 5,
-                vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict[str, Any]]:
-         """Perform search based on specified method."""
-         if search_type == "vector":
-             return self.vector_searcher.search(query, self.documents, top_k)
-         elif search_type == "bm25":
-             return self.bm25_searcher.search(query, self.documents, top_k)
-         else:  # hybrid
-             return self._hybrid_search(query, top_k, vector_weight, bm25_weight)
-
-     def _hybrid_search(self, query: str, top_k: int = 10,
-                        vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict[str, Any]]:
-         """Perform hybrid search combining vector and BM25 results."""
-         try:
-             vector_results = self.vector_searcher.search(query, self.documents, top_k * 2)
-             bm25_results = self.bm25_searcher.search(query, self.documents, top_k * 2)
-
-             # Normalize scores
-             if vector_results:
-                 max_vector_score = max(r['score'] for r in vector_results)
-                 if max_vector_score > 0:
-                     for result in vector_results:
-                         result['normalized_score'] = result['score'] / max_vector_score
-                 else:
-                     for result in vector_results:
-                         result['normalized_score'] = 0
-
-             if bm25_results:
-                 max_bm25_score = max(r['score'] for r in bm25_results)
-                 if max_bm25_score > 0:
-                     for result in bm25_results:
-                         result['normalized_score'] = result['score'] / max_bm25_score
-                 else:
-                     for result in bm25_results:
-                         result['normalized_score'] = 0
-
-             # Combine results
-             combined_scores = {}
-
-             for result in vector_results:
-                 doc_id = result['document']['id']
-                 combined_scores[doc_id] = {
-                     'document': result['document'],
-                     'vector_score': result['normalized_score'],
-                     'bm25_score': 0.0,
-                     'search_type': 'vector'
-                 }
-
-             for result in bm25_results:
-                 doc_id = result['document']['id']
-                 if doc_id in combined_scores:
-                     combined_scores[doc_id]['bm25_score'] = result['normalized_score']
-                     combined_scores[doc_id]['search_type'] = 'hybrid'
-                 else:
-                     combined_scores[doc_id] = {
-                         'document': result['document'],
-                         'vector_score': 0.0,
-                         'bm25_score': result['normalized_score'],
-                         'search_type': 'bm25'
-                     }
-
-             # Calculate final hybrid scores
-             final_results = []
-             for doc_id, data in combined_scores.items():
-                 hybrid_score = (vector_weight * data['vector_score']) + (bm25_weight * data['bm25_score'])
-                 final_results.append({
-                     'document': data['document'],
-                     'score': hybrid_score,
-                     'vector_score': data['vector_score'],
-                     'bm25_score': data['bm25_score'],
-                     'search_type': data['search_type']
-                 })
-
-             final_results.sort(key=lambda x: x['score'], reverse=True)
-             return final_results[:top_k]
-
-         except Exception as e:
-             logger.error(f"Error in hybrid search: {e}")
-             return self.vector_searcher.search(query, self.documents, top_k)
-
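Because raw BM25 scores are unbounded while cosine similarities are not, _hybrid_search max-normalizes each result list to [0, 1] before taking the weighted sum. A worked example with the default 0.6/0.4 weights (raw scores invented for illustration), showing how a strong semantic match can outrank a stronger keyword match:

# Invented raw scores for two documents returned by both searchers.
vector_raw = {"doc_a": 0.82, "doc_b": 0.41}
bm25_raw = {"doc_a": 3.1, "doc_b": 6.2}
w_vec, w_bm25 = 0.6, 0.4  # the defaults used above

v_max, b_max = max(vector_raw.values()), max(bm25_raw.values())
for doc in vector_raw:
    v = vector_raw[doc] / v_max  # each channel max-normalized to [0, 1]
    b = bm25_raw[doc] / b_max
    print(doc, round(w_vec * v + w_bm25 * b, 2))
# doc_a: 0.6*1.0 + 0.4*0.5 = 0.8  (wins despite the lower BM25 score)
# doc_b: 0.6*0.5 + 0.4*1.0 = 0.7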
- # Initialize the search system
- search_system = HybridSearchSystem()
- search_system.initialize()
-
- # API Functions
- def search_api(query: str, top_k: int = 5, search_type: str = "hybrid",
-                vector_weight: float = 0.6, bm25_weight: float = 0.4) -> Dict[str, Any]:
-     """API endpoint for search functionality."""
-     try:
-         results = search_system.search(query, search_type, top_k, vector_weight, bm25_weight)
-
-         return {
-             "results": results,
-             "query": query,
-             "top_k": top_k,
-             "search_type": search_type,
-             "total_documents": len(search_system.documents),
-             "search_parameters": {
-                 "vector_weight": vector_weight if search_type == "hybrid" else None,
-                 "bm25_weight": bm25_weight if search_type == "hybrid" else None,
-                 "bm25_k1": search_system.bm25_searcher.k1,
-                 "bm25_b": search_system.bm25_searcher.b
-             }
-         }
-     except Exception as e:
-         logger.error(f"Error in search API: {e}")
-         return {"error": str(e), "results": []}
-
- def get_stats_api() -> Dict[str, Any]:
-     """API endpoint for system statistics."""
-     try:
-         doc_types = {}
-         sections_by_file = {}
-
-         for doc in search_system.documents:
-             doc_type = doc["metadata"]["type"]
-             source_file = doc["metadata"]["source"]
-
-             doc_types[doc_type] = doc_types.get(doc_type, 0) + 1
-             sections_by_file[source_file] = sections_by_file.get(source_file, 0) + 1
-
-         return {
-             "total_documents": len(search_system.documents),
-             "document_types": doc_types,
-             "sections_by_file": sections_by_file,
-             "model_name": search_system.vector_searcher.model_name,
-             "embedding_dimension": 384,
-             "search_capabilities": [
-                 "Hybrid Search (Vector + BM25)",
-                 "Semantic Vector Search",
-                 "BM25 Keyword Search",
-                 "GPU Accelerated",
-                 "Transformer Embeddings"
-             ],
-             "bm25_parameters": {
-                 "k1": search_system.bm25_searcher.k1,
-                 "b": search_system.bm25_searcher.b,
-                 "unique_terms": len(search_system.bm25_searcher.document_frequency),
-                 "average_doc_length": search_system.bm25_searcher.average_doc_length
-             },
-             "backend_type": "Hugging Face Space with Hybrid Search",
-             "knowledge_sources": list(sections_by_file.keys()),
-             "status": "healthy"
-         }
-     except Exception as e:
-         logger.error(f"Error in get_stats_api: {e}")
-         return {
-             "error": str(e),
-             "status": "error",
-             "total_documents": 0,
-             "search_capabilities": ["Error"]
-         }
-
- def chat_interface(message: str) -> str:
-     """Enhanced chat interface with better formatting."""
-     if not message.strip():
-         return "Please ask me something about Raktim Mondol! I use hybrid search combining semantic similarity and keyword matching for the best results."
-
-     try:
-         search_results = search_system.search(message, "hybrid", 6)
-
-         if search_results:
-             response_parts = []
-             response_parts.append(f"🔍 **Found {len(search_results)} relevant results using hybrid search**\n")
-
-             best_match = search_results[0]
-             response_parts.append(f"**Primary Answer** (Score: {best_match['score']:.3f})")
-             response_parts.append(f"📄 Source: {best_match['document']['metadata']['source']} - {best_match['document']['metadata']['section']}")
-             response_parts.append(f"🔍 Search Type: {best_match['search_type'].upper()}")
-
-             if 'vector_score' in best_match and 'bm25_score' in best_match:
-                 response_parts.append(f"📊 Vector: {best_match['vector_score']:.3f} | BM25: {best_match['bm25_score']:.3f}")
-
-             response_parts.append(f"\n{best_match['document']['content']}\n")
-
-             if len(search_results) > 1:
-                 response_parts.append("**Additional Context:**")
-                 for i, result in enumerate(search_results[1:3], 1):
-                     section_info = f"{result['document']['metadata']['source']} - {result['document']['metadata']['section']}"
-                     search_info = f"({result['search_type'].upper()}, Score: {result['score']:.3f})"
-                     response_parts.append(f"{i}. {section_info} {search_info}")
-
-                     excerpt = result['document']['content'][:200] + "..." if len(result['document']['content']) > 200 else result['document']['content']
-                     response_parts.append(f" {excerpt}\n")
-
-             response_parts.append("\n🤖 **Powered by Hybrid Search Technology**")
-             response_parts.append("• Vector Search: Semantic understanding with transformers")
-             response_parts.append("• BM25 Search: Advanced keyword ranking")
-             response_parts.append("• Smart Fusion: Optimal relevance through weighted combination")
-
-             return "\n".join(response_parts)
-         else:
-             return "I don't have specific information about that topic in my knowledge base. Could you please ask something else about Raktim Mondol?"
-
-     except Exception as e:
-         logger.error(f"Error in chat interface: {e}")
-         return "I'm sorry, I encountered an error while processing your question. Please try again."
-
- # Create Gradio Interface with modern Gradio 5 features
- with gr.Blocks(
-     title="🔥 Hybrid Search RAGtim Bot",
-     theme=gr.themes.Soft(),
-     css="""
-     .gradio-container {
-         max-width: 1200px !important;
-     }
-     .chat-container {
-         height: 600px;
-     }
-     """
- ) as demo:
-
-     gr.Markdown("""
-     # 🔥 Hybrid Search RAGtim Bot
-
-     **Advanced AI-powered search system combining semantic understanding with keyword precision**
-
-     🧠 **Semantic Vector Search** + 🔍 **BM25 Keyword Search** = ⚡ **Optimal Results**
-
-     Built with Gradio 5, featuring modern UI components and enhanced performance
-     """)
-
-     with gr.Tabs():
-         with gr.Tab("💬 Chat Interface"):
-             gr.Markdown("### Ask anything about Raktim Mondol's research, skills, or experience")
-
-             chatbot = gr.Chatbot(
-                 value=[],
-                 label="RAGtim Bot",
-                 height=400,
-                 show_copy_button=True,
-                 bubble_full_width=False
-             )
-
-             with gr.Row():
-                 msg = gr.Textbox(
-                     label="Your Question",
-                     placeholder="What would you like to know about Raktim's research or expertise?",
-                     scale=4,
-                     lines=2
-                 )
-                 submit_btn = gr.Button("Ask", variant="primary", scale=1)
-
-             gr.Examples(
-                 examples=[
-                     "What is Raktim's research in LLMs and RAG?",
-                     "Tell me about BioFusionNet and statistical methods",
-                     "What are his multimodal AI capabilities?",
-                     "Describe his biostatistics expertise"
-                 ],
-                 inputs=msg
-             )
-
-             def respond(message, history):
-                 response = chat_interface(message)
-                 history.append((message, response))
-                 return history, ""
-
-             submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
-             msg.submit(respond, [msg, chatbot], [chatbot, msg])
-
-         with gr.Tab("🔍 Advanced Search API"):
-             gr.Markdown("### Direct access to the hybrid search engine")
-
-             with gr.Row():
-                 with gr.Column(scale=2):
-                     search_query = gr.Textbox(
-                         label="Search Query",
-                         placeholder="Enter your search query here..."
-                     )
-
-                     with gr.Row():
-                         search_type = gr.Radio(
-                             choices=["hybrid", "vector", "bm25"],
-                             value="hybrid",
-                             label="Search Method"
-                         )
-                         top_k = gr.Slider(
-                             minimum=1, maximum=20, value=5, step=1,
-                             label="Number of Results"
-                         )
-
-                     with gr.Row():
-                         vector_weight = gr.Slider(
-                             minimum=0.0, maximum=1.0, value=0.6, step=0.1,
-                             label="Vector Weight"
-                         )
-                         bm25_weight = gr.Slider(
-                             minimum=0.0, maximum=1.0, value=0.4, step=0.1,
-                             label="BM25 Weight"
-                         )
-
-                     search_btn = gr.Button("🔍 Search", variant="primary")
-
-                 with gr.Column(scale=3):
-                     search_results = gr.JSON(
-                         label="Search Results",
-                         show_label=True
-                     )
-
-             search_btn.click(
-                 search_api,
-                 inputs=[search_query, top_k, search_type, vector_weight, bm25_weight],
-                 outputs=search_results
-             )
-
-         with gr.Tab("📊 System Statistics"):
-             gr.Markdown("### Knowledge base and system information")
-
-             stats_btn = gr.Button("📊 Get Statistics", variant="secondary")
-             stats_output = gr.JSON(
-                 label="System Statistics",
-                 show_label=True
-             )
-
-             stats_btn.click(get_stats_api, outputs=stats_output)
-
-             # Auto-load stats on tab open
-             demo.load(get_stats_api, outputs=stats_output)
-
- if __name__ == "__main__":
-     logger.info("🚀 Launching Hybrid Search RAGtim Bot...")
-     logger.info(f"📚 Loaded {len(search_system.documents)} documents")
-     logger.info(f"🔍 BM25 index: {len(search_system.bm25_searcher.document_frequency)} unique terms")
-     logger.info(f"🧠 Vector embeddings: {len(search_system.vector_searcher.embeddings)} documents")
-     logger.info("🔥 Hybrid search ready!")
-
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         show_error=True,
-         show_api=True
-     )
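Since the app launches with show_api=True, the click handlers should also be reachable as named endpoints. A hedged sketch of a remote call with gradio_client; the Space id is a placeholder, and "/search_api" assumes Gradio's default of naming the endpoint after the bound function:

from gradio_client import Client

client = Client("user/space-name")  # placeholder Space id, substitute the real one
result = client.predict(
    "What is Raktim's research in LLMs and RAG?",  # search query
    5,           # top_k
    "hybrid",    # search_type
    0.6,         # vector_weight
    0.4,         # bm25_weight
    api_name="/search_api",  # assumed default endpoint name
)
print(result["total_documents"], len(result["results"]))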