Spaces:

raktimhugging
/

ragtim-bot

Running

App Files Files Community

raktimhugging committed on Jun 14

Commit

39dacf3

verified ·

1 Parent(s): 843035b

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -185

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 import json
 import numpy as np
-from transformers import pipeline, AutoTokenizer, AutoModel
 import torch
 import os
 from typing import List, Dict, Any
@@ -10,10 +10,18 @@ import requests
 import re
 import math
 from collections import defaultdict, Counter
 # Configure device
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
 class HybridSearchRAGBot:
     def __init__(self):
@@ -22,15 +30,15 @@ class HybridSearchRAGBot:
         self.embeddings = []
         # BM25 components
-        self.term_frequencies = {}  # doc_id -> {term: frequency}
-        self.document_frequency = {}  # term -> number of docs containing term
-        self.document_lengths = {}  # doc_id -> document length
         self.average_doc_length = 0
         self.total_documents = 0
         # BM25 parameters
-        self.k1 = 1.5  # Controls term frequency saturation
-        self.b = 0.75  # Controls document length normalization
         self.initialize_models()
         self.load_markdown_knowledge_base()
@@ -39,84 +47,64 @@ class HybridSearchRAGBot:
     def initialize_models(self):
         """Initialize the embedding model"""
         try:
-            print("Loading embedding model...")
             self.embedder = pipeline(
                 'feature-extraction',
-                'sentence-transformers/all-MiniLM-L6-v2',
                 device=0 if device == "cuda" else -1
             )
-            print("✅ Embedding model loaded successfully")
         except Exception as e:
-            print(f"❌ Error loading embedding model: {e}")
             raise e
     def load_markdown_knowledge_base(self):
         """Load knowledge base from markdown files"""
-        print("Loading knowledge base from markdown files...")
         # Reset knowledge base
         self.knowledge_base = []
-        # Load all markdown files
-        markdown_files = [
-            'about.md',
-            'research_details.md',
-            'publications_detailed.md',
-            'skills_expertise.md',
-            'experience_detailed.md',
-            'statistics.md'
-        ]
-        for filename in markdown_files:
             try:
                 if os.path.exists(filename):
                     with open(filename, 'r', encoding='utf-8') as f:
                         content = f.read()
-                    self.process_markdown_file(content, filename)
-                    print(f"✅ Loaded {filename}")
                 else:
-                    print(f"⚠️ File not found: {filename}")
             except Exception as e:
-                print(f"❌ Error loading {filename}: {e}")
         # Generate embeddings for knowledge base
-        print("Generating embeddings for knowledge base...")
         self.embeddings = []
         for i, doc in enumerate(self.knowledge_base):
             try:
                 # Truncate content to avoid token limit issues
-                content = doc["content"][:500]  # Limit to 500 characters
                 embedding = self.embedder(content, return_tensors="pt")
                 # Convert to numpy and flatten
                 embedding_np = embedding[0].mean(dim=0).detach().cpu().numpy()
                 self.embeddings.append(embedding_np)
             except Exception as e:
-                print(f"Error generating embedding for doc {doc['id']}: {e}")
                 # Fallback to zero embedding
-                self.embeddings.append(np.zeros(384))
         self.total_documents = len(self.knowledge_base)
-        print(f"✅ Knowledge base loaded with {len(self.knowledge_base)} documents")
     def process_markdown_file(self, content: str, filename: str):
         """Process a markdown file and extract sections"""
-        # Determine file type and priority
-        file_type_map = {
-            'about.md': ('about', 10),
-            'research_details.md': ('research', 9),
-            'publications_detailed.md': ('publications', 8),
-            'skills_expertise.md': ('skills', 7),
-            'experience_detailed.md': ('experience', 8),
-            'statistics.md': ('statistics', 9)
-        }
-        file_type, priority = file_type_map.get(filename, ('general', 5))
         # Split content into sections
         sections = self.split_markdown_into_sections(content)
         for section in sections:
-            if len(section['content'].strip()) > 100:  # Only process substantial content
                 doc = {
                     "id": f"{filename}_{section['title']}_{len(self.knowledge_base)}",
                     "content": section['content'],
@@ -136,14 +124,10 @@ class HybridSearchRAGBot:
         current_section = {'title': 'Introduction', 'content': ''}
         for line in lines:
-            # Check if line is a header
             if line.startswith('#'):
-                # Save previous section if it has content
                 if current_section['content'].strip():
                     sections.append(current_section.copy())
-                # Start new section
-                header_level = len(line) - len(line.lstrip('#'))
                 title = line.lstrip('#').strip()
                 current_section = {
                     'title': title,
@@ -152,7 +136,6 @@ class HybridSearchRAGBot:
             else:
                 current_section['content'] += line + '\n'
-        # Add the last section
         if current_section['content'].strip():
             sections.append(current_section)
@@ -160,9 +143,7 @@ class HybridSearchRAGBot:
     def tokenize(self, text: str) -> List[str]:
         """Tokenize text for BM25"""
-        # Convert to lowercase and remove punctuation
         text = re.sub(r'[^\w\s]', ' ', text.lower())
-        # Split into words and filter out short words and stop words
         words = [word for word in text.split() if len(word) > 2 and not self.is_stop_word(word)]
         return words
@@ -178,54 +159,44 @@ class HybridSearchRAGBot:
     def build_bm25_index(self):
         """Build BM25 index for all documents"""
-        print("Building BM25 index...")
-        # Reset indexes
         self.term_frequencies = {}
         self.document_frequency = defaultdict(int)
         self.document_lengths = {}
         total_length = 0
-        # First pass: calculate term frequencies and document lengths
         for doc in self.knowledge_base:
             doc_id = doc['id']
             terms = self.tokenize(doc['content'])
-            # Calculate term frequencies for this document
             term_freq = Counter(terms)
             self.term_frequencies[doc_id] = dict(term_freq)
-            # Store document length
             doc_length = len(terms)
             self.document_lengths[doc_id] = doc_length
             total_length += doc_length
-            # Update document frequencies
             unique_terms = set(terms)
             for term in unique_terms:
                 self.document_frequency[term] += 1
-        # Calculate average document length
         self.average_doc_length = total_length / self.total_documents if self.total_documents > 0 else 0
-        print(f"✅ BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
     def calculate_bm25_score(self, term: str, doc_id: str) -> float:
         """Calculate BM25 score for a term in a document"""
-        # Get term frequency in document
         tf = self.term_frequencies.get(doc_id, {}).get(term, 0)
         if tf == 0:
             return 0.0
-        # Get document frequency and document length
         df = self.document_frequency.get(term, 1)
         doc_length = self.document_lengths.get(doc_id, 0)
-        # Calculate IDF: log((N - df + 0.5) / (df + 0.5))
         idf = math.log((self.total_documents - df + 0.5) / (df + 0.5))
-        # Calculate BM25 score
         numerator = tf * (self.k1 + 1)
         denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.average_doc_length))
@@ -239,7 +210,6 @@ class HybridSearchRAGBot:
         scores = {}
-        # Calculate BM25 score for each document
         for doc in self.knowledge_base:
             doc_id = doc['id']
             score = 0.0
@@ -248,7 +218,6 @@ class HybridSearchRAGBot:
                 score += self.calculate_bm25_score(term, doc_id)
             if score > 0:
-                # Apply priority boost
                 priority_boost = 1 + (doc['metadata']['priority'] / 50)
                 final_score = score * priority_boost
@@ -258,7 +227,6 @@ class HybridSearchRAGBot:
                     'search_type': 'bm25'
                 }
-        # Sort by score and return top_k
         sorted_results = sorted(scores.values(), key=lambda x: x['score'], reverse=True)
         return sorted_results[:top_k]
@@ -269,17 +237,14 @@ class HybridSearchRAGBot:
     def vector_search(self, query: str, top_k: int = 10) -> List[Dict]:
         """Perform vector similarity search"""
         try:
-            # Generate query embedding
-            query_embedding = self.embedder(query[:500], return_tensors="pt")  # Truncate query
             query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()
-            # Calculate similarities
             similarities = []
             for i, doc_embedding in enumerate(self.embeddings):
                 if doc_embedding is not None and len(doc_embedding) > 0:
                     similarity = self.cosine_similarity(query_vector, doc_embedding)
-                    # Apply priority boost
                     priority_boost = 1 + (self.knowledge_base[i]['metadata']['priority'] / 100)
                     final_score = similarity * priority_boost
@@ -289,22 +254,20 @@ class HybridSearchRAGBot:
                         'search_type': 'vector'
                     })
-            # Sort by similarity and return top_k
             similarities.sort(key=lambda x: x['score'], reverse=True)
             return similarities[:top_k]
         except Exception as e:
-            print(f"Error in vector search: {e}")
             return []
     def hybrid_search(self, query: str, top_k: int = 10, vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict]:
         """Perform hybrid search combining vector and BM25 results"""
         try:
-            # Get results from both search methods
-            vector_results = self.vector_search(query, top_k * 2)  # Get more results for better fusion
             bm25_results = self.bm25_search(query, top_k * 2)
-            # Normalize scores to [0, 1] range
             if vector_results:
                 max_vector_score = max(r['score'] for r in vector_results)
                 if max_vector_score > 0:
@@ -326,7 +289,6 @@ class HybridSearchRAGBot:
             # Combine results
             combined_scores = {}
-            # Add vector results
             for result in vector_results:
                 doc_id = result['document']['id']
                 combined_scores[doc_id] = {
@@ -336,7 +298,6 @@ class HybridSearchRAGBot:
                     'search_type': 'vector'
                 }
-            # Add BM25 results
             for result in bm25_results:
                 doc_id = result['document']['id']
                 if doc_id in combined_scores:
@@ -362,13 +323,11 @@ class HybridSearchRAGBot:
                     'search_type': data['search_type']
                 })
-            # Sort by hybrid score and return top_k
             final_results.sort(key=lambda x: x['score'], reverse=True)
             return final_results[:top_k]
         except Exception as e:
-            print(f"Error in hybrid search: {e}")
-            # Fallback to vector search only
             return self.vector_search(query, top_k)
     def search_knowledge_base(self, query: str, top_k: int = 5, search_type: str = "hybrid") -> List[Dict]:
@@ -377,13 +336,14 @@ class HybridSearchRAGBot:
             return self.vector_search(query, top_k)
         elif search_type == "bm25":
             return self.bm25_search(query, top_k)
-        else:  # hybrid
             return self.hybrid_search(query, top_k)
 # Initialize the bot
-print("Initializing Hybrid Search RAGtim Bot...")
 bot = HybridSearchRAGBot()
 def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_weight=0.4):
     """API endpoint for hybrid search functionality"""
     try:
@@ -406,13 +366,12 @@ def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_wei
             }
         }
     except Exception as e:
-        print(f"Error in search API: {e}")
         return {"error": str(e), "results": []}
 def get_stats_api():
     """API endpoint for knowledge base statistics"""
     try:
-        # Calculate document distribution by type
         doc_types = {}
         sections_by_file = {}
@@ -427,8 +386,8 @@ def get_stats_api():
             "total_documents": len(bot.knowledge_base),
             "document_types": doc_types,
             "sections_by_file": sections_by_file,
-            "model_name": "sentence-transformers/all-MiniLM-L6-v2",
-            "embedding_dimension": 384,
             "search_capabilities": [
                 "Hybrid Search (Vector + BM25)",
                 "Semantic Vector Search",
@@ -447,7 +406,7 @@ def get_stats_api():
             "status": "healthy"
         }
     except Exception as e:
-        print(f"Error in get_stats_api: {e}")
         return {
             "error": str(e),
             "status": "error",
@@ -461,35 +420,29 @@ def chat_interface(message, history):
         return "Please ask me something about Raktim Mondol! I use hybrid search combining semantic similarity and keyword matching for the best results."
     try:
-        # Use hybrid search by default
         search_results = bot.hybrid_search(message, top_k=6)
         if search_results:
-            # Build comprehensive response
             response_parts = []
             response_parts.append(f"🔍 **Hybrid Search Results** (Vector + BM25 combination, found {len(search_results)} relevant sections):\n")
-            # Use the best match as primary response
             best_match = search_results[0]
             response_parts.append(f"**Primary Answer** (Hybrid Score: {best_match['score']:.3f}):")
             response_parts.append(f"📄 Source: {best_match['document']['metadata']['source']} - {best_match['document']['metadata']['section']}")
             response_parts.append(f"🔍 Search Type: {best_match['search_type'].upper()}")
-            # Show score breakdown for hybrid results
             if 'vector_score' in best_match and 'bm25_score' in best_match:
                 response_parts.append(f"📊 Vector Score: {best_match['vector_score']:.3f} | BM25 Score: {best_match['bm25_score']:.3f}")
             response_parts.append(f"\n{best_match['document']['content']}\n")
-            # Add additional context if available
             if len(search_results) > 1:
                 response_parts.append("**Additional Context:**")
-                for i, result in enumerate(search_results[1:3], 1):  # Show up to 2 additional results
                     section_info = f"{result['document']['metadata']['source']} - {result['document']['metadata']['section']}"
                     search_info = f"({result['search_type'].upper()}, Score: {result['score']:.3f})"
                     response_parts.append(f"{i}. {section_info} {search_info}")
-                    # Add a brief excerpt
                     excerpt = result['document']['content'][:200] + "..." if len(result['document']['content']) > 200 else result['document']['content']
                     response_parts.append(f"   {excerpt}\n")
@@ -504,13 +457,10 @@ def chat_interface(message, history):
             return "I don't have specific information about that topic in my knowledge base. Could you please ask something else about Raktim Mondol?"
     except Exception as e:
-        print(f"Error in chat interface: {e}")
         return "I'm sorry, I encountered an error while processing your question. Please try again."
-# Create Gradio interface
-print("Creating Gradio interface...")
-# Custom CSS for better styling
 css = """
 .gradio-container {
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
@@ -545,24 +495,13 @@ with gr.Blocks(
     - 🔍 **BM25 Keyword Search**: Advanced TF-IDF ranking for exact term matching
     - ⚖️ **Intelligent Fusion**: Weighted combination for optimal relevance
-    **📚 Knowledge Base**: **{len(bot.knowledge_base)} sections** from comprehensive markdown files:
-    - 📄 **about.md** - Personal info, contact, professional summary
-    - 🔬 **research_details.md** - Research projects, methodologies, innovations
-    - 📚 **publications_detailed.md** - Publications with technical details
-    - 💻 **skills_expertise.md** - Technical skills, LLM expertise, tools
-    - 💼 **experience_detailed.md** - Professional experience, teaching
-    - 📊 **statistics.md** - Statistical methods, biostatistics expertise
     **🔧 Search Parameters**:
     - **BM25 Parameters**: k1={bot.k1}, b={bot.b}
     - **Vocabulary**: {len(bot.document_frequency)} unique terms
     - **Average Document Length**: {bot.average_doc_length:.1f} words
-    - **Embedding Model**: sentence-transformers/all-MiniLM-L6-v2 (384-dim)
-    **💡 Try Different Search Types**:
-    - **Hybrid** (Recommended): Best of both semantic and keyword search
-    - **Vector**: Pure semantic similarity for conceptual queries
-    - **BM25**: Pure keyword matching for specific terms
     **Ask me anything about Raktim Mondol's research, expertise, and background!**
     """)
@@ -600,13 +539,8 @@ with gr.Blocks(
         if not message.strip():
             return history, ""
-        # Add user message to history
         history.append({"role": "user", "content": message})
-        # Get bot response
         bot_response = chat_interface(message, history)
-        # Add bot response to history
         history.append({"role": "assistant", "content": bot_response})
         return history, ""
@@ -614,10 +548,9 @@ with gr.Blocks(
     submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
     msg.submit(respond, [msg, chatbot], [chatbot, msg])
-# Create advanced search interface
 with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
     gr.Markdown("# 🔧 Advanced Hybrid Search Configuration")
-    gr.Markdown("Fine-tune the hybrid search parameters and compare different search methods")
     with gr.Row():
         with gr.Column(scale=2):
@@ -630,8 +563,7 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
                 search_type = gr.Radio(
                     choices=["hybrid", "vector", "bm25"],
                     value="hybrid",
-                    label="Search Method",
-                    elem_classes=["search-type-radio"]
                 )
                 top_k_slider = gr.Slider(
                     minimum=1,
@@ -641,7 +573,6 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
                     label="Top K Results"
                 )
-            # Hybrid search weights (only shown when hybrid is selected)
             with gr.Group(visible=True) as weight_group:
                 gr.Markdown("**Hybrid Search Weights**")
                 vector_weight = gr.Slider(
@@ -690,7 +621,6 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
         return 0.6, 0.4
     def advanced_search(query, search_type, top_k, vector_w, bm25_w):
-        # Normalize weights
         vector_weight, bm25_weight = normalize_weights(vector_w, bm25_w)
         return search_api(query, top_k, search_type, vector_weight, bm25_weight)
@@ -700,84 +630,33 @@ with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
         outputs=search_output
     )
-# Create stats interface
 with gr.Blocks(title="📊 System Statistics") as stats_demo:
     gr.Markdown("# 📊 Hybrid Search System Statistics")
-    gr.Markdown("Detailed information about the knowledge base and search capabilities")
     stats_output = gr.JSON(label="System Statistics", height=500)
     stats_btn = gr.Button("📊 Get System Statistics", variant="primary")
-    stats_btn.click(
-        get_stats_api,
-        inputs=[],
-        outputs=stats_output
-    )
-# Combine interfaces using TabbedInterface
 demo = gr.TabbedInterface(
     [chat_demo, search_demo, stats_demo],
     ["💬 Hybrid Chat", "🔧 Advanced Search", "📊 Statistics"],
     title="🔥 Hybrid Search RAGtim Bot - Vector + BM25 Fusion"
 )
-# Create API functions for external access
-def api_search_function(query: str, top_k: int = 5, search_type: str = "hybrid", vector_weight: float = 0.6, bm25_weight: float = 0.4):
-    """API function for search - accessible via Gradio API"""
-    try:
-        if not query or not query.strip():
-            return {"error": "Query parameter is required"}
-        return search_api(query.strip(), top_k, search_type, vector_weight, bm25_weight)
-    except Exception as e:
-        return {"error": str(e)}
-def api_stats_function():
-    """API function for stats - accessible via Gradio API"""
-    try:
-        return get_stats_api()
-    except Exception as e:
-        return {"error": str(e)}
-# Create separate API interfaces that can be accessed via HTTP
-search_api_interface = gr.Interface(
-    fn=api_search_function,
-    inputs=[
-        gr.Textbox(label="query", placeholder="Enter search query"),
-        gr.Number(label="top_k", value=5, minimum=1, maximum=20),
-        gr.Dropdown(label="search_type", choices=["hybrid", "vector", "bm25"], value="hybrid"),
-        gr.Number(label="vector_weight", value=0.6, minimum=0.0, maximum=1.0),
-        gr.Number(label="bm25_weight", value=0.4, minimum=0.0, maximum=1.0)
-    ],
-    outputs=gr.JSON(label="Search Results"),
-    title="Search API",
-    description="Hybrid search API endpoint"
-)
-stats_api_interface = gr.Interface(
-    fn=api_stats_function,
-    inputs=[],
-    outputs=gr.JSON(label="Statistics"),
-    title="Stats API",
-    description="Knowledge base statistics API endpoint"
-)
 if __name__ == "__main__":
-    print("🚀 Launching Hybrid Search RAGtim Bot...")
-    print(f"📚 Loaded {len(bot.knowledge_base)} sections from markdown files")
-    print(f"🔍 BM25 index: {len(bot.document_frequency)} unique terms")
-    print(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
-    print("🔥 Hybrid search ready: Semantic + Keyword fusion!")
-    # Launch the main demo
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
         show_error=True
-    )
-    # Note: The API interfaces are available at:
-    # - Main interface: https://your-space-url.hf.space
-    # - Search API: https://your-space-url.hf.space/api/search (via the main interface)
-    # - Stats API: https://your-space-url.hf.space/api/stats (via the main interface)

 import gradio as gr
 import json
 import numpy as np
+from transformers import pipeline
 import torch
 import os
 from typing import List, Dict, Any
 import re
 import math
 from collections import defaultdict, Counter
+import logging
+# Import configuration
+from config import *
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Configure device
+device = get_device()
+logger.info(f"Using device: {device}")
 class HybridSearchRAGBot:
     def __init__(self):
         self.embeddings = []
         # BM25 components
+        self.term_frequencies = {}
+        self.document_frequency = {}
+        self.document_lengths = {}
         self.average_doc_length = 0
         self.total_documents = 0
         # BM25 parameters
+        self.k1 = BM25_K1
+        self.b = BM25_B
         self.initialize_models()
         self.load_markdown_knowledge_base()
     def initialize_models(self):
         """Initialize the embedding model"""
         try:
+            logger.info("Loading embedding model...")
             self.embedder = pipeline(
                 'feature-extraction',
+                EMBEDDING_MODEL,
                 device=0 if device == "cuda" else -1
             )
+            logger.info("✅ Embedding model loaded successfully")
         except Exception as e:
+            logger.error(f"❌ Error loading embedding model: {e}")
             raise e
     def load_markdown_knowledge_base(self):
         """Load knowledge base from markdown files"""
+        logger.info("Loading knowledge base from markdown files...")
         # Reset knowledge base
         self.knowledge_base = []
+        for filename in KNOWLEDGE_BASE_FILES:
             try:
                 if os.path.exists(filename):
                     with open(filename, 'r', encoding='utf-8') as f:
                         content = f.read()
+                    self.process_markdown_file(content, os.path.basename(filename))
+                    logger.info(f"✅ Loaded {filename}")
                 else:
+                    logger.warning(f"⚠️ File not found: {filename}")
             except Exception as e:
+                logger.error(f"❌ Error loading {filename}: {e}")
         # Generate embeddings for knowledge base
+        logger.info("Generating embeddings for knowledge base...")
         self.embeddings = []
         for i, doc in enumerate(self.knowledge_base):
             try:
                 # Truncate content to avoid token limit issues
+                content = doc["content"][:500]
                 embedding = self.embedder(content, return_tensors="pt")
                 # Convert to numpy and flatten
                 embedding_np = embedding[0].mean(dim=0).detach().cpu().numpy()
                 self.embeddings.append(embedding_np)
             except Exception as e:
+                logger.error(f"Error generating embedding for doc {doc['id']}: {e}")
                 # Fallback to zero embedding
+                self.embeddings.append(np.zeros(EMBEDDING_DIM))
         self.total_documents = len(self.knowledge_base)
+        logger.info(f"✅ Knowledge base loaded with {len(self.knowledge_base)} documents")
     def process_markdown_file(self, content: str, filename: str):
         """Process a markdown file and extract sections"""
+        file_type, priority = FILE_TYPE_MAP.get(filename, ('general', 5))
         # Split content into sections
         sections = self.split_markdown_into_sections(content)
         for section in sections:
+            if len(section['content'].strip()) > 100:
                 doc = {
                     "id": f"{filename}_{section['title']}_{len(self.knowledge_base)}",
                     "content": section['content'],
         current_section = {'title': 'Introduction', 'content': ''}
         for line in lines:
             if line.startswith('#'):
                 if current_section['content'].strip():
                     sections.append(current_section.copy())
                 title = line.lstrip('#').strip()
                 current_section = {
                     'title': title,
             else:
                 current_section['content'] += line + '\n'
         if current_section['content'].strip():
             sections.append(current_section)
     def tokenize(self, text: str) -> List[str]:
         """Tokenize text for BM25"""
         text = re.sub(r'[^\w\s]', ' ', text.lower())
         words = [word for word in text.split() if len(word) > 2 and not self.is_stop_word(word)]
         return words
     def build_bm25_index(self):
         """Build BM25 index for all documents"""
+        logger.info("Building BM25 index...")
         self.term_frequencies = {}
         self.document_frequency = defaultdict(int)
         self.document_lengths = {}
         total_length = 0
         for doc in self.knowledge_base:
             doc_id = doc['id']
             terms = self.tokenize(doc['content'])
             term_freq = Counter(terms)
             self.term_frequencies[doc_id] = dict(term_freq)
             doc_length = len(terms)
             self.document_lengths[doc_id] = doc_length
             total_length += doc_length
             unique_terms = set(terms)
             for term in unique_terms:
                 self.document_frequency[term] += 1
         self.average_doc_length = total_length / self.total_documents if self.total_documents > 0 else 0
+        logger.info(f"✅ BM25 index built: {len(self.document_frequency)} unique terms, avg doc length: {self.average_doc_length:.1f}")
     def calculate_bm25_score(self, term: str, doc_id: str) -> float:
         """Calculate BM25 score for a term in a document"""
         tf = self.term_frequencies.get(doc_id, {}).get(term, 0)
         if tf == 0:
             return 0.0
         df = self.document_frequency.get(term, 1)
         doc_length = self.document_lengths.get(doc_id, 0)
         idf = math.log((self.total_documents - df + 0.5) / (df + 0.5))
         numerator = tf * (self.k1 + 1)
         denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.average_doc_length))
         scores = {}
         for doc in self.knowledge_base:
             doc_id = doc['id']
             score = 0.0
                 score += self.calculate_bm25_score(term, doc_id)
             if score > 0:
                 priority_boost = 1 + (doc['metadata']['priority'] / 50)
                 final_score = score * priority_boost
                     'search_type': 'bm25'
                 }
         sorted_results = sorted(scores.values(), key=lambda x: x['score'], reverse=True)
         return sorted_results[:top_k]
     def vector_search(self, query: str, top_k: int = 10) -> List[Dict]:
         """Perform vector similarity search"""
         try:
+            query_embedding = self.embedder(query[:500], return_tensors="pt")
             query_vector = query_embedding[0].mean(dim=0).detach().cpu().numpy()
             similarities = []
             for i, doc_embedding in enumerate(self.embeddings):
                 if doc_embedding is not None and len(doc_embedding) > 0:
                     similarity = self.cosine_similarity(query_vector, doc_embedding)
                     priority_boost = 1 + (self.knowledge_base[i]['metadata']['priority'] / 100)
                     final_score = similarity * priority_boost
                         'search_type': 'vector'
                     })
             similarities.sort(key=lambda x: x['score'], reverse=True)
             return similarities[:top_k]
         except Exception as e:
+            logger.error(f"Error in vector search: {e}")
             return []
     def hybrid_search(self, query: str, top_k: int = 10, vector_weight: float = 0.6, bm25_weight: float = 0.4) -> List[Dict]:
         """Perform hybrid search combining vector and BM25 results"""
         try:
+            vector_results = self.vector_search(query, top_k * 2)
             bm25_results = self.bm25_search(query, top_k * 2)
+            # Normalize scores
             if vector_results:
                 max_vector_score = max(r['score'] for r in vector_results)
                 if max_vector_score > 0:
             # Combine results
             combined_scores = {}
             for result in vector_results:
                 doc_id = result['document']['id']
                 combined_scores[doc_id] = {
                     'search_type': 'vector'
                 }
             for result in bm25_results:
                 doc_id = result['document']['id']
                 if doc_id in combined_scores:
                     'search_type': data['search_type']
                 })
             final_results.sort(key=lambda x: x['score'], reverse=True)
             return final_results[:top_k]
         except Exception as e:
+            logger.error(f"Error in hybrid search: {e}")
             return self.vector_search(query, top_k)
     def search_knowledge_base(self, query: str, top_k: int = 5, search_type: str = "hybrid") -> List[Dict]:
             return self.vector_search(query, top_k)
         elif search_type == "bm25":
             return self.bm25_search(query, top_k)
+        else:
             return self.hybrid_search(query, top_k)
 # Initialize the bot
 # Module-level instance used by the API functions, the chat handler and the
 # Gradio UI text below. HybridSearchRAGBot.__init__ calls initialize_models()
 # and load_markdown_knowledge_base(), so that work runs here, at import time.
+logger.info("Initializing Hybrid Search RAGtim Bot...")
 bot = HybridSearchRAGBot()
+# API Functions
 def search_api(query, top_k=5, search_type="hybrid", vector_weight=0.6, bm25_weight=0.4):
     """API endpoint for hybrid search functionality"""
     try:
             }
         }
     except Exception as e:
+        logger.error(f"Error in search API: {e}")
         return {"error": str(e), "results": []}
 def get_stats_api():
     """API endpoint for knowledge base statistics"""
     try:
         doc_types = {}
         sections_by_file = {}
             "total_documents": len(bot.knowledge_base),
             "document_types": doc_types,
             "sections_by_file": sections_by_file,
+            "model_name": EMBEDDING_MODEL,
+            "embedding_dimension": EMBEDDING_DIM,
             "search_capabilities": [
                 "Hybrid Search (Vector + BM25)",
                 "Semantic Vector Search",
             "status": "healthy"
         }
     except Exception as e:
+        logger.error(f"Error in get_stats_api: {e}")
         return {
             "error": str(e),
             "status": "error",
         return "Please ask me something about Raktim Mondol! I use hybrid search combining semantic similarity and keyword matching for the best results."
     try:
         search_results = bot.hybrid_search(message, top_k=6)
         if search_results:
             response_parts = []
             response_parts.append(f"🔍 **Hybrid Search Results** (Vector + BM25 combination, found {len(search_results)} relevant sections):\n")
             best_match = search_results[0]
             response_parts.append(f"**Primary Answer** (Hybrid Score: {best_match['score']:.3f}):")
             response_parts.append(f"📄 Source: {best_match['document']['metadata']['source']} - {best_match['document']['metadata']['section']}")
             response_parts.append(f"🔍 Search Type: {best_match['search_type'].upper()}")
             if 'vector_score' in best_match and 'bm25_score' in best_match:
                 response_parts.append(f"📊 Vector Score: {best_match['vector_score']:.3f} | BM25 Score: {best_match['bm25_score']:.3f}")
             response_parts.append(f"\n{best_match['document']['content']}\n")
             if len(search_results) > 1:
                 response_parts.append("**Additional Context:**")
+                for i, result in enumerate(search_results[1:3], 1):
                     section_info = f"{result['document']['metadata']['source']} - {result['document']['metadata']['section']}"
                     search_info = f"({result['search_type'].upper()}, Score: {result['score']:.3f})"
                     response_parts.append(f"{i}. {section_info} {search_info}")
                     excerpt = result['document']['content'][:200] + "..." if len(result['document']['content']) > 200 else result['document']['content']
                     response_parts.append(f"   {excerpt}\n")
             return "I don't have specific information about that topic in my knowledge base. Could you please ask something else about Raktim Mondol?"
     except Exception as e:
+        logger.error(f"Error in chat interface: {e}")
         return "I'm sorry, I encountered an error while processing your question. Please try again."
+# Gradio Interface
 css = """
 .gradio-container {
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
     - 🔍 **BM25 Keyword Search**: Advanced TF-IDF ranking for exact term matching
     - ⚖️ **Intelligent Fusion**: Weighted combination for optimal relevance
+    **📚 Knowledge Base**: **{len(bot.knowledge_base)} sections** from comprehensive markdown files
     **🔧 Search Parameters**:
     - **BM25 Parameters**: k1={bot.k1}, b={bot.b}
     - **Vocabulary**: {len(bot.document_frequency)} unique terms
     - **Average Document Length**: {bot.average_doc_length:.1f} words
+    - **Embedding Model**: {EMBEDDING_MODEL} ({EMBEDDING_DIM}-dim)
     **Ask me anything about Raktim Mondol's research, expertise, and background!**
     """)
         if not message.strip():
             return history, ""
         history.append({"role": "user", "content": message})
         bot_response = chat_interface(message, history)
         history.append({"role": "assistant", "content": bot_response})
         return history, ""
     submit_btn.click(respond, [msg, chatbot], [chatbot, msg])
     msg.submit(respond, [msg, chatbot], [chatbot, msg])
+# Advanced search interface
 with gr.Blocks(title="🔧 Advanced Hybrid Search") as search_demo:
     gr.Markdown("# 🔧 Advanced Hybrid Search Configuration")
     with gr.Row():
         with gr.Column(scale=2):
                 search_type = gr.Radio(
                     choices=["hybrid", "vector", "bm25"],
                     value="hybrid",
+                    label="Search Method"
                 )
                 top_k_slider = gr.Slider(
                     minimum=1,
                     label="Top K Results"
                 )
             with gr.Group(visible=True) as weight_group:
                 gr.Markdown("**Hybrid Search Weights**")
                 vector_weight = gr.Slider(
         return 0.6, 0.4
     def advanced_search(query, search_type, top_k, vector_w, bm25_w):
         """Handle a request from the advanced search tab.

         The two weight values are first passed through ``normalize_weights``;
         the resulting pair is then forwarded to ``search_api`` together with
         the query, the requested number of results and the search method.
         Returns whatever ``search_api`` returns.
         """
         normalized_vector, normalized_bm25 = normalize_weights(vector_w, bm25_w)
         return search_api(
             query,
             top_k=top_k,
             search_type=search_type,
             vector_weight=normalized_vector,
             bm25_weight=normalized_bm25,
         )
         outputs=search_output
     )
+# Stats interface
 with gr.Blocks(title="📊 System Statistics") as stats_demo:
     # Statistics tab: a heading, a JSON panel and a button that fills the panel.
     gr.Markdown("# 📊 Hybrid Search System Statistics")
     stats_output = gr.JSON(label="System Statistics", height=500)
     stats_btn = gr.Button("📊 Get System Statistics", variant="primary")
     # The handler takes no inputs; the dict returned by get_stats_api is
     # rendered in stats_output.
+    stats_btn.click(get_stats_api, inputs=[], outputs=stats_output)
+# Main demo with tabs
 demo = gr.TabbedInterface(
     # First list: the three Blocks apps built above; second list: their tab
     # labels, given in the same order (chat, advanced search, statistics).
     [chat_demo, search_demo, stats_demo],
     ["💬 Hybrid Chat", "🔧 Advanced Search", "📊 Statistics"],
     title="🔥 Hybrid Search RAGtim Bot - Vector + BM25 Fusion"
 )
+# Launch the application
 if __name__ == "__main__":
     # Only runs when the file is executed as a script. The bot and the UI
     # objects are already built at import time, so this section just logs a
     # summary of the loaded index sizes and starts the server.
+    logger.info("🚀 Launching Hybrid Search RAGtim Bot...")
+    logger.info(f"📚 Loaded {len(bot.knowledge_base)} sections from markdown files")
+    logger.info(f"🔍 BM25 index: {len(bot.document_frequency)} unique terms")
+    logger.info(f"🧠 Vector embeddings: {len(bot.embeddings)} documents")
+    logger.info("🔥 Hybrid search ready: Semantic + Keyword fusion!")
     # NOTE(review): binding to 0.0.0.0 on port 7860 presumably matches what the
     # hosting container expects — confirm against the deployment config.
     # share=False requests no Gradio share link; show_error=True asks Gradio
     # to surface handler errors in the UI.
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
         show_error=True
+    )