JoeArmani committed
Commit 64e7c31 · 1 Parent(s): a763857

sentence transformer

.gitignore CHANGED
@@ -183,4 +183,6 @@ training_data/*
 augmented_dialogues.json
 
 raw_datasets/*
+st/*
+
 
chatbot_config.py ADDED
@@ -0,0 +1,30 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict
+
+@dataclass
+class ChatbotConfig:
+    """RetrievalChatbot Config"""
+    max_context_token_limit: int = 512
+    embedding_dim: int = 384  # Match the Sentence Transformer embedding dimension (384 for all-MiniLM-L6-v2)
+    learning_rate: float = 0.0005
+    min_text_length: int = 3
+    max_context_turns: int = 20
+    pretrained_model: str = 'sentence-transformers/all-MiniLM-L6-v2'
+    cross_encoder_model: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
+    summarizer_model: str = 't5-small'
+    embedding_batch_size: int = 64
+    search_batch_size: int = 64
+    max_batch_size: int = 64
+    max_retries: int = 3
+
+    def to_dict(self) -> Dict:
+        """Convert config to dictionary."""
+        return {k: (str(v) if isinstance(v, Path) else v)
+                for k, v in self.__dict__.items()}
+
+    @classmethod
+    def from_dict(cls, config_dict: Dict) -> 'ChatbotConfig':
+        """Create config from dictionary."""
+        return cls(**{k: v for k, v in config_dict.items()
+                      if k in cls.__dataclass_fields__})
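For reference, a minimal sketch of how the new config round-trips through to_dict/from_dict. The "extra_field" key below is hypothetical, added only to show that from_dict drops unknown keys, so older or newer config.json files still load:

import json
from chatbot_config import ChatbotConfig

config = ChatbotConfig()
config_json = json.dumps(config.to_dict(), indent=2)  # e.g. what gets written to models/config.json

# from_dict filters by __dataclass_fields__, so the hypothetical
# 'extra_field' key is silently dropped instead of raising TypeError.
loaded = ChatbotConfig.from_dict({**json.loads(config_json), "extra_field": 1})
assert loaded == config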
chatbot_model.py CHANGED
@@ -1,6 +1,6 @@
 import os
 import numpy as np
-from transformers import TFAutoModel, AutoTokenizer
+from sentence_transformers import SentenceTransformer
 import tensorflow as tf
 from typing import List, Tuple, Dict, Optional, Union, Any
 import math
@@ -11,125 +11,24 @@ import datetime
 import faiss
 import gc
 import re
-from tf_data_pipeline import TFDataPipeline
 from response_quality_checker import ResponseQualityChecker
 from cross_encoder_reranker import CrossEncoderReranker
 from conversation_summarizer import DeviceAwareModel, Summarizer
+from chatbot_config import ChatbotConfig
+from tf_data_pipeline import TFDataPipeline
 import absl.logging
 from logger_config import config_logger
 from tqdm.auto import tqdm
 
 absl.logging.set_verbosity(absl.logging.WARNING)
 logger = config_logger(__name__)
-
-@dataclass
-class ChatbotConfig:
-    """RetrievalChatbot Config"""
-    max_context_token_limit: int = 512
-    embedding_dim: int = 768
-    encoder_units: int = 256
-    num_attention_heads: int = 8
-    dropout_rate: float = 0.2
-    l2_reg_weight: float = 0.001
-    learning_rate: float = 0.0005
-    min_text_length: int = 3
-    max_context_turns: int = 20
-    warmup_steps: int = 200
-    pretrained_model: str = 'distilbert-base-uncased'
-    cross_encoder_model: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
-    summarizer_model: str = 't5-small'
-    dtype: str = 'float32'
-    freeze_embeddings: bool = False
-    embedding_batch_size: int = 64
-    search_batch_size: int = 64
-    max_batch_size: int = 64
-    max_retries: int = 3
-
-    def to_dict(self) -> Dict:
-        """Convert config to dictionary."""
-        return {k: (str(v) if isinstance(v, Path) else v)
-                for k, v in self.__dict__.items()}
-
-    @classmethod
-    def from_dict(cls, config_dict: Dict) -> 'ChatbotConfig':
-        """Create config from dictionary."""
-        return cls(**{k: v for k, v in config_dict.items()
-                      if k in cls.__dataclass_fields__})
-
-class EncoderModel(tf.keras.Model):
-    """Dual encoder model with pretrained DistilBERT embeddings."""
-    def __init__(
-        self,
-        config: ChatbotConfig,
-        name: str = "encoder",
-        **kwargs
-    ):
-        super().__init__(name=name, **kwargs)
-        self.config = config
-
-        # Load pretrained model and freeze layers based on config
-        self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)
-        self._freeze_layers()
-
-        # Add Global Average Pooling, Projection, Dropout, and Normalization layers
-        self.pooler = tf.keras.layers.GlobalAveragePooling1D()
-        self.projection = tf.keras.layers.Dense(
-            config.embedding_dim,
-            activation='tanh',
-            name="projection",
-            dtype=tf.float32
-        )
-        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
-        self.normalize = tf.keras.layers.Lambda(
-            lambda x: tf.nn.l2_normalize(x, axis=1),
-            name="l2_normalize"
-        )
-
-    def _freeze_layers(self):
-        """Freeze n layers of the pretrained model"""
-        if self.config.freeze_embeddings:
-            self.pretrained.trainable = False
-            logger.info("All pretrained layers frozen.")
-        else:
-            # Freeze only the first 'n' transformer layers
-            for i, layer in enumerate(self.pretrained.layers):
-                if isinstance(layer, tf.keras.layers.Layer):
-                    if hasattr(layer, 'trainable'):
-                        if i < 1:
-                            layer.trainable = False
-                            logger.info(f"Layer {i} frozen.")
-                        else:
-                            layer.trainable = True
-                            logger.info(f"Layer {i} trainable.")
-
-    def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
-        """Forward pass."""
-        # Get pretrained embeddings
-        pretrained_outputs = self.pretrained(inputs, training=training)
-        x = pretrained_outputs.last_hidden_state  # Shape: [batch_size, seq_len, embedding_dim]
-
-        # Apply pooling, projection, dropout, and normalization
-        x = self.pooler(x)      # Shape: [batch_size, 768]
-        x = self.projection(x)  # Shape: [batch_size, 768]
-        x = self.dropout(x, training=training)
-        x = self.normalize(x)   # Shape: [batch_size, 768]
-
-        return x
-
-    def get_config(self) -> dict:
-        """Return the model config"""
-        config = super().get_config()
-        config.update({
-            "config": self.config.to_dict(),
-            "name": self.name
-        })
-        return config
 
 class RetrievalChatbot(DeviceAwareModel):
     """
     Retrieval-based learning chatbot model.
     Uses trained embeddings and FAISS for similarity search.
     """
+
     def __init__(
         self,
         config: ChatbotConfig,
@@ -139,6 +38,7 @@ class RetrievalChatbot(DeviceAwareModel):
         summarizer: Optional[Summarizer] = None,
         mode: str = 'training'
     ):
+
         super().__init__()
         self.config = config
         self.strategy = strategy
@@ -146,13 +46,14 @@ class RetrievalChatbot(DeviceAwareModel):
         self.mode = mode.lower()
 
         # Initialize reranker, summarizer, tokenizer, and encoder
-        self.reranker = reranker or self._initialize_reranker()
-        self.tokenizer = self._initialize_tokenizer()
         self.encoder = self._initialize_encoder()
+        self.tokenizer = self.encoder.tokenizer
+        self.reranker = reranker or self._initialize_reranker()
         self.summarizer = summarizer or self._initialize_summarizer()
 
         # Initialize data pipeline
         logger.info("Initializing TFDataPipeline.")
+
         self.data_pipeline = TFDataPipeline(
             config=self.config,
             tokenizer=self.tokenizer,
@@ -177,7 +78,6 @@ class RetrievalChatbot(DeviceAwareModel):
             "train_metrics": {},
             "val_metrics": {}
         }
-
 
     def _setup_default_device(self) -> str:
        """Set up default device if none is provided."""
@@ -200,34 +100,11 @@ class RetrievalChatbot(DeviceAwareModel):
             device=self.device,
             max_summary_rounds=2
         )
-
-    def _initialize_tokenizer(self) -> AutoTokenizer:
-        """Initialize the tokenizer and add special tokens."""
-        logger.info("Initializing tokenizer and adding special tokens...")
-        tokenizer = AutoTokenizer.from_pretrained(self.config.pretrained_model)
-        special_tokens = {
-            "user": "<USER>",
-            "assistant": "<ASSISTANT>",
-            "context": "<CONTEXT>",
-            "sep": "<SEP>"
-        }
-        tokenizer.add_special_tokens(
-            {'additional_special_tokens': list(special_tokens.values())}
-        )
-        return tokenizer
 
-    def _initialize_encoder(self) -> EncoderModel:
-        """Initialize the EncoderModel and resize token embeddings."""
-        logger.info("Initializing encoder model...")
-        encoder = EncoderModel(
-            self.config,
-            name="shared_encoder",
-        )
-
-        new_vocab_size = len(self.tokenizer)
-        encoder.pretrained.resize_token_embeddings(new_vocab_size)
-        logger.info(f"Token embeddings resized to: {new_vocab_size}")
-
+    def _initialize_encoder(self) -> SentenceTransformer:
+        """Initialize the Sentence Transformer model."""
+        logger.info("Initializing SentenceTransformer encoder model...")
+        encoder = SentenceTransformer(self.config.pretrained_model)
         return encoder
 
     def _load_faiss_index_and_responses(self) -> None:
@@ -254,43 +131,35 @@ class RetrievalChatbot(DeviceAwareModel):
         except Exception as e:
             logger.error(f"Failed to load FAISS index and response pool: {e}")
             raise
 
     @classmethod
     def load_model(cls, load_dir: Union[str, Path], mode: str = 'training') -> 'RetrievalChatbot':
-        """
-        Load saved models and configuration.
-        """
+        """Load chatbot model and configuration."""
         load_dir = Path(load_dir)
 
         # Load config
-        with open(load_dir / "config.json", "r") as f:
-            config = ChatbotConfig.from_dict(json.load(f))
+        config_path = load_dir / "config.json"
+        if config_path.exists():
+            with open(config_path, "r") as f:
+                config = ChatbotConfig.from_dict(json.load(f))
+            logger.info("Loaded ChatbotConfig from config.json.")
+        else:
+            raise FileNotFoundError(f"Config file not found at {config_path}. Please ensure it exists.")
 
         # Initialize chatbot
         chatbot = cls(config, mode=mode)
 
-        # Load DistilBERT
-        chatbot.encoder.pretrained = TFAutoModel.from_pretrained(load_dir / "shared_encoder", config=config)
-
-        dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
-        _ = chatbot.encoder(dummy_input, training=False)
-
-        # Load tokenizer
-        chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
-        logger.info(f"Models and tokenizer loaded from {load_dir}")
-
-        # Load the custom weights
-        custom_weights_path = load_dir / "encoder_custom_weights.weights.h5"
-        if custom_weights_path.exists():
-            chatbot.encoder.load_weights(str(custom_weights_path))
-            logger.info("Loaded custom encoder weights for projection/dropout/etc.")
+        # Load Sentence Transformer
+        model_path = load_dir / "sentence_transformer"
+        if model_path.exists():
+            # Load locally saved model
+            chatbot.encoder = SentenceTransformer(str(model_path))
+            logger.info("Loaded SentenceTransformer model from local path successfully.")
         else:
-            logger.warning(f"No custom encoder weights found at {custom_weights_path}. The top-level projection layer won't have learned parameters.")
-
-        # Handle 'inference' mode: load FAISS, etc.
-        if mode == 'inference':
-            cls._prepare_model_for_inference(chatbot, load_dir)
-
+            # Load from pre-trained model hub
+            chatbot.encoder = SentenceTransformer(config.pretrained_model)
+            logger.info(f"Loaded SentenceTransformer model '{config.pretrained_model}' from the hub successfully.")
+
         return chatbot
 
     @classmethod
@@ -324,21 +193,19 @@ class RetrievalChatbot(DeviceAwareModel):
         except Exception as e:
             logger.error(f"Error loading inference components: {e}")
             raise
 
     def save_models(self, save_dir: Union[str, Path]):
-        """Save model and config"""
+        """Save SentenceTransformer model and config."""
         save_dir = Path(save_dir)
         save_dir.mkdir(parents=True, exist_ok=True)
 
         # Save config
         with open(save_dir / "config.json", "w") as f:
             json.dump(self.config.to_dict(), f, indent=2)
-
-        # Save the HF DistilBERT submodule, custom top-level layers, and tokenizer
-        self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
-        self.encoder.save_weights(save_dir / "encoder_custom_weights.weights.h5")
-        self.tokenizer.save_pretrained(save_dir / "tokenizer")
-        logger.info(f"Models and tokenizer saved to {save_dir}.")
+
+        # Save Sentence Transformer
+        self.encoder.save(save_dir / "sentence_transformer")
+        logger.info(f"Model and config saved to {save_dir}.")
 
     def retrieve_responses(
         self,
@@ -346,59 +213,73 @@ class RetrievalChatbot(DeviceAwareModel):
         top_k: int = 10,
         reranker: Optional[CrossEncoderReranker] = None,
         summarizer: Optional[Summarizer] = None,
-        summarize_threshold: int = 512
+        summarize_threshold: int = 512,
+        boost_factor: float = 1.15
     ) -> List[Tuple[str, float]]:
         """
         Retrieve top-k responses using FAISS and cross-encoder re-ranking.
+
         Args:
             query: The user's input text.
-            top_k: Number of FAISS results to return
-            reranker: CrossEncoderReranker for refined scoring
-            summarizer: Summarizer for long queries
-            summarize_threshold: Summarize if conversation tokens > threshold.
+            top_k: Number of responses to return.
+            reranker: Optional reranker for refined scoring.
+            summarizer: Optional summarizer for long queries.
+            summarize_threshold: Threshold to summarize long queries.
+            boost_factor: Factor to boost scores for keyword matches.
+
         Returns:
             List of (response_text, final_score).
         """
         def sigmoid(x: float) -> float:
             return 1 / (1 + np.exp(-x))
 
-        # Query summarization
+        # Summarize long queries
        if summarizer and len(query.split()) > summarize_threshold:
-            logger.info(f"Query is long ({len(query.split())} words). Summarizing.")
+            logger.info(f"Query is long ({len(query.split())} words). Summarizing...")
             query = summarizer.summarize_text(query)
-            logger.info(f"Summarized Query: {query}")
+            logger.info(f"Summarized query: {query}")
 
+        # Detect domain for query
         detected_domain = self.detect_domain_from_query(query)
 
-        # Retrieve initial candidates from FAISS
-        initial_k = min(top_k * 10, len(self.data_pipeline.response_pool))
-        faiss_candidates = self.faiss_search(query, domain=detected_domain, top_k=initial_k)
+        # Step 1: Retrieve candidates from FAISS
+        logger.info("Retrieving initial candidates from FAISS...")
+        faiss_candidates = self.data_pipeline.retrieve_responses(query, top_k=top_k * 10)
 
         if not faiss_candidates:
+            logger.warning("No candidates retrieved from FAISS.")
            return []
 
-        texts = [item[0] for item in faiss_candidates]
+        # Step 2: Re-rank candidates using Cross-Encoder
+        logger.info("Re-ranking candidates using Cross-Encoder...")
+        texts = [item[0] for item in faiss_candidates]  # Extract response texts
+        faiss_scores = [item[1] for item in faiss_candidates]
 
-        if not reranker:
+        if reranker is None:
             reranker = CrossEncoderReranker(model_name=self.config.cross_encoder_model)
 
-        # Re-rank the texts (candidates) from FAISS search using the cross-encoder
-        ce_logits = reranker.rerank(query, texts, max_length=256)
-
-        # Combine scores from FAISS and cross-encoder
+        ce_logits = reranker.rerank(query, texts, max_length=256)  # Re-rank responses
+
+        # Combine FAISS and Cross-Encoder scores
         final_candidates = []
-        for (resp_text, faiss_score), logit in zip(faiss_candidates, ce_logits):
-            ce_prob = sigmoid(logit)              # now in range [0...1]
-            faiss_norm = (faiss_score + 1) / 2.0  # now in range [0...1]
-            combined_score = 0.85 * ce_prob + 0.15 * faiss_norm
+        for resp_text, faiss_score, logit in zip(texts, faiss_scores, ce_logits):
+            ce_prob = sigmoid(logit)            # Cross-encoder score in range [0, 1]
+            faiss_norm = (faiss_score + 1) / 2  # Normalize FAISS score to range [0, 1]
+            combined_score = 0.75 * ce_prob + 0.25 * faiss_norm
+
+            # Boost score based on keyword match
+            query_keywords = self.extract_keywords(query)
+            if query_keywords and any(kw in resp_text.lower() for kw in query_keywords):
+                combined_score *= boost_factor
+
+            # Adjust score based on length
            length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
 
             final_candidates.append((resp_text, length_adjusted_score))
 
-        # Sort descending by combined score
+        # Step 3: Sort and return top-k results
         final_candidates.sort(key=lambda x: x[1], reverse=True)
-
-        # Return top_k
+        logger.info(f"Returning top-{top_k} re-ranked responses.")
         return final_candidates[:top_k]
 
     def extract_keywords(self, query: str) -> List[str]:
@@ -636,21 +517,45 @@ class RetrievalChatbot(DeviceAwareModel):
         conversation_history: Optional[List[Tuple[str, str]]]
     ) -> str:
         """
-        Build conversation context string from conversation history.
+        Build conversation context string from conversation history,
+        using literal <USER> and <ASSISTANT> tokens (no tokenizer special index).
         """
+        USER_TOKEN = "<USER>"
+        ASSISTANT_TOKEN = "<ASSISTANT>"
+
         if not conversation_history:
-            return f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
-
+            return f"{USER_TOKEN} {query}"
+
         conversation_parts = []
         for user_txt, assistant_txt in conversation_history:
-            conversation_parts.extend([
-                f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {user_txt}",
-                f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {assistant_txt}"
-            ])
-
-        conversation_parts.append(f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}")
+            # Insert literal tokens
+            conversation_parts.append(f"{USER_TOKEN} {user_txt}")
+            conversation_parts.append(f"{ASSISTANT_TOKEN} {assistant_txt}")
+
+        conversation_parts.append(f"{USER_TOKEN} {query}")
         return "\n".join(conversation_parts)
 
+    # def _build_conversation_context(
+    #     self,
+    #     query: str,
+    #     conversation_history: Optional[List[Tuple[str, str]]]
+    # ) -> str:
+    #     """
+    #     Build conversation context string from conversation history.
+    #     """
+    #     if not conversation_history:
+    #         return f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
+    #
+    #     conversation_parts = []
+    #     for user_txt, assistant_txt in conversation_history:
+    #         conversation_parts.extend([
+    #             f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {user_txt}",
+    #             f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {assistant_txt}"
+    #         ])
+    #
+    #     conversation_parts.append(f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}")
+    #     return "\n".join(conversation_parts)
+
     def train_model(
         self,
         tfrecord_file_path: str,
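The reworked retrieve_responses above fuses two signals: a FAISS cosine score in [-1, 1] and a cross-encoder logit. The sketch below shows that arithmetic standalone; the 0.75/0.25 weights and 1.15 boost mirror the diff, while the candidate texts, scores, logits, and keywords are made up for illustration:

import numpy as np

def sigmoid(x: float) -> float:
    return 1 / (1 + np.exp(-x))

# Hypothetical candidates: (text, FAISS cosine score in [-1, 1], cross-encoder logit)
candidates = [
    ("I found three pizza places near you.", 0.82, 2.1),
    ("Your table is booked for 7 pm.",       0.64, -0.3),
]

boost_factor = 1.15
query_keywords = ["pizza"]  # would come from extract_keywords(query)

scored = []
for text, faiss_score, logit in candidates:
    ce_prob = sigmoid(logit)            # cross-encoder logit -> [0, 1]
    faiss_norm = (faiss_score + 1) / 2  # cosine [-1, 1] -> [0, 1]
    score = 0.75 * ce_prob + 0.25 * faiss_norm
    if any(kw in text.lower() for kw in query_keywords):
        score *= boost_factor           # keyword-match boost
    scored.append((text, score))

scored.sort(key=lambda x: x[1], reverse=True)
print(scored[0])  # the pizza response wins on both signals plus the boost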
chatbot_validator.py CHANGED
@@ -13,7 +13,7 @@ class ChatbotValidator:
     This testing module executes domain-specific queries, obtains chatbot responses, and evaluates them with a quality checker.
     """
 
-    def __init__(self, chatbot, quality_checker):
+    def __init__(self, chatbot, quality_checker, cross_encoder_model='cross-encoder/ms-marco-MiniLM-L-12-v2'):
         """
         Initialize the validator.
         Args:
@@ -22,6 +22,7 @@ class ChatbotValidator:
         """
         self.chatbot = chatbot
         self.quality_checker = quality_checker
+        self.reranker = CrossEncoderReranker(model_name=cross_encoder_model)
 
         # Domain-specific test queries (aligns with Taskmaster-1 dataset)
         self.domain_queries = {
@@ -85,9 +86,6 @@ class ChatbotValidator:
         metrics_history = []
         domain_metrics = {}
 
-        # Init the cross-encoder reranker to pass to the chatbot
-        reranker = CrossEncoderReranker(model_name=self.chatbot.config.cross_encoder_model)
-
         # Prepare random selection if needed
         rng = random.Random(seed)
 
@@ -113,7 +111,7 @@ class ChatbotValidator:
             logger.info(f"TEST CASE {i}: QUERY: {query}")
 
             # Retrieve top_k responses, then evaluate with quality checker
-            responses = self.chatbot.retrieve_responses(query, top_k=top_k, reranker=reranker)
+            responses = self.chatbot.retrieve_responses(query, top_k=top_k, reranker=self.reranker)
             quality_metrics = self.quality_checker.check_response_quality(query, responses)
 
             # Aggregate metrics and log
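The validator now builds its cross-encoder once in __init__ and reuses it for every test query, instead of re-instantiating it inside run_validation(). The project's CrossEncoderReranker class is not shown in this commit; the sketch below illustrates the same construct-once pattern with the sentence-transformers CrossEncoder directly:

from sentence_transformers import CrossEncoder

# Load once; reuse across all validation queries.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

query = "Can you find me a pizza place nearby?"
candidates = [
    "Sure, there are three pizza restaurants within a mile.",
    "Your oil change is scheduled for Tuesday.",
]
# predict() scores each (query, candidate) pair; higher means more relevant.
scores = reranker.predict([(query, c) for c in candidates])
ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
print(ranked[0])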
prepare_data.py CHANGED
@@ -1,14 +1,12 @@
 import os
-import sys
-import faiss
 import json
 import pickle
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModel
+import faiss
 from tqdm.auto import tqdm
 from pathlib import Path
-from chatbot_model import ChatbotConfig, EncoderModel
+from sentence_transformers import SentenceTransformer
 from tf_data_pipeline import TFDataPipeline
+from chatbot_config import ChatbotConfig
 from logger_config import config_logger
 
 logger = config_logger(__name__)
@@ -23,15 +21,10 @@ def main():
     FAISS_INDICES_DIR = os.path.join(MODELS_DIR, 'faiss_indices')
     TF_RECORD_DIR = 'training_data'
     FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
-    JSON_TRAINING_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'taskmaster_dialogues.json')
+    JSON_TRAINING_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'taskmaster_only.json')
     CACHE_FILE = os.path.join(CACHE_DIR, 'query_embeddings_cache.pkl')
     TF_RECORD_PATH = os.path.join(TF_RECORD_DIR, 'training_data_3.tfrecord')
 
-    # Decide whether to load the **custom** model or base DistilBERT (Base used for first iteration).
-    # True for custom, False for base DistilBERT.
-    LOAD_CUSTOM_MODEL = True
-    NUM_NEG_SAMPLES = 10
-
     # Ensure output directories exist
     os.makedirs(MODELS_DIR, exist_ok=True)
     os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
@@ -40,7 +33,7 @@ def main():
     os.makedirs(FAISS_INDICES_DIR, exist_ok=True)
     os.makedirs(TF_RECORD_DIR, exist_ok=True)
 
-    # Init config
+    # Load ChatbotConfig
     config_json = Path(MODELS_DIR) / "config.json"
     if config_json.exists():
         with open(config_json, "r", encoding="utf-8") as f:
@@ -50,187 +43,77 @@ def main():
     else:
         config = ChatbotConfig()
         logger.warning("No config.json found. Using default ChatbotConfig.")
+        try:
+            with open(config_json, "w", encoding="utf-8") as f:
+                json.dump(config.to_dict(), f, indent=2)
+            logger.info(f"Default ChatbotConfig saved to {config_json}")
+        except Exception as e:
+            logger.error(f"Failed to save default ChatbotConfig: {e}")
+            raise
 
-    # Ensure negative samples are set
-    config.neg_samples = NUM_NEG_SAMPLES
-
-    # Load or init tokenizer
-    try:
-        if Path(TOKENIZER_DIR).exists() and list(Path(TOKENIZER_DIR).iterdir()):
-            logger.info(f"Loading tokenizer from {TOKENIZER_DIR}")
-            tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
-        else:
-            logger.info(f"Loading base tokenizer for {config.pretrained_model}")
-            tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
-
-            Path(TOKENIZER_DIR).mkdir(parents=True, exist_ok=True)
-            tokenizer.save_pretrained(TOKENIZER_DIR)
-            logger.info(f"New tokenizer saved to {TOKENIZER_DIR}")
-    except Exception as e:
-        logger.error(f"Failed to load or create tokenizer: {e}")
-        sys.exit(1)
-
-    # Init the encoder
-    try:
-        encoder = EncoderModel(config=config)
-        logger.info("EncoderModel initialized successfully.")
-
-        if LOAD_CUSTOM_MODEL:
-            # Load the DistilBERT submodule from 'shared_encoder'
-            shared_encoder_path = Path(MODELS_DIR) / "shared_encoder"
-            if shared_encoder_path.exists():
-                logger.info(f"Loading DistilBERT submodule from {shared_encoder_path}")
-                encoder.pretrained = TFAutoModel.from_pretrained(shared_encoder_path)
-            else:
-                logger.warning(f"No shared_encoder found at {shared_encoder_path}, using base DistilBERT instead.")
-
-            # Load custom .weights.h5 (projection, dropout, etc.)
-            custom_weights_path = Path(MODELS_DIR) / "encoder_custom_weights.weights.h5"
-            if custom_weights_path.exists():
-                logger.info(f"Loading custom top-level weights from {custom_weights_path}")
-
-                # Dummy forward pass forces model build to ensure all layers are built
-                dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
-                _ = encoder(dummy_input, training=False)
-
-                encoder.load_weights(str(custom_weights_path))
-                logger.info("Custom encoder weights loaded successfully.")
-            else:
-                logger.warning(f"Custom weights file not found at {custom_weights_path}. Using only submodule weights.")
-        else:
-            # Base DistilBERT with special tokens
-            logger.info("Using the base DistilBERT without loading custom weights.")
-
-        # Resize token embeddings in case we added special tokens (EncoderModel class)
-        encoder.pretrained.resize_token_embeddings(len(tokenizer))
-        logger.info(f"Token embeddings resized to: {len(tokenizer)}")
-
-    except Exception as e:
-        logger.error(f"Failed to initialize EncoderModel: {e}")
-        sys.exit(1)
-
-    # Load JSON dialogues
-    try:
-        if not Path(JSON_TRAINING_DATA_PATH).exists():
-            logger.warning(f"No dialogues found at {JSON_TRAINING_DATA_PATH}, skipping.")
-            dialogues = []
-        else:
-            dialogues = TFDataPipeline.load_json_training_data(JSON_TRAINING_DATA_PATH, debug_samples=None)
-            logger.info(f"Loaded {len(dialogues)} dialogues from {JSON_TRAINING_DATA_PATH}.")
-    except Exception as e:
-        logger.error(f"Failed to load dialogues: {e}")
-        sys.exit(1)
-
-    # Load or init query_embeddings_cache. NOTE: recompute after each training. This was a bug source.
+    # Init SentenceTransformer
+    encoder = SentenceTransformer(config.pretrained_model)
+    logger.info(f"Initialized SentenceTransformer model: {config.pretrained_model}")
+
+    # Load dialogues
+    if Path(JSON_TRAINING_DATA_PATH).exists():
+        dialogues = TFDataPipeline.load_json_training_data(JSON_TRAINING_DATA_PATH)
+        logger.info(f"Loaded {len(dialogues)} dialogues.")
+    else:
+        logger.warning(f"No dialogues found at {JSON_TRAINING_DATA_PATH}.")
+        dialogues = []
+
+    # Load or init query embeddings cache
     query_embeddings_cache = {}
     if os.path.exists(CACHE_FILE):
-        try:
-            with open(CACHE_FILE, 'rb') as f:
-                query_embeddings_cache = pickle.load(f)
-            logger.info(f"Loaded {len(query_embeddings_cache)} query embeddings from {CACHE_FILE}.")
-        except Exception as e:
-            logger.warning(f"Failed to load query embeddings cache: {e}")
+        with open(CACHE_FILE, 'rb') as f:
+            query_embeddings_cache = pickle.load(f)
+        logger.info(f"Loaded query embeddings cache with {len(query_embeddings_cache)} entries.")
     else:
         logger.info("No existing query embeddings cache found. Starting fresh.")
 
-    # Initialize TFDataPipeline
-    try:
-        # Load or init FAISS index
-        if Path(FAISS_INDEX_PRODUCTION_PATH).exists():
-            logger.info(f"Loading existing FAISS index from {FAISS_INDEX_PRODUCTION_PATH}...")
-            faiss_index = faiss.read_index(FAISS_INDEX_PRODUCTION_PATH)
-            logger.info("FAISS index loaded successfully.")
-        else:
-            logger.info("No existing FAISS index found. Initializing a new index.")
-            dimension = config.embedding_dim  # Ensure this matches your encoder's output
-            faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product for cosine similarity
-            logger.info(f"Initialized new FAISS index with dimension {dimension}.")
-
-        # Init TFDataPipeline with the FAISS index
-        data_pipeline = TFDataPipeline(
-            config=config,
-            tokenizer=tokenizer,
-            encoder=encoder,
-            index_file_path=FAISS_INDEX_PRODUCTION_PATH,
-            response_pool=[],
-            max_length=config.max_context_token_limit,
-            neg_samples=config.neg_samples,
-            query_embeddings_cache=query_embeddings_cache,
-            index_type='IndexFlatIP',
-            nlist=100,  # Not used for IndexFlatIP. Retained for future use of IndexIVFFlat
-            max_retries=config.max_retries
-        )
-        logger.info("TFDataPipeline initialized successfully.")
-    except Exception as e:
-        logger.error(f"Failed to initialize TFDataPipeline: {e}")
-        sys.exit(1)
-
-    # Collect response pool from dialogues
-    try:
-        if dialogues:
-            response_pool = data_pipeline.collect_responses_with_domain(dialogues)
-            data_pipeline.response_pool = response_pool
-            logger.info(f"Collected {len(response_pool)} unique assistant responses from dialogues.")
-        else:
-            logger.warning("No dialogues loaded. response_pool remains empty.")
-    except Exception as e:
-        logger.error(f"Failed to collect responses: {e}")
-        sys.exit(1)
-
-    # Build FAISS index with response embeddings
-    try:
-        if data_pipeline.response_pool:
-            data_pipeline.build_text_to_domain_map()
-            logger.info("Computing and adding response embeddings to FAISS index using TFDataPipeline...")
-            data_pipeline.compute_and_index_response_embeddings()
-            logger.info("Response embeddings computed and added to FAISS index.")
-
-            # Save the FAISS index
-            data_pipeline.save_faiss_index(FAISS_INDEX_PRODUCTION_PATH)
-
-            # Also save response pool JSON
-            response_pool_path = FAISS_INDEX_PRODUCTION_PATH.replace('.index', '_responses.json')
-            with open(response_pool_path, 'w', encoding='utf-8') as f:
-                json.dump(data_pipeline.response_pool, f, indent=2)
-            logger.info(f"Response pool saved to {response_pool_path}.")
-        else:
-            logger.warning("No responses to embed. Skipping FAISS indexing.")
-
-    except Exception as e:
-        logger.error(f"Failed to compute or add response embeddings: {e}")
-        sys.exit(1)
-
-    # Prepare training data as TFRecords (TensorFlow Record format)
-    try:
-        if dialogues:
-            logger.info("Starting data preparation and saving as TFRecord...")
-            data_pipeline.prepare_and_save_data(dialogues, TF_RECORD_PATH)
-            logger.info(f"Data saved as TFRecord at {TF_RECORD_PATH}.")
-        else:
-            logger.warning("No dialogues to build TFRecord from. Skipping TFRecord creation.")
-    except Exception as e:
-        logger.error(f"Failed during data preparation and saving: {e}")
-        sys.exit(1)
+    # Init FAISS index
+    dimension = encoder.get_sentence_embedding_dimension()
+    if Path(FAISS_INDEX_PRODUCTION_PATH).exists():
+        faiss_index = faiss.read_index(FAISS_INDEX_PRODUCTION_PATH)
+        logger.info(f"Loaded FAISS index from {FAISS_INDEX_PRODUCTION_PATH}.")
+    else:
+        faiss_index = faiss.IndexFlatIP(dimension)
+        logger.info(f"Initialized new FAISS index with dimension {dimension}.")
+
+    # Init TFDataPipeline
+    data_pipeline = TFDataPipeline(
+        config=config,
+        tokenizer=encoder.tokenizer,
+        encoder=encoder,
+        response_pool=[],
+        query_embeddings_cache=query_embeddings_cache,
+        index_type='IndexFlatIP',
+        faiss_index_file_path=FAISS_INDEX_PRODUCTION_PATH
+    )
+
+    # Collect and embed responses
+    if dialogues:
+        response_pool = data_pipeline.collect_responses_with_domain(dialogues)
+        data_pipeline.response_pool = response_pool
+
+        # Save the response pool
+        response_pool_path = FAISS_INDEX_PRODUCTION_PATH.replace('.index', '_responses.json')
+        with open(response_pool_path, 'w', encoding='utf-8') as f:
+            json.dump(response_pool, f, indent=2)
+        logger.info(f"Response pool saved to {response_pool_path}.")
+        data_pipeline.compute_and_index_response_embeddings()
+        data_pipeline.save_faiss_index(FAISS_INDEX_PRODUCTION_PATH)
+        logger.info(f"FAISS index saved at {FAISS_INDEX_PRODUCTION_PATH}.")
+    else:
+        logger.warning("No responses to embed. Skipping FAISS indexing.")
 
     # Save query embeddings cache
-    try:
-        with open(CACHE_FILE, 'wb') as f:
-            pickle.dump(data_pipeline.query_embeddings_cache, f)
-        logger.info(f"Saved {len(data_pipeline.query_embeddings_cache)} query embeddings to {CACHE_FILE}.")
-    except Exception as e:
-        logger.error(f"Failed to save query embeddings cache: {e}")
-        sys.exit(1)
-
-    # Save Tokenizer
-    try:
-        tokenizer.save_pretrained(TOKENIZER_DIR)
-        logger.info(f"Tokenizer saved to {TOKENIZER_DIR}.")
-    except Exception as e:
-        logger.error(f"Failed to save tokenizer: {e}")
-        sys.exit(1)
-
-    logger.info("Data preparation pipeline completed successfully.")
+    with open(CACHE_FILE, 'wb') as f:
+        pickle.dump(query_embeddings_cache, f)
+    logger.info(f"Query embeddings cache saved at {CACHE_FILE}.")
+
+    logger.info("Pipeline completed successfully.")
 
 if __name__ == "__main__":
     main()
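The new prepare_data.py flow reduces to: encode the response pool with the SentenceTransformer, then add the vectors to an inner-product FAISS index. A minimal self-contained sketch of that core step (the two response strings are invented; note that embeddings must be L2-normalized for inner product to behave as cosine similarity, which IndexFlatIP relies on here):

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
responses = [
    "I found three pizza places near you.",
    "Your table is booked for 7 pm.",
]

# normalize_embeddings=True makes inner product equal cosine similarity.
emb = model.encode(responses, normalize_embeddings=True)

index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())  # 384 for MiniLM-L6
index.add(np.asarray(emb, dtype=np.float32))

query = model.encode(["any good pizza nearby?"], normalize_embeddings=True)
scores, ids = index.search(np.asarray(query, dtype=np.float32), k=2)
print(responses[ids[0][0]], scores[0][0])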
run_chatbot_chat.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import json
-from chatbot_model import ChatbotConfig, RetrievalChatbot
+from chatbot_model import RetrievalChatbot
+from chatbot_config import ChatbotConfig
 from response_quality_checker import ResponseQualityChecker
 from environment_setup import EnvironmentSetup
 from logger_config import config_logger
run_chatbot_validation.py CHANGED
@@ -1,24 +1,27 @@
 import os
 import json
-from chatbot_model import ChatbotConfig, RetrievalChatbot
+from sentence_transformers import SentenceTransformer
+from chatbot_config import ChatbotConfig
+from chatbot_model import RetrievalChatbot
 from response_quality_checker import ResponseQualityChecker
 from chatbot_validator import ChatbotValidator
 from plotter import Plotter
 from environment_setup import EnvironmentSetup
 from logger_config import config_logger
+from tf_data_pipeline import TFDataPipeline
 
 logger = config_logger(__name__)
 
 def run_chatbot_validation():
     # Initialize environment
     env = EnvironmentSetup()
     env.initialize()
 
     MODEL_DIR = "models"
     FAISS_INDICES_DIR = os.path.join(MODEL_DIR, "faiss_indices")
     FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_production.index")
     FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_test.index")
 
     # Toggle 'production' or 'test' env
     ENVIRONMENT = "production"
     if ENVIRONMENT == "test":
@@ -27,7 +30,7 @@ def run_chatbot_validation():
     else:
         FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
         RESPONSE_POOL_PATH = FAISS_INDEX_PRODUCTION_PATH.replace(".index", "_responses.json")
 
     # Load the config
     config_path = os.path.join(MODEL_DIR, "config.json")
     if os.path.exists(config_path):
@@ -38,55 +41,62 @@ def run_chatbot_validation():
     else:
         config = ChatbotConfig()
         logger.warning("No config.json found. Using default ChatbotConfig.")
 
-    # Load RetrievalChatbot in 'inference' mode
+    # Init SentenceTransformer
     try:
-        chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
-        logger.info("RetrievalChatbot loaded in 'inference' mode successfully.")
+        model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Replace with your chosen model
+        encoder = SentenceTransformer(model_name)
+        logger.info(f"Loaded SentenceTransformer model: {model_name}")
     except Exception as e:
-        logger.error(f"Failed to load RetrievalChatbot: {e}")
-        return
-
-    # Confirm FAISS index & response pool exist
-    if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
-        logger.error("FAISS index or response pool file is missing.")
+        logger.error(f"Failed to load SentenceTransformer: {e}")
         return
 
     # Load FAISS index and response pool
     try:
-        chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
+        # Initialize TFDataPipeline
+        data_pipeline = TFDataPipeline(
+            config=config,
+            tokenizer=encoder.tokenizer,
+            encoder=encoder,
+            response_pool=[],
+            query_embeddings_cache={},
+            index_type='IndexFlatIP',
+            faiss_index_file_path=FAISS_INDEX_PATH
+        )
+
+        if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
+            logger.error("FAISS index or response pool file is missing.")
+            return
+
+        data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
         logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
-        logger.info(f"FAISS dimensions: {chatbot.data_pipeline.index.d}")
-        logger.info(f"FAISS index type: {type(chatbot.data_pipeline.index)}")
-        logger.info(f"FAISS index total vectors: {chatbot.data_pipeline.index.ntotal}")
-        logger.info(f"FAISS is_trained: {chatbot.data_pipeline.index.is_trained}")
-
+
         with open(RESPONSE_POOL_PATH, "r", encoding="utf-8") as f:
-            chatbot.data_pipeline.response_pool = json.load(f)
+            data_pipeline.response_pool = json.load(f)
         logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
-        logger.info(f"\nTotal responses in pool: {len(chatbot.data_pipeline.response_pool)}")
-
+        logger.info(f"Total responses in pool: {len(data_pipeline.response_pool)}")
+
         # Validate dimension consistency
-        chatbot.data_pipeline.validate_faiss_index()
+        data_pipeline.validate_faiss_index()
         logger.info("FAISS index and response pool validated successfully.")
-
     except Exception as e:
         logger.error(f"Failed to load or validate FAISS index: {e}")
         return
 
     # Init QualityChecker and Validator
-    quality_checker = ResponseQualityChecker(data_pipeline=chatbot.data_pipeline)
-    validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
-    logger.info("ResponseQualityChecker and ChatbotValidator initialized.")
-
-    # Run validation
     try:
+        chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
+        quality_checker = ResponseQualityChecker(data_pipeline=data_pipeline)
+        validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
+        logger.info("ResponseQualityChecker and ChatbotValidator initialized.")
+
+        # Run validation
         validation_metrics = validator.run_validation(num_examples=5)
         logger.info(f"Validation Metrics: {validation_metrics}")
     except Exception as e:
         logger.error(f"Validation process failed: {e}")
         return
 
     # Plot metrics
     try:
         plotter = Plotter(save_dir=env.training_dirs["plots"])
@@ -94,10 +104,22 @@ def run_chatbot_validation():
         logger.info("Validation metrics plotted successfully.")
     except Exception as e:
         logger.error(f"Failed to plot validation metrics: {e}")
 
     # Run interactive chat loop
-    logger.info("\nStarting interactive chat session...")
-    chatbot.run_interactive_chat(quality_checker, show_alternatives=True)
+    try:
+        logger.info("\nStarting interactive chat session...")
+        while True:
+            user_input = input("You: ")
+            if user_input.lower() in ["exit", "quit"]:
+                logger.info("Exiting chat session.")
+                break
+
+            responses = data_pipeline.retrieve_responses(user_input, top_k=3)
+            print("Top Responses:")
+            for i, (response, score) in enumerate(responses, start=1):
+                print(f"{i}. {response} (Score: {score:.4f})")
+    except KeyboardInterrupt:
+        logger.info("Interactive chat session interrupted by user.")
 
 if __name__ == "__main__":
     run_chatbot_validation()
run_taskmaster_processor.py CHANGED
@@ -5,7 +5,7 @@ from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig
 
 def main():
     # Setup config and processor
-    base_dir = "datasets/taskmaster"
+    base_dir = "raw_datasets/taskmaster"
     config = RawDataProcessingConfig(
         debug=True,
         max_length=512,
taskmaster_processor.py CHANGED
@@ -4,6 +4,9 @@ import json
4
  from pathlib import Path
5
  from typing import List, Dict, Optional, Any
6
  from dataclasses import dataclass, field
 
 
 
7
 
8
  @dataclass
9
  class TaskmasterDialogue:
@@ -28,7 +31,7 @@ class RawDataProcessingConfig:
28
  self,
29
  debug: bool = True,
30
  max_length: int = 512,
31
- min_turns: int = 2,
32
  min_user_words: int = 3
33
  ):
34
  self.debug = debug
@@ -68,7 +71,7 @@ class TaskmasterProcessor:
68
  with open(ontology_path, 'r', encoding='utf-8') as f:
69
  ontology = json.load(f)
70
  if self.config.debug:
71
- print(f"[TaskmasterProcessor] Loaded ontology with {len(ontology.keys())} top-level keys (unused).")
72
 
73
  dialogues: List[TaskmasterDialogue] = []
74
 
@@ -106,7 +109,7 @@ class TaskmasterProcessor:
106
  break
107
 
108
  if self.config.debug:
109
- print(f"[TaskmasterProcessor] Loaded {len(dialogues)} total dialogues from Taskmaster-1.")
110
  return dialogues
111
 
112
  def _extract_domain(self, scenario: str, turns: List[Dict[str, str]]) -> str:
@@ -130,43 +133,15 @@ class TaskmasterProcessor:
130
 
131
  for domain, pattern in domain_patterns.items():
132
  if re.search(pattern, combined_text):
133
- # Optional: print if debug
134
  if self.config.debug:
135
- print(f"Matched domain: {domain} in scenario/turns")
136
  return domain
137
 
138
  if self.config.debug:
139
- print("No domain match, returning 'other'")
140
  return 'other'
141
 
142
- def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
143
- """
144
- Convert "utterances" to a cleaned List -> (speaker, text).
145
- Skip lines that are numeric, too short, or empty.
146
- """
147
- cleaned_turns = []
148
- for utt in utterances:
149
- speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
150
- raw_text = utt.get('text', '').strip()
151
-
152
- # Text cleaning
153
- text = self._clean_text(raw_text)
154
-
155
- # Skip blank or numeric lines (e.g. "4 3 13")
156
- if not text or self._is_numeric_line(text):
157
- continue
158
-
159
- # Skip too short (no training benefit from 1-word user turns). E.g. "ok","yes", etc.
160
- if len(text.split()) < 3:
161
- continue
162
-
163
- # Add to cleaned turns
164
- cleaned_turns.append({
165
- 'speaker': speaker,
166
- 'text': text
167
- })
168
- return cleaned_turns
169
-
170
  def _clean_text(self, text: str) -> str:
171
  """
172
  Simple text normalization
@@ -193,13 +168,20 @@ class TaskmasterProcessor:
193
  "turns": [ {"speaker": "user", "text": "..."}, ... ]
194
  }
195
  """
 
 
 
 
196
  results = []
 
197
  for dlg in dialogues:
198
  if not dlg.validate():
 
199
  continue
200
 
201
  # Skip if too few turns
202
  if len(dlg.turns) < self.config.min_turns:
 
203
  continue
204
 
205
  # Skip if any user turn is too short
@@ -208,6 +190,7 @@ class TaskmasterProcessor:
208
  if turn['speaker'] == 'user':
209
  words_count = len(turn['text'].split())
210
  if words_count < self.config.min_user_words:
 
211
  keep = False
212
  break
213
 
@@ -217,10 +200,59 @@ class TaskmasterProcessor:
217
  pipeline_dlg = {
218
  'dialogue_id': dlg.conversation_id,
219
  'domain': dlg.domain,
220
- 'turns': dlg.turns # already cleaned
221
  }
222
  results.append(pipeline_dlg)
223
 
224
  if self.config.debug:
225
- print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues after cleaning.")
 
 
 
 
 
 
 
226
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  from pathlib import Path
  from typing import List, Dict, Optional, Any
  from dataclasses import dataclass, field
+ from logger_config import config_logger
+
+ logger = config_logger(__name__)
 
  @dataclass
  class TaskmasterDialogue:

  self,
  debug: bool = True,
  max_length: int = 512,
+ min_turns: int = 4,
  min_user_words: int = 3
  ):
  self.debug = debug

  with open(ontology_path, 'r', encoding='utf-8') as f:
  ontology = json.load(f)
  if self.config.debug:
+ logger.info(f"[TaskmasterProcessor] Loaded ontology with {len(ontology.keys())} top-level keys (unused).")
 
  dialogues: List[TaskmasterDialogue] = []

  break
 
  if self.config.debug:
+ logger.info(f"[TaskmasterProcessor] Loaded {len(dialogues)} total dialogues from Taskmaster-1.")
  return dialogues
 
  def _extract_domain(self, scenario: str, turns: List[Dict[str, str]]) -> str:

  for domain, pattern in domain_patterns.items():
  if re.search(pattern, combined_text):
+ # Optional: logger.info if debug
  if self.config.debug:
+ logger.info(f"Matched domain: {domain} in scenario/turns")
  return domain
 
  if self.config.debug:
+ logger.info("No domain match, returning 'other'")
  return 'other'

  def _clean_text(self, text: str) -> str:
  """
  Simple text normalization

  "turns": [ {"speaker": "user", "text": "..."}, ... ]
  }
  """
+ total = len(dialogues)
+ invalid = 0
+ too_few_turns = 0
+ short_user_turns = 0
  results = []
+
  for dlg in dialogues:
  if not dlg.validate():
+ invalid += 1
  continue
 
  # Skip if too few turns
  if len(dlg.turns) < self.config.min_turns:
+ too_few_turns += 1
  continue
 
  # Skip if any user turn is too short

  if turn['speaker'] == 'user':
  words_count = len(turn['text'].split())
  if words_count < self.config.min_user_words:
+ short_user_turns += 1
  keep = False
  break

  pipeline_dlg = {
  'dialogue_id': dlg.conversation_id,
  'domain': dlg.domain,
+ 'turns': dlg.turns
  }
  results.append(pipeline_dlg)
 
  if self.config.debug:
+ logger.info(f"\nFiltering Statistics:")
+ logger.info(f"Total dialogues: {total}")
+ logger.info(f"Invalid dialogues: {invalid}")
+ logger.info(f"Too few turns: {too_few_turns}")
+ logger.info(f"Short user turns: {short_user_turns}")
+ logger.info(f"Remaining dialogues: {len(results)}")
+ logger.info(f"Filtering rate: {((total - len(results)) / total) * 100:.1f}%\n")
+
  return results
+
+ def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+ """Added logging to track utterance filtering"""
+ total = len(utterances)
+ empty = 0
+ numeric = 0
+ too_short = 0
+ cleaned_turns = []
+
+ for utt in utterances:
+ speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
+ raw_text = utt.get('text', '').strip()
+
+ text = self._clean_text(raw_text)
+
+ if not text:
+ empty += 1
+ continue
+
+ if self._is_numeric_line(text):
+ numeric += 1
+ continue
+
+ if len(text.split()) < 3:
+ too_short += 1
+ continue
+
+ cleaned_turns.append({
+ 'speaker': speaker,
+ 'text': text
+ })
+
+ if self.config.debug and total > 0:
+ logger.info(f"\nUtterance Cleaning Statistics (Dialogue {utterances[0].get('conversation_id', 'unknown')}):")
+ logger.info(f"Total utterances: {total}")
+ logger.info(f"Empty/blank: {empty}")
+ logger.info(f"Numeric only: {numeric}")
+ logger.info(f"Too short (<3 words): {too_short}")
+ logger.info(f"Remaining turns: {len(cleaned_turns)}")
+ logger.info(f"Filtering rate: {((total - len(cleaned_turns)) / total) * 100:.1f}%\n")
+
+ return cleaned_turns
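
Review note: the thresholds introduced above (min_turns=4, min_user_words=3) are easy to sanity-check in isolation. Below is a minimal sketch of the same skip logic, assuming the dict-based turn format used in this file; the helper name and toy dialogue are hypothetical, not part of the commit.

# Hypothetical helper mirroring the two skip conditions in process_taskmaster_dialogues.
from typing import Dict, List

MIN_TURNS = 4        # matches the min_turns default added in this commit
MIN_USER_WORDS = 3   # matches the min_user_words default

def keep_dialogue(turns: List[Dict[str, str]]) -> bool:
    if len(turns) < MIN_TURNS:
        return False  # skipped: too few turns
    for turn in turns:
        if turn['speaker'] == 'user' and len(turn['text'].split()) < MIN_USER_WORDS:
            return False  # skipped: a user turn is too short
    return True

toy = [
    {'speaker': 'user', 'text': 'book a table tonight'},
    {'speaker': 'assistant', 'text': 'For how many people?'},
    {'speaker': 'user', 'text': 'two'},  # one word, below MIN_USER_WORDS
    {'speaker': 'assistant', 'text': 'Done.'},
]
print(keep_dialogue(toy))  # False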
tf_data_pipeline.py CHANGED
@@ -11,6 +11,8 @@ import json
  from pathlib import Path
  from typing import Union, Optional, Dict, List, Tuple, Generator
  from transformers import AutoTokenizer
+ from sentence_transformers import SentenceTransformer
+ from chatbot_config import ChatbotConfig
  from typing import List, Tuple, Generator
  from transformers import AutoTokenizer
  import random
@@ -21,26 +23,30 @@ logger = config_logger(__name__)
  class TFDataPipeline:
  def __init__(
  self,
- config,
- tokenizer,
- encoder,
+ config: ChatbotConfig,
+ tokenizer: AutoTokenizer,
+ encoder: SentenceTransformer,
  response_pool: List[str],
  query_embeddings_cache: dict,
+ model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
  max_length: int = 512,
  neg_samples: int = 10,
  index_type: str = 'IndexFlatIP',
  faiss_index_file_path: str = 'models/faiss_indices/faiss_index_production.index',
+ dimension: int = 384,
  nlist: int = 100,
  max_retries: int = 3
  ):
  self.config = config
  self.tokenizer = tokenizer
  self.encoder = encoder
+ self.model = SentenceTransformer(model_name)
  self.faiss_index_file_path = faiss_index_file_path
  self.response_pool = response_pool
  self.max_length = max_length
  self.neg_samples = neg_samples
  self.query_embeddings_cache = query_embeddings_cache # In-memory cache for embeddings
+ self.dimension = config.embedding_dim
  self.index_type = index_type
  self.nlist = nlist
  self.embedding_batch_size = 16 if len(response_pool) < 100 else 64
@@ -59,9 +65,8 @@ class TFDataPipeline:
  self.validate_faiss_index()
  logger.info("FAISS index loaded and validated successfully.")
  else:
- dimension = self.encoder.config.embedding_dim
- self.index = faiss.IndexFlatIP(dimension)
- logger.info(f"Initialized FAISS IndexFlatIP with dimension {dimension}.")
+ self.index = faiss.IndexFlatIP(self.dimension)
+ logger.info(f"Initialized FAISS IndexFlatIP with dimension {self.dimension}.")
 
  if not self.index.is_trained:
  # Train the index if it's not trained. IndexFlatIP doesn't need training, but others do (Future switch to IndexIVFFlat)
@@ -98,7 +103,7 @@
 
  def validate_faiss_index(self):
  """Validates FAISS index dimensionality."""
- expected_dim = self.encoder.config.embedding_dim
+ expected_dim = self.dimension
  if self.index.d != expected_dim:
  logger.error(f"FAISS index dimension {self.index.d} does not match encoder embedding dimension {expected_dim}.")
  raise ValueError("FAISS index dimensionality mismatch.")
@@ -186,44 +191,49 @@
  pairs.append((query, positive))
 
  return pairs
-
+
  def compute_and_index_response_embeddings(self):
  """
- Compute embeddings for the response pool and add them to the FAISS index.
- self.response_pool: List[Dict[str, str]] with keys "domain" and "text".
+ Compute embeddings for the response pool using SentenceTransformer
+ and add them to the FAISS index.
  """
- logger.info("Computing embeddings for the response pool...")
+ if not self.response_pool:
+ logger.warning("Response pool is empty. No embeddings to compute.")
+ return
 
- # Extract the assistant text
+ logger.info("Computing embeddings for the response pool...")
  texts = [resp["text"] for resp in self.response_pool]
  logger.debug(f"Total texts to embed: {len(texts)}")
 
- batch_size = getattr(self, 'embedding_batch_size', 64)
  embeddings = []
+ batch_size = self.embedding_batch_size
 
+ # Use SentenceTransformer to compute embeddings in batches
  with tqdm(total=len(texts), desc="Computing Embeddings", unit="response") as pbar:
  for i in range(0, len(texts), batch_size):
- batch_texts = texts[i:i+batch_size]
- encodings = self.tokenizer(
+ batch_texts = texts[i:i + batch_size]
+
+ # Compute embeddings
+ batch_embeddings = self.encoder.encode(
  batch_texts,
- padding=True,
- truncation=True,
- max_length=self.max_length,
- return_tensors='tf'
+ batch_size=batch_size,
+ convert_to_numpy=True,
+ normalize_embeddings=True # Normalizes for cosine similarity
  )
- batch_embeds = self.encoder(encodings['input_ids'], training=False).numpy()
 
- embeddings.append(batch_embeds)
+ embeddings.append(batch_embeddings)
  pbar.update(len(batch_texts))
 
- # Combine embeddings and add to FAISS
+ # Combine all embeddings
  all_embeddings = np.vstack(embeddings).astype(np.float32)
  logger.info(f"Adding {len(all_embeddings)} response embeddings to FAISS index...")
+
+ # Add to FAISS index
  self.index.add(all_embeddings)
 
  # Store in memory
  self.response_embeddings = all_embeddings
- logger.info(f"FAISS index now has {self.index.ntotal} vectors.")
+ logger.info(f"FAISS index now contains {self.index.ntotal} vectors.")
 
  def _find_hard_negatives(self, queries: List[str], positives: List[str], batch_size: int = 128) -> List[List[str]]:
  """
@@ -385,106 +395,41 @@
  self._text_domain_map[stripped_text] = domain
 
  logger.info(f"Built text -> domain map with {len(self._text_domain_map)} unique text entries.")
-
- def encode_query(
- self,
- query: str,
- context: Optional[List[Tuple[str, str]]] = None
- ) -> np.ndarray:
- """
- Encode a user query (and optional conversation context) into an embedding vector.
-
- Args:
- query: The user query.
- context: Optional conversation history as a list of (user_text, assistant_text).
- Returns:
- np.ndarray of shape [embedding_dim], typically L2-normalized already.
- """
- # Prepare context: concat user/assistant pairs
- if context:
- # Take the last N turns
- relevant_history = context[-self.config.max_context_turns:]
- context_str_parts = []
- for (u_text, a_text) in relevant_history:
- context_str_parts.append(
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {u_text} "
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {a_text}"
- )
- context_str = " ".join(context_str_parts)
-
- # Append the new query
- full_query = (
- f"{context_str} "
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
- )
- else:
- # Single user turn
- full_query = (
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
- )
 
- # Tokenize
- encodings = self.tokenizer(
- [full_query],
- padding='max_length',
- truncation=True,
- max_length=self.max_length,
- return_tensors='np' # to keep it compatible with FAISS
- )
- input_ids = encodings['input_ids']
-
- # Debug out-of-vocab IDs
- max_id = np.max(input_ids)
- vocab_size = len(self.tokenizer)
- if max_id >= vocab_size:
- logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
- raise ValueError("Token ID exceeds vocabulary size.")
-
- # Get embeddings from the model. These are already L2-normalized by the model's final layer.
- embeddings = self.encoder(input_ids, training=False).numpy()
-
- return embeddings[0]
+ def encode_query(self, query: str) -> np.ndarray:
+ """Generate embedding for a query string."""
+ return self.encoder.encode(query, convert_to_numpy=True)
 
  def encode_responses(
- self,
- responses: List[str],
+ self,
+ responses: List[str],
  context: Optional[List[Tuple[str, str]]] = None
  ) -> np.ndarray:
  """
- Encode multiple response texts into embedding vectors.
- Args:
- responses: List of assistant responses.
- context: Optional conversation context (last N turns).
- Returns:
- np.ndarray of shape [num_responses, embedding_dim].
+ Encode multiple response texts into embeddings, injecting <ASSISTANT> literally.
  """
- # Incorporate context into response encoding. Note: Undecided on benefit of this
+ USER_TOKEN = "<USER>"
+ ASSISTANT_TOKEN = "<ASSISTANT>"
+
  if context:
  relevant_history = context[-self.config.max_context_turns:]
  prepared = []
  for resp in responses:
  context_str_parts = []
+ # Build all user->assistant text
  for (u_text, a_text) in relevant_history:
  context_str_parts.append(
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {u_text} "
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {a_text}"
+ f"{USER_TOKEN} {u_text} {ASSISTANT_TOKEN} {a_text}"
  )
  context_str = " ".join(context_str_parts)
-
- # Treat resp as an assistant turn
- full_resp = (
- f"{context_str} "
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {resp}"
- )
+ # Treat resp as an assistant turn:
+ full_resp = f"{context_str} {ASSISTANT_TOKEN} {resp}"
  prepared.append(full_resp)
  else:
  # Single response from the assistant
- prepared = [
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
- for r in responses
- ]
+ prepared = [f"{ASSISTANT_TOKEN} {r}" for r in responses]
 
- # Tokenize
+ # Pass the prepared strings to the SentenceTransformer tokenizer:
  encodings = self.tokenizer(
  prepared,
  padding='max_length',
@@ -493,19 +438,42 @@
  return_tensors='np'
  )
  input_ids = encodings['input_ids']
-
+
  # Debug for out-of-vocab
  max_id = np.max(input_ids)
  vocab_size = len(self.tokenizer)
  if max_id >= vocab_size:
- logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
+ logger.error(f"Token ID {max_id} >= tokenizer vocab size {vocab_size}")
  raise ValueError("Token ID exceeds vocabulary size.")
 
- # Get embeddings from the model. These are already L2-normalized by the model's final layer.
- embeddings = self.encoder(input_ids, training=False).numpy()
+ # Get embeddings from SentenceTransformer
+ embeddings = self.encoder.encode(prepared, convert_to_numpy=True)
 
  return embeddings.astype('float32')
 
+ def retrieve_responses(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]:
+ """
+ Retrieve top-k responses for a query using FAISS.
+
+ Args:
+ query: User's query text.
+ top_k: Number of responses to return.
+
+ Returns:
+ List of tuples (response text, similarity score).
+ """
+ query_embedding = self.encode_query(query).reshape(1, -1).astype("float32")
+ distances, indices = self.index.search(query_embedding, top_k)
+
+ results = []
+ for idx, dist in zip(indices[0], distances[0]):
+ if idx < 0:
+ continue
+ response = self.response_pool[idx]
+ results.append((response["text"], dist))
+
+ return results
+
  def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
  """
  Batch-Process dialogues and save to TFRecord file.
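
Review note: pairing normalize_embeddings=True (in compute_and_index_response_embeddings) with IndexFlatIP is what makes inner-product search behave as cosine-similarity search, since dot(a, b) equals cos(a, b) for unit vectors. The new encode_query does not pass normalize_embeddings=True, so query-side scores are raw inner products unless the query vector is normalized as well. A self-contained sketch of the index behavior, assuming faiss and sentence-transformers are installed; the example sentences are illustrative only.

import faiss
from sentence_transformers import SentenceTransformer

# 384-dim model, matching ChatbotConfig.embedding_dim
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
texts = ["I'd like to book a table for two.", "What's the weather tomorrow?"]

# Unit-norm embeddings: inner product == cosine similarity
emb = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).astype('float32')
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

query = model.encode(["reserve a restaurant"], convert_to_numpy=True,
                     normalize_embeddings=True).astype('float32')
scores, ids = index.search(query, 2)
print(ids[0], scores[0])  # the booking sentence should rank first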
 
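
Review note: a usage sketch wiring this commit's pieces together end to end. Class, method, and config names follow the diff; the single-item response pool, the empty query_embeddings_cache, and the assumption that no saved FAISS index exists at the default path are illustrative, not part of the commit.

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from chatbot_config import ChatbotConfig
from tf_data_pipeline import TFDataPipeline

config = ChatbotConfig()
tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
encoder = SentenceTransformer(config.pretrained_model)

pipeline = TFDataPipeline(
    config=config,
    tokenizer=tokenizer,
    encoder=encoder,
    response_pool=[{"domain": "restaurant", "text": "Sure, I can book that table for you."}],
    query_embeddings_cache={},
)

# Embed the pool (normalized) and add it to the FAISS index, then search it.
pipeline.compute_and_index_response_embeddings()
for text, score in pipeline.retrieve_responses("book me a table", top_k=1):
    print(f"{score:.3f}  {text}")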