JoeArmani committed
Commit · d7fc7a7
Parent(s): c111c20

more structural updates

Browse files
- .gitignore +2 -0
- chatbot_model.py +145 -171
- chatbot_validator.py +1 -1
- {data_augmentation → data_augmentation_code}/augmentation_processing_pipeline.py +0 -0
- {data_augmentation → data_augmentation_code}/back_translator.py +0 -0
- {data_augmentation → data_augmentation_code}/dialogue_augmenter.py +0 -0
- {data_augmentation → data_augmentation_code}/main.py +0 -0
- {data_augmentation → data_augmentation_code}/paraphraser.py +0 -0
- {data_augmentation → data_augmentation_code}/pipeline_config.py +0 -0
- {data_augmentation → data_augmentation_code}/quality_metrics.py +0 -0
- {data_augmentation → data_augmentation_code}/schema_guided_dialogue_processor.py +0 -0
- {data_augmentation → data_augmentation_code}/taskmaster_processor.py +0 -0
- validate_model.py → run_chatbot_validation.py +7 -7
- tf_data_pipeline.py +16 -16
.gitignore CHANGED
@@ -187,3 +187,5 @@ new_iteration/cache/*
 new_iteration/data_prep_iterative_models/*
 new_iteration/training_data/*
 new_iteration/processed_outputs/*
+raw_datasets/*
+
chatbot_model.py CHANGED
@@ -24,25 +24,25 @@ logger = config_logger(__name__)
 
 @dataclass
 class ChatbotConfig:
-    """
+    """RetrievalChatbot Config"""
     max_context_token_limit: int = 512
     embedding_dim: int = 768
     encoder_units: int = 256
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
     l2_reg_weight: float = 0.001
-    learning_rate: float = 0.
+    learning_rate: float = 0.0005
     min_text_length: int = 3
-    max_context_turns: int =
+    max_context_turns: int = 20
     warmup_steps: int = 200
     pretrained_model: str = 'distilbert-base-uncased'
     cross_encoder_model: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
+    summarizer_model: str = 't5-small'
     dtype: str = 'float32'
     freeze_embeddings: bool = False
     embedding_batch_size: int = 64
     search_batch_size: int = 64
     max_batch_size: int = 64
-    neg_samples: int = 10
     max_retries: int = 3
 
     def to_dict(self) -> Dict:
@@ -57,7 +57,7 @@ class ChatbotConfig:
         if k in cls.__dataclass_fields__})
 
 class EncoderModel(tf.keras.Model):
-    """Dual encoder model with pretrained embeddings."""
+    """Dual encoder model with pretrained DistilBERT embeddings."""
     def __init__(
         self,
         config: ChatbotConfig,
@@ -71,7 +71,7 @@ class EncoderModel(tf.keras.Model):
         self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)
         self._freeze_layers()
 
-        # Add
+        # Add Global Average Pooling, Projection, Dropout, and Normalization layers
         self.pooler = tf.keras.layers.GlobalAveragePooling1D()
         self.projection = tf.keras.layers.Dense(
             config.embedding_dim,
@@ -86,7 +86,7 @@ class EncoderModel(tf.keras.Model):
         )
 
     def _freeze_layers(self):
-        """Freeze layers of the pretrained model
+        """Freeze n layers of the pretrained model"""
         if self.config.freeze_embeddings:
             self.pretrained.trainable = False
             logger.info("All pretrained layers frozen.")
@@ -95,29 +95,29 @@ class EncoderModel(tf.keras.Model):
             for i, layer in enumerate(self.pretrained.layers):
                 if isinstance(layer, tf.keras.layers.Layer):
                     if hasattr(layer, 'trainable'):
-                        # Freeze the first transformer block
                         if i < 1:
                             layer.trainable = False
                             logger.info(f"Layer {i} frozen.")
                         else:
                             layer.trainable = True
+                            logger.info(f"Layer {i} trainable.")
 
     def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
         """Forward pass."""
         # Get pretrained embeddings
         pretrained_outputs = self.pretrained(inputs, training=training)
-        x = pretrained_outputs.last_hidden_state
+        x = pretrained_outputs.last_hidden_state  # Shape: [batch_size, seq_len, embedding_dim]
 
         # Apply pooling, projection, dropout, and normalization
-        x = self.pooler(x)
-        x = self.projection(x)
+        x = self.pooler(x)      # Shape: [batch_size, 768]
+        x = self.projection(x)  # Shape: [batch_size, 768]
         x = self.dropout(x, training=training)
-        x = self.normalize(x)
+        x = self.normalize(x)   # Shape: [batch_size, 768]
 
         return x
 
     def get_config(self) -> dict:
-        """Return the config
+        """Return the model config"""
         config = super().get_config()
         config.update({
             "config": self.config.to_dict(),
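The hunk above pins down the encoder head: average-pool the token states, project, apply dropout, then normalize. A minimal standalone sketch of that head, assuming `self.normalize` is an L2-normalization step (the diff never shows how that layer is built), so that inner products between outputs equal cosine similarities:

```python
import tensorflow as tf

embedding_dim = 768

pooler = tf.keras.layers.GlobalAveragePooling1D()
projection = tf.keras.layers.Dense(embedding_dim)
dropout = tf.keras.layers.Dropout(0.2)

def encode(last_hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
    x = pooler(last_hidden_state)   # [batch, seq_len, dim] -> [batch, dim]
    x = projection(x)               # [batch, dim]
    x = dropout(x, training=training)
    # Assumption: L2-normalize so rows are unit vectors.
    return tf.math.l2_normalize(x, axis=-1)

# Toy usage: two "sentences" of 10 tokens each stand in for DistilBERT outputs.
toy = tf.random.normal((2, 10, embedding_dim))
emb = encode(toy)
print(tf.einsum('bd,cd->bc', emb, emb).numpy())  # cosine-similarity matrix
```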
@@ -126,7 +126,10 @@ class EncoderModel(tf.keras.Model):
         return config
 
 class RetrievalChatbot(DeviceAwareModel):
-    """
+    """
+    Retrieval-based learning chatbot model.
+    Uses trained embeddings and FAISS for similarity search.
+    """
     def __init__(
         self,
         config: ChatbotConfig,
@@ -142,7 +145,7 @@ class RetrievalChatbot(DeviceAwareModel):
         self.device = device or self._setup_default_device()
         self.mode = mode.lower()
 
-        # Initialize reranker, summarizer, tokenizer,
+        # Initialize reranker, summarizer, tokenizer, and encoder
         self.reranker = reranker or self._initialize_reranker()
         self.tokenizer = self._initialize_tokenizer()
         self.encoder = self._initialize_encoder()
@@ -154,14 +157,9 @@ class RetrievalChatbot(DeviceAwareModel):
             config=self.config,
             tokenizer=self.tokenizer,
             encoder=self.encoder,
-            index_file_path='new_iteration/data_prep_iterative_models/faiss_indices/faiss_index_production.index',
             response_pool=[],
             max_length=self.config.max_context_token_limit,
             query_embeddings_cache={},
-            neg_samples=self.config.neg_samples,
-            index_type='IndexFlatIP',
-            nlist=100,  # Not used with IndexFlatIP
-            max_retries=self.config.max_retries
         )
 
         # Collect unique responses from dialogues
@@ -197,7 +195,7 @@ class RetrievalChatbot(DeviceAwareModel):
         """Initialize the Summarizer."""
         return Summarizer(
             tokenizer=self.tokenizer,
-            model_name=
+            model_name=self.config.summarizer_model,
             max_summary_length=self.config.max_context_token_limit // 4,
             device=self.device,
             max_summary_rounds=2
@@ -229,17 +227,18 @@ class RetrievalChatbot(DeviceAwareModel):
         new_vocab_size = len(self.tokenizer)
         encoder.pretrained.resize_token_embeddings(new_vocab_size)
         logger.info(f"Token embeddings resized to: {new_vocab_size}")
+
         return encoder
 
     def _load_faiss_index_and_responses(self) -> None:
         """Load FAISS index and response pool for inference."""
         try:
-            logger.info(f"Loading FAISS index from {self.data_pipeline.
-            self.data_pipeline.load_faiss_index(self.data_pipeline.
+            logger.info(f"Loading FAISS index from {self.data_pipeline.faiss_index_file_path}...")
+            self.data_pipeline.load_faiss_index(self.data_pipeline.faiss_index_file_path)
             logger.info("FAISS index loaded successfully.")
 
-            # Load response pool
-            response_pool_path = self.data_pipeline.
+            # Load response pool
+            response_pool_path = self.data_pipeline.faiss_index_file_path.replace('.index', '_responses.json')
             if os.path.exists(response_pool_path):
                 with open(response_pool_path, 'r', encoding='utf-8') as f:
                     self.data_pipeline.response_pool = json.load(f)
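The hunk above establishes a naming convention: the response pool lives next to the index as `foo.index` → `foo_responses.json`. A small sketch of the same load step using only standard FAISS and JSON calls (the convention comes from the diff; everything else is illustrative):

```python
import json
import os

import faiss  # pip install faiss-cpu

def load_index_and_responses(index_path: str):
    """Load a FAISS index plus the JSON response pool stored alongside it."""
    index = faiss.read_index(index_path)
    # Naming convention from the diff: 'foo.index' -> 'foo_responses.json'.
    response_pool_path = index_path.replace('.index', '_responses.json')
    response_pool = []
    if os.path.exists(response_pool_path):
        with open(response_pool_path, 'r', encoding='utf-8') as f:
            response_pool = json.load(f)
    return index, response_pool
```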
@@ -263,29 +262,24 @@ class RetrievalChatbot(DeviceAwareModel):
         """
         load_dir = Path(load_dir)
 
-        #
+        # Load config
         with open(load_dir / "config.json", "r") as f:
             config = ChatbotConfig.from_dict(json.load(f))
 
-        #
+        # Initialize chatbot
         chatbot = cls(config, mode=mode)
 
-        #
-        chatbot.encoder.pretrained = TFAutoModel.from_pretrained(
-            load_dir / "shared_encoder",
-            config=config
-        )
+        # Load DistilBERT
+        chatbot.encoder.pretrained = TFAutoModel.from_pretrained(load_dir / "shared_encoder", config=config)
 
         dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
         _ = chatbot.encoder(dummy_input, training=False)
 
-        #
+        # Load tokenizer
         chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
         logger.info(f"Models and tokenizer loaded from {load_dir}")
 
-
-
-        # 5) Load the custom top layers' weights
+        # Load the custom weights
         custom_weights_path = load_dir / "encoder_custom_weights.weights.h5"
         if custom_weights_path.exists():
             chatbot.encoder.load_weights(str(custom_weights_path))
@@ -293,7 +287,7 @@ class RetrievalChatbot(DeviceAwareModel):
         else:
             logger.warning(f"No custom encoder weights found at {custom_weights_path}. The top-level projection layer won't have learned parameters.")
 
-        #
+        # Handle 'inference' mode: load FAISS, etc.
         if mode == 'inference':
             cls._prepare_model_for_inference(chatbot, load_dir)
 
@@ -301,7 +295,7 @@ class RetrievalChatbot(DeviceAwareModel):
 
     @classmethod
     def _prepare_model_for_inference(cls, chatbot: 'RetrievalChatbot', load_dir: Path) -> None:
-        """
+        """Load inference components."""
         try:
             # Load FAISS index
             faiss_path = load_dir / 'faiss_indices/faiss_index_production.index'
@@ -332,7 +326,7 @@ class RetrievalChatbot(DeviceAwareModel):
             raise
 
     def save_models(self, save_dir: Union[str, Path]):
-        """Save
+        """Save model and config"""
         save_dir = Path(save_dir)
         save_dir.mkdir(parents=True, exist_ok=True)
 
@@ -340,21 +334,13 @@ class RetrievalChatbot(DeviceAwareModel):
         with open(save_dir / "config.json", "w") as f:
             json.dump(self.config.to_dict(), f, indent=2)
 
-        # Save the HF DistilBERT submodule
+        # Save the HF DistilBERT submodule, custom top-level layers, and tokenizer
         self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
-
-        # ALSO save custom top-level layers' weights
         self.encoder.save_weights(save_dir / "encoder_custom_weights.weights.h5")
-
-        # Save tokenizer
        self.tokenizer.save_pretrained(save_dir / "tokenizer")
-
        logger.info(f"Models and tokenizer saved to {save_dir}.")
 
-    def
-        return 1 / (1 + np.exp(-x))
-
-    def retrieve_responses_cross_encoder(
+    def retrieve_responses(
         self,
         query: str,
         top_k: int = 10,
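The save/load split above keeps three artifacts per model directory: the HF submodule, the custom head weights, and the tokenizer. A minimal sketch of the same round trip under the assumption that the model exposes the diff's `.pretrained` attribute (directory and file names match the hunks; the dummy forward pass builds the Keras variables before loading head weights):

```python
from pathlib import Path

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

def save_all(encoder: tf.keras.Model, tokenizer, save_dir: Path) -> None:
    # Three artifacts, mirroring the diff.
    save_dir.mkdir(parents=True, exist_ok=True)
    encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
    encoder.save_weights(str(save_dir / "encoder_custom_weights.weights.h5"))
    tokenizer.save_pretrained(save_dir / "tokenizer")

def load_all(encoder: tf.keras.Model, load_dir: Path, max_len: int = 512):
    encoder.pretrained = TFAutoModel.from_pretrained(load_dir / "shared_encoder")
    # Build variables with a dummy forward pass before loading head weights,
    # as the diff's load_model does.
    _ = encoder(tf.zeros((1, max_len), dtype=tf.int32), training=False)
    encoder.load_weights(str(load_dir / "encoder_custom_weights.weights.h5"))
    tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
    return encoder, tokenizer
```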
@@ -363,20 +349,20 @@ class RetrievalChatbot(DeviceAwareModel):
         summarize_threshold: int = 512
     ) -> List[Tuple[str, float]]:
         """
-        Retrieve top-k responses
-        and cross-encoder re-ranking.
-
+        Retrieve top-k responses using FAISS and cross-encoder re-ranking.
         Args:
             query: The user's input text.
-            top_k: Number of
-            reranker: CrossEncoderReranker for refined scoring
-            summarizer: Summarizer for long queries
-            summarize_threshold: Summarize if
-
+            top_k: Number of FAISS results to return
+            reranker: CrossEncoderReranker for refined scoring
+            summarizer: Summarizer for long queries
+            summarize_threshold: Summarize if conversation tokens > threshold.
         Returns:
             List of (response_text, final_score).
         """
-
+        def sigmoid(x: float) -> float:
+            return 1 / (1 + np.exp(-x))
+
+        # Query summarization
         if summarizer and len(query.split()) > summarize_threshold:
             logger.info(f"Query is long ({len(query.split())} words). Summarizing.")
             query = summarizer.summarize_text(query)
@@ -393,17 +379,17 @@ class RetrievalChatbot(DeviceAwareModel):
 
         texts = [item[0] for item in faiss_candidates]
 
-        # Re-rank these boosted candidates
         if not reranker:
             reranker = CrossEncoderReranker(model_name=self.config.cross_encoder_model)
 
+        # Re-rank the texts (candidates) from FAISS search using the cross-encoder
         ce_logits = reranker.rerank(query, texts, max_length=256)
 
-        # Combine
+        # Combine scores from FAISS and cross-encoder
         final_candidates = []
         for (resp_text, faiss_score), logit in zip(faiss_candidates, ce_logits):
-            ce_prob =
-            faiss_norm = (faiss_score + 1)/2.0
+            ce_prob = sigmoid(logit)            # now in range [0...1]
+            faiss_norm = (faiss_score + 1)/2.0  # now in range [0...1]
             combined_score = 0.85 * ce_prob + 0.15 * faiss_norm
             length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
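The blend above maps both signals into [0, 1] before mixing. A standalone sketch of the same arithmetic (the 0.85 / 0.15 weights and both rescalings come straight from the hunk; the toy logits and FAISS scores are made up):

```python
import numpy as np

def sigmoid(x: float) -> float:
    return 1 / (1 + np.exp(-x))

def combine(ce_logit: float, faiss_score: float) -> float:
    """Blend cross-encoder and FAISS scores, both mapped to [0, 1] first.

    faiss_score is an inner product of unit vectors, so it lies in [-1, 1];
    (s + 1) / 2 rescales it to [0, 1].
    """
    ce_prob = sigmoid(ce_logit)
    faiss_norm = (faiss_score + 1) / 2.0
    return 0.85 * ce_prob + 0.15 * faiss_norm

# Toy candidates: (cross-encoder logit, FAISS cosine similarity)
for logit, cos in [(3.2, 0.71), (-0.4, 0.93), (1.0, 0.55)]:
    print(f"logit={logit:+.1f} cos={cos:.2f} -> {combine(logit, cos):.3f}")
```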
@@ -415,22 +401,22 @@ class RetrievalChatbot(DeviceAwareModel):
         # Return top_k
         return final_candidates[:top_k]
 
-    DOMAIN_KEYWORDS = {
-        'restaurant': ['restaurant', 'dining', 'food', 'dine', 'reservation', 'table', 'menu', 'cuisine', 'eat', 'place to eat', 'hungry', 'chef', 'dish', 'meal', 'brunch', 'bistro', 'buffet', 'catering', 'gourmet', 'fast food', 'fine dining', 'takeaway', 'delivery', 'restaurant booking'],
-        'movie': ['movie', 'cinema', 'film', 'ticket', 'showtime', 'showing', 'theater', 'flick', 'screening', 'film ticket', 'film show', 'blockbuster', 'premiere', 'trailer', 'director', 'actor', 'actress', 'plot', 'genre', 'screen', 'sequel', 'animation', 'documentary'],
-        'ride_share': ['ride', 'taxi', 'uber', 'lyft', 'car service', 'pickup', 'dropoff', 'driver', 'cab', 'hailing', 'rideshare', 'ride hailing', 'carpool', 'chauffeur', 'transit', 'transportation', 'hail ride'],
-        'coffee': ['coffee', 'café', 'cafe', 'starbucks', 'espresso', 'latte', 'mocha', 'americano', 'barista', 'brew', 'cappuccino', 'macchiato', 'iced coffee', 'cold brew', 'espresso machine', 'coffee shop', 'tea', 'chai', 'java', 'bean', 'roast', 'decaf'],
-        'pizza': ['pizza', 'delivery', 'order food', 'pepperoni', 'topping', 'pizzeria', 'slice', 'pie', 'margherita', 'deep dish', 'thin crust', 'cheese', 'oven', 'tossed', 'sauce', 'garlic bread', 'calzone'],
-        'auto': ['car', 'vehicle', 'repair', 'maintenance', 'mechanic', 'oil change', 'garage', 'auto shop', 'tire', 'check engine', 'battery', 'transmission', 'brake', 'engine diagnostics', 'carwash', 'detail', 'alignment', 'exhaust', 'spark plug', 'dashboard'],
-    }
-
     def extract_keywords(self, query: str) -> List[str]:
         """
         Return any domain keywords present in the query (lowercased).
         """
+        domain_keywords = {
+            'restaurant': ['restaurant', 'dining', 'food', 'dine', 'reservation', 'table', 'menu', 'cuisine', 'eat', 'place to eat', 'hungry', 'chef', 'dish', 'meal', 'brunch', 'bistro', 'buffet', 'catering', 'gourmet', 'fast food', 'fine dining', 'takeaway', 'delivery', 'restaurant booking'],
+            'movie': ['movie', 'cinema', 'film', 'ticket', 'showtime', 'showing', 'theater', 'flick', 'screening', 'film ticket', 'film show', 'blockbuster', 'premiere', 'trailer', 'director', 'actor', 'actress', 'plot', 'genre', 'screen', 'sequel', 'animation', 'documentary'],
+            'ride_share': ['ride', 'taxi', 'uber', 'lyft', 'car service', 'pickup', 'dropoff', 'driver', 'cab', 'hailing', 'rideshare', 'ride hailing', 'carpool', 'chauffeur', 'transit', 'transportation', 'hail ride'],
+            'coffee': ['coffee', 'café', 'cafe', 'starbucks', 'espresso', 'latte', 'mocha', 'americano', 'barista', 'brew', 'cappuccino', 'macchiato', 'iced coffee', 'cold brew', 'espresso machine', 'coffee shop', 'tea', 'chai', 'java', 'bean', 'roast', 'decaf'],
+            'pizza': ['pizza', 'delivery', 'order food', 'pepperoni', 'topping', 'pizzeria', 'slice', 'pie', 'margherita', 'deep dish', 'thin crust', 'cheese', 'oven', 'tossed', 'sauce', 'garlic bread', 'calzone'],
+            'auto': ['car', 'vehicle', 'repair', 'maintenance', 'mechanic', 'oil change', 'garage', 'auto shop', 'tire', 'check engine', 'battery', 'transmission', 'brake', 'engine diagnostics', 'carwash', 'detail', 'alignment', 'exhaust', 'spark plug', 'dashboard'],
+        }
+
         query_lower = query.lower()
         found = set()
-        for domain, kw_list in
+        for domain, kw_list in domain_keywords.items():
             for kw in kw_list:
                 if kw in query_lower:
                     found.add(kw)
@@ -456,7 +442,7 @@ class RetrievalChatbot(DeviceAwareModel):
 
     def detect_domain_from_query(self, query: str) -> str:
         """
-        Detect the domain of the query based on keywords.
+        Detect the domain of the query based on keywords. Used for boosting FAISS search.
         """
         domain_patterns = {
             'restaurant': r'\b(restaurant|restaurants?|dining|food|foods?|dine|reservation|reservations?|table|tables?|menu|menus?|cuisine|cuisines?|eat|eats?|place\s?to\s?eat|places\s?to\s?eat|hungry|chef|chefs?|dish|dishes?|meal|meals?|fork|forks?|knife|knives?|spoon|spoons?|brunch|bistro|buffet|buffets?|catering|caterings?|gourmet|fast\s?food|fine\s?dining|takeaway|takeaways?|delivery|deliveries|restaurant\s?booking)\b',
@@ -476,8 +462,7 @@ class RetrievalChatbot(DeviceAwareModel):
 
     def is_numeric_response(self, text: str) -> bool:
         """
-        Return True if `text` is purely digits
-        with optional punctuation like '.' at the end.
+        Return True if `text` is purely digits and/or spaces.
         """
         pattern = r'^[\s]*[\d]+([\s.,\d]+)*[\s]*$'
         return bool(re.match(pattern, text.strip()))
@@ -486,18 +471,16 @@ class RetrievalChatbot(DeviceAwareModel):
         self,
         query: str,
         domain: str = 'other',
-        top_k: int =
-        boost_factor: float = 1.
+        top_k: int = 10,
+        boost_factor: float = 1.15
     ) -> List[Tuple[str, float]]:
         """
         Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
-
         Args:
             query (str): The user input text.
-            domain (str
-            top_k (int
-            boost_factor (float, optional): Factor to boost scores for keyword matches.
-
+            domain (str): The detected domain from possible domains: ['restaurant', 'movie', 'ride_share', 'coffee', 'pizza', 'auto', 'other']
+            top_k (int): Number of top results to return.
+            boost_factor (float, optional): Factor to boost scores for keyword matches.
         Returns:
             List[Tuple[str, float]]: List of (response_text, similarity) sorted by descending similarity.
         """
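A quick standalone check of the keyword scan that this commit moves into `extract_keywords` (keyword lists trimmed here; the full ones are in the hunk above):

```python
DOMAIN_KEYWORDS = {
    'coffee': ['coffee', 'latte', 'espresso', 'cold brew'],
    'pizza': ['pizza', 'pepperoni', 'thin crust'],
}

def extract_keywords(query: str) -> list[str]:
    # Substring scan over every domain's keyword list, as in the diff.
    query_lower = query.lower()
    found = set()
    for kw_list in DOMAIN_KEYWORDS.values():
        for kw in kw_list:
            if kw in query_lower:
                found.add(kw)
    return sorted(found)

print(extract_keywords("Any good cold brew or latte nearby?"))
# ['cold brew', 'latte']
```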
@@ -508,7 +491,7 @@ class RetrievalChatbot(DeviceAwareModel):
         # Search the index
         distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10)
 
-        # IndexFlatIP: 'distances' are inner products (cosine similarities for normalized vectors)
+        # IndexFlatIP: 'distances' are inner products (cosine similarities for normalized vectors).
         candidates = []
         for rank, idx in enumerate(indices[0]):
             if idx < 0:
@@ -545,8 +528,7 @@ class RetrievalChatbot(DeviceAwareModel):
         boosted = []
         for (resp_text, resp_domain, score) in in_domain:
             new_score = score
-            # If the domain is known AND the response text
-            # shares any query keywords, apply a small boost
+            # If the domain is known AND the response text shares any query keywords, boost it
             if query_keywords and any(kw in resp_text.lower() for kw in query_keywords):
                 new_score *= boost_factor
 
@@ -558,7 +540,7 @@ class RetrievalChatbot(DeviceAwareModel):
         # Sort boosted responses
         boosted.sort(key=lambda x: x[1], reverse=True)
 
-        # Debug
+        # Debug logging (see FAISS responses)
         # for resp, score in boosted[:100]:
         #     logger.debug(f"Candidate: '{resp}' with score {score}")
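The retrieval path relies on `IndexFlatIP` over L2-normalized vectors, so the returned "distances" really are cosine similarities. A self-contained sketch of that setup (random vectors stand in for encoder embeddings):

```python
import faiss  # pip install faiss-cpu
import numpy as np

dim, n = 768, 1000
rng = np.random.default_rng(0)

# Stand-in for encoder outputs; L2-normalize so inner product == cosine similarity.
vectors = rng.standard_normal((n, dim)).astype('float32')
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

index = faiss.IndexFlatIP(dim)   # exact inner-product search, no training needed
index.add(vectors)

query = vectors[:1]              # a known vector: its best hit should be itself
distances, indices = index.search(query, 5)
print(indices[0])                # first index is 0
print(distances[0])              # cosine similarities, first one ~1.0
```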
@@ -572,8 +554,7 @@ class RetrievalChatbot(DeviceAwareModel):
         top_k: int = 10,
     ) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
         """
-
-        if self.reranker is available.
+        Live chat with the chatbot. Uses same processing flow as validation, except for context handling and quality checking.
         """
         @self.run_on_device
         def get_response(self_arg, query_arg):
@@ -581,7 +562,7 @@ class RetrievalChatbot(DeviceAwareModel):
             conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
 
             # Retrieve and re-rank
-            results = self_arg.
+            results = self_arg.retrieve_responses(
                 query=conversation_str,
                 top_k=top_k,
                 reranker=self_arg.reranker,
@@ -605,7 +586,9 @@ class RetrievalChatbot(DeviceAwareModel):
         query: str,
         conversation_history: Optional[List[Tuple[str, str]]]
     ) -> str:
-        """
+        """
+        Build conversation context string from conversation history.
+        """
         if not conversation_history:
             return f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
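`_build_conversation_context` flattens (speaker, text) turns into one tagged string for the encoder. A sketch of that flattening; only the `<USER>` tag is confirmed by the hunk, while `<ASSISTANT>` and the exact turn layout are assumptions, since the rest of the method is not shown in this diff:

```python
from typing import List, Optional, Tuple

def build_conversation_context(
    query: str,
    conversation_history: Optional[List[Tuple[str, str]]],
) -> str:
    """Flatten history into '<USER> ... <ASSISTANT> ...' plus the new query.

    <ASSISTANT> and the layout are hypothetical; <USER> comes from the diff.
    """
    if not conversation_history:
        return f"<USER> {query}"
    parts = []
    for speaker, text in conversation_history:
        tag = "<USER>" if speaker == "user" else "<ASSISTANT>"
        parts.append(f"{tag} {text}")
    parts.append(f"<USER> {query}")
    return " ".join(parts)

print(build_conversation_context("thanks!", [("user", "hi"), ("assistant", "hello")]))
```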
@@ -636,12 +619,12 @@ class RetrievalChatbot(DeviceAwareModel):
     ) -> None:
         """
         Train the retrieval model using a pre-prepared TFRecord dataset.
-        This method handles:
         - Checkpoint loading/restoring
         - LR scheduling
         - Epoch/iteration tracking
-        -
-        -
+        - Training-history logging
+        - Early stopping
+        - Custom loss function (Contrastive loss with hard negative sampling)
         """
         logger.info("Starting training with pre-prepared TFRecord dataset...")
 
@@ -673,7 +656,7 @@ class RetrievalChatbot(DeviceAwareModel):
         steps_per_epoch = math.ceil(train_size / batch_size)
         val_steps = math.ceil(val_size / batch_size)
         total_steps = steps_per_epoch * epochs
-        buffer_size = max(1, total_pairs //
+        buffer_size = max(1, total_pairs // 2)  # 50% of the dataset for shuffling
 
         logger.info(f"Training pairs: {train_size}")
         logger.info(f"Validation pairs: {val_size}")
@@ -695,7 +678,7 @@ class RetrievalChatbot(DeviceAwareModel):
             self.optimizer = tf.keras.optimizers.Adam(learning_rate=tf.cast(peak_lr, tf.float32))
             logger.info("Using fixed learning rate.")
 
-        #
+        # Dummy step to force initialization
         dummy_input = tf.zeros((1, self.config.max_context_token_limit), dtype=tf.int32)
         with tf.GradientTape() as tape:
             dummy_output = self.encoder(dummy_input)
@@ -710,6 +693,7 @@ class RetrievalChatbot(DeviceAwareModel):
             model=self.encoder
         )
 
+        # Create a CheckpointManager
         manager = tf.train.CheckpointManager(
             checkpoint,
             directory=checkpoint_dir,
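The checkpoint objects above track the epoch counter, optimizer, and model together. A minimal sketch of that setup with a toy model (layer sizes and directory are illustrative; the `ckpt` name and epoch variable mirror the diff):

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])
model.build((None, 8))
optimizer = tf.keras.optimizers.Adam(1e-3)

# Track epoch, optimizer, and model in one checkpoint object.
checkpoint = tf.train.Checkpoint(
    epoch=tf.Variable(0, dtype=tf.int32),
    optimizer=optimizer,
    model=model,
)
manager = tf.train.CheckpointManager(
    checkpoint, directory="/tmp/ckpts", max_to_keep=3, checkpoint_name="ckpt"
)

# Resume if a checkpoint exists, as the training loop above does.
if manager.latest_checkpoint:
    checkpoint.restore(manager.latest_checkpoint)
    print(f"Resumed from {manager.latest_checkpoint} at epoch {int(checkpoint.epoch)}")

checkpoint.epoch.assign_add(1)
print(f"Saved: {manager.save()}")  # e.g. /tmp/ckpts/ckpt-1
```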
@@ -717,18 +701,18 @@ class RetrievalChatbot(DeviceAwareModel):
             checkpoint_name='ckpt'
         )
 
-        # Restore from existing checkpoint if
+        # Restore from existing checkpoint if one is provided
         latest_checkpoint = manager.latest_checkpoint
         history_path = Path(checkpoint_dir) / 'training_history.json'
 
-        #
+        # Log epoch losses across runs, including restore from checkpoint
         if not hasattr(self, 'history'):
             self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
 
         if latest_checkpoint and not test_mode:
-            #
-            logger.info(f"\nTrying to load checkpoint from: {latest_checkpoint}")
-            reader = tf.train.load_checkpoint(latest_checkpoint)
+            # Debug checkpoint loading
+            # logger.info(f"\nTrying to load checkpoint from: {latest_checkpoint}")
+            # reader = tf.train.load_checkpoint(latest_checkpoint)
             # shape_from_key = reader.get_variable_to_shape_map()
             # dtype_from_key = reader.get_variable_to_dtype_map()
             # logger.info("\nCheckpoint Variables:")
@@ -752,11 +736,11 @@ class RetrievalChatbot(DeviceAwareModel):
             if initial_epoch == 0:
                 initial_epoch = ckpt_number
 
-            # Assign to checkpoint.epoch
+            # Assign to checkpoint.epoch for counting
             checkpoint.epoch.assign(tf.cast(initial_epoch, tf.int32))
             logger.info(f"Resuming from epoch {initial_epoch}")
 
-            #
+            # Load history from file:
             if history_path.exists():
                 try:
                     with open(history_path, 'r') as f:
@@ -765,7 +749,10 @@ class RetrievalChatbot(DeviceAwareModel):
                 except Exception as e:
                     logger.warning(f"Could not load history, starting fresh: {e}")
 
-            #
+            # Save custom weights not being saved in the full model.
+            # This was a bugfix to extract weights from a checkpoint without retraining.
+            # Before updating save_models, only Distilbert weights were being saved (custom layers were missed).
+            # Not needed, also not harmful.
             self.save_models(Path(checkpoint_dir) / "pretrained_full_model")
             logger.info(f"Manually saved custom weights after restore.")
         else:
@@ -782,13 +769,13 @@ class RetrievalChatbot(DeviceAwareModel):
         train_summary_writer = tf.summary.create_file_writer(train_log_dir)
         val_summary_writer = tf.summary.create_file_writer(val_log_dir)
         logger.info(f"TensorBoard logs will be saved in {log_dir}")
-
+
         # Parse dataset
         dataset = tf.data.TFRecordDataset(tfrecord_file_path)
-
-        #
+
+        # Debug mode uses small subset. Useful for CPU debugging.
         if test_mode:
-            subset_size =
+            subset_size = 200
             dataset = dataset.take(subset_size)
             logger.info(f"TEST MODE: Using only {subset_size} examples")
             # Recompute sizes, steps, epochs, etc., as needed
@@ -804,38 +791,36 @@ class RetrievalChatbot(DeviceAwareModel):
         early_stopping_patience = 2
         logger.info(f"New training pairs: {train_size}")
         logger.info(f"New validation pairs: {val_size}")
-
+
         dataset = dataset.map(
-            lambda x: parse_tfrecord_fn(x, self.config.max_context_token_limit, self.
+            lambda x: parse_tfrecord_fn(x, self.config.max_context_token_limit, self.data_pipeline.neg_samples),
             num_parallel_calls=tf.data.AUTOTUNE
         )
-
+
         # Train/val split
         train_dataset = dataset.take(train_size)
         val_dataset = dataset.skip(train_size).take(val_size)
-
+
         # Shuffle and batch
         train_dataset = train_dataset.shuffle(buffer_size=buffer_size)
         train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
         train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
-
+
         val_dataset = val_dataset.batch(batch_size, drop_remainder=False)
         val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
         val_dataset = val_dataset.cache()
-
+
         # Training loop
         best_val_loss = float("inf")
         epochs_no_improve = 0
-
+
         for epoch in range(int(checkpoint.epoch.numpy()) + 1, epochs + 1):
             checkpoint.epoch.assign(epoch)
             logger.info(f"Starting Epoch {epoch}...")
-
-            # --- Training Phase ---
+
             epoch_loss_avg = tf.keras.metrics.Mean(dtype=tf.float32)
             batches_processed = 0
-
-            # Progress bar
+
             try:
                 train_pbar = tqdm(
                     total=steps_per_epoch,
@@ -846,7 +831,8 @@ class RetrievalChatbot(DeviceAwareModel):
             except ImportError:
                 train_pbar = None
                 is_tqdm_train = False
-
+
+            # --- Training ---
             for q_batch, p_batch, n_batch in train_dataset:
                 loss, grad_norm, post_clip_norm = self.train_step(q_batch, p_batch, n_batch)
                 epoch_loss_avg(loss)
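The input pipeline above is a standard tf.data chain. A sketch with a synthetic in-memory dataset standing in for the parsed TFRecords (sizes are illustrative; the 50% shuffle buffer and the drop_remainder choices mirror the diff):

```python
import tensorflow as tf

total_pairs, train_size, batch_size = 1000, 800, 32
dataset = tf.data.Dataset.range(total_pairs)

buffer_size = max(1, total_pairs // 2)  # 50% of the dataset, as in the diff

train_ds = (dataset.take(train_size)
            .shuffle(buffer_size=buffer_size)
            .batch(batch_size, drop_remainder=True)   # keep step shapes static
            .prefetch(tf.data.AUTOTUNE))

val_ds = (dataset.skip(train_size)
          .batch(batch_size, drop_remainder=False)    # keep every validation example
          .prefetch(tf.data.AUTOTUNE)
          .cache())

print(sum(1 for _ in train_ds), sum(1 for _ in val_ds))  # 25 batches, 7 batches
```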
@@ -874,54 +860,54 @@ class RetrievalChatbot(DeviceAwareModel):
                         "lr": f"{current_lr:.2e}",
                         "batches": f"{batches_processed}/{steps_per_epoch}"
                     })
-
+
                 gc.collect()
-
+
                 # End the epoch early if we've processed all steps
                 if batches_processed >= steps_per_epoch:
                     break
-
+
             if is_tqdm_train and train_pbar:
                 train_pbar.close()
-
-            # --- Validation
+
+            # --- Validation ---
             val_loss_avg = tf.keras.metrics.Mean(dtype=tf.float32)
             val_batches_processed = 0
-
+
             try:
                 val_pbar = tqdm(total=val_steps, desc="Validation", unit="batch")
                 is_tqdm_val = True
             except ImportError:
                 val_pbar = None
                 is_tqdm_val = False
-
+
             last_valid_val_loss = None
             valid_batches = False
-
+
             for q_batch, p_batch, n_batch in val_dataset:
                 # If batch is too small, skip
                 if tf.shape(q_batch)[0] < 2:
                     logger.warning(f"Skipping validation batch of size {tf.shape(q_batch)[0]}")
                     continue
-
+
                 valid_batches = True
                 val_loss = self.validation_step(q_batch, p_batch, n_batch)
                 val_loss_avg(val_loss)
                 last_valid_val_loss = val_loss
                 val_batches_processed += 1
-
+
                 if is_tqdm_val:
                     val_pbar.update(1)
                     val_pbar.set_postfix({
                         "val_loss": f"{val_loss.numpy():.4f}",
                         "batches": f"{val_batches_processed}/{val_steps}"
                     })
-
+
                 gc.collect()
-
+
                 if val_batches_processed >= val_steps:
                     break
-
+
             if not valid_batches:
                 # If no valid batch is found, fallback
                 logger.warning("No valid validation batches in this epoch")
@@ -931,29 +917,29 @@ class RetrievalChatbot(DeviceAwareModel):
             else:
                 val_loss = epoch_loss_avg.result()
                 val_loss_avg(val_loss)
-
+
             if is_tqdm_val and val_pbar:
                 val_pbar.close()
-
+
             # End of epoch: final stats
             train_loss = epoch_loss_avg.result().numpy()
             val_loss = val_loss_avg.result().numpy()
             logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
-
+
             # TensorBoard epoch logs
             with train_summary_writer.as_default():
                 tf.summary.scalar("epoch_loss", train_loss, step=epoch)
             with val_summary_writer.as_default():
                 tf.summary.scalar("val_loss", val_loss, step=epoch)
-
+
             # Save checkpoint
             manager.save()
-
-            #
+
+            # Save model for iterative testing/inference
             model_save_path = Path(checkpoint_dir) / f"model_epoch_{epoch}"
             self.save_models(model_save_path)
             logger.info(f"Saved model for epoch {epoch} at {model_save_path}")
-
+
             # Update local history
             self.history['train_loss'].append(train_loss)
             self.history['val_loss'].append(val_loss)
@@ -972,13 +958,12 @@ class RetrievalChatbot(DeviceAwareModel):
                 return obj
 
             json_history = convert_to_py_floats(self.history)
-
+
             # Save training history to file every epoch
-            # (Create or overwrite the file so we always have the latest.)
             with open(history_path, 'w') as f:
                 json.dump(json_history, f)
             logger.info(f"Saved training history to {history_path}")
-
+
             # Early stopping
             if val_loss < best_val_loss - min_delta:
                 best_val_loss = val_loss
@@ -990,7 +975,7 @@ class RetrievalChatbot(DeviceAwareModel):
             if epochs_no_improve >= early_stopping_patience:
                 logger.info("Early stopping triggered.")
                 break
-
+
         logger.info("Training completed!")
 
     @tf.function
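The early-stopping rule above stops after `early_stopping_patience` epochs (2 in this commit) without at least `min_delta` improvement in validation loss. A pure-Python sketch of that rule (the `min_delta` value here is illustrative; the diff does not show its default):

```python
def run_early_stopping(val_losses, min_delta=0.001, patience=2):
    best, no_improve = float("inf"), 0
    for epoch, val_loss in enumerate(val_losses, start=1):
        if val_loss < best - min_delta:   # meaningful improvement resets the clock
            best, no_improve = val_loss, 0
        else:
            no_improve += 1
        if no_improve >= patience:
            return f"stopped at epoch {epoch} (best={best:.3f})"
    return f"ran all {len(val_losses)} epochs (best={best:.3f})"

print(run_early_stopping([0.90, 0.80, 0.79, 0.795, 0.81]))
# stopped at epoch 5 (best=0.790)
```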
@@ -1004,37 +989,25 @@ class RetrievalChatbot(DeviceAwareModel):
         Single training step using queries, positives, and hard negatives.
         """
         with tf.GradientTape() as tape:
-            # Encode queries
+            # Encode queries, positives, and negatives
             q_enc = self.encoder(q_batch, training=True)  # [batch_size, embed_dim]
-
-            # Encode positives
             p_enc = self.encoder(p_batch, training=True)  # [batch_size, embed_dim]
-
-            # Encode negatives
-            # n_batch: [batch_size, neg_samples, max_length]
             shape = tf.shape(n_batch)
             bs = shape[0]
             neg_samples = shape[1]
 
-            # Flatten negatives to feed them in one pass:
-            # => [batch_size * neg_samples, max_length]
+            # Flatten negatives to feed them in one pass: [batch_size * neg_samples, max_length]
             n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
             n_enc_flat = self.encoder(n_batch_flat, training=True)  # [bs*neg_samples, embed_dim]
 
             # Reshape back => [batch_size, neg_samples, embed_dim]
             n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])
 
-            # Combine the positive embedding and negative embeddings along dim=1
-            #
-
-            combined_p_n = tf.concat(
-                [tf.expand_dims(p_enc, axis=1), n_enc],
-                axis=1
-            )  # [bs, (1+neg_samples), embed_dim]
+            # Combine the positive embedding and negative embeddings along dim=1: shape [batch_size, 1 + neg_samples, embed_dim]
+            # Col 1 is the pos, subsequent cols are negatives
+            combined_p_n = tf.concat([tf.expand_dims(p_enc, axis=1), n_enc], axis=1)  # [bs, (1+neg_samples), embed_dim]
 
-            #
-            # We'll use `tf.einsum` to handle the batch dimension properly
-            # dot_products => shape [batch_size, (1+neg_samples)]
+            # Compute scores: dot product of q_enc with each column in combined_p_n. `tf.einsum` handles the batch dimension
             dot_products = tf.cast(tf.einsum('bd,bkd->bk', q_enc, combined_p_n), tf.float32)
             labels = tf.zeros([bs], dtype=tf.int32)  # Keep labels as int32
             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
@@ -1043,14 +1016,13 @@ class RetrievalChatbot(DeviceAwareModel):
             )
             loss = tf.cast(tf.reduce_mean(loss), tf.float32)
 
-        # Calculate gradients
+        # Calculate gradients and clip
         gradients = tape.gradient(loss, self.encoder.trainable_variables)
         gradients_norm = tf.cast(tf.linalg.global_norm(gradients), tf.float32)
         max_grad_norm = tf.constant(1.5, dtype=tf.float32)
         gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm, gradients_norm)
         post_clip_norm = tf.cast(tf.linalg.global_norm(gradients), tf.float32)
 
-        # Apply gradients
         self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
 
         return loss, gradients_norm, post_clip_norm
@@ -1064,6 +1036,7 @@ class RetrievalChatbot(DeviceAwareModel):
     ) -> tf.Tensor:
         """
         Single validation step using queries, positives, and hard negatives.
+        Same idea as train_step, but without gradient updates.
         """
         q_enc = self.encoder(q_batch, training=False)
         p_enc = self.encoder(p_batch, training=False)
@@ -1082,7 +1055,7 @@ class RetrievalChatbot(DeviceAwareModel):
         )
 
         dot_products = tf.cast(tf.einsum('bd,bkd->bk', q_enc, combined_p_n), tf.float32)
-        labels = tf.zeros([bs], dtype=tf.int32)
+        labels = tf.zeros([bs], dtype=tf.int32)
 
         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
             labels=labels,
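The loss above is a softmax contrastive loss: the positive sits in column 0 of the candidate stack, so the correct "class" for every query is index 0. A standalone sketch with random unit vectors in place of encoder outputs (shapes are tiny for readability):

```python
import tensorflow as tf

bs, neg_samples, dim = 4, 3, 8
q_enc = tf.math.l2_normalize(tf.random.normal((bs, dim)), axis=-1)
p_enc = tf.math.l2_normalize(tf.random.normal((bs, dim)), axis=-1)
n_enc = tf.math.l2_normalize(tf.random.normal((bs, neg_samples, dim)), axis=-1)

# [bs, 1 + neg_samples, dim]: column 0 is the positive, the rest are negatives.
combined_p_n = tf.concat([tf.expand_dims(p_enc, axis=1), n_enc], axis=1)

# Batched dot products: one similarity per candidate, shape [bs, 1 + neg_samples].
logits = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)

labels = tf.zeros([bs], dtype=tf.int32)  # index of the positive column
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
)
print(float(loss))
```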
@@ -1098,7 +1071,9 @@ class RetrievalChatbot(DeviceAwareModel):
         peak_lr: float,
         warmup_steps: int
     ) -> tf.keras.optimizers.schedules.LearningRateSchedule:
-        """
         class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
             def __init__(
                 self,
@@ -1110,11 +1085,11 @@ class RetrievalChatbot(DeviceAwareModel):
                 self.total_steps = tf.cast(total_steps, tf.float32)
                 self.peak_lr = tf.cast(peak_lr, tf.float32)
 
-                #
                 adjusted_warmup_steps = min(warmup_steps, max(1, total_steps // 10))
                 self.warmup_steps = tf.cast(adjusted_warmup_steps, tf.float32)
 
-                # Calculate
                 self.initial_lr = tf.cast(self.peak_lr * 0.1, tf.float32)
                 self.min_lr = tf.cast(self.peak_lr * 0.01, tf.float32)
 
@@ -1128,21 +1103,20 @@ class RetrievalChatbot(DeviceAwareModel):
             def __call__(self, step):
                 step = tf.cast(step, tf.float32)
 
-                # Warmup
                 warmup_factor = tf.cast(tf.minimum(1.0, step / self.warmup_steps), tf.float32)
                 warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor
 
-                # Decay
                 decay_steps = tf.cast(tf.maximum(1.0, self.total_steps - self.warmup_steps), tf.float32)
                 decay_factor = tf.cast((step - self.warmup_steps) / decay_steps, tf.float32)
                 decay_factor = tf.cast(tf.minimum(tf.maximum(0.0, decay_factor), 1.0), tf.float32)
                 cosine_decay = tf.cast(0.5 * (1.0 + tf.cos(tf.constant(math.pi, dtype=tf.float32) * decay_factor)), tf.float32)
                 decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
 
-                # Choose between warmup and decay
                 final_lr = tf.where(step < self.warmup_steps, warmup_lr, decay_lr)
 
-                # Ensure
                 final_lr = tf.maximum(self.min_lr, final_lr)
                 final_lr = tf.where(tf.math.is_finite(final_lr), final_lr, self.min_lr)
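The schedule above warms up linearly from 0.1x peak to peak, then cosine-decays down to 0.01x peak (the 0.1 / 0.01 factors and the warmup cap at total_steps // 10 come from the hunk). A dependency-free sketch of the same curve; peak_lr of 5e-4 is chosen to match the new config's learning_rate:

```python
import math

def lr_at(step, total_steps, peak_lr, warmup_steps):
    warmup_steps = min(warmup_steps, max(1, total_steps // 10))
    initial_lr, min_lr = peak_lr * 0.1, peak_lr * 0.01
    if step < warmup_steps:
        # Linear warmup from initial_lr up to peak_lr.
        return initial_lr + (peak_lr - initial_lr) * min(1.0, step / warmup_steps)
    # Cosine decay from peak_lr down to min_lr.
    decay_steps = max(1.0, total_steps - warmup_steps)
    decay_factor = min(max(0.0, (step - warmup_steps) / decay_steps), 1.0)
    cosine = 0.5 * (1.0 + math.cos(math.pi * decay_factor))
    return max(min_lr, min_lr + (peak_lr - min_lr) * cosine)

for s in [0, 100, 200, 1000, 2000]:
    print(s, round(lr_at(s, total_steps=2000, peak_lr=5e-4, warmup_steps=200), 6))
# 0 5e-05, 100 0.000275, 200 0.0005, 1000 ~0.000296, 2000 5e-06
```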
| 24 |
|
| 25 |
@dataclass
|
| 26 |
class ChatbotConfig:
|
| 27 |
+
"""RetrievalChatbot Config"""
|
| 28 |
max_context_token_limit: int = 512
|
| 29 |
embedding_dim: int = 768
|
| 30 |
encoder_units: int = 256
|
| 31 |
num_attention_heads: int = 8
|
| 32 |
dropout_rate: float = 0.2
|
| 33 |
l2_reg_weight: float = 0.001
|
| 34 |
+
learning_rate: float = 0.0005
|
| 35 |
min_text_length: int = 3
|
| 36 |
+
max_context_turns: int = 20
|
| 37 |
warmup_steps: int = 200
|
| 38 |
pretrained_model: str = 'distilbert-base-uncased'
|
| 39 |
cross_encoder_model: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
|
| 40 |
+
summarizer_model: str = 't5-small'
|
| 41 |
dtype: str = 'float32'
|
| 42 |
freeze_embeddings: bool = False
|
| 43 |
embedding_batch_size: int = 64
|
| 44 |
search_batch_size: int = 64
|
| 45 |
max_batch_size: int = 64
|
|
|
|
| 46 |
max_retries: int = 3
|
| 47 |
|
| 48 |
def to_dict(self) -> Dict:
|
|
|
|
| 57 |
if k in cls.__dataclass_fields__})
|
| 58 |
|
| 59 |
class EncoderModel(tf.keras.Model):
|
| 60 |
+
"""Dual encoder model with pretrained DistilBERT embeddings."""
|
| 61 |
def __init__(
|
| 62 |
self,
|
| 63 |
config: ChatbotConfig,
|
|
|
|
| 71 |
self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)
|
| 72 |
self._freeze_layers()
|
| 73 |
|
| 74 |
+
# Add Global Average Pooling, Projection, Dropout, and Normalization layers
|
| 75 |
self.pooler = tf.keras.layers.GlobalAveragePooling1D()
|
| 76 |
self.projection = tf.keras.layers.Dense(
|
| 77 |
config.embedding_dim,
|
|
|
|
| 86 |
)
|
| 87 |
|
| 88 |
def _freeze_layers(self):
|
| 89 |
+
"""Freeze n layers of the pretrained model"""
|
| 90 |
if self.config.freeze_embeddings:
|
| 91 |
self.pretrained.trainable = False
|
| 92 |
logger.info("All pretrained layers frozen.")
|
|
|
|
| 95 |
for i, layer in enumerate(self.pretrained.layers):
|
| 96 |
if isinstance(layer, tf.keras.layers.Layer):
|
| 97 |
if hasattr(layer, 'trainable'):
|
|
|
|
| 98 |
if i < 1:
|
| 99 |
layer.trainable = False
|
| 100 |
logger.info(f"Layer {i} frozen.")
|
| 101 |
else:
|
| 102 |
layer.trainable = True
|
| 103 |
+
logger.info(f"Layer {i} trainable.")
|
| 104 |
|
| 105 |
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
|
| 106 |
"""Forward pass."""
|
| 107 |
# Get pretrained embeddings
|
| 108 |
pretrained_outputs = self.pretrained(inputs, training=training)
|
| 109 |
+
x = pretrained_outputs.last_hidden_state # Shape: [batch_size, seq_len, embedding_dim]
|
| 110 |
|
| 111 |
# Apply pooling, projection, dropout, and normalization
|
| 112 |
+
x = self.pooler(x) # Shape: [batch_size, 768]
|
| 113 |
+
x = self.projection(x) # Shape: [batch_size, 768]
|
| 114 |
x = self.dropout(x, training=training)
|
| 115 |
+
x = self.normalize(x) # Shape: [batch_size, 768]
|
| 116 |
|
| 117 |
return x
|
| 118 |
|
| 119 |
def get_config(self) -> dict:
|
| 120 |
+
"""Return the model config"""
|
| 121 |
config = super().get_config()
|
| 122 |
config.update({
|
| 123 |
"config": self.config.to_dict(),
|
|
|
|
| 126 |
return config
|
| 127 |
|
| 128 |
class RetrievalChatbot(DeviceAwareModel):
|
| 129 |
+
"""
|
| 130 |
+
Retrieval-based learning chatbot model.
|
| 131 |
+
Uses trained embeddings and FAISS for similarity search.
|
| 132 |
+
"""
|
| 133 |
def __init__(
|
| 134 |
self,
|
| 135 |
config: ChatbotConfig,
|
|
|
|
| 145 |
self.device = device or self._setup_default_device()
|
| 146 |
self.mode = mode.lower()
|
| 147 |
|
| 148 |
+
# Initialize reranker, summarizer, tokenizer, and encoder
|
| 149 |
self.reranker = reranker or self._initialize_reranker()
|
| 150 |
self.tokenizer = self._initialize_tokenizer()
|
| 151 |
self.encoder = self._initialize_encoder()
|
|
|
|
| 157 |
config=self.config,
|
| 158 |
tokenizer=self.tokenizer,
|
| 159 |
encoder=self.encoder,
|
|
|
|
| 160 |
response_pool=[],
|
| 161 |
max_length=self.config.max_context_token_limit,
|
| 162 |
query_embeddings_cache={},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
)
|
| 164 |
|
| 165 |
# Collect unique responses from dialogues
|
|
|
|
| 195 |
"""Initialize the Summarizer."""
|
| 196 |
return Summarizer(
|
| 197 |
tokenizer=self.tokenizer,
|
| 198 |
+
model_name=self.config.summarizer_model,
|
| 199 |
max_summary_length=self.config.max_context_token_limit // 4,
|
| 200 |
device=self.device,
|
| 201 |
max_summary_rounds=2
|
|
|
|
| 227 |
new_vocab_size = len(self.tokenizer)
|
| 228 |
encoder.pretrained.resize_token_embeddings(new_vocab_size)
|
| 229 |
logger.info(f"Token embeddings resized to: {new_vocab_size}")
|
| 230 |
+
|
| 231 |
return encoder
|
| 232 |
|
| 233 |
def _load_faiss_index_and_responses(self) -> None:
|
| 234 |
"""Load FAISS index and response pool for inference."""
|
| 235 |
try:
|
| 236 |
+
logger.info(f"Loading FAISS index from {self.data_pipeline.faiss_index_file_path}...")
|
| 237 |
+
self.data_pipeline.load_faiss_index(self.data_pipeline.faiss_index_file_path)
|
| 238 |
logger.info("FAISS index loaded successfully.")
|
| 239 |
|
| 240 |
+
# Load response pool
|
| 241 |
+
response_pool_path = self.data_pipeline.faiss_index_file_path.replace('.index', '_responses.json')
|
| 242 |
if os.path.exists(response_pool_path):
|
| 243 |
with open(response_pool_path, 'r', encoding='utf-8') as f:
|
| 244 |
self.data_pipeline.response_pool = json.load(f)
|
|
|
|
| 262 |
"""
|
| 263 |
load_dir = Path(load_dir)
|
| 264 |
|
| 265 |
+
# Load config
|
| 266 |
with open(load_dir / "config.json", "r") as f:
|
| 267 |
config = ChatbotConfig.from_dict(json.load(f))
|
| 268 |
|
| 269 |
+
# Initialize chatbot
|
| 270 |
chatbot = cls(config, mode=mode)
|
| 271 |
|
| 272 |
+
# Load DistilBERT
|
| 273 |
+
chatbot.encoder.pretrained = TFAutoModel.from_pretrained(load_dir / "shared_encoder", config=config)
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
|
| 276 |
_ = chatbot.encoder(dummy_input, training=False)
|
| 277 |
|
| 278 |
+
# Load tokenizer
|
| 279 |
chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
|
| 280 |
logger.info(f"Models and tokenizer loaded from {load_dir}")
|
| 281 |
|
| 282 |
+
# Load the custom weights
|
|
|
|
|
|
|
| 283 |
custom_weights_path = load_dir / "encoder_custom_weights.weights.h5"
|
| 284 |
if custom_weights_path.exists():
|
| 285 |
chatbot.encoder.load_weights(str(custom_weights_path))
|
|
|
|
| 287 |
else:
|
| 288 |
logger.warning(f"No custom encoder weights found at {custom_weights_path}. The top-level projection layer won't have learned parameters.")
|
| 289 |
|
| 290 |
+
# Handle 'inference' mode: load FAISS, etc.
|
| 291 |
if mode == 'inference':
|
| 292 |
cls._prepare_model_for_inference(chatbot, load_dir)
|
| 293 |
|
|
|
|
| 295 |
|
| 296 |
@classmethod
|
| 297 |
def _prepare_model_for_inference(cls, chatbot: 'RetrievalChatbot', load_dir: Path) -> None:
|
| 298 |
+
"""Load inference components."""
|
| 299 |
try:
|
| 300 |
# Load FAISS index
|
| 301 |
faiss_path = load_dir / 'faiss_indices/faiss_index_production.index'
|
|
|
|
| 326 |
raise
|
| 327 |
|
| 328 |
def save_models(self, save_dir: Union[str, Path]):
|
| 329 |
+
"""Save model and config"""
|
| 330 |
save_dir = Path(save_dir)
|
| 331 |
save_dir.mkdir(parents=True, exist_ok=True)
|
| 332 |
|
|
|
|
| 334 |
with open(save_dir / "config.json", "w") as f:
|
| 335 |
json.dump(self.config.to_dict(), f, indent=2)
|
| 336 |
|
| 337 |
+
# Save the HF DistilBERT submodule, custom top-level layers, and tokenizer
|
| 338 |
self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
|
|
|
|
|
|
|
| 339 |
self.encoder.save_weights(save_dir / "encoder_custom_weights.weights.h5")
|
|
|
|
|
|
|
| 340 |
self.tokenizer.save_pretrained(save_dir / "tokenizer")
|
|
|
|
| 341 |
logger.info(f"Models and tokenizer saved to {save_dir}.")
|
| 342 |
|
| 343 |
+
def retrieve_responses(
|
|
|
|
|
|
|
|
|
|
| 344 |
self,
|
| 345 |
query: str,
|
| 346 |
top_k: int = 10,
|
|
|
|
| 349 |
summarize_threshold: int = 512
|
| 350 |
) -> List[Tuple[str, float]]:
|
| 351 |
"""
|
| 352 |
+
Retrieve top-k responses using FAISS and cross-encoder re-ranking.
|
|
|
|
|
|
|
| 353 |
Args:
|
| 354 |
query: The user's input text.
|
| 355 |
+
top_k: Number of FAISS results to return
|
| 356 |
+
reranker: CrossEncoderReranker for refined scoring
|
| 357 |
+
summarizer: Summarizer for long queries
|
| 358 |
+
summarize_threshold: Summarize if conversation tokens > threshold.
|
|
|
|
| 359 |
Returns:
|
| 360 |
List of (response_text, final_score).
|
| 361 |
"""
|
| 362 |
+
def sigmoid(x: float) -> float:
|
| 363 |
+
return 1 / (1 + np.exp(-x))
|
| 364 |
+
|
| 365 |
+
# Query summarization
|
| 366 |
if summarizer and len(query.split()) > summarize_threshold:
|
| 367 |
logger.info(f"Query is long ({len(query.split())} words). Summarizing.")
|
| 368 |
query = summarizer.summarize_text(query)
|
|
|
|
| 379 |
|
| 380 |
texts = [item[0] for item in faiss_candidates]
|
| 381 |
|
|
|
|
| 382 |
if not reranker:
|
| 383 |
reranker = CrossEncoderReranker(model_name=self.config.cross_encoder_model)
|
| 384 |
|
| 385 |
+
# Re-rank the texts (candidates) from FAISS search using the cross-encoder
|
| 386 |
ce_logits = reranker.rerank(query, texts, max_length=256)
|
| 387 |
|
| 388 |
+
# Combine scores from FAISS and cross-encoder
|
| 389 |
final_candidates = []
|
| 390 |
for (resp_text, faiss_score), logit in zip(faiss_candidates, ce_logits):
|
| 391 |
+
ce_prob = sigmoid(logit) # now in range [0...1]
|
| 392 |
+
faiss_norm = (faiss_score + 1)/2.0 # now in range [0...1]
|
| 393 |
combined_score = 0.85 * ce_prob + 0.15 * faiss_norm
|
| 394 |
length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
|
| 395 |
|
|
|
|
| 401 |
# Return top_k
|
| 402 |
return final_candidates[:top_k]
|
| 403 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
def extract_keywords(self, query: str) -> List[str]:
|
| 405 |
"""
|
| 406 |
Return any domain keywords present in the query (lowercased).
|
| 407 |
"""
|
| 408 |
+
domain_keywords = {
|
| 409 |
+
'restaurant': ['restaurant', 'dining', 'food', 'dine', 'reservation', 'table', 'menu', 'cuisine', 'eat', 'place to eat', 'hungry', 'chef', 'dish', 'meal', 'brunch', 'bistro', 'buffet', 'catering', 'gourmet', 'fast food', 'fine dining', 'takeaway', 'delivery', 'restaurant booking'],
|
| 410 |
+
'movie': ['movie', 'cinema', 'film', 'ticket', 'showtime', 'showing', 'theater', 'flick', 'screening', 'film ticket', 'film show', 'blockbuster', 'premiere', 'trailer', 'director', 'actor', 'actress', 'plot', 'genre', 'screen', 'sequel', 'animation', 'documentary'],
|
| 411 |
+
'ride_share': ['ride', 'taxi', 'uber', 'lyft', 'car service', 'pickup', 'dropoff', 'driver', 'cab', 'hailing', 'rideshare', 'ride hailing', 'carpool', 'chauffeur', 'transit', 'transportation', 'hail ride'],
|
| 412 |
+
'coffee': ['coffee', 'cafΓ©', 'cafe', 'starbucks', 'espresso', 'latte', 'mocha', 'americano', 'barista', 'brew', 'cappuccino', 'macchiato', 'iced coffee', 'cold brew', 'espresso machine', 'coffee shop', 'tea', 'chai', 'java', 'bean', 'roast', 'decaf'],
|
| 413 |
+
'pizza': ['pizza', 'delivery', 'order food', 'pepperoni', 'topping', 'pizzeria', 'slice', 'pie', 'margherita', 'deep dish', 'thin crust', 'cheese', 'oven', 'tossed', 'sauce', 'garlic bread', 'calzone'],
|
| 414 |
+
'auto': ['car', 'vehicle', 'repair', 'maintenance', 'mechanic', 'oil change', 'garage', 'auto shop', 'tire', 'check engine', 'battery', 'transmission', 'brake', 'engine diagnostics', 'carwash', 'detail', 'alignment', 'exhaust', 'spark plug', 'dashboard'],
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
query_lower = query.lower()
|
| 418 |
found = set()
|
| 419 |
+
for domain, kw_list in domain_keywords.items():
|
| 420 |
for kw in kw_list:
|
| 421 |
if kw in query_lower:
|
| 422 |
found.add(kw)
|
|
|
|
| 442 |
|
| 443 |
def detect_domain_from_query(self, query: str) -> str:
|
| 444 |
"""
|
| 445 |
+
Detect the domain of the query based on keywords. Used for boosting FAISS search.
|
| 446 |
"""
|
| 447 |
domain_patterns = {
|
| 448 |
'restaurant': r'\b(restaurant|restaurants?|dining|food|foods?|dine|reservation|reservations?|table|tables?|menu|menus?|cuisine|cuisines?|eat|eats?|place\s?to\s?eat|places\s?to\s?eat|hungry|chef|chefs?|dish|dishes?|meal|meals?|fork|forks?|knife|knives?|spoon|spoons?|brunch|bistro|buffet|buffets?|catering|caterings?|gourmet|fast\s?food|fine\s?dining|takeaway|takeaways?|delivery|deliveries|restaurant\s?booking)\b',
|
|
|
|
| 462 |
|
| 463 |
def is_numeric_response(self, text: str) -> bool:
|
| 464 |
"""
|
| 465 |
+
Return True if `text` is purely digits and/or spaces.
|
|
|
|
| 466 |
"""
|
| 467 |
pattern = r'^[\s]*[\d]+([\s.,\d]+)*[\s]*$'
|
| 468 |
return bool(re.match(pattern, text.strip()))
|
|
|
|
| 471 |
self,
|
| 472 |
query: str,
|
| 473 |
domain: str = 'other',
|
| 474 |
+
top_k: int = 10,
|
| 475 |
+
boost_factor: float = 1.15
|
| 476 |
) -> List[Tuple[str, float]]:
|
| 477 |
"""
|
| 478 |
Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
|
|
|
|
| 479 |
Args:
|
| 480 |
query (str): The user input text.
|
| 481 |
+
domain (str): The detected domain from possible domains: ['restaurant', 'movie', 'ride_share', 'coffee', 'pizza', 'auto', 'other']
|
| 482 |
+
top_k (int): Number of top results to return.
|
| 483 |
+
boost_factor (float, optional): Factor to boost scores for keyword matches.
|
|
|
|
| 484 |
Returns:
|
| 485 |
List[Tuple[str, float]]: List of (response_text, similarity) sorted by descending similarity.
|
| 486 |
"""
|
|
|
|
| 491 |
# Search the index
|
| 492 |
distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10)
|
| 493 |
|
| 494 |
+
# IndexFlatIP: 'distances' are inner products (cosine similarities for normalized vectors).
|
| 495 |
candidates = []
|
| 496 |
for rank, idx in enumerate(indices[0]):
|
| 497 |
if idx < 0:
|
|
|
|
| 528 |
boosted = []
|
| 529 |
for (resp_text, resp_domain, score) in in_domain:
|
| 530 |
new_score = score
|
| 531 |
+
# If the domain is known AND the response text shares any query keywords, boost it
|
|
|
|
| 532 |
if query_keywords and any(kw in resp_text.lower() for kw in query_keywords):
|
| 533 |
new_score *= boost_factor
|
| 534 |
|
|
|
|
| 540 |
# Sort boosted responses
|
| 541 |
boosted.sort(key=lambda x: x[1], reverse=True)
|
| 542 |
|
| 543 |
+
# Debug logging (see FAISS responses)
|
| 544 |
# for resp, score in boosted[:100]:
|
| 545 |
# logger.debug(f"Candidate: '{resp}' with score {score}")
|
| 546 |
|
|
|
|
| 554 |
top_k: int = 10,
|
| 555 |
) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
|
| 556 |
"""
|
| 557 |
+
Live chat with the chatbot. Uses same processing flow as validation, except for context handling and quality checking.
|
|
|
|
| 558 |
"""
|
| 559 |
@self.run_on_device
|
| 560 |
def get_response(self_arg, query_arg):
|
|
|
|
| 562 |
conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
|
| 563 |
|
| 564 |
# Retrieve and re-rank
|
| 565 |
+
results = self_arg.retrieve_responses(
|
| 566 |
query=conversation_str,
|
| 567 |
top_k=top_k,
|
| 568 |
reranker=self_arg.reranker,
|
|
|
|
| 586 |
query: str,
|
| 587 |
conversation_history: Optional[List[Tuple[str, str]]]
|
| 588 |
) -> str:
|
| 589 |
+
"""
|
| 590 |
+
Build conversation context string from conversation history.
|
| 591 |
+
"""
|
| 592 |
if not conversation_history:
|
| 593 |
return f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
|
| 594 |
|
|
|
|
| 619 |
) -> None:
|
| 620 |
"""
|
| 621 |
Train the retrieval model using a pre-prepared TFRecord dataset.
|
|
|
|
| 622 |
- Checkpoint loading/restoring
|
| 623 |
- LR scheduling
|
| 624 |
- Epoch/iteration tracking
|
| 625 |
+
- Training-history logging
|
| 626 |
+
- Early stopping
|
| 627 |
+
- Custom loss function (Contrastive loss with hard negative sampling))
|
| 628 |
"""
|
| 629 |
logger.info("Starting training with pre-prepared TFRecord dataset...")
|
| 630 |
|
|
|
|
| 656 |
steps_per_epoch = math.ceil(train_size / batch_size)
|
| 657 |
val_steps = math.ceil(val_size / batch_size)
|
| 658 |
total_steps = steps_per_epoch * epochs
|
| 659 |
+
buffer_size = max(1, total_pairs // 2) # 50% of the dataset for shuffling
|
| 660 |
|
| 661 |
logger.info(f"Training pairs: {train_size}")
|
| 662 |
logger.info(f"Validation pairs: {val_size}")
|
|
|
|
| 678 |
self.optimizer = tf.keras.optimizers.Adam(learning_rate=tf.cast(peak_lr, tf.float32))
|
| 679 |
logger.info("Using fixed learning rate.")
|
| 680 |
|
| 681 |
+
# Dummy step to force initialization
|
| 682 |
dummy_input = tf.zeros((1, self.config.max_context_token_limit), dtype=tf.int32)
|
| 683 |
with tf.GradientTape() as tape:
|
| 684 |
dummy_output = self.encoder(dummy_input)
|
|
|
|
| 693 |
model=self.encoder
|
| 694 |
)
|
| 695 |
|
| 696 |
+
# Create a CheckpointManager
|
| 697 |
manager = tf.train.CheckpointManager(
|
| 698 |
checkpoint,
|
| 699 |
directory=checkpoint_dir,
|
|
|
|
| 701 |
checkpoint_name='ckpt'
|
| 702 |
)
|
| 703 |
|
| 704 |
+
# Restore from existing checkpoint if one is provided
|
| 705 |
latest_checkpoint = manager.latest_checkpoint
|
| 706 |
history_path = Path(checkpoint_dir) / 'training_history.json'
|
| 707 |
|
| 708 |
+
# Log epoch losses across runs, including restore from checkpoint
|
| 709 |
if not hasattr(self, 'history'):
|
| 710 |
self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
|
| 711 |
|
| 712 |
if latest_checkpoint and not test_mode:
|
| 713 |
+
# Debug checkpoint loading
|
| 714 |
+
# logger.info(f"\nTrying to load checkpoint from: {latest_checkpoint}")
|
| 715 |
+
# reader = tf.train.load_checkpoint(latest_checkpoint)
|
| 716 |
# shape_from_key = reader.get_variable_to_shape_map()
|
| 717 |
# dtype_from_key = reader.get_variable_to_dtype_map()
|
| 718 |
# logger.info("\nCheckpoint Variables:")
|
|
|
|
| 736 |
if initial_epoch == 0:
|
| 737 |
initial_epoch = ckpt_number
|
| 738 |
|
| 739 |
+
# Assign to checkpoint.epoch for counting
|
| 740 |
checkpoint.epoch.assign(tf.cast(initial_epoch, tf.int32))
|
| 741 |
logger.info(f"Resuming from epoch {initial_epoch}")
|
| 742 |
|
| 743 |
+
# Load history from file:
|
| 744 |
if history_path.exists():
|
| 745 |
try:
|
| 746 |
with open(history_path, 'r') as f:
|
|
|
|
| 749 |
except Exception as e:
|
| 750 |
logger.warning(f"Could not load history, starting fresh: {e}")
|
| 751 |
|
| 752 |
+
# Save custom weights not being saved in the full model.
|
| 753 |
+
# This was a bugfix to extract weights from a checkpoint without retraining.
|
| 754 |
+
# Before updating save_models, only Distilbert weights were being saved (custom layers were missed).
|
| 755 |
+
# Not needed, also not harmful.
|
| 756 |
self.save_models(Path(checkpoint_dir) / "pretrained_full_model")
|
| 757 |
logger.info(f"Manually saved custom weights after restore.")
|
| 758 |
else:
|
|
|
|
| 769 |
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
|
| 770 |
val_summary_writer = tf.summary.create_file_writer(val_log_dir)
|
| 771 |
logger.info(f"TensorBoard logs will be saved in {log_dir}")
|
| 772 |
+
|
| 773 |
# Parse dataset
|
| 774 |
dataset = tf.data.TFRecordDataset(tfrecord_file_path)
|
| 775 |
+
|
| 776 |
+
# Debug mode uses small subset. Useful for CPU debugging.
|
| 777 |
if test_mode:
|
| 778 |
+
subset_size = 200
|
| 779 |
dataset = dataset.take(subset_size)
|
| 780 |
logger.info(f"TEST MODE: Using only {subset_size} examples")
|
| 781 |
# Recompute sizes, steps, epochs, etc., as needed
|
|
|
|
| 791 |
early_stopping_patience = 2
|
| 792 |
logger.info(f"New training pairs: {train_size}")
|
| 793 |
logger.info(f"New validation pairs: {val_size}")
|
| 794 |
+
|
| 795 |
dataset = dataset.map(
|
| 796 |
+
lambda x: parse_tfrecord_fn(x, self.config.max_context_token_limit, self.data_pipeline.neg_samples),
|
| 797 |
num_parallel_calls=tf.data.AUTOTUNE
|
| 798 |
)
|
| 799 |
+
|
| 800 |
# Train/val split
|
| 801 |
train_dataset = dataset.take(train_size)
|
| 802 |
val_dataset = dataset.skip(train_size).take(val_size)
|
| 803 |
+
|
| 804 |
# Shuffle and batch
|
| 805 |
train_dataset = train_dataset.shuffle(buffer_size=buffer_size)
|
| 806 |
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
|
| 807 |
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
|
| 808 |
+
|
| 809 |
val_dataset = val_dataset.batch(batch_size, drop_remainder=False)
|
| 810 |
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
|
| 811 |
val_dataset = val_dataset.cache()
|
| 812 |
+
|
| 813 |
# Training loop
|
| 814 |
best_val_loss = float("inf")
|
| 815 |
epochs_no_improve = 0
|
| 816 |
+
|
| 817 |
for epoch in range(int(checkpoint.epoch.numpy()) + 1, epochs + 1):
|
| 818 |
checkpoint.epoch.assign(epoch)
|
| 819 |
logger.info(f"Starting Epoch {epoch}...")
|
| 820 |
+
|
|
|
|
| 821 |
epoch_loss_avg = tf.keras.metrics.Mean(dtype=tf.float32)
|
| 822 |
batches_processed = 0
|
| 823 |
+
|
|
|
|
| 824 |
try:
|
| 825 |
train_pbar = tqdm(
|
| 826 |
total=steps_per_epoch,
|
|
|
|
| 831 |
except ImportError:
|
| 832 |
train_pbar = None
|
| 833 |
is_tqdm_train = False
|
| 834 |
+
|
| 835 |
+
# --- Training ---
|
| 836 |
for q_batch, p_batch, n_batch in train_dataset:
|
| 837 |
loss, grad_norm, post_clip_norm = self.train_step(q_batch, p_batch, n_batch)
|
| 838 |
epoch_loss_avg(loss)
|
|
|
|
| 860 |
"lr": f"{current_lr:.2e}",
|
| 861 |
"batches": f"{batches_processed}/{steps_per_epoch}"
|
| 862 |
})
|
| 863 |
+
|
| 864 |
gc.collect()
|
| 865 |
+
|
| 866 |
# End the epoch early if we've processed all steps
|
| 867 |
if batches_processed >= steps_per_epoch:
|
| 868 |
break
|
| 869 |
+
|
| 870 |
if is_tqdm_train and train_pbar:
|
| 871 |
train_pbar.close()
|
| 872 |
+
|
| 873 |
+
# --- Validation ---
|
| 874 |
val_loss_avg = tf.keras.metrics.Mean(dtype=tf.float32)
|
| 875 |
val_batches_processed = 0
|
| 876 |
+
|
| 877 |
try:
|
| 878 |
val_pbar = tqdm(total=val_steps, desc="Validation", unit="batch")
|
| 879 |
is_tqdm_val = True
|
| 880 |
except ImportError:
|
| 881 |
val_pbar = None
|
| 882 |
is_tqdm_val = False
|
| 883 |
+
|
| 884 |
last_valid_val_loss = None
|
| 885 |
valid_batches = False
|
| 886 |
+
|
| 887 |
for q_batch, p_batch, n_batch in val_dataset:
|
| 888 |
# If batch is too small, skip
|
| 889 |
if tf.shape(q_batch)[0] < 2:
|
| 890 |
logger.warning(f"Skipping validation batch of size {tf.shape(q_batch)[0]}")
|
| 891 |
continue
|
| 892 |
+
|
| 893 |
valid_batches = True
|
| 894 |
val_loss = self.validation_step(q_batch, p_batch, n_batch)
|
| 895 |
val_loss_avg(val_loss)
|
| 896 |
last_valid_val_loss = val_loss
|
| 897 |
val_batches_processed += 1
|
| 898 |
+
|
| 899 |
if is_tqdm_val:
|
| 900 |
val_pbar.update(1)
|
| 901 |
val_pbar.set_postfix({
|
| 902 |
"val_loss": f"{val_loss.numpy():.4f}",
|
| 903 |
"batches": f"{val_batches_processed}/{val_steps}"
|
| 904 |
})
|
| 905 |
+
|
| 906 |
gc.collect()
|
| 907 |
+
|
| 908 |
if val_batches_processed >= val_steps:
|
| 909 |
break
|
| 910 |
+
|
| 911 |
if not valid_batches:
|
| 912 |
# If no valid batch is found, fallback
|
| 913 |
logger.warning("No valid validation batches in this epoch")
|
|
|
|
| 917 |
else:
|
| 918 |
val_loss = epoch_loss_avg.result()
|
| 919 |
val_loss_avg(val_loss)
|
| 920 |
+
|
| 921 |
if is_tqdm_val and val_pbar:
|
| 922 |
val_pbar.close()
|
| 923 |
+
|
| 924 |
# End of epoch: final stats
|
| 925 |
train_loss = epoch_loss_avg.result().numpy()
|
| 926 |
val_loss = val_loss_avg.result().numpy()
|
| 927 |
logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
|
| 928 |
+
|
| 929 |
# TensorBoard epoch logs
|
| 930 |
with train_summary_writer.as_default():
|
| 931 |
tf.summary.scalar("epoch_loss", train_loss, step=epoch)
|
| 932 |
with val_summary_writer.as_default():
|
| 933 |
tf.summary.scalar("val_loss", val_loss, step=epoch)
|
| 934 |
+
|
| 935 |
# Save checkpoint
|
| 936 |
manager.save()
|
| 937 |
+
|
| 938 |
+
# Save model for iterative testing/inference
|
| 939 |
model_save_path = Path(checkpoint_dir) / f"model_epoch_{epoch}"
|
| 940 |
self.save_models(model_save_path)
|
| 941 |
logger.info(f"Saved model for epoch {epoch} at {model_save_path}")
|
| 942 |
+
|
| 943 |
# Update local history
|
| 944 |
self.history['train_loss'].append(train_loss)
|
| 945 |
self.history['val_loss'].append(val_loss)
|
|
|
|
| 958 |
return obj
|
| 959 |
|
| 960 |
json_history = convert_to_py_floats(self.history)
|
| 961 |
+
|
| 962 |
# Save training history to file every epoch
|
|
|
|
| 963 |
with open(history_path, 'w') as f:
|
| 964 |
json.dump(json_history, f)
|
| 965 |
logger.info(f"Saved training history to {history_path}")
|
| 966 |
+
|
| 967 |
# Early stopping
|
| 968 |
if val_loss < best_val_loss - min_delta:
|
| 969 |
best_val_loss = val_loss
|
|
|
|
| 975 |
if epochs_no_improve >= early_stopping_patience:
|
| 976 |
logger.info("Early stopping triggered.")
|
| 977 |
break
|
| 978 |
+
|
| 979 |
logger.info("Training completed!")
|
| 980 |
|
| 981 |
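For reference, the early-stopping rule above reduced to a standalone sketch. The loss values, min_delta, and patience below are made-up stand-ins for the trainer's arguments:

# Hypothetical, self-contained illustration of the early-stopping bookkeeping.
best_val_loss = float("inf")
epochs_no_improve = 0
min_delta = 1e-4              # assumed improvement threshold
early_stopping_patience = 3   # assumed patience

for epoch, val_loss in enumerate([0.92, 0.85, 0.86, 0.87, 0.88], start=1):
    if val_loss < best_val_loss - min_delta:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
    if epochs_no_improve >= early_stopping_patience:
        print(f"Early stopping triggered at epoch {epoch}")  # fires at epoch 5 here
        break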
    @tf.function
    ...
        """
        Single training step using queries, positives, and hard negatives.
        """
        with tf.GradientTape() as tape:
            # Encode queries, positives, and negatives
            q_enc = self.encoder(q_batch, training=True)   # [batch_size, embed_dim]
            p_enc = self.encoder(p_batch, training=True)   # [batch_size, embed_dim]

            shape = tf.shape(n_batch)
            bs = shape[0]
            neg_samples = shape[1]

            # Flatten negatives to feed them in one pass: [batch_size * neg_samples, max_length]
            n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
            n_enc_flat = self.encoder(n_batch_flat, training=True)   # [bs*neg_samples, embed_dim]

            # Reshape back => [batch_size, neg_samples, embed_dim]
            n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])

            # Combine the positive and negative embeddings along dim=1:
            # shape [batch_size, 1 + neg_samples, embed_dim].
            # Column 0 is the positive; the remaining columns are the negatives.
            combined_p_n = tf.concat([tf.expand_dims(p_enc, axis=1), n_enc], axis=1)

            # Compute scores: dot product of q_enc with each column in combined_p_n.
            # `tf.einsum` handles the batch dimension.
            dot_products = tf.cast(tf.einsum('bd,bkd->bk', q_enc, combined_p_n), tf.float32)
            labels = tf.zeros([bs], dtype=tf.int32)   # Keep labels as int32
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                ...
            )
            loss = tf.cast(tf.reduce_mean(loss), tf.float32)

        # Calculate gradients and clip
        gradients = tape.gradient(loss, self.encoder.trainable_variables)
        gradients_norm = tf.cast(tf.linalg.global_norm(gradients), tf.float32)
        max_grad_norm = tf.constant(1.5, dtype=tf.float32)
        gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm, gradients_norm)
        post_clip_norm = tf.cast(tf.linalg.global_norm(gradients), tf.float32)

        self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))

        return loss, gradients_norm, post_clip_norm
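To make the shapes concrete, here is a toy, self-contained version of the scoring and loss above. Random tensors stand in for real encoder outputs; only the einsum pattern and the all-zero label convention mirror the code:

import tensorflow as tf

bs, neg_samples, embed_dim = 2, 3, 4
q_enc = tf.random.normal([bs, embed_dim])                # queries
p_enc = tf.random.normal([bs, embed_dim])                # positives
n_enc = tf.random.normal([bs, neg_samples, embed_dim])   # hard negatives

# Column 0 is the positive; columns 1..neg_samples are negatives.
combined_p_n = tf.concat([tf.expand_dims(p_enc, axis=1), n_enc], axis=1)

# logits[b, k] = dot(q_enc[b], combined_p_n[b, k]) -> shape [bs, 1 + neg_samples]
logits = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)

# The positive always sits at index 0, so every label is 0.
labels = tf.zeros([bs], dtype=tf.int32)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
print(float(loss))  # softmax cross-entropy over 1 positive vs. neg_samples negatives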
    ...
    ) -> tf.Tensor:
        """
        Single validation step using queries, positives, and hard negatives.
        Same idea as train_step, but without gradient updates.
        """
        q_enc = self.encoder(q_batch, training=False)
        p_enc = self.encoder(p_batch, training=False)
        ...
        )

        dot_products = tf.cast(tf.einsum('bd,bkd->bk', q_enc, combined_p_n), tf.float32)
        labels = tf.zeros([bs], dtype=tf.int32)

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels,
            ...
        ...
        peak_lr: float,
        warmup_steps: int
    ) -> tf.keras.optimizers.schedules.LearningRateSchedule:
        """
        Custom learning rate schedule with warmup and cosine decay.
        """
        class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
            def __init__(
                self,
                ...
                self.total_steps = tf.cast(total_steps, tf.float32)
                self.peak_lr = tf.cast(peak_lr, tf.float32)

                # Cap warmup_steps at 10% of total_steps
                adjusted_warmup_steps = min(warmup_steps, max(1, total_steps // 10))
                self.warmup_steps = tf.cast(adjusted_warmup_steps, tf.float32)

                # Calculate constants
                self.initial_lr = tf.cast(self.peak_lr * 0.1, tf.float32)
                self.min_lr = tf.cast(self.peak_lr * 0.01, tf.float32)
                ...
            def __call__(self, step):
                step = tf.cast(step, tf.float32)

                # Warmup
                warmup_factor = tf.cast(tf.minimum(1.0, step / self.warmup_steps), tf.float32)
                warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor

                # Cosine decay
                decay_steps = tf.cast(tf.maximum(1.0, self.total_steps - self.warmup_steps), tf.float32)
                decay_factor = tf.cast((step - self.warmup_steps) / decay_steps, tf.float32)
                decay_factor = tf.cast(tf.minimum(tf.maximum(0.0, decay_factor), 1.0), tf.float32)
                cosine_decay = tf.cast(0.5 * (1.0 + tf.cos(tf.constant(math.pi, dtype=tf.float32) * decay_factor)), tf.float32)
                decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay

                final_lr = tf.where(step < self.warmup_steps, warmup_lr, decay_lr)

                # Ensure the learning rate stays finite and above the floor
                final_lr = tf.maximum(self.min_lr, final_lr)
                final_lr = tf.where(tf.math.is_finite(final_lr), final_lr, self.min_lr)
                ...
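As a sanity check on the schedule's shape, the same arithmetic in plain Python, using a hypothetical peak_lr and step count (not values from this repo):

import math

peak_lr = 1e-3          # hypothetical peak learning rate
total_steps = 1000      # hypothetical total training steps
warmup_steps = min(200, max(1, total_steps // 10))  # capped at 10%, as above
initial_lr, min_lr = peak_lr * 0.1, peak_lr * 0.01

def lr_at(step: int) -> float:
    if step < warmup_steps:
        # Linear warmup from initial_lr to peak_lr
        return initial_lr + (peak_lr - initial_lr) * min(1.0, step / warmup_steps)
    decay_steps = max(1.0, total_steps - warmup_steps)
    decay_factor = min(max(0.0, (step - warmup_steps) / decay_steps), 1.0)
    cosine = 0.5 * (1.0 + math.cos(math.pi * decay_factor))
    return max(min_lr, min_lr + (peak_lr - min_lr) * cosine)

for s in (0, 50, 100, 500, 1000):
    print(s, f"{lr_at(s):.6f}")  # ramps to peak at step 100, decays to min_lr at 1000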
chatbot_validator.py
CHANGED

@@ -113,7 +113,7 @@ class ChatbotValidator:
         logger.info(f"\nTest Case {i}: {query}")

         # Retrieve top_k responses, then evaluate with quality checker
-        responses = self.chatbot.
+        responses = self.chatbot.retrieve_responses(query, top_k=top_k, reranker=reranker)
         quality_metrics = self.quality_checker.check_response_quality(query, responses)

         # Aggregate metrics and log
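The call pattern introduced here, shown end to end with stub objects. Everything below is hypothetical (names, scores, thresholds); only the retrieve-then-check flow matches the diff:

class StubChatbot:
    def retrieve_responses(self, query, top_k=10, reranker=None):
        # Pretend retrieval: (response, score) pairs sorted by score.
        candidates = [("Try resetting it from the login page.", 0.82),
                      ("Our store hours are 9 to 5.", 0.31)]
        return candidates[:top_k]

class StubQualityChecker:
    def check_response_quality(self, query, responses):
        top_score = responses[0][1] if responses else 0.0
        return {"is_confident": top_score > 0.5, "top_score": top_score}

chatbot, quality_checker = StubChatbot(), StubQualityChecker()
responses = chatbot.retrieve_responses("How do I reset my password?", top_k=5, reranker=None)
print(quality_checker.check_response_quality("How do I reset my password?", responses))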
{data_augmentation → data_augmentation_code}/augmentation_processing_pipeline.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/back_translator.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/dialogue_augmenter.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/main.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/paraphraser.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/pipeline_config.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/quality_metrics.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/schema_guided_dialogue_processor.py
RENAMED
File without changes

{data_augmentation → data_augmentation_code}/taskmaster_processor.py
RENAMED
File without changes
validate_model.py → run_chatbot_validation.py
RENAMED

@@ -39,7 +39,7 @@ def run_interactive_chat(chatbot, quality_checker):
     else:
         print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")

-def validate_chatbot():
+def run_chatbot_validation():
     # Initialize environment
     env = EnvironmentSetup()
     env.initialize()

@@ -86,15 +86,15 @@ def validate_chatbot():
     try:
         chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
         logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
-        logger.info("FAISS dimensions:
-        logger.info("FAISS index type:
-        logger.info("FAISS index total vectors:
-        logger.info("FAISS is_trained:
+        logger.info(f"FAISS dimensions: {chatbot.data_pipeline.index.d}")
+        logger.info(f"FAISS index type: {type(chatbot.data_pipeline.index)}")
+        logger.info(f"FAISS index total vectors: {chatbot.data_pipeline.index.ntotal}")
+        logger.info(f"FAISS is_trained: {chatbot.data_pipeline.index.is_trained}")

         with open(RESPONSE_POOL_PATH, "r", encoding="utf-8") as f:
             chatbot.data_pipeline.response_pool = json.load(f)
         logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
-        logger.info("\nTotal responses in pool:
+        logger.info(f"\nTotal responses in pool: {len(chatbot.data_pipeline.response_pool)}")

         # Validate dimension consistency
         chatbot.data_pipeline.validate_faiss_index()

@@ -130,4 +130,4 @@ def validate_chatbot():
     run_interactive_chat(chatbot, quality_checker)

 if __name__ == "__main__":
-    validate_chatbot()
+    run_chatbot_validation()
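The four attributes logged above are standard FAISS index properties. A tiny throwaway example (arbitrary dimension, random vectors, not this project's index) shows what each reports:

import faiss
import numpy as np

dim = 8
index = faiss.IndexFlatIP(dim)                      # inner-product index
index.add(np.random.rand(3, dim).astype("float32")) # FAISS expects float32

print(index.d)           # vector dimensionality -> 8
print(type(index))       # the concrete index class
print(index.ntotal)      # number of stored vectors -> 3
print(index.is_trained)  # True: flat indices need no training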
tf_data_pipeline.py
CHANGED

@@ -24,19 +24,19 @@ class TFDataPipeline:
         config,
         tokenizer,
         encoder,
-        index_file_path: str,
         response_pool: List[str],
-        max_length: int,
         query_embeddings_cache: dict,
-        neg_samples: int,
+        max_length: int = 512,
+        neg_samples: int = 10,
         index_type: str = 'IndexFlatIP',
+        faiss_index_file_path: str = 'new_iteration/data_prep_iterative_models/faiss_indices/faiss_index_production.index',
         nlist: int = 100,
         max_retries: int = 3
     ):
         self.config = config
         self.tokenizer = tokenizer
         self.encoder = encoder
-        self.
+        self.faiss_index_file_path = faiss_index_file_path
         self.response_pool = response_pool
         self.max_length = max_length
         self.neg_samples = neg_samples

@@ -53,9 +53,9 @@ class TFDataPipeline:
         self.build_text_to_domain_map()

         # Initialize FAISS index
-        if os.path.exists(
-            logger.info(f"Loading existing FAISS index from {
-            self.index = faiss.read_index(
+        if os.path.exists(faiss_index_file_path):
+            logger.info(f"Loading existing FAISS index from {faiss_index_file_path}...")
+            self.index = faiss.read_index(faiss_index_file_path)
             self.validate_faiss_index()
             logger.info("FAISS index loaded and validated successfully.")
         else:

@@ -83,18 +83,18 @@ class TFDataPipeline:
             self.query_embeddings_cache[query] = hf[query][:]
         logger.info(f"Embeddings cache loaded from {cache_file_path}.")

-    def save_faiss_index(self,
-        faiss.write_index(self.index,
-        logger.info(f"FAISS index saved to {
+    def save_faiss_index(self, faiss_index_file_path: str):
+        faiss.write_index(self.index, faiss_index_file_path)
+        logger.info(f"FAISS index saved to {faiss_index_file_path}")

-    def load_faiss_index(self,
+    def load_faiss_index(self, faiss_index_file_path: str):
         """Load FAISS index from specified file path."""
-        if os.path.exists(
-            self.index = faiss.read_index(
-            logger.info(f"FAISS index loaded from {
+        if os.path.exists(faiss_index_file_path):
+            self.index = faiss.read_index(faiss_index_file_path)
+            logger.info(f"FAISS index loaded from {faiss_index_file_path}.")
         else:
-            logger.error(f"FAISS index file not found at {
-            raise FileNotFoundError(f"FAISS index file not found at {
+            logger.error(f"FAISS index file not found at {faiss_index_file_path}.")
+            raise FileNotFoundError(f"FAISS index file not found at {faiss_index_file_path}.")

     def validate_faiss_index(self):
         """Validates FAISS index dimensionality."""
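A minimal round trip through faiss.write_index / faiss.read_index, mirroring what save_faiss_index and load_faiss_index wrap above. The temporary path, dimension, and random data are assumptions for illustration:

import os
import tempfile

import faiss
import numpy as np

dim = 8
index = faiss.IndexFlatIP(dim)   # inner-product index, the pipeline's default type
index.add(np.random.rand(5, dim).astype("float32"))

path = os.path.join(tempfile.gettempdir(), "example_faiss.index")
faiss.write_index(index, path)        # what save_faiss_index wraps

if os.path.exists(path):              # same guard load_faiss_index uses
    restored = faiss.read_index(path)
    assert restored.ntotal == 5 and restored.d == dim
else:
    raise FileNotFoundError(f"FAISS index file not found at {path}.")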