JoeArmani committed
Commit f5346f7 · 1 Parent(s): 74af405

updating training process
Files changed (3)
  1. chatbot_model.py +368 -78
  2. run_model_train.py +7 -6
  3. tf_data_pipeline.py +25 -1
chatbot_model.py CHANGED
@@ -229,7 +229,6 @@ class RetrievalChatbot(DeviceAwareModel):
229
  self.encoder = EncoderModel(
230
  self.config,
231
  name="shared_encoder",
232
- shared_weights=True # If weight sharing is intended
233
  )
234
 
235
  # Resize token embeddings after adding special tokens
@@ -875,37 +874,35 @@ class RetrievalChatbot(DeviceAwareModel):
875
  logger.info(f"Models and tokenizer loaded from {load_dir}.")
876
  return chatbot
877
 
878
- # @staticmethod
879
- # def load_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
880
- # """
881
- # Load training data from a JSON file.
882
-
883
- # Args:
884
- # data_path (Union[str, Path]): Path to the JSON file containing dialogues.
885
- # debug_samples (Optional[int]): Number of samples to load for debugging.
886
-
887
- # Returns:
888
- # List[dict]: List of dialogue dictionaries.
889
- # """
890
- # logger.info(f"Loading training data from {data_path}...")
891
- # data_path = Path(data_path)
892
- # if not data_path.exists():
893
- # logger.error(f"Data file {data_path} does not exist.")
894
- # return []
895
-
896
- # with open(data_path, 'r', encoding='utf-8') as f:
897
- # dialogues = json.load(f)
898
-
899
- # if debug_samples is not None:
900
- # dialogues = dialogues[:debug_samples]
901
- # logger.info(f"Debug mode: Limited to {debug_samples} dialogues")
902
-
903
- # logger.info(f"Loaded {len(dialogues)} dialogues.")
904
- # return dialogues
905
 
906
  def train_streaming(
907
  self,
908
- dialogues: List[dict],
909
  epochs: int = 20,
910
  batch_size: int = 16,
911
  validation_split: float = 0.2,
@@ -915,31 +912,23 @@ class RetrievalChatbot(DeviceAwareModel):
915
  warmup_steps_ratio: float = 0.1,
916
  early_stopping_patience: int = 3,
917
  min_delta: float = 1e-4,
918
- neg_samples: int = 1
919
  ) -> None:
920
- """Streaming training with tf.data pipeline."""
921
- logger.info("Starting streaming training pipeline with tf.data...")
922
-
923
- # Initialize TFDataPipeline (replaces StreamingDataPipeline)
924
- dataset_preparer = TFDataPipeline(
925
- embedding_batch_size=self.config.embedding_batch_size,
926
- tokenizer=self.tokenizer,
927
- encoder=self.encoder,
928
- index=self.index, # Pass CPU version of FAISS index
929
- response_pool=self.response_pool,
930
- max_length=self.config.max_context_token_limit,
931
- neg_samples=neg_samples
932
- )
933
 
934
  # Calculate total steps for learning rate schedule
935
 - total_pairs = dataset_preparer.estimate_total_pairs(dialogues)
936
  train_size = int(total_pairs * (1 - validation_split))
937
- val_size = int(total_pairs * validation_split)
938
- steps_per_epoch = int(math.ceil(train_size / batch_size))
939
- val_steps = int(math.ceil(val_size / batch_size))
940
  total_steps = steps_per_epoch * epochs
941
 
942
- logger.info(f"Total pairs: {total_pairs}")
943
  logger.info(f"Training pairs: {train_size}")
944
  logger.info(f"Validation pairs: {val_size}")
945
  logger.info(f"Steps per epoch: {steps_per_epoch}")
@@ -974,9 +963,19 @@ class RetrievalChatbot(DeviceAwareModel):
974
  val_summary_writer = tf.summary.create_file_writer(val_log_dir)
975
  logger.info(f"TensorBoard logs will be saved in {log_dir}")
976
 
977
- # Create training and validation datasets
978
- train_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).take(train_size)
979
 - val_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).skip(train_size).take(val_size)
980
 
981
  # Training loop
982
  best_val_loss = float("inf")
@@ -996,7 +995,6 @@ class RetrievalChatbot(DeviceAwareModel):
996
  logger.info("Training progress bar disabled")
997
 
998
  for q_batch, p_batch, n_batch in train_dataset:
999
- #p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
1000
  loss = self.train_step(q_batch, p_batch, n_batch)
1001
  epoch_loss_avg(loss)
1002
  batches_processed += 1
@@ -1018,7 +1016,7 @@ class RetrievalChatbot(DeviceAwareModel):
1018
  "lr": f"{current_lr:.2e}",
1019
  "batches": f"{batches_processed}/{steps_per_epoch}"
1020
  })
1021
-
1022
  # Memory cleanup
1023
  gc.collect()
1024
 
@@ -1041,7 +1039,6 @@ class RetrievalChatbot(DeviceAwareModel):
1041
  logger.info("Validation progress bar disabled")
1042
 
1043
  for q_batch, p_batch, n_batch in val_dataset:
1044
- #p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
1045
  val_loss = self.validation_step(q_batch, p_batch, n_batch)
1046
  val_loss_avg(val_loss)
1047
  val_batches_processed += 1
@@ -1052,11 +1049,10 @@ class RetrievalChatbot(DeviceAwareModel):
1052
  "val_loss": f"{val_loss.numpy():.4f}",
1053
  "batches": f"{val_batches_processed}/{val_steps}"
1054
  })
1055
-
1056
  # Memory cleanup
1057
  gc.collect()
1058
 
1059
-
1060
  if val_batches_processed >= val_steps:
1061
  break
1062
 
@@ -1100,21 +1096,17 @@ class RetrievalChatbot(DeviceAwareModel):
1100
  logger.info("Early stopping triggered.")
1101
  break
1102
 
1103
- logger.info("Streaming training completed!")
1104
-
1105
-
1106
  @tf.function
1107
  def train_step(
1108
  self,
1109
  q_batch: tf.Tensor,
1110
  p_batch: tf.Tensor,
1111
- n_batch: tf.Tensor,
1112
- attention_mask: Optional[tf.Tensor] = None
1113
  ) -> tf.Tensor:
1114
  """
1115
- Single training step that uses queries, positives, and negatives in a
1116
- contrastive/InfoNCE style. The label is always 0 (the positive) vs.
1117
- the negative alternatives.
1118
  """
1119
  with tf.GradientTape() as tape:
1120
  # Encode queries
@@ -1160,12 +1152,6 @@ class RetrievalChatbot(DeviceAwareModel):
1160
  )
1161
  loss = tf.reduce_mean(loss)
1162
 
1163
- # If there's an attention_mask you want to apply (less common in this scenario),
1164
- # you could do something like:
1165
- if attention_mask is not None:
1166
- loss = loss * attention_mask
1167
- loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
1168
-
1169
  # Apply gradients
1170
  gradients = tape.gradient(loss, self.encoder.trainable_variables)
1171
  self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
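Note on the attention_mask branch removed in this hunk: it rescaled a loss that tf.reduce_mean had already collapsed to a scalar, so the mask could not actually weight individual examples. If per-example weighting were ever reintroduced, the usual pattern is to apply the weights before the reduction. A minimal sketch, assuming a hypothetical example_weights float tensor of shape [batch_size] (not part of this commit), with labels and dot_products as computed in train_step:

    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels,          # [batch_size], all zeros: the positive sits in column 0
        logits=dot_products     # [batch_size, 1 + neg_samples]
    )                           # -> [batch_size]
    if example_weights is not None:   # hypothetical per-example weights, not in this commit
        weights = tf.cast(example_weights, per_example_loss.dtype)
        loss = tf.reduce_sum(per_example_loss * weights) / tf.maximum(tf.reduce_sum(weights), 1e-8)
    else:
        loss = tf.reduce_mean(per_example_loss)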
@@ -1176,12 +1162,10 @@ class RetrievalChatbot(DeviceAwareModel):
1176
  self,
1177
  q_batch: tf.Tensor,
1178
  p_batch: tf.Tensor,
1179
- n_batch: tf.Tensor,
1180
- attention_mask: Optional[tf.Tensor] = None
1181
  ) -> tf.Tensor:
1182
  """
1183
- Single validation step with queries, positives, and negatives.
1184
- Uses the same loss calculation as train_step, but `training=False`.
1185
  """
1186
  q_enc = self.encoder(q_batch, training=False)
1187
  p_enc = self.encoder(p_batch, training=False)
@@ -1208,11 +1192,317 @@ class RetrievalChatbot(DeviceAwareModel):
1208
  )
1209
  loss = tf.reduce_mean(loss)
1210
 
1211
- if attention_mask is not None:
1212
- loss = loss * attention_mask
1213
- loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
1214
-
1215
  return loss
1216
 
1217
  def _get_lr_schedule(
1218
  self,
 
229
  self.encoder = EncoderModel(
230
  self.config,
231
  name="shared_encoder",
 
232
  )
233
 
234
  # Resize token embeddings after adding special tokens
 
874
  logger.info(f"Models and tokenizer loaded from {load_dir}.")
875
  return chatbot
876
 
877
+ def parse_tfrecord_fn(example_proto, max_length, neg_samples):
878
+ """
879
+ Parses a single TFRecord example.
880
+
881
+ Args:
882
+ example_proto: A serialized TFRecord example.
883
+ max_length: The maximum sequence length for tokenization.
884
+ neg_samples: The number of hard negatives per query.
885
+
886
+ Returns:
887
+ A tuple of (query_ids, positive_ids, negative_ids).
888
+ """
889
+ feature_description = {
890
+ 'query_ids': tf.io.FixedLenFeature([max_length], tf.int64),
891
+ 'positive_ids': tf.io.FixedLenFeature([max_length], tf.int64),
892
+ 'negative_ids': tf.io.FixedLenFeature([neg_samples * max_length], tf.int64),
893
+ }
894
+ parsed_features = tf.io.parse_single_example(example_proto, feature_description)
895
+
896
+ query_ids = tf.cast(parsed_features['query_ids'], tf.int32)
897
+ positive_ids = tf.cast(parsed_features['positive_ids'], tf.int32)
898
+ negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
899
+ negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
900
+
901
+ return query_ids, positive_ids, negative_ids
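parse_tfrecord_fn is defined here without a self parameter but is later invoked as self.parse_tfrecord_fn(...); decorating it with @staticmethod (or adding self) would be needed for that call site to bind cleanly. For reference, a hedged sketch of the writer side that produces records this parser expects; the helper name is illustrative, and only the feature names, dtypes, and fixed lengths have to match:

    def write_training_example(writer, query_ids, positive_ids, negative_ids):
        # query_ids, positive_ids: int sequences of length max_length
        # negative_ids: int array of shape [neg_samples, max_length]
        feature = {
            'query_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=list(query_ids))),
            'positive_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=list(positive_ids))),
            'negative_ids': tf.train.Feature(int64_list=tf.train.Int64List(
                value=[int(t) for row in negative_ids for t in row])),
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

    # Illustrative usage:
    # with tf.io.TFRecordWriter('training_data/training_data.tfrecord') as writer:
    #     write_training_example(writer, q_ids, p_ids, n_ids)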
 
 
902
 
903
  def train_streaming(
904
  self,
905
+ tfrecord_file_path: str,
906
  epochs: int = 20,
907
  batch_size: int = 16,
908
  validation_split: float = 0.2,
 
912
  warmup_steps_ratio: float = 0.1,
913
  early_stopping_patience: int = 3,
914
  min_delta: float = 1e-4,
 
915
  ) -> None:
916
+ """Training using a pre-prepared TFRecord dataset."""
917
+ logger.info("Starting training with pre-prepared TFRecord dataset...")
918
 
919
  # Calculate total steps for learning rate schedule
920
+ # Estimate total pairs by counting the number of records in the TFRecord
921
+ # Assuming each record corresponds to one pair
922
+ raw_dataset = tf.data.TFRecordDataset(tfrecord_file_path)
923
+ total_pairs = sum(1 for _ in raw_dataset)
924
+ logger.info(f"Total pairs in TFRecord: {total_pairs}")
925
+
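Counting records with sum(1 for _ in raw_dataset) forces a full pass over the TFRecord file before training begins, and tf.data reports an unknown cardinality for TFRecord files, so there is no cheap query for the count. A hedged alternative, assuming the data-preparation script can also write a small sidecar count file (the .count suffix is an assumption, not part of this commit):

    from pathlib import Path

    count_path = Path(str(tfrecord_file_path) + '.count')  # hypothetical sidecar written at prep time
    if count_path.exists():
        total_pairs = int(count_path.read_text().strip())
    else:
        total_pairs = sum(1 for _ in tf.data.TFRecordDataset(tfrecord_file_path))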
926
  train_size = int(total_pairs * (1 - validation_split))
927
+ val_size = total_pairs - train_size
928
+ steps_per_epoch = math.ceil(train_size / batch_size)
929
+ val_steps = math.ceil(val_size / batch_size)
930
  total_steps = steps_per_epoch * epochs
931
 
 
932
  logger.info(f"Training pairs: {train_size}")
933
  logger.info(f"Validation pairs: {val_size}")
934
  logger.info(f"Steps per epoch: {steps_per_epoch}")
 
963
  val_summary_writer = tf.summary.create_file_writer(val_log_dir)
964
  logger.info(f"TensorBoard logs will be saved in {log_dir}")
965
 
966
+ # Define the parsing function with the appropriate max_length and neg_samples
967
+ parse_fn = lambda x: self.parse_tfrecord_fn(x, self.config.max_context_token_limit, self.config.neg_samples)
968
+
969
+ # Create the full dataset
970
+ dataset = tf.data.TFRecordDataset(tfrecord_file_path)
971
+ dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
972
+ dataset = dataset.shuffle(buffer_size=10000) # Shuffle buffer size; adjust to dataset size and available memory
973
+ dataset = dataset.batch(batch_size, drop_remainder=True)
974
+ dataset = dataset.prefetch(tf.data.AUTOTUNE)
975
+
976
+ # Split into training and validation
977
+ train_dataset = dataset.take(train_size)
978
+ val_dataset = dataset.skip(train_size).take(val_size)
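One caveat with the split above: take and skip run after batch(batch_size), so they count batches, while train_size and val_size were computed in pairs; and because shuffle precedes the split, the two subsets are reshuffled each epoch and are not guaranteed to stay disjoint. A possible rearrangement (a sketch only, reusing parse_fn, val_size, and batch_size from above) splits on parsed examples before shuffling:

    base = tf.data.TFRecordDataset(tfrecord_file_path).map(
        parse_fn, num_parallel_calls=tf.data.AUTOTUNE)

    val_dataset = (base.take(val_size)                      # fixed validation slice
                       .batch(batch_size, drop_remainder=True)
                       .prefetch(tf.data.AUTOTUNE))

    train_dataset = (base.skip(val_size)
                         .shuffle(buffer_size=10000)
                         .batch(batch_size, drop_remainder=True)
                         .prefetch(tf.data.AUTOTUNE))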
979
 
980
  # Training loop
981
  best_val_loss = float("inf")
 
995
  logger.info("Training progress bar disabled")
996
 
997
  for q_batch, p_batch, n_batch in train_dataset:
 
998
  loss = self.train_step(q_batch, p_batch, n_batch)
999
  epoch_loss_avg(loss)
1000
  batches_processed += 1
 
1016
  "lr": f"{current_lr:.2e}",
1017
  "batches": f"{batches_processed}/{steps_per_epoch}"
1018
  })
1019
+
1020
  # Memory cleanup
1021
  gc.collect()
1022
 
 
1039
  logger.info("Validation progress bar disabled")
1040
 
1041
  for q_batch, p_batch, n_batch in val_dataset:
 
1042
  val_loss = self.validation_step(q_batch, p_batch, n_batch)
1043
  val_loss_avg(val_loss)
1044
  val_batches_processed += 1
 
1049
  "val_loss": f"{val_loss.numpy():.4f}",
1050
  "batches": f"{val_batches_processed}/{val_steps}"
1051
  })
1052
+
1053
  # Memory cleanup
1054
  gc.collect()
1055
 
 
1056
  if val_batches_processed >= val_steps:
1057
  break
1058
 
 
1096
  logger.info("Early stopping triggered.")
1097
  break
1098
 
1099
+ logger.info("Training completed!")
1100
+
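The loop saves a checkpoint each epoch through tf.train.CheckpointManager (visible in the retained reference implementation below). A short sketch of restoring the latest one before inference or a resumed run, assuming the default checkpoint_dir of "checkpoints/":

    checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
    manager = tf.train.CheckpointManager(checkpoint, "checkpoints/", max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint).expect_partial()
        logger.info(f"Restored from {manager.latest_checkpoint}")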
 
1101
  @tf.function
1102
  def train_step(
1103
  self,
1104
  q_batch: tf.Tensor,
1105
  p_batch: tf.Tensor,
1106
+ n_batch: tf.Tensor
 
1107
  ) -> tf.Tensor:
1108
  """
1109
+ Single training step using queries, positives, and hard negatives.
 
 
1110
  """
1111
  with tf.GradientTape() as tape:
1112
  # Encode queries
 
1152
  )
1153
  loss = tf.reduce_mean(loss)
1154
 
1155
  # Apply gradients
1156
  gradients = tape.gradient(loss, self.encoder.trainable_variables)
1157
  self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
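The hunk above shows only the loss reduction and the gradient application; the scoring it reduces over is preserved verbatim in the commented-out reference implementation later in this file, restated here as a compact shape sketch:

    # q_enc: [bs, dim]   p_enc: [bs, dim]   n_enc: [bs, neg_samples, dim]
    combined_p_n = tf.concat([tf.expand_dims(p_enc, axis=1), n_enc], axis=1)  # [bs, 1 + neg, dim]
    dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)               # [bs, 1 + neg]
    labels = tf.zeros([tf.shape(q_enc)[0]], dtype=tf.int32)                   # positive is column 0
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=dot_products))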
 
1162
  self,
1163
  q_batch: tf.Tensor,
1164
  p_batch: tf.Tensor,
1165
+ n_batch: tf.Tensor
 
1166
  ) -> tf.Tensor:
1167
  """
1168
+ Single validation step using queries, positives, and hard negatives.
 
1169
  """
1170
  q_enc = self.encoder(q_batch, training=False)
1171
  p_enc = self.encoder(p_batch, training=False)
 
1192
  )
1193
  loss = tf.reduce_mean(loss)
1194
 
1195
  return loss
1196
+ # def train_streaming(
1197
+ # self,
1198
+ # dialogues: List[dict],
1199
+ # epochs: int = 20,
1200
+ # batch_size: int = 16,
1201
+ # validation_split: float = 0.2,
1202
+ # checkpoint_dir: str = "checkpoints/",
1203
+ # use_lr_schedule: bool = True,
1204
+ # peak_lr: float = 2e-5,
1205
+ # warmup_steps_ratio: float = 0.1,
1206
+ # early_stopping_patience: int = 3,
1207
+ # min_delta: float = 1e-4,
1208
+ # neg_samples: int = 1
1209
+ # ) -> None:
1210
+ # """Streaming training with tf.data pipeline."""
1211
+ # logger.info("Starting streaming training pipeline with tf.data...")
1212
+
1213
+ # # Initialize TFDataPipeline (replaces StreamingDataPipeline)
1214
+ # dataset_preparer = TFDataPipeline(
1215
+ # embedding_batch_size=self.config.embedding_batch_size,
1216
+ # tokenizer=self.tokenizer,
1217
+ # encoder=self.encoder,
1218
+ # index=self.index, # Pass CPU version of FAISS index
1219
+ # response_pool=self.response_pool,
1220
+ # max_length=self.config.max_context_token_limit,
1221
+ # neg_samples=neg_samples
1222
+ # )
1223
+
1224
+ # # Calculate total steps for learning rate schedule
1225
+ # total_pairs = dataset_preparer.estimate_total_pairs(dialogues)
1226
+ # train_size = int(total_pairs * (1 - validation_split))
1227
+ # val_size = int(total_pairs * validation_split)
1228
+ # steps_per_epoch = int(math.ceil(train_size / batch_size))
1229
+ # val_steps = int(math.ceil(val_size / batch_size))
1230
+ # total_steps = steps_per_epoch * epochs
1231
+
1232
+ # logger.info(f"Total pairs: {total_pairs}")
1233
+ # logger.info(f"Training pairs: {train_size}")
1234
+ # logger.info(f"Validation pairs: {val_size}")
1235
+ # logger.info(f"Steps per epoch: {steps_per_epoch}")
1236
+ # logger.info(f"Validation steps: {val_steps}")
1237
+ # logger.info(f"Total steps: {total_steps}")
1238
+
1239
+ # # Set up optimizer with learning rate schedule
1240
+ # if use_lr_schedule:
1241
+ # warmup_steps = int(total_steps * warmup_steps_ratio)
1242
+ # lr_schedule = self._get_lr_schedule(
1243
+ # total_steps=total_steps,
1244
+ # peak_lr=peak_lr,
1245
+ # warmup_steps=warmup_steps
1246
+ # )
1247
+ # self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
1248
+ # logger.info("Using custom learning rate schedule.")
1249
+ # else:
1250
+ # self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
1251
+ # logger.info("Using fixed learning rate.")
1252
+
1253
+ # # Initialize checkpoint manager
1254
+ # checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
1255
+ # manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
1256
+
1257
+ # # Setup TensorBoard
1258
+ # log_dir = Path(checkpoint_dir) / "tensorboard_logs"
1259
+ # log_dir.mkdir(parents=True, exist_ok=True)
1260
+ # current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
1261
+ # train_log_dir = str(log_dir / f"train_{current_time}")
1262
+ # val_log_dir = str(log_dir / f"val_{current_time}")
1263
+ # train_summary_writer = tf.summary.create_file_writer(train_log_dir)
1264
+ # val_summary_writer = tf.summary.create_file_writer(val_log_dir)
1265
+ # logger.info(f"TensorBoard logs will be saved in {log_dir}")
1266
+
1267
+ # # Create training and validation datasets
1268
+ # train_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).take(train_size)
1269
+ # val_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).skip(train_size).take(val_size)
1270
+
1271
+ # # Training loop
1272
+ # best_val_loss = float("inf")
1273
+ # epochs_no_improve = 0
1274
+
1275
+ # for epoch in range(1, epochs + 1):
1276
+ # # --- Training Phase ---
1277
+ # epoch_loss_avg = tf.keras.metrics.Mean()
1278
+ # batches_processed = 0
1279
+
1280
+ # try:
1281
+ # train_pbar = tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}", unit="batch")
1282
+ # is_tqdm_train = True
1283
+ # except ImportError:
1284
+ # train_pbar = None
1285
+ # is_tqdm_train = False
1286
+ # logger.info("Training progress bar disabled")
1287
+
1288
+ # for q_batch, p_batch, n_batch in train_dataset:
1289
+ # #p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
1290
+ # loss = self.train_step(q_batch, p_batch, n_batch)
1291
+ # epoch_loss_avg(loss)
1292
+ # batches_processed += 1
1293
+
1294
+ # # Log to TensorBoard
1295
+ # with train_summary_writer.as_default():
1296
+ # tf.summary.scalar("loss", loss, step=(epoch - 1) * steps_per_epoch + batches_processed)
1297
+
1298
+ # # Update progress bar
1299
+ # if use_lr_schedule:
1300
+ # current_lr = float(lr_schedule(self.optimizer.iterations))
1301
+ # else:
1302
+ # current_lr = float(self.optimizer.learning_rate.numpy())
1303
+
1304
+ # if is_tqdm_train:
1305
+ # train_pbar.update(1)
1306
+ # train_pbar.set_postfix({
1307
+ # "loss": f"{loss.numpy():.4f}",
1308
+ # "lr": f"{current_lr:.2e}",
1309
+ # "batches": f"{batches_processed}/{steps_per_epoch}"
1310
+ # })
1311
+
1312
+ # # Memory cleanup
1313
+ # gc.collect()
1314
+
1315
+ # if batches_processed >= steps_per_epoch:
1316
+ # break
1317
+
1318
+ # if is_tqdm_train and train_pbar:
1319
+ # train_pbar.close()
1320
+
1321
+ # # --- Validation Phase ---
1322
+ # val_loss_avg = tf.keras.metrics.Mean()
1323
+ # val_batches_processed = 0
1324
+
1325
+ # try:
1326
+ # val_pbar = tqdm(total=val_steps, desc="Validation", unit="batch")
1327
+ # is_tqdm_val = True
1328
+ # except ImportError:
1329
+ # val_pbar = None
1330
+ # is_tqdm_val = False
1331
+ # logger.info("Validation progress bar disabled")
1332
+
1333
+ # for q_batch, p_batch, n_batch in val_dataset:
1334
+ # #p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
1335
+ # val_loss = self.validation_step(q_batch, p_batch, n_batch)
1336
+ # val_loss_avg(val_loss)
1337
+ # val_batches_processed += 1
1338
+
1339
+ # if is_tqdm_val:
1340
+ # val_pbar.update(1)
1341
+ # val_pbar.set_postfix({
1342
+ # "val_loss": f"{val_loss.numpy():.4f}",
1343
+ # "batches": f"{val_batches_processed}/{val_steps}"
1344
+ # })
1345
+
1346
+ # # Memory cleanup
1347
+ # gc.collect()
1348
+
1349
+
1350
+ # if val_batches_processed >= val_steps:
1351
+ # break
1352
+
1353
+ # if is_tqdm_val and val_pbar:
1354
+ # val_pbar.close()
1355
+
1356
+ # # End of epoch: compute final epoch stats, log, and save checkpoint
1357
+ # train_loss = epoch_loss_avg.result().numpy()
1358
+ # val_loss = val_loss_avg.result().numpy()
1359
+ # logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
1360
+
1361
+ # # Log epoch metrics
1362
+ # with train_summary_writer.as_default():
1363
+ # tf.summary.scalar("epoch_loss", train_loss, step=epoch)
1364
+ # with val_summary_writer.as_default():
1365
+ # tf.summary.scalar("val_loss", val_loss, step=epoch)
1366
+
1367
+ # # Save checkpoint
1368
+ # manager.save()
1369
+
1370
+ # # Store metrics in history
1371
+ # self.history['train_loss'].append(train_loss)
1372
+ # self.history['val_loss'].append(val_loss)
1373
+
1374
+ # if use_lr_schedule:
1375
+ # current_lr = float(lr_schedule(self.optimizer.iterations))
1376
+ # else:
1377
+ # current_lr = float(self.optimizer.learning_rate.numpy())
1378
+
1379
+ # self.history.setdefault('learning_rate', []).append(current_lr)
1380
+
1381
+ # # Early stopping logic
1382
+ # if val_loss < best_val_loss - min_delta:
1383
+ # best_val_loss = val_loss
1384
+ # epochs_no_improve = 0
1385
+ # logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
1386
+ # else:
1387
+ # epochs_no_improve += 1
1388
+ # logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
1389
+ # if epochs_no_improve >= early_stopping_patience:
1390
+ # logger.info("Early stopping triggered.")
1391
+ # break
1392
+
1393
+ # logger.info("Streaming training completed!")
1394
+
1395
+
1396
+ # @tf.function
1397
+ # def train_step(
1398
+ # self,
1399
+ # q_batch: tf.Tensor,
1400
+ # p_batch: tf.Tensor,
1401
+ # n_batch: tf.Tensor,
1402
+ # attention_mask: Optional[tf.Tensor] = None
1403
+ # ) -> tf.Tensor:
1404
+ # """
1405
+ # Single training step that uses queries, positives, and negatives in a
1406
+ # contrastive/InfoNCE style. The label is always 0 (the positive) vs.
1407
+ # the negative alternatives.
1408
+ # """
1409
+ # with tf.GradientTape() as tape:
1410
+ # # Encode queries
1411
+ # q_enc = self.encoder(q_batch, training=True) # [batch_size, embed_dim]
1412
+
1413
+ # # Encode positives
1414
+ # p_enc = self.encoder(p_batch, training=True) # [batch_size, embed_dim]
1415
+
1416
+ # # Encode negatives
1417
+ # # n_batch: [batch_size, neg_samples, max_length]
1418
+ # shape = tf.shape(n_batch)
1419
+ # bs = shape[0]
1420
+ # neg_samples = shape[1]
1421
+
1422
+ # # Flatten negatives to feed them in one pass:
1423
+ # # => [batch_size * neg_samples, max_length]
1424
+ # n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
1425
+ # n_enc_flat = self.encoder(n_batch_flat, training=True) # [bs*neg_samples, embed_dim]
1426
+
1427
+ # # Reshape back => [batch_size, neg_samples, embed_dim]
1428
+ # n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])
1429
+
1430
+ # # Combine the positive embedding and negative embeddings along dim=1
1431
+ # # => shape [batch_size, 1 + neg_samples, embed_dim]
1432
+ # # The first column is the positive; subsequent columns are negatives
1433
+ # combined_p_n = tf.concat(
1434
+ # [tf.expand_dims(p_enc, axis=1), n_enc],
1435
+ # axis=1
1436
+ # ) # [bs, (1+neg_samples), embed_dim]
1437
+
1438
+ # # Now compute scores: dot product of q_enc with each column in combined_p_n
1439
+ # # We'll use `tf.einsum` to handle the batch dimension properly
1440
+ # # dot_products => shape [batch_size, (1+neg_samples)]
1441
+ # dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
1442
+
1443
+ # # The label for each row is 0 (the first column is the correct/positive)
1444
+ # labels = tf.zeros([bs], dtype=tf.int32)
1445
+
1446
+ # # Cross-entropy over the [batch_size, 1+neg_samples] scores
1447
+ # loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
1448
+ # labels=labels,
1449
+ # logits=dot_products
1450
+ # )
1451
+ # loss = tf.reduce_mean(loss)
1452
+
1453
+ # # If there's an attention_mask you want to apply (less common in this scenario),
1454
+ # # you could do something like:
1455
+ # if attention_mask is not None:
1456
+ # loss = loss * attention_mask
1457
+ # loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
1458
+
1459
+ # # Apply gradients
1460
+ # gradients = tape.gradient(loss, self.encoder.trainable_variables)
1461
+ # self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
1462
+ # return loss
1463
+
1464
+ # @tf.function
1465
+ # def validation_step(
1466
+ # self,
1467
+ # q_batch: tf.Tensor,
1468
+ # p_batch: tf.Tensor,
1469
+ # n_batch: tf.Tensor,
1470
+ # attention_mask: Optional[tf.Tensor] = None
1471
+ # ) -> tf.Tensor:
1472
+ # """
1473
+ # Single validation step with queries, positives, and negatives.
1474
+ # Uses the same loss calculation as train_step, but `training=False`.
1475
+ # """
1476
+ # q_enc = self.encoder(q_batch, training=False)
1477
+ # p_enc = self.encoder(p_batch, training=False)
1478
+
1479
+ # shape = tf.shape(n_batch)
1480
+ # bs = shape[0]
1481
+ # neg_samples = shape[1]
1482
+
1483
+ # n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
1484
+ # n_enc_flat = self.encoder(n_batch_flat, training=False)
1485
+ # n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])
1486
+
1487
+ # combined_p_n = tf.concat(
1488
+ # [tf.expand_dims(p_enc, axis=1), n_enc],
1489
+ # axis=1
1490
+ # )
1491
+
1492
+ # dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
1493
+ # labels = tf.zeros([bs], dtype=tf.int32)
1494
+
1495
+ # loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
1496
+ # labels=labels,
1497
+ # logits=dot_products
1498
+ # )
1499
+ # loss = tf.reduce_mean(loss)
1500
+
1501
+ # if attention_mask is not None:
1502
+ # loss = loss * attention_mask
1503
+ # loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
1504
+
1505
+ # return loss
1506
 
1507
  def _get_lr_schedule(
1508
  self,
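The body of _get_lr_schedule is not included in this diff. For orientation only, a linear-warmup / cosine-decay schedule compatible with the total_steps, peak_lr, and warmup_steps arguments used in train_streaming might look like the sketch below; this is an assumption about its general shape, not the project's actual implementation:

    import math
    import tensorflow as tf

    class WarmupCosineSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        """Linear warmup to peak_lr, then cosine decay toward zero (illustrative)."""
        def __init__(self, total_steps: int, peak_lr: float, warmup_steps: int):
            self.total_steps = total_steps
            self.peak_lr = peak_lr
            self.warmup_steps = max(warmup_steps, 1)

        def __call__(self, step):
            step = tf.cast(step, tf.float32)
            warmup_lr = self.peak_lr * step / float(self.warmup_steps)
            progress = (step - self.warmup_steps) / max(self.total_steps - self.warmup_steps, 1)
            cosine_lr = self.peak_lr * 0.5 * (1.0 + tf.cos(math.pi * tf.minimum(progress, 1.0)))
            return tf.where(step < self.warmup_steps, warmup_lr, cosine_lr)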
run_model_train.py CHANGED
@@ -39,7 +39,7 @@ def main():
39
 
40
  DEBUG_SAMPLES = 5
41
  EPOCHS = 5 if DEBUG_SAMPLES else 20
42
- TRAINING_DATA_PATH = 'processed_outputs/batch_group_0010.json'
43
 
44
  # Optimize batch size for Colab
45
  batch_size = env.optimize_batch_size(base_batch_size=16)
@@ -49,20 +49,21 @@ def main():
49
  embedding_dim=768, # DistilBERT
50
  max_context_token_limit=512,
51
  freeze_embeddings=False,
 
52
  )
53
 
54
- # Load training data
55
- dialogues = RetrievalChatbot.load_training_data(data_path=TRAINING_DATA_PATH, debug_samples=DEBUG_SAMPLES)
56
- print(dialogues)
57
 
58
  # Initialize chatbot and verify FAISS index
59
  #with env.strategy.scope():
60
- chatbot = RetrievalChatbot(config, dialogues)
61
  chatbot.build_models()
62
  chatbot.verify_faiss_index()
63
 
64
  chatbot.train_streaming(
65
- dialogues=dialogues,
66
  epochs=EPOCHS,
67
  batch_size=batch_size,
68
  use_lr_schedule=True,
 
39
 
40
  DEBUG_SAMPLES = 5
41
  EPOCHS = 5 if DEBUG_SAMPLES else 20
42
+ TF_RECORD_FILE_PATH = 'training_data/training_data.tfrecord'
43
 
44
  # Optimize batch size for Colab
45
  batch_size = env.optimize_batch_size(base_batch_size=16)
 
49
  embedding_dim=768, # DistilBERT
50
  max_context_token_limit=512,
51
  freeze_embeddings=False,
52
+ neg_samples=3,
53
  )
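neg_samples=3 has to agree with the number of hard negatives that were serialized per record when training_data.tfrecord was generated, since parse_tfrecord_fn reshapes negative_ids to [neg_samples, max_length]. An optional sanity check before a long run (a sketch; it assumes the TFRecord already exists at TF_RECORD_FILE_PATH):

    raw = tf.data.TFRecordDataset(TF_RECORD_FILE_PATH)
    q, p, n = next(iter(raw.map(
        lambda x: RetrievalChatbot.parse_tfrecord_fn(
            x, config.max_context_token_limit, config.neg_samples))))
    assert n.shape == (config.neg_samples, config.max_context_token_limit), \
        "config neg_samples/max_context_token_limit do not match the TFRecord contents"

If they disagree, the parse itself will usually fail first with a FixedLenFeature length error, which is the same signal.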
54
 
55
+ # # Load training data
56
+ # dialogues = RetrievalChatbot.load_training_data(data_path=TRAINING_DATA_PATH, debug_samples=DEBUG_SAMPLES)
57
+ # print(dialogues)
58
 
59
  # Initialize chatbot and verify FAISS index
60
  #with env.strategy.scope():
61
+ chatbot = RetrievalChatbot(config)
62
  chatbot.build_models()
63
  chatbot.verify_faiss_index()
64
 
65
  chatbot.train_streaming(
66
+ tfrecord_file_path=TF_RECORD_FILE_PATH,
67
  epochs=EPOCHS,
68
  batch_size=batch_size,
69
  use_lr_schedule=True,
tf_data_pipeline.py CHANGED
@@ -689,7 +689,31 @@ class TFDataPipeline:
689
 
690
  return q_ids, p_ids, n_ids
691
 
692
-
 
 
 
693
 
694
  # def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
695
  # """Find hard negatives for a batch of queries with error handling and retries."""
 
689
 
690
  return q_ids, p_ids, n_ids
691
 
692
+ # def parse_tfrecord_fn(example_proto, max_length, neg_samples):
693
+ # """
694
+ # Parses a single TFRecord example.
695
+
696
+ # Args:
697
+ # example_proto: A serialized TFRecord example.
698
+ # max_length: The maximum sequence length for tokenization.
699
+ # neg_samples: The number of hard negatives per query.
700
+
701
+ # Returns:
702
+ # A tuple of (query_ids, positive_ids, negative_ids).
703
+ # """
704
+ # feature_description = {
705
+ # 'query_ids': tf.io.FixedLenFeature([max_length], tf.int64),
706
+ # 'positive_ids': tf.io.FixedLenFeature([max_length], tf.int64),
707
+ # 'negative_ids': tf.io.FixedLenFeature([neg_samples * max_length], tf.int64),
708
+ # }
709
+ # parsed_features = tf.io.parse_single_example(example_proto, feature_description)
710
+
711
+ # query_ids = tf.cast(parsed_features['query_ids'], tf.int32)
712
+ # positive_ids = tf.cast(parsed_features['positive_ids'], tf.int32)
713
+ # negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
714
+ # negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
715
+
716
+ # return query_ids, positive_ids, negative_ids
717
 
718
  # def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
719
  # """Find hard negatives for a batch of queries with error handling and retries."""