JoeArmani committed
Commit d53c64b · 1 Parent(s): fc5f33b

fix checkpointing restoration

Files changed (2)
  1. chatbot_model.py +171 -137
  2. train_model.py +14 -10
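
The diff below reworks how the training loop builds, saves, and restores its tf.train.Checkpoint. For orientation, here is a minimal sketch of the Checkpoint / CheckpointManager resume pattern the commit moves toward; the model, optimizer, and "checkpoints" directory are illustrative stand-ins, not the repo's actual objects.

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
    optimizer = tf.keras.optimizers.Adam(1e-3)

    # Track the model, the optimizer, and an epoch counter in one checkpoint object.
    checkpoint = tf.train.Checkpoint(
        epoch=tf.Variable(0, dtype=tf.int32),
        optimizer=optimizer,
        model=model,
    )
    manager = tf.train.CheckpointManager(
        checkpoint, directory="checkpoints", max_to_keep=3, checkpoint_name="ckpt"
    )

    if manager.latest_checkpoint:
        # Checkpoints are named ckpt-<N>; the commit reuses <N> as the epoch to resume from.
        checkpoint.restore(manager.latest_checkpoint)
        start_epoch = int(manager.latest_checkpoint.split("ckpt-")[-1])
    else:
        start_epoch = 0

    for epoch in range(start_epoch + 1, 4):
        # ... one epoch of training would go here ...
        checkpoint.epoch.assign(epoch)
        manager.save()  # writes checkpoints/ckpt-1, ckpt-2, ...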
chatbot_model.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import numpy as np
 from transformers import TFAutoModel, AutoTokenizer
 import tensorflow as tf
 from typing import List, Tuple, Dict, Optional, Union, Any
@@ -74,7 +75,8 @@ class EncoderModel(tf.keras.Model):
         self.projection = tf.keras.layers.Dense(
             config.embedding_dim,
             activation='tanh',
-            name="projection"
+            name="projection",
+            dtype=tf.float32
         )
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
         self.normalize = tf.keras.layers.Lambda(
@@ -613,20 +615,20 @@ class RetrievalChatbot(DeviceAwareModel):
         test_mode: bool = False,
         initial_epoch: int = 0
     ) -> None:
-        """Training using a pre-prepared TFRecord dataset."""
+        """
+        Train the retrieval model using a pre-prepared TFRecord dataset.
+        This method handles:
+          - Checkpoint loading/restoring
+          - LR scheduling
+          - Epoch/iteration tracking
+          - Optional training-history logging
+          - Basic early stopping
+        """
         logger.info("Starting training with pre-prepared TFRecord dataset...")
-
+
         def parse_tfrecord_fn(example_proto, max_length, neg_samples):
             """
             Parses a single TFRecord example.
-
-            Args:
-                example_proto: A serialized TFRecord example.
-                max_length: The maximum sequence length for tokenization.
-                neg_samples: The number of hard negatives per query.
-
-            Returns:
-                A tuple of (query_ids, positive_ids, negative_ids).
             """
             feature_description = {
                 'query_ids': tf.io.FixedLenFeature([max_length], tf.int64),
@@ -640,9 +642,9 @@ class RetrievalChatbot(DeviceAwareModel):
             negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
             negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
 
-            return query_ids, positive_ids, negative_ids
-
-        # Calculate total steps by counting the number of records in the TFRecord
+            return query_ids, positive_ids, negative_ids
+
+        # Count total records in TFRecord
         raw_dataset = tf.data.TFRecordDataset(tfrecord_file_path)
         total_pairs = sum(1 for _ in raw_dataset)
         logger.info(f"Total pairs in TFRecord: {total_pairs}")
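
The parser above assumes each serialized example stores fixed-length int64 features: query_ids and positive_ids of length max_length, plus negative_ids flattened from [neg_samples, max_length]. Below is a hedged sketch of a writer that produces records in that layout; the token values, sizes, and "pairs.tfrecord" filename are made up for illustration.

    import tensorflow as tf

    max_length, neg_samples = 4, 2  # illustrative sizes, not the repo's config values

    def int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    example = tf.train.Example(features=tf.train.Features(feature={
        'query_ids': int64_feature([101, 7, 8, 102]),
        'positive_ids': int64_feature([101, 9, 10, 102]),
        # negatives are stored flattened; the parser reshapes them to [neg_samples, max_length]
        'negative_ids': int64_feature([101, 11, 12, 102, 101, 13, 14, 102]),
    }))

    with tf.io.TFRecordWriter("pairs.tfrecord") as writer:
        writer.write(example.SerializeToString())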
@@ -652,7 +654,7 @@ class RetrievalChatbot(DeviceAwareModel):
         steps_per_epoch = math.ceil(train_size / batch_size)
         val_steps = math.ceil(val_size / batch_size)
         total_steps = steps_per_epoch * epochs
-        buffer_size = total_pairs // 10  # 10% of the dataset
+        buffer_size = max(1, total_pairs // 10)  # 10% of the dataset
 
         logger.info(f"Training pairs: {train_size}")
         logger.info(f"Validation pairs: {val_size}")
@@ -660,61 +662,104 @@ class RetrievalChatbot(DeviceAwareModel):
         logger.info(f"Validation steps: {val_steps}")
         logger.info(f"Total steps: {total_steps}")
 
-        # Set up optimizer with learning rate schedule
+        # Set up optimizer & LR schedule
         if use_lr_schedule:
             warmup_steps = int(total_steps * warmup_steps_ratio)
             lr_schedule = self._get_lr_schedule(
                 total_steps=total_steps,
-                peak_lr=peak_lr,
+                peak_lr=tf.cast(peak_lr, tf.float32),
                 warmup_steps=warmup_steps
             )
             self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
             logger.info("Using custom learning rate schedule.")
         else:
-            self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
+            self.optimizer = tf.keras.optimizers.Adam(learning_rate=tf.cast(peak_lr, tf.float32))
             logger.info("Using fixed learning rate.")
 
-        # Initialize checkpoint manager
+        # Initialize optimizer with dummy step
+        dummy_input = tf.zeros((1, self.config.max_context_token_limit), dtype=tf.int32)
+        with tf.GradientTape() as tape:
+            dummy_output = self.encoder(dummy_input)
+            dummy_loss = tf.cast(tf.reduce_mean(dummy_output), tf.float32)
+        dummy_grads = tape.gradient(dummy_loss, self.encoder.trainable_variables)
+        self.optimizer.apply_gradients(zip(dummy_grads, self.encoder.trainable_variables))
+
+        # Create checkpoint and manager
         checkpoint = tf.train.Checkpoint(
             epoch=tf.Variable(0, dtype=tf.int32),
             optimizer=self.optimizer,
-            optimizer_iterations=self.optimizer.iterations,
-            model=self.encoder,
-            variables=self.encoder.variables
+            model=self.encoder
+        )
+
+        manager = tf.train.CheckpointManager(
+            checkpoint,
+            directory=checkpoint_dir,
+            max_to_keep=3,
+            checkpoint_name='ckpt'
         )
-        manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3, checkpoint_name='ckpt')
 
-        # Restore from checkpoint if available
+        # Restore from existing checkpoint if present
         latest_checkpoint = manager.latest_checkpoint
-        #history_path = Path(checkpoint_dir) / 'training_history.json'
-        if latest_checkpoint:
-            # if history_path.exists():
-            #     try:
-            #         with open(history_path, 'r') as f:
-            #             self.history = json.load(f)
-            #         logger.info(f"Loaded previous training history from {history_path}")
-            #     except Exception as e:
-            #         logger.warning(f"Could not load history, starting fresh: {e}")
-            #         self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
-            # else:
-            #     self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
+        history_path = Path(checkpoint_dir) / 'training_history.json'
+
+        # If you want to log all epoch losses across runs
+        if not hasattr(self, 'history'):
+            self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
+
+        if latest_checkpoint and not test_mode:
+            # Debug info before restore
+            logger.info("\nEncoder Variables:")
+            for var in self.encoder.variables:
+                logger.info(f"{var.name}: {var.dtype} - Shape: {var.shape}")
 
+            logger.info("\nOptimizer Variables:")
+            for var in self.optimizer.variables:
+                logger.info(f"{var.name}: {var.dtype} - Shape: {var.shape}")
+
+            # Add checkpoint inspection
+            logger.info(f"\nTrying to load checkpoint from: {latest_checkpoint}")
+            reader = tf.train.load_checkpoint(latest_checkpoint)
+            shape_from_key = reader.get_variable_to_shape_map()
+            dtype_from_key = reader.get_variable_to_dtype_map()
+            logger.info("\nCheckpoint Variables:")
+            for key in shape_from_key:
+                logger.info(f"{key}: dtype={dtype_from_key[key]} - Shape: {shape_from_key[key]}")
+
             status = checkpoint.restore(latest_checkpoint)
-            status.expect_partial()
+            status.assert_consumed()
             logger.info(f"Restored from checkpoint: {latest_checkpoint}")
+            logger.info(f"Optimizer iterations after restore: {self.optimizer.iterations.numpy()}")
 
-            # Get the checkpoint number to validate initial_epoch
+            # Verify learning rate after restore
+            if use_lr_schedule:
+                current_lr = float(lr_schedule(self.optimizer.iterations))
+            else:
+                current_lr = float(self.optimizer.learning_rate.numpy())
+            logger.info(f"Current learning rate after restore: {current_lr:.2e}")
+
+            # Derive initial_epoch from checkpoint name if not passed in
             ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
             if initial_epoch == 0:
                 initial_epoch = ckpt_number
-
-            checkpoint.epoch.assign(initial_epoch)
+
+            # Assign to checkpoint.epoch so we keep counting from that
+            checkpoint.epoch.assign(tf.cast(initial_epoch, tf.int32))
             logger.info(f"Resuming from epoch {initial_epoch}")
+
+            # If you want to load old history from file:
+            if history_path.exists():
+                try:
+                    with open(history_path, 'r') as f:
+                        self.history = json.load(f)
+                    logger.info(f"Loaded previous training history from {history_path}")
+                except Exception as e:
+                    logger.warning(f"Could not load history, starting fresh: {e}")
         else:
             logger.info("Starting training from scratch")
+            checkpoint.epoch.assign(tf.cast(0, tf.int32))
             initial_epoch = 0
-
-        # Setup TensorBoard
+
+        # Set up TensorBoard
         log_dir = Path(checkpoint_dir) / "tensorboard_logs"
         log_dir.mkdir(parents=True, exist_ok=True)
         current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
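
Context for the dummy forward/backward pass added in the hunk above: Adam creates its per-variable slot variables (m, v) only on the first apply_gradients() call, so a freshly built optimizer cannot absorb a checkpoint that contains them and assert_consumed() raises. A minimal sketch of the work-around under that assumption; the small Dense model here is a hypothetical stand-in for the encoder.

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])
    optimizer = tf.keras.optimizers.Adam(1e-3)

    # One throwaway step forces Adam to create slot variables for every
    # trainable variable, so a later restore can match all saved values.
    with tf.GradientTape() as tape:
        dummy_loss = tf.reduce_mean(model(tf.zeros((1, 3))))
    grads = tape.gradient(dummy_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    ckpt_path = tf.train.latest_checkpoint("checkpoints")  # illustrative directory
    if ckpt_path:
        checkpoint.restore(ckpt_path).assert_consumed()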
@@ -724,18 +769,15 @@ class RetrievalChatbot(DeviceAwareModel):
         val_summary_writer = tf.summary.create_file_writer(val_log_dir)
         logger.info(f"TensorBoard logs will be saved in {log_dir}")
 
-        # Define the parsing function with the appropriate max_length and neg_samples
-        parse_fn = lambda x: parse_tfrecord_fn(x, self.config.max_context_token_limit, self.config.neg_samples)
-
-        # Create the full dataset
+        # Parse dataset
         dataset = tf.data.TFRecordDataset(tfrecord_file_path)
-
-        # Test mode for debugging
+
+        # Optional: test/debug mode with small subset
         if test_mode:
-            subset_size = 200
+            subset_size = 150
             dataset = dataset.take(subset_size)
             logger.info(f"TEST MODE: Using only {subset_size} examples")
-            # Recalculate sizes
+            # Recompute sizes, steps, epochs, etc., as needed
             total_pairs = subset_size
             train_size = int(total_pairs * (1 - validation_split))
             val_size = total_pairs - train_size
@@ -743,22 +785,23 @@ class RetrievalChatbot(DeviceAwareModel):
             steps_per_epoch = math.ceil(train_size / batch_size)
             val_steps = math.ceil(val_size / batch_size)
             total_steps = steps_per_epoch * epochs
-            buffer_size = total_pairs // 10  # 10% of the dataset
-            epochs = min(epochs, 5)  # Limit epochs in test mode
+            buffer_size = max(1, total_pairs // 10)
+            epochs = min(epochs, 5)  # For quick debug
             early_stopping_patience = 2
             logger.info(f"New training pairs: {train_size}")
             logger.info(f"New validation pairs: {val_size}")
-
-        dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
-
-        # Split into training and validation sets
+
+        dataset = dataset.map(
+            lambda x: parse_tfrecord_fn(x, self.config.max_context_token_limit, self.config.neg_samples),
+            num_parallel_calls=tf.data.AUTOTUNE
+        )
+
+        # Train/val split
         train_dataset = dataset.take(train_size)
         val_dataset = dataset.skip(train_size).take(val_size)
 
-        # Shuffle the training data
+        # Shuffle and batch
         train_dataset = train_dataset.shuffle(buffer_size=buffer_size)
-
-        # Batch both datasets
         train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
         train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
 
@@ -773,41 +816,34 @@ class RetrievalChatbot(DeviceAwareModel):
         for epoch in range(int(checkpoint.epoch.numpy()) + 1, epochs + 1):
             checkpoint.epoch.assign(epoch)
             logger.info(f"Starting Epoch {epoch}...")
+
             # --- Training Phase ---
-            epoch_loss_avg = tf.keras.metrics.Mean()
+            epoch_loss_avg = tf.keras.metrics.Mean(dtype=tf.float32)
             batches_processed = 0
 
+            # Progress bar
            try:
-                train_pbar = tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}", unit="batch")
+                train_pbar = tqdm(
+                    total=steps_per_epoch,
+                    desc=f"Training Epoch {epoch}",
+                    unit="batch"
+                )
                is_tqdm_train = True
            except ImportError:
                train_pbar = None
                is_tqdm_train = False
-                logger.info("Training progress bar disabled")
 
            for q_batch, p_batch, n_batch in train_dataset:
                loss, grad_norm, post_clip_norm = self.train_step(q_batch, p_batch, n_batch)
-
-                # Check for gradient issues
-                grad_norm_value = float(grad_norm.numpy())
-                post_clip_value = float(post_clip_norm.numpy())
-                if grad_norm_value < 1e-7:
-                    logger.warning(f"Potential vanishing gradient detected: norm = {grad_norm_value:.2e}")
-                elif grad_norm_value > 100:
-                    logger.warning(f"Potential exploding gradient detected: norm = {grad_norm_value:.2e}")
-
-                # if grad_norm_value != post_clip_value:
-                #     logger.info(f"Gradient clipped: {grad_norm_value:.2e} -> {post_clip_value:.2e}")
-
                epoch_loss_avg(loss)
                batches_processed += 1
 
                # Log to TensorBoard
                with train_summary_writer.as_default():
                    step = (epoch - 1) * steps_per_epoch + batches_processed
-                    tf.summary.scalar("loss", loss, step=step)
-                    tf.summary.scalar("gradient_norm_pre_clip", grad_norm, step=step)
-                    tf.summary.scalar("gradient_norm_post_clip", post_clip_norm, step=step)
+                    tf.summary.scalar("loss", tf.cast(loss, tf.float32), step=step)
+                    tf.summary.scalar("gradient_norm_pre_clip", tf.cast(grad_norm, tf.float32), step=step)
+                    tf.summary.scalar("gradient_norm_post_clip", tf.cast(post_clip_norm, tf.float32), step=step)
 
                # Update progress bar
                if use_lr_schedule:
@@ -819,15 +855,15 @@ class RetrievalChatbot(DeviceAwareModel):
                    train_pbar.update(1)
                    train_pbar.set_postfix({
                        "loss": f"{loss.numpy():.4f}",
-                        "pre_clip": f"{grad_norm_value:.2e}",
-                        "post_clip": f"{post_clip_value:.2e}",
+                        "pre_clip": f"{grad_norm.numpy():.2e}",
+                        "post_clip": f"{post_clip_norm.numpy():.2e}",
                        "lr": f"{current_lr:.2e}",
                        "batches": f"{batches_processed}/{steps_per_epoch}"
                    })
 
-                # Memory cleanup
                gc.collect()
 
+                # End the epoch early if we've processed all steps
                if batches_processed >= steps_per_epoch:
                    break
 
@@ -835,7 +871,7 @@ class RetrievalChatbot(DeviceAwareModel):
                train_pbar.close()
 
            # --- Validation Phase ---
-            val_loss_avg = tf.keras.metrics.Mean()
+            val_loss_avg = tf.keras.metrics.Mean(dtype=tf.float32)
            val_batches_processed = 0
 
            try:
@@ -844,16 +880,16 @@ class RetrievalChatbot(DeviceAwareModel):
            except ImportError:
                val_pbar = None
                is_tqdm_val = False
-                logger.info("Validation progress bar disabled")
 
-            last_valid_val_loss = None  # Initialize outside the loop
+            last_valid_val_loss = None
            valid_batches = False
-
+
            for q_batch, p_batch, n_batch in val_dataset:
-                if tf.shape(q_batch)[0] < 2:
-                    logger.warning(f"Skipping validation batch of size {tf.shape(q_batch)[0]} (too small for loss calculation)")
+                # If batch is too small, skip
+                if tf.shape(q_batch)[0] < 2:
+                    logger.warning(f"Skipping validation batch of size {tf.shape(q_batch)[0]}")
                    continue
-
+
                valid_batches = True
                val_loss = self.validation_step(q_batch, p_batch, n_batch)
                val_loss_avg(val_loss)
@@ -867,32 +903,30 @@ class RetrievalChatbot(DeviceAwareModel):
                        "batches": f"{val_batches_processed}/{val_steps}"
                    })
 
-                # Memory cleanup
                gc.collect()
 
                if val_batches_processed >= val_steps:
                    break
-
+
            if not valid_batches:
-                logger.warning("No valid validation batches in this epoch, using last known validation loss")
+                # If no valid batch is found, fallback
+                logger.warning("No valid validation batches in this epoch")
                if last_valid_val_loss is not None:
                    val_loss = last_valid_val_loss
                    val_loss_avg(val_loss)
                else:
-                    # If we've never had a valid batch (first epoch), use training loss as fallback
-                    logger.warning("No previous validation loss available, using training loss as fallback")
-                    val_loss = train_loss
+                    val_loss = epoch_loss_avg.result()
                    val_loss_avg(val_loss)
-
+
            if is_tqdm_val and val_pbar:
                val_pbar.close()
 
-            # End of epoch: compute final epoch stats, log, and save checkpoint
+            # End of epoch: final stats
            train_loss = epoch_loss_avg.result().numpy()
            val_loss = val_loss_avg.result().numpy()
            logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
 
-            # Log epoch metrics
+            # TensorBoard epoch logs
            with train_summary_writer.as_default():
                tf.summary.scalar("epoch_loss", train_loss, step=epoch)
            with val_summary_writer.as_default():
@@ -900,31 +934,38 @@ class RetrievalChatbot(DeviceAwareModel):
 
            # Save checkpoint
            manager.save()
-
-            # Save model after each epoch for testing/inference
+
+            # (Optional) Save model for quick testing/inference
            model_save_path = Path(checkpoint_dir) / f"model_epoch_{epoch}"
            self.save_models(model_save_path)
            logger.info(f"Saved model for epoch {epoch} at {model_save_path}")
 
-            # Store metrics in history
+            # Update local history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
-
-            if use_lr_schedule:
-                current_lr = float(lr_schedule(self.optimizer.iterations))
-            else:
-                current_lr = float(self.optimizer.learning_rate.numpy())
-
-            # Log learning rate
            self.history.setdefault('learning_rate', []).append(current_lr)
-
-            # Save history to file
-            #if history_path.exists():
-            #    with open(history_path, 'w') as f:
-            #        json.dump(self.history, f)
-            #    logger.info(f"Saved training history to {history_path}")
 
-            # Early stopping logic
+            def convert_to_py_floats(obj):
+                if isinstance(obj, dict):
+                    return {k: convert_to_py_floats(v) for k, v in obj.items()}
+                elif isinstance(obj, list):
+                    return [convert_to_py_floats(x) for x in obj]
+                elif isinstance(obj, (np.float32, np.float64)):
+                    return float(obj)
+                elif tf.is_tensor(obj):
+                    return float(obj.numpy())
+                else:
+                    return obj
+
+            json_history = convert_to_py_floats(self.history)
+
+            # Save training history to file every epoch
+            # (Create or overwrite the file so we always have the latest.)
+            with open(history_path, 'w') as f:
+                json.dump(json_history, f)
+            logger.info(f"Saved training history to {history_path}")
+
+            # Early stopping
            if val_loss < best_val_loss - min_delta:
                best_val_loss = val_loss
                epochs_no_improve = 0
@@ -980,26 +1021,20 @@ class RetrievalChatbot(DeviceAwareModel):
            # Now compute scores: dot product of q_enc with each column in combined_p_n
            # We'll use `tf.einsum` to handle the batch dimension properly
            # dot_products => shape [batch_size, (1+neg_samples)]
-            dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
-
-            # The label for each row is 0 (the first column is the correct/positive)
-            labels = tf.zeros([bs], dtype=tf.int32)
-
-            # Cross-entropy over the [batch_size, 1+neg_samples] scores
+            dot_products = tf.cast(tf.einsum('bd,bkd->bk', q_enc, combined_p_n), tf.float32)
+            labels = tf.zeros([bs], dtype=tf.int32)  # Keep labels as int32
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels,
                logits=dot_products
            )
-            loss = tf.reduce_mean(loss)
+            loss = tf.cast(tf.reduce_mean(loss), tf.float32)
 
        # Calculate gradients
        gradients = tape.gradient(loss, self.encoder.trainable_variables)
-        gradients_norm = tf.linalg.global_norm(gradients)
-
-        # Clip gradients if norm exceeds threshold
-        max_grad_norm = 1.5
+        gradients_norm = tf.cast(tf.linalg.global_norm(gradients), tf.float32)
+        max_grad_norm = tf.constant(1.5, dtype=tf.float32)
        gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm, gradients_norm)
-        post_clip_norm = tf.linalg.global_norm(gradients)
+        post_clip_norm = tf.cast(tf.linalg.global_norm(gradients), tf.float32)
 
        # Apply gradients
        self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
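
The loss in train_step treats each query's positive as class 0 among 1 + neg_samples candidate responses scored by dot product. A small self-contained sketch of that objective with made-up shapes and random encodings:

    import tensorflow as tf

    batch, dim, neg = 2, 3, 2
    q_enc = tf.random.normal((batch, dim))
    pos_enc = tf.random.normal((batch, dim))
    neg_enc = tf.random.normal((batch, neg, dim))

    # Column 0 holds the positive, columns 1..neg hold the hard negatives.
    combined = tf.concat([tf.expand_dims(pos_enc, 1), neg_enc], axis=1)  # [batch, 1+neg, dim]
    logits = tf.einsum('bd,bkd->bk', q_enc, combined)                    # [batch, 1+neg]
    labels = tf.zeros([batch], dtype=tf.int32)                           # positive is always index 0
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    )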
@@ -1032,14 +1067,14 @@ class RetrievalChatbot(DeviceAwareModel):
            axis=1
        )
 
-        dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
-        labels = tf.zeros([bs], dtype=tf.int32)
-
+        dot_products = tf.cast(tf.einsum('bd,bkd->bk', q_enc, combined_p_n), tf.float32)
+        labels = tf.zeros([bs], dtype=tf.int32)  # Keep labels as int32
+
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels,
            logits=dot_products
        )
-        loss = tf.reduce_mean(loss)
+        loss = tf.cast(tf.reduce_mean(loss), tf.float32)
 
        return loss
 
@@ -1066,8 +1101,8 @@ class RetrievalChatbot(DeviceAwareModel):
        self.warmup_steps = tf.cast(adjusted_warmup_steps, tf.float32)
 
        # Calculate and store constants
-        self.initial_lr = self.peak_lr * 0.1   # Start at 10% of peak
-        self.min_lr = self.peak_lr * 0.01      # Minimum 1% of peak
+        self.initial_lr = tf.cast(self.peak_lr * 0.1, tf.float32)
+        self.min_lr = tf.cast(self.peak_lr * 0.01, tf.float32)
 
        logger.info(f"Learning rate schedule initialized:")
        logger.info(f"  Initial LR: {float(self.initial_lr):.6f}")
@@ -1080,15 +1115,14 @@ class RetrievalChatbot(DeviceAwareModel):
        step = tf.cast(step, tf.float32)
 
        # Warmup phase
-        warmup_factor = tf.minimum(1.0, step / self.warmup_steps)
+        warmup_factor = tf.cast(tf.minimum(1.0, step / self.warmup_steps), tf.float32)
        warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor
 
        # Decay phase
-        decay_steps = tf.maximum(1.0, self.total_steps - self.warmup_steps)
-        decay_factor = (step - self.warmup_steps) / decay_steps
-        decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0)  # Clip to [0,1]
-
-        cosine_decay = 0.5 * (1.0 + tf.cos(tf.constant(math.pi) * decay_factor))
+        decay_steps = tf.cast(tf.maximum(1.0, self.total_steps - self.warmup_steps), tf.float32)
+        decay_factor = tf.cast((step - self.warmup_steps) / decay_steps, tf.float32)
+        decay_factor = tf.cast(tf.minimum(tf.maximum(0.0, decay_factor), 1.0), tf.float32)
+        cosine_decay = tf.cast(0.5 * (1.0 + tf.cos(tf.constant(math.pi, dtype=tf.float32) * decay_factor)), tf.float32)
        decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
 
        # Choose between warmup and decay
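
The _get_lr_schedule changes above keep the warmup and cosine-decay math in float32. Assembled as a standalone schedule, the logic looks roughly like the sketch below; the class name and the 1000 / 100 / 2e-5 numbers are illustrative, not values taken from the repo.

    import math
    import tensorflow as tf

    class WarmupCosineSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        def __init__(self, total_steps, warmup_steps, peak_lr):
            super().__init__()
            self.total_steps = tf.cast(total_steps, tf.float32)
            self.warmup_steps = tf.cast(warmup_steps, tf.float32)
            self.peak_lr = tf.cast(peak_lr, tf.float32)
            self.initial_lr = self.peak_lr * 0.1   # start at 10% of peak
            self.min_lr = self.peak_lr * 0.01      # floor at 1% of peak

        def __call__(self, step):
            step = tf.cast(step, tf.float32)
            # Linear warmup toward peak_lr
            warmup_factor = tf.minimum(1.0, step / self.warmup_steps)
            warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor
            # Cosine decay from peak_lr down to min_lr
            decay_steps = tf.maximum(1.0, self.total_steps - self.warmup_steps)
            decay_factor = tf.clip_by_value((step - self.warmup_steps) / decay_steps, 0.0, 1.0)
            cosine_decay = 0.5 * (1.0 + tf.cos(math.pi * decay_factor))
            decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
            return tf.where(step < self.warmup_steps, warmup_lr, decay_lr)

    schedule = WarmupCosineSchedule(total_steps=1000, warmup_steps=100, peak_lr=2e-5)
    optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)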
 
train_model.py CHANGED
@@ -48,14 +48,17 @@ def main():
     # Initialize chatbot
     chatbot = RetrievalChatbot(config, mode='training')
 
-    # # Load from a checkpoint
-
-    # latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
-    # initial_epoch = 0
-    # if latest_checkpoint:
-    #     ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
-    #     initial_epoch = ckpt_number
-    #     logger.info(f"Found checkpoint {latest_checkpoint}, resuming from epoch {initial_epoch}")
+    # Check for existing checkpoint and get initial epoch
+    latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
+    initial_epoch = 0
+    if latest_checkpoint:
+        try:
+            ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
+            initial_epoch = ckpt_number
+            logger.info(f"Found checkpoint {latest_checkpoint}, resuming from epoch {initial_epoch}")
+        except (IndexError, ValueError):
+            logger.error(f"Failed to parse checkpoint number from {latest_checkpoint}")
+            initial_epoch = 0
 
     # Train the model
     chatbot.train_model(
@@ -63,8 +66,9 @@
         epochs=EPOCHS,
         batch_size=batch_size,
         use_lr_schedule=True,
-        test_mode=False,
-        checkpoint_dir=CHECKPOINT_DIR
+        test_mode=True,
+        checkpoint_dir=CHECKPOINT_DIR,
+        initial_epoch=initial_epoch
     )
 
     # Save final model
48
  # Initialize chatbot
49
  chatbot = RetrievalChatbot(config, mode='training')
50
 
51
+ # Check for existing checkpoint and get initial epoch
52
+ latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
53
+ initial_epoch = 0
54
+ if latest_checkpoint:
55
+ try:
56
+ ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
57
+ initial_epoch = ckpt_number
58
+ logger.info(f"Found checkpoint {latest_checkpoint}, resuming from epoch {initial_epoch}")
59
+ except (IndexError, ValueError):
60
+ logger.error(f"Failed to parse checkpoint number from {latest_checkpoint}")
61
+ initial_epoch = 0
62
 
63
  # Train the model
64
  chatbot.train_model(
 
66
  epochs=EPOCHS,
67
  batch_size=batch_size,
68
  use_lr_schedule=True,
69
+ test_mode=True,
70
+ checkpoint_dir=CHECKPOINT_DIR,
71
+ initial_epoch=initial_epoch
72
  )
73
 
74
  # Save final model