JoeArmani committed
Commit fc5f33b · Parent: dfb45fe

checkpointing updates, optimizations

chatbot_model.py CHANGED
@@ -30,7 +30,7 @@ class ChatbotConfig:
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
     l2_reg_weight: float = 0.001
-    learning_rate: float = 0.001
+    learning_rate: float = 0.0005
     min_text_length: int = 3
     max_context_turns: int = 5
     warmup_steps: int = 200
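
Note: the base learning rate halves from 0.001 to 0.0005. Together with warmup_steps = 200 here and use_lr_schedule=True in train_model.py, the effective rate presumably ramps up before decaying; the schedule itself is not part of this diff, so the following is only a sketch of a typical warmup-then-decay shape, not the repo's implementation:

    import tensorflow as tf

    class WarmupDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        """Linear warmup to peak_lr, then inverse-sqrt decay (illustrative)."""
        def __init__(self, peak_lr=0.0005, warmup_steps=200):
            super().__init__()
            self.peak_lr = peak_lr
            self.warmup_steps = warmup_steps

        def __call__(self, step):
            step = tf.cast(step, tf.float32)
            warmup = self.peak_lr * step / self.warmup_steps
            decay = self.peak_lr * tf.math.rsqrt(
                tf.maximum(step, self.warmup_steps) / self.warmup_steps
            )
            return tf.minimum(warmup, decay)  # warmup below step 200, decay after

    optimizer = tf.keras.optimizers.Adam(WarmupDecaySchedule())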
@@ -676,8 +676,9 @@ class RetrievalChatbot(DeviceAwareModel):
 
         # Initialize checkpoint manager
         checkpoint = tf.train.Checkpoint(
-            epoch=tf.Variable(0),
-            optimizer=self.optimizer,
+            epoch=tf.Variable(0, dtype=tf.int32),
+            optimizer=self.optimizer,
+            optimizer_iterations=self.optimizer.iterations,
             model=self.encoder,
             variables=self.encoder.variables
         )
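
Note: checkpointing optimizer state is what lets a resumed run continue its learning-rate schedule instead of restarting it, since optimizer.iterations is the step counter a LearningRateSchedule receives. Because optimizer=self.optimizer already tracks the iterations variable along with the slot variables, the extra optimizer_iterations key stores the same variable under a second name; harmless, but redundant. A minimal sketch (path illustrative; exact internal variable names differ across Keras versions):

    import tensorflow as tf

    opt = tf.keras.optimizers.Adam(5e-4)
    opt.iterations.assign(123)             # pretend we've trained 123 steps
    path = tf.train.Checkpoint(optimizer=opt).save('/tmp/opt-demo')  # illustrative path

    opt2 = tf.keras.optimizers.Adam(5e-4)
    _ = opt2.iterations                    # create the counter before restoring
    ckpt2 = tf.train.Checkpoint(optimizer=opt2)
    ckpt2.restore(path).expect_partial()
    print(int(opt2.iterations))            # 123 -- schedule position survives the restore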
@@ -685,27 +686,29 @@ class RetrievalChatbot(DeviceAwareModel):
 
         # Restore from checkpoint if available
         latest_checkpoint = manager.latest_checkpoint
+        #history_path = Path(checkpoint_dir) / 'training_history.json'
         if latest_checkpoint:
-            history_path = Path(checkpoint_dir) / 'training_history.json'
-            if history_path.exists():
-                try:
-                    with open(history_path, 'r') as f:
-                        self.history = json.load(f)
-                    logger.info(f"Loaded previous training history from {history_path}")
-                except Exception as e:
-                    logger.warning(f"Could not load history, starting fresh: {e}")
-                    self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
-            else:
-                self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
+            # if history_path.exists():
+            #     try:
+            #         with open(history_path, 'r') as f:
+            #             self.history = json.load(f)
+            #         logger.info(f"Loaded previous training history from {history_path}")
+            #     except Exception as e:
+            #         logger.warning(f"Could not load history, starting fresh: {e}")
+            #         self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
+            #     else:
+            #         self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
 
             status = checkpoint.restore(latest_checkpoint)
             status.expect_partial()
-
             logger.info(f"Restored from checkpoint: {latest_checkpoint}")
+
             # Get the checkpoint number to validate initial_epoch
             ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
             if initial_epoch == 0:
                 initial_epoch = ckpt_number
+
+            checkpoint.epoch.assign(initial_epoch)
             logger.info(f"Resuming from epoch {initial_epoch}")
         else:
             logger.info("Starting training from scratch")
@@ -736,6 +739,7 @@ class RetrievalChatbot(DeviceAwareModel):
         total_pairs = subset_size
         train_size = int(total_pairs * (1 - validation_split))
         val_size = total_pairs - train_size
+        batch_size = min(batch_size, val_size)
         steps_per_epoch = math.ceil(train_size / batch_size)
         val_steps = math.ceil(val_size / batch_size)
         total_steps = steps_per_epoch * epochs
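
Note: batch_size = min(batch_size, val_size) caps the batch size at the size of the validation split, and the next hunk pairs it with drop_remainder=False so a final partial validation batch is kept rather than silently discarded; with the old settings, a split smaller than one batch yielded zero validation batches. (Minor aside on that hunk: .cache() is applied after .prefetch(); the conventional tf.data order is batch -> cache -> prefetch, though the code still runs.) A small demonstration of the failure mode:

    import math
    import tensorflow as tf

    val_size, batch_size = 20, 32
    ds = tf.data.Dataset.range(val_size)

    print(len(list(ds.batch(batch_size, drop_remainder=True))))   # 0 -- whole split dropped
    print(len(list(ds.batch(batch_size, drop_remainder=False))))  # 1 -- partial batch of 20

    batch_size = min(batch_size, val_size)   # the guard added above
    print(math.ceil(val_size / batch_size))  # val_steps = 1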
@@ -758,7 +762,7 @@ class RetrievalChatbot(DeviceAwareModel):
         train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
         train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
 
-        val_dataset = val_dataset.batch(batch_size, drop_remainder=True)
+        val_dataset = val_dataset.batch(batch_size, drop_remainder=False)
         val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
         val_dataset = val_dataset.cache()
 
@@ -766,7 +770,9 @@ class RetrievalChatbot(DeviceAwareModel):
         best_val_loss = float("inf")
         epochs_no_improve = 0
 
-        for epoch in range(initial_epoch + 1, epochs + 1):
+        for epoch in range(int(checkpoint.epoch.numpy()) + 1, epochs + 1):
+            checkpoint.epoch.assign(epoch)
+            logger.info(f"Starting Epoch {epoch}...")
             # --- Training Phase ---
             epoch_loss_avg = tf.keras.metrics.Mean()
             batches_processed = 0
@@ -790,8 +796,8 @@ class RetrievalChatbot(DeviceAwareModel):
                 elif grad_norm_value > 100:
                     logger.warning(f"Potential exploding gradient detected: norm = {grad_norm_value:.2e}")
 
-                if grad_norm_value != post_clip_value:
-                    logger.info(f"Gradient clipped: {grad_norm_value:.2e} -> {post_clip_value:.2e}")
+                # if grad_norm_value != post_clip_value:
+                #     logger.info(f"Gradient clipped: {grad_norm_value:.2e} -> {post_clip_value:.2e}")
 
                 epoch_loss_avg(loss)
                 batches_processed += 1
@@ -840,9 +846,18 @@ class RetrievalChatbot(DeviceAwareModel):
                 is_tqdm_val = False
                 logger.info("Validation progress bar disabled")
 
+            last_valid_val_loss = None  # Initialize outside the loop
+            valid_batches = False
+
             for q_batch, p_batch, n_batch in val_dataset:
+                if tf.shape(q_batch)[0] < 2:
+                    logger.warning(f"Skipping validation batch of size {tf.shape(q_batch)[0]} (too small for loss calculation)")
+                    continue
+
+                valid_batches = True
                 val_loss = self.validation_step(q_batch, p_batch, n_batch)
                 val_loss_avg(val_loss)
+                last_valid_val_loss = val_loss
                 val_batches_processed += 1
 
                 if is_tqdm_val:
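
Note: the new guard skips validation batches with fewer than two examples, presumably because the loss draws on other in-batch examples; the fallback chain in the next hunk (last valid validation loss, then training loss) keeps val_loss defined for the early-stopping comparison even when every batch was skipped. Minor nit: tf.shape(q_batch)[0] inside the f-string logs a tensor repr, so int(tf.shape(q_batch)[0]) would read cleaner. The same guard could also live at the dataset level (a sketch, assuming the (query, positive, negative) batch tuples used above):

    import tensorflow as tf

    def drop_tiny_batches(ds, min_size=2):
        """Filter out batches too small for in-batch loss computation."""
        return ds.filter(lambda q, p, n: tf.shape(q)[0] >= min_size)

    # val_dataset = drop_tiny_batches(val_dataset.batch(batch_size, drop_remainder=False))
    # val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)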
@@ -857,7 +872,18 @@ class RetrievalChatbot(DeviceAwareModel):
 
                 if val_batches_processed >= val_steps:
                     break
-
+
+            if not valid_batches:
+                logger.warning("No valid validation batches in this epoch, using last known validation loss")
+                if last_valid_val_loss is not None:
+                    val_loss = last_valid_val_loss
+                    val_loss_avg(val_loss)
+                else:
+                    # If we've never had a valid batch (first epoch), use training loss as fallback
+                    logger.warning("No previous validation loss available, using training loss as fallback")
+                    val_loss = train_loss
+                    val_loss_avg(val_loss)
+
             if is_tqdm_val and val_pbar:
                 val_pbar.close()
 
@@ -893,9 +919,10 @@ class RetrievalChatbot(DeviceAwareModel):
             self.history.setdefault('learning_rate', []).append(current_lr)
 
             # Save history to file
-            with open(history_path, 'w') as f:
-                json.dump(self.history, f)
-            logger.info(f"Saved training history to {history_path}")
+            #if history_path.exists():
+            #    with open(history_path, 'w') as f:
+            #        json.dump(self.history, f)
+            #    logger.info(f"Saved training history to {history_path}")
 
             # Early stopping logic
             if val_loss < best_val_loss - min_delta:
@@ -970,7 +997,7 @@ class RetrievalChatbot(DeviceAwareModel):
         gradients_norm = tf.linalg.global_norm(gradients)
 
         # Clip gradients if norm exceeds threshold
-        max_grad_norm = 1.0
+        max_grad_norm = 1.5
         gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm, gradients_norm)
         post_clip_norm = tf.linalg.global_norm(gradients)
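
Note: max_grad_norm rises from 1.0 to 1.5, loosening the clip. tf.clip_by_global_norm rescales every tensor in the gradient list by clip_norm / global_norm when the global norm exceeds clip_norm, so the update direction is preserved and only its magnitude shrinks; passing the precomputed gradients_norm as use_norm avoids computing the norm twice. A standalone illustration:

    import tensorflow as tf

    grads = [tf.constant([3.0, 4.0]), tf.constant([0.0, 12.0])]  # global norm = 13.0
    norm = tf.linalg.global_norm(grads)

    clipped, _ = tf.clip_by_global_norm(grads, clip_norm=1.5, use_norm=norm)
    print(norm.numpy())                            # 13.0
    print(tf.linalg.global_norm(clipped).numpy())  # 1.5 -- scaled by 1.5 / 13.0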
 
 
training_plotter.py → plotter.py RENAMED
@@ -2,11 +2,8 @@ from pathlib import Path
 from typing import Dict, List, Optional
 import matplotlib.pyplot as plt
 from datetime import datetime
-import logging
 
-logger = logging.getLogger(__name__)
-
-class TrainingPlotter:
+class Plotter:
     def __init__(self, save_dir: Optional[Path] = None):
         self.save_dir = save_dir
         if save_dir:
@@ -18,10 +15,7 @@ class TrainingPlotter:
         Args:
             history: Dictionary containing training metrics
             title: Title for the plot
-        """
-        # Silence matplotlib debug messages
-        logger.setLevel(logging.WARNING)
-
+        """
         # Create figure with subplots
         fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
 
@@ -46,15 +40,11 @@
         plt.suptitle(title)
         plt.tight_layout()
 
-        # Reset the logger level
-        logger.setLevel(logging.INFO)
-
         # Save if directory provided
         if self.save_dir:
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             save_path = self.save_dir / f'training_history_{timestamp}.png'
             plt.savefig(save_path)
-            logger.info(f"Saved training history plot to {save_path}")
 
         plt.show()
 
@@ -64,8 +54,6 @@
         Args:
             metrics: Dictionary of validation metrics. Can handle nested dictionaries.
         """
-        # Silence matplotlib debug messages
-        logger.setLevel(logging.WARNING)
 
         # Flatten nested metrics dictionary
         flat_metrics = {}
@@ -83,7 +71,6 @@
             flat_metrics[key] = value
 
         if not flat_metrics:
-            logger.warning("No numeric metrics to plot")
             return
 
         plt.figure(figsize=(12, 6))
@@ -113,15 +100,11 @@
         # Adjust layout to prevent label cutoff
         plt.tight_layout()
 
-        # Reset the logger level
-        logger.setLevel(logging.INFO)
-
         # Save if directory provided
        if self.save_dir:
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             save_path = self.save_dir / f'validation_metrics_{timestamp}.png'
             plt.savefig(save_path)
-            logger.info(f"Saved validation metrics plot to {save_path}")
 
         plt.show()
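
Note: with the rename from TrainingPlotter to Plotter, the module-level logger and the setLevel() calls are gone. Those calls adjusted this module's own logger rather than matplotlib's, so they never actually silenced matplotlib's debug output, and forcing the level back to INFO afterwards was a side effect on shared logger state; callers that want the old "saved plot" messages can log around the calls instead. Expected usage after the rename (history values illustrative):

    from pathlib import Path
    from plotter import Plotter

    plotter = Plotter(save_dir=Path('plots'))
    plotter.plot_training_history({
        'train_loss': [1.2, 0.9, 0.7],
        'val_loss': [1.3, 1.0, 0.8],
        'learning_rate': [5e-4, 5e-4, 5e-4],
    })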
 
 
readme.md CHANGED
@@ -18,9 +18,6 @@ This package automatically downloads the following models during installation:
 
     pip install -e .
 
-On Linux with Cuda/GPU:
-    pip install faiss-gpu>=1.7.0
-
 ## Description
 
 This Python script demonstrates a complete pipeline for dialogue augmentation, including validation, optimization, and data augmentation.
 
train_model.py CHANGED
@@ -1,7 +1,7 @@
 import tensorflow as tf
 from chatbot_model import RetrievalChatbot, ChatbotConfig
 from environment_setup import EnvironmentSetup
-from training_plotter import TrainingPlotter
+from plotter import Plotter
 
 from logger_config import config_logger
 logger = config_logger(__name__)
@@ -38,7 +38,7 @@ def main():
     # Training configuration
     EPOCHS = 20
     TF_RECORD_FILE_PATH = 'training_data/training_data.tfrecord'
-
+    CHECKPOINT_DIR = 'checkpoints/'
     # Optimize batch size for Colab
     batch_size = 32  # env.optimize_batch_size(base_batch_size=16)
 
@@ -48,14 +48,14 @@
     # Initialize chatbot
     chatbot = RetrievalChatbot(config, mode='training')
 
-    # Load from a checkpoint
-    checkpoint_dir = 'checkpoints/'
-    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
-    initial_epoch = 0
-    if latest_checkpoint:
-        ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
-        initial_epoch = ckpt_number
-        logger.info(f"Found checkpoint {latest_checkpoint}, resuming from epoch {initial_epoch}")
+    # # Load from a checkpoint
+
+    # latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
+    # initial_epoch = 0
+    # if latest_checkpoint:
+    #     ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
+    #     initial_epoch = ckpt_number
+    #     logger.info(f"Found checkpoint {latest_checkpoint}, resuming from epoch {initial_epoch}")
 
     # Train the model
     chatbot.train_model(
@@ -64,7 +64,7 @@
         batch_size=batch_size,
         use_lr_schedule=True,
         test_mode=False,
-        initial_epoch=initial_epoch
+        checkpoint_dir=CHECKPOINT_DIR
     )
 
     # Save final model
@@ -72,7 +72,7 @@
     chatbot.save_models(model_save_path)
 
     # Plot and save training history
-    plotter = TrainingPlotter(save_dir=env.training_dirs['plots'])
+    plotter = Plotter(save_dir=env.training_dirs['plots'])
     plotter.plot_training_history(chatbot.history)
 
 if __name__ == "__main__":
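
Note: the driver no longer parses 'ckpt-N' out of the filename to compute initial_epoch; it passes checkpoint_dir and lets train_model discover, restore, and resume from the latest checkpoint itself, so the two resume paths cannot drift apart. The call site reduces to roughly the following fragment (the leading arguments are elided in the hunk above, so only the trailing keywords are certain):

    chatbot.train_model(
        # ... data path and epoch arguments as before ...
        batch_size=batch_size,
        use_lr_schedule=True,
        test_mode=False,
        checkpoint_dir=CHECKPOINT_DIR,  # replaces initial_epoch=initial_epoch
    )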
 
validate_model.py CHANGED
@@ -3,7 +3,7 @@ import json
 from chatbot_model import ChatbotConfig, RetrievalChatbot
 from response_quality_checker import ResponseQualityChecker
 from chatbot_validator import ChatbotValidator
-from training_plotter import TrainingPlotter
+from plotter import Plotter
 from environment_setup import EnvironmentSetup
 
 from logger_config import config_logger
@@ -103,7 +103,7 @@ def validate_chatbot():
 
     # Plot validation_metrics
     try:
-        plotter = TrainingPlotter(save_dir=env.training_dirs['plots'])
+        plotter = Plotter(save_dir=env.training_dirs['plots'])
         plotter.plot_validation_metrics(validation_metrics)
         logger.info("Validation metrics plotted successfully.")
     except Exception as e:
 