Add custom sampler, train dataloader, and GRPO-style train loop for ReTool_trainer
src/retool_trainer.py  CHANGED  +350 −7
@@ -1,8 +1,18 @@
-from typing import Any, Callable, Optional, Union
-from collections import defaultdict
-import re
 import profiling_decorator

 import datasets
 import torch
 import torch.utils.data
@@ -14,6 +24,7 @@ from torch import nn
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.utils.data import DataLoader, Sampler
 from transformers import (
     AutoModelForCausalLM,
     AutoModelForSequenceClassification,
     AutoTokenizer,
@@ -28,6 +39,105 @@ from transformers import (
 from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache



 class ReToolTrainer(Trainer):  # Change this line

@@ -52,16 +162,19 @@ class ReToolTrainer(Trainer):  # Change this line
         mask_truncated_completions: bool = True,
         **kwargs
     ):
-        # Initialize parent Trainer (simpler call)
         super().__init__(
             model=model,
-            tokenizer=processing_class,  # Note: Trainer uses 'tokenizer', not 'processing_class'
             args=args,
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
             **kwargs
         )
-

         # Store processing_class for compatibility
         self.processing_class = processing_class or self.tokenizer
@@ -115,6 +228,76 @@ class ReToolTrainer(Trainer):  # Change this line
             use_cache=True,
             cache_implementation=args.cache_implementation,  # args.cache_implementation = 'Offloaded Cache'
         )
    def _get_interpreter_token_ids(self) -> list[int]:
        """Get token IDs for <interpreter> and </interpreter> tags."""
        start_token = self.processing_class.encode("<interpreter>", add_special_tokens=False)[0]
@@ -725,4 +908,164 @@ def _compute_loss(self, model, inputs):
         self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item())
         gathered_clip_ratio = self.accelerator.gather_for_metrics(clip_ratio)
         self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item())
-        return loss
@@ -1,8 +1,18 @@
 import profiling_decorator

+import copy
+import inspect
+import os
+import re
+import textwrap
+import warnings
+from collections import defaultdict, deque
+from collections.abc import Sequence, Sized
+from contextlib import nullcontext
+from functools import partial
+from pathlib import Path
+from typing import Any, Callable, Optional, Union
+
 import datasets
 import torch
 import torch.utils.data
@@ -14,6 +24,7 @@ from torch import nn
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.utils.data import DataLoader, Sampler
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
     AutoModelForSequenceClassification,
     AutoTokenizer,
@@ -28,6 +39,105 @@ from transformers import (
 from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache


+class HFRepeatSampler(Sampler):
+    """
+    Sampler that repeats the indices of a dataset in a structured manner.
+
+    Args:
+        data_source (`Sized`):
+            Dataset to sample from.
+        mini_repeat_count (`int`):
+            Number of times to repeat each index per batch.
+        batch_size (`int`, *optional*, defaults to `1`):
+            Number of unique indices per batch.
+        repeat_count (`int`, *optional*, defaults to `1`):
+            Number of times to repeat the full sampling process.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the dataset.
+        seed (`int` or `None`, *optional*, defaults to `None`):
+            Random seed for reproducibility (only affects this sampler).
+
+    Example:
+    ```python
+    >>> sampler = HFRepeatSampler(
+    ...     ["a", "b", "c", "d", "e", "f", "g"], mini_repeat_count=2, batch_size=3, repeat_count=4
+    ... )
+    >>> list(sampler)
+    [4, 4, 3, 3, 0, 0,
+     4, 4, 3, 3, 0, 0,
+     4, 4, 3, 3, 0, 0,
+     4, 4, 3, 3, 0, 0,
+     1, 1, 2, 2, 6, 6,
+     1, 1, 2, 2, 6, 6,
+     1, 1, 2, 2, 6, 6,
+     1, 1, 2, 2, 6, 6]
+    ```
+
+    ```txt
+    mini_repeat_count = 3
+          -   -   -
+         [0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
+          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
+          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11,      |
+                                                                 repeat_count = 2
+          0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
+          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
+          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11, ...] |
+          ---------   ---------   ---------   ---------
+           ---------   ---------   ---------   ---------
+            ---------   ---------   ---------   ---------
+                          batch_size = 12
+    ```
+    """
+
+    def __init__(
+        self,
+        data_source: Sized,
+        mini_repeat_count: int,
+        batch_size: int = 1,
+        repeat_count: int = 1,
+        shuffle: bool = True,
+        seed: Optional[int] = None,
+    ):
+        self.data_source = data_source
+        self.mini_repeat_count = mini_repeat_count
+        self.batch_size = batch_size
+        self.repeat_count = repeat_count
+        self.num_samples = len(data_source)
+        self.shuffle = shuffle
+        self.seed = seed
+
+        if shuffle:
+            self.generator = torch.Generator()  # Create a local random generator
+            if seed is not None:
+                self.generator.manual_seed(seed)
+
+    def __iter__(self):
+        if self.shuffle:
+            # E.g., [2, 4, 3, 1, 0, 6, 5] (num_samples = 7)
+            indexes = torch.randperm(self.num_samples, generator=self.generator).tolist()
+        else:
+            indexes = list(range(self.num_samples))
+
+        # [2, 4, 3, 1, 0, 6, 5]
+        # -> [[2, 4, 3], [1, 0, 6], [5]]  (batch_size = 3)
+        indexes = [indexes[i : i + self.batch_size] for i in range(0, len(indexes), self.batch_size)]
+
+        # [[2, 4, 3], [1, 0, 6], [5]]
+        # -> [[2, 4, 3], [1, 0, 6]]
+        indexes = [chunk for chunk in indexes if len(chunk) == self.batch_size]
+
+        for chunk in indexes:
+            for _ in range(self.repeat_count):
+                for index in chunk:
+                    for _ in range(self.mini_repeat_count):
+                        yield index
+
+    def __len__(self) -> int:
+        return (self.num_samples // self.batch_size) * self.batch_size * self.mini_repeat_count * self.repeat_count
+
+
+

 class ReToolTrainer(Trainer):  # Change this line

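For reference, here is a quick way to sanity-check the ordering HFRepeatSampler produces for GRPO-style grouping. The toy dataset and the values used for `mini_repeat_count`, `batch_size`, and `repeat_count` below are illustrative assumptions, not values taken from the trainer's configuration:

```python
# Illustrative only: 6 prompts, 2 completions per prompt, 2 unique prompts per generation
# batch, and each generation batch re-served twice for reuse across optimizer passes.
prompts = ["p0", "p1", "p2", "p3", "p4", "p5"]

sampler = HFRepeatSampler(
    data_source=prompts,
    mini_repeat_count=2,   # num_generations: each prompt index appears twice in a row
    batch_size=2,          # generation_batch_size // num_generations unique prompts per batch
    repeat_count=2,        # num_iterations * steps_per_generation: reuse each chunk twice
    shuffle=False,         # keep the order deterministic for the printout
)

print(list(sampler))
# [0, 0, 1, 1,  0, 0, 1, 1,  2, 2, 3, 3,  2, 2, 3, 3,  4, 4, 5, 5,  4, 4, 5, 5]
print(len(sampler))  # 24 = (6 // 2) * 2 * 2 * 2
```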
|
162 |
mask_truncated_completions: bool = True,
|
163 |
**kwargs
|
164 |
):
|
165 |
+
# Initialize parent Trainer (simpler call)
|
166 |
super().__init__(
|
167 |
model=model,
|
|
|
168 |
args=args,
|
169 |
+
tokenizer=processing_class, # Note: Trainer uses 'tokenizer', not 'processing_class'
|
170 |
+
data_collator=identity, # No data collation is needed in GRPO
|
171 |
train_dataset=train_dataset,
|
172 |
eval_dataset=eval_dataset,
|
173 |
+
processing_class=processing_class,
|
174 |
+
callbacks=callbacks,
|
175 |
+
optimizers=optimizers,
|
176 |
**kwargs
|
177 |
)
|
|
|
178 |
|
179 |
# Store processing_class for compatibility
|
180 |
self.processing_class = processing_class or self.tokenizer
|
|
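The `identity` collator passed to `data_collator` above is not shown in this diff; in TRL's GRPO trainer the corresponding collator is a simple pass-through that hands the sampled examples to the training step unchanged, since tokenization and generation happen later. A minimal sketch of what such a collator looks like (the name and signature here are assumptions for illustration, not the committed code):

```python
def identity(batch):
    # Hypothetical pass-through collator: return the list of sampled examples as-is,
    # so no tensor stacking happens before generation/scoring.
    return batch
```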
|
228 |
use_cache=True,
|
229 |
cache_implementation=args.cache_implementation, #args.cache_implementation = 'Offloaded Cache'
|
230 |
)
|
231 |
+
def _set_signature_columns_if_needed(self):
|
232 |
+
# If `self.args.remove_unused_columns` is True, non-signature columns are removed.
|
233 |
+
# By default, this method sets `self._signature_columns` to the model's expected inputs.
|
234 |
+
# In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
|
235 |
+
# Instead, we set them to the columns expected by the `training_step` method, hence the override.
|
236 |
+
if self._signature_columns is None:
|
237 |
+
self._signature_columns = ["prompt", "image"]
|
238 |
+
|
239 |
+
def _get_train_sampler(self, dataset=None):
|
240 |
+
"""Override to use RepeatSampler for GRPO."""
|
241 |
+
# Returns a sampler that
|
242 |
+
# 1. ensures each prompt is repeated across multiple processes. This guarantees that identical prompts are
|
243 |
+
# distributed to different GPUs, allowing rewards to be computed and normalized correctly within each prompt
|
244 |
+
# group. Using the same seed across processes ensures consistent prompt assignment, preventing discrepancies
|
245 |
+
# in group formation.
|
246 |
+
# 2. repeats the batch multiple times to allow reusing generations across multiple updates. Refer to
|
247 |
+
# _prepare_inputs to see how the generations are stored and reused.
|
248 |
+
|
249 |
+
# In the following figure, the values are the prompt indices. The first row shows the first sampled batch, the
|
250 |
+
# second row shows the second sampled batch, and so on.
|
251 |
+
#
|
252 |
+
# | GPU 0 | GPU 1 |
|
253 |
+
#
|
254 |
+
# global_step step <-───> num_generations=2
|
255 |
+
# <-───────> per_device_train_batch_size=3
|
256 |
+
# grad_accum ▲ ▲ 0 0 0 0 1 1 2 2 <- Generate for the first `steps_per_generation` (prompts 0 to 11); store the completions; use the first slice to compute the loss
|
257 |
+
# =2 ▼ | 0 1 3 3 4 4 5 5 <- Take the stored generations and use the second slice to compute the loss
|
258 |
+
# |
|
259 |
+
# | 1 2 6 6 7 7 8 8 <- Take the stored generations and use the third slice to compute the loss
|
260 |
+
# steps_per_gen=4 ▼ 1 3 9 9 10 10 11 11 <- Take the stored generations and use the fourth slice to compute the loss
|
261 |
+
#
|
262 |
+
# 2 4 12 12 13 13 14 14 <- Generate for the second `steps_per_generation` (prompts 12 to 23); store the completions; use the first slice to compute the loss
|
263 |
+
# 2 5 15 15 16 16 17 17 <- Take the stored generations and use the second slice to compute the loss
|
264 |
+
# ...
|
265 |
+
if dataset is None:
|
266 |
+
dataset = self.train_dataset
|
267 |
+
|
268 |
+
return HFRepeatSampler(
|
269 |
+
data_source=dataset,
|
270 |
+
mini_repeat_count=self.num_generations, # e.g., 4 completions per prompt
|
271 |
+
batch_size=self.args.generation_batch_size // self.num_generations, # correction
|
272 |
+
repeat_count=self.num_iterations * self.args.steps_per_generation, # correction
|
273 |
+
shuffle=True,
|
274 |
+
seed=self.args.seed
|
275 |
+
)
|
276 |
+
|
277 |
+
def get_train_dataloader(self):
|
278 |
+
"""Override to ensure our custom sampler is used."""
|
279 |
+
if self.train_dataset is None:
|
280 |
+
raise ValueError("Trainer: training requires a train_dataset.")
|
281 |
+
|
282 |
+
train_dataset = self.train_dataset
|
283 |
+
data_collator = self.data_collator
|
284 |
+
if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
|
285 |
+
train_dataset = self._remove_unused_columns(train_dataset, description="training")
|
286 |
+
else:
|
287 |
+
data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
|
288 |
+
|
289 |
+
sampler = self._get_train_sampler(train_dataset)
|
290 |
+
dataloader_batch_size = self._train_batch_size * self.args.steps_per_generation
|
291 |
+
|
292 |
+
return DataLoader(
|
293 |
+
train_dataset,
|
294 |
+
batch_size= self.args.generation_batch_size, # < this is the change, HF was useing dataloader_batch_size
|
295 |
+
sampler=sampler,
|
296 |
+
collate_fn=data_collator,
|
297 |
+
drop_last=self.args.dataloader_drop_last,
|
298 |
+
num_workers=self.args.dataloader_num_workers,
|
299 |
+
)
|
300 |
+
|
301 |
def _get_interpreter_token_ids(self) -> list[int]:
|
302 |
"""Get token IDs for <interpreter> and </interpreter> tags."""
|
303 |
start_token = self.processing_class.encode("<interpreter>", add_special_tokens=False)[0]
|
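Worked through with illustrative numbers, the sampler and dataloader overrides above fit together as follows. The concrete values in this sketch are assumptions chosen for readability, not settings from the repository; only the arithmetic mirrors the code:

```python
# Illustrative GRPO batch arithmetic for the overrides above (all values assumed).
num_generations = 4            # completions per prompt (mini_repeat_count)
generation_batch_size = 16     # completions produced per sampling round
steps_per_generation = 4       # optimizer steps served by one sampling round
num_iterations = 1             # reuse passes over each stored generation batch
per_device_train_batch_size = 2

unique_prompts = generation_batch_size // num_generations       # 4 prompts per round
repeat_count = num_iterations * steps_per_generation             # each chunk yielded 4 times

# The overridden get_train_dataloader pulls one full sampling round per DataLoader batch:
custom_dataloader_batch = generation_batch_size                   # 16

# The stock Hugging Face GRPO dataloader would instead have used:
hf_dataloader_batch = per_device_train_batch_size * steps_per_generation  # 8

print(unique_prompts, repeat_count, custom_dataloader_batch, hf_dataloader_batch)
```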
|
|
@@ -725,4 +908,164 @@ def _compute_loss(self, model, inputs):
         self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item())
         gathered_clip_ratio = self.accelerator.gather_for_metrics(clip_ratio)
         self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item())
+        return loss
+
+    def train(self):
+        """
+        Comprehensive training loop for ReTool with GRPO.
+        Adapted from train_with_batching to work as a method.
+        """
+        # Initialize
+        self.model.train()
+        if not hasattr(self, 'ref_model') or self.ref_model is None:
+            self.ref_model = copy.deepcopy(self.model)
+            self.ref_model.eval()
+
+        # Setup tracking
+        writer = SummaryWriter(self.args.logging_dir)
+        training_history = []
+
+        # Get dataloader with our custom sampler
+        train_dataloader = self.get_train_dataloader()
+
+        # Generation storage for reuse
+        stored_generation_outputs = None
+        generation_counter = 0
+        global_step = 0
+
+        for epoch in range(self.args.num_train_epochs):
+            epoch_metrics = []
+            start_mem = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
+
+            for batch_idx, batch in enumerate(train_dataloader):
+                # batch already has repeated prompts from our RepeatSampler
+                # Shape: (generation_batch_size, ...) where generation_batch_size = unique_prompts * num_generations
+
+                # Determine if we need new generations
+                generate_new = (global_step % (self.args.steps_per_generation * self.num_iterations)) == 0
+
+                if generate_new:
+                    print(f"Generating new completions at step {global_step}")
+                    with torch.no_grad():
+                        # This is where the ReTool magic happens - generate with code execution!
+                        stored_generation_outputs = self._generate_and_score_completions(batch)
+                    generation_counter = 0
+
+                # Now train on the stored generations
+                # This replaces the mini/micro batch logic of the original train_with_batching function
+                batch_loss = self._train_on_stored_generations(
+                    stored_generation_outputs,
+                    epoch_metrics
+                )
+
+                global_step += 1
+                generation_counter += 1
+
+                # Logging
+                if global_step % self.args.logging_steps == 0:
+                    self._log_training_metrics(writer, epoch_metrics, global_step)
+
+                # Optional: Check for training instability
+                if self._should_stop_training(epoch_metrics):
+                    print("Training instability detected! Stopping early.")
+                    return training_history
+
+            # End of epoch
+            end_mem = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
+            epoch_summary = self._compute_epoch_summary(epoch_metrics, start_mem, end_mem)
+            training_history.append(epoch_summary)
+
+            # Log epoch results
+            self._log_epoch_metrics(epoch, epoch_summary, writer)
+
+            # Update scheduler if we have one
+            if hasattr(self, 'scheduler') and self.scheduler is not None:
+                self.scheduler.step(epoch_summary['mean_reward'])
+                print(f"Current learning rate: {self.optimizer.param_groups[0]['lr']}")
+
+        writer.close()
+        return training_history
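The reuse schedule in `train()` hinges on the `generate_new` test: completions are regenerated only when `global_step` is a multiple of `steps_per_generation * num_iterations`, and every step in between trains on the stored outputs. A small check of that cadence, with assumed values for the two settings:

```python
# Illustrative check of the generation-reuse cadence used in train();
# steps_per_generation and num_iterations are assumed example values.
steps_per_generation = 4
num_iterations = 2

schedule = [
    step % (steps_per_generation * num_iterations) == 0
    for step in range(16)
]
# Fresh generations at steps 0 and 8; the seven steps after each reuse the stored completions.
print([step for step, fresh in enumerate(schedule) if fresh])  # [0, 8]
```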
+
+    def _train_on_stored_generations(self, generation_outputs, epoch_metrics):
+        """
+        Train on stored generations with mini/micro-batching.
+        This replaces the inner loops of the original train_with_batching function.
+        """
+        # Extract components from generation_outputs
+        # These already include code execution results and advantages!
+        prompt_ids = generation_outputs['prompt_ids']
+        completion_ids = generation_outputs['completion_ids']
+        advantages = generation_outputs['advantages']
+        completion_mask = generation_outputs['completion_mask']
+        interpreter_mask = generation_outputs.get('interpreter_mask', completion_mask)
+
+        batch_size = prompt_ids.size(0)
+
+        # Mini-batch size: process multiple groups together
+        # Each group has num_generations completions
+        mini_batch_size = self.args.per_device_train_batch_size * self.num_generations
+
+        # Micro-batch size: for memory efficiency within a mini-batch
+        micro_batch_size = max(self.num_generations, 4)  # At least one full group
+
+        total_loss = 0
+        num_updates = 0
+
+        # Shuffle indices for this training iteration
+        indices = torch.randperm(batch_size)
+
+        # Process in mini-batches
+        for mini_start in range(0, batch_size, mini_batch_size):
+            mini_end = min(mini_start + mini_batch_size, batch_size)
+            mini_indices = indices[mini_start:mini_end]
+
+            self.optimizer.zero_grad()
+            mini_batch_loss = 0
+            num_micro_batches = 0
+
+            # Process in micro-batches (gradient accumulation)
+            for micro_start in range(0, len(mini_indices), micro_batch_size):
+                micro_end = min(micro_start + micro_batch_size, len(mini_indices))
+                micro_indices = mini_indices[micro_start:micro_end]
+
+                # Create micro-batch
+                micro_batch = {
+                    'prompt_ids': prompt_ids[micro_indices],
+                    'prompt_mask': generation_outputs['prompt_mask'][micro_indices],
+                    'completion_ids': completion_ids[micro_indices],
+                    'completion_mask': completion_mask[micro_indices],
+                    'interpreter_mask': interpreter_mask[micro_indices],
+                    'advantages': advantages[micro_indices]
+                }
+
+                # Compute GRPO loss (this uses the _compute_loss method above)
+                loss = self._compute_loss(self.model, micro_batch)
+
+                # Scale for gradient accumulation
+                scaled_loss = loss * (len(micro_indices) / len(mini_indices))
+                scaled_loss.backward()
+
+                mini_batch_loss += loss.item()
+                num_micro_batches += 1
+
+            # Gradient clipping and optimizer step
+            grad_norm = torch.nn.utils.clip_grad_norm_(
+                self.model.parameters(),
+                max_norm=1.0
+            )
+            self.optimizer.step()
+
+            # Track metrics
+            batch_metrics = {
+                'loss': mini_batch_loss / num_micro_batches,
+                'gradient_norm': grad_norm.item(),
+                'batch_size': len(mini_indices),
+                'advantages_mean': advantages[mini_indices].mean().item(),
+                'advantages_std': advantages[mini_indices].std().item()
+            }
+            epoch_metrics.append(batch_metrics)
+
+            total_loss += mini_batch_loss
+            num_updates += 1
+
+        return total_loss / max(num_updates, 1)
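To see how the mini/micro-batch split in `_train_on_stored_generations` walks over a stored generation batch, here is the same index bookkeeping in isolation; the sizes are assumed toy values, not values from the trainer configuration:

```python
import torch

# Illustrative mini/micro-batch slicing mirroring _train_on_stored_generations.
batch_size = 16        # completions stored from one generation round
mini_batch_size = 8    # per_device_train_batch_size * num_generations
micro_batch_size = 4   # gradient-accumulation chunk, at least one prompt group

indices = torch.randperm(batch_size)

for mini_start in range(0, batch_size, mini_batch_size):
    mini_indices = indices[mini_start:mini_start + mini_batch_size]
    # One optimizer step per mini-batch.
    for micro_start in range(0, len(mini_indices), micro_batch_size):
        micro_indices = mini_indices[micro_start:micro_start + micro_batch_size]
        # Each micro-batch contributes a backward pass scaled by its share of the mini-batch.
        scale = len(micro_indices) / len(mini_indices)
        print(f"mini {mini_start // mini_batch_size}: micro-batch of {len(micro_indices)}, loss scale {scale:.2f}")
```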