# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from dataclasses import dataclass, field

from ..trainer.utils import OnPolicyConfig


@dataclass
class RLOOConfig(OnPolicyConfig):
r""" | |
Configuration class for the [`RLOOTrainer`]. | |
This class includes only the parameters that are specific to RLOO training. For a full list of training arguments, | |
please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default | |
values in this class may differ from those in [`~transformers.TrainingArguments`]. | |
Using [`~transformers.HfArgumentParser`] we can turn this class into | |
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the | |
command line. | |
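
    Example (a minimal sketch of that pattern; the script name and flag values shown are illustrative, not defaults
    of this class):

    ```python
    from transformers import HfArgumentParser

    from trl import RLOOConfig

    # Run as e.g. `python train_rloo.py --output_dir rloo_out --rloo_k 4 --kl_coef 0.1`
    parser = HfArgumentParser(RLOOConfig)
    (config,) = parser.parse_args_into_dataclasses()
    print(config.rloo_k, config.kl_coef)
    ```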

    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range.
        rloo_k (`int`, *optional*, defaults to `2`):
            REINFORCE Leave-One-Out (RLOO) number of online samples per prompt.
        normalize_reward (`bool`, *optional*, defaults to `False`):
            Whether to normalize rewards.
        reward_clip_range (`float`, *optional*, defaults to `10.0`):
            Clip range for rewards.
        normalize_advantage (`bool`, *optional*, defaults to `False`):
            Whether to normalize advantages.
        token_level_kl (`bool`, *optional*, defaults to `False`):
            Whether to use token-level KL penalty or sequence-level KL penalty.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for
            generation, improving generation speed. However, disabling this option allows training models that exceed
            the VRAM capacity of a single GPU, albeit at the cost of slower generation.
    """

    exp_name: str = field(
        default=os.path.basename(__file__)[:-3],
        metadata={"help": "Name of this experiment."},
    )
    reward_model_path: str = field(
        default="EleutherAI/pythia-160m",
        metadata={"help": "Path to the reward model."},
    )
    num_ppo_epochs: int = field(
        default=4,
        metadata={"help": "Number of epochs to train."},
    )
    whiten_rewards: bool = field(
        default=False,
        metadata={"help": "Whether to whiten the rewards."},
    )
    kl_coef: float = field(
        default=0.05,
        metadata={"help": "KL coefficient."},
    )
    cliprange: float = field(
        default=0.2,
        metadata={"help": "Clip range."},
    )
    rloo_k: int = field(
        default=2,
        metadata={"help": "REINFORCE Leave-One-Out (RLOO) number of online samples per prompt."},
    )
    normalize_reward: bool = field(
        default=False,
        metadata={"help": "Whether to normalize rewards"},
    )
    reward_clip_range: float = field(
        default=10.0,
        metadata={"help": "Clip range for rewards"},
    )
    normalize_advantage: bool = field(
        default=False,
        metadata={"help": "Whether to normalize advantages"},
    )
    token_level_kl: bool = field(
        default=False,
        metadata={"help": "Whether to use token-level KL penalty or sequence-level KL penalty"},
    )
    ds3_gather_for_generation: bool = field(
        default=True,
        metadata={
            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
            "generation, improving generation speed. However, disabling this option allows training models that "
            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation."
        },
    )