# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from dataclasses import dataclass, field
from typing import Literal, Optional

from ..trainer.utils import OnPolicyConfig

@dataclass
class PPOConfig(OnPolicyConfig):
r""" | |
Configuration class for the [`PPOTrainer`]. | |
This class includes only the parameters that are specific to PPO training. For a full list of training arguments, | |
please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default | |
values in this class may differ from those in [`~transformers.TrainingArguments`]. | |
Using [`~transformers.HfArgumentParser`] we can turn this class into | |
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the | |
command line. | |
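
    For example, a minimal script could parse this config from the command line as in the sketch below (the script
    name `ppo.py` and the top-level `from trl import PPOConfig` import are illustrative assumptions):

    ```python
    # ppo.py -- hypothetical example script name
    from transformers import HfArgumentParser

    from trl import PPOConfig

    parser = HfArgumentParser(PPOConfig)
    # Parses e.g. `python ppo.py --kl_coef 0.1 --num_ppo_epochs 2` into a PPOConfig instance.
    (training_args,) = parser.parse_args_into_dataclasses()
    print(training_args.kl_coef, training_args.num_ppo_epochs)
    ```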
    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        model_adapter_name (`str` or `None`, *optional*, defaults to `None`):
            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
        ref_adapter_name (`str` or `None`, *optional*, defaults to `None`):
            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient.
        kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`):
            Which KL divergence estimator to use, from
            [Approximating KL Divergence](http://joschu.net/blog/kl-approx.html). Defaults to `"k1"`, a
            straightforward, unbiased estimator. Can be set to `"k3"`, an unbiased estimator with lower variance that
            "appears to be a strictly better estimator" (a small sketch of both estimators follows the parameter
            list). `"k2"` cannot be selected, as it is only used for logging purposes.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range.
        vf_coef (`float`, *optional*, defaults to `0.1`):
            Value function coefficient.
        cliprange_value (`float`, *optional*, defaults to `0.2`):
            Clip range for the value function.
        gamma (`float`, *optional*, defaults to `1.0`):
            Discount factor.
        lam (`float`, *optional*, defaults to `0.95`):
            Lambda value for GAE.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for
            generation, improving generation speed. However, disabling this option allows training models that
            exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation.
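
    As a rough, standalone illustration of the two `kl_estimator` options (the `logprobs`/`ref_logprobs` tensors
    below are made up for the example; this is not the trainer's internal code):

    ```python
    import torch

    # Hypothetical per-token log-probabilities under the policy and the reference model.
    logprobs = torch.tensor([-1.2, -0.8, -2.0])
    ref_logprobs = torch.tensor([-1.0, -1.1, -1.9])

    logr = ref_logprobs - logprobs    # per-token log ratio log(pi_ref / pi)
    k1 = -logr                        # unbiased, but can be negative and has higher variance
    k3 = (logr.exp() - 1) - logr      # unbiased, always >= 0, lower variance
    print(k1.mean(), k3.mean())
    ```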
""" | |
exp_name: str = field( | |
default=os.path.basename(__file__)[:-3], | |
metadata={"help": "Name of this experiment."}, | |
) | |
reward_model_path: str = field( | |
default="EleutherAI/pythia-160m", | |
metadata={"help": "Path to the reward model."}, | |
) | |
model_adapter_name: Optional[str] = field( | |
default=None, | |
metadata={"help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."}, | |
) | |
ref_adapter_name: Optional[str] = field( | |
default=None, | |
metadata={"help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."}, | |
) | |
num_ppo_epochs: int = field( | |
default=4, | |
metadata={"help": "Number of epochs to train."}, | |
) | |
whiten_rewards: bool = field( | |
default=False, | |
metadata={"help": "Whether to whiten the rewards."}, | |
) | |
kl_coef: float = field( | |
default=0.05, | |
metadata={"help": "KL coefficient."}, | |
) | |
    kl_estimator: Literal["k1", "k3"] = field(
        default="k1",
        metadata={
            "help": "Which KL divergence estimator to use, from Approximating KL Divergence "
            "(http://joschu.net/blog/kl-approx.html). Defaults to 'k1', a straightforward, unbiased estimator. Can "
            "be set to 'k3', an unbiased estimator with lower variance that 'appears to be a strictly better "
            "estimator'. 'k2' cannot be selected, as it is only used for logging purposes."
        },
    )
    cliprange: float = field(
        default=0.2,
        metadata={"help": "Clip range."},
    )
    vf_coef: float = field(
        default=0.1,
        metadata={"help": "Value function coefficient."},
    )
    cliprange_value: float = field(
        default=0.2,
        metadata={"help": "Clip range for the value function."},
    )
    gamma: float = field(
        default=1.0,
        metadata={"help": "Discount factor."},
    )
    lam: float = field(
        default=0.95,
        metadata={"help": "Lambda value for GAE."},
    )
    ds3_gather_for_generation: bool = field(
        default=True,
        metadata={
            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
            "generation, improving generation speed. However, disabling this option allows training models that "
            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation."
        },
    )