Spaces:

jbilcke-hf
/

VideoModelStudio

Running

App Files Files Community

VideoModelStudio / docs /finetrainers-src-codebase /finetrainers /optimizer.py

jbilcke-hf HF Staff

we are going to hack into finetrainers

9fd1204 8 days ago

raw

history blame

16.6 kB

	import functools
	import math
	from typing import Any, Callable, Dict, List, Optional, Type, Union

	import torch
	from torch.distributed.checkpoint.state_dict import (
	StateDictOptions,
	get_optimizer_state_dict,
	set_optimizer_state_dict,
	)
	from torch.distributed.checkpoint.stateful import Stateful

	from .parallel import ParallelBackendEnum
	from .utils.import_utils import is_bitsandbytes_available


	class OptimizerWrapper(Stateful):
	r"""
	Optimizer wrapper that:
	- allows step/zero_grad on multiple optimizers needed for virtual pipeline stages
	- saves/loading optimizer state_dict at checkpoint
	"""

	def __init__(
	self,
	model_parts: List[torch.nn.Module],
	optimizer_cls: Type[torch.optim.Optimizer],
	optimizer_kwargs: Dict[str, Any],
	) -> None:
	self.optimizer_cls = optimizer_cls
	self.optimizer_kwargs = optimizer_kwargs

	self.optimizers = []
	self.model_parts = model_parts

	for model in self.model_parts:
	optimizer = optimizer_cls(model.parameters(), **optimizer_kwargs)
	self.optimizers.append(optimizer)

	def step(self) -> None:
	for optimizer in self.optimizers:
	optimizer.step()

	def zero_grad(self) -> None:
	for optimizer in self.optimizers:
	optimizer.zero_grad()

	def state_dict(self) -> Dict[str, Any]:
	func = functools.partial(
	get_optimizer_state_dict,
	options=StateDictOptions(flatten_optimizer_state_dict=True),
	)
	return {k: v for sd in map(func, self.model_parts, self.optimizers) for k, v in sd.items()}

	def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
	func = functools.partial(
	set_optimizer_state_dict,
	optim_state_dict=state_dict,
	options=StateDictOptions(flatten_optimizer_state_dict=True),
	)
	list(map(func, self.model_parts, self.optimizers))


	class SchedulerWrapper:
	    r"""
	    Scheduler wrapper that creates one `torch.optim.lr_scheduler.LambdaLR` per optimizer and steps them together.

	    Args:
	        optimizers (`List[torch.optim.Optimizer]`):
	            The optimizers to schedule; one `LambdaLR` is created for each of them, in order.
	        scheduler_lambda_fn (`Callable[[int], float]`):
	            Function mapping the current step to a learning rate multiplier. The same function is shared by
	            all schedulers.
	        last_epoch (`int`):
	            Passed to every `LambdaLR` as its `last_epoch` argument.
	    """

	    def __init__(
	        self,
	        optimizers: List[torch.optim.Optimizer],
	        scheduler_lambda_fn: Callable[[int], float],
	        last_epoch: int,
	    ) -> None:
	        self.schedulers = [
	            torch.optim.lr_scheduler.LambdaLR(optimizer, scheduler_lambda_fn, last_epoch)
	            for optimizer in optimizers
	        ]

	    def step(self) -> None:
	        """Advance every wrapped scheduler by one step."""
	        for scheduler in self.schedulers:
	            scheduler.step()

	    def get_last_lr(self) -> Dict[str, List[float]]:
	        """Return the last learning rates of each scheduler, keyed as `lr_{index}`."""
	        # TODO(aryan): look into this later. Currently calling it leads to NCCL hang?????
	        return {f"lr_{idx}": scheduler.get_last_lr() for idx, scheduler in enumerate(self.schedulers)}

	    def get_lr_scheduler_state(self) -> Dict[str, Any]:
	        """Return the scheduler objects keyed for checkpointing (`lr_scheduler` or `lr_scheduler_{index}`)."""
	        if len(self.schedulers) == 1:
	            return {"lr_scheduler": self.schedulers[0]}
	        # For now, pipeline-parallel with looped schedules does not support resharding for lr_scheduler.
	        # It should only support saving and loading a distributed checkpoint with the same number of pp ranks
	        return {f"lr_scheduler_{idx}": lr_scheduler for idx, lr_scheduler in enumerate(self.schedulers)}


	def get_optimizer(
	    parallel_backend: ParallelBackendEnum,
	    name: str,
	    model_parts: List[torch.nn.Module],
	    learning_rate: float = 1e-3,
	    beta1: float = 0.9,
	    beta2: float = 0.95,
	    beta3: float = 0.999,
	    epsilon: float = 1e-8,
	    weight_decay: float = 1e-4,
	    fused: bool = False,
	) -> Union[torch.optim.Optimizer, OptimizerWrapper]:
	    r"""
	    Build the optimizer identified by `name` for the given parallel backend.

	    Args:
	        parallel_backend (`ParallelBackendEnum`):
	            Backend the optimizer is created for. `ACCELERATE` yields a single optimizer over all trainable
	            parameters, `PTD` yields an `OptimizerWrapper` holding one optimizer per model part.
	        name (`str`):
	            Case-insensitive optimizer name. One of `adam`, `adamw`, `adam-bnb`, `adamw-bnb`, `adam-bnb-8bit`,
	            `adamw-bnb-8bit`.
	        model_parts (`List[torch.nn.Module]`):
	            The model parts whose parameters are optimized.
	        learning_rate (`float`, defaults to 1e-3):
	            Passed to the optimizer as `lr`.
	        beta1 (`float`, defaults to 0.9), beta2 (`float`, defaults to 0.95):
	            Passed to the optimizer as `betas=(beta1, beta2)`.
	        beta3 (`float`, defaults to 0.999):
	            Accepted for interface compatibility; not used by any optimizer handled here.
	        epsilon (`float`, defaults to 1e-8):
	            Passed to the optimizer as `eps`.
	        weight_decay (`float`, defaults to 1e-4):
	            Passed to the optimizer as `weight_decay`.
	        fused (`bool`, defaults to `False`):
	            Passed as `fused` to the `torch.optim` optimizers only; the bitsandbytes optimizers do not receive it.

	    Raises:
	        ImportError: If a `-bnb` optimizer is requested but bitsandbytes is not available.
	        ValueError: If `name` or `parallel_backend` is not supported.
	    """
	    name = name.lower()

	    _raise_errors_if_packages_not_available(name)

	    # Hyperparameters shared by every supported Adam variant.
	    optimizer_kwargs = {
	        "lr": learning_rate,
	        "betas": (beta1, beta2),
	        "eps": epsilon,
	        "weight_decay": weight_decay,
	    }

	    if name == "adam":
	        optimizer_cls = torch.optim.Adam
	        optimizer_kwargs["fused"] = fused
	    elif name == "adamw":
	        optimizer_cls = torch.optim.AdamW
	        optimizer_kwargs["fused"] = fused
	    elif name == "adam-bnb":
	        # bitsandbytes is imported lazily so it stays an optional dependency.
	        from bitsandbytes.optim import Adam

	        optimizer_cls = Adam
	    elif name == "adamw-bnb":
	        from bitsandbytes.optim import AdamW

	        optimizer_cls = AdamW
	    elif name == "adam-bnb-8bit":
	        from bitsandbytes.optim import Adam8bit

	        optimizer_cls = Adam8bit
	    elif name == "adamw-bnb-8bit":
	        from bitsandbytes.optim import AdamW8bit

	        optimizer_cls = AdamW8bit

	    # TODO(aryan): handle bitsandbytes and torchao
	    else:
	        raise ValueError(f"Unsupported optimizer: {name}")

	    if parallel_backend == ParallelBackendEnum.ACCELERATE:
	        return get_optimizer_accelerate(model_parts, optimizer_cls, optimizer_kwargs)
	    elif parallel_backend == ParallelBackendEnum.PTD:
	        return get_optimizer_ptd(model_parts, optimizer_cls, optimizer_kwargs)

	    # Fail loudly instead of silently returning None for a backend this function does not handle.
	    raise ValueError(f"Unsupported parallel backend: {parallel_backend}")


	def get_optimizer_accelerate(
	    model_parts: List[torch.nn.Module], optimizer_cls: Type[torch.optim.Optimizer], optimizer_kwargs: Dict[str, Any]
	) -> torch.optim.Optimizer:
	    r"""
	    Create a single optimizer over the trainable parameters of all model parts.

	    Args:
	        model_parts (`List[torch.nn.Module]`):
	            The model parts whose parameters are collected, in order.
	        optimizer_cls (`Type[torch.optim.Optimizer]`):
	            The optimizer class to instantiate.
	        optimizer_kwargs (`Dict[str, Any]`):
	            Keyword arguments passed to the `optimizer_cls` constructor.
	    """
	    trainable = []
	    for module in model_parts:
	        # Parameters with `requires_grad=False` are left out of the optimizer.
	        trainable.extend(weight for weight in module.parameters() if weight.requires_grad)
	    return optimizer_cls(trainable, **optimizer_kwargs)


	def get_optimizer_ptd(
	    model_parts: List[torch.nn.Module], optimizer_cls: Type[torch.optim.Optimizer], optimizer_kwargs: Dict[str, Any]
	) -> OptimizerWrapper:
	    r"""
	    Create an `OptimizerWrapper` for the given model parts, optimizer class and optimizer keyword arguments.
	    """
	    wrapper = OptimizerWrapper(
	        model_parts=model_parts,
	        optimizer_cls=optimizer_cls,
	        optimizer_kwargs=optimizer_kwargs,
	    )
	    return wrapper


	def get_lr_scheduler(
	    parallel_backend: ParallelBackendEnum,
	    name: str,
	    optimizer: Union[torch.optim.Optimizer, OptimizerWrapper],
	    step_rules: Optional[str] = None,
	    num_warmup_steps: Optional[int] = None,
	    num_training_steps: Optional[int] = None,
	    num_cycles: int = 1,
	    power: float = 1.0,
	    lr_init: float = 1e-3,
	    lr_end: float = 1e-7,
	    last_epoch: int = -1,
	) -> Union[torch.optim.lr_scheduler.LambdaLR, SchedulerWrapper]:
	    r"""
	    Build the learning rate scheduler identified by `name` for the given parallel backend.

	    Args:
	        parallel_backend (`ParallelBackendEnum`):
	            Backend the scheduler is created for. `ACCELERATE` yields a single `LambdaLR`, `PTD` yields a
	            `SchedulerWrapper` over the optimizers held by the `OptimizerWrapper`.
	        name (`str`):
	            Case-insensitive schedule name. One of `constant`, `constant_with_warmup`, `piecewise_constant`,
	            `linear`, `cosine`, `cosine_with_restarts`, `polynomial`.
	        optimizer (`torch.optim.Optimizer` or `OptimizerWrapper`):
	            The optimizer (or optimizer wrapper) to schedule.
	        step_rules (`str`, optional):
	            Rules string, only used by `piecewise_constant`.
	        num_warmup_steps (`int`, optional):
	            Number of warmup steps, used by every schedule except `constant` and `piecewise_constant`.
	        num_training_steps (`int`, optional):
	            Total number of training steps, used by `linear`, `cosine`, `cosine_with_restarts` and `polynomial`.
	        num_cycles (`int`, defaults to 1):
	            Forwarded to the `cosine` and `cosine_with_restarts` schedules.
	        power (`float`, defaults to 1.0):
	            Power factor, only used by `polynomial`.
	        lr_init (`float`, defaults to 1e-3), lr_end (`float`, defaults to 1e-7):
	            Initial and final learning rates, only used by `polynomial`.
	        last_epoch (`int`, defaults to -1):
	            Passed to the underlying `LambdaLR` scheduler(s).

	    Raises:
	        ValueError: If `name` or `parallel_backend` is not supported.
	    """
	    name = name.lower()
	    if name == "constant":
	        scheduler_lambda_fn = get_constant_schedule()
	    elif name == "constant_with_warmup":
	        scheduler_lambda_fn = get_constant_schedule_with_warmup(num_warmup_steps)
	    elif name == "piecewise_constant":
	        scheduler_lambda_fn = get_piecewise_constant_schedule(step_rules)
	    elif name == "linear":
	        scheduler_lambda_fn = get_linear_schedule_with_warmup(num_warmup_steps, num_training_steps)
	    elif name == "cosine":
	        scheduler_lambda_fn = get_cosine_schedule_with_warmup(num_warmup_steps, num_training_steps, num_cycles)
	    elif name == "cosine_with_restarts":
	        scheduler_lambda_fn = get_cosine_with_hard_restarts_schedule_with_warmup(
	            num_warmup_steps, num_training_steps, num_cycles
	        )
	    elif name == "polynomial":
	        scheduler_lambda_fn = get_polynomial_decay_schedule_with_warmup(
	            num_warmup_steps, num_training_steps, lr_init, lr_end, power
	        )
	    else:
	        raise ValueError(f"Unsupported scheduler: {name}")

	    if parallel_backend == ParallelBackendEnum.ACCELERATE:
	        return get_lr_scheduler_accelerate(optimizer, scheduler_lambda_fn, last_epoch)
	    elif parallel_backend == ParallelBackendEnum.PTD:
	        return get_lr_scheduler_ptd(optimizer, scheduler_lambda_fn, last_epoch)

	    # Fail loudly instead of silently returning None for a backend this function does not handle.
	    raise ValueError(f"Unsupported parallel backend: {parallel_backend}")


	def get_lr_scheduler_accelerate(
	    optimizer: torch.optim.Optimizer,
	    scheduler_lambda_fn: Callable[[int], float],
	    last_epoch: int = -1,
	) -> torch.optim.lr_scheduler.LambdaLR:
	    r"""
	    Create a `LambdaLR` scheduler for a single optimizer.

	    Args:
	        optimizer (`torch.optim.Optimizer`):
	            The optimizer to schedule.
	        scheduler_lambda_fn (`Callable[[int], float]`):
	            Function mapping the current step to a learning rate multiplier.
	        last_epoch (`int`, defaults to -1):
	            Passed to `LambdaLR` as its `last_epoch` argument.
	    """
	    return torch.optim.lr_scheduler.LambdaLR(optimizer, scheduler_lambda_fn, last_epoch)


	def get_lr_scheduler_ptd(
	    optimizer: OptimizerWrapper, scheduler_lambda_fn: Callable[[int], float], last_epoch: int = -1
	) -> SchedulerWrapper:
	    r"""
	    Create a `SchedulerWrapper` over the optimizers held by an `OptimizerWrapper`.

	    Args:
	        optimizer (`OptimizerWrapper`):
	            The wrapper whose `optimizers` are scheduled.
	        scheduler_lambda_fn (`Callable[[int], float]`):
	            Function mapping the current step to a learning rate multiplier.
	        last_epoch (`int`, defaults to -1):
	            Passed on to the `SchedulerWrapper`.
	    """
	    return SchedulerWrapper(optimizer.optimizers, scheduler_lambda_fn, last_epoch)


	# ==============================
	# Adapted from https://github.com/huggingface/diffusers/blob/196aef5a6f76e1ad6ba889184860c3633d166910/src/diffusers/optimization.py
	# ==============================


	def get_constant_schedule() -> Callable[[int], float]:
	    r"""
	    Create a schedule that keeps the learning rate at the value set in the optimizer.

	    Returns:
	        A function that maps any step to the multiplier `1.0`.
	    """

	    def unit_multiplier(current_step: int):
	        # The step is ignored: the multiplier never changes.
	        return 1.0

	    return unit_multiplier


	def get_constant_schedule_with_warmup(num_warmup_steps: int) -> Callable[[int], float]:
	    r"""
	    Create a schedule that ramps the learning rate linearly from 0 to the initial lr set in the optimizer during
	    warmup, then holds it constant.

	    Args:
	        num_warmup_steps (`int`):
	            The number of steps for the warmup phase.
	    """

	    def lr_lambda(current_step: int):
	        warmup_done = not (current_step < num_warmup_steps)
	        if warmup_done:
	            return 1.0
	        # Still warming up: fraction of the warmup completed so far.
	        warmup_length = float(max(1.0, num_warmup_steps))
	        return float(current_step) / warmup_length

	    return lr_lambda


	def get_piecewise_constant_schedule(step_rules: str) -> Callable[[int], float]:
	    r"""
	    Create a schedule whose learning rate multiplier is piecewise constant over ranges of steps.

	    Args:
	        step_rules (`string`):
	            Comma-separated `multiplier:step` rules followed by one final bare multiplier, e.g.
	            `step_rules="1:10,0.1:20,0.01:30,0.005"`. Each `step` is an absolute, exclusive upper bound: the
	            multiplier is 1 while the step is below 10, 0.1 while it is below 20, 0.01 while it is below 30,
	            and 0.005 for every later step. Rules may be given in any order.

	    Raises:
	        ValueError: If a rule cannot be parsed as `float:int` or the final multiplier is not a float.
	    """

	    *bounded_rules, final_rule = step_rules.split(",")

	    rules_dict = {}
	    for rule_str in bounded_rules:
	        value_str, steps_str = rule_str.split(":")
	        steps = int(steps_str)
	        value = float(value_str)
	        # A repeated step bound keeps the multiplier of its last occurrence.
	        rules_dict[steps] = value
	    last_lr_multiple = float(final_rule)

	    # Sort the bounds once here instead of on every scheduler step.
	    sorted_rules = sorted(rules_dict.items())

	    def rule_func(steps: int) -> float:
	        for upper_bound, multiplier in sorted_rules:
	            if steps < upper_bound:
	                return multiplier
	        return last_lr_multiple

	    return rule_func


	def get_linear_schedule_with_warmup(num_warmup_steps: int, num_training_steps: int) -> Callable[[int], float]:
	    r"""
	    Create a schedule that ramps the learning rate linearly from 0 to the initial lr set in the optimizer during
	    warmup, then decreases it linearly to 0 over the remaining training steps.

	    Args:
	        num_warmup_steps (`int`):
	            The number of steps for the warmup phase.
	        num_training_steps (`int`):
	            The total number of training steps.
	    """

	    def lr_lambda(current_step: int):
	        in_warmup = current_step < num_warmup_steps
	        if in_warmup:
	            return float(current_step) / float(max(1, num_warmup_steps))
	        steps_left = float(num_training_steps - current_step)
	        decay_length = float(max(1, num_training_steps - num_warmup_steps))
	        # Clamp at 0 once training runs past `num_training_steps`.
	        return max(0.0, steps_left / decay_length)

	    return lr_lambda


	def get_cosine_schedule_with_warmup(
	    num_warmup_steps: int,
	    num_training_steps: int,
	    num_cycles: float = 0.5,
	) -> Callable[[int], float]:
	    r"""
	    Create a schedule that ramps the learning rate linearly from 0 to the initial lr set in the optimizer during
	    warmup, then makes it follow the values of the cosine function for the remaining training steps.

	    Args:
	        num_warmup_steps (`int`):
	            The number of steps for the warmup phase.
	        num_training_steps (`int`):
	            The total number of training steps.
	        num_cycles (`float`, optional, defaults to 0.5):
	            The number of periods of the cosine function in a schedule (the default is to just decrease from the
	            max value to 0 following a half-cosine).
	    """

	    def lr_lambda(current_step):
	        if current_step < num_warmup_steps:
	            return float(current_step) / float(max(1, num_warmup_steps))
	        decay_length = float(max(1, num_training_steps - num_warmup_steps))
	        progress = float(current_step - num_warmup_steps) / decay_length
	        # Phase of the cosine wave reached at this point of the decay.
	        angle = math.pi * float(num_cycles) * 2.0 * progress
	        return max(0.0, 0.5 * (1.0 + math.cos(angle)))

	    return lr_lambda


	def get_cosine_with_hard_restarts_schedule_with_warmup(
	    num_warmup_steps: int,
	    num_training_steps: int,
	    num_cycles: int = 1,
	) -> Callable[[int], float]:
	    r"""
	    Create a schedule that ramps the learning rate linearly from 0 to the initial lr set in the optimizer during
	    warmup, then repeatedly decays it from the initial lr to 0 along a cosine curve, jumping back to the initial lr
	    at every hard restart.

	    Args:
	        num_warmup_steps (`int`):
	            The number of steps for the warmup phase.
	        num_training_steps (`int`):
	            The total number of training steps.
	        num_cycles (`int`, optional, defaults to 1):
	            The number of hard restarts to use.
	    """

	    def lr_lambda(current_step):
	        if current_step < num_warmup_steps:
	            return float(current_step) / float(max(1, num_warmup_steps))
	        decay_length = float(max(1, num_training_steps - num_warmup_steps))
	        progress = float(current_step - num_warmup_steps) / decay_length
	        if progress >= 1.0:
	            # Training budget exhausted: stay at zero.
	            return 0.0
	        # Position inside the current cycle, in [0, 1); wrapping to 0 is the hard restart.
	        cycle_position = (float(num_cycles) * progress) % 1.0
	        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * cycle_position)))

	    return lr_lambda


	def get_polynomial_decay_schedule_with_warmup(
	    num_warmup_steps: int,
	    num_training_steps: int,
	    lr_init: float,
	    lr_end: float = 1e-7,
	    power: float = 1.0,
	) -> Callable[[int], float]:
	    r"""
	    Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
	    optimizer to end lr defined by lr_end, after a warmup period during which it increases linearly from 0 to the
	    initial lr set in the optimizer.

	    Args:
	        num_warmup_steps (`int`):
	            The number of steps for the warmup phase.
	        num_training_steps (`int`):
	            The total number of training steps.
	        lr_init (`float`):
	            The initial LR. The returned multipliers are expressed relative to this value.
	        lr_end (`float`, optional, defaults to 1e-7):
	            The end LR.
	        power (`float`, optional, defaults to 1.0):
	            Power factor.

	    Raises:
	        ValueError: If `lr_init` is not strictly greater than `lr_end`.

	    Note: power defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT implementation at
	    https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
	    """

	    if not (lr_init > lr_end):
	        raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")

	    def lr_lambda(current_step: int):
	        if current_step < num_warmup_steps:
	            return float(current_step) / float(max(1, num_warmup_steps))

	        decay_steps = num_training_steps - num_warmup_steps
	        # With no decay phase (decay_steps <= 0) there is nothing to interpolate; going straight to the end LR
	        # avoids a division by zero when current_step == num_training_steps == num_warmup_steps.
	        if current_step > num_training_steps or decay_steps <= 0:
	            return lr_end / lr_init  # as LambdaLR multiplies by lr_init

	        lr_range = lr_init - lr_end
	        pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
	        decay = lr_range * pct_remaining**power + lr_end
	        return decay / lr_init  # as LambdaLR multiplies by lr_init

	    return lr_lambda


	def _raise_errors_if_packages_not_available(name: str) -> None:
	    r"""
	    Raise an `ImportError` if the optimizer `name` needs bitsandbytes and it is not available.

	    The package is identified by the second `-`-separated component of `name` (e.g. `bnb` in `adamw-bnb-8bit`).
	    Names without such a component, or with a component other than `bnb`, pass without any check.
	    """
	    components = name.split("-")
	    needs_bnb = len(components) >= 2 and components[1] == "bnb"
	    if needs_bnb and not is_bitsandbytes_available():
	        raise ImportError(
	            f"Please install bitsandbytes by running `pip install bitsandbytes` to use the {name} optimizer."
	        )