import copy
import os
import numpy as np
import pytorch_lightning as pl
import torch
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from torch.cuda import amp
from torch.optim.optimizer import Optimizer
from torch.utils.data.dataset import TensorDataset
from visualizr import logger
from visualizr.choices import OptimizerType, TrainMode
from visualizr.config import TrainConfig
from visualizr.dist_utils import get_world_size
from visualizr.model.seq2seq import DiffusionPredictor
from visualizr.renderer import render_condition
class LitModel(pl.LightningModule):
def __init__(self, conf: TrainConfig):
super().__init__()
assert conf.train_mode != TrainMode.manipulate
if conf.seed is not None:
pl.seed_everything(conf.seed)
self.save_hyperparameters(conf.as_dict_jsonable())
self.conf = conf
self.model = DiffusionPredictor(conf)
self.ema_model = copy.deepcopy(self.model)
self.ema_model.requires_grad_(False)
self.ema_model.eval()
self.sampler = conf.make_diffusion_conf().make_sampler()
self.eval_sampler = conf.make_eval_diffusion_conf().make_sampler()
# this is shared for both model and latent
self.T_sampler = conf.make_T_sampler()
if conf.train_mode.use_latent_net():
self.latent_sampler = conf.make_latent_diffusion_conf().make_sampler()
self.eval_latent_sampler = (
conf.make_latent_eval_diffusion_conf().make_sampler()
)
else:
self.latent_sampler = None
self.eval_latent_sampler = None
        # initial variables for consistent sampling
        self.register_buffer(
            "x_T", torch.randn(conf.sample_size, 3, conf.img_size, conf.img_size)
        )
        # latent statistics used only when the train mode infers a latent
        # dataset (see train_dataloader); initialized here so the lazy checks
        # there do not raise. The placeholder shapes are an assumption: they
        # are overwritten with (1, c) tensors on first use.
        self.conds = None
        self.conds_mean = torch.zeros(1, 1)
        self.conds_std = torch.ones(1, 1)
def render(
self,
start,
motion_direction_start,
audio_driven,
face_location,
face_scale,
ypr_info,
noisyT,
step_T,
control_flag,
):
if step_T is None:
sampler = self.eval_sampler
else:
sampler = self.conf._make_diffusion_conf(step_T).make_sampler()
pred_img = render_condition(
self.conf,
self.ema_model,
sampler,
start,
motion_direction_start,
audio_driven,
face_location,
face_scale,
ypr_info,
noisyT,
control_flag,
)
return pred_img
    def forward(self, noise=None, x_start=None, ema_model: bool = False):
        with amp.autocast(False):
            # use the EMA weights when requested, the raw model otherwise
            model = self.ema_model if ema_model else self.model
            gen = self.eval_sampler.sample(model=model, noise=noise, x_start=x_start)
        return gen
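    # Example for forward() (sketch): unconditional sampling with the EMA
    # weights; shapes mirror the x_T buffer from __init__, and `lit` is a
    # hypothetical constructed LitModel:
    #   noise = torch.randn(4, 3, lit.conf.img_size, lit.conf.img_size)
    #   imgs = lit(noise=noise, ema_model=True)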
def setup(self, stage=None) -> None:
"""
make datasets & seeding each worker separately
"""
##############################################
# NEED TO SET THE SEED SEPARATELY HERE
if self.conf.seed is not None:
seed = self.conf.seed * get_world_size() + self.global_rank
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
logger.info("local seed:", seed)
##############################################
        self.train_data = self.conf.make_dataset()
        logger.info(f"train data: {len(self.train_data)}")
        self.val_data = self.train_data
        logger.info(f"val data: {len(self.val_data)}")
def _train_dataloader(self, drop_last=True):
"""
really make the dataloader
"""
# make sure to use the fraction of batch size
# the batch size is global!
conf = self.conf.clone()
conf.batch_size = self.batch_size
dataloader = conf.make_loader(
self.train_data, shuffle=True, drop_last=drop_last
)
return dataloader
def train_dataloader(self):
"""
return the dataloader, if diffusion mode => return image dataset
if latent mode => return the inferred latent dataset
"""
logger.info("on train dataloader start ...")
if self.conf.train_mode.require_dataset_infer():
if self.conds is None:
# usually we load self.conds from a file
# so we do not need to do this again!
self.conds = self.infer_whole_dataset()
                # need to use float32! otherwise the mean & std will be off!
                # (1, c)
                self.conds_mean.data = self.conds.float().mean(dim=0, keepdim=True)
                self.conds_std.data = self.conds.float().std(dim=0, keepdim=True)
            logger.info(
                f"mean: {self.conds_mean.mean()}, std: {self.conds_std.mean()}"
            )
# return the dataset with pre-calculated conds
conf = self.conf.clone()
conf.batch_size = self.batch_size
data = TensorDataset(self.conds)
return conf.make_loader(data, shuffle=True)
else:
return self._train_dataloader()
@property
def batch_size(self):
"""
local batch size for each worker
"""
ws = get_world_size()
assert self.conf.batch_size % ws == 0
return self.conf.batch_size // ws
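    # Example for batch_size: conf.batch_size=64 with a world size of 4
    # yields 16 samples per rank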
@property
def num_samples(self):
"""
(global) batch size * iterations
"""
# batch size here is global!
# global_step already takes into account the accum batches
return self.global_step * self.conf.batch_size_effective
def is_last_accum(self, batch_idx):
"""
is it the last gradient accumulation loop?
used with gradient_accum > 1 and to see if the optimizer will perform "step" in this iteration or not
"""
return (batch_idx + 1) % self.conf.accum_batches == 0
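    # Example for is_last_accum(): with accum_batches=4 it is True at
    # batch_idx 3, 7, 11, ..., the iterations where optimizer.step() runs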
def training_step(self, batch, batch_idx):
"""
given an input, calculate the loss function
no optimization at this stage.
"""
with amp.autocast(False):
motion_start = batch["motion_start"] # torch.Size([B, 512])
motion_direction = batch["motion_direction"] # torch.Size([B, 125, 20])
audio_feats = batch["audio_feats"].float() # torch.Size([B, 25, 250, 1024])
face_location = batch["face_location"].float() # torch.Size([B, 125])
face_scale = batch["face_scale"].float() # torch.Size([B, 125, 1])
yaw_pitch_roll = batch["yaw_pitch_roll"].float() # torch.Size([B, 125, 3])
motion_direction_start = batch[
"motion_direction_start"
].float() # torch.Size([B, 20])
if self.conf.train_mode == TrainMode.diffusion:
"""
main training mode!!!
"""
                # with a shared numpy seed the sampled t's would be correlated across workers!
t, weight = self.T_sampler.sample(
len(motion_start), motion_start.device
)
losses = self.sampler.training_losses(
model=self.model,
motion_direction_start=motion_direction_start,
motion_target=motion_direction,
motion_start=motion_start,
audio_feats=audio_feats,
face_location=face_location,
face_scale=face_scale,
yaw_pitch_roll=yaw_pitch_roll,
t=t,
)
else:
raise NotImplementedError()
loss = losses["loss"].mean()
            # gather the per-key losses across all ranks so the logged values
            # are averaged over the whole (global) batch
            for key in losses.keys():
                losses[key] = self.all_gather(losses[key]).mean()
if self.global_rank == 0:
self.logger.experiment.add_scalar(
"loss", losses["loss"], self.num_samples
)
for key in losses:
self.logger.experiment.add_scalar(
f"loss/{key}", losses[key], self.num_samples
)
return {"loss": loss}
def on_train_batch_end(
self, outputs, batch, batch_idx: int, dataloader_idx: int
) -> None:
"""
after each training step ...
"""
if self.is_last_accum(batch_idx):
if self.conf.train_mode == TrainMode.latent_diffusion:
                # only the latent net is trained, so update only its EMA copy
ema(
self.model.latent_net,
self.ema_model.latent_net,
self.conf.ema_decay,
)
else:
ema(self.model, self.ema_model, self.conf.ema_decay)
def on_before_optimizer_step(
self, optimizer: Optimizer, optimizer_idx: int
) -> None:
        # fix the fp16 + clip grad norm problem with pytorch lightning:
        # clip manually here instead of passing gradient_clip_val to Trainer
        if self.conf.grad_clip > 0:
params = [p for group in optimizer.param_groups for p in group["params"]]
torch.nn.utils.clip_grad_norm_(params, max_norm=self.conf.grad_clip)
def configure_optimizers(self):
out = {}
if self.conf.optimizer == OptimizerType.adam:
optim = torch.optim.Adam(
self.model.parameters(),
lr=self.conf.lr,
weight_decay=self.conf.weight_decay,
)
elif self.conf.optimizer == OptimizerType.adamw:
optim = torch.optim.AdamW(
self.model.parameters(),
lr=self.conf.lr,
weight_decay=self.conf.weight_decay,
)
else:
raise NotImplementedError()
out["optimizer"] = optim
if self.conf.warmup > 0:
sched = torch.optim.lr_scheduler.LambdaLR(
optim, lr_lambda=WarmupLR(self.conf.warmup)
)
out["lr_scheduler"] = {
"scheduler": sched,
"interval": "step",
}
return out
def split_tensor(self, x):
"""
extract the tensor for a corresponding "worker" in the batch dimension
Args:
x: (n, c)
Returns: x: (n_local, c)
"""
n = len(x)
rank = self.global_rank
world_size = get_world_size()
# print(f'rank: {rank}/{world_size}')
per_rank = n // world_size
return x[rank * per_rank : (rank + 1) * per_rank]
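    # Example for split_tensor(): with world_size=4 and x of shape (32, c),
    # rank 0 gets rows 0..7, rank 1 gets rows 8..15, and so on; trailing rows
    # are dropped when n is not divisible by world_size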
def ema(source, target, decay):
    """Update `target` in place as an exponential moving average of `source`:
    target = decay * target + (1 - decay) * source, applied key-wise over the
    state dicts (parameters and buffers alike)."""
    source_dict = source.state_dict()
    target_dict = target.state_dict()
    for key in source_dict.keys():
        target_dict[key].data.copy_(
            target_dict[key].data * decay + source_dict[key].data * (1 - decay)
        )
class WarmupLR:
    """Linear warmup factor for LambdaLR: ramps from 0 to 1 over `warmup` steps."""

    def __init__(self, warmup) -> None:
        self.warmup = warmup

    def __call__(self, step):
        return min(step, self.warmup) / self.warmup
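# Example for WarmupLR: with warmup=1000 the factor is 0.0 at step 0, 0.5 at
# step 500, and 1.0 from step 1000 onward:
#   sched = torch.optim.lr_scheduler.LambdaLR(optim, lr_lambda=WarmupLR(1000))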
def is_time(num_samples, every, step_size):
    """Whether `num_samples` has just crossed a multiple of `every`, i.e. it
    did so within the last `step_size` samples."""
    closest = (num_samples // every) * every
    return num_samples - closest < step_size
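# Example for is_time(): equivalent to num_samples % every < step_size. With
# every=1000 and step_size=64 it is True for num_samples in [1000, 1064)
# but False for [1064, 2000).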
def train(conf: TrainConfig, gpus, nodes=1, mode: str = "train"):
logger.info("conf:", conf.name)
model = LitModel(conf)
    os.makedirs(conf.logdir, exist_ok=True)
    checkpoint = ModelCheckpoint(
        dirpath=conf.logdir, save_last=True, save_top_k=-1, every_n_epochs=10
    )
checkpoint_path = f"{conf.logdir}/last.ckpt"
logger.info("ckpt path:", checkpoint_path)
    if os.path.exists(checkpoint_path):
        resume = checkpoint_path
        logger.info("resuming from the last checkpoint")
    elif conf.continue_from is not None:
        # continue from a specified checkpoint
        resume = conf.continue_from.path
    else:
        resume = None
tb_logger = pl_loggers.TensorBoardLogger(
save_dir=conf.logdir, name=None, version=""
)
plugins = []
if len(gpus) == 1 and nodes == 1:
accelerator = None
else:
accelerator = "ddp"
from pytorch_lightning.plugins import DDPPlugin
# important for working with gradient checkpoint
plugins.append(DDPPlugin(find_unused_parameters=True))
trainer = pl.Trainer(
max_steps=conf.total_samples // conf.batch_size_effective,
resume_from_checkpoint=resume,
gpus=gpus,
num_nodes=nodes,
accelerator=accelerator,
precision=16 if conf.fp16 else 32,
callbacks=[
checkpoint,
LearningRateMonitor(),
],
# clip in the model instead
# gradient_clip_val=conf.grad_clip,
replace_sampler_ddp=True,
logger=tb_logger,
accumulate_grad_batches=conf.accum_batches,
plugins=plugins,
)
trainer.fit(model)
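

# Example usage (sketch): a real entry point would build a TrainConfig from
# the project's preset/config factory; the bare construction below is
# hypothetical and only illustrates the call shape of train().
#
# if __name__ == "__main__":
#     conf = TrainConfig()  # hypothetical: fill from a project preset
#     train(conf, gpus=[0], nodes=1)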