Spaces:

roll-ai
/

GENC3-docker

Build error

App Files Files Community

GENC3-docker / cosmos_predict1 /diffusion /training /callbacks /iter_speed.py

roll-ai

Upload 381 files

b6af722 verified 12 days ago

raw

history blame contribute delete

2.83 kB

	# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import time

	import torch
	from torch import Tensor

	from cosmos_predict1.diffusion.training.callbacks.every_n import EveryN
	from cosmos_predict1.utils import log
	from cosmos_predict1.utils.distributed import rank0_only
	from cosmos_predict1.utils.model import Model
	from cosmos_predict1.utils.trainer import Trainer


	class IterSpeed(EveryN):
	    """Callback that logs training loss and iteration timing.

	    The first ``hit_thres`` training steps are logged one by one (loss and
	    wall-clock time since the previously logged step). After that, step-end
	    handling is delegated to ``EveryN`` and ``every_n_impl`` reports the
	    elapsed time since its previous call divided by ``every_n`` and
	    ``step_size``.

	    Args:
	        args: Forwarded as the first positional argument to ``EveryN.__init__``.
	        hit_thres (int): Number of initial training steps that are logged
	            individually before periodic reporting takes over.
	        *kwargs: Extra positional arguments forwarded to ``EveryN.__init__``.
	    """

	    def __init__(self, args, hit_thres: int = 5, *kwargs):
	        # NOTE(review): despite its name, ``kwargs`` collects extra *positional*
	        # arguments here. If a ``*args, **kwargs`` pass-through was intended,
	        # confirm against EveryN and the callers before changing the signature.
	        super().__init__(args, *kwargs)
	        # Timestamp of the previous ``every_n_impl`` report; None until the first call.
	        self.time = None
	        # Number of warm-up steps logged so far.
	        self.hit_counter = 0
	        self.hit_thres = hit_thres
	        self.name = self.__class__.__name__
	        # Timestamp of the most recent warm-up log (initialised at construction).
	        self.last_hit_time = time.time()

	    def on_training_step_end(
	        self,
	        model: Model,
	        data_batch: dict[str, torch.Tensor],
	        output_batch: dict[str, torch.Tensor],
	        loss: torch.Tensor,
	        iteration: int = 0,
	    ) -> None:
	        """Log the first ``hit_thres`` steps individually, then defer to ``EveryN``.

	        Args:
	            model: Passed through to the parent hook.
	            data_batch: Passed through to the parent hook.
	            output_batch: Passed through to the parent hook.
	            loss: Loss tensor; ``loss.item()`` is logged during warm-up.
	            iteration: Current iteration number, used in the log line.
	        """
	        if self.hit_counter < self.hit_thres:
	            log.info(
	                f"Iteration {iteration}: "
	                f"Hit counter: {self.hit_counter + 1}/{self.hit_thres} | "
	                f"Loss: {loss.item():.4f} | "
	                f"Time: {time.time() - self.last_hit_time:.2f}s"
	            )
	            self.hit_counter += 1
	            self.last_hit_time = time.time()
	            # Original author's note: useful for large-scale training and avoids
	            # an OOM crash in the first two iterations.
	            # Guarded so that CPU-only runs do not fail on the CUDA call.
	            if torch.cuda.is_available():
	                torch.cuda.synchronize()
	            # The parent hook is intentionally skipped during warm-up.
	            return
	        super().on_training_step_end(model, data_batch, output_batch, loss, iteration)

	    @rank0_only
	    def every_n_impl(
	        self,
	        trainer: Trainer,
	        model: Model,
	        data_batch: dict[str, Tensor],
	        output_batch: dict[str, Tensor],
	        loss: Tensor,
	        iteration: int,
	    ) -> None:
	        """Log the time per iteration since the previous call, plus the loss.

	        The first call only records a start timestamp and logs nothing.

	        NOTE(review): ``rank0_only`` presumably restricts this to rank 0, and
	        ``self.every_n`` / ``self.step_size`` are presumably set by ``EveryN``;
	        neither is visible in this file -- confirm.

	        Args:
	            trainer: Unused here.
	            model: Unused here.
	            data_batch: Unused here.
	            output_batch: Unused here.
	            loss: Loss tensor; ``loss.item()`` is included in the log line.
	            iteration: Current iteration number, used in the log line.
	        """
	        if self.time is None:
	            # No earlier timestamp to measure against yet.
	            self.time = time.time()
	            return
	        cur_time = time.time()
	        iter_speed = (cur_time - self.time) / self.every_n / self.step_size

	        log.info(
	            f"{iteration} : iter_speed {iter_speed:.2f} seconds per iteration | "
	            f"Loss: {loss.item():.4f}"
	        )

	        self.time = cur_time