roll-ai's picture
Upload 381 files
b6af722 verified
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
from torch import Tensor
from cosmos_predict1.diffusion.training.callbacks.every_n import EveryN
from cosmos_predict1.utils import log
from cosmos_predict1.utils.distributed import rank0_only
from cosmos_predict1.utils.model import Model
from cosmos_predict1.utils.trainer import Trainer
class IterSpeed(EveryN):
"""
Args:
hit_thres (int): Number of iterations to wait before logging.
"""
def __init__(self, *args, hit_thres: int = 5, **kwargs):
super().__init__(*args, **kwargs)
self.time = None
self.hit_counter = 0
self.hit_thres = hit_thres
self.name = self.__class__.__name__
self.last_hit_time = time.time()
def on_training_step_end(
self,
model: Model,
data_batch: dict[str, torch.Tensor],
output_batch: dict[str, torch.Tensor],
loss: torch.Tensor,
iteration: int = 0,
) -> None:
if self.hit_counter < self.hit_thres:
log.info(
f"Iteration {iteration}: "
f"Hit counter: {self.hit_counter + 1}/{self.hit_thres} | "
f"Loss: {loss.item():.4f} | "
f"Time: {time.time() - self.last_hit_time:.2f}s"
)
self.hit_counter += 1
self.last_hit_time = time.time()
#! useful for large scale training and avoid oom crash in the first two iterations!!!
torch.cuda.synchronize()
return
super().on_training_step_end(model, data_batch, output_batch, loss, iteration)
@rank0_only
def every_n_impl(
self,
trainer: Trainer,
model: Model,
data_batch: dict[str, Tensor],
output_batch: dict[str, Tensor],
loss: Tensor,
iteration: int,
) -> None:
if self.time is None:
self.time = time.time()
return
cur_time = time.time()
iter_speed = (cur_time - self.time) / self.every_n / self.step_size
log.info(f"{iteration} : iter_speed {iter_speed:.2f} seconds per iteration | Loss: {loss.item():.4f}")
self.time = cur_time