kairunwen's picture
Update Code
57746f1
"""
Trainer
Author: Xiaoyang Wu ([email protected])
Please cite our work if the code is helpful to you.
"""
import os
import sys
import weakref
import torch
import torch.nn as nn
import torch.utils.data
from functools import partial
if sys.version_info >= (3, 10):
from collections.abc import Iterator
else:
from collections import Iterator
from tensorboardX import SummaryWriter
from .defaults import create_ddp_model, worker_init_fn
from .hooks import HookBase, build_hooks
import pointcept.utils.comm as comm
from pointcept.datasets import build_dataset, point_collate_fn, collate_fn
from pointcept.models import build_model
from pointcept.utils.logger import get_root_logger
from pointcept.utils.optimizer import build_optimizer
from pointcept.utils.scheduler import build_scheduler
from pointcept.utils.events import EventStorage, ExceptionWriter
from pointcept.utils.registry import Registry
# Registry of trainer classes; the trainer classes in this module add
# themselves to it through the ``@TRAINERS.register_module(...)`` decorator.
TRAINERS = Registry("trainers")
class TrainerBase:
    """Skeleton of a hook-driven training loop.

    The loop in ``train`` walks epochs from ``start_epoch`` up to (but not
    including) ``max_epoch`` and, inside each epoch, consumes
    ``data_iterator``.  Registered hooks are notified before/after training,
    every epoch and every step.  Subclasses must implement ``run_step`` and
    are responsible for assigning ``writer`` (it is only declared here, yet
    ``after_train`` closes it on the main process).
    """

    def __init__(self) -> None:
        self.hooks = []
        self.epoch = 0
        self.start_epoch = 0
        self.max_epoch = 0
        self.max_iter = 0
        # Scratch dict the loop writes the current "iter" / "input_dict" into.
        self.comm_info = {}
        # Yields (index, batch) pairs; empty until a subclass replaces it.
        self.data_iterator: Iterator = enumerate([])
        # Declared only -- ``storage`` is bound by ``train``; ``writer`` is
        # never assigned in this class.
        self.storage: EventStorage
        self.writer: SummaryWriter

    def _run_hooks(self, stage: str) -> None:
        """Call the method named ``stage`` on each hook, in list order."""
        for hook in self.hooks:
            getattr(hook, stage)()

    def register_hooks(self, hooks) -> None:
        """Build ``hooks`` with ``build_hooks`` and append them to ``self.hooks``.

        Every built hook must be a ``HookBase`` instance and receives a weak
        proxy to this trainer as its ``trainer`` attribute.
        """
        built = build_hooks(hooks)
        for hook in built:
            assert isinstance(hook, HookBase)
            # Hand out a weak proxy rather than ``self``: hooks and trainer
            # must not own each other, otherwise the reference cycle can leak
            # memory when the objects involved define __del__.
            # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
            hook.trainer = weakref.proxy(self)
        self.hooks.extend(built)

    def train(self):
        """Run the full loop: train -> epochs -> steps, firing hooks around each."""
        with EventStorage() as self.storage:
            self.before_train()
            for epoch in range(self.start_epoch, self.max_epoch):
                self.epoch = epoch
                self.before_epoch()
                for step, batch in self.data_iterator:
                    # Publish the current step to hooks and ``run_step``.
                    self.comm_info["iter"] = step
                    self.comm_info["input_dict"] = batch
                    self.before_step()
                    self.run_step()
                    self.after_step()
                self.after_epoch()
            self.after_train()

    def before_train(self):
        """Notify hooks that training is about to start."""
        self._run_hooks("before_train")

    def before_epoch(self):
        """Notify hooks that an epoch is about to start."""
        self._run_hooks("before_epoch")

    def before_step(self):
        """Notify hooks that a step is about to run."""
        self._run_hooks("before_step")

    def run_step(self):
        """Execute one training step; must be provided by subclasses."""
        raise NotImplementedError

    def after_step(self):
        """Notify hooks that a step has finished."""
        self._run_hooks("after_step")

    def after_epoch(self):
        """Notify hooks that an epoch has finished, then reset storage histories."""
        self._run_hooks("after_epoch")
        self.storage.reset_histories()

    def after_train(self):
        """Synchronize processes, notify hooks, and close the writer on the main process."""
        # Sync before the after-train hooks run.
        comm.synchronize()
        self._run_hooks("after_train")
        if comm.is_main_process():
            self.writer.close()
@TRAINERS.register_module("DefaultTrainer")
class Trainer(TrainerBase):
    """Default trainer, registered as ``"DefaultTrainer"``.

    Builds the logger, model, tensorboard writer, data loaders, optimizer,
    scheduler and (optional) AMP gradient scaler from ``cfg``, registers the
    hooks listed in ``cfg.hooks`` and runs the hook-driven loop.
    """

    def __init__(self, cfg):
        """Construct every training component from ``cfg``.

        Components are built in dependency order: the scheduler needs both
        the optimizer and the train loader, and the optimizer needs the model.
        """
        super(Trainer, self).__init__()
        self.epoch = 0
        self.start_epoch = 0
        # The loop runs for ``cfg.eval_epoch`` epochs.
        self.max_epoch = cfg.eval_epoch
        self.best_metric_value = -torch.inf
        self.logger = get_root_logger(
            log_file=os.path.join(cfg.save_path, "train.log"),
            # Append to the existing log when resuming, otherwise start fresh.
            file_mode="a" if cfg.resume else "w",
        )
        self.logger.info("=> Loading config ...")
        self.cfg = cfg
        self.logger.info(f"Save path: {cfg.save_path}")
        self.logger.info(f"Config:\n{cfg.pretty_text}")
        self.logger.info("=> Building model ...")
        self.model = self.build_model()
        self.logger.info("=> Building writer ...")
        self.writer = self.build_writer()
        self.logger.info("=> Building train dataset & dataloader ...")
        self.train_loader = self.build_train_loader()
        self.logger.info("=> Building val dataset & dataloader ...")
        self.val_loader = self.build_val_loader()
        self.logger.info("=> Building optimize, scheduler, scaler(amp) ...")
        self.optimizer = self.build_optimizer()
        self.scheduler = self.build_scheduler()
        self.scaler = self.build_scaler()
        self.logger.info("=> Building hooks ...")
        self.register_hooks(self.cfg.hooks)

    def train(self):
        """Run the training loop over ``self.train_loader``.

        Same stage sequence as ``TrainerBase.train``, but additionally wraps
        the loop in ``ExceptionWriter`` and, at the start of every epoch,
        reseeds the distributed sampler, puts the model in train mode and
        rebuilds ``data_iterator`` from the train loader.
        """
        with EventStorage() as self.storage, ExceptionWriter():
            # => before train
            self.before_train()
            self.logger.info(">>>>>>>>>>>>>>>> Start Training >>>>>>>>>>>>>>>>")
            for self.epoch in range(self.start_epoch, self.max_epoch):
                # => before epoch
                # TODO: optimize to iteration based
                if comm.get_world_size() > 1:
                    # Tell the distributed sampler which epoch this is.
                    self.train_loader.sampler.set_epoch(self.epoch)
                self.model.train()
                self.data_iterator = enumerate(self.train_loader)
                self.before_epoch()
                # => run_epoch
                for (
                    self.comm_info["iter"],
                    self.comm_info["input_dict"],
                ) in self.data_iterator:
                    # => before_step
                    self.before_step()
                    # => run_step
                    self.run_step()
                    # => after_step
                    self.after_step()
                # => after epoch
                self.after_epoch()
            # => after train
            self.after_train()

    def run_step(self):
        """Run forward, backward and optimizer/scheduler updates for one batch.

        Reads the batch from ``self.comm_info["input_dict"]`` (tensor values
        are moved to the GPU in place) and stores the model output under
        ``self.comm_info["model_output_dict"]``.  The model output must
        contain a ``"loss"`` entry.
        """
        input_dict = self.comm_info["input_dict"]
        # Only existing keys are reassigned, so mutating while iterating is safe.
        for key, value in input_dict.items():
            if isinstance(value, torch.Tensor):
                input_dict[key] = value.cuda(non_blocking=True)
        with torch.cuda.amp.autocast(enabled=self.cfg.enable_amp):
            output_dict = self.model(input_dict)
            loss = output_dict["loss"]
        self.optimizer.zero_grad()
        if self.cfg.enable_amp:
            self.scaler.scale(loss).backward()
            # Unscale first so clipping operates on the true gradient values.
            self.scaler.unscale_(self.optimizer)
            if self.cfg.clip_grad is not None:
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.cfg.clip_grad
                )
            self.scaler.step(self.optimizer)

            # With AMP the optimizer step may be skipped when the loss scale
            # is too large. Only advance the scheduler when the scale did not
            # shrink across ``update()``; this also avoids torch's
            # "scheduler step before optimizer step" warning.
            scale_before = self.scaler.get_scale()
            self.scaler.update()
            if scale_before <= self.scaler.get_scale():
                self.scheduler.step()
        else:
            loss.backward()
            if self.cfg.clip_grad is not None:
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.cfg.clip_grad
                )
            self.optimizer.step()
            self.scheduler.step()
        if self.cfg.empty_cache:
            torch.cuda.empty_cache()
        self.comm_info["model_output_dict"] = output_dict

    def after_epoch(self):
        """Fire ``after_epoch`` hooks, reset storage histories, optionally free CUDA cache."""
        for h in self.hooks:
            h.after_epoch()
        self.storage.reset_histories()
        if self.cfg.empty_cache_per_epoch:
            torch.cuda.empty_cache()

    def build_model(self):
        """Build the model from ``cfg.model``, move it to the GPU and wrap it
        with ``create_ddp_model``.

        BatchNorm layers are converted to ``SyncBatchNorm`` when
        ``cfg.sync_bn`` is set.  The number of trainable parameters is logged.
        """
        model = build_model(self.cfg.model)
        if self.cfg.sync_bn:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
        self.logger.info(f"Num params: {n_parameters}")
        model = create_ddp_model(
            model.cuda(),
            broadcast_buffers=False,
            find_unused_parameters=self.cfg.find_unused_parameters,
        )
        return model

    def build_writer(self):
        """Return a ``SummaryWriter`` on the main process and ``None`` elsewhere."""
        writer = SummaryWriter(self.cfg.save_path) if comm.is_main_process() else None
        self.logger.info(f"Tensorboard writer logging dir: {self.cfg.save_path}")
        return writer

    def build_train_loader(self):
        """Build the training ``DataLoader`` from ``cfg.data.train``.

        Uses a ``DistributedSampler`` when more than one process is running
        and plain shuffling otherwise.  Workers are seeded through
        ``worker_init_fn`` only when ``cfg.seed`` is set.
        """
        train_data = build_dataset(self.cfg.data.train)
        num_workers = self.cfg.num_worker_per_gpu

        if comm.get_world_size() > 1:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
        else:
            train_sampler = None

        init_fn = (
            partial(
                worker_init_fn,
                num_workers=num_workers,
                rank=comm.get_rank(),
                seed=self.cfg.seed,
            )
            if self.cfg.seed is not None
            else None
        )

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=self.cfg.batch_size_per_gpu,
            # ``shuffle`` and ``sampler`` are mutually exclusive.
            shuffle=(train_sampler is None),
            num_workers=num_workers,
            sampler=train_sampler,
            collate_fn=partial(point_collate_fn, mix_prob=self.cfg.mix_prob),
            pin_memory=True,
            worker_init_fn=init_fn,
            drop_last=True,
            # DataLoader raises ValueError for persistent_workers=True when
            # num_workers == 0, so only enable it when workers exist.
            persistent_workers=num_workers > 0,
        )
        return train_loader

    def build_val_loader(self):
        """Build the validation ``DataLoader``, or return ``None`` when
        ``cfg.evaluate`` is falsy."""
        val_loader = None
        if self.cfg.evaluate:
            val_data = build_dataset(self.cfg.data.val)
            if comm.get_world_size() > 1:
                val_sampler = torch.utils.data.distributed.DistributedSampler(val_data)
            else:
                val_sampler = None
            val_loader = torch.utils.data.DataLoader(
                val_data,
                batch_size=self.cfg.batch_size_val_per_gpu,
                shuffle=False,
                num_workers=self.cfg.num_worker_per_gpu,
                pin_memory=True,
                sampler=val_sampler,
                collate_fn=collate_fn,
            )
        return val_loader

    def build_optimizer(self):
        """Build the optimizer from ``cfg.optimizer`` for ``self.model``."""
        return build_optimizer(self.cfg.optimizer, self.model, self.cfg.param_dicts)

    def build_scheduler(self):
        """Build the LR scheduler; requires the optimizer and train loader to exist.

        ``cfg.scheduler.total_steps`` is overwritten with
        ``len(train_loader) * cfg.eval_epoch`` before the scheduler is built.
        """
        assert hasattr(self, "optimizer")
        assert hasattr(self, "train_loader")
        self.cfg.scheduler.total_steps = len(self.train_loader) * self.cfg.eval_epoch
        return build_scheduler(self.cfg.scheduler, self.optimizer)

    def build_scaler(self):
        """Return a ``GradScaler`` when ``cfg.enable_amp`` is set, else ``None``."""
        scaler = torch.cuda.amp.GradScaler() if self.cfg.enable_amp else None
        return scaler
@TRAINERS.register_module("MultiDatasetTrainer")
class MultiDatasetTrainer(Trainer):
    """Trainer registered as ``"MultiDatasetTrainer"``; it differs from
    ``Trainer`` only in how the training loader is built."""

    def build_train_loader(self):
        """Wrap the dataset built from ``cfg.data.train`` in a ``MultiDatasetDataloader``.

        Also records the loader length in ``self.comm_info["iter_per_epoch"]``.
        """
        # Function-scope import, as in the original implementation.
        from pointcept.datasets import MultiDatasetDataloader

        cfg = self.cfg
        loader = MultiDatasetDataloader(
            build_dataset(cfg.data.train),
            cfg.batch_size_per_gpu,
            cfg.num_worker_per_gpu,
            cfg.mix_prob,
            cfg.seed,
        )
        self.comm_info["iter_per_epoch"] = len(loader)
        return loader