import time import math import datetime import os import random import numpy as np import torch import torch.nn as nn import torch.optim as optim from tqdm import tqdm from models.build_model import build_model from generate import generate from data.preprocess_features import preprocess_features from data.loader import Loader from data.loader_exhaustive import LoaderExhaustive from data.loader_generations import LoaderGenerations from data.collate import filter_collate from utils import CsvWriter, create_exp_dir, accuracy from config import args # os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Set the random seed manually for reproducibility. if args.seed > 0: np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) random.seed(args.seed) class Runner: def __init__(self): self.logging = create_exp_dir(args.work_dir, debug=args.debug) use_cuda = torch.cuda.is_available() and not args.no_cuda self.device = torch.device('cuda' if use_cuda else 'cpu') if self.device == torch.device("cuda"): self.logging("Using GPU") else: self.logging("Using CPU") self.train_step = 0 self.n_sequences_total = 0 self.init_hours = 0 self.epoch = 0 self.init_time = time.time() # Load data n_bins = args.n_emotion_bins if args.conditioning == "discrete_token" and \ not args.regression else None conditional = args.conditioning != "none" or args.regression # Preprocessing train_feats, test_feats = preprocess_features( "../data_files/features/pianoroll/full_dataset_features_summarized.csv", n_bins=n_bins, conditional=conditional, use_labeled_only=not args.full_dataset) if args.exhaustive_eval: # Evaluate using ENTIRE test set train_dataset = [] test_dataset = LoaderExhaustive(args.data_folder, test_feats, args.tgt_len, args.conditioning, max_samples=args.n_samples, regression=args.regression, always_use_discrete_condition=args.always_use_discrete_condition) else: train_dataset = Loader(args.data_folder, train_feats, args.tgt_len, args.conditioning, regression=args.regression, always_use_discrete_condition=args.always_use_discrete_condition) test_dataset = Loader(args.data_folder, test_feats, args.tgt_len, args.conditioning, regression=args.regression, always_use_discrete_condition=args.always_use_discrete_condition) if args.regression_dir is not None: # Perform emotion regression on generated samples train_dataset = [] test_dataset = LoaderGenerations(args.regression_dir, args.tgt_len) self.null_condition = torch.FloatTensor([np.nan, np.nan]).to(self.device) self.maps = test_dataset.get_maps() self.pad_idx = test_dataset.get_pad_idx() self.vocab_size = test_dataset.get_vocab_len() args.vocab_size = self.vocab_size self.logging(f"Number of tokens: {self.vocab_size}") if args.exhaustive_eval or args.regression_dir is not None: self.train_loader = [] else: self.train_loader = torch.utils.data.DataLoader(train_dataset, args.batch_size, shuffle=not args.debug, num_workers=args.num_workers, collate_fn=filter_collate, pin_memory=not args.no_cuda, drop_last=True) self.test_loader = torch.utils.data.DataLoader(test_dataset, args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=filter_collate, pin_memory=not args.no_cuda and args.regression_dir is None, drop_last=True) print(f"Data loader lengths\nTrain: {len(train_dataset)}") if not args.overfit: print(f"Test:{len(test_dataset)}") self.gen_dir = os.path.join(args.work_dir, "generations", "training") # Automatic mixed precision self.amp = not args.no_amp and self.device == torch.device('cuda') if self.amp: self.logging("Using automatic mixed precision") else: self.logging("Using float32") self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp) self.init_model() # Build the model if not args.debug: # Save mappings os.makedirs(self.gen_dir, exist_ok=True) torch.save(self.maps, os.path.join(args.work_dir, "mappings.pt")) self.csv_writer = CsvWriter(os.path.join(args.work_dir, "performance.csv"), ["epoch", "step", "hour", "lr", "trn_loss", "val_loss", "val_l1_v", "val_l1_a"], in_path=self.csv_in, debug=args.debug) args.n_all_param = sum([p.nelement() for p in self.model.parameters()]) self.model = self.model.to(self.device) self.ce_loss = nn.CrossEntropyLoss(ignore_index=self.pad_idx).to(self.device) self.mse_loss = nn.MSELoss() self.l1_loss = nn.L1Loss() #### scheduler if args.scheduler == '--': self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, args.max_step, eta_min=args.eta_min) elif args.scheduler == 'dev_perf': self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) elif args.scheduler == 'constant': pass elif args.scheduler == 'cyclic': self.scheduler = optim.lr_scheduler.CyclicLR(self.optimizer, args.lr_min, args.lr_max, verbose=False, cycle_momentum=False) # Print log if not args.debug: self.logging('=' * 120) for k, v in args.__dict__.items(): self.logging(' - {} : {}'.format(k, v)) self.logging('=' * 120) self.logging('#params = {}'.format(args.n_all_param)) now = datetime.datetime.now() now = now.strftime("%d-%m-%Y %H:%M") self.logging(f"Run started at {now}") self.once = True def init_model(self): # Initialize model if args.restart_dir: # Load existing model config = torch.load(os.path.join(args.restart_dir, "model_config.pt")) self.model, config = build_model(None, load_config_dict=config) self.model = self.model.to(self.device) model_fp = os.path.join(args.restart_dir, 'model.pt') optimizer_fp = os.path.join(args.restart_dir, 'optimizer.pt') stats_fp = os.path.join(args.restart_dir, 'stats.pt') scaler_fp = os.path.join(args.restart_dir, 'scaler.pt') self.model.load_state_dict( torch.load(model_fp, map_location=lambda storage, loc: storage)) self.logging(f"Model loaded from {model_fp}") self.csv_in = os.path.join(args.restart_dir, 'performance.csv') else: # Build model from scratch self.csv_in = None self.model, config = build_model(vars(args)) self.model = self.model.to(self.device) # save model configuration for later load if not args.debug: torch.save(config, os.path.join(args.work_dir, "model_config.pt")) self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr) # Load self.optimizer if necessary if args.restart_dir: if os.path.exists(optimizer_fp): try: self.optimizer.load_state_dict( torch.load(optimizer_fp, map_location=lambda storage, loc: storage)) except: pass else: print('Optimizer was not saved. Start from scratch.') try: stats = torch.load(stats_fp) self.train_step = stats["step"] self.init_hours = stats["hour"] self.epoch = stats["epoch"] self.n_sequences_total = stats["sample"] except: self.train_step = 0 self.init_hours = 0 self.epoch = 0 self.n_sequences_total = 0 if os.path.exists(scaler_fp) and not args.reset_scaler: try: self.scaler.load_state_dict(torch.load(scaler_fp)) except: pass if args.overwrite_lr: # New learning rate for p in self.optimizer.param_groups: p['lr'] = args.lr ############################################################################### # EVALUATION ############################################################################### def evaluate(self): # Turn on evaluation mode which disables dropout. self.model.eval() # Evaluation topk = (1, 5) # find accuracy for top-1 and top-5 n_elements_total, n_sequences_total, total_loss = 0, 0, 0. total_accs = {"l1_v": 0., "l1_a": 0., "l1_mean": 0., "l1_mean_normal":0 } if args.regression else {k: 0. for k in topk} with torch.no_grad(): n_batches = len(self.test_loader) loader = enumerate(self.test_loader) if args.exhaustive_eval or args.regression: loader = tqdm(loader, total=n_batches) for i, (input_, condition, target) in loader: if args.max_eval_step > 0 and i >= args.max_eval_step: break if input_ != []: input_ = input_.to(self.device) condition = condition.to(self.device) if not args.regression: target = target.to(self.device) loss, pred = self.forward_pass(input_, condition, target) if args.regression: pred = torch.clamp(pred, min=-1.0, max=1.0) loss = self.l1_loss(pred, condition) l1_v = self.l1_loss(pred[:, 0], condition[:, 0]).item() l1_a = self.l1_loss(pred[:, 1], condition[:, 1]).item() accuracies = {"l1_v": l1_v, "l1_a": l1_a, "l1_mean": (l1_v + l1_a) / 2, "l1_mean_normal": (l1_v + l1_a) / 2 / 2} n_elements = pred[:, 0].numel() else: accuracies = accuracy(pred, target, topk=topk, ignore_index=self.pad_idx) n_elements = input_.numel() n_sequences = input_.size(0) total_loss += n_elements * loss.item() for key, value in accuracies.items(): total_accs[key] += n_elements * value n_elements_total += n_elements n_sequences_total += n_sequences if n_elements_total == 0: avg_loss = float('nan') avg_accs = float('nan') else: avg_loss = total_loss / n_elements_total avg_accs = {k: v/n_elements_total for k, v in total_accs.items()} if args.exhaustive_eval: print(f"Total number of sequences: {n_sequences_total}") return avg_loss, avg_accs def forward_pass(self, input_, condition, target): input_ = input_.to(self.device) condition = condition.to(self.device) with torch.cuda.amp.autocast(enabled=self.amp): if args.regression: output = self.model(input_) loss = self.l1_loss(output, condition) else: target = target.to(self.device) output = self.model(input_, condition) output_flat = output.reshape(-1, output.size(-1)) target = target.reshape(-1) loss = self.ce_loss(output_flat, target) return loss, output def train(self): # Turn on training mode which enables dropout. self.model.train() train_loss = 0 n_elements_total = 0 train_interval_start = time.time() while True: for input_, condition, target in self.train_loader: self.model.train() if input_ != []: loss, _ = self.forward_pass(input_, condition, target) loss_val = loss.item() loss /= args.accumulate_step n_elements = input_.numel() if not math.isnan(loss_val): train_loss += n_elements * loss_val n_elements_total += n_elements self.n_sequences_total += input_.size(0) self.scaler.scale(loss).backward() if self.train_step % args.accumulate_step == 0: self.scaler.unscale_(self.optimizer) if args.clip > 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.clip) self.scaler.step(self.optimizer) self.scaler.update() self.model.zero_grad() if args.scheduler != "constant": # linear warmup stage if self.train_step <= args.warmup_step: curr_lr = args.lr * self.train_step / args.warmup_step self.optimizer.param_groups[0]['lr'] = curr_lr else: self.scheduler.step() if (self.train_step % args.gen_step == 0) and self.train_step > 0 and not args.regression: # Generate and save samples with torch.no_grad(): self.model.eval() if args.max_gen_input_len > 0: max_input_len = args.max_gen_input_len else: max_input_len = args.tgt_len primers = [[""]] # Use fixed set of conditions if args.conditioning == "none": discrete_conditions = None continuous_conditions = None primers = [[""] for _ in range(4)] elif args.conditioning == "discrete_token": discrete_conditions = [ ["", ""], ["", ""], ["", ""], ["", ""], ] continuous_conditions = None elif args.conditioning in ["continuous_token", "continuous_concat"]: discrete_conditions = None continuous_conditions = [ [-0.8, -0.8], [-0.8, 0.8], [0.8, -0.8], [0.8, 0.8] ] generate(self.model, self.maps, self.device, self.gen_dir, args.conditioning, debug=args.debug, verbose=False, amp=self.amp, discrete_conditions=discrete_conditions, continuous_conditions=continuous_conditions, min_n_instruments=1, gen_len=args.gen_len, max_input_len=max_input_len, step=str(self.train_step), primers=primers, temperatures=[args.temp_note, args.temp_rest]) if (self.train_step % args.log_step == 0): # Print log if n_elements_total > 0: cur_loss = train_loss / n_elements_total elapsed_total = time.time() - self.init_time elapsed_interval = time.time() - train_interval_start hours_elapsed = elapsed_total / 3600.0 hours_total = self.init_hours + hours_elapsed lr = self.optimizer.param_groups[0]['lr'] log_str = '| Epoch {:3d} step {:>8d} | {:>6d} sequences | {:>3.1f} h | lr {:.2e} ' \ '| ms/batch {:4.0f} | loss {:7.4f}'.format( self.epoch, self.train_step, self.n_sequences_total, hours_total, lr, elapsed_interval * 1000 / args.log_step, cur_loss) self.logging(log_str) self.csv_writer.update({"epoch": self.epoch, "step": self.train_step, "hour": hours_total, "lr": lr, "trn_loss": cur_loss, "val_loss": np.nan, "val_l1_v": np.nan, "val_l1_a": np.nan}) train_loss = 0 n_elements_total = 0 self.n_good_output, self.n_nan_output = 0, 0 train_interval_start = time.time() if not args.debug: # Save model model_fp = os.path.join(args.work_dir, 'model.pt') torch.save(self.model.state_dict(), model_fp) optimizer_fp = os.path.join(args.work_dir, 'optimizer.pt') torch.save(self.optimizer.state_dict(), optimizer_fp) scaler_fp = os.path.join(args.work_dir, 'scaler.pt') torch.save(self.scaler.state_dict(), scaler_fp) torch.save({"step": self.train_step, "hour": hours_total, "epoch": self.epoch, "sample": self.n_sequences_total}, os.path.join(args.work_dir, 'stats.pt')) if (self.train_step % args.eval_step == 0): # Evaluate model val_loss, val_acc = self.evaluate() elapsed_total = time.time() - self.init_time hours_elapsed = elapsed_total / 3600.0 hours_total = self.init_hours + hours_elapsed lr = self.optimizer.param_groups[0]['lr'] self.logging('-' * 120) log_str = '| Eval {:3d} step {:>8d} | now: {} | {:>3.1f} h' \ '| valid loss {:7.4f} | ppl {:5.3f}'.format( self.train_step // args.eval_step, self.train_step, time.strftime("%d-%m - %H:%M"), hours_total, val_loss, math.exp(val_loss)) if args.regression: log_str += " | l1_v: {:5.3f} | l1_a: {:5.3f}".format( val_acc["l1_v"], val_acc["l1_a"]) self.csv_writer.update({"epoch": self.epoch, "step": self.train_step, "hour": hours_total, "lr": lr, "trn_loss": np.nan, "val_loss": val_loss}) self.logging(log_str) self.logging('-' * 120) # dev-performance based learning rate annealing if args.scheduler == 'dev_perf': self.scheduler.step(val_loss) if self.train_step >= args.max_step: break self.train_step += 1 self.epoch += 1 if self.train_step >= args.max_step: break def run(self): # Loop over epochs. # At any point you can hit Ctrl + C to break out of training early. try: if args.exhaustive_eval or args.regression_dir is not None: self.logging("Exhaustive evaluation") if args.regression_dir is not None: self.logging(f"For regression on folder {args.regression_dir}") loss, accuracies = self.evaluate() perplexity = math.exp(loss) elapsed_total = time.time() - self.init_time hours_elapsed = elapsed_total / 3600.0 msg = f"Loss: {loss:7.4f}, ppl: {perplexity:5.2f}" for k, v in accuracies.items(): if args.regression: msg += f", {k}: {v:7.4f}" else: msg += f", top{k:1.0f}: {v:7.4f}" msg += f", hours: {hours_elapsed:3.1f}" self.logging(msg) else: while True: self.train() if self.train_step >= args.max_step: self.logging('-' * 120) self.logging('End of training') break except KeyboardInterrupt: self.logging('-' * 120) self.logging('Exiting from training early') if __name__ == "__main__": runner = Runner() runner.run()