"""
Configurations can be overwritten by adding: key=value
Use debug.wandb=False to disable logging to wandb.
"""

import datetime
from datetime import timedelta
import os
import random
import socket
import time
from glob import glob

import hydra
import ipdb  # noqa: F401
import numpy as np
import omegaconf
import torch
import wandb
from accelerate import Accelerator, DistributedDataParallelKwargs, InitProcessGroupKwargs
from pytorch3d.renderer import PerspectiveCameras

from diffusionsfm.dataset.co3d_v2 import Co3dDataset, unnormalize_image_for_vis
# from diffusionsfm.dataset.multiloader import get_multiloader, MultiDataset
from diffusionsfm.eval.eval_category import evaluate
from diffusionsfm.model.diffuser import RayDiffuser
from diffusionsfm.model.diffuser_dpt import RayDiffuserDPT
from diffusionsfm.model.scheduler import NoiseScheduler
from diffusionsfm.utils.rays import cameras_to_rays, normalize_cameras_batch, compute_ndc_coordinates
from diffusionsfm.utils.visualization import (
    create_training_visualizations,
    view_color_coded_images_from_tensor,
)

os.umask(000)  # Default to 777 permissions


class Trainer(object):
    def __init__(self, cfg):
        seed = cfg.training.seed
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        self.cfg = cfg
        self.debug = cfg.debug
        self.resume = cfg.training.resume
        self.pretrain_path = cfg.training.pretrain_path

        self.batch_size = cfg.training.batch_size
        self.max_iterations = cfg.training.max_iterations
        self.mixed_precision = cfg.training.mixed_precision
        self.interval_visualize = cfg.training.interval_visualize
        self.interval_save_checkpoint = cfg.training.interval_save_checkpoint
        self.interval_delete_checkpoint = cfg.training.interval_delete_checkpoint
        self.interval_evaluate = cfg.training.interval_evaluate
        self.delete_all = cfg.training.delete_all_checkpoints_after_training
        self.freeze_encoder = cfg.training.freeze_encoder
        self.translation_scale = cfg.training.translation_scale
        self.regression = cfg.training.regression
        self.prob_unconditional = cfg.training.prob_unconditional
        self.load_extra_cameras = cfg.training.load_extra_cameras
        self.calculate_intrinsics = cfg.training.calculate_intrinsics
        self.distort = cfg.training.distort
        self.diffuse_origins_and_endpoints = cfg.training.diffuse_origins_and_endpoints
        self.diffuse_depths = cfg.training.diffuse_depths
        self.depth_resolution = cfg.training.depth_resolution
        self.dpt_head = cfg.training.dpt_head
        self.full_num_patches_x = cfg.training.full_num_patches_x
        self.full_num_patches_y = cfg.training.full_num_patches_y
        self.dpt_encoder_features = cfg.training.dpt_encoder_features
        self.nearest_neighbor = cfg.training.nearest_neighbor
        self.no_bg_targets = cfg.training.no_bg_targets
        self.unit_normalize_scene = cfg.training.unit_normalize_scene
        self.sd_scale = cfg.training.sd_scale
        self.bfloat = cfg.training.bfloat
        self.first_cam_mediod = cfg.training.first_cam_mediod
        self.normalize_first_camera = cfg.training.normalize_first_camera
        self.gradient_clipping = cfg.training.gradient_clipping
        self.l1_loss = cfg.training.l1_loss
        self.reinit = cfg.training.reinit

        if self.first_cam_mediod:
            assert self.normalize_first_camera

        self.pred_x0 = cfg.model.pred_x0
        self.num_patches_x = cfg.model.num_patches_x
        self.num_patches_y = cfg.model.num_patches_y
        self.depth = cfg.model.depth
        self.num_images = cfg.model.num_images
        self.num_visualize = min(self.batch_size, 2)
        self.random_num_images = cfg.model.random_num_images
        self.feature_extractor = cfg.model.feature_extractor
        self.append_ndc = cfg.model.append_ndc
        self.use_homogeneous = cfg.model.use_homogeneous
        self.freeze_transformer = cfg.model.freeze_transformer
        self.cond_depth_mask = cfg.model.cond_depth_mask

        self.dataset_name = cfg.dataset.name
        self.shape = cfg.dataset.shape
        self.apply_augmentation = cfg.dataset.apply_augmentation
        self.mask_holes = cfg.dataset.mask_holes
        self.image_size = cfg.dataset.image_size

        if not self.regression and (self.diffuse_origins_and_endpoints or self.diffuse_depths):
            assert self.mask_holes or self.cond_depth_mask

        if self.regression:
            assert self.pred_x0

        self.start_time = None
        self.iteration = 0
        self.epoch = 0
        self.wandb_id = None

        self.hostname = socket.gethostname()

        if self.dpt_head:
            find_unused_parameters = True
        else:
            find_unused_parameters = False

        ddp_scaler = DistributedDataParallelKwargs(
            find_unused_parameters=find_unused_parameters
        )
        init_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=5400))
        self.accelerator = Accelerator(
            even_batches=False,
            device_placement=False,
            kwargs_handlers=[ddp_scaler, init_kwargs],
        )
        self.device = self.accelerator.device

        scheduler = NoiseScheduler(
            type=cfg.noise_scheduler.type,
            max_timesteps=cfg.noise_scheduler.max_timesteps,
            beta_start=cfg.noise_scheduler.beta_start,
            beta_end=cfg.noise_scheduler.beta_end,
        )
        if self.dpt_head:
            self.model = RayDiffuserDPT(
                depth=self.depth,
                width=self.num_patches_x,
                P=1,
                max_num_images=self.num_images,
                noise_scheduler=scheduler,
                freeze_encoder=self.freeze_encoder,
                feature_extractor=self.feature_extractor,
                append_ndc=self.append_ndc,
                use_unconditional=self.prob_unconditional > 0,
                diffuse_depths=self.diffuse_depths,
                depth_resolution=self.depth_resolution,
                encoder_features=self.dpt_encoder_features,
                use_homogeneous=self.use_homogeneous,
                freeze_transformer=self.freeze_transformer,
                cond_depth_mask=self.cond_depth_mask,
            ).to(self.device)
        else:
            self.model = RayDiffuser(
                depth=self.depth,
                width=self.num_patches_x,
                P=1,
                max_num_images=self.num_images,
                noise_scheduler=scheduler,
                freeze_encoder=self.freeze_encoder,
                feature_extractor=self.feature_extractor,
                append_ndc=self.append_ndc,
                use_unconditional=self.prob_unconditional > 0,
                diffuse_depths=self.diffuse_depths,
                depth_resolution=self.depth_resolution,
                use_homogeneous=self.use_homogeneous,
                cond_depth_mask=self.cond_depth_mask,
            ).to(self.device)

        if self.dpt_head:
            depth_size = self.full_num_patches_x
        elif self.depth_resolution > 1:
            depth_size = self.num_patches_x * self.depth_resolution
        else:
            depth_size = self.num_patches_x
        self.depth_size = depth_size

        if self.dataset_name == "multi":
            self.dataset, self.train_dataloader, self.test_dataset = get_multiloader(
                num_images=self.num_images,
                apply_augmentation=self.apply_augmentation,
                load_extra_cameras=self.load_extra_cameras,
                distort_image=self.distort,
                center_crop=self.diffuse_origins_and_endpoints or self.diffuse_depths,
                crop_images=not (self.diffuse_origins_and_endpoints or self.diffuse_depths),
                load_depths=self.diffuse_origins_and_endpoints or self.diffuse_depths,
                depth_size=depth_size,
                mask_holes=self.mask_holes,
                img_size=self.image_size,
                batch_size=self.batch_size,
                num_workers=cfg.training.num_workers,
                dust3r_pairs=True,
            )
        elif self.dataset_name == "co3d":
            self.dataset = Co3dDataset(
                category=self.shape,
                split="train",
                num_images=self.num_images,
                apply_augmentation=self.apply_augmentation,
                load_extra_cameras=self.load_extra_cameras,
                distort_image=self.distort,
                center_crop=self.diffuse_origins_and_endpoints or self.diffuse_depths,
                crop_images=not (self.diffuse_origins_and_endpoints or self.diffuse_depths),
                load_depths=self.diffuse_origins_and_endpoints or self.diffuse_depths,
                depth_size=depth_size,
                mask_holes=self.mask_holes,
                img_size=self.image_size,
            )
            self.train_dataloader = torch.utils.data.DataLoader(
                self.dataset,
                batch_size=self.batch_size,
                shuffle=True,
                num_workers=cfg.training.num_workers,
                pin_memory=True,
                drop_last=True,
            )
            self.test_dataset = Co3dDataset(
                category=self.shape,
                split="test",
                num_images=self.num_images,
                apply_augmentation=False,
                load_extra_cameras=self.load_extra_cameras,
                distort_image=self.distort,
                center_crop=self.diffuse_origins_and_endpoints or self.diffuse_depths,
                crop_images=not (self.diffuse_origins_and_endpoints or self.diffuse_depths),
                load_depths=self.diffuse_origins_and_endpoints or self.diffuse_depths,
                depth_size=depth_size,
                mask_holes=self.mask_holes,
                img_size=self.image_size,
            )
        else:
            raise NotImplementedError(f"Dataset '{self.dataset_name}' is not supported.")
        self.lr = 1e-4

        self.output_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
        self.checkpoint_dir = os.path.join(self.output_dir, "checkpoints")
        if self.accelerator.is_main_process:
            name = os.path.basename(self.output_dir)
            name += f"_{self.debug.run_name}"

            print("Output dir:", self.output_dir)
            with open(os.path.join(self.output_dir, name), "w"):
                # Create empty tag with name
                pass
            self.name = name

            conf_dict = omegaconf.OmegaConf.to_container(
                cfg, resolve=True, throw_on_missing=True
            )
            conf_dict["output_dir"] = self.output_dir
            conf_dict["hostname"] = self.hostname

        if self.dpt_head:
            self.init_optimizer_with_separate_lrs()
        else:
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.gradscaler = torch.cuda.amp.GradScaler(growth_interval=100000, enabled=self.mixed_precision)

        self.model, self.optimizer, self.train_dataloader = self.accelerator.prepare(
            self.model, self.optimizer, self.train_dataloader
        )

        if self.resume:
            checkpoint_files = sorted(glob(os.path.join(self.checkpoint_dir, "*.pth")))
            last_checkpoint = checkpoint_files[-1]
            print("Resuming from checkpoint:", last_checkpoint)
            self.load_model(last_checkpoint, load_metadata=True)
        elif self.pretrain_path != "":
            print("Loading pretrained model:", self.pretrain_path)
            self.load_model(self.pretrain_path, load_metadata=False)

        if self.accelerator.is_main_process:
            mode = "online" if cfg.debug.wandb else "disabled"
            if self.wandb_id is None:
                self.wandb_id = wandb.util.generate_id()
            self.wandb_run = wandb.init(
                mode=mode,
                name=name,
                project=cfg.debug.project_name,
                config=conf_dict,
                resume=self.resume,
                id=self.wandb_id,
            )
            wandb.define_metric("iteration")
            noise_schedule = self.get_module().noise_scheduler.plot_schedule(
                return_image=True
            )
            wandb.log(
                {"Schedule": wandb.Image(noise_schedule, caption="Noise Schedule")}
            )

    def get_module(self):
        if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
            model = self.model.module
        else:
            model = self.model

        return model
    
    def init_optimizer_with_separate_lrs(self):
        print("Use different LRs for the DINOv2 encoder and DiT!")

        feature_extractor_params = [
            p for n, p in self.model.feature_extractor.named_parameters()
        ]
        feature_extractor_param_names = [
            "feature_extractor." + n for n, _ in self.model.feature_extractor.named_parameters()
        ]
        ray_predictor_params = [
            p for n, p in self.model.ray_predictor.named_parameters()
        ]
        ray_predictor_param_names = [
            "ray_predictor." + n for n, p in self.model.ray_predictor.named_parameters()
        ]
        other_params = [
            p for n, p in self.model.named_parameters()
            if n not in feature_extractor_param_names + ray_predictor_param_names
        ]

        self.optimizer = torch.optim.Adam([
            {'params': feature_extractor_params, 'lr': self.lr * 0.1},  # Lower LR for feature extractor
            {'params': ray_predictor_params, 'lr': self.lr * 0.1},      # Lower LR for DIT (ray_predictor)
            {'params': other_params, 'lr': self.lr}                     # Normal LR for other parts of the model
        ])

    def train(self):
        while self.iteration < self.max_iterations:
            for batch in self.train_dataloader:
                t0 = time.time()
                self.optimizer.zero_grad()

                float_type = torch.bfloat16 if self.bfloat else torch.float16
                with torch.cuda.amp.autocast(
                    enabled=self.mixed_precision, dtype=float_type
                ):
                    images = batch["image"].to(self.device)
                    focal_lengths = batch["focal_length"].to(self.device)
                    crop_params = batch["crop_parameters"].to(self.device)
                    principal_points = batch["principal_point"].to(self.device)
                    R = batch["R"].to(self.device)
                    T = batch["T"].to(self.device)
                    if "distortion_coefficients" in batch:
                        distortion_coefficients = batch["distortion_coefficients"]
                    else:
                        distortion_coefficients = [None for _ in range(R.shape[0])]

                    depths = batch["depth"].to(self.device)
                    if self.no_bg_targets:
                        masks = batch["depth_masks"].to(self.device).bool()

                    cameras_og = [
                        PerspectiveCameras(
                            focal_length=focal_lengths[b],
                            principal_point=principal_points[b],
                            R=R[b],
                            T=T[b],
                            device=self.device,
                        )
                        for b in range(self.batch_size)
                    ]

                    cameras, _ = normalize_cameras_batch(
                        cameras=cameras_og,
                        scale=self.translation_scale,
                        normalize_first_camera=self.normalize_first_camera,
                        depths=(
                            None
                            if not (self.diffuse_origins_and_endpoints or self.diffuse_depths)
                            else depths
                        ),
                        first_cam_mediod=self.first_cam_mediod,
                        crop_parameters=crop_params,
                        num_patches_x=self.depth_size,
                        num_patches_y=self.depth_size,
                        distortion_coeffs=distortion_coefficients,
                    )

                    # Now that cameras are normalized, fix shapes of camera parameters
                    if self.load_extra_cameras or self.random_num_images:
                        if self.random_num_images:
                            num_images = torch.randint(2, self.num_images + 1, (1,))
                        else:
                            num_images = self.num_images

                        # The correct number of images is already loaded.
                        # Only need to modify these camera parameters shapes.
                        focal_lengths = focal_lengths[:, :num_images]
                        crop_params = crop_params[:, :num_images]
                        R = R[:, :num_images]
                        T = T[:, :num_images]
                        images = images[:, :num_images]
                        depths = depths[:, :num_images]
                        masks = masks[:, :num_images]

                        cameras = [
                            PerspectiveCameras(
                                focal_length=cameras[b].focal_length[:num_images],
                                principal_point=cameras[b].principal_point[:num_images],
                                R=cameras[b].R[:num_images],
                                T=cameras[b].T[:num_images],
                                device=self.device,
                            )
                            for b in range(self.batch_size)
                        ]

                    if self.regression:
                        low = self.get_module().noise_scheduler.max_timesteps - 1
                    else:
                        low = 0

                    t = torch.randint(
                        low=low,
                        high=self.get_module().noise_scheduler.max_timesteps,
                        size=(self.batch_size,),
                        device=self.device,
                    )

                    if self.prob_unconditional > 0:
                        unconditional_mask = (
                            (torch.rand(self.batch_size) < self.prob_unconditional)
                            .float()
                            .to(self.device)
                        )
                    else:
                        unconditional_mask = None

                    if self.distort:
                        raise NotImplementedError()
                    else:
                        gt_rays = []
                        rays_dirs = []
                        rays = []
                        for i, (camera, crop_param, depth) in enumerate(
                            zip(cameras, crop_params, depths)
                        ):
                            if self.diffuse_origins_and_endpoints:
                                mode = "segment"
                            else:
                                mode = "plucker"

                            r = cameras_to_rays(
                                cameras=camera,
                                num_patches_x=self.full_num_patches_x,
                                num_patches_y=self.full_num_patches_y,
                                crop_parameters=crop_param,
                                depths=depth,
                                mode=mode,
                                depth_resolution=self.depth_resolution,
                                nearest_neighbor=self.nearest_neighbor,
                                distortion_coefficients=distortion_coefficients[i],
                            )
                            rays_dirs.append(r.get_directions())
                            gt_rays.append(r)

                            if self.diffuse_origins_and_endpoints:
                                assert r.mode == "segment"
                            elif self.diffuse_depths:
                                assert r.mode == "plucker"

                            if self.unit_normalize_scene:
                                if self.diffuse_origins_and_endpoints:
                                    assert r.mode == "segment"
                                    # Let's say SD should be 0.5
                                    scale = r.get_segments().std() * self.sd_scale

                                    if scale.isnan().any():
                                        assert False

                                    camera.T /= scale
                                    r.rays /= scale
                                    depths[i] /= scale
                                else:
                                    assert r.mode == "plucker"
                                    scale = r.depths.std() * self.sd_scale

                                    if scale.isnan().any():
                                        assert False

                                    camera.T /= scale
                                    r.depths /= scale
                                    depths[i] /= scale

                            rays.append(
                                r.to_spatial(
                                    include_ndc_coordinates=self.append_ndc,
                                    include_depths=self.diffuse_depths,
                                    use_homogeneous=self.use_homogeneous,
                                )
                            )

                    rays_tensor = torch.stack(rays, dim=0)

                    if self.append_ndc:
                        ndc_coordinates = rays_tensor[..., -2:, :, :]
                        rays_tensor = rays_tensor[..., :-2, :, :]

                        if self.dpt_head:
                            xy_grid = compute_ndc_coordinates(
                                crop_params,
                                num_patches_x=self.depth_size // 16,
                                num_patches_y=self.depth_size // 16,
                                distortion_coeffs=distortion_coefficients,
                            )[..., :2]
                            ndc_coordinates = xy_grid.permute(0, 1, 4, 2, 3).contiguous()

                    else:
                        ndc_coordinates = None

                    if self.cond_depth_mask:
                        condition_mask = masks
                    else:
                        condition_mask = None

                    if rays_tensor.isnan().any():
                        import pickle

                        with open("bad.json", "wb") as f:
                            pickle.dump(batch, f)
                        ipdb.set_trace()

                    eps_pred, eps = self.model(
                        images=images,
                        rays=rays_tensor,
                        t=t,
                        ndc_coordinates=ndc_coordinates,
                        unconditional_mask=unconditional_mask,
                        depth_mask=condition_mask,
                    )
                    if self.pred_x0:
                        target = rays_tensor
                    else:
                        target = eps

                    if self.no_bg_targets:
                        C = eps_pred.shape[2]
                        loss_masks = masks.unsqueeze(2).repeat(1, 1, C, 1, 1)
                        eps_pred = loss_masks * eps_pred
                        target = loss_masks * target

                    loss = 0

                    if self.l1_loss:
                        loss_reconstruction = torch.mean(torch.abs(eps_pred - target))
                    else:
                        loss_reconstruction = torch.mean((eps_pred - target) ** 2)

                    loss += loss_reconstruction

                if self.mixed_precision:
                    self.gradscaler.scale(loss).backward()

                    scaled_norm = 0
                    for p in self.model.parameters():
                        if p.requires_grad and p.grad is not None:
                            param_norm = p.grad.data.norm(2)
                            scaled_norm += param_norm.item() ** 2
                    scaled_norm = scaled_norm ** 0.5

                    if self.gradient_clipping and self.accelerator.sync_gradients:
                        self.accelerator.clip_grad_norm_(
                            self.get_module().parameters(), 1
                        )

                    clipped_norm = 0
                    for p in self.model.parameters():
                        if p.requires_grad and p.grad is not None:
                            param_norm = p.grad.data.norm(2)
                            clipped_norm += param_norm.item() ** 2
                    clipped_norm = clipped_norm ** 0.5

                    self.gradscaler.unscale_(self.optimizer)
                    unscaled_norm = 0
                    for p in self.model.parameters():
                        if p.requires_grad and p.grad is not None:
                            param_norm = p.grad.data.norm(2)
                            unscaled_norm += param_norm.item() ** 2
                    unscaled_norm = unscaled_norm ** 0.5

                    self.gradscaler.step(self.optimizer)
                    self.gradscaler.update()
                else:
                    self.accelerator.backward(loss)

                    if self.gradient_clipping and self.accelerator.sync_gradients:
                        self.accelerator.clip_grad_norm_(
                            self.get_module().parameters(), 10
                        )
                    self.optimizer.step()

                if self.accelerator.is_main_process:
                    if self.iteration % 10 == 0:
                        self.log_info(
                            loss_reconstruction,
                            t0,
                            self.lr,
                            scaled_norm,
                            unscaled_norm,
                            clipped_norm,
                        )

                    if self.iteration % self.interval_visualize == 0:
                        self.visualize(
                            images=unnormalize_image_for_vis(images.clone()),
                            cameras_gt=cameras,
                            depths=depths,
                            crop_parameters=crop_params,
                            distortion_coefficients=distortion_coefficients,
                            depth_mask=masks,
                        )

                    if self.iteration % self.interval_save_checkpoint == 0 and self.iteration != 0:
                        self.save_model()

                    if self.iteration % self.interval_delete_checkpoint == 0:
                        self.clear_old_checkpoints(self.checkpoint_dir)

                    if (
                        self.iteration % self.interval_evaluate == 0
                        and self.iteration > 0
                    ):
                        self.evaluate_train_acc()

                    if self.iteration >= self.max_iterations + 1:
                        if self.delete_all:
                            self.clear_old_checkpoints(
                                self.checkpoint_dir, clear_all_old=True
                            )
                        return

                self.iteration += 1

                if self.reinit and self.iteration >= 50000:
                    state_dict = self.get_module().state_dict()
                    self.model = RayDiffuserDPT(
                        depth=self.depth,
                        width=self.num_patches_x,
                        P=1,
                        max_num_images=self.num_images,
                        noise_scheduler=self.get_module().noise_scheduler,
                        freeze_encoder=False,
                        feature_extractor=self.feature_extractor,
                        append_ndc=self.append_ndc,
                        use_unconditional=self.prob_unconditional > 0,
                        diffuse_depths=self.diffuse_depths,
                        depth_resolution=self.depth_resolution,
                        encoder_features=self.dpt_encoder_features,
                        use_homogeneous=self.use_homogeneous,
                        freeze_transformer=False,
                        cond_depth_mask=self.cond_depth_mask,
                    ).to(self.device)

                    self.init_optimizer_with_separate_lrs()
                    self.gradscaler = torch.cuda.amp.GradScaler(growth_interval=100000, enabled=self.mixed_precision)

                    self.model, self.optimizer = self.accelerator.prepare(
                        self.model, self.optimizer
                    )

                    msg = self.get_module().load_state_dict(
                        state_dict,
                        strict=True,
                    )
                    print(msg)

                    self.reinit = False

            self.epoch += 1

    def load_model(self, path, load_metadata=True):
        save_dict = torch.load(path, map_location=self.device)
        del save_dict["state_dict"]["ray_predictor.x_pos_enc.image_pos_table"]

        if not self.resume:
            if len(save_dict["state_dict"]["scratch.input_conv.weight"].shape) == 2 and self.dpt_head:
                print("Initialize conv layer weights from the linear layer!")
                C = save_dict["state_dict"]["scratch.input_conv.weight"].shape[1]
                input_conv_weight = save_dict["state_dict"]["scratch.input_conv.weight"].view(384, C, 1, 1).repeat(1, 1, 16, 16) / 256.
                input_conv_bias = save_dict["state_dict"]["scratch.input_conv.bias"]

                self.get_module().scratch.input_conv.weight.data = input_conv_weight   
                self.get_module().scratch.input_conv.bias.data = input_conv_bias

                del save_dict["state_dict"]["scratch.input_conv.weight"]
                del save_dict["state_dict"]["scratch.input_conv.bias"]

        missing, unexpected = self.get_module().load_state_dict(
            save_dict["state_dict"],
            strict=False,
        )
        print(f"Missing keys: {missing}")
        print(f"Unexpected keys: {unexpected}")
        if load_metadata:
            self.iteration = save_dict["iteration"]
            self.epoch = save_dict["epoch"]
            time_elapsed = save_dict["elapsed"]
            self.start_time = time.time() - time_elapsed
            if "wandb_id" in save_dict:
                self.wandb_id = save_dict["wandb_id"]
            self.optimizer.load_state_dict(save_dict["optimizer"])
            self.gradscaler.load_state_dict(save_dict["gradscaler"])

    def save_model(self):
        path = os.path.join(self.checkpoint_dir, f"ckpt_{self.iteration:08d}.pth")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        elapsed = time.time() - self.start_time if self.start_time is not None else 0
        save_dict = {
            "epoch": self.epoch,
            "elapsed": elapsed,
            "gradscaler": self.gradscaler.state_dict(),
            "iteration": self.iteration,
            "state_dict": self.get_module().state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "wandb_id": self.wandb_id,
        }
        torch.save(save_dict, path)

    def clear_old_checkpoints(self, checkpoint_dir, clear_all_old=False):
        print("Clearing old checkpoints")
        checkpoint_files = sorted(glob(os.path.join(checkpoint_dir, "ckpt_*.pth")))
        if clear_all_old:
            for checkpoint_file in checkpoint_files[:-1]:
                os.remove(checkpoint_file)
        else:
            for checkpoint_file in checkpoint_files:
                checkpoint = os.path.basename(checkpoint_file)
                checkpoint_iteration = int("".join(filter(str.isdigit, checkpoint)))
                if checkpoint_iteration % self.interval_delete_checkpoint != 0:
                    os.remove(checkpoint_file)

    def log_info(
        self,
        loss,
        t0,
        lr,
        scaled_norm,
        unscaled_norm,
        clipped_norm,
    ):
        if self.start_time is None:
            self.start_time = time.time()
        time_elapsed = round(time.time() - self.start_time)
        time_remaining = round(
            (time.time() - self.start_time)
            / (self.iteration + 1)
            * (self.max_iterations - self.iteration)
        )
        disp = [
            f"Iter: {self.iteration}/{self.max_iterations}",
            f"Epoch: {self.epoch}",
            f"Loss: {loss.item():.4f}",
            f"LR: {lr:.7f}",
            f"Grad Norm: {scaled_norm:.4f}/{unscaled_norm:.4f}/{clipped_norm:.4f}",
            f"Elap: {str(datetime.timedelta(seconds=time_elapsed))}",
            f"Rem: {str(datetime.timedelta(seconds=time_remaining))}",
            self.hostname,
            self.name,
        ]
        print(", ".join(disp), flush=True)
        wandb_log = {
            "loss": loss.item(),
            "iter_time": time.time() - t0,
            "lr": lr,
            "iteration": self.iteration,
            "hours_remaining": time_remaining / 3600,
            "gradient norm": scaled_norm,
            "unscaled norm": unscaled_norm,
            "clipped norm": clipped_norm,
        }
        wandb.log(wandb_log)

    def visualize(
        self,
        images,
        cameras_gt,
        crop_parameters=None,
        depths=None,
        distortion_coefficients=None,
        depth_mask=None,
        high_loss=False,
    ):
        self.get_module().eval()

        for camera in cameras_gt:
            # AMP may not cast back to float
            camera.R = camera.R.float()
            camera.T = camera.T.float()

        loss_tag = "" if not high_loss else " HIGH LOSS"

        for i in range(self.num_visualize):
            imgs = view_color_coded_images_from_tensor(images[i].cpu(), depth=False)
            im = wandb.Image(imgs, caption=f"iteration {self.iteration} example {i}")
            wandb.log({f"Vis images {i}{loss_tag}": im})

            if self.cond_depth_mask:
                imgs = view_color_coded_images_from_tensor(
                    depth_mask[i].cpu(), depth=True
                )
                im = wandb.Image(
                    imgs, caption=f"iteration {self.iteration} example {i}"
                )
                wandb.log({f"Vis masks {i}{loss_tag}": im})

        vis_depths, _, _ = create_training_visualizations(
            model=self.get_module(),
            images=images[: self.num_visualize],
            device=self.device,
            cameras_gt=cameras_gt,
            pred_x0=self.pred_x0,
            num_images=images.shape[1],
            crop_parameters=crop_parameters[: self.num_visualize],
            visualize_pred=self.regression,
            return_first=self.regression,
            calculate_intrinsics=self.calculate_intrinsics,
            mode="segment" if self.diffuse_origins_and_endpoints else "plucker",
            depths=depths[: self.num_visualize],
            diffuse_depths=self.diffuse_depths,
            full_num_patches_x=self.full_num_patches_x,
            full_num_patches_y=self.full_num_patches_y,
            use_homogeneous=self.use_homogeneous,
            distortion_coefficients=distortion_coefficients,
        )

        for i, vis_image in enumerate(vis_depths):
            im = wandb.Image(
                vis_image, caption=f"iteration {self.iteration} example {i}"
            )

            for i, vis_image in enumerate(vis_depths):
                im = wandb.Image(
                    vis_image, caption=f"iteration {self.iteration} example {i}"
                )
                wandb.log({f"Vis origins and endpoints {i}{loss_tag}": im})

        self.get_module().train()

    def evaluate_train_acc(self, num_evaluate=10):
        print("Evaluating train accuracy")
        model = self.get_module()
        model.eval()
        additional_timesteps = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
        num_images = self.num_images

        for split in ["train", "test"]:
            if split == "train":
                if self.dataset_name != "co3d":
                    to_evaluate = self.dataset.datasets
                    names = self.dataset.names
                else:
                    to_evaluate = [self.dataset]
                    names = ["co3d"]
            elif split == "test":
                if self.dataset_name != "co3d":
                    to_evaluate = self.test_dataset.datasets
                    names = self.test_dataset.names
                else:
                    to_evaluate = [self.test_dataset]
                    names = ["co3d"]

            for name, dataset in zip(names, to_evaluate):                    
                results = evaluate(
                    cfg=self.cfg,
                    model=model,
                    dataset=dataset,
                    num_images=num_images,
                    device=self.device,
                    additional_timesteps=additional_timesteps,
                    num_evaluate=num_evaluate,
                    use_pbar=True,
                    mode="segment" if self.diffuse_origins_and_endpoints else "plucker",
                    metrics=False,
                )

                R_err = []
                CC_err = []
                for key in results.keys():
                    R_err.append([v["R_error"] for v in results[key]])
                    CC_err.append([v["CC_error"] for v in results[key]])

                R_err = np.array(R_err)
                CC_err = np.array(CC_err)

                R_acc_15 = np.mean(R_err < 15, (0, 2)).max()
                CC_acc = np.mean(CC_err < 0.1, (0, 2)).max()

                wandb.log(
                    {
                        f"R_acc_15_{name}_{split}": R_acc_15,
                        "iteration": self.iteration,
                    }
                )
                wandb.log(
                    {
                        f"CC_acc_0.1_{name}_{split}": CC_acc,
                        "iteration": self.iteration,
                    }
                )
        model.train()


@hydra.main(config_path="./conf", config_name="config", version_base="1.3")
def main(cfg):
    print(cfg)
    torch.autograd.set_detect_anomaly(cfg.debug.anomaly_detection)
    torch.set_float32_matmul_precision(cfg.training.matmul_precision)
    trainer = Trainer(cfg=cfg)
    trainer.train()


if __name__ == "__main__":
    main()