"""Model code for FlowMo. Sources: https://github.com/feizc/FluxMusic/blob/main/train.py https://github.com/black-forest-labs/flux/tree/main/src/flux """ import ast import itertools import math from dataclasses import dataclass from typing import List, Tuple import einops import torch from einops import rearrange, repeat from mup import MuReadout from torch import Tensor, nn import argparse import contextlib import copy import glob import os import subprocess import tempfile import time import fsspec import psutil import torch import torch.distributed as dist from mup import MuReadout, set_base_shapes from omegaconf import OmegaConf from torch.utils.data import DataLoader from .lookup_free_quantize import LFQ MUP_ENABLED = True def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor: b, h, l, d = q.shape q, k = apply_rope(q, k, pe) if torch.__version__ == "2.0.1+cu117": # tmp workaround if d != 64: print("MUP is broken in this setting! Be careful!") x = torch.nn.functional.scaled_dot_product_attention( q, k, v, ) else: x = torch.nn.functional.scaled_dot_product_attention( q, k, v, scale=8.0 / d if MUP_ENABLED else None, ) assert x.shape == q.shape x = rearrange(x, "B H L D -> B L (H D)") return x def rope(pos: Tensor, dim: int, theta: int) -> Tensor: assert dim % 2 == 0 scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim omega = 1.0 / (theta**scale) out = torch.einsum("...n,d->...nd", pos, omega) out = torch.stack( [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1, ) out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2) return out.float() def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2) xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk) def _get_diagonal_gaussian(parameters): mean, logvar = torch.chunk(parameters, 2, dim=1) logvar = torch.clamp(logvar, -30.0, 20.0) return mean, logvar def _sample_diagonal_gaussian(mean, logvar): std = torch.exp(0.5 * logvar) x = mean + std * torch.randn(mean.shape, device=mean.device) return x def _kl_diagonal_gaussian(mean, logvar): var = torch.exp(logvar) return 0.5 * torch.sum(torch.pow(mean, 2) + var - 1.0 - logvar, dim=1).mean() class EmbedND(nn.Module): def __init__(self, dim: int, theta: int, axes_dim): super().__init__() self.dim = dim self.theta = theta self.axes_dim = axes_dim def forward(self, ids: Tensor) -> Tensor: n_axes = ids.shape[-1] emb = torch.cat( [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], dim=-3, ) return emb.unsqueeze(1) def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0): """ Create sinusoidal timestep embeddings. :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional. :param dim: the dimension of the output. :param max_period: controls the minimum frequency of the embeddings. :return: an (N, D) Tensor of positional embeddings. 
""" t = time_factor * t half = dim // 2 freqs = torch.exp( -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half ).to(t.device) args = t[:, None].float() * freqs[None] embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) if dim % 2: embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) if torch.is_floating_point(t): embedding = embedding.to(t) return embedding class MLPEmbedder(nn.Module): def __init__(self, in_dim: int, hidden_dim: int): super().__init__() self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) self.silu = nn.SiLU() self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) def forward(self, x: Tensor) -> Tensor: return self.out_layer(self.silu(self.in_layer(x))) class RMSNorm(torch.nn.Module): def __init__(self, dim: int): super().__init__() self.scale = nn.Parameter(torch.ones(dim)) def forward(self, x: Tensor): x_dtype = x.dtype x = x.float() rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) return (x * rrms).to(dtype=x_dtype) * self.scale class QKNorm(torch.nn.Module): def __init__(self, dim: int): super().__init__() self.query_norm = RMSNorm(dim) self.key_norm = RMSNorm(dim) def forward(self, q: Tensor, k: Tensor, v: Tensor): q = self.query_norm(q) k = self.key_norm(k) return q.to(v), k.to(v) class SelfAttention(nn.Module): def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.norm = QKNorm(head_dim) self.proj = nn.Linear(dim, dim) def forward(self, x: Tensor, pe: Tensor) -> Tensor: qkv = self.qkv(x) q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) q, k = self.norm(q, k, v) x = attention(q, k, v, pe=pe) x = self.proj(x) return x @dataclass class ModulationOut: shift: Tensor scale: Tensor gate: Tensor class Modulation(nn.Module): def __init__(self, dim: int, double: bool): super().__init__() self.is_double = double self.multiplier = 6 if double else 3 self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) self.lin.weight[dim * 2 : dim * 3].data[:] = 0.0 self.lin.bias[dim * 2 : dim * 3].data[:] = 0.0 self.lin.weight[dim * 5 : dim * 6].data[:] = 0.0 self.lin.bias[dim * 5 : dim * 6].data[:] = 0.0 def forward(self, vec: Tensor) -> Tuple[ModulationOut, ModulationOut]: out = self.lin(nn.functional.silu(vec))[:, None, :].chunk( self.multiplier, dim=-1 ) return ( ModulationOut(*out[:3]), ModulationOut(*out[3:]) if self.is_double else None, ) class DoubleStreamBlock(nn.Module): def __init__( self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, ): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) self.num_heads = num_heads self.hidden_size = hidden_size self.img_mod = Modulation(hidden_size, double=True) self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.img_attn = SelfAttention( dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias ) self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.img_mlp = nn.Sequential( nn.Linear(hidden_size, mlp_hidden_dim, bias=True), nn.GELU(approximate="tanh"), nn.Linear(mlp_hidden_dim, hidden_size, bias=True), ) self.txt_mod = Modulation(hidden_size, double=True) self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) self.txt_attn = SelfAttention( dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias ) self.txt_norm2 = nn.LayerNorm(hidden_size, 
class DoubleStreamBlock(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float,
        qkv_bias: bool = False,
    ):
        super().__init__()
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.img_mod = Modulation(hidden_size, double=True)
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_attn = SelfAttention(
            dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
        )

        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

        self.txt_mod = Modulation(hidden_size, double=True)
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_attn = SelfAttention(
            dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
        )

        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor):
        pe_single, pe_double = pe
        p = 1
        if vec is None:
            # No conditioning vector: fall back to identity modulation
            # (shift=0, scale=1 after the `p +` offset below, gate=1).
            img_mod1, img_mod2 = ModulationOut(0, 1 - p, 1), ModulationOut(0, 1 - p, 1)
            txt_mod1, txt_mod2 = ModulationOut(0, 1 - p, 1), ModulationOut(0, 1 - p, 1)
        else:
            img_mod1, img_mod2 = self.img_mod(vec)
            txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
        img_modulated = (p + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = rearrange(
            img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
        )
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = (p + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = rearrange(
            txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
        )
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

        # run actual attention over the concatenated txt+img sequence
        q = torch.cat((txt_q, img_q), dim=2)
        k = torch.cat((txt_k, img_k), dim=2)
        v = torch.cat((txt_v, img_v), dim=2)

        attn = attention(q, k, v, pe=pe_double)
        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

        # calculate the img blocks
        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
        img = img + img_mod2.gate * self.img_mlp(
            (p + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
        )

        # calculate the txt blocks
        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
        txt = txt + txt_mod2.gate * self.txt_mlp(
            (p + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
        )
        return img, txt


class LastLayer(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        patch_size: int,
        out_channels: int,
        readout_zero_init=False,
    ):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        if MUP_ENABLED:
            self.linear = MuReadout(
                hidden_size,
                patch_size * patch_size * out_channels,
                bias=True,
                readout_zero_init=readout_zero_init,
            )
        else:
            self.linear = nn.Linear(
                hidden_size, patch_size * patch_size * out_channels, bias=True
            )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x: Tensor, vec) -> Tensor:
        if vec is not None:
            shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
            x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        # norm_final is applied unconditionally before the readout, matching
        # the original control flow.
        x = self.norm_final(x)
        x = self.linear(x)
        return x


@dataclass
class FluxParams:
    in_channels: int
    patch_size: int
    context_dim: int
    hidden_size: int
    mlp_ratio: float
    num_heads: int
    depth: int
    axes_dim: List[int]
    theta: int
    qkv_bias: bool


DIT_ZOO = dict(
    dit_xl_4=dict(
        hidden_size=1152,
        mlp_ratio=4.0,
        num_heads=16,
        axes_dim=[8, 28, 28],
        theta=10_000,
        qkv_bias=True,
    ),
    dit_l_4=dict(
        hidden_size=1024,
        mlp_ratio=4.0,
        num_heads=16,
        axes_dim=[8, 28, 28],
        theta=10_000,
        qkv_bias=True,
    ),
    dit_b_4=dict(
        hidden_size=768,
        mlp_ratio=4.0,
        num_heads=12,
        axes_dim=[8, 28, 28],
        theta=10_000,
        qkv_bias=True,
    ),
    dit_s_4=dict(
        hidden_size=384,
        mlp_ratio=4.0,
        num_heads=6,
        axes_dim=[8, 28, 28],
        theta=10_000,
        qkv_bias=True,
    ),
    dit_mup_test=dict(
        hidden_size=768,
        mlp_ratio=4.0,
        num_heads=12,
        axes_dim=[8, 28, 28],
        theta=10_000,
        qkv_bias=True,
    ),
)
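# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): how a DIT_ZOO preset combines
# with the per-model fields into a FluxParams, mirroring FlowMo.__init__.
# The concrete values here are hypothetical.
def _example_flux_params():
    patch_size = 4
    params = FluxParams(
        in_channels=3 * patch_size**2,  # RGB pixels per patch
        patch_size=patch_size,
        context_dim=18,
        depth=8,
        **DIT_ZOO["dit_b_4"],
    )
    # Invariants checked by Flux.__init__: hidden size divides evenly into
    # heads, and the rope axis dims sum to the per-head dim.
    assert params.hidden_size % params.num_heads == 0
    assert sum(params.axes_dim) == params.hidden_size // params.num_heads
# ---------------------------------------------------------------------------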
def prepare_idxs(img, code_length, patch_size):
    bs, c, h, w = img.shape

    img_ids = torch.zeros(h // patch_size, w // patch_size, 3, device=img.device)
    img_ids[..., 1] = (
        img_ids[..., 1] + torch.arange(h // patch_size, device=img.device)[:, None]
    )
    img_ids[..., 2] = (
        img_ids[..., 2] + torch.arange(w // patch_size, device=img.device)[None, :]
    )
    img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

    txt_ids = (
        torch.zeros((bs, code_length, 3), device=img.device)
        + torch.arange(code_length, device=img.device)[None, :, None]
    )
    return img_ids, txt_ids


class Flux(nn.Module):
    """Transformer model for flow matching on sequences."""

    def __init__(self, params: FluxParams, name="", lsg=False):
        super().__init__()

        self.name = name
        self.lsg = lsg
        self.params = params
        self.in_channels = params.in_channels
        self.patch_size = params.patch_size
        self.out_channels = self.in_channels
        if params.hidden_size % params.num_heads != 0:
            raise ValueError(
                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
            )
        pe_dim = params.hidden_size // params.num_heads
        if sum(params.axes_dim) != pe_dim:
            raise ValueError(
                f"Got {params.axes_dim} but expected positional dim {pe_dim}"
            )
        self.hidden_size = params.hidden_size
        self.num_heads = params.num_heads
        self.pe_embedder = EmbedND(
            dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim
        )
        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
        self.txt_in = nn.Linear(params.context_dim, self.hidden_size)

        self.double_blocks = nn.ModuleList(
            [
                DoubleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
                )
                for idx in range(params.depth)
            ]
        )

        self.final_layer_img = LastLayer(
            self.hidden_size, 1, self.out_channels, readout_zero_init=False
        )
        self.final_layer_txt = LastLayer(
            self.hidden_size, 1, params.context_dim, readout_zero_init=False
        )

    def forward(
        self,
        img: Tensor,
        img_ids: Tensor,
        txt: Tensor,
        txt_ids: Tensor,
        timesteps: Tensor,
    ) -> Tensor:
        b, c, h, w = img.shape
        img = rearrange(
            img,
            "b c (gh ph) (gw pw) -> b (gh gw) (ph pw c)",
            ph=self.patch_size,
            pw=self.patch_size,
        )
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")

        img = self.img_in(img)
        if timesteps is None:
            vec = None
        else:
            vec = self.time_in(timestep_embedding(timesteps, 256))

        txt = self.txt_in(txt)

        pe_single = self.pe_embedder(torch.cat((txt_ids,), dim=1))
        pe_double = self.pe_embedder(torch.cat((txt_ids, img_ids), dim=1))

        for block in self.double_blocks:
            img, txt = block(img=img, txt=txt, pe=(pe_single, pe_double), vec=vec)

        img = self.final_layer_img(img, vec=vec)
        img = rearrange(
            img,
            "b (gh gw) (ph pw c) -> b c (gh ph) (gw pw)",
            ph=self.patch_size,
            pw=self.patch_size,
            gh=h // self.patch_size,
            gw=w // self.patch_size,
        )
        txt = self.final_layer_txt(txt, vec=vec)
        return img, txt, {"final_txt": txt}


def get_weights_to_fix(model):
    with torch.no_grad():
        for name, module in model.named_modules():
            if "double_blocks" in name and isinstance(module, torch.nn.Linear):
                yield name, module.weight
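# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): the position ids produced by
# prepare_idxs. Image tokens carry (0, row, col); code tokens carry the same
# index on all three axes. Hypothetical sizes.
def _example_prepare_idxs():
    img = torch.zeros(2, 3, 16, 16)
    img_ids, txt_ids = prepare_idxs(img, code_length=5, patch_size=4)
    assert img_ids.shape == (2, (16 // 4) * (16 // 4), 3)  # one id per patch
    assert txt_ids.shape == (2, 5, 3)
    assert img_ids[0, 0].tolist() == [0.0, 0.0, 0.0]
    assert img_ids[0, -1].tolist() == [0.0, 3.0, 3.0]  # last row, last col
# ---------------------------------------------------------------------------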
class FlowMo(nn.Module):
    def __init__(self, width, config):
        super().__init__()
        code_length = config.model.code_length
        context_dim = config.model.context_dim
        enc_depth = config.model.enc_depth
        dec_depth = config.model.dec_depth

        patch_size = config.model.patch_size
        self.config = config

        self.image_size = config.data.image_size
        self.patch_size = config.model.patch_size
        self.code_length = code_length
        self.dit_mode = "dit_b_4"
        self.context_dim = context_dim
        # The KL quantizer needs a mean and a logvar per code feature, so the
        # encoder emits twice the context dim in that case.
        self.encoder_context_dim = context_dim * (
            1 + (self.config.model.quantization_type == "kl")
        )

        if config.model.quantization_type == "lfq":
            self.quantizer = LFQ(
                codebook_size=2**self.config.model.codebook_size_for_entropy,
                dim=self.config.model.codebook_size_for_entropy,
                num_codebooks=1,
                token_factorization=False,
            )

        if self.config.model.enc_mup_width is not None:
            enc_width = self.config.model.enc_mup_width
        else:
            enc_width = width

        encoder_params = FluxParams(
            in_channels=3 * patch_size**2,
            context_dim=self.encoder_context_dim,
            patch_size=patch_size,
            depth=enc_depth,
            **DIT_ZOO[self.dit_mode],
        )
        decoder_params = FluxParams(
            in_channels=3 * patch_size**2,
            context_dim=context_dim + 1,
            patch_size=patch_size,
            depth=dec_depth,
            **DIT_ZOO[self.dit_mode],
        )

        # width=4, dit_b_4 is the usual model
        encoder_params.hidden_size = enc_width * (encoder_params.hidden_size // 4)
        decoder_params.hidden_size = width * (decoder_params.hidden_size // 4)
        encoder_params.axes_dim = [(d // 4) * enc_width for d in encoder_params.axes_dim]
        decoder_params.axes_dim = [(d // 4) * width for d in decoder_params.axes_dim]

        self.encoder = Flux(encoder_params, name="encoder")
        self.decoder = Flux(decoder_params, name="decoder")

    @torch.compile
    def encode(self, img):
        b, c, h, w = img.shape
        img_idxs, txt_idxs = prepare_idxs(img, self.code_length, self.patch_size)
        txt = torch.zeros(
            (b, self.code_length, self.encoder_context_dim), device=img.device
        )
        _, code, aux = self.encoder(img, img_idxs, txt, txt_idxs, timesteps=None)
        return code, aux

    def _decode(self, img, code, timesteps):
        b, c, h, w = img.shape
        img_idxs, txt_idxs = prepare_idxs(
            img,
            self.code_length,
            self.patch_size,
        )
        pred, _, decode_aux = self.decoder(
            img, img_idxs, code, txt_idxs, timesteps=timesteps
        )
        return pred, decode_aux

    @torch.compile
    def decode(self, *args, **kwargs):
        return self._decode(*args, **kwargs)

    @torch.compile
    def decode_checkpointed(self, *args, **kwargs):
        # Need to compile(checkpoint), not checkpoint(compile)
        assert not kwargs, kwargs
        return torch.utils.checkpoint.checkpoint(
            self._decode,
            *args,
            # WARNING: Do not use_reentrant=True with compile, it will silently
            # produce incorrect gradients!
            use_reentrant=False,
        )

    @torch.compile
    def _quantize(self, code):
        """
        Args:
            code: [b, code_length, context_dim]

        Returns:
            quantized code of the same shape
        """
        b, t, f = code.shape
        indices = None
        if self.config.model.quantization_type == "noop":
            quantized = code
            quantizer_loss = torch.tensor(0.0).to(code.device)
        elif self.config.model.quantization_type == "kl":
            # colocating features of same token before split is maybe slightly
            # better?
            mean, logvar = _get_diagonal_gaussian(
                einops.rearrange(code, "b t f -> b (f t)")
            )
            code = einops.rearrange(
                _sample_diagonal_gaussian(mean, logvar),
                "b (f t) -> b t f",
                f=f // 2,
                t=t,
            )
            quantizer_loss = _kl_diagonal_gaussian(mean, logvar)
        elif self.config.model.quantization_type == "lfq":
            assert f % self.config.model.codebook_size_for_entropy == 0, f
            code = einops.rearrange(
                code,
                "b t (fg fh) -> b fg (t fh)",
                fg=self.config.model.codebook_size_for_entropy,
            )

            (quantized, entropy_aux_loss, indices), breakdown = self.quantizer(
                code, return_loss_breakdown=True
            )
            assert quantized.shape == code.shape
            quantized = einops.rearrange(quantized, "b fg (t fh) -> b t (fg fh)", t=t)

            quantizer_loss = (
                entropy_aux_loss * self.config.model.entropy_loss_weight
                + breakdown.commitment * self.config.model.commit_loss_weight
            )
            code = quantized
        else:
            raise NotImplementedError
        return code, indices, quantizer_loss

    def forward(
        self,
        img,
        noised_img,
        timesteps,
        enable_cfg=True,
    ):
        # Training forward pass: encode, quantize, then predict the velocity
        # for the noised image. rf_loss below relies on this signature.
        aux = {}

        code, encode_aux = self.encode(img)

        aux["original_code"] = code

        b, t, f = code.shape

        code, _, aux["quantizer_loss"] = self._quantize(code)

        mask = torch.ones_like(code[..., :1])
        code = torch.concatenate([code, mask], axis=-1)
        code_pre_cfg = code

        if self.config.model.enable_cfg and enable_cfg:
            cfg_mask = (torch.rand((b,), device=code.device) > 0.1)[:, None, None]
            code = code * cfg_mask

        v_est, decode_aux = self.decode(noised_img, code, timesteps)
        aux.update(decode_aux)

        if self.config.model.posttrain_sample:
            aux["posttrain_sample"] = self.reconstruct_checkpoint(code_pre_cfg)

        return v_est, aux

    def reconstruct_checkpoint(self, code):
        with torch.autocast(
            "cuda",
            dtype=torch.bfloat16,
        ):
            bs, *_ = code.shape

            z = torch.randn((bs, 3, self.image_size, self.image_size)).cuda()

            ts = (
                torch.rand((bs, self.config.model.posttrain_sample_k + 1))
                .cumsum(dim=1)
                .cuda()
            )
            ts = ts - ts[:, :1]
            ts = (ts / ts[:, -1:]).flip(dims=(1,))
            dts = ts[:, :-1] - ts[:, 1:]
            for i, (t, dt) in enumerate(zip(ts.T, dts.T)):
                if self.config.model.posttrain_sample_enable_cfg:
                    mask = (torch.rand((bs,), device=code.device) > 0.1)[
                        :, None, None
                    ].to(code.dtype)
                    code_t = code * mask
                else:
                    code_t = code
                vc, _ = self.decode_checkpointed(z, code_t, t)

                z = z - dt[:, None, None, None] * vc
        return z

    @torch.no_grad()
    def reconstruct(self, images, dtype=torch.bfloat16, code=None):
        """
        Args:
            images in [bchw] [-1, 1]

        Returns:
            images in [bchw] [-1, 1]
        """
        model = self
        config = self.config.eval.sampling
        # Stays None when a precomputed code is passed in.
        prequantized_code = None

        with torch.autocast(
            "cuda",
            dtype=dtype,
        ):
            bs, c, h, w = images.shape

            if code is None:
                x = images.cuda()
                prequantized_code = model.encode(x)[0].cuda()
                code, indices, _ = model._quantize(prequantized_code)

            z = torch.randn((bs, 3, h, w)).cuda()

            mask = torch.ones_like(code[..., :1])
            code = torch.concatenate([code * mask, mask], axis=-1)
            cfg_mask = 0.0
            null_code = code * cfg_mask if config.cfg != 1.0 else None

            samples = rf_sample(
                model,
                z,
                code,
                null_code=null_code,
                sample_steps=config.sample_steps,
                cfg=config.cfg,
                schedule=config.schedule,
            )[-1].clip(-1, 1)
        return samples.to(torch.float32), code, prequantized_code
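# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): the feature regrouping used in
# FlowMo._quantize for the "lfq" path. With code [b, t, f] and
# codebook_size_for_entropy = fg, features are regrouped so the quantizer
# sees fg as its channel dim, and the grouping inverts exactly. Hypothetical
# sizes; fg must divide f.
def _example_lfq_grouping():
    b, t, f, fg = 2, 4, 18, 9
    code = torch.randn(b, t, f)
    grouped = einops.rearrange(code, "b t (fg fh) -> b fg (t fh)", fg=fg)
    assert grouped.shape == (b, fg, t * (f // fg))
    ungrouped = einops.rearrange(grouped, "b fg (t fh) -> b t (fg fh)", t=t)
    assert torch.equal(code, ungrouped)  # pure permutation, exact round trip
# ---------------------------------------------------------------------------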
def rf_loss(config, model, batch, aux_state):
    x = batch["image"]
    b = x.size(0)

    if config.opt.schedule == "lognormal":
        nt = torch.randn((b,)).to(x.device)
        t = torch.sigmoid(nt)
    elif config.opt.schedule == "fat_lognormal":
        nt = torch.randn((b,)).to(x.device)
        t = torch.sigmoid(nt)
        t = torch.where(torch.rand_like(t) <= 0.9, t, torch.rand_like(t))
    elif config.opt.schedule == "uniform":
        t = torch.rand((b,), device=x.device)
    elif config.opt.schedule.startswith("debug"):
        p = float(config.opt.schedule.split("_")[1])
        t = torch.ones((b,), device=x.device) * p
    else:
        raise NotImplementedError

    t = t.view([b, *([1] * len(x.shape[1:]))])
    z1 = torch.randn_like(x)
    zt = (1 - t) * x + t * z1

    zt, t = zt.to(x.dtype), t.to(x.dtype)

    vtheta, aux = model(
        img=x,
        noised_img=zt,
        timesteps=t.reshape((b,)),
    )

    diff = z1 - vtheta - x
    x_pred = zt - vtheta * t

    loss = ((diff) ** 2).mean(dim=list(range(1, len(x.shape))))
    loss = loss.mean()

    aux["loss_dict"] = {}
    aux["loss_dict"]["diffusion_loss"] = loss
    aux["loss_dict"]["quantizer_loss"] = aux["quantizer_loss"]

    if config.opt.lpips_weight != 0.0:
        aux_loss = 0.0
        if config.model.posttrain_sample:
            x_pred = aux["posttrain_sample"]
        lpips_dist = aux_state["lpips_model"](x, x_pred)
        lpips_dist = (config.opt.lpips_weight * lpips_dist).mean() + aux_loss
        aux["loss_dict"]["lpips_loss"] = lpips_dist
    else:
        lpips_dist = 0.0

    loss = loss + aux["quantizer_loss"] + lpips_dist
    aux["loss_dict"]["total_loss"] = loss
    return loss, aux


def _edm_to_flow_convention(noise_level):
    # z = x + \sigma z'
    return noise_level / (1 + noise_level)
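# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): the step discretizations used
# by rf_sample below. "linear" integrates from t=1 to t=0 in uniform steps;
# "pow_p" spaces ts as k^(1/p), giving finer steps near t=1 and coarser ones
# near t=0. Hypothetical step count.
def _example_sample_schedule():
    n = 4
    ts = torch.arange(1, n + 1).flip(0) / n  # [1.00, 0.75, 0.50, 0.25]
    dts = torch.ones_like(ts) * (1.0 / n)  # constant 0.25
    assert torch.allclose(ts - dts, torch.tensor([0.75, 0.50, 0.25, 0.0]))
    p = 2.0
    ts_pow = torch.arange(0, n + 1).flip(0) ** (1 / p) / n ** (1 / p)
    assert ts_pow[0] == 1.0 and ts_pow[-1] == 0.0
# ---------------------------------------------------------------------------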
def rf_sample(
    model,
    z,
    code,
    null_code=None,
    sample_steps=25,
    cfg=2.0,
    schedule="linear",
):
    b = z.size(0)
    if schedule == "linear":
        ts = torch.arange(1, sample_steps + 1).flip(0) / sample_steps
        dts = torch.ones_like(ts) * (1.0 / sample_steps)
    elif schedule.startswith("pow"):
        p = float(schedule.split("_")[1])
        ts = torch.arange(0, sample_steps + 1).flip(0) ** (1 / p) / sample_steps ** (
            1 / p
        )
        dts = ts[:-1] - ts[1:]
    else:
        raise NotImplementedError

    if model.config.eval.sampling.cfg_interval is None:
        interval = None
    else:
        cfg_lo, cfg_hi = ast.literal_eval(model.config.eval.sampling.cfg_interval)
        interval = _edm_to_flow_convention(cfg_lo), _edm_to_flow_convention(cfg_hi)

    images = []
    for i, (t, dt) in enumerate(zip(ts, dts)):
        timesteps = torch.tensor([t] * b).to(z.device)
        vc, decode_aux = model.decode(img=z, timesteps=timesteps, code=code)
        if null_code is not None and (
            interval is None
            or ((t.item() >= interval[0]) and (t.item() <= interval[1]))
        ):
            vu, _ = model.decode(img=z, timesteps=timesteps, code=null_code)
            vc = vu + cfg * (vc - vu)

        z = z - dt * vc
        images.append(z)
    return images


def build_model(config):
    # MUP_ENABLED is read at module level (e.g. by LastLayer), so it must be
    # declared global here; a bare assignment would only create a local.
    global MUP_ENABLED

    with tempfile.TemporaryDirectory() as log_dir:
        MUP_ENABLED = config.model.enable_mup
        model_partial = FlowMo
        shared_kwargs = dict(config=config)
        model = model_partial(
            **shared_kwargs,
            width=config.model.mup_width,
        ).cuda()

        if config.model.enable_mup:
            print("Mup enabled!")
            with torch.device("cpu"):
                base_model = model_partial(
                    **shared_kwargs, width=config.model.mup_width
                )
                delta_model = model_partial(
                    **shared_kwargs,
                    width=(
                        config.model.mup_width * 4
                        if config.model.mup_width == 1
                        else 1
                    ),
                )
                true_model = model_partial(
                    **shared_kwargs, width=config.model.mup_width
                )

            if torch.distributed.is_initialized():
                bsh_path = os.path.join(log_dir, f"{dist.get_rank()}.bsh")
            else:
                bsh_path = os.path.join(log_dir, "0.bsh")
            set_base_shapes(
                true_model, base_model, delta=delta_model, savefile=bsh_path
            )
            model = set_base_shapes(model, base=bsh_path)
            for module in model.modules():
                if isinstance(module, MuReadout):
                    # Bind `module` as a default argument; a plain closure
                    # would capture the loop variable and every readout would
                    # end up using the width_mult of the last module.
                    module.width_mult = (
                        lambda m=module: m.weight.infshape.width_mult()
                    )
    return model
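# ---------------------------------------------------------------------------
# Illustrative usage sketch (not executed; not part of the model code). The
# config keys mirror the ones read in this file, but the path and values are
# hypothetical and a real config contains many more fields:
#
#   config = OmegaConf.load("configs/flowmo.yaml")  # hypothetical path
#   model = build_model(config)
#   images = torch.randn(4, 3, config.data.image_size, config.data.image_size)
#   recon, code, prequantized = model.reconstruct(images.cuda())
# ---------------------------------------------------------------------------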