HReynaud committed
Commit dab5199 · 0 Parent(s)

first commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,180 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ tmp/
+ .vscode/
+ .gradio/
+ .cursor/
+ *.mp4
README.md ADDED
@@ -0,0 +1,18 @@
+ ---
+ title: EchoFlow
+ emoji: 💙
+ colorFrom: gray
+ colorTo: red
+ sdk: gradio
+ sdk_version: 5.22.0
+ app_file: demo.py
+ pinned: true
+ license: apache-2.0
+ python_version: 3.11.8
+ models:
+ - HReynaud/EchoFlow
+ datasets:
+ - HReynaud/EchoFlow
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
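The front matter above wires the Space to demo.py and references HReynaud/EchoFlow as both a model and a dataset repo. A minimal sketch (assuming huggingface_hub is installed and the repo is accessible to you) of pulling those weights locally:

# Sketch: fetch the model repo referenced in the Space front matter.
from huggingface_hub import snapshot_download

local_path = snapshot_download(
    repo_id="HReynaud/EchoFlow",
    repo_type="model",   # the same ID is also published as a dataset repo
    # token="hf_...",    # only needed for gated/private access
)
print(local_path)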
assets/anatomies_dynamic.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d8bf0fa238ca8b4ccdf8457fc8b248cebd52b005d9385115db773ec8005dc29
+ size 10271965
assets/anatomies_lvh.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dfe6ff14cb9e6ba9a8d79e770423096f3bd9fa072b2a8fc984150f6e5fd91fe9
+ size 11179209
assets/anatomies_ped_a4c.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2675b28071004ad15f060f057ae13330f1f61369500d7507fadefe7b5ae9c74
+ size 3364061
assets/anatomies_ped_psax.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:881e51666a580b2830a27d7e97055a0f2ab037152547aa79af1448a2b5f65ccb
+ size 4635874
assets/h1.png ADDED
assets/h2.png ADDED
assets/h3.png ADDED
assets/h4.png ADDED
assets/scaling.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bbcd8f8cf990d57b96ce7e544e5f9b48b7ad2400dfc4080e0651575f666b19ac
+ size 1432
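The .pt entries above are Git LFS pointer files (spec version, content hash, size); the tensors themselves are fetched at checkout. A small sketch of how assets/scaling.pt is consumed: demo.py loads it with torch.load, and unscale_latents expects "mean" and "std" entries.

# Sketch: load the VAE scaling statistics the way get_vae_scaler() in demo.py does.
import torch

scaler = torch.load("assets/scaling.pt")               # dict with "mean" and "std" tensors
scaler = {k: v.to("cpu") for k, v in scaler.items()}   # demo.py moves these to the GPU instead
print({k: tuple(v.shape) for k, v in scaler.items()})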
assets/seg.png ADDED
demo.py ADDED
@@ -0,0 +1,945 @@
1
+ import json
2
+ import os
3
+ import types
4
+ from urllib.parse import urlparse
5
+
6
+ import cv2
7
+ import diffusers
8
+ import gradio as gr
9
+ import numpy as np
10
+ import torch
11
+ from einops import rearrange
12
+ from huggingface_hub import hf_hub_download
13
+ from omegaconf import OmegaConf
14
+ from PIL import Image, ImageOps
15
+ from safetensors.torch import load_file
16
+ from torch.nn import functional as F
17
+ from torchdiffeq import odeint_adjoint as odeint
18
+
19
+ from echoflow.common import instantiate_class_from_config, unscale_latents
20
+ from echoflow.common.models import (
21
+ ContrastiveModel,
22
+ DiffuserSTDiT,
23
+ ResNet18,
24
+ SegDiTTransformer2DModel,
25
+ )
26
+
27
+ torch.set_grad_enabled(False)
28
+
29
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
+ dtype = torch.float32
31
+
32
+ # 4f4 latent space
33
+ B, T, C, H, W = 1, 64, 4, 28, 28
34
+
35
+ VIEWS = ["A4C", "PSAX", "PLAX"]
36
+
37
+
38
+ def load_model(path):
39
+ if path.startswith("http"):
40
+ parsed_url = urlparse(path)
41
+ if "huggingface.co" in parsed_url.netloc:
42
+ parts = parsed_url.path.strip("/").split("/")
43
+ repo_id = "/".join(parts[:2])
44
+
45
+ subfolder = None
46
+ if len(parts) > 3:
47
+ subfolder = "/".join(parts[4:])
48
+
49
+ local_root = "./tmp"
50
+ local_dir = os.path.join(local_root, repo_id.replace("/", "_"))
51
+ if subfolder:
52
+ local_dir = os.path.join(local_root, subfolder)
53
+ os.makedirs(local_root, exist_ok=True)
54
+
55
+ config_file = hf_hub_download(
56
+ repo_id=repo_id,
57
+ subfolder=subfolder,
58
+ filename="config.json",
59
+ local_dir=local_root,
60
+ repo_type="model",
61
+ token=os.getenv("READ_HF_TOKEN"),
62
+ local_dir_use_symlinks=False,
63
+ )
64
+
65
+ assert os.path.exists(config_file)
66
+
67
+ hf_hub_download(
68
+ repo_id=repo_id,
69
+ filename="diffusion_pytorch_model.safetensors",
70
+ subfolder=subfolder,
71
+ local_dir=local_root,
72
+ local_dir_use_symlinks=False,
73
+ token=os.getenv("READ_HF_TOKEN"),
74
+ )
75
+
76
+ path = local_dir
77
+
78
+ model_root = os.path.join(config_file.split("config.json")[0])
79
+ json_path = os.path.join(model_root, "config.json")
80
+ assert os.path.exists(json_path)
81
+
82
+ with open(json_path, "r") as f:
83
+ config = json.load(f)
84
+
85
+ klass_name = config["_class_name"]
86
+ klass = getattr(diffusers, klass_name, None) or globals().get(klass_name, None)
87
+ assert (
88
+ klass is not None
89
+ ), f"Could not find class {klass_name} in diffusers or global scope."
90
+ assert hasattr(
91
+ klass, "from_pretrained"
92
+ ), f"Class {klass_name} does not support 'from_pretrained'."
93
+
94
+ return klass.from_pretrained(path)
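load_model turns a huggingface.co "tree" URL into a repo ID plus subfolder, downloads config.json and diffusion_pytorch_model.safetensors, and instantiates whatever class _class_name names via from_pretrained. A sketch of a call, mirroring the checkpoint loaded further down in this file:

# Sketch only; the same URL is used for the real `lifm` model below.
lifm_example = load_model(
    "https://huggingface.co/HReynaud/EchoFlow/tree/main/lifm/FMiT-S2-4f4"
)
lifm_example = lifm_example.to(device, dtype=dtype).eval()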
95
+
96
+
97
+ def load_reid(path):
98
+ parsed_url = urlparse(path)
99
+ parts = parsed_url.path.strip("/").split("/")
100
+ repo_id = "/".join(parts[:2])
101
+ subfolder = "/".join(parts[4:])
102
+
103
+ local_root = "./tmp"
104
+
105
+ config_file = hf_hub_download(
106
+ repo_id=repo_id,
107
+ subfolder=subfolder,
108
+ filename="config.yaml",
109
+ local_dir=local_root,
110
+ repo_type="model",
111
+ token=os.getenv("READ_HF_TOKEN"),
112
+ local_dir_use_symlinks=False,
113
+ )
114
+
115
+ weights_file = hf_hub_download(
116
+ repo_id=repo_id,
117
+ subfolder=subfolder,
118
+ filename="backbone.safetensors",
119
+ local_dir=local_root,
120
+ repo_type="model",
121
+ token=os.getenv("READ_HF_TOKEN"),
122
+ local_dir_use_symlinks=False,
123
+ )
124
+
125
+ config = OmegaConf.load(config_file)
126
+ backbone = instantiate_class_from_config(config.backbone)
127
+ backbone = ContrastiveModel.patch_backbone(
128
+ backbone, config.model.args.in_channels, config.model.args.out_channels
129
+ )
130
+ state_dict = load_file(weights_file)
131
+ backbone.load_state_dict(state_dict)
132
+ backbone = backbone.to(device, dtype=dtype)
133
+ backbone.eval()
134
+ return backbone
135
+
136
+
137
+ def get_vae_scaler(path):
138
+ scaler = torch.load(path)
139
+ scaler = {k: v.to(device) for k, v in scaler.items()}
140
+ return scaler
141
+
142
+
143
+ generator = torch.Generator(device=device).manual_seed(0)
144
+
145
+ lifm = load_model("https://huggingface.co/HReynaud/EchoFlow/tree/main/lifm/FMiT-S2-4f4")
146
+ lifm = lifm.to(device, dtype=dtype)
147
+ lifm.eval()
148
+
149
+ vae = load_model("https://huggingface.co/HReynaud/EchoFlow/tree/main/vae/avae-4f4")
150
+ vae = vae.to(device, dtype=dtype)
151
+ vae.eval()
152
+ vae_scaler = get_vae_scaler("assets/scaling.pt")
153
+
154
+ reid = {
155
+ "anatomies": {
156
+ "A4C": torch.cat(
157
+ [
158
+ torch.load("assets/anatomies_dynamic.pt"),
159
+ torch.load("assets/anatomies_ped_a4c.pt"),
160
+ ],
161
+ dim=0,
162
+ ),
163
+ "PSAX": torch.load("assets/anatomies_ped_psax.pt"),
164
+ "PLAX": torch.load("assets/anatomies_lvh.pt"),
165
+ },
166
+ "models": {
167
+ "A4C": load_reid(
168
+ "https://huggingface.co/HReynaud/EchoFlow/tree/main/reid/dynamic-4f4"
169
+ ),
170
+ "PSAX": load_reid(
171
+ "https://huggingface.co/HReynaud/EchoFlow/tree/main/reid/ped_psax-4f4"
172
+ ),
173
+ "PLAX": load_reid(
174
+ "https://huggingface.co/HReynaud/EchoFlow/tree/main/reid/lvh-4f4"
175
+ ),
176
+ },
177
+ "tau": {
178
+ "A4C": 0.9997,
179
+ "PSAX": 0.9953,
180
+ "PLAX": 0.9950,
181
+ },
182
+ }
183
+
184
+ lvfm = load_model("https://huggingface.co/HReynaud/EchoFlow/tree/main/lvfm/FMvT-S2-4f4")
185
+ lvfm = lvfm.to(device, dtype=dtype)
186
+ lvfm.eval()
187
+
188
+
189
+ def load_default_mask():
190
+ """Load the default mask from disk. If not found, return a blank black mask."""
191
+ default_mask_path = os.path.join("assets", "default_mask.png")
192
+ try:
193
+ if os.path.exists(default_mask_path):
194
+ mask = Image.open(default_mask_path).convert("L")
195
+ # Ensure the mask is square and of proper size
196
+ mask = mask.resize((400, 400), Image.Resampling.LANCZOS)
197
+ # Make sure it's binary (0 or 255)
198
+ mask = ImageOps.autocontrast(mask, cutoff=0)
199
+ return np.array(mask)
200
+ except Exception as e:
201
+ print(f"Error loading default mask: {e}")
202
+
203
+ # Return a blank black mask if no default mask is found
204
+ return np.zeros((400, 400), dtype=np.uint8)
205
+
206
+
207
+ def preprocess_mask(mask):
208
+ """Ensure mask is properly formatted for the model."""
209
+ if mask is None:
210
+ return np.zeros((112, 112), dtype=np.uint8)
211
+
212
+ # Check if mask is an EditorValue with multiple parts
213
+ if isinstance(mask, dict) and "composite" in mask:
214
+ # Use the composite image from the ImageEditor
215
+ mask = mask["composite"]
216
+
217
+ # If mask is already a numpy array, convert to PIL for processing
218
+ if isinstance(mask, np.ndarray):
219
+ mask_pil = Image.fromarray(mask)
220
+ else:
221
+ mask_pil = mask
222
+
223
+ # Ensure the mask is in L mode (grayscale)
224
+ mask_pil = mask_pil.convert("L")
225
+
226
+ # Apply contrast to make it binary (0 or 255)
227
+ mask_pil = ImageOps.autocontrast(mask_pil, cutoff=0)
228
+
229
+ # Threshold to ensure binary values
230
+ mask_pil = mask_pil.point(lambda p: 255 if p > 127 else 0)
231
+
232
+ # Print sizes for debugging
233
+ # print(f"Original mask size: {mask_pil.size}")
234
+
235
+ # Resize to 112x112 for the model
236
+ mask_pil = mask_pil.resize((112, 112), Image.Resampling.LANCZOS)
237
+
238
+ # Convert back to numpy array
239
+ return np.array(mask_pil)
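preprocess_mask accepts either a plain array or the dict that gr.ImageEditor produces and always returns a 112x112 binary uint8 mask. A tiny usage sketch with a placeholder input:

# Sketch: the gr.ImageEditor value is a dict; preprocess_mask reads its "composite" image.
import numpy as np

editor_value = {"composite": np.zeros((400, 400), dtype=np.uint8)}  # placeholder input
model_mask = preprocess_mask(editor_value)
print(model_mask.shape, model_mask.dtype)  # (112, 112) uint8, values 0 or 255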
240
+
241
+
242
+ def generate_latent_image(mask, class_selection, sampling_steps=50):
243
+ """Generate a latent image based on mask, class selection, and sampling steps"""
244
+
245
+ # Mask
246
+ mask = preprocess_mask(mask)
247
+ mask = torch.from_numpy(mask).to(device, dtype=dtype)
248
+ mask = mask.unsqueeze(0).unsqueeze(0)
249
+ mask = F.interpolate(mask, size=(H, W), mode="bilinear", align_corners=False)
250
+ mask = 1.0 * (mask > 0)
251
+
252
+ # print(mask.shape, mask.min(), mask.max(), mask.mean(), mask.std())
253
+
254
+ # Class
255
+ class_idx = VIEWS.index(class_selection)
256
+ class_idx = torch.tensor([class_idx], device=device, dtype=torch.long)
257
+
258
+ # Timesteps
259
+ timesteps = torch.linspace(
260
+ 1.0, 0.0, steps=sampling_steps + 1, device=device, dtype=dtype
261
+ )
262
+
263
+ forward_kwargs = {
264
+ "class_labels": class_idx, # B x 1
265
+ "segmentation": mask, # B x 1 x H x W
266
+ }
267
+
268
+ z_1 = torch.randn(
269
+ (B, C, H, W),
270
+ device=device,
271
+ dtype=dtype,
272
+ generator=generator,
273
+ )
274
+
275
+ lifm.forward_original = lifm.forward
276
+
277
+ def new_forward(self, t, y, *args, **kwargs):
278
+ kwargs = {**kwargs, **forward_kwargs}
279
+ return self.forward_original(y, t.view(1), *args, **kwargs).sample
280
+
281
+ lifm.forward = types.MethodType(new_forward, lifm)
282
+
283
+ # Use odeint to integrate
284
+ with torch.autocast("cuda"):
285
+ latent_image = odeint(
286
+ lifm,
287
+ z_1,
288
+ timesteps,
289
+ atol=1e-5,
290
+ rtol=1e-5,
291
+ adjoint_params=lifm.parameters(),
292
+ method="euler",
293
+ )[-1]
294
+
295
+ lifm.forward = lifm.forward_original
296
+
297
+ latent_image = latent_image.detach().cpu().numpy()
298
+
299
+ # callm VAE here
300
+
301
+ return latent_image # B x C x H x W
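The sampler above integrates the learned flow from t = 1 (noise) to t = 0 with fixed-step Euler via odeint. Roughly, each step is z <- z + (t_next - t_curr) * v(z, t); a minimal hand-rolled sketch of the same loop, assuming the same forward_kwargs and skipping the adjoint machinery:

# Hypothetical fixed-step Euler sampler equivalent to the odeint call above (not used by the app).
def euler_sample(model, z, timesteps, forward_kwargs):
    for t_curr, t_next in zip(timesteps[:-1], timesteps[1:]):
        v = model(z, t_curr.view(1), **forward_kwargs).sample  # predicted velocity field
        z = z + (t_next - t_curr) * v  # dt < 0: walks the sample from noise back to data
    return z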
302
+
303
+
304
+ def decode_images(latents, vae):
305
+ """Decode latent representations to pixel space using a VAE.
306
+
307
+ Args:
308
+ latents: A numpy array of shape [B, C, H, W] for single image
309
+ or [B, C, T, H, W] for sequences/animations
310
+ vae: The VAE model for decoding
311
+
312
+ Returns:
313
+ numpy array of decoded images in [H, W, 3] format for a single image (batch squeezed)
314
+ or [B, C, T, H, W] for sequences
315
+ """
316
+ if latents is None:
317
+ return None
318
+
319
+ # Convert to torch tensor if needed
320
+ if not isinstance(latents, torch.Tensor):
321
+ latents = torch.from_numpy(latents).to(device, dtype=dtype)
322
+
323
+ # Unscale latents
324
+ latents = unscale_latents(latents, vae_scaler)
325
+
326
+ # Handle both single images and sequences
327
+ is_sequence = len(latents.shape) == 5 # B C T H W
328
+
329
+ # print("Sequence:", is_sequence)
330
+
331
+ if is_sequence:
332
+ B, C, T, H, W = latents.shape
333
+ latents = rearrange(latents[0], "c t h w -> t c h w")
334
+ else:
335
+ B, C, H, W = latents.shape
336
+
337
+ # print("Latents:", latents.shape)
338
+
339
+ with torch.no_grad():
340
+ # Decode latents to pixel space
341
+ # decode one by one
342
+ decoded = []
343
+ for i in range(latents.shape[0]):
344
+ decoded.append(vae.decode(latents[i : i + 1].float()).sample)
345
+ decoded = torch.cat(decoded, dim=0)
346
+
347
+ decoded = (decoded + 1) * 128
348
+ decoded = decoded.clamp(0, 255).to(torch.uint8).cpu()
349
+
350
+ if is_sequence:
351
+ # Reshape back to [B, C, T, H, W] for sequences
352
+ decoded = rearrange(decoded, "t c h w -> c t h w").unsqueeze(0)
353
+ else:
354
+ decoded = decoded.squeeze()
355
+ decoded = decoded.permute(1, 2, 0)
356
+
357
+ # print("Decoded:", decoded.shape)
358
+ return decoded.numpy()
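The (decoded + 1) * 128 line above maps the VAE's roughly [-1, 1] output into 8-bit pixels before clamping; in isolation:

# Value-range sketch for the latent-to-uint8 mapping used above.
import numpy as np

x = np.array([-1.0, 0.0, 1.0])
pixels = np.clip((x + 1) * 128, 0, 255).astype(np.uint8)  # -> [0, 128, 255]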
359
+
360
+
361
+ def decode_latent_to_pixel(latent_image):
362
+ """Decode a single latent image to pixel space"""
363
+ global vae
364
+ if latent_image is None:
365
+ return None
366
+
367
+ # Add batch dimension if needed
368
+ if len(latent_image.shape) == 3:
369
+ latent_image = latent_image[None, ...]
370
+
371
+ decoded_image = decode_images(latent_image, vae)
372
+ decoded_image = cv2.resize(
373
+ decoded_image, (400, 400), interpolation=cv2.INTER_NEAREST
374
+ )
375
+
376
+ return decoded_image
377
+
378
+
379
+ def check_privacy(latent_image_numpy, class_selection):
380
+ """Check if the latent image is too similar to database images"""
381
+ latent_image = torch.from_numpy(latent_image_numpy).to(device, dtype=dtype)
382
+ reid_model = reid["models"][class_selection].to(device, dtype=dtype)
383
+ real_anatomies = reid["anatomies"][class_selection] # already scaled
384
+ tau = reid["tau"][class_selection]
385
+
386
+ with torch.no_grad():
387
+ features = reid_model(latent_image).sigmoid().cpu()
388
+
389
+ corr = torch.corrcoef(torch.cat([real_anatomies, features], dim=0))[0, 1:]
390
+ corr = corr.max()
391
+
392
+ if corr > tau:
393
+ return (
394
+ None,
395
+ f"⚠️ **Warning:** Generated image is too similar to training data. Privacy check failed (corr = {corr:.4f} / tau = {tau:.4f})",
396
+ )
397
+ else:
398
+ return (
399
+ latent_image_numpy,
400
+ f"✅ **Success:** Generated image passed privacy check (corr = {corr:.4f} / tau = {tau:.4f})",
401
+ )
402
+
403
+
404
+ def generate_animation(
405
+ latent_image, ejection_fraction, sampling_steps=50, cfg_scale=1.0
406
+ ):
407
+ """Generate an animated sequence of latent images based on EF"""
408
+ # print(
409
+ # f"Generating animation with EF = {ejection_fraction}, steps = {sampling_steps}, CFG = {cfg_scale}"
410
+ # )
411
+ # print(latent_image.shape, type(latent_image))
412
+
413
+ if latent_image is None:
414
+ return None
415
+
416
+ lvefs = torch.tensor([ejection_fraction / 100.0], device=device, dtype=dtype)
417
+ lvefs = lvefs[:, None, None].to(device, dtype)
418
+ uncond_lvefs = -1 * torch.ones_like(lvefs)
419
+
420
+ ref_images = torch.from_numpy(latent_image).to(device, dtype)
421
+ ref_images = ref_images[:, :, None, :, :] # B x C x 1 x H x W
422
+ ref_images = ref_images.repeat(1, 1, T, 1, 1) # B x C x T x H x W
423
+ uncond_images = torch.zeros_like(ref_images)
424
+
425
+ timesteps = torch.linspace(
426
+ 1.0, 0.0, steps=sampling_steps + 1, device=device, dtype=dtype
427
+ )
428
+
429
+ forward_kwargs = {
430
+ "encoder_hidden_states": lvefs,
431
+ "cond_image": ref_images,
432
+ }
433
+
434
+ z_1 = torch.randn(
435
+ (B, C, T, H, W),
436
+ device=device,
437
+ dtype=dtype,
438
+ generator=generator,
439
+ )
440
+
441
+ # print(
442
+ # z_1.shape,
443
+ # forward_kwargs["encoder_hidden_states"].shape,
444
+ # forward_kwargs["cond_image"].shape,
445
+ # )
446
+
447
+ lvfm.forward_original = lvfm.forward
448
+
449
+ def new_forward(self, t, y, *args, **kwargs):
450
+ kwargs = {**kwargs, **forward_kwargs}
451
+ # y has shape (B, C, T, H, W)
452
+
453
+ pred = self.forward_original(y, t.repeat(y.size(0)), *args, **kwargs).sample
454
+
455
+ if cfg_scale != 1.0:
456
+ uncond_kwargs = {
457
+ "encoder_hidden_states": uncond_lvefs,
458
+ "cond_image": uncond_images,
459
+ }
460
+ uncond_pred = self.forward_original(
461
+ y, t.repeat(y.size(0)), *args, **uncond_kwargs
462
+ ).sample
463
+
464
+ pred = uncond_pred + cfg_scale * (pred - uncond_pred)
465
+
466
+ return pred
467
+
468
+ lvfm.forward = types.MethodType(new_forward, lvfm)
469
+
470
+ with torch.autocast("cuda"):
471
+ synthetic_video = odeint(
472
+ lvfm,
473
+ z_1,
474
+ timesteps,
475
+ atol=1e-5,
476
+ rtol=1e-5,
477
+ adjoint_params=lvfm.parameters(),
478
+ method="euler",
479
+ )[-1]
480
+
481
+ lvfm.forward = lvfm.forward_original
482
+
483
+ # print("Synthetic video:", synthetic_video.shape)
484
+
485
+ return synthetic_video # B x C x T x H x W
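When cfg_scale != 1.0, the patched forward above runs the model twice, once with the EF/reference-frame conditioning and once with null conditioning, and blends the two predictions. The blend is the standard classifier-free guidance formula; as a standalone sketch:

# Sketch of the classifier-free guidance blend used in new_forward above.
def cfg_blend(cond_pred, uncond_pred, cfg_scale):
    # cfg_scale == 1.0 keeps the conditional prediction unchanged; larger values
    # push the output further in the direction of the conditioning signal.
    return uncond_pred + cfg_scale * (cond_pred - uncond_pred)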
486
+
487
+
488
+ def decode_animation(latent_animation):
489
+ """Decode a latent animation to pixel space"""
490
+ global vae
491
+ if latent_animation is None:
492
+ return None
493
+
494
+ # Convert to torch tensor if needed
495
+ if not isinstance(latent_animation, torch.Tensor):
496
+ latent_animation = torch.from_numpy(latent_animation).to(device, dtype=dtype)
497
+
498
+ # Ensure shape is B x C x T x H x W
499
+ if len(latent_animation.shape) == 4: # [T, C, H, W]
500
+ latent_animation = latent_animation[None, ...] # Add batch dimension
501
+
502
+ # Decode using VAE
503
+ decoded = decode_images(
504
+ latent_animation, vae
505
+ ) # Returns B x C x T x H x W numpy array
506
+
507
+ # Remove batch dimension and transpose to T x H x W x C
508
+ decoded = np.transpose(decoded[0], (1, 2, 3, 0)) # [T, H, W, C]
509
+
510
+ # Resize frames to 400x400
511
+ decoded = np.stack(
512
+ [
513
+ cv2.resize(frame, (400, 400), interpolation=cv2.INTER_NEAREST)
514
+ for frame in decoded
515
+ ]
516
+ )
517
+
518
+ # Save to temporary file
519
+ temp_file = "temp_video_2.mp4"
520
+ fps = 32
521
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
522
+ out = cv2.VideoWriter(temp_file, fourcc, fps, (400, 400))
523
+
524
+ # Write frames
525
+ for frame in decoded:
526
+ out.write(frame)
527
+ out.release()
528
+
529
+ return temp_file
530
+
531
+
532
+ def convert_latent_to_display(latent_image):
533
+ """Convert multi-channel latent image to grayscale for display"""
534
+ if latent_image is None:
535
+ return None
536
+
537
+ # Check shape
538
+ if len(latent_image.shape) == 4: # [B, C, H, W]
539
+ # Remove batch dimension and average across channels
540
+ display_image = np.squeeze(latent_image, axis=0) # [C, H, W]
541
+ display_image = np.mean(display_image, axis=0) # [H, W]
542
+ elif len(latent_image.shape) == 3: # [C, H, W]
543
+ # Average across channels
544
+ display_image = np.mean(latent_image, axis=0) # [H, W]
545
+ else:
546
+ display_image = latent_image
547
+
548
+ # Normalize to 0-1 range
549
+ display_image = (display_image - display_image.min()) / (
550
+ display_image.max() - display_image.min() + 1e-8
551
+ )
552
+
553
+ # Convert to grayscale image
554
+ display_image = (display_image * 255).astype(np.uint8)
555
+
556
+ # Resize to a larger size (e.g., 400x400) using nearest-neighbor interpolation
557
+ display_image = cv2.resize(
558
+ display_image, (400, 400), interpolation=cv2.INTER_NEAREST
559
+ )
560
+
561
+ return display_image
562
+
563
+
564
+ def latent_animation_to_grayscale(latent_animation):
565
+ """Convert multi-channel latent animation to grayscale for display"""
566
+ if latent_animation is None:
567
+ return None
568
+
569
+ # print("Input shape:", latent_animation.shape)
570
+
571
+ # Convert to numpy if it's a torch tensor
572
+ if torch.is_tensor(latent_animation):
573
+ latent_animation = latent_animation.detach().cpu().numpy()
574
+
575
+ # Handle shape B x C x T x H x W -> T x H x W
576
+ if len(latent_animation.shape) == 5: # [B, C, T, H, W]
577
+ latent_animation = np.squeeze(latent_animation, axis=0) # [C, T, H, W]
578
+ latent_animation = np.transpose(latent_animation, (1, 0, 2, 3)) # [T, C, H, W]
579
+
580
+ # print("After transpose:", latent_animation.shape)
581
+
582
+ # Average across channels
583
+ latent_animation = np.mean(latent_animation, axis=1) # [T, H, W]
584
+
585
+ # print("After channel reduction:", latent_animation.shape)
586
+
587
+ # Normalize each frame independently
588
+ min_vals = latent_animation.min(axis=(1, 2), keepdims=True)
589
+ max_vals = latent_animation.max(axis=(1, 2), keepdims=True)
590
+ latent_animation = (latent_animation - min_vals) / (max_vals - min_vals + 1e-8)
591
+
592
+ # Convert to uint8
593
+ latent_animation = (latent_animation * 255).astype(np.uint8)
594
+
595
+ # print("Before resize:", latent_animation.shape)
596
+
597
+ # Resize each frame
598
+ resized_frames = []
599
+ for frame in latent_animation:
600
+ resized = cv2.resize(frame, (400, 400), interpolation=cv2.INTER_NEAREST)
601
+ resized_frames.append(resized)
602
+
603
+ # Stack back into video
604
+ grayscale_video = np.stack(resized_frames)
605
+
606
+ # print("Final shape:", grayscale_video.shape)
607
+
608
+ # Add a dummy channel dimension for grayscale video
609
+ grayscale_video = grayscale_video[..., None].repeat(3, axis=-1) # Convert to RGB
610
+
611
+ # print("Output shape with channels:", grayscale_video.shape)
612
+
613
+ # Save to temporary file
614
+ temp_file = "temp_video.mp4"
615
+ fps = 32
616
+
617
+ # Create VideoWriter object
618
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
619
+ out = cv2.VideoWriter(temp_file, fourcc, fps, (400, 400))
620
+
621
+ # Write frames
622
+ for frame in grayscale_video:
623
+ out.write(frame)
624
+
625
+ out.release()
626
+
627
+ return temp_file
628
+
629
+
630
+ def create_demo():
631
+ # Define the theme and layout
632
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
633
+ gr.Markdown("# EchoFlow Demo")
634
+ gr.Markdown("## Dataset Generation Pipeline")
635
+
636
+ gr.Markdown(
637
+ """
638
+ ### 🎯 Purpose
639
+ This demo showcases EchoFlow's ability to generate synthetic echocardiogram images and videos while preserving patient privacy. The pipeline consists of four main steps:
640
+
641
+ 1. **Latent Image Generation**: Draw a mask to indicate the region where the Left Ventricle should appear. Select the desired cardiac view, and click "Generate Latent Image". This outputs a latent image, which can be decoded into a pixel space image by clicking "Decode to Pixel Space".
642
+ 2. **Privacy Filter**: When clicking "Run Privacy Check", the generated image will be checked against a database of all training anatomies to ensure it is sufficiently different from real patient data.
643
+ 3. **Latent Video Generation**: If the privacy check passes, the latent image can be animated into a video with the desired Ejection Fraction.
644
+ 4. **Video Decoding**: The video can be decoded into a pixel space video by clicking "Decode Video".
645
+
646
+ ### ⚙️ Parameters
647
+ - **Sampling Steps**: Higher values produce better quality but take longer
648
+ - **Ejection Fraction**: Controls the strength of heart contraction in the animation
649
+ - **CFG Scale**: Controls how closely the animation follows the specified conditions
650
+ """
651
+ )
652
+
653
+ # Main container with 4 columns
654
+ with gr.Row():
655
+ # Column 1: Latent Image Generation
656
+ with gr.Column():
657
+ gr.Markdown(
658
+ '<img src="https://i.ibb.co/MysCHY1M/h1.png" style="width: 100%; height: 75px; object-fit: contain;">'
659
+ )
660
+ gr.Markdown("### Latent Image Generation")
661
+
662
+ with gr.Row():
663
+ # Input mask (binary image)
664
+ with gr.Column(scale=1):
665
+ # gr.Markdown("#### Mask Condition")
666
+ gr.Markdown("Draw the LV mask (white = region of interest)")
667
+ # Create a black background for the canvas
668
+ black_background = np.zeros((400, 400), dtype=np.uint8)
669
+
670
+ # Load the default mask image if it exists
671
+ try:
672
+ mask_image = Image.open("assets/seg.png").convert("L")
673
+ mask_image = mask_image.resize(
674
+ (400, 400), Image.Resampling.LANCZOS
675
+ )
676
+ # Make it binary (0 or 255)
677
+ mask_image = ImageOps.autocontrast(mask_image, cutoff=0)
678
+ mask_image = mask_image.point(
679
+ lambda p: 255 if p > 127 else 0
680
+ )
681
+ mask_array = np.array(mask_image)
682
+
683
+ # Create the editor value structure
684
+ editor_value = {
685
+ "background": black_background, # Black background
686
+ "layers": [mask_array], # The mask as an editable layer
687
+ "composite": mask_array, # The composite image (what's displayed)
688
+ }
689
+ except Exception as e:
690
+ print(f"Error loading mask image: {e}")
691
+ # Fall back to empty canvas
692
+ editor_value = black_background
693
+
694
+ mask_input = gr.ImageEditor(
695
+ label="Binary Mask",
696
+ height=400,
697
+ width=400,
698
+ image_mode="L",
699
+ value=editor_value,
700
+ type="numpy",
701
+ brush=gr.Brush(
702
+ colors=["#ffffff"],
703
+ color_mode="fixed",
704
+ default_size=20,
705
+ default_color="#ffffff",
706
+ ),
707
+ eraser=gr.Eraser(default_size=20),
708
+ # show_label=False,
709
+ show_download_button=True,
710
+ sources=[],
711
+ canvas_size=(400, 400),
712
+ fixed_canvas=True,
713
+ layers=False,  # layer controls disabled; the mask is drawn directly on the canvas
714
+ )
715
+
716
+ # # Class selection
717
+ # with gr.Column(scale=1):
718
+ # gr.Markdown("#### View Condition")
719
+ class_selection = gr.Radio(
720
+ choices=["A4C", "PSAX", "PLAX"],
721
+ label="View Class",
722
+ value="A4C",
723
+ )
724
+
725
+ # gr.Markdown("#### Sampling Steps")
726
+ sampling_steps = gr.Slider(
727
+ minimum=1,
728
+ maximum=200,
729
+ value=100,
730
+ step=1,
731
+ label="Number of Sampling Steps",
732
+ info="Higher values = better quality but slower generation",
733
+ )
734
+
735
+ # Generate button
736
+ generate_btn = gr.Button("Generate Latent Image", variant="primary")
737
+
738
+ # Display area for latent image (grayscale visualization)
739
+ latent_image_display = gr.Image(
740
+ label="Latent Image",
741
+ type="numpy",
742
+ height=400,
743
+ width=400,
744
+ # show_label=False,
745
+ )
746
+
747
+ # Decode button (initially disabled)
748
+ decode_btn = gr.Button(
749
+ "Decode to Pixel Space (Optional)",
750
+ interactive=False,
751
+ variant="primary",
752
+ )
753
+
754
+ # Display area for decoded image
755
+ decoded_image_display = gr.Image(
756
+ label="Decoded Image",
757
+ type="numpy",
758
+ height=400,
759
+ width=400,
760
+ # show_label=False,
761
+ )
762
+
763
+ # Column 2: Privacy Filter
764
+ with gr.Column():
765
+ gr.Markdown(
766
+ '<img src="https://i.ibb.co/MysCHY1M/h1.png" style="width: 100%; height: 75px; object-fit: contain;">'
767
+ )
768
+ gr.Markdown("### Privacy Filter")
769
+ gr.Markdown(
770
+ "Checks if the generated image is too similar to training data"
771
+ )
772
+
773
+ # Privacy check button
774
+ privacy_btn = gr.Button(
775
+ "Run Privacy Check", interactive=False, variant="primary"
776
+ )
777
+
778
+ # Display area for privacy result status
779
+ privacy_status = gr.Markdown("No image processed yet")
780
+
781
+ # Display area for privacy-filtered latent image
782
+ filtered_latent_display = gr.Image(
783
+ label="Filtered Latent Image", type="numpy", height=400, width=400
784
+ )
785
+
786
+ # Column 3: Animation
787
+ with gr.Column():
788
+ gr.Markdown(
789
+ '<img src="https://i.ibb.co/MysCHY1M/h1.png" style="width: 100%; height: 75px; object-fit: contain;">'
790
+ )
791
+ gr.Markdown("### Latent Video Generation")
792
+
793
+ # Ejection Fraction slider
794
+ ef_slider = gr.Slider(
795
+ minimum=0,
796
+ maximum=100,
797
+ value=65,
798
+ label="Ejection Fraction (%)",
799
+ info="Higher values = stronger contraction",
800
+ )
801
+
802
+ # Add sampling steps slider for animation
803
+ animation_steps = gr.Slider(
804
+ minimum=1,
805
+ maximum=200,
806
+ value=100,
807
+ step=1,
808
+ label="Number of Sampling Steps",
809
+ info="Higher values = better quality but slower generation",
810
+ )
811
+
812
+ # Add CFG slider
813
+ cfg_slider = gr.Slider(
814
+ minimum=0,
815
+ maximum=10,
816
+ value=1,
817
+ step=1,
818
+ label="Classifier-Free Guidance Scale",
819
+ # info="Higher values = better quality but slower generation",
820
+ )
821
+
822
+ # Animate button
823
+ animate_btn = gr.Button(
824
+ "Generate Video", interactive=False, variant="primary"
825
+ )
826
+
827
+ # Display area for latent animation (grayscale)
828
+ latent_animation_display = gr.Video(
829
+ label="Latent Video", format="mp4", autoplay=True, loop=True
830
+ )
831
+
832
+ # Column 4: Video Decoding
833
+ with gr.Column():
834
+ gr.Markdown(
835
+ '<img src="https://i.ibb.co/MysCHY1M/h1.png" style="width: 100%; height: 75px; object-fit: contain;">'
836
+ )
837
+ gr.Markdown("### Video Decoding")
838
+
839
+ # Decode animation button
840
+ decode_animation_btn = gr.Button(
841
+ "Decode Video", interactive=False, variant="primary"
842
+ )
843
+
844
+ # Display area for decoded animation
845
+ decoded_animation_display = gr.Video(
846
+ label="Decoded Video", format="mp4", autoplay=True, loop=True
847
+ )
848
+
849
+ # Hidden state variables to store the full latent representations
850
+ latent_image_state = gr.State(None)
851
+ filtered_latent_state = gr.State(None)
852
+ latent_animation_state = gr.State(None)
853
+
854
+ # Event handlers
855
+ generate_btn.click(
856
+ fn=generate_latent_image,
857
+ inputs=[mask_input, class_selection, sampling_steps],
858
+ outputs=[latent_image_state],
859
+ queue=True,
860
+ ).then(
861
+ fn=convert_latent_to_display,
862
+ inputs=[latent_image_state],
863
+ outputs=[latent_image_display],
864
+ queue=False,
865
+ ).then(
866
+ fn=lambda x: gr.Button(
867
+ interactive=x is not None
868
+ ), # Properly update button state
869
+ inputs=[latent_image_state],
870
+ outputs=[decode_btn],
871
+ queue=False,
872
+ ).then(
873
+ fn=lambda x: gr.Button(
874
+ interactive=x is not None
875
+ ), # Properly update button state
876
+ inputs=[latent_image_state],
877
+ outputs=[privacy_btn],
878
+ queue=False,
879
+ )
880
+
881
+ decode_btn.click(
882
+ fn=decode_latent_to_pixel,
883
+ inputs=[latent_image_state],
884
+ outputs=[decoded_image_display],
885
+ queue=True,
886
+ ).then(
887
+ fn=lambda x: gr.Button(
888
+ interactive=x is not None
889
+ ), # Properly update button state
890
+ inputs=[decoded_image_display],
891
+ outputs=[privacy_btn],
892
+ queue=False,
893
+ )
894
+
895
+ privacy_btn.click(
896
+ fn=check_privacy,
897
+ inputs=[latent_image_state, class_selection],
898
+ outputs=[filtered_latent_state, privacy_status],
899
+ queue=True,
900
+ ).then(
901
+ fn=convert_latent_to_display,
902
+ inputs=[filtered_latent_state],
903
+ outputs=[filtered_latent_display],
904
+ queue=False,
905
+ ).then(
906
+ fn=lambda x: gr.Button(
907
+ interactive=x is not None
908
+ ), # Properly update button state
909
+ inputs=[filtered_latent_state],
910
+ outputs=[animate_btn],
911
+ queue=False,
912
+ )
913
+
914
+ animate_btn.click(
915
+ fn=generate_animation,
916
+ inputs=[filtered_latent_state, ef_slider, animation_steps, cfg_slider],
917
+ outputs=[latent_animation_state],
918
+ queue=True,
919
+ ).then(
920
+ fn=latent_animation_to_grayscale,
921
+ inputs=[latent_animation_state],
922
+ outputs=[latent_animation_display],
923
+ queue=False,
924
+ ).then(
925
+ fn=lambda x: gr.Button(
926
+ interactive=x is not None
927
+ ), # Properly update button state
928
+ inputs=[latent_animation_state],
929
+ outputs=[decode_animation_btn],
930
+ queue=False,
931
+ )
932
+
933
+ decode_animation_btn.click(
934
+ fn=decode_animation,
935
+ inputs=[latent_animation_state], # Remove vae_state from inputs
936
+ outputs=[decoded_animation_display],
937
+ queue=True,
938
+ )
939
+
940
+ return demo
941
+
942
+
943
+ if __name__ == "__main__":
944
+ demo = create_demo()
945
+ demo.launch()
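The Gradio UI above chains the four stages described in the "Purpose" section. A rough headless sketch of the same pipeline, calling the functions in this file directly (the mask and EF values are placeholders):

# Hypothetical headless run of the demo pipeline (sketch only).
import numpy as np

mask = np.zeros((400, 400), dtype=np.uint8)
mask[150:260, 160:250] = 255  # placeholder LV region

latent = generate_latent_image(mask, "A4C", sampling_steps=100)
filtered, status = check_privacy(latent, "A4C")  # filtered is None if the check fails
print(status)

if filtered is not None:
    video_latent = generate_animation(filtered, ejection_fraction=65, sampling_steps=100, cfg_scale=1.0)
    print(decode_animation(video_latent))  # path to the decoded temp_video_2.mp4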
echoflow/common/__init__.py ADDED
@@ -0,0 +1,90 @@
+ import importlib
+
+ import omegaconf
+
+ from .models import ContrastiveModel, DiffuserSTDiT, ResNet18, SegDiTTransformer2DModel
+
+
+ def parse_klass_arg(value, full_config):
+     """
+     Parse an argument value that might represent a class, enum, or basic data type.
+     This function tries to dynamically import and resolve nested attributes.
+     It also resolves OmegaConf interpolations if found.
+     """
+     if isinstance(value, str) and "." in value:
+         # Check if the value is an interpolation and try to resolve it
+         if value.startswith("${") and value.endswith("}"):
+             try:
+                 # Attempt to resolve the interpolation directly using OmegaConf
+                 value = omegaconf.OmegaConf.resolve(full_config)[value[2:-1]]
+             except Exception as e:
+                 print(f"Error resolving OmegaConf interpolation {value}: {e}")
+                 return None
+
+         parts = value.split(".")
+         for i in range(len(parts) - 1, 0, -1):
+             module_name = ".".join(parts[:i])
+             attr_name = parts[i]
+             try:
+                 module = importlib.import_module(module_name)
+                 result = module
+                 for j in range(i, len(parts)):
+                     result = getattr(result, parts[j])
+                 return result
+             except ImportError as e:
+                 continue
+             except AttributeError as e:
+                 print(
+                     f"Warning: Could not resolve attribute {parts[j]} from {module_name}, error: {e}"
+                 )
+                 continue
+         # print(f"Warning: Failed to import or resolve {value}. Falling back to string.")
+         return (
+             value  # Return the original string if no valid import and resolution occurs
+         )
+     return value
+
+
+ def instantiate_class_from_config(config, *args, **kwargs):
+     """
+     Dynamically instantiate a class based on a configuration object.
+     Supports passing additional positional and keyword arguments.
+     """
+     module_name, class_name = config.target.rsplit(".", 1)
+     klass = globals().get(class_name)
+     # module = importlib.import_module(module_name)
+     # klass = getattr(module, class_name)
+
+     # Assuming config might be a part of a larger OmegaConf structure:
+     # if not isinstance(config, omegaconf.DictConfig):
+     #     config = omegaconf.OmegaConf.create(config)
+     config = omegaconf.OmegaConf.to_container(config, resolve=True)
+     # Resolve args and kwargs from the configuration
+     # conf_args = [parse_klass_arg(arg, config) for arg in config.get('args', [])]
+     # conf_kwargs = {key: parse_klass_arg(value, config) for key, value in config.get('kwargs', {}).items()}
+     conf_kwargs = {
+         key: parse_klass_arg(value, config) for key, value in config["args"].items()
+     }
+     # Combine conf_args with explicitly passed *args
+     all_args = list(args)  # + conf_args
+
+     # Combine conf_kwargs with explicitly passed **kwargs
+     all_kwargs = {**conf_kwargs, **kwargs}
+
+     # Instantiate the class with the processed arguments
+     instance = klass(*all_args, **all_kwargs)
+     return instance
+
+
+ def unscale_latents(latents, vae_scaling=None):
+     if vae_scaling is not None:
+         if latents.ndim == 4:
+             v = (1, -1, 1, 1)
+         elif latents.ndim == 5:
+             v = (1, -1, 1, 1, 1)
+         else:
+             raise ValueError("Latents should be 4D or 5D")
+         latents *= vae_scaling["std"].view(*v)
+         latents += vae_scaling["mean"].view(*v)
+
+     return latents
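instantiate_class_from_config looks the target class up in this module's globals (so it must be one of the classes imported from .models) and forwards everything under "args" as keyword arguments. A hedged sketch of the kind of OmegaConf config it expects; the field values here are illustrative, not taken from the released checkpoints:

# Illustrative config: the keys follow instantiate_class_from_config's expectations
# (a "target" dotted path and an "args" mapping), but the values are hypothetical.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "target": "echoflow.common.models.ResNet18",
        "args": {"num_classes": 10},  # hypothetical constructor arguments
    }
)
model = instantiate_class_from_config(cfg)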
echoflow/common/models.py ADDED
@@ -0,0 +1,1730 @@
1
+ # This file contains modified code from the HuggingFace Diffusers library.
2
+
3
+ import math
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, Optional, Tuple, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch._dynamo
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import xformers
13
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
14
+ from diffusers.loaders import UNet2DConditionLoadersMixin
15
+ from diffusers.models.attention import BasicTransformerBlock
16
+ from diffusers.models.attention_processor import (
17
+ CROSS_ATTENTION_PROCESSORS,
18
+ AttentionProcessor,
19
+ AttnProcessor,
20
+ )
21
+ from diffusers.models.embeddings import PatchEmbed, TimestepEmbedding, Timesteps
22
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
23
+ from diffusers.models.modeling_utils import ModelMixin
24
+ from diffusers.models.unets.unet_3d_blocks import UNetMidBlockSpatioTemporal
25
+ from diffusers.models.unets.unet_3d_blocks import get_down_block as get_down_block_3d
26
+ from diffusers.models.unets.unet_3d_blocks import get_up_block as get_up_block_3d
27
+ from diffusers.utils import BaseOutput, is_torch_version
28
+ from einops import rearrange
29
+ from timm.layers.drop import DropPath
30
+ from timm.layers.mlp import Mlp
31
+ from torchvision.models import resnet18
32
+
33
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
34
+
35
+
36
+ class SegDiTTransformer2DModel(ModelMixin, ConfigMixin):
37
+ r"""
38
+ A 2D Transformer model as introduced in DiT (https://arxiv.org/abs/2212.09748).
39
+
40
+ Parameters:
41
+ num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
42
+ attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
43
+ in_channels (int, defaults to 4): The number of channels in the input.
44
+ out_channels (int, optional):
45
+ The number of channels in the output. Specify this parameter if the output channel number differs from the
46
+ input.
47
+ num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
48
+ dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
49
+ norm_num_groups (int, optional, defaults to 32):
50
+ Number of groups for group normalization within Transformer blocks.
51
+ attention_bias (bool, optional, defaults to True):
52
+ Configure if the Transformer blocks' attention should contain a bias parameter.
53
+ sample_size (int, defaults to 32):
54
+ The width of the latent images. This parameter is fixed during training.
55
+ patch_size (int, defaults to 2):
56
+ Size of the patches the model processes, relevant for architectures working on non-sequential data.
57
+ activation_fn (str, optional, defaults to "gelu-approximate"):
58
+ Activation function to use in feed-forward networks within Transformer blocks.
59
+ num_embeds_ada_norm (int, optional, defaults to 1000):
60
+ Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
61
+ inference.
62
+ upcast_attention (bool, optional, defaults to False):
63
+ If true, upcasts the attention mechanism dimensions for potentially improved performance.
64
+ norm_type (str, optional, defaults to "ada_norm_zero"):
65
+ Specifies the type of normalization used, can be 'ada_norm_zero'.
66
+ norm_elementwise_affine (bool, optional, defaults to False):
67
+ If true, enables element-wise affine parameters in the normalization layers.
68
+ norm_eps (float, optional, defaults to 1e-5):
69
+ A small constant added to the denominator in normalization layers to prevent division by zero.
70
+ """
71
+
72
+ _supports_gradient_checkpointing = True
73
+
74
+ @register_to_config
75
+ def __init__(
76
+ self,
77
+ num_attention_heads: int = 16,
78
+ attention_head_dim: int = 72,
79
+ in_channels: int = 4,
80
+ out_channels: Optional[int] = None,
81
+ num_layers: int = 28,
82
+ dropout: float = 0.0,
83
+ norm_num_groups: int = 32,
84
+ attention_bias: bool = True,
85
+ sample_size: int = 32,
86
+ patch_size: int = 2,
87
+ activation_fn: str = "gelu-approximate",
88
+ num_embeds_ada_norm: Optional[int] = 1000,
89
+ upcast_attention: bool = False,
90
+ norm_type: str = "ada_norm_zero",
91
+ norm_elementwise_affine: bool = False,
92
+ norm_eps: float = 1e-5,
93
+ ):
94
+ super().__init__()
95
+
96
+ # Validate inputs.
97
+ if norm_type != "ada_norm_zero":
98
+ raise NotImplementedError(
99
+ f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
100
+ )
101
+ elif norm_type == "ada_norm_zero" and num_embeds_ada_norm is None:
102
+ raise ValueError(
103
+ f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
104
+ )
105
+
106
+ # Set some common variables used across the board.
107
+ self.attention_head_dim = attention_head_dim
108
+ self.inner_dim = (
109
+ self.config.num_attention_heads * self.config.attention_head_dim
110
+ )
111
+ self.out_channels = in_channels if out_channels is None else out_channels
112
+ self.gradient_checkpointing = False
113
+
114
+ # 2. Initialize the position embedding and transformer blocks.
115
+ self.height = self.config.sample_size
116
+ self.width = self.config.sample_size
117
+
118
+ self.patch_size = self.config.patch_size
119
+ self.pos_embed = PatchEmbed(
120
+ height=self.config.sample_size,
121
+ width=self.config.sample_size,
122
+ patch_size=self.config.patch_size,
123
+ in_channels=self.config.in_channels,
124
+ embed_dim=self.inner_dim,
125
+ )
126
+
127
+ self.transformer_blocks = nn.ModuleList(
128
+ [
129
+ BasicTransformerBlock(
130
+ self.inner_dim,
131
+ self.config.num_attention_heads,
132
+ self.config.attention_head_dim,
133
+ dropout=self.config.dropout,
134
+ activation_fn=self.config.activation_fn,
135
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
136
+ attention_bias=self.config.attention_bias,
137
+ upcast_attention=self.config.upcast_attention,
138
+ norm_type=norm_type,
139
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
140
+ norm_eps=self.config.norm_eps,
141
+ )
142
+ for _ in range(self.config.num_layers)
143
+ ]
144
+ )
145
+
146
+ # 3. Output blocks.
147
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
148
+ self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
149
+ self.proj_out_2 = nn.Linear(
150
+ self.inner_dim,
151
+ self.config.patch_size * self.config.patch_size * self.out_channels,
152
+ )
153
+
154
+ def _set_gradient_checkpointing(self, module, value=False):
155
+ if hasattr(module, "gradient_checkpointing"):
156
+ module.gradient_checkpointing = value
157
+
158
+ def forward(
159
+ self,
160
+ hidden_states: torch.Tensor,
161
+ timestep: Optional[torch.LongTensor] = None,
162
+ class_labels: Optional[torch.LongTensor] = None,
163
+ cross_attention_kwargs: Dict[str, Any] = None,
164
+ segmentation: Optional[torch.LongTensor] = None,
165
+ return_dict: bool = True,
166
+ ):
167
+ """
168
+ The [`DiTTransformer2DModel`] forward method.
169
+
170
+ Args:
171
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
172
+ Input `hidden_states`.
173
+ timestep ( `torch.LongTensor`, *optional*):
174
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
175
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
176
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
177
+ `AdaLayerZeroNorm`.
178
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
179
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
180
+ `self.processor` in
181
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
182
+ return_dict (`bool`, *optional*, defaults to `True`):
183
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
184
+ tuple.
185
+
186
+ Returns:
187
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
188
+ `tuple` where the first element is the sample tensor.
189
+ """
190
+
191
+ # 0. If segmentation is provided, apply it to the input.
192
+ if segmentation is not None:
193
+ hidden_states = torch.cat([hidden_states, segmentation], dim=1) # B C+1 H W
194
+
195
+ # 1. Input
196
+ height, width = (
197
+ hidden_states.shape[-2] // self.patch_size,
198
+ hidden_states.shape[-1] // self.patch_size,
199
+ )
200
+ hidden_states = self.pos_embed(hidden_states)
201
+
202
+ # 2. Blocks
203
+ for block in self.transformer_blocks:
204
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
205
+
206
+ def create_custom_forward(module, return_dict=None):
207
+ def custom_forward(*inputs):
208
+ if return_dict is not None:
209
+ return module(*inputs, return_dict=return_dict)
210
+ else:
211
+ return module(*inputs)
212
+
213
+ return custom_forward
214
+
215
+ ckpt_kwargs: Dict[str, Any] = (
216
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
217
+ )
218
+ hidden_states = torch.utils.checkpoint.checkpoint(
219
+ create_custom_forward(block),
220
+ hidden_states,
221
+ None,
222
+ None,
223
+ None,
224
+ timestep,
225
+ cross_attention_kwargs,
226
+ class_labels,
227
+ **ckpt_kwargs,
228
+ )
229
+ else:
230
+ hidden_states = block(
231
+ hidden_states,
232
+ attention_mask=None,
233
+ encoder_hidden_states=None,
234
+ encoder_attention_mask=None,
235
+ timestep=timestep,
236
+ cross_attention_kwargs=cross_attention_kwargs,
237
+ class_labels=class_labels,
238
+ )
239
+
240
+ # 3. Output
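+ # reuse the timestep/class embedding of the first block's AdaLayerNormZero to produce
+ # the final shift/scale modulation before the output projection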
241
+ conditioning = self.transformer_blocks[0].norm1.emb(
242
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
243
+ )
244
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
245
+ hidden_states = (
246
+ self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
247
+ )
248
+ hidden_states = self.proj_out_2(hidden_states)
249
+
250
+ # unpatchify
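+ # (assumes a square token grid, i.e. the latent height and width are equal)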
251
+ height = width = int(hidden_states.shape[1] ** 0.5)
252
+ hidden_states = hidden_states.reshape(
253
+ shape=(
254
+ -1,
255
+ height,
256
+ width,
257
+ self.patch_size,
258
+ self.patch_size,
259
+ self.out_channels,
260
+ )
261
+ )
262
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
263
+ output = hidden_states.reshape(
264
+ shape=(
265
+ -1,
266
+ self.out_channels,
267
+ height * self.patch_size,
268
+ width * self.patch_size,
269
+ )
270
+ )
271
+
272
+ if not return_dict:
273
+ return (output,)
274
+
275
+ return Transformer2DModelOutput(sample=output)
276
+
277
+
278
+ def get_2d_sincos_pos_embed(
279
+ embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None
280
+ ):
281
+ """
282
+ grid_size: int or (int, int) giving the grid height and width
283
+ return:
284
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
285
+ """
286
+ if not isinstance(grid_size, tuple):
287
+ grid_size = (grid_size, grid_size)
288
+
289
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / scale
290
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / scale
291
+ if base_size is not None:
292
+ grid_h *= base_size / grid_size[0]
293
+ grid_w *= base_size / grid_size[1]
294
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
295
+ grid = np.stack(grid, axis=0)
296
+
297
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
298
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
299
+ if cls_token and extra_tokens > 0:
300
+ pos_embed = np.concatenate(
301
+ [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
302
+ )
303
+ return pos_embed
304
+
305
+
306
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
307
+ assert embed_dim % 2 == 0
308
+
309
+ # use half of dimensions to encode grid_h
310
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
311
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
312
+
313
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
314
+ return emb
315
+
316
+
317
+ def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0):
318
+ pos = np.arange(0, length)[..., None] / scale
319
+ return get_1d_sincos_pos_embed_from_grid(embed_dim, pos)
320
+
321
+
322
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
323
+ """
324
+ embed_dim: output dimension for each position
325
+ pos: a list of positions to be encoded: size (M,)
326
+ out: (M, D)
327
+ """
328
+ assert embed_dim % 2 == 0
329
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
330
+ omega /= embed_dim / 2.0
331
+ omega = 1.0 / 10000**omega # (D/2,)
332
+
333
+ pos = pos.reshape(-1) # (M,)
334
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
335
+
336
+ emb_sin = np.sin(out) # (M, D/2)
337
+ emb_cos = np.cos(out) # (M, D/2)
338
+
339
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
340
+ return emb
341
+
342
+
343
+ def t2i_modulate(x, shift, scale):
344
+ return x * (1 + scale) + shift
345
+
346
+
347
+ class PatchEmbed3D(nn.Module):
348
+ """Video to Patch Embedding.
349
+
350
+ Args:
351
+ patch_size (tuple[int]): Patch token size (T, H, W). Default: (2, 4, 4).
352
+ in_chans (int): Number of input video channels. Default: 3.
353
+ embed_dim (int): Number of linear projection output channels. Default: 96.
354
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
355
+ """
356
+
357
+ def __init__(
358
+ self,
359
+ patch_size=(2, 4, 4),
360
+ in_chans=3,
361
+ embed_dim=96,
362
+ norm_layer=None,
363
+ flatten=True,
364
+ ):
365
+ super().__init__()
366
+ self.patch_size = patch_size
367
+ self.flatten = flatten
368
+
369
+ self.in_chans = in_chans
370
+ self.embed_dim = embed_dim
371
+
372
+ self.proj = nn.Conv3d(
373
+ in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
374
+ )
375
+ if norm_layer is not None:
376
+ self.norm = norm_layer(embed_dim)
377
+ else:
378
+ self.norm = None
379
+
380
+ def forward(self, x):
381
+ """Forward function."""
382
+ # padding
383
+ _, _, D, H, W = x.size()
384
+ if W % self.patch_size[2] != 0:
385
+ x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
386
+ if H % self.patch_size[1] != 0:
387
+ x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
388
+ if D % self.patch_size[0] != 0:
389
+ x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))
390
+
391
+ x = self.proj(x) # (B C T H W)
392
+ if self.norm is not None:
393
+ D, Wh, Ww = x.size(2), x.size(3), x.size(4)
394
+ x = x.flatten(2).transpose(1, 2)
395
+ x = self.norm(x)
396
+ x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
397
+ if self.flatten:
398
+ x = x.flatten(2).transpose(1, 2) # BCTHW -> BNC
399
+ return x
400
+
401
+
402
+ class Attention(nn.Module):
403
+ def __init__(
404
+ self,
405
+ dim: int,
406
+ num_heads: int = 8,
407
+ qkv_bias: bool = False,
408
+ qk_norm: bool = False,
409
+ attn_drop: float = 0.0,
410
+ proj_drop: float = 0.0,
411
+ norm_layer: nn.Module = nn.LayerNorm,
412
+ enable_flashattn: bool = False,
413
+ ) -> None:
414
+ super().__init__()
415
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
416
+ self.dim = dim
417
+ self.num_heads = num_heads
418
+ self.head_dim = dim // num_heads
419
+ self.scale = self.head_dim**-0.5
420
+
421
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
422
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
423
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
424
+ self.attn_drop = nn.Dropout(attn_drop)
425
+ self.proj = nn.Linear(dim, dim)
426
+ self.proj_drop = nn.Dropout(proj_drop)
427
+
428
+ if enable_flashattn:
429
+ print(
430
+ "[WARNING] FlashAttention cannot be used. Set enable_flashattn to False."
431
+ )
432
+
433
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
434
+ B, N, C = x.shape
435
+ qkv = self.qkv(x)
436
+ qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
437
+ qkv_permute_shape = (2, 0, 3, 1, 4)
438
+ qkv = qkv.view(qkv_shape).permute(qkv_permute_shape)
439
+ q, k, v = qkv.unbind(0)
440
+ q, k = self.q_norm(q), self.k_norm(k)
441
+
442
+ dtype = q.dtype
443
+ q = q * self.scale
444
+ attn = q @ k.transpose(-2, -1)
445
+ attn = attn.to(torch.float32)  # compute the softmax in float32 for numerical stability
446
+ attn = attn.softmax(dim=-1)
447
+ attn = attn.to(dtype) # cast back attn to original dtype
448
+ attn = self.attn_drop(attn)
449
+ x = attn @ v
450
+
451
+ x_output_shape = (B, N, C)
452
+ x = x.reshape(x_output_shape)
453
+ x = self.proj(x)
454
+ x = self.proj_drop(x)
455
+ return x
456
+
457
+
458
+ class MultiHeadCrossAttention(nn.Module):
459
+ def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0):
460
+ super(MultiHeadCrossAttention, self).__init__()
461
+ assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
462
+
463
+ self.d_model = d_model
464
+ self.num_heads = num_heads
465
+ self.head_dim = d_model // num_heads
466
+
467
+ self.q_linear = nn.Linear(d_model, d_model)
468
+ self.kv_linear = nn.Linear(d_model, d_model * 2)
469
+ self.attn_drop = nn.Dropout(attn_drop)
470
+ self.proj = nn.Linear(d_model, d_model)
471
+ self.proj_drop = nn.Dropout(proj_drop)
472
+
473
+ @torch._dynamo.disable
474
+ def forward(self, x, cond, mask=None):
475
+ # query: image tokens; key/value: condition tokens; mask: per-sample valid condition-token lengths
476
+ B, N, C = x.shape
477
+
478
+ q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
479
+ kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
480
+ k, v = kv.unbind(2)
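+ # tokens from all samples are packed into a single sequence (batch dim 1); the block-diagonal
+ # attention bias below keeps each sample attending only to its own condition tokens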
481
+
482
+ attn_bias = None
483
+ if mask is not None:
484
+ attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
485
+ x = xformers.ops.memory_efficient_attention(
486
+ q, k, v, p=self.attn_drop.p, attn_bias=attn_bias
487
+ )
488
+
489
+ x = x.view(B, -1, C)
490
+ x = self.proj(x)
491
+ x = self.proj_drop(x)
492
+ return x
493
+
494
+
495
+ class TimestepEmbedder(nn.Module):
496
+ """
497
+ Embeds scalar timesteps into vector representations.
498
+ """
499
+
500
+ def __init__(self, hidden_size, frequency_embedding_size=256):
501
+ super().__init__()
502
+ self.mlp = nn.Sequential(
503
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
504
+ nn.SiLU(),
505
+ nn.Linear(hidden_size, hidden_size, bias=True),
506
+ )
507
+ self.frequency_embedding_size = frequency_embedding_size
508
+
509
+ @staticmethod
510
+ def timestep_embedding(t, dim, max_period=10000):
511
+ """
512
+ Create sinusoidal timestep embeddings.
513
+ :param t: a 1-D Tensor of N indices, one per batch element.
514
+ These may be fractional.
515
+ :param dim: the dimension of the output.
516
+ :param max_period: controls the minimum frequency of the embeddings.
517
+ :return: an (N, D) Tensor of positional embeddings.
518
+ """
519
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
520
+ half = dim // 2
521
+ freqs = torch.exp(
522
+ -math.log(max_period)
523
+ * torch.arange(start=0, end=half, dtype=torch.float32)
524
+ / half
525
+ )
526
+ freqs = freqs.to(device=t.device)
527
+ args = t[:, None].float() * freqs[None]
528
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
529
+ if dim % 2:
530
+ embedding = torch.cat(
531
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
532
+ )
533
+ return embedding
534
+
535
+ def forward(self, t, dtype):
536
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
537
+ if t_freq.dtype != dtype:
538
+ t_freq = t_freq.to(dtype)
539
+ t_emb = self.mlp(t_freq)
540
+ return t_emb
541
+
542
+
543
+ class CaptionEmbedder(nn.Module):
544
+ """
545
+ Projects caption embeddings into the model's hidden space. Also handles caption dropout for classifier-free guidance.
546
+ """
547
+
548
+ def __init__(
549
+ self,
550
+ in_channels,
551
+ hidden_size,
552
+ uncond_prob,
553
+ act_layer=nn.GELU(approximate="tanh"),
554
+ token_num=120,
555
+ ):
556
+ super().__init__()
557
+ self.y_proj = Mlp(
558
+ in_features=in_channels,
559
+ hidden_features=hidden_size,
560
+ out_features=hidden_size,
561
+ act_layer=act_layer,
562
+ drop=0,
563
+ )
564
+ self.register_buffer(
565
+ "y_embedding",
566
+ nn.Parameter(torch.randn(token_num, in_channels) / in_channels**0.5),
567
+ )
568
+ self.uncond_prob = uncond_prob
569
+
570
+ def token_drop(self, caption, force_drop_ids=None):
571
+ """
572
+ Drops labels to enable classifier-free guidance.
573
+ """
574
+ if force_drop_ids is None:
575
+ drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
576
+ else:
577
+ drop_ids = force_drop_ids == 1
578
+ caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
579
+ return caption
580
+
581
+ @torch._dynamo.disable
582
+ def forward(self, caption, train, force_drop_ids=None):
583
+ if train:
584
+ assert caption.shape[2:] == self.y_embedding.shape
585
+ use_dropout = self.uncond_prob > 0
586
+ if (train and use_dropout) or (force_drop_ids is not None):
587
+ caption = self.token_drop(caption, force_drop_ids)
588
+ caption = self.y_proj(caption)
589
+ return caption
590
+
591
+
592
+ class T2IFinalLayer(nn.Module):
593
+ """
594
+ The final layer of PixArt.
595
+ """
596
+
597
+ def __init__(self, hidden_size, num_patch, out_channels):
598
+ super().__init__()
599
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
600
+ self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
601
+ self.scale_shift_table = nn.Parameter(
602
+ torch.randn(2, hidden_size) / hidden_size**0.5
603
+ )
604
+ self.out_channels = out_channels
605
+
606
+ def forward(self, x, t):
607
+ shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
608
+ x = t2i_modulate(self.norm_final(x), shift, scale)
609
+ x = self.linear(x)
610
+ return x
611
+
612
+
613
+ class STDiTBlock(nn.Module):
614
+ """
615
+ A single block of STDiT (Spatio-Temporal Diffusion Transformer).
616
+
617
+ Args:
618
+ hidden_size (int): Hidden size of the model.
619
+ num_heads (int): Number of attention heads.
620
+ d_s (int): Number of spatial tokens per frame.
621
+ d_t (int): Number of temporal tokens.
622
+ mlp_ratio (float): Ratio of the MLP hidden dimension to `hidden_size`.
623
+ drop_path (float): Drop path rate.
624
+ enable_flashattn (bool): Enable FlashAttention.
625
+ """
626
+
627
+ def __init__(
628
+ self,
629
+ hidden_size,
630
+ num_heads,
631
+ d_s=None,
632
+ d_t=None,
633
+ mlp_ratio=4.0,
634
+ drop_path=0.0,
635
+ enable_flashattn=False,
636
+ uncond=False,
637
+ ):
638
+ super().__init__()
639
+ self.hidden_size = hidden_size
640
+ self.enable_flashattn = enable_flashattn
641
+
642
+ self.attn_cls = Attention
643
+ self.mha_cls = MultiHeadCrossAttention
644
+
645
+ self.norm1 = nn.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False)
646
+ self.attn = self.attn_cls(
647
+ hidden_size,
648
+ num_heads=num_heads,
649
+ qkv_bias=True,
650
+ enable_flashattn=False,
651
+ )
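+ # note: despite its name, `uncond` enables the cross-attention branch used for caption conditioning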
652
+ if uncond:
653
+ self.cross_attn = self.mha_cls(hidden_size, num_heads)
654
+ self.norm2 = nn.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False)
655
+ self.mlp = Mlp(
656
+ in_features=hidden_size,
657
+ hidden_features=int(hidden_size * mlp_ratio),
658
+ act_layer=approx_gelu,
659
+ drop=0,
660
+ )
661
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
662
+ self.scale_shift_table = nn.Parameter(
663
+ torch.randn(6, hidden_size) / hidden_size**0.5
664
+ )
665
+
666
+ # temporal attention
667
+ self.d_s = d_s
668
+ self.d_t = d_t
669
+
670
+ self.attn_temp = self.attn_cls(
671
+ hidden_size,
672
+ num_heads=num_heads,
673
+ qkv_bias=True,
674
+ enable_flashattn=self.enable_flashattn,
675
+ )
676
+
677
+ def forward(self, x, t, y=None, mask=None, tpe=None):
678
+ """
679
+ Args:
680
+ x (torch.Tensor): noisy input tokens of shape [B, N, C]
681
+ y (torch.Tensor): packed cross-attention condition tokens of shape [1, N_token_total, C]
682
+ t (torch.Tensor): timestep modulation tensor of shape [B, 6 * C]
683
+ mask (list[int]): number of valid condition tokens per sample (used to build the attention bias)
684
+ tpe (torch.Tensor): temporal positional embedding of shape [1, T, C]
685
+ """
686
+ B, N, C = x.shape
687
+
688
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
689
+ self.scale_shift_table[None] + t.reshape(B, 6, -1)
690
+ ).chunk(6, dim=1)
691
+ x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
692
+
693
+ # spatial branch
694
+ x_s = rearrange(x_m, "B (T S) C -> (B T) S C", T=self.d_t, S=self.d_s)
695
+ x_s = self.attn(x_s)
696
+ x_s = rearrange(x_s, "(B T) S C -> B (T S) C", T=self.d_t, S=self.d_s)
697
+ x = x + self.drop_path(gate_msa * x_s)
698
+
699
+ # temporal branch
700
+ x_t = rearrange(x, "B (T S) C -> (B S) T C", T=self.d_t, S=self.d_s)
701
+ if tpe is not None:
702
+ x_t = x_t + tpe
703
+ x_t = self.attn_temp(x_t)
704
+ x_t = rearrange(x_t, "(B S) T C -> B (T S) C", T=self.d_t, S=self.d_s)
705
+ x = x + self.drop_path(gate_msa * x_t)
706
+
707
+ # cross attn
708
+ if y is not None:
709
+ x = x + self.cross_attn(x, y, mask)
710
+
711
+ # mlp
712
+ x = x + self.drop_path(
713
+ gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))
714
+ )
715
+
716
+ return x
717
+
718
+
719
+ # | Model | Layers N | Hidden size d | Heads | Gflops (I=32, p=4) |
720
+ # |-------|----------|---------------|-------|---------------------|
721
+ # | DiT-S | 12 | 384 | 6 | 1.4 |
722
+ # | DiT-B | 12 | 768 | 12 | 5.6 |
723
+ # | DiT-L | 24 | 1024 | 16 | 19.7 |
724
+ # | DiT-XL| 28 | 1152 | 16 | 29.1 |
725
+ class STDiT(nn.Module):
726
+ def __init__(
727
+ self,
728
+ input_size=(1, 32, 32), # T, H, W
729
+ in_channels=4,
730
+ out_channels=4,
731
+ patch_size=(1, 2, 2), # T, H, W
732
+ hidden_size=1152, #
733
+ depth=28, # Number of layers
734
+ num_heads=16,
735
+ mlp_ratio=4.0,
736
+ class_dropout_prob=0.1,
737
+ drop_path=0.0,
738
+ no_temporal_pos_emb=False,
739
+ caption_channels=4096, # 0 to disable
740
+ model_max_length=120,
741
+ space_scale=1.0,
742
+ time_scale=1.0,
743
+ enable_flashattn=False,
744
+ ):
745
+ super().__init__()
746
+ self.in_channels = in_channels
747
+ self.out_channels = out_channels
748
+ self.hidden_size = hidden_size
749
+ self.patch_size = patch_size
750
+ self.input_size = input_size
751
+ num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
752
+ self.num_patches = num_patches
753
+ self.num_temporal = input_size[0] // patch_size[0]
754
+ self.num_spatial = num_patches // self.num_temporal
755
+ self.num_heads = num_heads
756
+ self.no_temporal_pos_emb = no_temporal_pos_emb
757
+ self.depth = depth
758
+ self.mlp_ratio = mlp_ratio
759
+ self.enable_flashattn = enable_flashattn
760
+ self.space_scale = space_scale
761
+ self.time_scale = time_scale
762
+
763
+ if caption_channels == 0:
764
+ print("Warning: caption_channels is 0, disabling text conditioning.")
765
+
766
+ self.register_buffer("pos_embed", self.get_spatial_pos_embed())
767
+ self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())
768
+
769
+ self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
770
+ self.t_embedder = TimestepEmbedder(hidden_size)
771
+ self.t_block = nn.Sequential(
772
+ nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)
773
+ )
774
+ self.y_embedder = (
775
+ CaptionEmbedder(
776
+ in_channels=caption_channels,
777
+ hidden_size=hidden_size,
778
+ uncond_prob=class_dropout_prob,
779
+ act_layer=approx_gelu,
780
+ token_num=model_max_length,
781
+ )
782
+ if caption_channels > 0
783
+ else None
784
+ )
785
+
786
+ drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]
787
+ self.blocks = nn.ModuleList(
788
+ [
789
+ STDiTBlock(
790
+ self.hidden_size,
791
+ self.num_heads,
792
+ mlp_ratio=self.mlp_ratio,
793
+ drop_path=drop_path[i],
794
+ enable_flashattn=self.enable_flashattn,
795
+ d_t=self.num_temporal,
796
+ d_s=self.num_spatial,
797
+ uncond=(caption_channels > 0),
798
+ )
799
+ for i in range(self.depth)
800
+ ]
801
+ )
802
+ self.final_layer = T2IFinalLayer(
803
+ hidden_size, np.prod(self.patch_size), self.out_channels
804
+ )
805
+
806
+ # init model
807
+ self.initialize_weights()
808
+ self.initialize_temporal()
809
+
810
+ # sequence parallel related configs
811
+ self.sp_rank = None
812
+
813
+ def forward(self, x, timestep, y=None, mask=None, cond_image=None):
814
+ """
815
+ Forward pass of STDiT.
816
+ Args:
817
+ x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
818
+ timestep (torch.Tensor): diffusion time steps; of shape [B]
819
+ y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
820
+ mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
821
+
822
+ Returns:
823
+ x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
824
+ """
825
+
826
+ # x = x.to(self.dtype)
827
+ # timestep = timestep.to(self.dtype)
828
+ # y = y.to(self.dtype)
829
+
830
+ # embedding
831
+ x = self.x_embedder(x) # [B, N, C]
832
+ # print(x.shape, self.num_temporal, self.num_spatial)
833
+ x = rearrange(
834
+ x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial
835
+ )
836
+ x = x + self.pos_embed
837
+ x = rearrange(x, "B T S C -> B (T S) C")
838
+
839
+ # shard over the sequence dim if sp is enabled
840
+ # if self.enable_sequence_parallelism:
841
+ # x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")
842
+
843
+ t = self.t_embedder(timestep, dtype=x.dtype) # [B, C]
844
+ t0 = self.t_block(t) # [B, C]
845
+ if self.y_embedder is not None and y is not None:
846
+ y = self.y_embedder(y, self.training) # [B, 1, N_token, C]
847
+
848
+ if mask is not None:
849
+ if mask.shape[0] != y.shape[0]:
850
+ mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
851
+ mask = mask.squeeze(1).squeeze(1)
852
+ y = (
853
+ y.squeeze(1)
854
+ .masked_select(mask.unsqueeze(-1) != 0)
855
+ .view(1, -1, x.shape[-1])
856
+ )
857
+ y_lens = mask.sum(dim=1).tolist()
858
+ else:
859
+ y_lens = [y.shape[2]] * y.shape[0] # N_token * B
860
+ y = y.squeeze(1).view(1, -1, x.shape[-1])
861
+ else:
862
+ y = None
863
+ y_lens = None
864
+
865
+ # blocks
866
+ for i, block in enumerate(self.blocks):
867
+ if i == 0:
868
+ tpe = self.pos_embed_temporal
869
+ else:
870
+ tpe = None
871
+ x = block(x=x, t=t0, y=y, mask=y_lens, tpe=tpe)
872
+ # x.shape: [B, N, C]
873
+
874
+ # final process
875
+ x = self.final_layer(x, t) # [B, N, C=T_p * H_p * W_p * C_out]
876
+ x = self.unpatchify(x) # [B, C_out, T, H, W]
877
+
878
+ return x
879
+
880
+ def unpatchify(self, x):
881
+ """
882
+ Args:
883
+ x (torch.Tensor): of shape [B, N, C]
884
+
885
+ Return:
886
+ x (torch.Tensor): of shape [B, C_out, T, H, W]
887
+ """
888
+
889
+ N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
890
+ T_p, H_p, W_p = self.patch_size
891
+ x = rearrange(
892
+ x,
893
+ "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
894
+ N_t=N_t,
895
+ N_h=N_h,
896
+ N_w=N_w,
897
+ T_p=T_p,
898
+ H_p=H_p,
899
+ W_p=W_p,
900
+ C_out=self.out_channels,
901
+ )
902
+ return x
903
+
904
+ def unpatchify_old(self, x):
905
+ c = self.out_channels
906
+ t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
907
+ pt, ph, pw = self.patch_size
908
+
909
+ x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
910
+ x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
911
+ imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
912
+ return imgs
913
+
914
+ def get_spatial_pos_embed(self, grid_size=None):
915
+ if grid_size is None:
916
+ grid_size = self.input_size[1:]
917
+ pos_embed = get_2d_sincos_pos_embed(
918
+ self.hidden_size,
919
+ (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
920
+ scale=self.space_scale,
921
+ )
922
+ pos_embed = (
923
+ torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
924
+ )
925
+ return pos_embed
926
+
927
+ def get_temporal_pos_embed(self):
928
+ pos_embed = get_1d_sincos_pos_embed(
929
+ self.hidden_size,
930
+ self.input_size[0] // self.patch_size[0],
931
+ scale=self.time_scale,
932
+ )
933
+ pos_embed = (
934
+ torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
935
+ )
936
+ return pos_embed
937
+
938
+ def freeze_not_temporal(self):
939
+ for n, p in self.named_parameters():
940
+ if "attn_temp" not in n:
941
+ p.requires_grad = False
942
+
943
+ def freeze_text(self):
944
+ for n, p in self.named_parameters():
945
+ if "cross_attn" in n:
946
+ p.requires_grad = False
947
+
948
+ def initialize_temporal(self):
949
+ for block in self.blocks:
950
+ nn.init.constant_(block.attn_temp.proj.weight, 0)
951
+ nn.init.constant_(block.attn_temp.proj.bias, 0)
952
+
953
+ def initialize_weights(self):
954
+ # Initialize transformer layers:
955
+ def _basic_init(module):
956
+ if isinstance(module, nn.Linear):
957
+ torch.nn.init.xavier_uniform_(module.weight)
958
+ if module.bias is not None:
959
+ nn.init.constant_(module.bias, 0)
960
+
961
+ self.apply(_basic_init)
962
+
963
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
964
+ w = self.x_embedder.proj.weight.data
965
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
966
+
967
+ # Initialize timestep embedding MLP:
968
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
969
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
970
+ nn.init.normal_(self.t_block[1].weight, std=0.02)
971
+
972
+ # Initialize caption embedding MLP:
973
+ if self.y_embedder is not None:
974
+ nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
975
+ nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
976
+
977
+ # Zero-out the cross-attention output projections so the residual branch starts as a no-op:
978
+ for block in self.blocks:
979
+ if hasattr(block, "cross_attn"):  # cross-attention only exists when caption conditioning is enabled
+ nn.init.constant_(block.cross_attn.proj.weight, 0)
980
+ nn.init.constant_(block.cross_attn.proj.bias, 0)
981
+
982
+ # Zero-out output layers:
983
+ nn.init.constant_(self.final_layer.linear.weight, 0)
984
+ nn.init.constant_(self.final_layer.linear.bias, 0)
985
+
986
+
987
+ @dataclass
988
+ class DiffuserSTDiTModelOutput(BaseOutput):
989
+ """
990
+ The output of [`DiffuserSTDiT`].
991
+
992
+ Args:
993
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
994
+ The denoised latent video output conditioned on the `encoder_hidden_states` (prompt embedding)
995
+ input.
996
+ """
997
+
998
+ sample: torch.FloatTensor
999
+
1000
+
1001
+ class DiffuserSTDiT(ModelMixin, ConfigMixin):
1002
+ """
1003
+ STDiT: Spatio-Temporal Diffusion Transformer.
1004
+
1005
+ Parameters:
1006
+ input_size (tuple): Input size of the video. Default: (1, 32, 32).
1007
+ in_channels (int): Number of input video channels. Default: 4.
1008
+ out_channels (int): Number of output video channels. Default: 4.
1009
+ patch_size (tuple): Patch token size. Default: (1, 2, 2).
1010
+ hidden_size (int): Hidden size of the model. Default: 1152.
1011
+ depth (int): Number of layers. Default: 28.
1012
+ num_heads (int): Number of attention heads. Default: 16.
1013
+ mlp_ratio (float): Ratio of hidden to mlp hidden size. Default: 4.0.
1014
+ class_dropout_prob (float): Probability of dropping class tokens. Default: 0.1.
1015
+ drop_path (float): Drop path rate. Default: 0.0.
1016
+ no_temporal_pos_emb (bool): Disable temporal positional embeddings. Default: False.
1017
+ caption_channels (int): Number of caption channels. Default: 4096.
1018
+ model_max_length (int): Maximum number of caption tokens. Default: 120.
1019
+ space_scale (float): Spatial scale. Default: 1.0.
1020
+ time_scale (float): Temporal scale. Default: 1.0.
1021
+ enable_flashattn (bool): Enable FlashAttention. Default: False.
1022
+ """
1023
+
1024
+ @register_to_config
1025
+ def __init__(
1026
+ self,
1027
+ input_size=(1, 32, 32), # T, H, W
1028
+ in_channels=4,
1029
+ out_channels=4,
1030
+ patch_size=(1, 2, 2), # T, H, W
1031
+ hidden_size=1152, #
1032
+ depth=28, # Number of layers
1033
+ num_heads=16,
1034
+ mlp_ratio=4.0,
1035
+ class_dropout_prob=0.1,
1036
+ drop_path=0.0,
1037
+ no_temporal_pos_emb=False,
1038
+ caption_channels=4096, # 0 to disable
1039
+ model_max_length=120,
1040
+ space_scale=1.0,
1041
+ time_scale=1.0,
1042
+ enable_flashattn=False,
1043
+ ):
1044
+
1045
+ super().__init__()
1046
+
1047
+ self.model = STDiT(
1048
+ input_size=input_size,
1049
+ in_channels=in_channels,
1050
+ out_channels=out_channels,
1051
+ patch_size=patch_size,
1052
+ hidden_size=hidden_size,
1053
+ depth=depth,
1054
+ num_heads=num_heads,
1055
+ mlp_ratio=mlp_ratio,
1056
+ class_dropout_prob=class_dropout_prob,
1057
+ drop_path=drop_path,
1058
+ no_temporal_pos_emb=no_temporal_pos_emb,
1059
+ caption_channels=caption_channels,
1060
+ model_max_length=model_max_length,
1061
+ space_scale=space_scale,
1062
+ time_scale=time_scale,
1063
+ enable_flashattn=enable_flashattn,
1064
+ )
1065
+
1066
+ def forward(
1067
+ self,
1068
+ x,
1069
+ timestep,
1070
+ encoder_hidden_states=None,
1071
+ cond_image=None,
1072
+ mask=None,
1073
+ return_dict=True,
1074
+ *args,
1075
+ **kwargs,
1076
+ ):
1077
+ """
1078
+ Args:
1079
+ x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
1080
+ timestep (torch.Tensor): diffusion time steps; of shape [B]
1081
+ y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
1082
+ mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
1083
+ return_dict (bool): return a dictionary or not. Default: True.
1084
+ """
1085
+ if isinstance(timestep, int) or timestep.ndim == 0:
1086
+ timestep = torch.ones(x.shape[0], device=x.device) * timestep
1087
+
1088
+ encoder_hidden_states = (
1089
+ encoder_hidden_states.unsqueeze(1)
1090
+ if encoder_hidden_states is not None
1091
+ else None
1092
+ )
1093
+
1094
+ if cond_image is not None:
1095
+ assert (
1096
+ x.shape == cond_image.shape
1097
+ ), "x and cond_image must have the same shape"
1098
+ x = torch.cat([x, cond_image], dim=1) # B x 2C x T x H x W
1099
+
1100
+ output = self.model(x, timestep, encoder_hidden_states, mask)
1101
+ if not return_dict:
1102
+ return (output,)
1103
+
1104
+ return DiffuserSTDiTModelOutput(sample=output)
1105
+
1106
+
1107
+ ##############################
1108
+ # Image-Conditioned ST UNet  #
1109
+ ##############################
1110
+
1111
+
1112
+ @torch._dynamo.disable
1113
+ @dataclass
1114
+ class UNetSTICOutput(BaseOutput): # UNet-SpatioTemporal-ImageConditioned
1115
+ """
1116
+ The output of [`UNetSpatioTemporalConditionModel`].
1117
+
1118
+ Args:
1119
+ sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
1120
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
1121
+ """
1122
+
1123
+ sample: torch.Tensor = None
1124
+
1125
+
1126
+ class UNetSTIC(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
1127
+ r"""
1128
+ A conditional spatio-temporal UNet model that takes noisy video frames, a conditioning state, and a timestep, and
1129
+ returns a sample-shaped output.
1130
+
1131
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
1132
+ for all models (such as downloading or saving).
1133
+
1134
+ Parameters:
1135
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
1136
+ Height and width of input/output sample.
1137
+ in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
1138
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
1139
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
1140
+ The tuple of downsample blocks to use.
1141
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
1142
+ The tuple of upsample blocks to use.
1143
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
1144
+ The tuple of output channels for each block.
1145
+ addition_time_embed_dim: (`int`, defaults to 256):
1146
+ Dimension used to encode the additional time ids.
1147
+ projection_class_embeddings_input_dim (`int`, defaults to 768):
1148
+ The dimension of the projection of encoded `added_time_ids`.
1149
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
1150
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1024):
1151
+ The dimension of the cross attention features.
1152
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
1153
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
1154
+ [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
1155
+ [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
1156
+ [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
1157
+ num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 20, 20)`):
1158
+ The number of attention heads.
1159
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
1160
+ """
1161
+
1162
+ _supports_gradient_checkpointing = True
1163
+
1164
+ @register_to_config
1165
+ def __init__(
1166
+ self,
1167
+ sample_size: Optional[int] = None,
1168
+ in_channels: int = 8,
1169
+ out_channels: int = 4,
1170
+ down_block_types: Tuple[str] = (
1171
+ "CrossAttnDownBlockSpatioTemporal",
1172
+ "CrossAttnDownBlockSpatioTemporal",
1173
+ "CrossAttnDownBlockSpatioTemporal",
1174
+ "DownBlockSpatioTemporal",
1175
+ ),
1176
+ up_block_types: Tuple[str] = (
1177
+ "UpBlockSpatioTemporal",
1178
+ "CrossAttnUpBlockSpatioTemporal",
1179
+ "CrossAttnUpBlockSpatioTemporal",
1180
+ "CrossAttnUpBlockSpatioTemporal",
1181
+ ),
1182
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
1183
+ addition_time_embed_dim: int = 256,
1184
+ projection_class_embeddings_input_dim: int = 768,
1185
+ layers_per_block: Union[int, Tuple[int]] = 2,
1186
+ cross_attention_dim: Union[int, Tuple[int]] = 1024,
1187
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
1188
+ num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
1189
+ num_frames: int = 25,
1190
+ ):
1191
+ super().__init__()
1192
+
1193
+ self.sample_size = sample_size
1194
+
1195
+ # Check inputs
1196
+ if len(down_block_types) != len(up_block_types):
1197
+ raise ValueError(
1198
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
1199
+ )
1200
+
1201
+ if len(block_out_channels) != len(down_block_types):
1202
+ raise ValueError(
1203
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
1204
+ )
1205
+
1206
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(
1207
+ down_block_types
1208
+ ):
1209
+ raise ValueError(
1210
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
1211
+ )
1212
+
1213
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(
1214
+ down_block_types
1215
+ ):
1216
+ raise ValueError(
1217
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
1218
+ )
1219
+
1220
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(
1221
+ down_block_types
1222
+ ):
1223
+ raise ValueError(
1224
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
1225
+ )
1226
+
1227
+ # input
1228
+ self.conv_in = nn.Conv2d(
1229
+ in_channels,
1230
+ block_out_channels[0],
1231
+ kernel_size=3,
1232
+ padding=1,
1233
+ )
1234
+
1235
+ # time
1236
+ time_embed_dim = block_out_channels[0] * 4
1237
+
1238
+ self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
1239
+ timestep_input_dim = block_out_channels[0]
1240
+
1241
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
1242
+
1243
+ # self.add_time_proj = Timesteps(
1244
+ # addition_time_embed_dim, True, downscale_freq_shift=0
1245
+ # )
1246
+ # self.add_embedding = TimestepEmbedding(
1247
+ # projection_class_embeddings_input_dim, time_embed_dim
1248
+ # )
1249
+
1250
+ self.down_blocks = nn.ModuleList([])
1251
+ self.up_blocks = nn.ModuleList([])
1252
+
1253
+ if isinstance(num_attention_heads, int):
1254
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
1255
+
1256
+ if isinstance(cross_attention_dim, int):
1257
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
1258
+
1259
+ if isinstance(layers_per_block, int):
1260
+ layers_per_block = [layers_per_block] * len(down_block_types)
1261
+
1262
+ if isinstance(transformer_layers_per_block, int):
1263
+ transformer_layers_per_block = [transformer_layers_per_block] * len(
1264
+ down_block_types
1265
+ )
1266
+
1267
+ blocks_time_embed_dim = time_embed_dim
1268
+
1269
+ # down
1270
+ output_channel = block_out_channels[0]
1271
+ for i, down_block_type in enumerate(down_block_types):
1272
+ input_channel = output_channel
1273
+ output_channel = block_out_channels[i]
1274
+ is_final_block = i == len(block_out_channels) - 1
1275
+
1276
+ down_block = get_down_block_3d(
1277
+ down_block_type,
1278
+ num_layers=layers_per_block[i],
1279
+ transformer_layers_per_block=transformer_layers_per_block[i],
1280
+ in_channels=input_channel,
1281
+ out_channels=output_channel,
1282
+ temb_channels=blocks_time_embed_dim,
1283
+ add_downsample=not is_final_block,
1284
+ resnet_eps=1e-5,
1285
+ cross_attention_dim=cross_attention_dim[i],
1286
+ num_attention_heads=num_attention_heads[i],
1287
+ resnet_act_fn="silu",
1288
+ )
1289
+ self.down_blocks.append(down_block)
1290
+
1291
+ # mid
1292
+ self.mid_block = UNetMidBlockSpatioTemporal(
1293
+ block_out_channels[-1],
1294
+ temb_channels=blocks_time_embed_dim,
1295
+ transformer_layers_per_block=transformer_layers_per_block[-1],
1296
+ cross_attention_dim=cross_attention_dim[-1],
1297
+ num_attention_heads=num_attention_heads[-1],
1298
+ )
1299
+
1300
+ # count how many layers upsample the images
1301
+ self.num_upsamplers = 0
1302
+
1303
+ # up
1304
+ reversed_block_out_channels = list(reversed(block_out_channels))
1305
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
1306
+ reversed_layers_per_block = list(reversed(layers_per_block))
1307
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
1308
+ reversed_transformer_layers_per_block = list(
1309
+ reversed(transformer_layers_per_block)
1310
+ )
1311
+
1312
+ output_channel = reversed_block_out_channels[0]
1313
+ for i, up_block_type in enumerate(up_block_types):
1314
+ is_final_block = i == len(block_out_channels) - 1
1315
+
1316
+ prev_output_channel = output_channel
1317
+ output_channel = reversed_block_out_channels[i]
1318
+ input_channel = reversed_block_out_channels[
1319
+ min(i + 1, len(block_out_channels) - 1)
1320
+ ]
1321
+
1322
+ # add upsample block for all BUT final layer
1323
+ if not is_final_block:
1324
+ add_upsample = True
1325
+ self.num_upsamplers += 1
1326
+ else:
1327
+ add_upsample = False
1328
+
1329
+ up_block = get_up_block_3d(
1330
+ up_block_type,
1331
+ num_layers=reversed_layers_per_block[i] + 1,
1332
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
1333
+ in_channels=input_channel,
1334
+ out_channels=output_channel,
1335
+ prev_output_channel=prev_output_channel,
1336
+ temb_channels=blocks_time_embed_dim,
1337
+ add_upsample=add_upsample,
1338
+ resnet_eps=1e-5,
1339
+ resolution_idx=i,
1340
+ cross_attention_dim=reversed_cross_attention_dim[i],
1341
+ num_attention_heads=reversed_num_attention_heads[i],
1342
+ resnet_act_fn="silu",
1343
+ )
1344
+ self.up_blocks.append(up_block)
1345
+ prev_output_channel = output_channel
1346
+
1347
+ # out
1348
+ self.conv_norm_out = nn.GroupNorm(
1349
+ num_channels=block_out_channels[0], num_groups=32, eps=1e-5
1350
+ )
1351
+ self.conv_act = nn.SiLU()
1352
+
1353
+ self.conv_out = nn.Conv2d(
1354
+ block_out_channels[0],
1355
+ out_channels,
1356
+ kernel_size=3,
1357
+ padding=1,
1358
+ )
1359
+
1360
+ # self.set_default_attn_processor()
1361
+
1362
+ @property
1363
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
1364
+ r"""
1365
+ Returns:
1366
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
1367
+ indexed by its weight name.
1368
+ """
1369
+ # set recursively
1370
+ processors = {}
1371
+
1372
+ def fn_recursive_add_processors(
1373
+ name: str,
1374
+ module: torch.nn.Module,
1375
+ processors: Dict[str, AttentionProcessor],
1376
+ ):
1377
+ if hasattr(module, "get_processor"):
1378
+ processors[f"{name}.processor"] = module.get_processor()
1379
+
1380
+ for sub_name, child in module.named_children():
1381
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
1382
+
1383
+ return processors
1384
+
1385
+ for name, module in self.named_children():
1386
+ fn_recursive_add_processors(name, module, processors)
1387
+
1388
+ return processors
1389
+
1390
+ def set_attn_processor(self, processor):
1391
+ r"""
1392
+ Sets the attention processor to use to compute attention.
1393
+
1394
+ Parameters:
1395
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
1396
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
1397
+ for **all** `Attention` layers.
1398
+
1399
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
1400
+ processor. This is strongly recommended when setting trainable attention processors.
1401
+
1402
+ """
1403
+ count = len(self.attn_processors.keys())
1404
+
1405
+ if isinstance(processor, dict) and len(processor) != count:
1406
+ raise ValueError(
1407
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
1408
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
1409
+ )
1410
+
1411
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
1412
+ if hasattr(module, "set_processor"):
1413
+ if not isinstance(processor, dict):
1414
+ module.set_processor(processor)
1415
+ else:
1416
+ module.set_processor(processor.pop(f"{name}.processor"))
1417
+
1418
+ for sub_name, child in module.named_children():
1419
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
1420
+
1421
+ for name, module in self.named_children():
1422
+ fn_recursive_attn_processor(name, module, processor)
1423
+
1424
+ def set_default_attn_processor(self):
1425
+ """
1426
+ Disables custom attention processors and sets the default attention implementation.
1427
+ """
1428
+ if all(
1429
+ proc.__class__ in CROSS_ATTENTION_PROCESSORS
1430
+ for proc in self.attn_processors.values()
1431
+ ):
1432
+ processor = AttnProcessor()
1433
+ else:
1434
+ raise ValueError(
1435
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
1436
+ )
1437
+
1438
+ self.set_attn_processor(processor)
1439
+
1440
+ def _set_gradient_checkpointing(self, module, value=False):
1441
+ if hasattr(module, "gradient_checkpointing"):
1442
+ module.gradient_checkpointing = value
1443
+
1444
+ # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
1445
+ def enable_forward_chunking(
1446
+ self, chunk_size: Optional[int] = None, dim: int = 0
1447
+ ) -> None:
1448
+ """
1449
+ Sets the attention processor to use [feed forward
1450
+ chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
1451
+
1452
+ Parameters:
1453
+ chunk_size (`int`, *optional*):
1454
+ The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
1455
+ over each tensor of dim=`dim`.
1456
+ dim (`int`, *optional*, defaults to `0`):
1457
+ The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
1458
+ or dim=1 (sequence length).
1459
+ """
1460
+ if dim not in [0, 1]:
1461
+ raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
1462
+
1463
+ # By default chunk size is 1
1464
+ chunk_size = chunk_size or 1
1465
+
1466
+ def fn_recursive_feed_forward(
1467
+ module: torch.nn.Module, chunk_size: int, dim: int
1468
+ ):
1469
+ if hasattr(module, "set_chunk_feed_forward"):
1470
+ module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
1471
+
1472
+ for child in module.children():
1473
+ fn_recursive_feed_forward(child, chunk_size, dim)
1474
+
1475
+ for module in self.children():
1476
+ fn_recursive_feed_forward(module, chunk_size, dim)
1477
+
1478
+ def forward(
1479
+ self,
1480
+ x: torch.Tensor,
1481
+ timestep: Union[torch.Tensor, float, int],
1482
+ encoder_hidden_states: torch.Tensor,
1483
+ cond_image=None,
1484
+ mask=None,
1485
+ # added_time_ids: torch.Tensor,
1486
+ return_dict: bool = True,
1487
+ ) -> Union[UNetSTICOutput, Tuple]:
1488
+ r"""
1489
+ The [`UNetSpatioTemporalConditionModel`] forward method.
1490
+
1491
+ Args:
1492
+ x (`torch.Tensor`):
1493
+ The noisy input tensor of shape `(batch, channel, num_frames, height, width)`.
1494
+ timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
1495
+ encoder_hidden_states (`torch.Tensor`):
1496
+ The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
1497
+ cond_image (`torch.Tensor`):
1498
+ Conditioning latents with the same temporal and spatial shape as `x`, concatenated to `x` along the
1499
+ channel dimension before the first convolution.
1500
+ return_dict (`bool`, *optional*, defaults to `True`):
1501
+ Whether or not to return a [`~models.unet_spatio_temporal.UNetSTICOutput`] instead
1502
+ of a plain tuple.
1503
+ Returns:
1504
+ [`~models.unet_spatio_temporal.UNetSTICOutput`] or `tuple`:
1505
+ If `return_dict` is True, an [`~models.unet_spatio_temporal.UNetSTICOutput`] is
1506
+ returned, otherwise a `tuple` is returned where the first element is the sample tensor.
1507
+ """
1508
+
1509
+ sample = torch.cat([x, cond_image], dim=1)  # concatenate conditioning along the channel dim: (B, C_x + C_cond, T, H, W)
1510
+
1511
+ # pad to multiple of 2**n
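+ # (padding is computed from the width and applied to both spatial dims, so square inputs with an
+ # even padding amount are assumed; circular padding wraps content instead of inserting zeros)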
1512
+ res_target = 2 ** (np.ceil(np.log2(sample.shape[-1])).astype(int))
1513
+ padding = (res_target - sample.shape[-1]) // 2
1514
+ sample = F.pad(
1515
+ sample, (padding, padding, padding, padding, 0, 0), mode="circular"
1516
+ )
1517
+
1518
+ # reshape from B C T H W to B T C H W
1519
+ sample = sample.permute(0, 2, 1, 3, 4)
1520
+
1521
+ # 1. time
1522
+ timesteps = timestep
1523
+ if not torch.is_tensor(timesteps):
1524
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
1525
+ # This would be a good case for the `match` statement (Python 3.10+)
1526
+ is_mps = sample.device.type == "mps"
1527
+ if isinstance(timestep, float):
1528
+ dtype = torch.float32 if is_mps else torch.float64
1529
+ else:
1530
+ dtype = torch.int32 if is_mps else torch.int64
1531
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
1532
+ elif len(timesteps.shape) == 0:
1533
+ timesteps = timesteps[None].to(sample.device)
1534
+
1535
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
1536
+ batch_size, num_frames = sample.shape[:2]
1537
+ timesteps = timesteps.expand(batch_size)
1538
+
1539
+ t_emb = self.time_proj(timesteps)
1540
+
1541
+ # `Timesteps` does not contain any weights and will always return f32 tensors
1542
+ # but time_embedding might actually be running in fp16. so we need to cast here.
1543
+ # there might be better ways to encapsulate this.
1544
+ t_emb = t_emb.to(dtype=sample.dtype)
1545
+
1546
+ emb = self.time_embedding(t_emb)
1547
+
1548
+ # time_embeds = self.add_time_proj(added_time_ids.flatten())
1549
+ # time_embeds = time_embeds.reshape((batch_size, -1))
1550
+ # time_embeds = time_embeds.to(emb.dtype)
1551
+ # aug_emb = self.add_embedding(time_embeds)
1552
+ # emb = emb + aug_emb
1553
+
1554
+ # Flatten the batch and frames dimensions
1555
+ # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
1556
+ sample = sample.flatten(0, 1)
1557
+ # Repeat the embeddings num_video_frames times
1558
+ # emb: [batch, channels] -> [batch * frames, channels]
1559
+ emb = emb.repeat_interleave(num_frames, dim=0)
1560
+ # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
1561
+ encoder_hidden_states = encoder_hidden_states.repeat_interleave(
1562
+ num_frames, dim=0
1563
+ )
1564
+
1565
+ # 2. pre-process
1566
+ sample = self.conv_in(sample)
1567
+
1568
+ image_only_indicator = torch.zeros(
1569
+ batch_size, num_frames, dtype=sample.dtype, device=sample.device
1570
+ )
1571
+
1572
+ down_block_res_samples = (sample,)
1573
+ for downsample_block in self.down_blocks:
1574
+ if (
1575
+ hasattr(downsample_block, "has_cross_attention")
1576
+ and downsample_block.has_cross_attention
1577
+ ):
1578
+ sample, res_samples = downsample_block(
1579
+ hidden_states=sample,
1580
+ temb=emb,
1581
+ encoder_hidden_states=encoder_hidden_states,
1582
+ image_only_indicator=image_only_indicator,
1583
+ )
1584
+ else:
1585
+ sample, res_samples = downsample_block(
1586
+ hidden_states=sample,
1587
+ temb=emb,
1588
+ image_only_indicator=image_only_indicator,
1589
+ )
1590
+
1591
+ down_block_res_samples += res_samples
1592
+
1593
+ # 4. mid
1594
+ sample = self.mid_block(
1595
+ hidden_states=sample,
1596
+ temb=emb,
1597
+ encoder_hidden_states=encoder_hidden_states,
1598
+ image_only_indicator=image_only_indicator,
1599
+ )
1600
+
1601
+ # 5. up
1602
+ for i, upsample_block in enumerate(self.up_blocks):
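+ # pop the skip-connection residuals produced by the matching down block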
1603
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1604
+ down_block_res_samples = down_block_res_samples[
1605
+ : -len(upsample_block.resnets)
1606
+ ]
1607
+
1608
+ if (
1609
+ hasattr(upsample_block, "has_cross_attention")
1610
+ and upsample_block.has_cross_attention
1611
+ ):
1612
+ sample = upsample_block(
1613
+ hidden_states=sample,
1614
+ temb=emb,
1615
+ res_hidden_states_tuple=res_samples,
1616
+ encoder_hidden_states=encoder_hidden_states,
1617
+ image_only_indicator=image_only_indicator,
1618
+ )
1619
+ else:
1620
+ sample = upsample_block(
1621
+ hidden_states=sample,
1622
+ temb=emb,
1623
+ res_hidden_states_tuple=res_samples,
1624
+ image_only_indicator=image_only_indicator,
1625
+ )
1626
+
1627
+ # 6. post-process
1628
+ sample = self.conv_norm_out(sample)
1629
+ sample = self.conv_act(sample)
1630
+ sample = self.conv_out(sample)
1631
+
1632
+ # 7. Reshape back to original shape
1633
+ sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
1634
+
1635
+ if padding > 0:
1636
+ sample = sample[:, :, :, padding:-padding, padding:-padding]
1637
+
1638
+ # reshape back to B C T H W
1639
+ sample = sample.permute(0, 2, 1, 3, 4)
1640
+
1641
+ if not return_dict:
1642
+ return (sample,)
1643
+
1644
+ return UNetSTICOutput(sample=sample)
1645
+
1646
+
1647
+ class ContrastiveModel(nn.Module):
1648
+ def __init__(self, in_channels, out_channels, backbone=None, kl_loss_weight=0.0):
1649
+ super(ContrastiveModel, self).__init__()
1650
+
1651
+ assert backbone is not None, "Backbone must be provided."
1652
+ self.backbone = backbone
1653
+
1654
+ self.backbone = self.patch_backbone(self.backbone, in_channels, out_channels)
1655
+
1656
+ self.fc_end = nn.Linear(out_channels, 1)
1657
+
1658
+ self.kl_loss_weight = kl_loss_weight
1659
+
1660
+ @classmethod
1661
+ def patch_backbone(cls, backbone, in_channels, out_channels):
1662
+ if "ResNet" in backbone.__class__.__name__:
1663
+ backbone.model.conv1 = nn.Conv2d(
1664
+ in_channels,
1665
+ 64,
1666
+ kernel_size=(7, 7),
1667
+ stride=(2, 2),
1668
+ padding=(3, 3),
1669
+ bias=False,
1670
+ )
1671
+ backbone.model.fc = nn.Linear(
1672
+ in_features=512, out_features=out_channels, bias=True
1673
+ )
1674
+ else:
1675
+ raise Exception(
1676
+ "Invalid argument: "
1677
+ + backbone.__class__.__name__
1678
+ + "\nChoose ResNet! Other architectures are not yet implemented in this framework."
1679
+ )
1680
+
1681
+ return backbone
1682
+
1683
+ def forward_once(self, x):
1684
+ features = self.backbone(x)
1685
+ output = torch.sigmoid(features)
1686
+ return output, features
1687
+
1688
+ def forward_constrastive(self, input1, input2):
1689
+ y1, _ = self.forward_once(input1)  # forward_once returns (sigmoid output, raw features)
1690
+ y2, _ = self.forward_once(input2)
1691
+
1692
+ difference = torch.abs(y1 - y2)
1693
+ output = self.fc_end(difference) # linear layer
1694
+
1695
+ return output # B x 1
1696
+
1697
+ def forward_fused(self, input1, input2):
1698
+ inputs = torch.cat((input1, input2), dim=0) # 2B x C x H x W
1699
+ outputs, features = self.forward_once(inputs)
1700
+ y1, y2 = torch.split(outputs, outputs.size(0) // 2, dim=0)
1701
+ difference = torch.abs(y1 - y2)
1702
+ output = self.fc_end(difference)
1703
+
1704
+ # Compute KL divergence
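+ # (KL between a diagonal Gaussian fitted to the batch of features and a standard normal prior,
+ #  used to regularize the embedding distribution)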
1705
+ if self.kl_loss_weight > 0:
1706
+ mu = torch.mean(features, dim=0)
1707
+ var = torch.var(features, dim=0) + 1e-6 # Add epsilon to avoid log(0)
1708
+ kl_loss = 0.5 * torch.sum(mu.pow(2) + var - torch.log(var) - 1)
1709
+ else:
1710
+ kl_loss = torch.zeros((1,), device=output.device)
1711
+ return output, kl_loss
1712
+
1713
+ def loss(self, output, target):
1714
+ return nn.functional.binary_cross_entropy_with_logits(output, target[:, None])
1715
+
1716
+ def forward(self, input1, input2, target):
1717
+ y_hat, kl_loss = self.forward_fused(input1, input2)
1718
+ loss = self.loss(y_hat, target)
1719
+ total_loss = loss + self.kl_loss_weight * kl_loss
1720
+ return total_loss, loss, kl_loss
1721
+
1722
+
1723
+ class ResNet18(ModelMixin, ConfigMixin):
1724
+ @register_to_config
1725
+ def __init__(self, weights=None, progress=False):
1726
+ super(ResNet18, self).__init__()
1727
+ self.model = resnet18(weights=weights, progress=progress)
1728
+
1729
+ def forward(self, x):
1730
+ return self.model(x)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ opencv-python==4.9.0.80
2
+ diffusers==0.30.3
3
+ einops==0.7.0
4
+ gradio==5.22.0
5
+ huggingface-hub==0.29.3
6
+ numpy==1.26.4
7
+ omegaconf==2.3.0
8
+ pillow==10.2.0
9
+ safetensors==0.4.5
10
+ torch==2.2.2
11
+ torchdiffeq==0.2.4
12
+ xformers==0.0.25.post1
13
+ timm==0.9.16
14
+ accelerate==0.34.2