ltx-video-distilled

Running on Zero

App Files Files Community

ltx-video-distilled / ltx_video / models / autoencoders / latent_upsampler.py

linoyts HF Staff

Upload 35 files

833590f verified 17 days ago

raw

history blame contribute delete

7.03 kB

	from typing import Optional, Union
	from pathlib import Path
	import os
	import json

	import torch
	import torch.nn as nn
	from einops import rearrange
	from diffusers import ConfigMixin, ModelMixin
	from safetensors.torch import safe_open

	from ltx_video.models.autoencoders.pixel_shuffle import PixelShuffleND


	class ResBlock(nn.Module):
	    """
	    Residual block: conv -> GroupNorm -> SiLU -> conv -> GroupNorm, then the
	    block input is added back and a final SiLU is applied.

	    Args:
	        channels (`int`): Number of input and output channels
	        mid_channels (`int`, *optional*): Number of channels between the two
	            convolutions; falls back to `channels` when `None`
	        dims (`int`): 2 selects `nn.Conv2d`; any other value selects `nn.Conv3d`

	    Both GroupNorm layers use 32 groups, so `channels` and `mid_channels`
	    must each be divisible by 32.
	    """

	    def __init__(
	        self, channels: int, mid_channels: Optional[int] = None, dims: int = 3
	    ):
	        super().__init__()
	        hidden = channels if mid_channels is None else mid_channels

	        if dims == 2:
	            conv_cls = nn.Conv2d
	        else:
	            conv_cls = nn.Conv3d

	        # Registration order is kept stable so state-dict key order does not change.
	        self.conv1 = conv_cls(channels, hidden, kernel_size=3, padding=1)
	        self.norm1 = nn.GroupNorm(32, hidden)
	        self.conv2 = conv_cls(hidden, channels, kernel_size=3, padding=1)
	        self.norm2 = nn.GroupNorm(32, channels)
	        self.activation = nn.SiLU()

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the block to `x`; kernel size 3 with padding 1 keeps the shape."""
	        out = self.activation(self.norm1(self.conv1(x)))
	        out = self.norm2(self.conv2(out))
	        # Skip connection is added before the last activation.
	        return self.activation(out + x)


	class LatentUpsampler(ModelMixin, ConfigMixin):
	    """
	    Model to upsample VAE latents spatially and/or temporally.

	    Pipeline: initial conv -> GroupNorm -> SiLU -> ResBlocks -> conv +
	    `PixelShuffleND` upsampler -> ResBlocks -> final conv back to `in_channels`.

	    Args:
	        in_channels (`int`): Number of channels in the input latent
	        mid_channels (`int`): Number of channels in the middle layers
	        num_blocks_per_stage (`int`): Number of ResBlocks to use in each stage (pre/post upsampling)
	        dims (`int`): Number of dimensions for convolutions (2 or 3); any value
	            other than 2 selects 3D convolutions
	        spatial_upsample (`bool`): Whether to spatially upsample the latent
	        temporal_upsample (`bool`): Whether to temporally upsample the latent

	    Raises:
	        ValueError: If both `spatial_upsample` and `temporal_upsample` are False.
	    """

	    def __init__(
	        self,
	        in_channels: int = 128,
	        mid_channels: int = 512,
	        num_blocks_per_stage: int = 4,
	        dims: int = 3,
	        spatial_upsample: bool = True,
	        temporal_upsample: bool = False,
	    ):
	        super().__init__()

	        self.in_channels = in_channels
	        self.mid_channels = mid_channels
	        self.num_blocks_per_stage = num_blocks_per_stage
	        self.dims = dims
	        self.spatial_upsample = spatial_upsample
	        self.temporal_upsample = temporal_upsample

	        Conv = nn.Conv2d if dims == 2 else nn.Conv3d

	        self.initial_conv = Conv(in_channels, mid_channels, kernel_size=3, padding=1)
	        self.initial_norm = nn.GroupNorm(32, mid_channels)
	        self.initial_activation = nn.SiLU()

	        self.res_blocks = nn.ModuleList(
	            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
	        )

	        # The upsampler's conv type depends on the upsample mode, not on `dims`:
	        # the spatial-only mode always uses a Conv2d, and `forward` folds frames
	        # into the batch axis around it when dims != 2.
	        # NOTE(review): with dims == 2, `forward` feeds 4D tensors to the
	        # upsampler, so the Conv3d-based temporal modes look unsupported for
	        # dims == 2 - confirm.
	        if spatial_upsample and temporal_upsample:
	            self.upsampler = nn.Sequential(
	                nn.Conv3d(mid_channels, 8 * mid_channels, kernel_size=3, padding=1),
	                PixelShuffleND(3),
	            )
	        elif spatial_upsample:
	            self.upsampler = nn.Sequential(
	                nn.Conv2d(mid_channels, 4 * mid_channels, kernel_size=3, padding=1),
	                PixelShuffleND(2),
	            )
	        elif temporal_upsample:
	            self.upsampler = nn.Sequential(
	                nn.Conv3d(mid_channels, 2 * mid_channels, kernel_size=3, padding=1),
	                PixelShuffleND(1),
	            )
	        else:
	            raise ValueError(
	                "Either spatial_upsample or temporal_upsample must be True"
	            )

	        # These blocks take `mid_channels` inputs, so the pixel shuffle is
	        # expected to bring the channel count back down to `mid_channels`.
	        self.post_upsample_res_blocks = nn.ModuleList(
	            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
	        )

	        self.final_conv = Conv(mid_channels, in_channels, kernel_size=3, padding=1)

	    def _run_pre_upsample(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the initial conv/norm/activation and the pre-upsample ResBlocks."""
	        x = self.initial_conv(x)
	        x = self.initial_norm(x)
	        x = self.initial_activation(x)
	        for block in self.res_blocks:
	            x = block(x)
	        return x

	    def _run_post_upsample(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the post-upsample ResBlocks and the final projection conv."""
	        for block in self.post_upsample_res_blocks:
	            x = block(x)
	        return self.final_conv(x)

	    def forward(self, latent: torch.Tensor) -> torch.Tensor:
	        """
	        Upsample a latent.

	        Args:
	            latent (`torch.Tensor`): 5D tensor unpacked as (b, c, f, h, w)

	        Returns:
	            `torch.Tensor`: The upsampled latent, again laid out as (b, c, f, h, w).
	        """
	        b, _, f, _, _ = latent.shape

	        if self.dims == 2:
	            # Every layer is 2D here: treat each frame as an independent image.
	            x = rearrange(latent, "b c f h w -> (b f) c h w")
	            x = self._run_pre_upsample(x)
	            x = self.upsampler(x)
	            x = self._run_post_upsample(x)
	            return rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)

	        x = self._run_pre_upsample(latent)

	        if self.temporal_upsample:
	            x = self.upsampler(x)
	            # Drop the first frame of the upsampled result.
	            # NOTE(review): presumably compensates for how the first latent
	            # frame is treated upstream - confirm against the VAE.
	            x = x[:, :, 1:, :, :]
	        else:
	            # Spatial-only upsampler is a Conv2d: fold frames into the batch
	            # axis around it, then restore the frame axis.
	            x = rearrange(x, "b c f h w -> (b f) c h w")
	            x = self.upsampler(x)
	            x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)

	        return self._run_post_upsample(x)

	    @classmethod
	    def from_config(cls, config):
	        """
	        Build a model from a dict of constructor arguments.

	        Missing keys fall back to the defaults below.
	        NOTE(review): these fallbacks (in_channels=4, mid_channels=128, dims=2)
	        differ from the `__init__` defaults (128, 512, 3) - confirm intended.
	        """
	        return cls(
	            in_channels=config.get("in_channels", 4),
	            mid_channels=config.get("mid_channels", 128),
	            num_blocks_per_stage=config.get("num_blocks_per_stage", 4),
	            dims=config.get("dims", 2),
	            spatial_upsample=config.get("spatial_upsample", True),
	            temporal_upsample=config.get("temporal_upsample", False),
	        )

	    def config(self):
	        """
	        Return the constructor arguments plus `_class_name` as a plain dict.

	        NOTE(review): this is a regular method, so it appears to shadow the
	        `config` attribute that `ConfigMixin` normally exposes - confirm.
	        """
	        return {
	            "_class_name": "LatentUpsampler",
	            "in_channels": self.in_channels,
	            "mid_channels": self.mid_channels,
	            "num_blocks_per_stage": self.num_blocks_per_stage,
	            "dims": self.dims,
	            "spatial_upsample": self.spatial_upsample,
	            "temporal_upsample": self.temporal_upsample,
	        }

	    @classmethod
	    def from_pretrained(
	        cls,
	        pretrained_model_path: Optional[Union[str, os.PathLike]],
	        *args,
	        **kwargs,
	    ):
	        """
	        Load a model from a single `.safetensors` file.

	        The model configuration is read as JSON from the `"config"` entry of
	        the file's metadata; the tensors in the file become the state dict.

	        Args:
	            pretrained_model_path (`str` or `os.PathLike`): Path to an existing
	                `.safetensors` file
	            *args, **kwargs: Accepted for signature compatibility; unused here

	        Returns:
	            The loaded model, with weights on CPU.

	        Raises:
	            ValueError: If the path is not an existing `.safetensors` file.
	        """
	        pretrained_model_path = Path(pretrained_model_path)
	        is_safetensors_file = pretrained_model_path.is_file() and str(
	            pretrained_model_path
	        ).endswith(".safetensors")
	        if not is_safetensors_file:
	            # Previously this fell through and crashed on an unbound local.
	            raise ValueError(
	                "LatentUpsampler.from_pretrained expects a path to an existing "
	                f".safetensors file, got: {pretrained_model_path}"
	            )

	        with safe_open(pretrained_model_path, framework="pt", device="cpu") as f:
	            metadata = f.metadata()
	            state_dict = {k: f.get_tensor(k) for k in f.keys()}
	        config = json.loads(metadata["config"])

	        # Build on the meta device so no real weights are allocated or
	        # initialised, then adopt the loaded tensors directly via assign=True.
	        with torch.device("meta"):
	            latent_upsampler = cls.from_config(config)
	        latent_upsampler.load_state_dict(state_dict, assign=True)
	        return latent_upsampler


	if __name__ == "__main__":
	    # Smoke test: build the default 3D model, report its size, and push one
	    # random latent through it to show the resulting shape.
	    model = LatentUpsampler(num_blocks_per_stage=4, dims=3)
	    print(model)

	    total_params = 0
	    for param in model.parameters():
	        total_params += param.numel()
	    print(f"Total number of parameters: {total_params:,}")

	    dummy_latent = torch.randn(1, 128, 9, 16, 16)
	    upsampled_latent = model(dummy_latent)
	    print(f"Upsampled latent shape: {upsampled_latent.shape}")