# Repository-page residue (not part of the module), kept as comments:
#   elungky's picture
#   Initial commit for new Space - pre-built Docker image
#   28451f7
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Net config options for cosmos/tokenizer
ContinuousImageTokenizerConfig
DiscreteImageTokenizerConfig
CausalContinuousFactorizedVideoTokenizerConfig
CausalDiscreteFactorizedVideoTokenizerConfig
"""
from cosmos_predict1.tokenizer.modules import (
ContinuousFormulation,
Decoder3DType,
DecoderType,
DiscreteQuantizer,
Encoder3DType,
EncoderType,
)
from cosmos_predict1.tokenizer.networks.continuous_image import ContinuousImageTokenizer
from cosmos_predict1.tokenizer.networks.continuous_video import CausalContinuousVideoTokenizer
from cosmos_predict1.tokenizer.networks.discrete_image import DiscreteImageTokenizer
from cosmos_predict1.tokenizer.networks.discrete_video import CausalDiscreteVideoTokenizer
from cosmos_predict1.utils.lazy_config import LazyCall as L
from cosmos_predict1.utils.lazy_config import LazyDict
# Config entry for `ContinuousImageTokenizer`, declared through `LazyCall`
# and annotated as a `LazyDict`.
ContinuousImageTokenizerConfig: LazyDict = L(ContinuousImageTokenizer)(
    # Resolution(s) at which the res blocks use attention.
    attn_resolutions=[32],
    # Base channel count.
    channels=128,
    # Channel multiplier applied at each resolution.
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    # Spatial compression ratio; default 8.
    spatial_compression=8,
    # Number of layers inside each res block.
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    # Dimension (channels) of the output latent.
    latent_channels=16,
    # Channels the encoder emits just before sampling; the decoder takes
    # the same number of channels as its input.
    z_channels=16,
    # Multiplier on z_channels giving the total channels the encoder outputs.
    # A VAE would output both mean and variance and so need 2 * z_channels;
    # with the AE formulation only the mean is needed, hence z_factor=1.
    z_factor=1,
    name="ContinuousImageTokenizer",
    # Formulation to use, either "AE" or "VAE".
    # AE is chosen here because it has been proven to be effective.
    formulation=ContinuousFormulation.AE.name,
    # Encoder type, one of ["Default", "LiteVAE"].
    encoder=EncoderType.Default.name,
    # Decoder type, one of ["Default"].
    decoder=DecoderType.Default.name,
)
# Config entry for `DiscreteImageTokenizer`, declared through `LazyCall`
# and annotated as a `LazyDict`.
DiscreteImageTokenizerConfig: LazyDict = L(DiscreteImageTokenizer)(
    # Resolution(s) at which the res blocks use attention.
    attn_resolutions=[32],
    # Base channel count.
    channels=128,
    # Channel multiplier applied at each resolution.
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    # Spatial compression ratio.
    spatial_compression=16,
    # Number of layers inside each res block.
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    # Channels the encoder emits just before sampling.
    z_channels=256,
    # Multiplier on z_channels giving the total channels the encoder outputs.
    # Discrete tokenization often uses the vector directly, hence z_factor=1.
    z_factor=1,
    # Quantizer of choice: VQ, LFQ, FSQ, or ResFSQ. Default FSQ.
    quantizer=DiscreteQuantizer.FSQ.name,
    # Embedding dimension after quantization, which is also the decoder's
    # input channel count. Equal to len(levels) in this config.
    embedding_dim=6,
    # Number of levels per dimension for finite scalar quantization (FSQ).
    levels=[8, 8, 8, 5, 5, 5],
    persistent_quantizer=False,
    # Number of quantizers for residual finite scalar quantization.
    # NOTE(review): presumably only consulted for ResFSQ - confirm.
    num_quantizers=4,
    name="DiscreteImageTokenizer",
    # Encoder type, one of ["Default", "LiteVAE"].
    encoder=EncoderType.Default.name,
    # Decoder type, one of ["Default"].
    decoder=DecoderType.Default.name,
)
# Config entry for the new causal continuous video tokenizer, which is at
# least 2x more efficient in memory and runtime. The tokenizer:
#   - relies on a fully 3D discrete wavelet transform,
#   - uses a layer norm instead of a group norm,
#   - factorizes full convolutions into spatial and temporal convolutions,
#   - factorizes full attention into spatial and temporal attention,
#   - adopts an AE formulation,
#   - is strictly causal, with flexible temporal length at inference.
CausalContinuousFactorizedVideoTokenizerConfig: LazyDict = L(CausalContinuousVideoTokenizer)(
    attn_resolutions=[32],
    channels=128,
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    latent_channels=16,
    z_channels=16,
    z_factor=1,
    num_groups=1,
    # Most CV and DV tokenizers trained before September 1, 2024 used a
    # temporal upsampling that did not perfectly mirror the encoder's
    # temporal downsampling. New CV/DV tokenizers use legacy_mode=False,
    # i.e. they adopt mirrored upsampling.
    legacy_mode=False,
    spatial_compression=8,
    temporal_compression=8,
    formulation=ContinuousFormulation.AE.name,
    encoder=Encoder3DType.FACTORIZED.name,
    decoder=Decoder3DType.FACTORIZED.name,
    name="CausalContinuousFactorizedVideoTokenizer",
)
# Config entry for the new causal discrete video tokenizer, which is at
# least 2x more efficient in memory and runtime. The tokenizer:
#   - relies on a fully 3D discrete wavelet transform,
#   - uses a layer norm instead of a group norm,
#   - factorizes full convolutions into spatial and temporal convolutions,
#   - factorizes full attention into spatial and temporal attention,
#   - is strictly causal, with flexible temporal length at inference.
CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
    attn_resolutions=[32],
    channels=128,
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    # Encoder output channels just before quantization: 256, changed from
    # 16 in old versions. This aligns with the DI, which uses 256 channels,
    # and makes initialization from image tokenizers easier.
    z_channels=256,
    z_factor=1,
    num_groups=1,
    # Most CV and DV tokenizers trained before September 1, 2024 used a
    # temporal upsampling that did not perfectly mirror the encoder's
    # temporal downsampling. New CV/DV tokenizers use legacy_mode=False,
    # i.e. they adopt mirrored upsampling.
    legacy_mode=False,
    spatial_compression=16,
    temporal_compression=8,
    quantizer=DiscreteQuantizer.FSQ.name,
    # Equal to len(levels) in this config.
    embedding_dim=6,
    levels=[8, 8, 8, 5, 5, 5],
    persistent_quantizer=False,
    encoder=Encoder3DType.FACTORIZED.name,
    decoder=Decoder3DType.FACTORIZED.name,
    name="CausalDiscreteFactorizedVideoTokenizer",
)