# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Net config options for cosmos/tokenizer | |
ContinuousImageTokenizerConfig | |
DiscreteImageTokenizerConfig | |
CausalContinuousVideoTokenizerConfig | |
""" | |
from cosmos_predict1.tokenizer.modules import (
    ContinuousFormulation,
    Decoder3DType,
    DecoderType,
    DiscreteQuantizer,
    Encoder3DType,
    EncoderType,
)
from cosmos_predict1.tokenizer.networks.continuous_image import ContinuousImageTokenizer
from cosmos_predict1.tokenizer.networks.continuous_video import CausalContinuousVideoTokenizer
from cosmos_predict1.tokenizer.networks.discrete_image import DiscreteImageTokenizer
from cosmos_predict1.tokenizer.networks.discrete_video import CausalDiscreteVideoTokenizer
from cosmos_predict1.utils.lazy_config import LazyCall as L
from cosmos_predict1.utils.lazy_config import LazyDict

ContinuousImageTokenizerConfig: LazyDict = L(ContinuousImageTokenizer)(
    # The attention resolution for res blocks.
    attn_resolutions=[32],
    # The base number of channels.
    channels=128,
    # The channel multiplier for each resolution.
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    # The spatial compression ratio, default 8.
    spatial_compression=8,
    # The number of layers in each res block.
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    # The output latent dimension (channels).
    latent_channels=16,
    # The encoder output channels just before sampling,
    # which is also the decoder's input channels.
    z_channels=16,
    # A factor over the z_channels, to get the total channels the encoder should output.
    # For a VAE, for instance, we want to output the mean and variance, so we need 2 * z_channels.
    # Since we are using the AE formulation, we only need the mean, so z_factor=1.
    z_factor=1,
    name="ContinuousImageTokenizer",
    # Which formulation to use, either "AE" or "VAE".
    # We chose AE here, since it has proven effective.
    formulation=ContinuousFormulation.AE.name,
    # Specify the type of encoder ["Default", "LiteVAE"]
    encoder=EncoderType.Default.name,
    # Specify the type of decoder ["Default"]
    decoder=DecoderType.Default.name,
)
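
# A minimal usage sketch (assumption, not part of the original file): lazy configs like the
# one above are normally materialized into concrete modules via an `instantiate`-style helper,
# as in detectron2-style lazy configs. The exact helper name/path below is an assumption;
# adjust it to whatever cosmos_predict1.utils.lazy_config actually exposes in your checkout.
#
#   from cosmos_predict1.utils.lazy_config import instantiate
#   tokenizer = instantiate(ContinuousImageTokenizerConfig)  # -> ContinuousImageTokenizer(...)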

DiscreteImageTokenizerConfig: LazyDict = L(DiscreteImageTokenizer)(
    # The attention resolution for res blocks.
    attn_resolutions=[32],
    # The base number of channels.
    channels=128,
    # The channel multiplier for each resolution.
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    # The spatial compression ratio.
    spatial_compression=16,
    # The number of layers in each res block.
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    # The encoder output channels just before quantization.
    z_channels=256,
    # A factor over the z_channels, to get the total channels the encoder should output.
    # For discrete tokenization, we often use the vector directly, so z_factor=1.
    z_factor=1,
    # The quantizer of choice: VQ, LFQ, FSQ, or ResFSQ. Default: FSQ.
    quantizer=DiscreteQuantizer.FSQ.name,
    # The embedding dimension post-quantization, which is also the decoder's input channels
    # and the quantizer's output dimension.
    embedding_dim=6,
    # The number of levels to use for finite-scalar quantization (FSQ).
    levels=[8, 8, 8, 5, 5, 5],
    persistent_quantizer=False,
    # The number of quantizers to use for residual finite-scalar quantization (ResFSQ).
    num_quantizers=4,
    name="DiscreteImageTokenizer",
    # Specify the type of encoder ["Default", "LiteVAE"]
    encoder=EncoderType.Default.name,
    # Specify the type of decoder ["Default"]
    decoder=DecoderType.Default.name,
)
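
# Sanity-check sketch (assumption, not part of the original file): with finite-scalar
# quantization the effective codebook size is the product of the per-dimension levels,
# so levels=[8, 8, 8, 5, 5, 5] yields 8 * 8 * 8 * 5 * 5 * 5 = 64,000 discrete codes,
# each mapped to an embedding_dim=6 vector. Kept as a comment so importing this config
# module has no side effects:
#
#   import math
#   assert math.prod([8, 8, 8, 5, 5, 5]) == 64_000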

CausalContinuousFactorizedVideoTokenizerConfig: LazyDict = L(CausalContinuousVideoTokenizer)(
    # The new causal continuous tokenizer, which is at least 2x more efficient in memory and runtime:
    # - Relies on a fully 3D discrete wavelet transform
    # - Uses a layer norm instead of a group norm
    # - Factorizes full convolutions into spatial and temporal convolutions
    # - Factorizes full attention into spatial and temporal attention
    # - Adopts the AE formulation
    # - Strictly causal, with flexible temporal length at inference.
    attn_resolutions=[32],
    channels=128,
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    latent_channels=16,
    z_channels=16,
    z_factor=1,
    num_groups=1,
    # Most of the CV and DV tokenizers trained before September 1, 2024,
    # used temporal upsampling that was not perfectly mirrored with the
    # encoder's temporal downsampling. Moving forward, new CV/DV tokenizers
    # will use legacy_mode=False, meaning they will adopt mirrored upsampling.
    legacy_mode=False,
    spatial_compression=8,
    temporal_compression=8,
    formulation=ContinuousFormulation.AE.name,
    encoder=Encoder3DType.FACTORIZED.name,
    decoder=Decoder3DType.FACTORIZED.name,
    name="CausalContinuousFactorizedVideoTokenizer",
)
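
# Shape sketch (assumption, not part of the original file): with spatial_compression=8 and
# temporal_compression=8, a causal input of (1 + 8*k) frames at H x W resolution maps to a
# latent of roughly (1 + k) x (H / 8) x (W / 8) with latent_channels=16 channels; the extra
# leading latent frame lets the same tokenizer also encode single images (k = 0).
#
#   # num_frames = 1 + 8 * k, e.g. 33 frames -> 1 + (33 - 1) // 8 = 5 latent frames
#   latent_frames = 1 + (num_frames - 1) // temporal_compression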

CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
    # The new causal discrete tokenizer, which is at least 2x more efficient in memory and runtime:
    # - Relies on a fully 3D discrete wavelet transform
    # - Uses a layer norm instead of a group norm
    # - Factorizes full convolutions into spatial and temporal convolutions
    # - Factorizes full attention into spatial and temporal attention
    # - Strictly causal, with flexible temporal length at inference.
    attn_resolutions=[32],
    channels=128,
    channels_mult=[2, 4, 4],
    dropout=0.0,
    in_channels=3,
    num_res_blocks=2,
    out_channels=3,
    resolution=1024,
    patch_size=4,
    patch_method="haar",
    # The encoder output channels just before quantization changed to 256
    # from 16 (older versions). This aligns with the DiscreteImageTokenizer (DI),
    # which uses 256 channels, making initialization from image tokenizers easier.
    z_channels=256,
    z_factor=1,
    num_groups=1,
    # Most of the CV and DV tokenizers trained before September 1, 2024,
    # used temporal upsampling that was not perfectly mirrored with the
    # encoder's temporal downsampling. Moving forward, new CV/DV tokenizers
    # will use legacy_mode=False, meaning they will adopt mirrored upsampling.
    legacy_mode=False,
    spatial_compression=16,
    temporal_compression=8,
    quantizer=DiscreteQuantizer.FSQ.name,
    embedding_dim=6,
    levels=[8, 8, 8, 5, 5, 5],
    persistent_quantizer=False,
    encoder=Encoder3DType.FACTORIZED.name,
    decoder=Decoder3DType.FACTORIZED.name,
    name="CausalDiscreteFactorizedVideoTokenizer",
)
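
# End-to-end sketch (assumption, not part of the original file): the discrete video config
# above turns a causal clip of (1 + 8*k) frames at H x W into roughly (1 + k) x (H / 16) x (W / 16)
# integer token indices drawn from the 64,000-entry FSQ codebook (the product of `levels`).
# A hedged instantiation example, assuming an `instantiate`-style helper is exposed by
# cosmos_predict1.utils.lazy_config; adjust the import to match your checkout.
#
#   from cosmos_predict1.utils.lazy_config import instantiate
#   video_tokenizer = instantiate(CausalDiscreteFactorizedVideoTokenizerConfig)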