Spaces:

NTUST-DDRC
/

gen3c

Build error

App Files Files Community

gen3c / cosmos_predict1 /tokenizer /training /configs /base /net.py

elungky

Initial commit for new Space - pre-built Docker image

28451f7 28 days ago

raw

history blame contribute delete

7.4 kB

	# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Net config options for cosmos/tokenizer

	ContinuousImageTokenizerConfig
	DiscreteImageTokenizerConfig
	CausalContinuousFactorizedVideoTokenizerConfig
	CausalDiscreteFactorizedVideoTokenizerConfig

	"""

	from cosmos_predict1.tokenizer.modules import (
	ContinuousFormulation,
	Decoder3DType,
	DecoderType,
	DiscreteQuantizer,
	Encoder3DType,
	EncoderType,
	)
	from cosmos_predict1.tokenizer.networks.continuous_image import ContinuousImageTokenizer
	from cosmos_predict1.tokenizer.networks.continuous_video import CausalContinuousVideoTokenizer
	from cosmos_predict1.tokenizer.networks.discrete_image import DiscreteImageTokenizer
	from cosmos_predict1.tokenizer.networks.discrete_video import CausalDiscreteVideoTokenizer
	from cosmos_predict1.utils.lazy_config import LazyCall as L
	from cosmos_predict1.utils.lazy_config import LazyDict

	# Config for the continuous image tokenizer: L(ContinuousImageTokenizer) called
	# with the keyword arguments below, annotated as a LazyDict.
	# NOTE(review): presumably nothing is instantiated at import time -- confirm in
	# cosmos_predict1.utils.lazy_config.
	ContinuousImageTokenizerConfig: LazyDict = L(ContinuousImageTokenizer)(
	# The attention resolution for res blocks.
	attn_resolutions=[32],
	# The base number of channels.
	channels=128,
	# The channel multiplier for each resolution.
	channels_mult=[2, 4, 4],
	dropout=0.0,
	in_channels=3,
	# The spatial compression ratio, default 8.
	spatial_compression=8,
	# The number of layers in each res block.
	num_res_blocks=2,
	out_channels=3,
	resolution=1024,
	patch_size=4,
	patch_method="haar",
	# The output latent dimension (channels).
	latent_channels=16,
	# The encoder output channels just before sampling,
	# which is also the decoder's input channels.
	z_channels=16,
	# A factor over the z_channels, to get the total channels the encoder should output.
	# For a VAE for instance, we want to output the mean and variance, so we need 2 * z_channels.
	# Since we are using the AE formulation, we only need the mean, so z_factor=1.
	z_factor=1,
	name="ContinuousImageTokenizer",
	# What formulation to use, either "AE" or "VAE".
	# AE is chosen here, since it has been proven to be effective.
	formulation=ContinuousFormulation.AE.name,
	# Specify type of encoder ["Default", "LiteVAE"]
	encoder=EncoderType.Default.name,
	# Specify type of decoder ["Default"]
	decoder=DecoderType.Default.name,
	)

	# Config for the discrete image tokenizer: L(DiscreteImageTokenizer) called
	# with the keyword arguments below, annotated as a LazyDict.
	# NOTE(review): presumably nothing is instantiated at import time -- confirm in
	# cosmos_predict1.utils.lazy_config.
	DiscreteImageTokenizerConfig: LazyDict = L(DiscreteImageTokenizer)(
	# The attention resolution for res blocks.
	attn_resolutions=[32],
	# The base number of channels.
	channels=128,
	# The channel multiplier for each resolution.
	channels_mult=[2, 4, 4],
	dropout=0.0,
	in_channels=3,
	# The spatial compression ratio.
	spatial_compression=16,
	# The number of layers in each res block.
	num_res_blocks=2,
	out_channels=3,
	resolution=1024,
	patch_size=4,
	patch_method="haar",
	# The encoder output channels just before sampling.
	z_channels=256,
	# A factor over the z_channels, to get the total channels the encoder should output.
	# For discrete tokenization, often we directly use the vector, so z_factor=1.
	z_factor=1,
	# The quantizer of choice, VQ, LFQ, FSQ, or ResFSQ. Default FSQ.
	quantizer=DiscreteQuantizer.FSQ.name,
	# The embedding dimension post-quantization, which is also the input channels of the decoder.
	# NOTE(review): presumably also the quantizer's output dimension (the original note was truncated) -- confirm.
	embedding_dim=6,
	# The number of levels to use for finite scalar quantization (one entry per embedding dimension here).
	levels=[8, 8, 8, 5, 5, 5],
	persistent_quantizer=False,
	# The number of quantizers to use for residual finite scalar quantization.
	# NOTE(review): presumably only relevant to ResFSQ, not the FSQ quantizer selected above -- confirm.
	num_quantizers=4,
	name="DiscreteImageTokenizer",
	# Specify type of encoder ["Default", "LiteVAE"]
	encoder=EncoderType.Default.name,
	# Specify type of decoder ["Default"]
	decoder=DecoderType.Default.name,
	)

	# Config for the causal continuous video tokenizer with factorized encoder/decoder:
	# L(CausalContinuousVideoTokenizer) called with the keyword arguments below,
	# annotated as a LazyDict.
	CausalContinuousFactorizedVideoTokenizerConfig: LazyDict = L(CausalContinuousVideoTokenizer)(
	# The new causal continuous tokenizer, which is at least 2x more efficient in memory and runtime.
	# - It relies on fully 3D discrete wavelet transform
	# - Uses a layer norm instead of a group norm
	# - Factorizes full convolutions into spatial and temporal convolutions
	# - Factorizes full attention into spatial and temporal attention
	# - Adopts an AE formulation
	# - Strictly causal, with flexible temporal length at inference.
	attn_resolutions=[32],
	channels=128,
	channels_mult=[2, 4, 4],
	dropout=0.0,
	in_channels=3,
	num_res_blocks=2,
	out_channels=3,
	resolution=1024,
	patch_size=4,
	patch_method="haar",
	latent_channels=16,
	z_channels=16,
	z_factor=1,
	num_groups=1,
	# Most of the continuous-video (CV) and discrete-video (DV) tokenizers trained
	# before September 1, 2024, used temporal upsampling that was not perfectly
	# mirrored with the encoder's temporal downsampling. Moving forward, new CV/DV
	# tokenizers will use legacy_mode=False, meaning they will adopt mirrored upsampling.
	legacy_mode=False,
	spatial_compression=8,
	temporal_compression=8,
	formulation=ContinuousFormulation.AE.name,
	encoder=Encoder3DType.FACTORIZED.name,
	decoder=Decoder3DType.FACTORIZED.name,
	name="CausalContinuousFactorizedVideoTokenizer",
	)

	# Config for the causal discrete video tokenizer with factorized encoder/decoder:
	# L(CausalDiscreteVideoTokenizer) called with the keyword arguments below,
	# annotated as a LazyDict.
	CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
	# The new causal discrete tokenizer, which is at least 2x more efficient in memory and runtime.
	# - It relies on fully 3D discrete wavelet transform
	# - Uses a layer norm instead of a group norm
	# - Factorizes full convolutions into spatial and temporal convolutions
	# - Factorizes full attention into spatial and temporal attention
	# - Strictly causal, with flexible temporal length at inference.
	attn_resolutions=[32],
	channels=128,
	channels_mult=[2, 4, 4],
	dropout=0.0,
	in_channels=3,
	num_res_blocks=2,
	out_channels=3,
	resolution=1024,
	patch_size=4,
	patch_method="haar",
	# The encoder output channels just before quantization is changed to 256
	# from 16 (old versions). It aligns with the discrete image (DI) tokenizer
	# config, which uses z_channels=256, making initialization from image
	# tokenizers easier.
	z_channels=256,
	z_factor=1,
	num_groups=1,
	# Most of the continuous-video (CV) and discrete-video (DV) tokenizers trained
	# before September 1, 2024, used temporal upsampling that was not perfectly
	# mirrored with the encoder's temporal downsampling. Moving forward, new CV/DV
	# tokenizers will use legacy_mode=False, meaning they will adopt mirrored upsampling.
	legacy_mode=False,
	spatial_compression=16,
	temporal_compression=8,
	quantizer=DiscreteQuantizer.FSQ.name,
	embedding_dim=6,
	levels=[8, 8, 8, 5, 5, 5],
	persistent_quantizer=False,
	encoder=Encoder3DType.FACTORIZED.name,
	decoder=Decoder3DType.FACTORIZED.name,
	name="CausalDiscreteFactorizedVideoTokenizer",
	)