# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

import attrs

from cosmos_predict1.autoregressive.tokenizer.discrete_video import DiscreteVideoFSQStateDictTokenizer
from cosmos_predict1.autoregressive.tokenizer.networks import CausalDiscreteVideoTokenizer
from cosmos_predict1.utils.lazy_config import LazyCall as L
from cosmos_predict1.utils.lazy_config import LazyDict


def create_discrete_video_fsq_tokenizer_state_dict_config(
    ckpt_path, pixel_chunk_duration=33, compression_ratio=[8, 16, 16]
) -> LazyDict:
    CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
        # The new causal discrete tokenizer, which is at least 2x more efficient in memory and runtime:
        # - Relies on a fully 3D discrete wavelet transform
        # - Uses layer norm instead of group norm
        # - Factorizes full convolutions into spatial and temporal convolutions
        # - Factorizes full attention into spatial and temporal attention
        # - Strictly causal, with flexible temporal length at inference
        attn_resolutions=[32],
        channels=128,
        channels_mult=[2, 4, 4],
        dropout=0.0,
        in_channels=3,
        num_res_blocks=2,
        out_channels=3,
        resolution=1024,
        patch_size=4,
        patch_method="haar",
        z_channels=16,
        z_factor=1,
        num_groups=1,
        legacy_mode=False,
        spatial_compression=16,
        temporal_compression=8,
        embedding_dim=6,
        levels=[8, 8, 8, 5, 5, 5],
        name="CausalDiscreteFactorizedVideoTokenizer",
    )
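
    # Note (added for clarity): with FSQ, the codebook size is the product of the
    # per-dimension levels, so levels=[8, 8, 8, 5, 5, 5] yields 8 * 8 * 8 * 5 * 5 * 5
    # = 64,000 distinct video token ids (before any tokenizer_offset is applied).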
    return L(DiscreteVideoFSQStateDictTokenizer)(
        enc_fp=ckpt_path.replace("ema.jit", "encoder.jit"),
        dec_fp=ckpt_path.replace("ema.jit", "decoder.jit"),
        tokenizer_module=CausalDiscreteFactorizedVideoTokenizerConfig,
        name="discrete_video_fsq",
        latent_ch=6,
        is_bf16=True,
        pixel_chunk_duration=pixel_chunk_duration,
        latent_chunk_duration=1 + (pixel_chunk_duration - 1) // compression_ratio[0],
        max_enc_batch_size=8,
        max_dec_batch_size=4,
        levels=[8, 8, 8, 5, 5, 5],
        compression_ratio=compression_ratio,
    )
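

# Example usage (illustrative only; the checkpoint path below is a placeholder and the
# keyword values simply restate the defaults):
#
#   tokenizer_config = create_discrete_video_fsq_tokenizer_state_dict_config(
#       ckpt_path="checkpoints/<discrete-video-tokenizer>/ema.jit",
#       pixel_chunk_duration=33,
#       compression_ratio=[8, 16, 16],
#   )
#
# With pixel_chunk_duration=33 and a temporal compression factor of 8, the latent chunk
# duration resolves to 1 + (33 - 1) // 8 = 5 latent frames per pixel chunk.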


@attrs.define(slots=False)
class TextTokenizerConfig:
    """
    Text tokenizer config

    Args:
        config: Config file to define the text tokenizer class.
        data_key (str): The input key from data_dict that will be passed to the text tokenizer.
        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization.
        tokenizer_offset (int): Offset that is added to the tokens.
        vocab_size (int): Vocabulary size of the tokenizer.
    """

    config: LazyDict
    data_key: str = ""
    tokenize_here: bool = False
    tokenizer_offset: int = 0
    vocab_size: int = 0
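

# Illustrative example (not taken from this repo; the wrapped tokenizer class and values are
# placeholders). With tokenize_here=True, raw strings under data_key are tokenized online by
# the configured tokenizer:
#
#   text_tokenizer_config = TextTokenizerConfig(
#       config=L(SomeTextTokenizer)(),  # any LazyCall-wrapped tokenizer constructor
#       data_key="caption",
#       tokenize_here=True,
#       tokenizer_offset=0,
#       vocab_size=32000,
#   )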


@attrs.define(slots=False)
class VideoTokenizerConfig:
    """
    Video tokenizer config

    Args:
        config: Config file to define the video tokenizer class.
        data_key (str): The input key from data_dict that will be passed to the video tokenizer.
        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization.
        tokenizer_offset (int): Offset that is added to the tokens. In the case of joint text-video tokenizers,
            we add an offset to make sure that video tokens and text tokens don't overlap.
        vocab_size (int): Vocabulary size of the tokenizer.
        max_seq_len (int): Maximum token length for an input video.
        temporal_overlap (int): Overlap between consecutive video chunks.
    """

    config: LazyDict
    data_key: str = ""
    tokenize_here: bool = True
    tokenizer_offset: int = 0
    vocab_size: int = 0
    max_seq_len: int = -1
    temporal_overlap: int = 0
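

# Illustrative example (values are placeholders, not defaults from this repo). The video
# tokenizer config typically wraps the FSQ tokenizer defined above; vocab_size matches the
# 64,000-entry FSQ codebook, and tokenizer_offset shifts video token ids past the text
# vocabulary so the two ranges don't collide:
#
#   video_tokenizer_config = VideoTokenizerConfig(
#       config=create_discrete_video_fsq_tokenizer_state_dict_config(
#           ckpt_path="checkpoints/<discrete-video-tokenizer>/ema.jit",
#       ),
#       data_key="video",
#       tokenize_here=True,
#       tokenizer_offset=32000,  # placeholder: size of the text vocabulary
#       vocab_size=64000,
#       max_seq_len=-1,
#       temporal_overlap=0,
#   )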


@attrs.define(slots=False)
class TokenizerConfig:
    """
    Joint tokenizer config

    Args:
        text_tokenizer (TextTokenizerConfig): Text tokenizer config file
        class_tokenizer (ClassTokenizerConfig): Class tokenizer config file
        video_tokenizer (VideoTokenizerConfig): Video tokenizer config file
        image_tokenizer (ImageTokenizerConfig): Image tokenizer config file
        seq_len (int): Final token sequence length
        training_type (str): Type of training we use. Supports ["text_only", "text_to_video", "class_to_image", "image_text_interleaved"]
        add_special_tokens (bool): Whether to add special tokens to the output tokens
        pad_to_multiple_of (int): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
    """

    text_tokenizer: Optional[TextTokenizerConfig] = None
    video_tokenizer: Optional[VideoTokenizerConfig] = None
    seq_len: int = 4096
    training_type: Optional[str] = None
    add_special_tokens: bool = True
    pad_to_multiple_of: Optional[int] = 64
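

# Illustrative example (placeholder values). A text-to-video setup combines the two
# sub-configs sketched above; training_type must be one of the supported values listed
# in the docstring:
#
#   tokenizer_config = TokenizerConfig(
#       text_tokenizer=text_tokenizer_config,
#       video_tokenizer=video_tokenizer_config,
#       seq_len=4096,
#       training_type="text_to_video",
#       add_special_tokens=True,
#       pad_to_multiple_of=64,
#   )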