cosmos_transfer1_av
/
cosmos_transfer1
/diffusion
/config
/inference
/cosmos-1-diffusion-control2world.py
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
# SPDX-License-Identifier: Apache-2.0 | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
from hydra.core.config_store import ConfigStore | |
from cosmos_transfer1.checkpoints import ( | |
BASE_7B_CHECKPOINT_AV_SAMPLE_PATH, | |
EDGE2WORLD_CONTROLNET_DISTILLED_CHECKPOINT_PATH, | |
BASE_t2w_7B_SV2MV_CHECKPOINT_AV_SAMPLE_PATH, | |
BASE_v2w_7B_SV2MV_CHECKPOINT_AV_SAMPLE_PATH, | |
) | |
from cosmos_transfer1.diffusion.config.transfer.conditioner import CTRL_HINT_KEYS_COMB | |
from cosmos_transfer1.diffusion.model.model_ctrl import ( | |
VideoDiffusionModelWithCtrl, | |
VideoDiffusionT2VModelWithCtrl, | |
VideoDistillModelWithCtrl, | |
) | |
from cosmos_transfer1.diffusion.model.model_multi_camera_ctrl import MultiVideoDiffusionModelWithCtrl | |
from cosmos_transfer1.diffusion.networks.general_dit_multi_view import MultiViewVideoExtendGeneralDIT | |
from cosmos_transfer1.diffusion.networks.general_dit_video_conditioned import VideoExtendGeneralDIT | |
from cosmos_transfer1.utils.lazy_config import LazyCall as L | |
from cosmos_transfer1.utils.lazy_config import LazyDict | |
cs = ConfigStore.instance() | |
# Base configuration for 7B model | |
Base_7B_Config = LazyDict( | |
dict( | |
defaults=[ | |
{"override /net": "faditv2_7b"}, | |
{"override /conditioner": "add_fps_image_size_padding_mask"}, | |
{"override /tokenizer": "cosmos_diffusion_tokenizer_res720_comp8x8x8_t121_ver092624"}, | |
"_self_", | |
], | |
model=dict( | |
latent_shape=[16, 16, 88, 160], | |
net=dict( | |
rope_h_extrapolation_ratio=1, | |
rope_w_extrapolation_ratio=1, | |
rope_t_extrapolation_ratio=2, | |
), | |
), | |
job=dict( | |
group="Control2World", | |
name="Base_7B_Config", | |
), | |
) | |
) | |
def make_ctrlnet_config_7b( | |
hint_key: str = "control_input_seg", | |
num_control_blocks: int = 3, | |
) -> LazyDict: | |
hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key]) | |
return LazyDict( | |
dict( | |
defaults=[ | |
"/experiment/Base_7B_Config", | |
{"override /hint_key": hint_key}, | |
{"override /net_ctrl": "faditv2_7b"}, | |
{"override /conditioner": "ctrlnet_add_fps_image_size_padding_mask"}, | |
], | |
job=dict( | |
group="CTRL_7Bv1_lvg", | |
name=f"CTRL_7Bv1pt3_lvg_tp_121frames_{hint_key}_block{num_control_blocks}", | |
project="cosmos_transfer1", | |
), | |
model=dict( | |
hint_mask=hint_mask, | |
hint_dropout_rate=0.3, | |
conditioner=dict(video_cond_bool=dict()), | |
net=L(VideoExtendGeneralDIT)( | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
), | |
net_ctrl=dict( | |
in_channels=17, | |
hint_channels=128, | |
num_blocks=28, | |
layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)], | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
), | |
), | |
model_obj=L(VideoDiffusionModelWithCtrl)(), | |
) | |
) | |
def make_ctrlnet_config_7b_t2v( | |
hint_key: str = "control_input_seg", | |
num_control_blocks: int = 3, | |
) -> LazyDict: | |
hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key]) | |
return LazyDict( | |
dict( | |
defaults=[ | |
"/experiment/Base_7B_Config", | |
{"override /hint_key": hint_key}, | |
{"override /net_ctrl": "faditv2_7b"}, | |
{"override /conditioner": "ctrlnet_add_fps_image_size_padding_mask"}, | |
], | |
job=dict( | |
group="CTRL_7Bv1_t2v", | |
name=f"CTRL_7Bv1pt3_t2v_121frames_{hint_key}_block{num_control_blocks}", | |
project="cosmos_ctrlnet1", | |
), | |
model=dict( | |
base_load_from=dict( | |
load_path=f"checkpoints/{BASE_7B_CHECKPOINT_AV_SAMPLE_PATH}", | |
), | |
hint_mask=hint_mask, | |
hint_dropout_rate=0.3, | |
net=dict( | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
), | |
net_ctrl=dict( | |
in_channels=16, | |
hint_channels=16, | |
num_blocks=28, | |
layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)], | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
), | |
), | |
model_obj=L(VideoDiffusionT2VModelWithCtrl)(), | |
) | |
) | |
def make_ctrlnet_config_7b_mv( | |
hint_key: str = "control_input_seg", | |
num_control_blocks: int = 3, | |
t2w: bool = True, | |
) -> LazyDict: | |
hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key]) | |
return LazyDict( | |
dict( | |
defaults=[ | |
"/experiment/Base_7B_Config", | |
{"override /hint_key": hint_key}, | |
{"override /net_ctrl": "faditv2_7b_mv"}, | |
{"override /conditioner": "view_cond_ctrlnet_add_fps_image_size_padding_mask"}, | |
], | |
job=dict( | |
group="CTRL_7Bv1_mv", | |
name=f"CTRL_7Bv1pt3_sv2mv_{'t2w' if t2w else 'v2w'}_57frames_{hint_key}_block{num_control_blocks}", | |
project="cosmos_ctrlnet1", | |
), | |
model=dict( | |
n_views=6, | |
base_load_from=dict( | |
load_path=f"checkpoints/{BASE_t2w_7B_SV2MV_CHECKPOINT_AV_SAMPLE_PATH}" | |
if t2w | |
else f"checkpoints/{BASE_v2w_7B_SV2MV_CHECKPOINT_AV_SAMPLE_PATH}", | |
), | |
hint_mask=hint_mask, | |
hint_dropout_rate=0.3, | |
conditioner=dict( | |
video_cond_bool=dict( | |
condition_location="first_cam" if t2w else "first_cam_and_first_n", | |
) | |
), | |
net=L(MultiViewVideoExtendGeneralDIT)( | |
n_views=6, | |
n_views_emb=7, | |
camera_condition_dim=6, | |
add_repeat_frame_embedding=True, | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
), | |
net_ctrl=dict( | |
in_channels=16, | |
hint_channels=16, | |
num_blocks=28, | |
n_views=6, | |
n_views_emb=7, | |
camera_condition_dim=6, | |
add_repeat_frame_embedding=True, | |
is_extend_model=True, | |
layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)], | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
), | |
tokenizer=dict( | |
video_vae=dict( | |
pixel_chunk_duration=57, | |
) | |
), | |
), | |
model_obj=L(MultiVideoDiffusionModelWithCtrl)(), | |
) | |
) | |
def make_ctrlnet_config_7b_mv_waymo( | |
hint_key: str = "control_input_seg", | |
num_control_blocks: int = 3, | |
t2w: bool = True, | |
) -> LazyDict: | |
hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key]) | |
return LazyDict( | |
dict( | |
defaults=[ | |
"/experiment/Base_7B_Config", | |
{"override /hint_key": hint_key}, | |
{"override /net_ctrl": "faditv2_7b_mv"}, | |
{"override /conditioner": "view_cond_ctrlnet_add_fps_image_size_padding_mask"}, | |
], | |
job=dict( | |
group="CTRL_7Bv1_mv", | |
name=f"CTRL_7Bv1pt3_sv2mv_{'t2w' if t2w else 'v2w'}_57frames_{hint_key}_waymo_block{num_control_blocks}", | |
project="cosmos_ctrlnet1", | |
), | |
model=dict( | |
n_views=5, | |
base_load_from=dict( | |
load_path=f"checkpoints/{BASE_t2w_7B_SV2MV_CHECKPOINT_AV_SAMPLE_PATH}" | |
if t2w | |
else f"checkpoints/{BASE_v2w_7B_SV2MV_CHECKPOINT_AV_SAMPLE_PATH}", | |
), | |
hint_mask=hint_mask, | |
hint_dropout_rate=0.15, | |
conditioner=dict( | |
video_cond_bool=dict( | |
condition_location="first_cam" if t2w else "first_cam_and_first_n", | |
cfg_unconditional_type="zero_condition_region_condition_mask", | |
apply_corruption_to_condition_region="noise_with_sigma", | |
condition_on_augment_sigma=False, | |
dropout_rate=0.0, | |
first_random_n_num_condition_t_max=0 if t2w else 2, | |
normalize_condition_latent=False, | |
augment_sigma_sample_p_mean=-3.0, | |
augment_sigma_sample_p_std=2.0, | |
augment_sigma_sample_multiplier=1.0, | |
) | |
), | |
net=L(MultiViewVideoExtendGeneralDIT)( | |
n_views=5, | |
n_views_emb=7, | |
camera_condition_dim=6, | |
add_repeat_frame_embedding=True, | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
num_blocks=28, | |
), | |
adjust_video_noise=True, | |
net_ctrl=dict( | |
in_channels=16, | |
hint_channels=16, | |
num_blocks=28, | |
n_views=5, | |
n_views_emb=7, | |
camera_condition_dim=6, | |
add_repeat_frame_embedding=True, | |
is_extend_model=True, | |
layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)], | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
), | |
tokenizer=dict( | |
video_vae=dict( | |
pixel_chunk_duration=57, | |
) | |
), | |
), | |
model_obj=L(MultiVideoDiffusionModelWithCtrl)(), | |
) | |
) | |
# Register base configs | |
cs.store(group="experiment", package="_global_", name=Base_7B_Config["job"]["name"], node=Base_7B_Config) | |
# Register all control configurations | |
num_control_blocks = 3 | |
for key in CTRL_HINT_KEYS_COMB.keys(): | |
# Register 7B configurations | |
config_7b = make_ctrlnet_config_7b(hint_key=key, num_control_blocks=num_control_blocks) | |
cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b) | |
# Register t2v based control net | |
num_control_blocks = 3 | |
for key in ["control_input_hdmap", "control_input_lidar"]: | |
# Register 7B configurations | |
config_7b = make_ctrlnet_config_7b_t2v(hint_key=key, num_control_blocks=num_control_blocks) | |
cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b) | |
num_control_blocks = 3 | |
for key in ["control_input_hdmap", "control_input_lidar"]: | |
for t2w in [True, False]: | |
# Register 7B sv2mv configurations | |
config_7b = make_ctrlnet_config_7b_mv(hint_key=key, num_control_blocks=num_control_blocks, t2w=t2w) | |
cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b) | |
# Register waymo example | |
num_control_blocks = 3 | |
for key in ["control_input_hdmap", "control_input_lidar"]: | |
for t2w in [True, False]: | |
# Register 7B sv2mv configurations | |
config_7b = make_ctrlnet_config_7b_mv_waymo(hint_key=key, num_control_blocks=num_control_blocks, t2w=t2w) | |
cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b) | |
def make_ctrlnet_config_7b_distilled( | |
hint_key: str = "control_input_edge", | |
num_control_blocks: int = 3, | |
) -> LazyDict: | |
hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key]) | |
return LazyDict( | |
dict( | |
defaults=[ | |
"/experiment/Base_7B_Config", | |
{"override /hint_key": hint_key}, | |
{"override /net": "faditv2_7b"}, | |
{"override /net_ctrl": "faditv2_7b"}, | |
{"override /conditioner": "ctrlnet_add_fps_image_size_padding_mask"}, | |
"_self_", | |
], | |
job=dict( | |
group="DISTILL_CTRL_7Bv1", | |
name=f"CTRL_7Bv1pt3_lvg_fsdp_distilled_121frames_{hint_key}_block{num_control_blocks}", | |
project="cosmos_nano_v1", | |
), | |
model=dict( | |
base_load_from=dict( | |
load_path=f"checkpoints/{EDGE2WORLD_CONTROLNET_DISTILLED_CHECKPOINT_PATH}", | |
), | |
hint_mask=hint_mask, | |
hint_dropout_rate=0.0, | |
conditioner=dict( | |
video_cond_bool=dict( | |
condition_location="first_random_n", | |
cfg_unconditional_type="zero_condition_region_condition_mask", | |
apply_corruption_to_condition_region="noise_with_sigma_fixed", | |
condition_on_augment_sigma=False, | |
dropout_rate=0.0, | |
first_random_n_num_condition_t_max=2, | |
first_random_n_num_condition_t_min=0, | |
normalize_condition_latent=False, | |
augment_sigma_sample_p_mean=-3.0, | |
augment_sigma_sample_p_std=2.0, | |
augment_sigma_sample_multiplier=1.0, | |
) | |
), | |
net=L(VideoExtendGeneralDIT)( | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
rope_t_extrapolation_ratio=2, | |
), | |
net_ctrl=dict( | |
in_channels=17, | |
hint_channels=128, | |
num_blocks=28, | |
layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)], | |
num_control_blocks=num_control_blocks, | |
dropout_ctrl_branch=0, | |
extra_per_block_abs_pos_emb=True, | |
pos_emb_learnable=True, | |
extra_per_block_abs_pos_emb_type="learnable", | |
), | |
), | |
model_obj=L(VideoDistillModelWithCtrl)(), | |
) | |
) | |
# Register the specific distilled configuration | |
distilled_config = make_ctrlnet_config_7b_distilled(hint_key="control_input_edge", num_control_blocks=3) | |
cs.store( | |
group="experiment", | |
package="_global_", | |
name="dev_v2w_ctrl_7bv1pt3_VisControlCanny_video_only_dmd2_fsdp", | |
node=distilled_config, | |
) | |