# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
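
# Per-block ONNX export settings. ONNX_CONFIG maps a diffusers model class to its
# exportable sub-blocks (keyed by submodule path, e.g. "down_blocks.0"); each entry
# records example dummy-input shapes, the expected output names, and the dynamic
# axes to declare at export time.
#
# sd3_common_transformer_block_config is the entry shared by SD3 joint transformer
# blocks 0-22; the final block gets its own entry in ONNX_CONFIG below.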
sd3_common_transformer_block_config = {
    "dummy_input": {
        "hidden_states": (2, 4096, 1536),
        "encoder_hidden_states": (2, 333, 1536),
        "temb": (2, 1536),
    },
    "output_names": ["encoder_hidden_states_out", "hidden_states_out"],
    "dynamic_axes": {
        "hidden_states": {0: "batch_size"},
        "encoder_hidden_states": {0: "batch_size"},
        "temb": {0: "steps"},
    },
}
ONNX_CONFIG = {
    UNet2DConditionModel: {
        "down_blocks.0": {
            "dummy_input": {
                "hidden_states": (2, 320, 128, 128),
                "temb": (2, 1280),
            },
            "output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"],
            "dynamic_axes": {
                "hidden_states": {0: "batch_size"},
                "temb": {0: "steps"},
            },
        },
"down_blocks.1": {
"dummy_input": {
"hidden_states": (2, 320, 64, 64),
"temb": (2, 1280),
"encoder_hidden_states": (2, 77, 2048),
},
"output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"],
"dynamic_axes": {
"hidden_states": {0: "batch_size"},
"temb": {0: "steps"},
"encoder_hidden_states": {0: "batch_size"},
},
},
"down_blocks.2": {
"dummy_input": {
"hidden_states": (2, 640, 32, 32),
"temb": (2, 1280),
"encoder_hidden_states": (2, 77, 2048),
},
"output_names": ["sample", "res_samples_0", "res_samples_1"],
"dynamic_axes": {
"hidden_states": {0: "batch_size"},
"temb": {0: "steps"},
"encoder_hidden_states": {0: "batch_size"},
},
},
"mid_block": {
"dummy_input": {
"hidden_states": (2, 1280, 32, 32),
"temb": (2, 1280),
"encoder_hidden_states": (2, 77, 2048),
},
"output_names": ["sample"],
"dynamic_axes": {
"hidden_states": {0: "batch_size"},
"temb": {0: "steps"},
"encoder_hidden_states": {0: "batch_size"},
},
},
"up_blocks.0": {
"dummy_input": {
"hidden_states": (2, 1280, 32, 32),
"res_hidden_states_0": (2, 640, 32, 32),
"res_hidden_states_1": (2, 1280, 32, 32),
"res_hidden_states_2": (2, 1280, 32, 32),
"temb": (2, 1280),
"encoder_hidden_states": (2, 77, 2048),
},
"output_names": ["sample"],
"dynamic_axes": {
"hidden_states": {0: "batch_size"},
"temb": {0: "steps"},
"encoder_hidden_states": {0: "batch_size"},
"res_hidden_states_0": {0: "batch_size"},
"res_hidden_states_1": {0: "batch_size"},
"res_hidden_states_2": {0: "batch_size"},
},
},
"up_blocks.1": {
"dummy_input": {
"hidden_states": (2, 1280, 64, 64),
"res_hidden_states_0": (2, 320, 64, 64),
"res_hidden_states_1": (2, 640, 64, 64),
"res_hidden_states_2": (2, 640, 64, 64),
"temb": (2, 1280),
"encoder_hidden_states": (2, 77, 2048),
},
"output_names": ["sample"],
"dynamic_axes": {
"hidden_states": {0: "batch_size"},
"temb": {0: "steps"},
"encoder_hidden_states": {0: "batch_size"},
"res_hidden_states_0": {0: "batch_size"},
"res_hidden_states_1": {0: "batch_size"},
"res_hidden_states_2": {0: "batch_size"},
},
},
"up_blocks.2": {
"dummy_input": {
"hidden_states": (2, 640, 128, 128),
"res_hidden_states_0": (2, 320, 128, 128),
"res_hidden_states_1": (2, 320, 128, 128),
"res_hidden_states_2": (2, 320, 128, 128),
"temb": (2, 1280),
},
"output_names": ["sample"],
"dynamic_axes": {
"hidden_states": {0: "batch_size"},
"temb": {0: "steps"},
"res_hidden_states_0": {0: "batch_size"},
"res_hidden_states_1": {0: "batch_size"},
"res_hidden_states_2": {0: "batch_size"},
},
},
},
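    # SD3 joint transformer blocks 0-22 share sd3_common_transformer_block_config;
    # the final block (index 23) does not emit updated encoder_hidden_states,
    # hence its separate entry below.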
    SD3Transformer2DModel: {
        **{f"transformer_blocks.{i}": sd3_common_transformer_block_config for i in range(23)},
        "transformer_blocks.23": {
            "dummy_input": {
                "hidden_states": (2, 4096, 1536),
                "encoder_hidden_states": (2, 333, 1536),
                "temb": (2, 1536),
            },
            "output_names": ["hidden_states_out"],
            "dynamic_axes": {
                "hidden_states": {0: "batch_size"},
                "encoder_hidden_states": {0: "batch_size"},
                "temb": {0: "steps"},
            },
        },
    },
}
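

# ---------------------------------------------------------------------------
# Illustrative sketch of how the entries above could drive torch.onnx.export
# for a single sub-block. The helper name `export_block`, the submodule lookup
# via `get_submodule`, and the positional packing of the dummy inputs are
# illustrative assumptions; blocks whose forward signature differs from the
# key order recorded here (e.g. up-blocks that take their residuals bundled
# in a tuple) would need their inputs repacked accordingly.
def export_block(model, block_name: str, onnx_path: str) -> None:
    import torch

    cfg = ONNX_CONFIG[type(model)][block_name]
    block = model.get_submodule(block_name)
    # Build example tensors from the recorded dummy-input shapes; cast/move
    # them to the block's dtype and device as needed before exporting.
    dummy_inputs = tuple(torch.randn(*shape) for shape in cfg["dummy_input"].values())
    torch.onnx.export(
        block,
        dummy_inputs,
        onnx_path,
        input_names=list(cfg["dummy_input"]),
        output_names=cfg["output_names"],
        dynamic_axes=cfg["dynamic_axes"],
    )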