---
# Configuration for HunyuanVideo-Foley-XXL.
#
# Two top-level sections:
#   model_config     - model identity plus the `model_kwargs` mapping
#   diffusion_config - denoising / sampling settings
#
# NOTE(review): the nesting was reconstructed from a flattened copy in which
# every line carried a stray trailing `|` and all indentation was lost.
# Confirm against the config loader that `model_kwargs` belongs under
# `model_config` and that it spans every key up to `use_repa_with_audiossl`.

model_config:
  model_name: HunyuanVideo-Foley-XXL
  # Quoted because the value starts with a digit; it is a string, not a number.
  model_type: "1d"
  model_precision: bf16

  # Presumably forwarded as keyword arguments to the model class - TODO confirm.
  model_kwargs:
    # Network size.
    depth_triple_blocks: 18
    depth_single_blocks: 36
    # 1536 / 12 = 128 per head, assuming heads split hidden_size evenly.
    hidden_size: 1536
    num_heads: 12
    mlp_ratio: 4
    mlp_act_type: "gelu_tanh"

    # Attention options.
    qkv_bias: true
    qk_norm: true
    qk_norm_type: "rms"
    attn_mode: "torch"
    embedder_type: "default"
    interleaved_audio_visual_rope: true
    enable_learnable_empty_visual_feat: true
    sync_modulation: false
    add_sync_feat_to_audio: true
    cross_attention: true
    use_attention_mask: false
    condition_projection: "linear"

    # Feature / latent dimensions.
    sync_feat_dim: 768
    condition_dim: 768
    clip_dim: 768
    audio_vae_latent_dim: 128
    audio_frame_rate: 50
    patch_size: 1

    # Rotary position embedding; an explicit null is kept for rope_dim_list.
    rope_dim_list: null
    rope_theta: 10000

    # Sequence lengths per conditioning stream.
    text_length: 77
    clip_length: 64
    sync_length: 192

    use_mmaudio_singleblock: true
    # An explicit null is kept here; only the single-block SSL depth is set.
    depth_triple_ssl_encoder: null
    depth_single_ssl_encoder: 8
    use_repa_with_audiossl: true

diffusion_config:
  denoise_type: "flow"
  flow_path_type: "linear"
  flow_predict_type: "velocity"
  flow_reverse: true
  flow_solver: "euler"
  sample_flow_shift: 1.0
  sample_use_flux_shift: false
  # NOTE(review): the flux_* values presumably only matter when
  # sample_use_flux_shift is true - confirm against the sampler.
  flux_base_shift: 0.5
  flux_max_shift: 1.15