# File size: 1,313 Bytes
# 1062761
# Model settings: name, type/precision labels, and the nested `model_kwargs`
# mapping (presumably forwarded as keyword arguments when the model is built;
# confirm against the loader).
# NOTE(review): the nesting below was reconstructed from a flattened copy.
# Every key between `model_kwargs` and `diffusion_config` is assumed to belong
# to `model_kwargs`; verify against the consumer.
model_config:
  model_name: HunyuanVideo-Foley-XXL
  # Quoted so the number-looking value is always read as a string.
  model_type: "1d"
  model_precision: bf16
  model_kwargs:
    # Block counts and layer sizes.
    depth_triple_blocks: 18
    depth_single_blocks: 36
    hidden_size: 1536
    num_heads: 12
    mlp_ratio: 4
    mlp_act_type: "gelu_tanh"

    # Attention and embedding options.
    qkv_bias: True
    qk_norm: True
    qk_norm_type: "rms"
    attn_mode: "torch"
    embedder_type: "default"
    interleaved_audio_visual_rope: True
    enable_learnable_empty_visual_feat: True
    sync_modulation: False
    add_sync_feat_to_audio: True
    cross_attention: True
    use_attention_mask: False
    condition_projection: "linear"

    # Feature dimensions of the conditioning inputs.
    sync_feat_dim: 768  # syncformer 768 dim
    condition_dim: 768  # clap 768 text condition dim (clip-text)
    clip_dim: 768  # siglip2 visual dim

    # Audio latent settings.
    audio_vae_latent_dim: 128
    audio_frame_rate: 50
    patch_size: 1

    # Rotary position embedding settings; `rope_dim_list` is explicitly null.
    rope_dim_list: null
    rope_theta: 10000

    # Sequence lengths of the text / clip / sync inputs.
    text_length: 77
    clip_length: 64
    sync_length: 192

    # Single-block variant and SSL-encoder options;
    # `depth_triple_ssl_encoder` is explicitly null.
    use_mmaudio_singleblock: True
    depth_triple_ssl_encoder: null
    depth_single_ssl_encoder: 8
    use_repa_with_audiossl: True

# Diffusion / sampling settings.
# NOTE(review): nesting reconstructed from a flattened copy; all keys below
# are assumed to be children of `diffusion_config`.
diffusion_config:
  denoise_type: "flow"

  # Flow formulation and solver selection.
  flow_path_type: "linear"
  flow_predict_type: "velocity"
  flow_reverse: True
  flow_solver: "euler"

  # Sampling-time shift settings. The `flux_*` values presumably only matter
  # when `sample_use_flux_shift` is enabled; confirm against the sampler.
  sample_flow_shift: 1.0
  sample_use_flux_shift: False
  flux_base_shift: 0.5
  flux_max_shift: 1.15