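# Provenance header carried over from the repository listing; kept as comments
# so that the settings below parse as YAML.
# HunyuanVideo-Foley / config.yaml
# James Zhou
# [add] model setting
# 1062761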
# Model section: names the model variant, its type and numeric precision, and
# groups the network hyperparameters under `model_kwargs`.
# NOTE(review): nesting was restored from a flattened listing. The split between
# direct `model_config` keys and `model_kwargs` keys is assumed - confirm against
# the code that loads this file.
model_config:
  model_name: "HunyuanVideo-Foley-XXL"
  # Quoted so that no parser tries to read the value as a number.
  model_type: "1d"
  model_precision: "bf16"
  model_kwargs:
    # Block counts and transformer width.
    depth_triple_blocks: 18
    depth_single_blocks: 36
    hidden_size: 1536
    num_heads: 12
    mlp_ratio: 4
    mlp_act_type: "gelu_tanh"

    # Attention options.
    qkv_bias: true
    qk_norm: true
    qk_norm_type: "rms"
    attn_mode: "torch"
    embedder_type: "default"

    # Audio / visual / sync conditioning switches.
    interleaved_audio_visual_rope: true
    enable_learnable_empty_visual_feat: true
    sync_modulation: false
    add_sync_feat_to_audio: true
    cross_attention: true
    use_attention_mask: false
    condition_projection: "linear"

    # Feature dimensions of the conditioning inputs.
    sync_feat_dim: 768  # syncformer 768 dim
    condition_dim: 768  # clap 768 text condition dim (clip-text)
    clip_dim: 768  # siglip2 visual dim

    # Audio latent settings.
    # NOTE(review): units of audio_frame_rate are not stated here - confirm.
    audio_vae_latent_dim: 128
    audio_frame_rate: 50
    patch_size: 1

    # Rotary position embedding settings; rope_dim_list is explicitly unset.
    rope_dim_list: null
    rope_theta: 10000

    # Lengths of the text / clip / sync condition sequences.
    # NOTE(review): presumably counted in tokens or frames - confirm with the consumer.
    text_length: 77
    clip_length: 64
    sync_length: 192

    use_mmaudio_singleblock: true

    # Audio-SSL encoder depths and REPA switch; the triple depth is explicitly unset.
    depth_triple_ssl_encoder: null
    depth_single_ssl_encoder: 8
    use_repa_with_audiossl: true
# Diffusion section: denoising formulation and sampler settings.
# NOTE(review): nesting was restored from a flattened listing; every key below is
# assumed to belong to `diffusion_config` - confirm against the code that loads it.
diffusion_config:
  # Flow formulation: path type, prediction target and direction.
  denoise_type: "flow"
  flow_path_type: "linear"
  flow_predict_type: "velocity"
  flow_reverse: true
  flow_solver: "euler"

  # Sampling-time shift settings.
  sample_flow_shift: 1.0
  sample_use_flux_shift: false
  # NOTE(review): the flux_* values presumably apply only when
  # sample_use_flux_shift is true - confirm with the sampler code.
  flux_base_shift: 0.5
  flux_max_shift: 1.15