# Object config in Hydra style: `_target_` is the dotted path of the object to
# build, and the sibling keys are its arguments.
# NOTE(review): assumes this file is consumed by hydra.utils.instantiate — confirm.
_target_: modules.v2.vc_wrapper.VoiceConversionWrapper
# Audio sample rate; presumably in Hz — confirm.
sr: 22050
# Hop size; presumably in samples per frame — confirm.
hop_size: 256
# Mel-spectrogram function. `_partial_: true` is Hydra's flag for returning a
# partially applied callable instead of calling the target at build time.
mel_fn:
  _target_: modules.audio.mel_spectrogram
  _partial_: true
  n_fft: 1024
  win_size: 1024
  hop_size: 256  # same value as the root-level hop_size
  num_mels: 80
  sampling_rate: 22050  # same value as the root-level sr
  fmin: 0
  fmax: null  # no explicit upper bound; how null is handled is up to the target
  center: false
# CFM module wrapping a DiT estimator.
# NOTE(review): nesting was reconstructed from a flattened source. `estimator`
# is assumed to be a child of `cfm`, and every key after its `_target_` is
# assumed to be a DiT argument — confirm against the constructors.
cfm:
  _target_: modules.v2.cfm.CFM
  estimator:
    _target_: modules.v2.dit_wrapper.DiT
    time_as_token: true
    style_as_token: true
    uvit_skip_connection: false
    block_size: 8192
    depth: 13
    num_heads: 8
    hidden_dim: 512
    in_channels: 80  # same value as num_mels in mel_fn
    content_dim: 512  # same value as channels in cfm_length_regulator
    style_encoder_dim: 192  # same value as embedding_size in style_encoder
    class_dropout_prob: 0.1
    dropout_rate: 0.0
    attn_dropout_rate: 0.0
# Length regulator paired with the CFM branch.
# NOTE(review): placed at root level (not under `cfm`) based on its `cfm_`
# prefix; the flattened source does not show the original depth — confirm.
cfm_length_regulator:
  _target_: modules.v2.length_regulator.InterpolateRegulator
  channels: 512
  is_discrete: true
  codebook_size: 2048  # same value as the content_extractor_wide quantizer
  sampling_ratios: [1, 1, 1, 1]
  f0_condition: false
# Autoregressive model: a NaiveWrapper around a NaiveTransformer that is built
# from a NaiveModelArgs config object.
# NOTE(review): nesting was reconstructed from a flattened source; the chain
# ar > model > config follows the order of the `_target_` entries — confirm.
ar:
  _target_: modules.v2.ar.NaiveWrapper
  model:
    _target_: modules.v2.ar.NaiveTransformer
    config:
      _target_: modules.v2.ar.NaiveModelArgs
      dropout: 0.0
      rope_base: 10000.0
      dim: 768  # equals n_head * head_dim (12 * 64)
      head_dim: 64
      n_local_heads: 2
      intermediate_size: 2304  # 3 * dim
      n_head: 12
      n_layer: 12
      # 2048 + 1; presumably one extra id for a special token — confirm.
      vocab_size: 2049
# Length regulator paired with the autoregressive branch.
# NOTE(review): placed at root level (not under `ar`) based on its `ar_`
# prefix; the flattened source does not show the original depth — confirm.
ar_length_regulator:
  _target_: modules.v2.length_regulator.InterpolateRegulator
  channels: 768  # same value as dim in the ar model config
  is_discrete: true
  codebook_size: 32  # same value as the content_extractor_narrow quantizer
  sampling_ratios: []  # explicitly empty list
  f0_condition: false
# Style encoder built from the CAMPPlus class.
style_encoder:
  _target_: modules.campplus.DTDNN.CAMPPlus
  feat_dim: 80  # same value as num_mels in mel_fn
  embedding_size: 192  # same value as style_encoder_dim in the cfm estimator
# Narrow content extractor: an AstralQuantizer with a 32-entry codebook.
# NOTE(review): nesting was reconstructed from a flattened source. `quantizer`
# is placed as a sibling of `encoder`, mirroring content_extractor_wide, where
# `quantizer` follows an alias and therefore cannot be its child — confirm.
content_extractor_narrow:
  _target_: modules.astral_quantization.default_model.AstralQuantizer
  tokenizer_name: "openai/whisper-small"
  ssl_model_name: "facebook/hubert-large-ll60k"
  ssl_output_layer: 18
  skip_ssl: true
  # Anchored so that content_extractor_wide can reuse this mapping via
  # `*bottleneck_encoder`.
  encoder: &bottleneck_encoder
    _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
    dim: 512
    num_blocks: 12
    intermediate_dim: 1536  # 3 * dim
    dilation: 1
    input_dim: 1024
  quantizer:
    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
    codebook_size: 32  # same value as codebook_size in ar_length_regulator
    dim: 512
    entropy_loss_weight: 0.1
    diversity_gamma: 1.0
    spherical: true
    enable_entropy_loss: true
    soft_entropy_loss: true
# Wide content extractor: an AstralQuantizer with a 2048-entry codebook.
# NOTE(review): nesting was reconstructed from a flattened source — confirm
# against the AstralQuantizer constructor.
content_extractor_wide:
  _target_: modules.astral_quantization.default_model.AstralQuantizer
  tokenizer_name: "openai/whisper-small"
  ssl_model_name: "facebook/hubert-large-ll60k"
  ssl_output_layer: 18
  # Alias of the encoder mapping anchored under content_extractor_narrow.
  encoder: *bottleneck_encoder
  quantizer:
    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
    codebook_size: 2048  # same value as codebook_size in cfm_length_regulator
    dim: 512
    entropy_loss_weight: 0.1
    diversity_gamma: 1.0
    spherical: true
    enable_entropy_loss: true
    soft_entropy_loss: true
# Vocoder built by calling BigVGAN.from_pretrained with the arguments below.
# NOTE(review): the model id reads as 22 kHz / 80 bands / 256x, which lines up
# with sr, num_mels and hop_size elsewhere in this file — confirm that these
# are required to match.
vocoder:
  _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
  pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
  use_cuda_kernel: false