diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2649e7ac04e3148ca09dbae60b21822b4018154e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,22 @@
+*.exe
+*.pt
+*.onnx
+*.pyc
+*.pth
+*.index
+*.mp3
+*.flac
+*.ogg
+*.m4a
+*.bin
+*.wav
+*.txt
+*.zip
+*.png
+*.safetensors
+
+logs
+rvc/models
+env
+venv
+.venv
\ No newline at end of file
diff --git a/rvc/configs/32000.json b/rvc/configs/32000.json
new file mode 100644
index 0000000000000000000000000000000000000000..803a948e9f4d033dc7d7ccf08deee7706ebceb91
--- /dev/null
+++ b/rvc/configs/32000.json
@@ -0,0 +1,42 @@
+{
+ "train": {
+ "log_interval": 200,
+ "seed": 1234,
+ "learning_rate": 1e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "lr_decay": 0.999875,
+ "segment_size": 12800,
+ "c_mel": 45,
+ "c_kl": 1.0
+ },
+ "data": {
+ "max_wav_value": 32768.0,
+ "sample_rate": 32000,
+ "filter_length": 1024,
+ "hop_length": 320,
+ "win_length": 1024,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "text_enc_hidden_dim": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [20,16,4,4],
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "spk_embed_dim": 109
+ }
+}
diff --git a/rvc/configs/40000.json b/rvc/configs/40000.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd9bcc85b96a5e71478354c525189a4832c2b78a
--- /dev/null
+++ b/rvc/configs/40000.json
@@ -0,0 +1,42 @@
+{
+ "train": {
+ "log_interval": 200,
+ "seed": 1234,
+ "learning_rate": 1e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "lr_decay": 0.999875,
+ "segment_size": 12800,
+ "c_mel": 45,
+ "c_kl": 1.0
+ },
+ "data": {
+ "max_wav_value": 32768.0,
+ "sample_rate": 40000,
+ "filter_length": 2048,
+ "hop_length": 400,
+ "win_length": 2048,
+ "n_mel_channels": 125,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "text_enc_hidden_dim": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,10,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "spk_embed_dim": 109
+ }
+}
diff --git a/rvc/configs/44100.json b/rvc/configs/44100.json
new file mode 100644
index 0000000000000000000000000000000000000000..76ebbaef41d2f9ec3931474ec28e352e1d9c66b6
--- /dev/null
+++ b/rvc/configs/44100.json
@@ -0,0 +1,42 @@
+{
+ "train": {
+ "log_interval": 200,
+ "seed": 1234,
+ "learning_rate": 0.0001,
+ "betas": [0.8, 0.99],
+ "eps": 1e-09,
+ "lr_decay": 0.999875,
+ "segment_size": 15876,
+ "c_mel": 45,
+ "c_kl": 1.0
+ },
+ "data": {
+ "max_wav_value": 32768.0,
+ "sample_rate": 44100,
+ "filter_length": 2048,
+ "hop_length": 441,
+ "win_length": 2048,
+ "n_mel_channels": 160,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "text_enc_hidden_dim": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [7,7,3,3],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [14,14,6,6],
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "spk_embed_dim": 109
+ }
+}
\ No newline at end of file
diff --git a/rvc/configs/48000.json b/rvc/configs/48000.json
new file mode 100644
index 0000000000000000000000000000000000000000..968ad3b3b15963beb1cf77f16f4b3dfd8ca9ca08
--- /dev/null
+++ b/rvc/configs/48000.json
@@ -0,0 +1,42 @@
+{
+ "train": {
+ "log_interval": 200,
+ "seed": 1234,
+ "learning_rate": 1e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "lr_decay": 0.999875,
+ "segment_size": 17280,
+ "c_mel": 45,
+ "c_kl": 1.0
+ },
+ "data": {
+ "max_wav_value": 32768.0,
+ "sample_rate": 48000,
+ "filter_length": 2048,
+ "hop_length": 480,
+ "win_length": 2048,
+ "n_mel_channels": 128,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "text_enc_hidden_dim": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [12,10,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [24,20,4,4],
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "spk_embed_dim": 109
+ }
+}
diff --git a/rvc/configs/config.py b/rvc/configs/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a5653e946849b687c395781c3dca5cb00beb09a
--- /dev/null
+++ b/rvc/configs/config.py
@@ -0,0 +1,99 @@
+import torch
+import json
+import os
+
+version_config_paths = [
+    "48000.json",
+    "40000.json",
+    "44100.json",
+    "32000.json",
+]
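+
+# Comment added for clarity: each JSON above holds the hyper-parameters for one
+# target sample rate. In every file the data "hop_length" equals the product of
+# the model "upsample_rates" (480 = 12*10*2*2 for 48000.json, 441 = 7*7*3*3 for
+# 44100.json), so the vocoder upsamples one latent frame back into exactly one
+# hop of audio.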
+
+
+def singleton(cls):
+ instances = {}
+
+ def get_instance(*args, **kwargs):
+ if cls not in instances:
+ instances[cls] = cls(*args, **kwargs)
+ return instances[cls]
+
+ return get_instance
+
+
+@singleton
+class Config:
+ def __init__(self):
+ self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ self.gpu_name = (
+ torch.cuda.get_device_name(int(self.device.split(":")[-1]))
+ if self.device.startswith("cuda")
+ else None
+ )
+ self.json_config = self.load_config_json()
+ self.gpu_mem = None
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+ def load_config_json(self):
+ configs = {}
+ for config_file in version_config_paths:
+ config_path = os.path.join("rvc", "configs", config_file)
+ with open(config_path, "r") as f:
+ configs[config_file] = json.load(f)
+ return configs
+
+ def device_config(self):
+ if self.device.startswith("cuda"):
+ self.set_cuda_config()
+ else:
+ self.device = "cpu"
+
+        # Default padding/query/window sizes (GPUs with more than 4 GB of memory)
+        x_pad, x_query, x_center, x_max = (1, 6, 38, 41)
+        if self.gpu_mem is not None and self.gpu_mem <= 4:
+            # Reduced sizes for GPUs with 4 GB of memory or less
+            x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
+
+ return x_pad, x_query, x_center, x_max
+
+ def set_cuda_config(self):
+ i_device = int(self.device.split(":")[-1])
+ self.gpu_name = torch.cuda.get_device_name(i_device)
+ self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
+ 1024**3
+ )
+
+
+def max_vram_gpu(gpu):
+ if torch.cuda.is_available():
+ gpu_properties = torch.cuda.get_device_properties(gpu)
+ total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024)
+ return total_memory_gb
+    else:
+        # No CUDA device detected; fall back to a conservative default value.
+        return "8"
+
+
+def get_gpu_info():
+ ngpu = torch.cuda.device_count()
+ gpu_infos = []
+ if torch.cuda.is_available() or ngpu != 0:
+ for i in range(ngpu):
+ gpu_name = torch.cuda.get_device_name(i)
+ mem = int(
+ torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024
+ + 0.4
+ )
+ gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)")
+ if len(gpu_infos) > 0:
+ gpu_info = "\n".join(gpu_infos)
+ else:
+ gpu_info = "Unfortunately, there is no compatible GPU available to support your training."
+ return gpu_info
+
+
+def get_number_of_gpus():
+ if torch.cuda.is_available():
+ num_gpus = torch.cuda.device_count()
+ return "-".join(map(str, range(num_gpus)))
+ else:
+ return "-"
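+
+
+# Hedged usage sketch (added for illustration, not part of the original module):
+# run from the repository root so the relative "rvc/configs/*.json" paths resolve.
+if __name__ == "__main__":
+    config = Config()
+    print(config.device)                 # "cuda:0" when a GPU is available, otherwise "cpu"
+    print(config.x_pad, config.x_query)  # inference padding/query sizes from device_config()
+    print(get_gpu_info())                # one line per detected GPU, or a warning message
+    print(get_number_of_gpus())          # e.g. "0-1" for two GPUs, "-" without CUDA
+    assert Config() is config            # @singleton: every call returns the same instance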
diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b573acc24afb8e975e929e0aa0e5ce5ae762c1e
--- /dev/null
+++ b/rvc/infer/infer.py
@@ -0,0 +1,495 @@
+import os
+import sys
+import soxr
+import time
+import torch
+import librosa
+import logging
+import traceback
+import numpy as np
+import soundfile as sf
+import noisereduce as nr
+from pedalboard import (
+ Pedalboard,
+ Chorus,
+ Distortion,
+ Reverb,
+ PitchShift,
+ Limiter,
+ Gain,
+ Bitcrush,
+ Clipping,
+ Compressor,
+ Delay,
+)
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from rvc.infer.pipeline import Pipeline as VC
+from rvc.lib.utils import load_audio_infer, load_embedding
+from rvc.lib.tools.split_audio import process_audio, merge_audio
+from rvc.lib.algorithm.synthesizers import Synthesizer
+from rvc.configs.config import Config
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("httpcore").setLevel(logging.WARNING)
+logging.getLogger("faiss").setLevel(logging.WARNING)
+logging.getLogger("faiss.loader").setLevel(logging.WARNING)
+
+
+class VoiceConverter:
+ """
+ A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
+ """
+
+ def __init__(self):
+ """
+ Initializes the VoiceConverter with default configuration, and sets up models and parameters.
+ """
+ self.config = Config() # Load configuration
+        self.hubert_model = None  # HuBERT model used for embedding extraction
+ self.last_embedder_model = None # Last used embedder model
+ self.tgt_sr = None # Target sampling rate for the output audio
+ self.net_g = None # Generator network for voice conversion
+ self.vc = None # Voice conversion pipeline instance
+ self.cpt = None # Checkpoint for loading model weights
+ self.version = None # Model version
+ self.n_spk = None # Number of speakers in the model
+ self.use_f0 = None # Whether the model uses F0
+ self.loaded_model = None
+
+ def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
+ """
+ Loads the HuBERT model for speaker embedding extraction.
+
+ Args:
+ embedder_model (str): Path to the pre-trained HuBERT model.
+ embedder_model_custom (str): Path to the custom HuBERT model.
+ """
+ self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
+ self.hubert_model = self.hubert_model.to(self.config.device).float()
+ self.hubert_model.eval()
+
+ @staticmethod
+ def remove_audio_noise(data, sr, reduction_strength=0.7):
+ """
+ Removes noise from an audio file using the NoiseReduce library.
+
+ Args:
+ data (numpy.ndarray): The audio data as a NumPy array.
+ sr (int): The sample rate of the audio data.
+ reduction_strength (float): Strength of the noise reduction. Default is 0.7.
+ """
+ try:
+ reduced_noise = nr.reduce_noise(
+ y=data, sr=sr, prop_decrease=reduction_strength
+ )
+ return reduced_noise
+ except Exception as error:
+ print(f"An error occurred removing audio noise: {error}")
+ return None
+
+ @staticmethod
+ def convert_audio_format(input_path, output_path, output_format):
+ """
+ Converts an audio file to a specified output format.
+
+ Args:
+ input_path (str): Path to the input audio file.
+ output_path (str): Path to the output audio file.
+ output_format (str): Desired audio format (e.g., "WAV", "MP3").
+ """
+ try:
+ if output_format != "WAV":
+ print(f"Saving audio as {output_format}...")
+ audio, sample_rate = librosa.load(input_path, sr=None)
+ common_sample_rates = [
+ 8000,
+ 11025,
+ 12000,
+ 16000,
+ 22050,
+ 24000,
+ 32000,
+ 44100,
+ 48000,
+ ]
+ target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
+ audio = librosa.resample(
+ audio, orig_sr=sample_rate, target_sr=target_sr, res_type="soxr_vhq"
+ )
+ sf.write(output_path, audio, target_sr, format=output_format.lower())
+ return output_path
+ except Exception as error:
+ print(f"An error occurred converting the audio format: {error}")
+
+ @staticmethod
+ def post_process_audio(
+ audio_input,
+ sample_rate,
+ **kwargs,
+ ):
+ board = Pedalboard()
+ if kwargs.get("reverb", False):
+ reverb = Reverb(
+ room_size=kwargs.get("reverb_room_size", 0.5),
+ damping=kwargs.get("reverb_damping", 0.5),
+ wet_level=kwargs.get("reverb_wet_level", 0.33),
+ dry_level=kwargs.get("reverb_dry_level", 0.4),
+ width=kwargs.get("reverb_width", 1.0),
+ freeze_mode=kwargs.get("reverb_freeze_mode", 0),
+ )
+ board.append(reverb)
+ if kwargs.get("pitch_shift", False):
+ pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0))
+ board.append(pitch_shift)
+ if kwargs.get("limiter", False):
+ limiter = Limiter(
+ threshold_db=kwargs.get("limiter_threshold", -6),
+ release_ms=kwargs.get("limiter_release", 0.05),
+ )
+ board.append(limiter)
+ if kwargs.get("gain", False):
+ gain = Gain(gain_db=kwargs.get("gain_db", 0))
+ board.append(gain)
+ if kwargs.get("distortion", False):
+ distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25))
+ board.append(distortion)
+ if kwargs.get("chorus", False):
+ chorus = Chorus(
+ rate_hz=kwargs.get("chorus_rate", 1.0),
+ depth=kwargs.get("chorus_depth", 0.25),
+ centre_delay_ms=kwargs.get("chorus_delay", 7),
+ feedback=kwargs.get("chorus_feedback", 0.0),
+ mix=kwargs.get("chorus_mix", 0.5),
+ )
+ board.append(chorus)
+ if kwargs.get("bitcrush", False):
+ bitcrush = Bitcrush(bit_depth=kwargs.get("bitcrush_bit_depth", 8))
+ board.append(bitcrush)
+ if kwargs.get("clipping", False):
+ clipping = Clipping(threshold_db=kwargs.get("clipping_threshold", 0))
+ board.append(clipping)
+ if kwargs.get("compressor", False):
+ compressor = Compressor(
+ threshold_db=kwargs.get("compressor_threshold", 0),
+ ratio=kwargs.get("compressor_ratio", 1),
+ attack_ms=kwargs.get("compressor_attack", 1.0),
+ release_ms=kwargs.get("compressor_release", 100),
+ )
+ board.append(compressor)
+ if kwargs.get("delay", False):
+ delay = Delay(
+ delay_seconds=kwargs.get("delay_seconds", 0.5),
+ feedback=kwargs.get("delay_feedback", 0.0),
+ mix=kwargs.get("delay_mix", 0.5),
+ )
+ board.append(delay)
+ return board(audio_input, sample_rate)
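+
+    # Hedged usage note (added for illustration): every effect above is opt-in via
+    # keyword arguments, e.g.
+    #   processed = VoiceConverter.post_process_audio(
+    #       audio, 48000, reverb=True, reverb_room_size=0.7, limiter=True
+    #   )
+    # Only effects whose boolean flag is truthy are appended to the Pedalboard chain.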
+
+ def convert_audio(
+ self,
+ audio_input_path: str,
+ audio_output_path: str,
+ model_path: str,
+ index_path: str,
+ pitch: int = 0,
+ f0_file: str = None,
+ f0_method: str = "rmvpe",
+ index_rate: float = 0.75,
+ volume_envelope: float = 1,
+ protect: float = 0.5,
+ hop_length: int = 128,
+ split_audio: bool = False,
+ f0_autotune: bool = False,
+ f0_autotune_strength: float = 1,
+ filter_radius: int = 3,
+ embedder_model: str = "contentvec",
+ embedder_model_custom: str = None,
+ clean_audio: bool = False,
+ clean_strength: float = 0.5,
+ export_format: str = "WAV",
+ post_process: bool = False,
+ resample_sr: int = 0,
+ sid: int = 0,
+ **kwargs,
+ ):
+ """
+ Performs voice conversion on the input audio.
+
+ Args:
+            pitch (int): Pitch shift in semitones applied to the extracted F0 contour.
+            filter_radius (int): Radius for median filtering the F0 contour.
+            index_rate (float): Blending rate for FAISS index retrieval.
+            volume_envelope (float): Blending rate between the input and output RMS envelopes.
+ protect (float): Protection rate for certain audio segments.
+ hop_length (int): Hop length for audio processing.
+ f0_method (str): Method for F0 extraction.
+ audio_input_path (str): Path to the input audio file.
+ audio_output_path (str): Path to the output audio file.
+ model_path (str): Path to the voice conversion model.
+ index_path (str): Path to the index file.
+ split_audio (bool): Whether to split the audio for processing.
+ f0_autotune (bool): Whether to use F0 autotune.
+ clean_audio (bool): Whether to clean the audio.
+ clean_strength (float): Strength of the audio cleaning.
+ export_format (str): Format for exporting the audio.
+ f0_file (str): Path to the F0 file.
+ embedder_model (str): Path to the embedder model.
+ embedder_model_custom (str): Path to the custom embedder model.
+ resample_sr (int, optional): Resample sampling rate. Default is 0.
+ sid (int, optional): Speaker ID. Default is 0.
+ **kwargs: Additional keyword arguments.
+ """
+ if not model_path:
+ print("No model path provided. Aborting conversion.")
+ return
+
+ self.get_vc(model_path, sid)
+
+ try:
+ start_time = time.time()
+ print(f"Converting audio '{audio_input_path}'...")
+
+ audio = load_audio_infer(
+ audio_input_path,
+ 16000,
+ **kwargs,
+ )
+ audio_max = np.abs(audio).max() / 0.95
+
+ if audio_max > 1:
+ audio /= audio_max
+
+ if not self.hubert_model or embedder_model != self.last_embedder_model:
+ self.load_hubert(embedder_model, embedder_model_custom)
+ self.last_embedder_model = embedder_model
+
+ file_index = (
+ index_path.strip()
+ .strip('"')
+ .strip("\n")
+ .strip('"')
+ .strip()
+ .replace("trained", "added")
+ )
+
+            # Only override the target rate when a valid resample rate (>= 16 kHz) differs from it
+            if resample_sr >= 16000 and self.tgt_sr != resample_sr:
+                self.tgt_sr = resample_sr
+
+ if split_audio:
+ chunks, intervals = process_audio(audio, 16000)
+ print(f"Audio split into {len(chunks)} chunks for processing.")
+            else:
+                chunks = [audio]
+
+ converted_chunks = []
+ for c in chunks:
+ audio_opt = self.vc.pipeline(
+ model=self.hubert_model,
+ net_g=self.net_g,
+ sid=sid,
+ audio=c,
+ pitch=pitch,
+ f0_method=f0_method,
+ file_index=file_index,
+ index_rate=index_rate,
+ pitch_guidance=self.use_f0,
+ filter_radius=filter_radius,
+ volume_envelope=volume_envelope,
+ version=self.version,
+ protect=protect,
+ hop_length=hop_length,
+ f0_autotune=f0_autotune,
+ f0_autotune_strength=f0_autotune_strength,
+ f0_file=f0_file,
+ )
+ converted_chunks.append(audio_opt)
+ if split_audio:
+ print(f"Converted audio chunk {len(converted_chunks)}")
+
+ if split_audio:
+ audio_opt = merge_audio(
+ chunks, converted_chunks, intervals, 16000, self.tgt_sr
+ )
+ else:
+ audio_opt = converted_chunks[0]
+
+ if clean_audio:
+ cleaned_audio = self.remove_audio_noise(
+ audio_opt, self.tgt_sr, clean_strength
+ )
+ if cleaned_audio is not None:
+ audio_opt = cleaned_audio
+
+ if post_process:
+ audio_opt = self.post_process_audio(
+ audio_input=audio_opt,
+ sample_rate=self.tgt_sr,
+ **kwargs,
+ )
+
+ sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
+ output_path_format = audio_output_path.replace(
+ ".wav", f".{export_format.lower()}"
+ )
+ audio_output_path = self.convert_audio_format(
+ audio_output_path, output_path_format, export_format
+ )
+
+ elapsed_time = time.time() - start_time
+ print(
+ f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
+ )
+ except Exception as error:
+ print(f"An error occurred during audio conversion: {error}")
+ print(traceback.format_exc())
+
+ def convert_audio_batch(
+ self,
+ audio_input_paths: str,
+ audio_output_path: str,
+ **kwargs,
+ ):
+ """
+ Performs voice conversion on a batch of input audio files.
+
+ Args:
+            audio_input_paths (str): Path to the directory containing the input audio files.
+            audio_output_path (str): Path to the directory where the converted files are written.
+            **kwargs: Additional keyword arguments forwarded to convert_audio
+                (e.g. resample_sr, sid).
+ """
+ pid = os.getpid()
+ try:
+ with open(
+ os.path.join(now_dir, "assets", "infer_pid.txt"), "w"
+ ) as pid_file:
+ pid_file.write(str(pid))
+ start_time = time.time()
+ print(f"Converting audio batch '{audio_input_paths}'...")
+ audio_files = [
+ f
+ for f in os.listdir(audio_input_paths)
+ if f.endswith(
+ (
+ "wav",
+ "mp3",
+ "flac",
+ "ogg",
+ "opus",
+ "m4a",
+ "mp4",
+ "aac",
+ "alac",
+ "wma",
+ "aiff",
+ "webm",
+ "ac3",
+ )
+ )
+ ]
+ print(f"Detected {len(audio_files)} audio files for inference.")
+ for a in audio_files:
+ new_input = os.path.join(audio_input_paths, a)
+ new_output = os.path.splitext(a)[0] + "_output.wav"
+ new_output = os.path.join(audio_output_path, new_output)
+ if os.path.exists(new_output):
+ continue
+ self.convert_audio(
+ audio_input_path=new_input,
+ audio_output_path=new_output,
+ **kwargs,
+ )
+            print(f"Conversion completed at '{audio_output_path}'.")
+ elapsed_time = time.time() - start_time
+ print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
+ except Exception as error:
+ print(f"An error occurred during audio batch conversion: {error}")
+ print(traceback.format_exc())
+ finally:
+ os.remove(os.path.join(now_dir, "assets", "infer_pid.txt"))
+
+ def get_vc(self, weight_root, sid):
+ """
+ Loads the voice conversion model and sets up the pipeline.
+
+ Args:
+ weight_root (str): Path to the model weights.
+ sid (int): Speaker ID.
+ """
+ if sid == "" or sid == []:
+ self.cleanup_model()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
+ if not self.loaded_model or self.loaded_model != weight_root:
+ self.load_model(weight_root)
+ if self.cpt is not None:
+ self.setup_network()
+ self.setup_vc_instance()
+ self.loaded_model = weight_root
+
+ def cleanup_model(self):
+ """
+ Cleans up the model and releases resources.
+ """
+ if self.hubert_model is not None:
+ del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
+ self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
+ del self.net_g, self.cpt
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ self.cpt = None
+
+ def load_model(self, weight_root):
+ """
+ Loads the model weights from the specified path.
+
+ Args:
+ weight_root (str): Path to the model weights.
+ """
+ self.cpt = (
+ torch.load(weight_root, map_location="cpu", weights_only=True)
+ if os.path.isfile(weight_root)
+ else None
+ )
+
+ def setup_network(self):
+ """
+ Sets up the network configuration based on the loaded checkpoint.
+ """
+ if self.cpt is not None:
+ self.tgt_sr = self.cpt["config"][-1]
+ self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
+ self.use_f0 = self.cpt.get("f0", 1)
+
+ self.version = self.cpt.get("version", "v1")
+ self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
+ self.vocoder = self.cpt.get("vocoder", "HiFi-GAN")
+ self.net_g = Synthesizer(
+ *self.cpt["config"],
+ use_f0=self.use_f0,
+ text_enc_hidden_dim=self.text_enc_hidden_dim,
+ vocoder=self.vocoder,
+ )
+ del self.net_g.enc_q
+ self.net_g.load_state_dict(self.cpt["weight"], strict=False)
+ self.net_g = self.net_g.to(self.config.device).float()
+ self.net_g.eval()
+
+ def setup_vc_instance(self):
+ """
+ Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
+ """
+ if self.cpt is not None:
+ self.vc = VC(self.tgt_sr, self.config)
+ self.n_spk = self.cpt["config"][-3]
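+
+
+# Hedged usage sketch (added for illustration, not part of the original module).
+# The model, index and audio paths below are placeholders, not files shipped with
+# this change; rmvpe.pt and the chosen embedder must be available under rvc/models.
+if __name__ == "__main__":
+    converter = VoiceConverter()
+    converter.convert_audio(
+        audio_input_path="input.wav",                # placeholder input recording
+        audio_output_path="output.wav",              # written as WAV, then re-encoded to export_format
+        model_path="logs/my_model/my_model.pth",     # placeholder RVC checkpoint
+        index_path="logs/my_model/added.index",      # optional FAISS index ("" to disable retrieval)
+        pitch=0,                                     # transposition in semitones
+        f0_method="rmvpe",
+        export_format="WAV",
+    )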
diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ad393e86f1d9b8f3c1cd35d5a0fabd57053a9f9
--- /dev/null
+++ b/rvc/infer/pipeline.py
@@ -0,0 +1,690 @@
+import os
+import gc
+import re
+import sys
+import torch
+import torch.nn.functional as F
+import torchcrepe
+import faiss
+import librosa
+import numpy as np
+from scipy import signal
+from torch import Tensor
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from rvc.lib.predictors.RMVPE import RMVPE0Predictor
+from rvc.lib.predictors.FCPE import FCPEF0Predictor
+
+import logging
+
+logging.getLogger("faiss").setLevel(logging.WARNING)
+
+FILTER_ORDER = 5
+CUTOFF_FREQUENCY = 48 # Hz
+SAMPLE_RATE = 16000 # Hz
+bh, ah = signal.butter(
+ N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
+)
+
+input_audio_path2wav = {}
+
+
+class AudioProcessor:
+ """
+ A class for processing audio signals, specifically for adjusting RMS levels.
+ """
+
+    @staticmethod
+    def change_rms(
+ source_audio: np.ndarray,
+ source_rate: int,
+ target_audio: np.ndarray,
+ target_rate: int,
+ rate: float,
+ ):
+ """
+ Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.
+
+ Args:
+ source_audio: The source audio signal as a NumPy array.
+ source_rate: The sampling rate of the source audio.
+ target_audio: The target audio signal to adjust.
+ target_rate: The sampling rate of the target audio.
+ rate: The blending rate between the source and target RMS levels.
+ """
+ # Calculate RMS of both audio data
+ rms1 = librosa.feature.rms(
+ y=source_audio,
+ frame_length=source_rate // 2 * 2,
+ hop_length=source_rate // 2,
+ )
+ rms2 = librosa.feature.rms(
+ y=target_audio,
+ frame_length=target_rate // 2 * 2,
+ hop_length=target_rate // 2,
+ )
+
+ # Interpolate RMS to match target audio length
+ rms1 = F.interpolate(
+ torch.from_numpy(rms1).float().unsqueeze(0),
+ size=target_audio.shape[0],
+ mode="linear",
+ ).squeeze()
+ rms2 = F.interpolate(
+ torch.from_numpy(rms2).float().unsqueeze(0),
+ size=target_audio.shape[0],
+ mode="linear",
+ ).squeeze()
+ rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)
+
+ # Adjust target audio RMS based on the source audio RMS
+ adjusted_audio = (
+ target_audio
+ * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
+ )
+ return adjusted_audio
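+
+    # Worked example (comment added for clarity): the blend factor is
+    # rms1^(1 - rate) * rms2^(rate - 1). With rate = 1 the factor is 1 and the
+    # converted audio keeps its own loudness; with rate = 0 it becomes rms1 / rms2,
+    # i.e. the output RMS envelope is fully matched to the source recording.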
+
+
+class Autotune:
+ """
+ A class for applying autotune to a given fundamental frequency (F0) contour.
+ """
+
+ def __init__(self, ref_freqs):
+ """
+ Initializes the Autotune class with a set of reference frequencies.
+
+ Args:
+ ref_freqs: A list of reference frequencies representing musical notes.
+ """
+ self.ref_freqs = ref_freqs
+ self.note_dict = self.ref_freqs # No interpolation needed
+
+ def autotune_f0(self, f0, f0_autotune_strength):
+ """
+ Autotunes a given F0 contour by snapping each frequency to the closest reference frequency.
+
+ Args:
+            f0: The input F0 contour as a NumPy array.
+            f0_autotune_strength: Blend factor between the original and snapped frequencies (0 = no change, 1 = full snap).
+ """
+ autotuned_f0 = np.zeros_like(f0)
+ for i, freq in enumerate(f0):
+ closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
+ autotuned_f0[i] = freq + (closest_note - freq) * f0_autotune_strength
+ return autotuned_f0
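+
+    # Worked example (comment added for clarity): for a frame at 445.0 Hz the
+    # closest reference note is A4 = 440.0 Hz, so f0_autotune_strength = 1.0
+    # snaps the frame to 440.0 Hz, while 0.5 only pulls it halfway, to 442.5 Hz.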
+
+
+class Pipeline:
+ """
+ The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
+ voice conversion using a model, and post-processing.
+ """
+
+ def __init__(self, tgt_sr, config):
+ """
+ Initializes the Pipeline class with target sampling rate and configuration parameters.
+
+ Args:
+ tgt_sr: The target sampling rate for the output audio.
+ config: A configuration object containing various parameters for the pipeline.
+ """
+ self.x_pad = config.x_pad
+ self.x_query = config.x_query
+ self.x_center = config.x_center
+ self.x_max = config.x_max
+ self.sample_rate = 16000
+ self.window = 160
+ self.t_pad = self.sample_rate * self.x_pad
+ self.t_pad_tgt = tgt_sr * self.x_pad
+ self.t_pad2 = self.t_pad * 2
+ self.t_query = self.sample_rate * self.x_query
+ self.t_center = self.sample_rate * self.x_center
+ self.t_max = self.sample_rate * self.x_max
+ self.time_step = self.window / self.sample_rate * 1000
+ self.f0_min = 50
+ self.f0_max = 1100
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+ self.device = config.device
+ self.ref_freqs = [
+ 49.00, # G1
+ 51.91, # G#1 / Ab1
+ 55.00, # A1
+ 58.27, # A#1 / Bb1
+ 61.74, # B1
+ 65.41, # C2
+ 69.30, # C#2 / Db2
+ 73.42, # D2
+ 77.78, # D#2 / Eb2
+ 82.41, # E2
+ 87.31, # F2
+ 92.50, # F#2 / Gb2
+ 98.00, # G2
+ 103.83, # G#2 / Ab2
+ 110.00, # A2
+ 116.54, # A#2 / Bb2
+ 123.47, # B2
+ 130.81, # C3
+ 138.59, # C#3 / Db3
+ 146.83, # D3
+ 155.56, # D#3 / Eb3
+ 164.81, # E3
+ 174.61, # F3
+ 185.00, # F#3 / Gb3
+ 196.00, # G3
+ 207.65, # G#3 / Ab3
+ 220.00, # A3
+ 233.08, # A#3 / Bb3
+ 246.94, # B3
+ 261.63, # C4
+ 277.18, # C#4 / Db4
+ 293.66, # D4
+ 311.13, # D#4 / Eb4
+ 329.63, # E4
+ 349.23, # F4
+ 369.99, # F#4 / Gb4
+ 392.00, # G4
+ 415.30, # G#4 / Ab4
+ 440.00, # A4
+ 466.16, # A#4 / Bb4
+ 493.88, # B4
+ 523.25, # C5
+ 554.37, # C#5 / Db5
+ 587.33, # D5
+ 622.25, # D#5 / Eb5
+ 659.25, # E5
+ 698.46, # F5
+ 739.99, # F#5 / Gb5
+ 783.99, # G5
+ 830.61, # G#5 / Ab5
+ 880.00, # A5
+ 932.33, # A#5 / Bb5
+ 987.77, # B5
+ 1046.50, # C6
+ ]
+ self.autotune = Autotune(self.ref_freqs)
+ self.note_dict = self.autotune.note_dict
+ self.model_rmvpe = RMVPE0Predictor(
+ os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
+ device=self.device,
+ )
+
+ def get_f0_crepe(
+ self,
+ x,
+ f0_min,
+ f0_max,
+ p_len,
+ hop_length,
+ model="full",
+ ):
+ """
+ Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model.
+
+ Args:
+ x: The input audio signal as a NumPy array.
+ f0_min: Minimum F0 value to consider.
+ f0_max: Maximum F0 value to consider.
+ p_len: Desired length of the F0 output.
+ hop_length: Hop length for the Crepe model.
+ model: Crepe model size to use ("full" or "tiny").
+ """
+ x = x.astype(np.float32)
+ x /= np.quantile(np.abs(x), 0.999)
+ audio = torch.from_numpy(x).to(self.device, copy=True)
+ audio = torch.unsqueeze(audio, dim=0)
+ if audio.ndim == 2 and audio.shape[0] > 1:
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
+ audio = audio.detach()
+ pitch: Tensor = torchcrepe.predict(
+ audio,
+ self.sample_rate,
+ hop_length,
+ f0_min,
+ f0_max,
+ model,
+ batch_size=hop_length * 2,
+ device=self.device,
+ pad=True,
+ )
+ p_len = p_len or x.shape[0] // hop_length
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
+ source[source < 0.001] = np.nan
+ target = np.interp(
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
+ np.arange(0, len(source)),
+ source,
+ )
+ f0 = np.nan_to_num(target)
+ return f0
+
+ def get_f0_hybrid(
+ self,
+ methods_str,
+ x,
+ f0_min,
+ f0_max,
+ p_len,
+ hop_length,
+ ):
+ """
+ Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods.
+
+ Args:
+ methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]").
+ x: The input audio signal as a NumPy array.
+ f0_min: Minimum F0 value to consider.
+ f0_max: Maximum F0 value to consider.
+ p_len: Desired length of the F0 output.
+ hop_length: Hop length for F0 estimation methods.
+ """
+        methods_str = re.search(r"hybrid\[(.+)\]", methods_str)
+ if methods_str:
+ methods = [method.strip() for method in methods_str.group(1).split("+")]
+ f0_computation_stack = []
+ print(f"Calculating f0 pitch estimations for methods: {', '.join(methods)}")
+ x = x.astype(np.float32)
+ x /= np.quantile(np.abs(x), 0.999)
+ for method in methods:
+ f0 = None
+ if method == "crepe":
+                f0 = self.get_f0_crepe(
+                    x, f0_min, f0_max, p_len, int(hop_length)
+                )
+ elif method == "rmvpe":
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+ f0 = f0[1:]
+ elif method == "fcpe":
+ self.model_fcpe = FCPEF0Predictor(
+ os.path.join("rvc", "models", "predictors", "fcpe.pt"),
+ f0_min=int(f0_min),
+ f0_max=int(f0_max),
+ dtype=torch.float32,
+ device=self.device,
+ sample_rate=self.sample_rate,
+ threshold=0.03,
+ )
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
+ del self.model_fcpe
+ gc.collect()
+ f0_computation_stack.append(f0)
+
+ f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
+ f0_median_hybrid = None
+ if len(f0_computation_stack) == 1:
+ f0_median_hybrid = f0_computation_stack[0]
+ else:
+ f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
+ return f0_median_hybrid
+
+ def get_f0(
+ self,
+ input_audio_path,
+ x,
+ p_len,
+ pitch,
+ f0_method,
+ filter_radius,
+ hop_length,
+ f0_autotune,
+ f0_autotune_strength,
+ inp_f0=None,
+ ):
+ """
+ Estimates the fundamental frequency (F0) of a given audio signal using various methods.
+
+ Args:
+ input_audio_path: Path to the input audio file.
+ x: The input audio signal as a NumPy array.
+ p_len: Desired length of the F0 output.
+ pitch: Key to adjust the pitch of the F0 contour.
+ f0_method: Method to use for F0 estimation (e.g., "crepe").
+ filter_radius: Radius for median filtering the F0 contour.
+ hop_length: Hop length for F0 estimation methods.
+            f0_autotune: Whether to apply autotune to the F0 contour.
+            f0_autotune_strength: Strength of the autotune blending.
+            inp_f0: Optional input F0 contour to use instead of estimating.
+ """
+ global input_audio_path2wav
+ if f0_method == "crepe":
+ f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
+ elif f0_method == "crepe-tiny":
+ f0 = self.get_f0_crepe(
+ x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny"
+ )
+ elif f0_method == "rmvpe":
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+ elif f0_method == "fcpe":
+ self.model_fcpe = FCPEF0Predictor(
+ os.path.join("rvc", "models", "predictors", "fcpe.pt"),
+ f0_min=int(self.f0_min),
+ f0_max=int(self.f0_max),
+ dtype=torch.float32,
+ device=self.device,
+ sample_rate=self.sample_rate,
+ threshold=0.03,
+ )
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
+ del self.model_fcpe
+ gc.collect()
+ elif "hybrid" in f0_method:
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
+ f0 = self.get_f0_hybrid(
+ f0_method,
+ x,
+ self.f0_min,
+ self.f0_max,
+ p_len,
+ hop_length,
+ )
+
+        if f0_autotune:
+            f0 = self.autotune.autotune_f0(f0, f0_autotune_strength)
+
+ f0 *= pow(2, pitch / 12)
+ tf0 = self.sample_rate // self.window
+ if inp_f0 is not None:
+ delta_t = np.round(
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+ ).astype("int16")
+ replace_f0 = np.interp(
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+ )
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+ :shape
+ ]
+ f0bak = f0.copy()
+ f0_mel = 1127 * np.log(1 + f0 / 700)
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
+ self.f0_mel_max - self.f0_mel_min
+ ) + 1
+ f0_mel[f0_mel <= 1] = 1
+ f0_mel[f0_mel > 255] = 255
+ f0_coarse = np.rint(f0_mel).astype(int)
+
+ return f0_coarse, f0bak
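+
+    # Worked example (comment added for clarity): f0 is first transposed by
+    # 2^(pitch / 12), then mapped to a mel-like scale and quantized to the
+    # integer range 1..255; for instance 440 Hz lands in roughly bin 122,
+    # while anything at or below f0_min (50 Hz) stays in bin 1.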
+
+ def voice_conversion(
+ self,
+ model,
+ net_g,
+ sid,
+ audio0,
+ pitch,
+ pitchf,
+ index,
+ big_npy,
+ index_rate,
+ version,
+ protect,
+ ):
+ """
+ Performs voice conversion on a given audio segment.
+
+ Args:
+ model: The feature extractor model.
+ net_g: The generative model for synthesizing speech.
+ sid: Speaker ID for the target voice.
+ audio0: The input audio segment.
+ pitch: Quantized F0 contour for pitch guidance.
+ pitchf: Original F0 contour for pitch guidance.
+ index: FAISS index for speaker embedding retrieval.
+ big_npy: Speaker embeddings stored in a NumPy array.
+ index_rate: Blending rate for speaker embedding retrieval.
+            version: Model version (kept for compatibility with older models).
+ protect: Protection level for preserving the original pitch.
+ """
+ with torch.no_grad():
+            pitch_guidance = pitch is not None and pitchf is not None
+ # prepare source audio
+ feats = torch.from_numpy(audio0).float()
+ feats = feats.mean(-1) if feats.dim() == 2 else feats
+ assert feats.dim() == 1, feats.dim()
+ feats = feats.view(1, -1).to(self.device)
+ # extract features
+ feats = model(feats)["last_hidden_state"]
+ feats = (
+ model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
+ )
+ # make a copy for pitch guidance and protection
+ feats0 = feats.clone() if pitch_guidance else None
+ if (
+ index
+ ): # set by parent function, only true if index is available, loaded, and index rate > 0
+ feats = self._retrieve_speaker_embeddings(
+ feats, index, big_npy, index_rate
+ )
+ # feature upsampling
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
+ 0, 2, 1
+ )
+ # adjust the length if the audio is short
+ p_len = min(audio0.shape[0] // self.window, feats.shape[1])
+ if pitch_guidance:
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+ 0, 2, 1
+ )
+ pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
+ # Pitch protection blending
+ if protect < 0.5:
+ pitchff = pitchf.clone()
+ pitchff[pitchf > 0] = 1
+ pitchff[pitchf < 1] = protect
+ feats = feats * pitchff.unsqueeze(-1) + feats0 * (
+ 1 - pitchff.unsqueeze(-1)
+ )
+ feats = feats.to(feats0.dtype)
+ else:
+ pitch, pitchf = None, None
+ p_len = torch.tensor([p_len], device=self.device).long()
+            # pitchf is None when the model was trained without pitch guidance, so guard the .float() call
+            audio1 = (
+                (
+                    net_g.infer(
+                        feats.float(),
+                        p_len,
+                        pitch,
+                        pitchf.float() if pitchf is not None else None,
+                        sid,
+                    )[0][0, 0]
+                )
+                .data.cpu()
+                .float()
+                .numpy()
+            )
+ # clean up
+ del feats, feats0, p_len
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ return audio1
+
+ def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
+ npy = feats[0].cpu().numpy()
+ score, ix = index.search(npy, k=8)
+ weight = np.square(1 / score)
+ weight /= weight.sum(axis=1, keepdims=True)
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+ feats = (
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ + (1 - index_rate) * feats
+ )
+ return feats
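+
+    # Comment added for clarity: the FAISS index returns the 8 nearest training
+    # embeddings for every frame; they are averaged with inverse-squared-distance
+    # weights and blended with the live HuBERT features using index_rate
+    # (0 = ignore the index, 1 = use only the retrieved embeddings).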
+
+ def pipeline(
+ self,
+ model,
+ net_g,
+ sid,
+ audio,
+ pitch,
+ f0_method,
+ file_index,
+ index_rate,
+ pitch_guidance,
+ filter_radius,
+ volume_envelope,
+ version,
+ protect,
+ hop_length,
+ f0_autotune,
+ f0_autotune_strength,
+ f0_file,
+ ):
+ """
+ The main pipeline function for performing voice conversion.
+
+ Args:
+ model: The feature extractor model.
+ net_g: The generative model for synthesizing speech.
+ sid: Speaker ID for the target voice.
+ audio: The input audio signal.
+            pitch: Pitch shift in semitones applied to the F0 contour.
+            f0_method: Method to use for F0 estimation.
+            file_index: Path to the FAISS index file for speaker embedding retrieval.
+            index_rate: Blending rate for speaker embedding retrieval.
+            pitch_guidance: Whether to use pitch guidance during voice conversion.
+            filter_radius: Radius for median filtering the F0 contour.
+            volume_envelope: Blending rate for adjusting the RMS level of the output audio.
+            version: Model version.
+            protect: Protection level for preserving the original pitch.
+            hop_length: Hop length for F0 estimation methods.
+            f0_autotune: Whether to apply autotune to the F0 contour.
+            f0_autotune_strength: Strength of the autotune blending.
+            f0_file: Path to a file containing an F0 contour to use.
+ """
+ if file_index != "" and os.path.exists(file_index) and index_rate > 0:
+ try:
+ index = faiss.read_index(file_index)
+ big_npy = index.reconstruct_n(0, index.ntotal)
+ except Exception as error:
+ print(f"An error occurred reading the FAISS index: {error}")
+ index = big_npy = None
+ else:
+ index = big_npy = None
+ audio = signal.filtfilt(bh, ah, audio)
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+ opt_ts = []
+ if audio_pad.shape[0] > self.t_max:
+ audio_sum = np.zeros_like(audio)
+ for i in range(self.window):
+ audio_sum += audio_pad[i : i - self.window]
+ for t in range(self.t_center, audio.shape[0], self.t_center):
+ opt_ts.append(
+ t
+ - self.t_query
+ + np.where(
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+ )[0][0]
+ )
+ s = 0
+ audio_opt = []
+ t = None
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+ p_len = audio_pad.shape[0] // self.window
+ inp_f0 = None
+ if hasattr(f0_file, "name"):
+ try:
+ with open(f0_file.name, "r") as f:
+ lines = f.read().strip("\n").split("\n")
+ inp_f0 = []
+ for line in lines:
+ inp_f0.append([float(i) for i in line.split(",")])
+ inp_f0 = np.array(inp_f0, dtype="float32")
+ except Exception as error:
+ print(f"An error occurred reading the F0 file: {error}")
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+ if pitch_guidance:
+ pitch, pitchf = self.get_f0(
+ "input_audio_path", # questionable purpose of making a key for an array
+ audio_pad,
+ p_len,
+ pitch,
+ f0_method,
+ filter_radius,
+ hop_length,
+ f0_autotune,
+ f0_autotune_strength,
+ inp_f0,
+ )
+ pitch = pitch[:p_len]
+ pitchf = pitchf[:p_len]
+ if self.device == "mps":
+ pitchf = pitchf.astype(np.float32)
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+ for t in opt_ts:
+ t = t // self.window * self.window
+ if pitch_guidance:
+ audio_opt.append(
+ self.voice_conversion(
+ model,
+ net_g,
+ sid,
+ audio_pad[s : t + self.t_pad2 + self.window],
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+ index,
+ big_npy,
+ index_rate,
+ version,
+ protect,
+ )[self.t_pad_tgt : -self.t_pad_tgt]
+ )
+ else:
+ audio_opt.append(
+ self.voice_conversion(
+ model,
+ net_g,
+ sid,
+ audio_pad[s : t + self.t_pad2 + self.window],
+ None,
+ None,
+ index,
+ big_npy,
+ index_rate,
+ version,
+ protect,
+ )[self.t_pad_tgt : -self.t_pad_tgt]
+ )
+ s = t
+ if pitch_guidance:
+ audio_opt.append(
+ self.voice_conversion(
+ model,
+ net_g,
+ sid,
+ audio_pad[t:],
+ pitch[:, t // self.window :] if t is not None else pitch,
+ pitchf[:, t // self.window :] if t is not None else pitchf,
+ index,
+ big_npy,
+ index_rate,
+ version,
+ protect,
+ )[self.t_pad_tgt : -self.t_pad_tgt]
+ )
+ else:
+ audio_opt.append(
+ self.voice_conversion(
+ model,
+ net_g,
+ sid,
+ audio_pad[t:],
+ None,
+ None,
+ index,
+ big_npy,
+ index_rate,
+ version,
+ protect,
+ )[self.t_pad_tgt : -self.t_pad_tgt]
+ )
+ audio_opt = np.concatenate(audio_opt)
+ if volume_envelope != 1:
+ audio_opt = AudioProcessor.change_rms(
+ audio, self.sample_rate, audio_opt, self.sample_rate, volume_envelope
+ )
+ audio_max = np.abs(audio_opt).max() / 0.99
+ if audio_max > 1:
+ audio_opt /= audio_max
+ if pitch_guidance:
+ del pitch, pitchf
+ del sid
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ return audio_opt
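+
+
+# Processing overview (comments added for clarity, not part of the original module):
+# 1. The input is high-pass filtered (5th-order Butterworth at 48 Hz) and padded.
+# 2. For long inputs (> t_max samples) low-energy split points are searched every
+#    t_center samples within a +/- t_query window, so cuts land in quiet regions.
+# 3. Each padded segment is converted by voice_conversion() and its t_pad_tgt
+#    margins are trimmed before the segments are concatenated.
+# 4. Optionally the RMS envelope is blended back toward the source signal
+#    (volume_envelope) and the result is peak-normalized to avoid clipping.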
diff --git a/rvc/lib/algorithm/__init__.py b/rvc/lib/algorithm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/rvc/lib/algorithm/attentions.py b/rvc/lib/algorithm/attentions.py
new file mode 100644
index 0000000000000000000000000000000000000000..d698afc8aef31097772e8aafc5aca0189043e0ed
--- /dev/null
+++ b/rvc/lib/algorithm/attentions.py
@@ -0,0 +1,243 @@
+import math
+import torch
+from rvc.lib.algorithm.commons import convert_pad_shape
+
+
+class MultiHeadAttention(torch.nn.Module):
+ """
+ Multi-head attention module with optional relative positional encoding and proximal bias.
+
+ Args:
+ channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ n_heads (int): Number of attention heads.
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
+ window_size (int, optional): Window size for relative positional encoding. Defaults to None.
+ heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
+ block_length (int, optional): Block length for local attention. Defaults to None.
+ proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
+ proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.
+ """
+
+ def __init__(
+ self,
+ channels: int,
+ out_channels: int,
+ n_heads: int,
+ p_dropout: float = 0.0,
+ window_size: int = None,
+ heads_share: bool = True,
+ block_length: int = None,
+ proximal_bias: bool = False,
+ proximal_init: bool = False,
+ ):
+ super().__init__()
+ assert (
+ channels % n_heads == 0
+ ), "Channels must be divisible by the number of heads."
+
+ self.channels = channels
+ self.out_channels = out_channels
+ self.n_heads = n_heads
+ self.k_channels = channels // n_heads
+ self.window_size = window_size
+ self.block_length = block_length
+ self.proximal_bias = proximal_bias
+
+ # Define projections
+ self.conv_q = torch.nn.Conv1d(channels, channels, 1)
+ self.conv_k = torch.nn.Conv1d(channels, channels, 1)
+ self.conv_v = torch.nn.Conv1d(channels, channels, 1)
+ self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
+
+ self.drop = torch.nn.Dropout(p_dropout)
+
+ # Relative positional encodings
+ if window_size:
+ n_heads_rel = 1 if heads_share else n_heads
+ rel_stddev = self.k_channels**-0.5
+ self.emb_rel_k = torch.nn.Parameter(
+ torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
+ * rel_stddev
+ )
+ self.emb_rel_v = torch.nn.Parameter(
+ torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
+ * rel_stddev
+ )
+
+ # Initialize weights
+ torch.nn.init.xavier_uniform_(self.conv_q.weight)
+ torch.nn.init.xavier_uniform_(self.conv_k.weight)
+ torch.nn.init.xavier_uniform_(self.conv_v.weight)
+ torch.nn.init.xavier_uniform_(self.conv_o.weight)
+
+ if proximal_init:
+ with torch.no_grad():
+ self.conv_k.weight.copy_(self.conv_q.weight)
+ self.conv_k.bias.copy_(self.conv_q.bias)
+
+ def forward(self, x, c, attn_mask=None):
+ # Compute query, key, value projections
+ q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
+
+ # Compute attention
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+ # Final output projection
+ return self.conv_o(x)
+
+ def attention(self, query, key, value, mask=None):
+ # Reshape and compute scaled dot-product attention
+ b, d, t_s, t_t = (*key.size(), query.size(2))
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+
+ if self.window_size:
+ assert t_s == t_t, "Relative attention only supports self-attention."
+ scores += self._compute_relative_scores(query, t_s)
+
+ if self.proximal_bias:
+ assert t_s == t_t, "Proximal bias only supports self-attention."
+ scores += self._attention_bias_proximal(t_s).to(scores.device, scores.dtype)
+
+ if mask is not None:
+ scores = scores.masked_fill(mask == 0, -1e4)
+ if self.block_length:
+ block_mask = (
+ torch.ones_like(scores)
+ .triu(-self.block_length)
+ .tril(self.block_length)
+ )
+ scores = scores.masked_fill(block_mask == 0, -1e4)
+
+ # Apply softmax and dropout
+ p_attn = self.drop(torch.nn.functional.softmax(scores, dim=-1))
+
+ # Compute attention output
+ output = torch.matmul(p_attn, value)
+
+ if self.window_size:
+ output += self._apply_relative_values(p_attn, t_s)
+
+ return output.transpose(2, 3).contiguous().view(b, d, t_t), p_attn
+
+ def _compute_relative_scores(self, query, length):
+ rel_emb = self._get_relative_embeddings(self.emb_rel_k, length)
+ rel_logits = self._matmul_with_relative_keys(
+ query / math.sqrt(self.k_channels), rel_emb
+ )
+ return self._relative_position_to_absolute_position(rel_logits)
+
+ def _apply_relative_values(self, p_attn, length):
+ rel_weights = self._absolute_position_to_relative_position(p_attn)
+ rel_emb = self._get_relative_embeddings(self.emb_rel_v, length)
+ return self._matmul_with_relative_values(rel_weights, rel_emb)
+
+ # Helper methods
+ def _matmul_with_relative_values(self, x, y):
+ return torch.matmul(x, y.unsqueeze(0))
+
+ def _matmul_with_relative_keys(self, x, y):
+ return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+
+ def _get_relative_embeddings(self, embeddings, length):
+ pad_length = max(length - (self.window_size + 1), 0)
+ start = max((self.window_size + 1) - length, 0)
+ end = start + 2 * length - 1
+
+ if pad_length > 0:
+ embeddings = torch.nn.functional.pad(
+ embeddings,
+ convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+ )
+ return embeddings[:, start:end]
+
+ def _relative_position_to_absolute_position(self, x):
+ batch, heads, length, _ = x.size()
+ x = torch.nn.functional.pad(
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
+ )
+ x_flat = x.view(batch, heads, length * 2 * length)
+ x_flat = torch.nn.functional.pad(
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+ )
+ return x_flat.view(batch, heads, length + 1, 2 * length - 1)[
+ :, :, :length, length - 1 :
+ ]
+
+ def _absolute_position_to_relative_position(self, x):
+ batch, heads, length, _ = x.size()
+ x = torch.nn.functional.pad(
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+ )
+ x_flat = x.view(batch, heads, length**2 + length * (length - 1))
+ x_flat = torch.nn.functional.pad(
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
+ )
+ return x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]
+
+ def _attention_bias_proximal(self, length):
+ r = torch.arange(length, dtype=torch.float32)
+ diff = r.unsqueeze(0) - r.unsqueeze(1)
+ return -torch.log1p(torch.abs(diff)).unsqueeze(0).unsqueeze(0)
+
+
+class FFN(torch.nn.Module):
+ """
+ Feed-forward network module.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ filter_channels (int): Number of filter channels in the convolution layers.
+ kernel_size (int): Kernel size of the convolution layers.
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
+ activation (str, optional): Activation function to use. Defaults to None.
+ causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ filter_channels: int,
+ kernel_size: int,
+ p_dropout: float = 0.0,
+ activation: str = None,
+ causal: bool = False,
+ ):
+ super().__init__()
+ self.padding_fn = self._causal_padding if causal else self._same_padding
+
+ self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
+ self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
+ self.drop = torch.nn.Dropout(p_dropout)
+
+ self.activation = activation
+
+ def forward(self, x, x_mask):
+ x = self.conv_1(self.padding_fn(x * x_mask))
+ x = self._apply_activation(x)
+ x = self.drop(x)
+ x = self.conv_2(self.padding_fn(x * x_mask))
+ return x * x_mask
+
+ def _apply_activation(self, x):
+ if self.activation == "gelu":
+ return x * torch.sigmoid(1.702 * x)
+ return torch.relu(x)
+
+ def _causal_padding(self, x):
+ pad_l, pad_r = self.conv_1.kernel_size[0] - 1, 0
+ return torch.nn.functional.pad(
+ x, convert_pad_shape([[0, 0], [0, 0], [pad_l, pad_r]])
+ )
+
+ def _same_padding(self, x):
+ pad = (self.conv_1.kernel_size[0] - 1) // 2
+ return torch.nn.functional.pad(
+ x, convert_pad_shape([[0, 0], [0, 0], [pad, pad]])
+ )
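+
+
+# Hedged usage sketch (added for illustration, not part of the original module):
+# a quick shape check on random tensors; run from the repository root so the
+# rvc.lib.algorithm.commons import resolves.
+if __name__ == "__main__":
+    _x = torch.randn(2, 192, 50)  # (batch, channels, time)
+    _attn = MultiHeadAttention(192, 192, n_heads=2, window_size=10)
+    _mask = torch.ones(2, 1, 50, 50)  # all-ones mask: plain self-attention
+    print(_attn(_x, _x, attn_mask=_mask).shape)  # torch.Size([2, 192, 50])
+    _ffn = FFN(192, 192, 768, kernel_size=3)
+    print(_ffn(_x, torch.ones(2, 1, 50)).shape)  # torch.Size([2, 192, 50])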
diff --git a/rvc/lib/algorithm/commons.py b/rvc/lib/algorithm/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..89d00cbf50d4461f994a27e1629083757033bcee
--- /dev/null
+++ b/rvc/lib/algorithm/commons.py
@@ -0,0 +1,117 @@
+import torch
+from typing import Optional
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ """
+ Initialize the weights of a module.
+
+ Args:
+ m: The module to initialize.
+ mean: The mean of the normal distribution.
+ std: The standard deviation of the normal distribution.
+ """
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+ """
+ Calculate the padding needed for a convolution.
+
+ Args:
+ kernel_size: The size of the kernel.
+ dilation: The dilation of the convolution.
+ """
+ return int((kernel_size * dilation - dilation) / 2)
+
+
+def convert_pad_shape(pad_shape):
+ """
+ Convert the pad shape to a list of integers.
+
+ Args:
+        pad_shape: The pad shape as a list of [before, after] pairs, one per tensor dimension.
+    """
+    reversed_shape = pad_shape[::-1]
+    return [item for sublist in reversed_shape for item in sublist]
+
+
+def slice_segments(
+ x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2
+):
+ """
+ Slice segments from a tensor, handling tensors with different numbers of dimensions.
+
+ Args:
+ x (torch.Tensor): The tensor to slice.
+ ids_str (torch.Tensor): The starting indices of the segments.
+ segment_size (int, optional): The size of each segment. Defaults to 4.
+ dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2.
+ """
+ if dim == 2:
+ ret = torch.zeros_like(x[:, :segment_size])
+ elif dim == 3:
+ ret = torch.zeros_like(x[:, :, :segment_size])
+
+ for i in range(x.size(0)):
+ idx_str = ids_str[i].item()
+ idx_end = idx_str + segment_size
+ if dim == 2:
+ ret[i] = x[i, idx_str:idx_end]
+ else:
+ ret[i] = x[i, :, idx_str:idx_end]
+
+ return ret
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+ """
+ Randomly slice segments from a tensor.
+
+ Args:
+ x: The tensor to slice.
+ x_lengths: The lengths of the sequences.
+ segment_size: The size of each segment.
+ """
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size + 1
+ ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size, dim=3)
+ return ret, ids_str
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ """
+ Fused add tanh sigmoid multiply operation.
+
+ Args:
+ input_a: The first input tensor.
+ input_b: The second input tensor.
+ n_channels: The number of channels.
+ """
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
+ """
+ Generate a sequence mask.
+
+ Args:
+ length: The lengths of the sequences.
+ max_length: The maximum length of the sequences.
+ """
+ if max_length is None:
+ max_length = length.max()
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+ return x.unsqueeze(0) < length.unsqueeze(1)
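+
+
+# Hedged usage sketch (added for illustration, not part of the original module):
+# sequence_mask and rand_slice_segments on toy tensors.
+if __name__ == "__main__":
+    lengths = torch.tensor([3, 5])
+    print(sequence_mask(lengths, max_length=5))  # (2, 5) boolean mask; row 0 has three True values
+    feats = torch.randn(2, 8, 10)  # (batch, channels, frames)
+    segments, starts = rand_slice_segments(feats, torch.tensor([10, 10]), segment_size=4)
+    print(segments.shape, starts)  # torch.Size([2, 8, 4]) and the random start frames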
diff --git a/rvc/lib/algorithm/discriminators.py b/rvc/lib/algorithm/discriminators.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bee31e775e9f26d4337e68625f01fc7a4885fdb
--- /dev/null
+++ b/rvc/lib/algorithm/discriminators.py
@@ -0,0 +1,175 @@
+import torch
+from torch.utils.checkpoint import checkpoint
+from torch.nn.utils.parametrizations import spectral_norm, weight_norm
+
+from rvc.lib.algorithm.commons import get_padding
+from rvc.lib.algorithm.residuals import LRELU_SLOPE
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ """
+ Multi-period discriminator.
+
+    Combines one waveform-scale discriminator (DiscriminatorS) with several
+    period-based discriminators (DiscriminatorP), one per period in
+    [2, 3, 5, 7, 11, 17, 23, 37], to distinguish real from generated audio.
+
+    Args:
+        use_spectral_norm (bool): Whether to use spectral normalization. Defaults to False.
+        checkpointing (bool): Whether to apply gradient checkpointing to the
+            sub-discriminators during training to save memory. Defaults to False.
+ """
+
+ def __init__(self, use_spectral_norm: bool = False, checkpointing: bool = False):
+ super(MultiPeriodDiscriminator, self).__init__()
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
+ self.checkpointing = checkpointing
+ self.discriminators = torch.nn.ModuleList(
+ [
+ DiscriminatorS(
+ use_spectral_norm=use_spectral_norm, checkpointing=checkpointing
+ )
+ ]
+ + [
+ DiscriminatorP(
+ p, use_spectral_norm=use_spectral_norm, checkpointing=checkpointing
+ )
+ for p in periods
+ ]
+ )
+
+ def forward(self, y, y_hat):
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
+ for d in self.discriminators:
+ if self.training and self.checkpointing:
+
+ def forward_discriminator(d, y, y_hat):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ return y_d_r, fmap_r, y_d_g, fmap_g
+
+ y_d_r, fmap_r, y_d_g, fmap_g = checkpoint(
+ forward_discriminator, d, y, y_hat, use_reentrant=False
+ )
+ else:
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorS(torch.nn.Module):
+ """
+    Waveform-scale discriminator.
+
+    Applies a stack of strided 1D convolutions directly to the raw waveform and
+    returns the final logits together with the intermediate feature maps.
+ """
+
+ def __init__(self, use_spectral_norm: bool = False, checkpointing: bool = False):
+ super(DiscriminatorS, self).__init__()
+ self.checkpointing = checkpointing
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
+ self.convs = torch.nn.ModuleList(
+ [
+ norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)),
+ norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+ norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+ norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+ norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+ norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)),
+ ]
+ )
+ self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True)
+
+ def forward(self, x):
+ fmap = []
+ for conv in self.convs:
+ if self.training and self.checkpointing:
+ x = checkpoint(conv, x, use_reentrant=False)
+ x = checkpoint(self.lrelu, x, use_reentrant=False)
+ else:
+ x = self.lrelu(conv(x))
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+ return x, fmap
+
+
+class DiscriminatorP(torch.nn.Module):
+ """
+    Period-based discriminator.
+
+    Reshapes the input waveform into a 2D representation with the given period
+    and applies a stack of 2D convolutions, so it captures structure that
+    repeats with that periodicity.
+
+ Args:
+ period (int): Period of the discriminator.
+ kernel_size (int): Kernel size of the convolutional layers. Defaults to 5.
+ stride (int): Stride of the convolutional layers. Defaults to 3.
+ use_spectral_norm (bool): Whether to use spectral normalization. Defaults to False.
+ """
+
+ def __init__(
+ self,
+ period: int,
+ kernel_size: int = 5,
+ stride: int = 3,
+ use_spectral_norm: bool = False,
+ checkpointing: bool = False,
+ ):
+ super(DiscriminatorP, self).__init__()
+ self.checkpointing = checkpointing
+ self.period = period
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
+
+ in_channels = [1, 32, 128, 512, 1024]
+ out_channels = [32, 128, 512, 1024, 1024]
+
+ self.convs = torch.nn.ModuleList(
+ [
+ norm_f(
+ torch.nn.Conv2d(
+ in_ch,
+ out_ch,
+ (kernel_size, 1),
+ (stride, 1),
+ padding=(get_padding(kernel_size, 1), 0),
+ )
+ )
+ for in_ch, out_ch in zip(in_channels, out_channels)
+ ]
+ )
+
+ self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True)
+
+ def forward(self, x):
+ fmap = []
+ b, c, t = x.shape
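+        # Fold the 1D waveform into a 2D map: pad the time axis to a multiple of
+        # the period, then view it as (batch, channels, time // period, period).
+        # The (kernel_size, 1) convolutions below then mix samples spaced exactly
+        # one period apart.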
+ if t % self.period != 0:
+ n_pad = self.period - (t % self.period)
+ x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
+ x = x.view(b, c, -1, self.period)
+
+ for conv in self.convs:
+ if self.training and self.checkpointing:
+ x = checkpoint(conv, x, use_reentrant=False)
+ x = checkpoint(self.lrelu, x, use_reentrant=False)
+ else:
+ x = self.lrelu(conv(x))
+ fmap.append(x)
+
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+ return x, fmap
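+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): the input shapes are assumptions
+    # based on the Conv1d/Conv2d definitions above, i.e. mono waveforms of shape
+    # (batch, 1, samples).
+    mpd = MultiPeriodDiscriminator(use_spectral_norm=False)
+    real = torch.randn(2, 1, 16000)
+    fake = torch.randn(2, 1, 16000)
+    y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(real, fake)
+    print(len(y_d_rs), y_d_rs[0].shape)  # one score tensor per sub-discriminator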
diff --git a/rvc/lib/algorithm/encoders.py b/rvc/lib/algorithm/encoders.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff27d039e6d177724059c180e0fbfcdbdaa03689
--- /dev/null
+++ b/rvc/lib/algorithm/encoders.py
@@ -0,0 +1,209 @@
+import math
+import torch
+from typing import Optional
+
+from rvc.lib.algorithm.commons import sequence_mask
+from rvc.lib.algorithm.modules import WaveNet
+from rvc.lib.algorithm.normalization import LayerNorm
+from rvc.lib.algorithm.attentions import FFN, MultiHeadAttention
+
+
+class Encoder(torch.nn.Module):
+ """
+ Encoder module for the Transformer model.
+
+ Args:
+ hidden_channels (int): Number of hidden channels in the encoder.
+ filter_channels (int): Number of filter channels in the feed-forward network.
+ n_heads (int): Number of attention heads.
+ n_layers (int): Number of encoder layers.
+ kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1.
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
+ window_size (int, optional): Window size for relative positional encoding. Defaults to 10.
+ """
+
+ def __init__(
+ self,
+ hidden_channels: int,
+ filter_channels: int,
+ n_heads: int,
+ n_layers: int,
+ kernel_size: int = 1,
+ p_dropout: float = 0.0,
+ window_size: int = 10,
+ ):
+ super().__init__()
+
+ self.hidden_channels = hidden_channels
+ self.n_layers = n_layers
+ self.drop = torch.nn.Dropout(p_dropout)
+
+ self.attn_layers = torch.nn.ModuleList(
+ [
+ MultiHeadAttention(
+ hidden_channels,
+ hidden_channels,
+ n_heads,
+ p_dropout=p_dropout,
+ window_size=window_size,
+ )
+ for _ in range(n_layers)
+ ]
+ )
+ self.norm_layers_1 = torch.nn.ModuleList(
+ [LayerNorm(hidden_channels) for _ in range(n_layers)]
+ )
+ self.ffn_layers = torch.nn.ModuleList(
+ [
+ FFN(
+ hidden_channels,
+ hidden_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=p_dropout,
+ )
+ for _ in range(n_layers)
+ ]
+ )
+ self.norm_layers_2 = torch.nn.ModuleList(
+ [LayerNorm(hidden_channels) for _ in range(n_layers)]
+ )
+
+ def forward(self, x, x_mask):
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+
+ for i in range(self.n_layers):
+ y = self.attn_layers[i](x, x, attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+
+ return x * x_mask
+
+
+class TextEncoder(torch.nn.Module):
+ """
+ Text Encoder with configurable embedding dimension.
+
+ Args:
+ out_channels (int): Output channels of the encoder.
+ hidden_channels (int): Hidden channels of the encoder.
+ filter_channels (int): Filter channels of the encoder.
+ n_heads (int): Number of attention heads.
+ n_layers (int): Number of encoder layers.
+ kernel_size (int): Kernel size of the convolutional layers.
+ p_dropout (float): Dropout probability.
+ embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768).
+ f0 (bool, optional): Whether to use F0 embedding. Defaults to True.
+ """
+
+ def __init__(
+ self,
+ out_channels: int,
+ hidden_channels: int,
+ filter_channels: int,
+ n_heads: int,
+ n_layers: int,
+ kernel_size: int,
+ p_dropout: float,
+ embedding_dim: int,
+ f0: bool = True,
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.out_channels = out_channels
+ self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
+ self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
+ self.emb_pitch = torch.nn.Embedding(256, hidden_channels) if f0 else None
+
+ self.encoder = Encoder(
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ )
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(
+ self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
+ ):
+ x = self.emb_phone(phone)
+ if pitch is not None and self.emb_pitch:
+ x += self.emb_pitch(pitch)
+
+ x *= math.sqrt(self.hidden_channels)
+ x = self.lrelu(x)
+ x = x.transpose(1, -1) # [B, H, T]
+
+ x_mask = sequence_mask(lengths, x.size(2)).unsqueeze(1).to(x.dtype)
+ x = self.encoder(x, x_mask)
+ stats = self.proj(x) * x_mask
+
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ return m, logs, x_mask
+
+
+class PosteriorEncoder(torch.nn.Module):
+ """
+ Posterior Encoder for inferring latent representation.
+
+ Args:
+ in_channels (int): Number of channels in the input.
+ out_channels (int): Number of channels in the output.
+ hidden_channels (int): Number of hidden channels in the encoder.
+ kernel_size (int): Kernel size of the convolutional layers.
+ dilation_rate (int): Dilation rate of the convolutional layers.
+ n_layers (int): Number of layers in the encoder.
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ dilation_rate: int,
+ n_layers: int,
+ gin_channels: int = 0,
+ ):
+ super().__init__()
+ self.out_channels = out_channels
+ self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = WaveNet(
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ )
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(
+ self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
+ ):
+ x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
+
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+
+ stats = self.proj(x) * x_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+
+ z = m + torch.randn_like(m) * torch.exp(logs)
+ z *= x_mask
+
+ return z, m, logs, x_mask
+
+ def remove_weight_norm(self):
+ self.enc.remove_weight_norm()
+
+ def __prepare_scriptable__(self):
+ for hook in self.enc._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.enc)
+ return self
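+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): the hyperparameters mirror the
+    # 768-dimensional ("v2") text-encoder settings, but the frame counts and
+    # tensor shapes below are assumptions.
+    enc = TextEncoder(
+        out_channels=192,
+        hidden_channels=192,
+        filter_channels=768,
+        n_heads=2,
+        n_layers=6,
+        kernel_size=3,
+        p_dropout=0.0,
+        embedding_dim=768,
+        f0=True,
+    )
+    phone = torch.randn(2, 50, 768)           # (batch, frames, embedding_dim)
+    pitch = torch.randint(0, 256, (2, 50))    # coarse F0 bins
+    lengths = torch.tensor([50, 40])
+    m, logs, x_mask = enc(phone, pitch, lengths)
+    print(m.shape, logs.shape, x_mask.shape)  # (2, 192, 50) x2 and (2, 1, 50)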
diff --git a/rvc/lib/algorithm/generators/hifigan.py b/rvc/lib/algorithm/generators/hifigan.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f18dfac0234640a311945d38dcab1451760b583
--- /dev/null
+++ b/rvc/lib/algorithm/generators/hifigan.py
@@ -0,0 +1,230 @@
+import torch
+import numpy as np
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from typing import Optional
+
+from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
+from rvc.lib.algorithm.commons import init_weights
+
+
+class HiFiGANGenerator(torch.nn.Module):
+ """
+ HiFi-GAN Generator module for audio synthesis.
+
+ This module implements the generator part of the HiFi-GAN architecture,
+ which uses transposed convolutions for upsampling and residual blocks for
+ refining the audio output. It can also incorporate global conditioning.
+
+ Args:
+ initial_channel (int): Number of input channels to the initial convolutional layer.
+ resblock_kernel_sizes (list): List of kernel sizes for the residual blocks.
+ resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size.
+ upsample_rates (list): List of upsampling factors for each upsampling layer.
+ upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer.
+ upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling.
+ gin_channels (int, optional): Number of input channels for the global conditioning. If 0, no global conditioning is used. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ initial_channel: int,
+ resblock_kernel_sizes: list,
+ resblock_dilation_sizes: list,
+ upsample_rates: list,
+ upsample_initial_channel: int,
+ upsample_kernel_sizes: list,
+ gin_channels: int = 0,
+ ):
+ super(HiFiGANGenerator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = torch.nn.Conv1d(
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
+ )
+
+ self.ups = torch.nn.ModuleList()
+ self.resblocks = torch.nn.ModuleList()
+
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(
+ weight_norm(
+ torch.nn.ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ u,
+ padding=(k - u) // 2,
+ )
+ )
+ )
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ for j, (k, d) in enumerate(
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
+ ):
+ self.resblocks.append(ResBlock(ch, k, d))
+
+ self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
+ # new tensor
+ x = self.conv_pre(x)
+
+ if g is not None:
+ # in-place call
+ x += self.cond(g)
+
+ for i in range(self.num_upsamples):
+ # in-place call
+ x = torch.nn.functional.leaky_relu_(x, LRELU_SLOPE)
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i * self.num_kernels + j](x)
+ else:
+ xs += self.resblocks[i * self.num_kernels + j](x)
+ x = xs / self.num_kernels
+ # in-place call
+ x = torch.nn.functional.leaky_relu_(x)
+ x = self.conv_post(x)
+ # in-place call
+ x = torch.tanh_(x)
+
+ return x
+
+ def __prepare_scriptable__(self):
+        for l in [*self.ups, *self.resblocks]:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ return self
+
+ def remove_weight_norm(self):
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+
+class SineGenerator(torch.nn.Module):
+ """
+ Sine wave generator with optional harmonic overtones and noise.
+
+ This module generates sine waves for a fundamental frequency and its harmonics.
+ It can also add Gaussian noise and apply a voiced/unvoiced mask.
+
+ Args:
+ sampling_rate (int): The sampling rate of the audio in Hz.
+ num_harmonics (int, optional): The number of harmonic overtones to generate. Defaults to 0.
+ sine_amplitude (float, optional): The amplitude of the sine wave components. Defaults to 0.1.
+ noise_stddev (float, optional): The standard deviation of the additive Gaussian noise. Defaults to 0.003.
+ voiced_threshold (float, optional): The threshold for the fundamental frequency (F0) to determine if a frame is voiced. Defaults to 0.0.
+ """
+
+ def __init__(
+ self,
+ sampling_rate: int,
+ num_harmonics: int = 0,
+ sine_amplitude: float = 0.1,
+ noise_stddev: float = 0.003,
+ voiced_threshold: float = 0.0,
+ ):
+ super(SineGenerator, self).__init__()
+ self.sampling_rate = sampling_rate
+ self.num_harmonics = num_harmonics
+ self.sine_amplitude = sine_amplitude
+ self.noise_stddev = noise_stddev
+ self.voiced_threshold = voiced_threshold
+ self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics
+
+ def _compute_voiced_unvoiced(self, f0: torch.Tensor):
+ """
+ Generates a binary mask indicating voiced/unvoiced frames based on the fundamental frequency.
+
+ Args:
+ f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length).
+ """
+ uv_mask = (f0 > self.voiced_threshold).float()
+ return uv_mask
+
+ def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int):
+ """
+ Generates sine waves for the fundamental frequency and its harmonics.
+
+ Args:
+ f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1).
+ upsampling_factor (int): The factor by which to upsample the sine wave.
+ """
+ batch_size, length, _ = f0.shape
+
+ # Create an upsampling grid
+ upsampling_grid = torch.arange(
+ 1, upsampling_factor + 1, dtype=f0.dtype, device=f0.device
+ )
+
+ # Calculate phase increments
+ phase_increments = (f0 / self.sampling_rate) * upsampling_grid
+ phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5
+ cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype)
+ phase_increments += torch.nn.functional.pad(
+ cumulative_phase, (0, 0, 1, 0), mode="constant"
+ )
+
+ # Reshape to match the sine wave shape
+ phase_increments = phase_increments.reshape(batch_size, -1, 1)
+
+ # Scale for harmonics
+ harmonic_scale = torch.arange(
+ 1, self.waveform_dim + 1, dtype=f0.dtype, device=f0.device
+ ).reshape(1, 1, -1)
+ phase_increments *= harmonic_scale
+
+ # Add random phase offset (except for the fundamental)
+ random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device)
+ random_phase[..., 0] = 0 # Fundamental frequency has no random offset
+ phase_increments += random_phase
+
+ # Generate sine waves
+ sine_waves = torch.sin(2 * np.pi * phase_increments)
+ return sine_waves
+
+ def forward(self, f0: torch.Tensor, upsampling_factor: int):
+ with torch.no_grad():
+ # Expand `f0` to include waveform dimensions
+ f0 = f0.unsqueeze(-1)
+
+ # Generate sine waves
+ sine_waves = (
+ self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude
+ )
+
+ # Compute voiced/unvoiced mask
+ voiced_mask = self._compute_voiced_unvoiced(f0)
+
+ # Upsample voiced/unvoiced mask
+ voiced_mask = torch.nn.functional.interpolate(
+ voiced_mask.transpose(2, 1),
+ scale_factor=float(upsampling_factor),
+ mode="nearest",
+ ).transpose(2, 1)
+
+ # Compute noise amplitude
+ noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) * (
+ self.sine_amplitude / 3
+ )
+
+ # Add Gaussian noise
+ noise = noise_amplitude * torch.randn_like(sine_waves)
+
+ # Combine sine waves and noise
+ sine_waveforms = sine_waves * voiced_mask + noise
+
+ return sine_waveforms, voiced_mask, noise
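+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): the hyperparameters follow the
+    # 40000.json preset in rvc/configs, while the latent and speaker-embedding
+    # shapes below are assumptions.
+    gen = HiFiGANGenerator(
+        initial_channel=192,
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        upsample_rates=[10, 10, 2, 2],
+        upsample_initial_channel=512,
+        upsample_kernel_sizes=[16, 16, 4, 4],
+        gin_channels=256,
+    )
+    z = torch.randn(1, 192, 32)   # latent frames
+    g = torch.randn(1, 256, 1)    # speaker embedding
+    print(gen(z, g).shape)        # (1, 1, 32 * prod(upsample_rates)) = (1, 1, 12800)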
diff --git a/rvc/lib/algorithm/generators/hifigan_mrf.py b/rvc/lib/algorithm/generators/hifigan_mrf.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ebf806fd7fff512f771a2f5a699de203aebd645
--- /dev/null
+++ b/rvc/lib/algorithm/generators/hifigan_mrf.py
@@ -0,0 +1,385 @@
+import math
+from typing import Optional
+
+import numpy as np
+import torch
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.utils.checkpoint import checkpoint
+
+LRELU_SLOPE = 0.1
+
+
+class MRFLayer(torch.nn.Module):
+ """
+ A single layer of the Multi-Receptive Field (MRF) block.
+
+ This layer consists of two 1D convolutional layers with weight normalization
+ and Leaky ReLU activation in between. The first convolution has a dilation,
+ while the second has a dilation of 1. A skip connection is added from the input
+ to the output.
+
+ Args:
+ channels (int): The number of input and output channels.
+ kernel_size (int): The kernel size of the convolutional layers.
+ dilation (int): The dilation rate for the first convolutional layer.
+ """
+
+ def __init__(self, channels, kernel_size, dilation):
+ super().__init__()
+ self.conv1 = weight_norm(
+ torch.nn.Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ padding=(kernel_size * dilation - dilation) // 2,
+ dilation=dilation,
+ )
+ )
+ self.conv2 = weight_norm(
+ torch.nn.Conv1d(
+ channels, channels, kernel_size, padding=kernel_size // 2, dilation=1
+ )
+ )
+
+ def forward(self, x: torch.Tensor):
+ # new tensor
+ y = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
+ y = self.conv1(y)
+ # in-place call
+ y = torch.nn.functional.leaky_relu_(y, LRELU_SLOPE)
+ y = self.conv2(y)
+ return x + y
+
+ def remove_weight_norm(self):
+ remove_weight_norm(self.conv1)
+ remove_weight_norm(self.conv2)
+
+
+class MRFBlock(torch.nn.Module):
+ """
+ A Multi-Receptive Field (MRF) block.
+
+ This block consists of multiple MRFLayers with different dilation rates.
+ It applies each layer sequentially to the input.
+
+ Args:
+ channels (int): The number of input and output channels for the MRFLayers.
+ kernel_size (int): The kernel size for the convolutional layers in the MRFLayers.
+ dilations (list[int]): A list of dilation rates for the MRFLayers.
+ """
+
+ def __init__(self, channels, kernel_size, dilations):
+ super().__init__()
+ self.layers = torch.nn.ModuleList()
+ for dilation in dilations:
+ self.layers.append(MRFLayer(channels, kernel_size, dilation))
+
+ def forward(self, x: torch.Tensor):
+ for layer in self.layers:
+ x = layer(x)
+ return x
+
+ def remove_weight_norm(self):
+ for layer in self.layers:
+ layer.remove_weight_norm()
+
+
+class SineGenerator(torch.nn.Module):
+ """
+ Definition of sine generator
+
+ Generates sine waveforms with optional harmonics and additive noise.
+ Can be used to create harmonic noise source for neural vocoders.
+
+ Args:
+ samp_rate (int): Sampling rate in Hz.
+ harmonic_num (int): Number of harmonic overtones (default 0).
+ sine_amp (float): Amplitude of sine-waveform (default 0.1).
+ noise_std (float): Standard deviation of Gaussian noise (default 0.003).
+ voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0).
+ """
+
+ def __init__(
+ self,
+ samp_rate: int,
+ harmonic_num: int = 0,
+ sine_amp: float = 0.1,
+ noise_std: float = 0.003,
+ voiced_threshold: float = 0,
+ ):
+ super(SineGenerator, self).__init__()
+ self.sine_amp = sine_amp
+ self.noise_std = noise_std
+ self.harmonic_num = harmonic_num
+ self.dim = self.harmonic_num + 1
+ self.sampling_rate = samp_rate
+ self.voiced_threshold = voiced_threshold
+
+ def _f02uv(self, f0: torch.Tensor):
+ """
+ Generates voiced/unvoiced (UV) signal based on the fundamental frequency (F0).
+
+ Args:
+ f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1).
+ """
+ # generate uv signal
+ uv = torch.ones_like(f0)
+ uv = uv * (f0 > self.voiced_threshold)
+ return uv
+
+ def _f02sine(self, f0_values: torch.Tensor):
+ """
+ Generates sine waveforms based on the fundamental frequency (F0) and its harmonics.
+
+ Args:
+ f0_values (torch.Tensor): Tensor of fundamental frequency and its harmonics,
+ shape (batch_size, length, dim), where dim indicates
+ the fundamental tone and overtones.
+ """
+ # convert to F0 in rad. The integer part n can be ignored
+ # because 2 * np.pi * n doesn't affect phase
+ rad_values = (f0_values / self.sampling_rate) % 1
+
+ # initial phase noise (no noise for fundamental component)
+ rand_ini = torch.rand(
+ f0_values.shape[0], f0_values.shape[2], device=f0_values.device
+ )
+ rand_ini[:, 0] = 0
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase: sine[t] = sin(2 * pi * sum_{i=1}^{t} rad)
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+ cumsum_shift = torch.zeros_like(rad_values)
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+ sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
+
+ return sines
+
+ def forward(self, f0: torch.Tensor):
+ with torch.no_grad():
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+ # fundamental component
+ f0_buf[:, :, 0] = f0[:, :, 0]
+ for idx in np.arange(self.harmonic_num):
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
+
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
+
+ uv = self._f02uv(f0)
+
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+ noise = noise_amp * torch.randn_like(sine_waves)
+
+ sine_waves = sine_waves * uv + noise
+ return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+ """
+ Generates harmonic and noise source features.
+
+ This module uses the SineGenerator to create harmonic signals based on the
+ fundamental frequency (F0) and merges them into a single excitation signal.
+
+ Args:
+        sampling_rate (int): Sampling rate in Hz.
+ harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0.
+ sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1.
+ add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003.
+        voiced_threshold (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ sampling_rate: int,
+ harmonic_num: int = 0,
+ sine_amp: float = 0.1,
+ add_noise_std: float = 0.003,
+ voiced_threshold: float = 0,
+ ):
+ super(SourceModuleHnNSF, self).__init__()
+
+ self.sine_amp = sine_amp
+ self.noise_std = add_noise_std
+
+ # to produce sine waveforms
+ self.l_sin_gen = SineGenerator(
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold
+ )
+
+ # to merge source harmonics into a single excitation
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+ self.l_tanh = torch.nn.Tanh()
+
+ def forward(self, x: torch.Tensor):
+ sine_wavs, uv, _ = self.l_sin_gen(x)
+ sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+
+ return sine_merge, None, None
+
+
+class HiFiGANMRFGenerator(torch.nn.Module):
+ """
+ HiFi-GAN generator with Multi-Receptive Field (MRF) blocks.
+
+ This generator takes an input feature sequence and fundamental frequency (F0)
+ as input and generates an audio waveform. It utilizes transposed convolutions
+ for upsampling and MRF blocks for feature refinement. It can also condition
+ on global conditioning features.
+
+ Args:
+ in_channel (int): Number of input channels.
+ upsample_initial_channel (int): Number of channels after the initial convolution.
+ upsample_rates (list[int]): List of upsampling rates for the transposed convolutions.
+ upsample_kernel_sizes (list[int]): List of kernel sizes for the transposed convolutions.
+ resblock_kernel_sizes (list[int]): List of kernel sizes for the convolutional layers in the MRF blocks.
+ resblock_dilations (list[list[int]]): List of lists of dilation rates for the MRF blocks.
+ gin_channels (int): Number of global conditioning input channels (0 if no global conditioning).
+ sample_rate (int): Sampling rate of the audio.
+ harmonic_num (int): Number of harmonics to generate.
+ checkpointing (bool): Whether to use checkpointing to save memory during training (default: False).
+ """
+
+ def __init__(
+ self,
+ in_channel: int,
+ upsample_initial_channel: int,
+ upsample_rates: list[int],
+ upsample_kernel_sizes: list[int],
+ resblock_kernel_sizes: list[int],
+ resblock_dilations: list[list[int]],
+ gin_channels: int,
+ sample_rate: int,
+ harmonic_num: int,
+ checkpointing: bool = False,
+ ):
+ super().__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.checkpointing = checkpointing
+
+ self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+ self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num)
+
+ self.conv_pre = weight_norm(
+ torch.nn.Conv1d(
+ in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3
+ )
+ )
+ self.upsamples = torch.nn.ModuleList()
+ self.noise_convs = torch.nn.ModuleList()
+
+ stride_f0s = [
+ math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
+ for i in range(len(upsample_rates))
+ ]
+
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ # handling odd upsampling rates
+ if u % 2 == 0:
+ # old method
+ padding = (k - u) // 2
+ else:
+ padding = u // 2 + u % 2
+
+ self.upsamples.append(
+ weight_norm(
+ torch.nn.ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ kernel_size=k,
+ stride=u,
+ padding=padding,
+ output_padding=u % 2,
+ )
+ )
+ )
+ """ handling odd upsampling rates
+ # s k p
+ # 40 80 20
+ # 32 64 16
+ # 4 8 2
+ # 2 3 1
+ # 63 125 31
+ # 9 17 4
+ # 3 5 1
+ # 1 1 0
+ """
+ stride = stride_f0s[i]
+ kernel = 1 if stride == 1 else stride * 2 - stride % 2
+ padding = 0 if stride == 1 else (kernel - stride) // 2
+
+ self.noise_convs.append(
+ torch.nn.Conv1d(
+ 1,
+ upsample_initial_channel // (2 ** (i + 1)),
+ kernel_size=kernel,
+ stride=stride,
+ padding=padding,
+ )
+ )
+ self.mrfs = torch.nn.ModuleList()
+ for i in range(len(self.upsamples)):
+ channel = upsample_initial_channel // (2 ** (i + 1))
+ self.mrfs.append(
+ torch.nn.ModuleList(
+ [
+ MRFBlock(channel, kernel_size=k, dilations=d)
+ for k, d in zip(resblock_kernel_sizes, resblock_dilations)
+ ]
+ )
+ )
+ self.conv_post = weight_norm(
+ torch.nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3)
+ )
+ if gin_channels != 0:
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(
+ self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None
+ ):
+ f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2)
+ har_source, _, _ = self.m_source(f0)
+ har_source = har_source.transpose(-1, -2)
+ # new tensor
+ x = self.conv_pre(x)
+
+ if g is not None:
+ # in-place call
+ x += self.cond(g)
+
+ for ups, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs):
+ # in-place call
+ x = torch.nn.functional.leaky_relu_(x, LRELU_SLOPE)
+
+ if self.training and self.checkpointing:
+ x = checkpoint(ups, x, use_reentrant=False)
+ else:
+ x = ups(x)
+
+ x += noise_conv(har_source)
+
+ def mrf_sum(x, layers):
+ return sum(layer(x) for layer in layers) / self.num_kernels
+
+ if self.training and self.checkpointing:
+ x = checkpoint(mrf_sum, x, mrf, use_reentrant=False)
+ else:
+ x = mrf_sum(x, mrf)
+ # in-place call
+ x = torch.nn.functional.leaky_relu_(x)
+ x = self.conv_post(x)
+ # in-place call
+ x = torch.tanh_(x)
+ return x
+
+ def remove_weight_norm(self):
+ remove_weight_norm(self.conv_pre)
+ for up in self.upsamples:
+ remove_weight_norm(up)
+        for blocks in self.mrfs:
+            for block in blocks:
+                block.remove_weight_norm()
+ remove_weight_norm(self.conv_post)
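+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): the hyperparameters below are
+    # assumptions chosen to be self-consistent (a 40 kHz-style setup), not a
+    # shipped preset.
+    gen = HiFiGANMRFGenerator(
+        in_channel=192,
+        upsample_initial_channel=512,
+        upsample_rates=[10, 10, 2, 2],
+        upsample_kernel_sizes=[16, 16, 4, 4],
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilations=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        gin_channels=256,
+        sample_rate=40000,
+        harmonic_num=8,
+        checkpointing=False,
+    )
+    x = torch.randn(1, 192, 32)          # latent frames
+    f0 = torch.full((1, 32), 220.0)      # frame-level F0 in Hz
+    g = torch.randn(1, 256, 1)           # speaker embedding
+    print(gen(x, f0, g).shape)           # (1, 1, 32 * prod(upsample_rates)) = (1, 1, 12800)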
diff --git a/rvc/lib/algorithm/generators/hifigan_nsf.py b/rvc/lib/algorithm/generators/hifigan_nsf.py
new file mode 100644
index 0000000000000000000000000000000000000000..d192095c99b9269afc4ec903b4976e77f6fc38a4
--- /dev/null
+++ b/rvc/lib/algorithm/generators/hifigan_nsf.py
@@ -0,0 +1,237 @@
+import math
+from typing import Optional
+
+import torch
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+from torch.utils.checkpoint import checkpoint
+
+from rvc.lib.algorithm.commons import init_weights
+from rvc.lib.algorithm.generators.hifigan import SineGenerator
+from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+ """
+ Source Module for generating harmonic and noise components for audio synthesis.
+
+ This module generates a harmonic source signal using sine waves and adds
+ optional noise. It's often used in neural vocoders as a source of excitation.
+
+ Args:
+ sample_rate (int): Sampling rate of the audio in Hz.
+ harmonic_num (int, optional): Number of harmonic overtones to generate above the fundamental frequency (F0). Defaults to 0.
+ sine_amp (float, optional): Amplitude of the sine wave components. Defaults to 0.1.
+ add_noise_std (float, optional): Standard deviation of the additive white Gaussian noise. Defaults to 0.003.
+        voiced_threshold (float, optional): Threshold for the fundamental frequency (F0) to determine if a frame is voiced. If F0 is below this threshold, it's considered unvoiced. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ sample_rate: int,
+ harmonic_num: int = 0,
+ sine_amp: float = 0.1,
+ add_noise_std: float = 0.003,
+        voiced_threshold: float = 0,
+ ):
+ super(SourceModuleHnNSF, self).__init__()
+
+ self.sine_amp = sine_amp
+ self.noise_std = add_noise_std
+
+ self.l_sin_gen = SineGenerator(
+            sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold
+ )
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+ self.l_tanh = torch.nn.Tanh()
+
+ def forward(self, x: torch.Tensor, upsample_factor: int = 1):
+ sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor)
+ sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+ return sine_merge, None, None
+
+
+class HiFiGANNSFGenerator(torch.nn.Module):
+ """
+ Generator module based on the Neural Source Filter (NSF) architecture.
+
+ This generator synthesizes audio by first generating a source excitation signal
+ (harmonic and noise) and then filtering it through a series of upsampling and
+ residual blocks. Global conditioning can be applied to influence the generation.
+
+ Args:
+ initial_channel (int): Number of input channels to the initial convolutional layer.
+ resblock_kernel_sizes (list): List of kernel sizes for the residual blocks.
+ resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size.
+ upsample_rates (list): List of upsampling factors for each upsampling layer.
+ upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer.
+ upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling.
+ gin_channels (int): Number of input channels for the global conditioning. If 0, no global conditioning is used.
+ sr (int): Sampling rate of the audio.
+ checkpointing (bool, optional): Whether to use gradient checkpointing to save memory during training. Defaults to False.
+ """
+
+ def __init__(
+ self,
+ initial_channel: int,
+ resblock_kernel_sizes: list,
+ resblock_dilation_sizes: list,
+ upsample_rates: list,
+ upsample_initial_channel: int,
+ upsample_kernel_sizes: list,
+ gin_channels: int,
+ sr: int,
+ checkpointing: bool = False,
+ ):
+ super(HiFiGANNSFGenerator, self).__init__()
+
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.checkpointing = checkpointing
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
+ self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)
+
+ self.conv_pre = torch.nn.Conv1d(
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
+ )
+
+ self.ups = torch.nn.ModuleList()
+ self.noise_convs = torch.nn.ModuleList()
+
+ channels = [
+ upsample_initial_channel // (2 ** (i + 1))
+ for i in range(len(upsample_rates))
+ ]
+ stride_f0s = [
+ math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
+ for i in range(len(upsample_rates))
+ ]
+
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ # handling odd upsampling rates
+ if u % 2 == 0:
+ # old method
+ padding = (k - u) // 2
+ else:
+ padding = u // 2 + u % 2
+
+ self.ups.append(
+ weight_norm(
+ torch.nn.ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ channels[i],
+ k,
+ u,
+ padding=padding,
+ output_padding=u % 2,
+ )
+ )
+ )
+ """ handling odd upsampling rates
+ # s k p
+ # 40 80 20
+ # 32 64 16
+ # 4 8 2
+ # 2 3 1
+ # 63 125 31
+ # 9 17 4
+ # 3 5 1
+ # 1 1 0
+ """
+ stride = stride_f0s[i]
+ kernel = 1 if stride == 1 else stride * 2 - stride % 2
+ padding = 0 if stride == 1 else (kernel - stride) // 2
+
+ self.noise_convs.append(
+ torch.nn.Conv1d(
+ 1,
+ channels[i],
+ kernel_size=kernel,
+ stride=stride,
+ padding=padding,
+ )
+ )
+
+ self.resblocks = torch.nn.ModuleList(
+ [
+ ResBlock(channels[i], k, d)
+ for i in range(len(self.ups))
+ for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)
+ ]
+ )
+
+ self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ self.upp = math.prod(upsample_rates)
+ self.lrelu_slope = LRELU_SLOPE
+
+ def forward(
+ self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None
+ ):
+ har_source, _, _ = self.m_source(f0, self.upp)
+ har_source = har_source.transpose(1, 2)
+ # new tensor
+ x = self.conv_pre(x)
+
+ if g is not None:
+ # in-place call
+ x += self.cond(g)
+
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
+ # in-place call
+ x = torch.nn.functional.leaky_relu_(x, self.lrelu_slope)
+
+ # Apply upsampling layer
+ if self.training and self.checkpointing:
+ x = checkpoint(ups, x, use_reentrant=False)
+ else:
+ x = ups(x)
+
+ # Add noise excitation
+ x += noise_convs(har_source)
+
+ # Apply residual blocks
+ def resblock_forward(x, blocks):
+ return sum(block(x) for block in blocks) / len(blocks)
+
+ blocks = self.resblocks[i * self.num_kernels : (i + 1) * self.num_kernels]
+
+ # Checkpoint or regular computation for ResBlocks
+ if self.training and self.checkpointing:
+ x = checkpoint(resblock_forward, x, blocks, use_reentrant=False)
+ else:
+ x = resblock_forward(x, blocks)
+ # in-place call
+ x = torch.nn.functional.leaky_relu_(x)
+ # in-place call
+ x = torch.tanh_(self.conv_post(x))
+
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+ def __prepare_scriptable__(self):
+ for l in self.ups:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ remove_weight_norm(l)
+ return self
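+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): the hyperparameters follow the
+    # 40000.json preset in rvc/configs; the frame count and F0 values are assumptions.
+    gen = HiFiGANNSFGenerator(
+        initial_channel=192,
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        upsample_rates=[10, 10, 2, 2],
+        upsample_initial_channel=512,
+        upsample_kernel_sizes=[16, 16, 4, 4],
+        gin_channels=256,
+        sr=40000,
+        checkpointing=False,
+    )
+    x = torch.randn(1, 192, 32)          # latent frames
+    f0 = torch.full((1, 32), 220.0)      # frame-level F0 in Hz
+    g = torch.randn(1, 256, 1)           # speaker embedding
+    print(gen(x, f0, g).shape)           # (1, 1, 12800)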
diff --git a/rvc/lib/algorithm/generators/refinegan.py b/rvc/lib/algorithm/generators/refinegan.py
new file mode 100644
index 0000000000000000000000000000000000000000..f571cfa55c7ad5cbe0c127b06620e5c1cf57ffb7
--- /dev/null
+++ b/rvc/lib/algorithm/generators/refinegan.py
@@ -0,0 +1,475 @@
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
+from torch.utils.checkpoint import checkpoint
+
+from rvc.lib.algorithm.commons import get_padding
+
+
+class ResBlock(nn.Module):
+ """
+ Residual block with multiple dilated convolutions.
+
+ This block applies a sequence of dilated convolutional layers with Leaky ReLU activation.
+ It's designed to capture information at different scales due to the varying dilation rates.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ kernel_size (int, optional): Kernel size for the convolutional layers. Defaults to 7.
+ dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers. Defaults to (1, 3, 5).
+ leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2.
+ """
+
+ def __init__(
+ self,
+ *,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int = 7,
+ dilation: tuple[int] = (1, 3, 5),
+ leaky_relu_slope: float = 0.2,
+ ):
+ super(ResBlock, self).__init__()
+
+ self.leaky_relu_slope = leaky_relu_slope
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+
+ self.convs1 = nn.ModuleList(
+ [
+ weight_norm(
+ nn.Conv1d(
+ in_channels=in_channels if idx == 0 else out_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=1,
+ dilation=d,
+ padding=get_padding(kernel_size, d),
+ )
+ )
+ for idx, d in enumerate(dilation)
+ ]
+ )
+ self.convs1.apply(self.init_weights)
+
+ self.convs2 = nn.ModuleList(
+ [
+ weight_norm(
+ nn.Conv1d(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=1,
+ dilation=d,
+ padding=get_padding(kernel_size, d),
+ )
+ )
+ for idx, d in enumerate(dilation)
+ ]
+ )
+ self.convs2.apply(self.init_weights)
+
+ def forward(self, x: torch.Tensor):
+ for idx, (c1, c2) in enumerate(zip(self.convs1, self.convs2)):
+ # new tensor
+ xt = F.leaky_relu(x, self.leaky_relu_slope)
+ xt = c1(xt)
+ # in-place call
+ xt = F.leaky_relu_(xt, self.leaky_relu_slope)
+ xt = c2(xt)
+
+ if idx != 0 or self.in_channels == self.out_channels:
+ x = xt + x
+ else:
+ x = xt
+
+ return x
+
+ def remove_parametrizations(self):
+ for c1, c2 in zip(self.convs1, self.convs2):
+            remove_parametrizations(c1, "weight")
+            remove_parametrizations(c2, "weight")
+
+ def init_weights(self, m):
+ if type(m) == nn.Conv1d:
+ m.weight.data.normal_(0, 0.01)
+ m.bias.data.fill_(0.0)
+
+
+class AdaIN(nn.Module):
+ """
+ Adaptive Instance Normalization layer.
+
+ This layer applies a scaling factor to the input based on a learnable weight.
+
+ Args:
+ channels (int): Number of input channels.
+ leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation applied after scaling. Defaults to 0.2.
+ """
+
+ def __init__(
+ self,
+ *,
+ channels: int,
+ leaky_relu_slope: float = 0.2,
+ ):
+ super().__init__()
+
+ self.weight = nn.Parameter(torch.ones(channels))
+ # safe to use in-place as it is used on a new x+gaussian tensor
+ self.activation = nn.LeakyReLU(leaky_relu_slope, inplace=True)
+
+ def forward(self, x: torch.Tensor):
+ gaussian = torch.randn_like(x) * self.weight[None, :, None]
+
+ return self.activation(x + gaussian)
+
+
+class ParallelResBlock(nn.Module):
+ """
+ Parallel residual block that applies multiple residual blocks with different kernel sizes in parallel.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ kernel_sizes (tuple[int], optional): Tuple of kernel sizes for the parallel residual blocks. Defaults to (3, 7, 11).
+ dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers within the residual blocks. Defaults to (1, 3, 5).
+ leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2.
+ """
+
+ def __init__(
+ self,
+ *,
+ in_channels: int,
+ out_channels: int,
+ kernel_sizes: tuple[int] = (3, 7, 11),
+ dilation: tuple[int] = (1, 3, 5),
+ leaky_relu_slope: float = 0.2,
+ ):
+ super().__init__()
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+
+ self.input_conv = nn.Conv1d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ )
+
+ self.blocks = nn.ModuleList(
+ [
+ nn.Sequential(
+ AdaIN(channels=out_channels),
+ ResBlock(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ dilation=dilation,
+ leaky_relu_slope=leaky_relu_slope,
+ ),
+ AdaIN(channels=out_channels),
+ )
+ for kernel_size in kernel_sizes
+ ]
+ )
+
+ def forward(self, x: torch.Tensor):
+ x = self.input_conv(x)
+
+ results = [block(x) for block in self.blocks]
+
+ return torch.mean(torch.stack(results), dim=0)
+
+ def remove_parametrizations(self):
+ for block in self.blocks:
+ block[1].remove_parametrizations()
+
+
+class SineGenerator(nn.Module):
+ """
+ Definition of sine generator
+
+ Generates sine waveforms with optional harmonics and additive noise.
+ Can be used to create harmonic noise source for neural vocoders.
+
+ Args:
+ samp_rate (int): Sampling rate in Hz.
+ harmonic_num (int): Number of harmonic overtones (default 0).
+ sine_amp (float): Amplitude of sine-waveform (default 0.1).
+ noise_std (float): Standard deviation of Gaussian noise (default 0.003).
+ voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0).
+ """
+
+ def __init__(
+ self,
+ samp_rate,
+ harmonic_num=0,
+ sine_amp=0.1,
+ noise_std=0.003,
+ voiced_threshold=0,
+ ):
+ super(SineGenerator, self).__init__()
+ self.sine_amp = sine_amp
+ self.noise_std = noise_std
+ self.harmonic_num = harmonic_num
+ self.dim = self.harmonic_num + 1
+ self.sampling_rate = samp_rate
+ self.voiced_threshold = voiced_threshold
+
+ self.merge = nn.Sequential(
+ nn.Linear(self.dim, 1, bias=False),
+ nn.Tanh(),
+ )
+
+ def _f02uv(self, f0):
+ # generate uv signal
+ uv = torch.ones_like(f0)
+ uv = uv * (f0 > self.voiced_threshold)
+ return uv
+
+ def _f02sine(self, f0_values):
+ """f0_values: (batchsize, length, dim)
+ where dim indicates fundamental tone and overtones
+ """
+ # convert to F0 in rad. The integer part n can be ignored
+ # because 2 * np.pi * n doesn't affect phase
+ rad_values = (f0_values / self.sampling_rate) % 1
+
+ # initial phase noise (no noise for fundamental component)
+ rand_ini = torch.rand(
+ f0_values.shape[0], f0_values.shape[2], device=f0_values.device
+ )
+ rand_ini[:, 0] = 0
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase: sine[t] = sin(2 * pi * sum_{i=1}^{t} rad)
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+ cumsum_shift = torch.zeros_like(rad_values)
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+ sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
+
+ return sines
+
+ def forward(self, f0):
+ with torch.no_grad():
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+ # fundamental component
+ f0_buf[:, :, 0] = f0[:, :, 0]
+ for idx in np.arange(self.harmonic_num):
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
+
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
+
+ uv = self._f02uv(f0)
+
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+ noise = noise_amp * torch.randn_like(sine_waves)
+
+ sine_waves = sine_waves * uv + noise
+ # correct DC offset
+ sine_waves = sine_waves - sine_waves.mean(dim=1, keepdim=True)
+ # merge with grad
+ return self.merge(sine_waves)
+
+
+class RefineGANGenerator(nn.Module):
+ """
+ RefineGAN generator for audio synthesis.
+
+ This generator uses a combination of downsampling, residual blocks, and parallel residual blocks
+ to refine an input mel-spectrogram and fundamental frequency (F0) into an audio waveform.
+ It can also incorporate global conditioning.
+
+ Args:
+ sample_rate (int, optional): Sampling rate of the audio. Defaults to 44100.
+ downsample_rates (tuple[int], optional): Downsampling rates for the downsampling blocks. Defaults to (2, 2, 8, 8).
+ upsample_rates (tuple[int], optional): Upsampling rates for the upsampling blocks. Defaults to (8, 8, 2, 2).
+ leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2.
+ num_mels (int, optional): Number of mel-frequency bins in the input mel-spectrogram. Defaults to 128.
+ start_channels (int, optional): Number of channels in the initial convolutional layer. Defaults to 16.
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 256.
+        checkpointing (bool, optional): Whether to use checkpointing for memory efficiency. Defaults to False.
+        upsample_initial_channel (int, optional): Number of channels entering the first upsampling block (the mel and source branches each contribute half). Defaults to 512.
+ """
+
+ def __init__(
+ self,
+ *,
+ sample_rate: int = 44100,
+ downsample_rates: tuple[int] = (2, 2, 8, 8),
+ upsample_rates: tuple[int] = (8, 8, 2, 2),
+ leaky_relu_slope: float = 0.2,
+ num_mels: int = 128,
+ start_channels: int = 16,
+ gin_channels: int = 256,
+ checkpointing: bool = False,
+ upsample_initial_channel=512,
+ ):
+ super().__init__()
+
+ self.upsample_rates = upsample_rates
+ self.leaky_relu_slope = leaky_relu_slope
+ self.checkpointing = checkpointing
+
+ self.upp = np.prod(upsample_rates)
+ self.m_source = SineGenerator(sample_rate)
+
+ # expanded f0 sinegen -> match mel_conv
+ self.pre_conv = weight_norm(
+ nn.Conv1d(
+ in_channels=1,
+ out_channels=upsample_initial_channel // 2,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ bias=False,
+ )
+ )
+
+ stride_f0s = [
+ math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
+ for i in range(len(upsample_rates))
+ ]
+
+ channels = upsample_initial_channel
+
+ self.downsample_blocks = nn.ModuleList([])
+ for i, u in enumerate(upsample_rates):
+ # handling odd upsampling rates
+ stride = stride_f0s[i]
+ kernel = 1 if stride == 1 else stride * 2 - stride % 2
+ padding = 0 if stride == 1 else (kernel - stride) // 2
+
+ # f0 input gets upscaled to full segment size, then downscaled back to match each upscale step
+
+ self.downsample_blocks.append(
+ nn.Conv1d(
+ in_channels=1,
+ out_channels=channels // 2 ** (i + 2),
+ kernel_size=kernel,
+ stride=stride,
+ padding=padding,
+ )
+ )
+
+ self.mel_conv = weight_norm(
+ nn.Conv1d(
+ in_channels=num_mels,
+ out_channels=channels // 2,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ )
+ )
+
+ if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, channels // 2, 1)
+
+ self.upsample_blocks = nn.ModuleList([])
+ self.upsample_conv_blocks = nn.ModuleList([])
+ self.filters = nn.ModuleList([])
+
+ for rate in upsample_rates:
+ new_channels = channels // 2
+
+ self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear"))
+
+ low_pass = nn.Conv1d(
+ channels,
+ channels,
+ kernel_size=15,
+ padding=7,
+ groups=channels,
+ bias=False,
+ )
+
+ low_pass.weight.data.fill_(1.0 / 15)
+
+ self.filters.append(low_pass)
+
+ self.upsample_conv_blocks.append(
+ ParallelResBlock(
+ in_channels=channels + channels // 4,
+ out_channels=new_channels,
+ kernel_sizes=(3, 7, 11),
+ dilation=(1, 3, 5),
+ leaky_relu_slope=leaky_relu_slope,
+ )
+ )
+
+ channels = new_channels
+
+ self.conv_post = weight_norm(
+ nn.Conv1d(
+ in_channels=channels,
+ out_channels=1,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ )
+ )
+
+ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None):
+
+ f0 = F.interpolate(
+ f0.unsqueeze(1), size=mel.shape[-1] * self.upp, mode="linear"
+ )
+ har_source = self.m_source(f0.transpose(1, 2)).transpose(1, 2)
+
+ x = self.pre_conv(har_source)
+ x = F.interpolate(x, size=mel.shape[-1], mode="linear")
+ # expanding spectrogram from 192 to 256 channels
+ mel = self.mel_conv(mel)
+
+ if g is not None:
+ # adding expanded speaker embedding
+ mel += self.cond(g)
+ x = torch.cat([mel, x], dim=1)
+
+ for ups, res, down, flt in zip(
+ self.upsample_blocks,
+ self.upsample_conv_blocks,
+ self.downsample_blocks,
+ self.filters,
+ ):
+ # in-place call
+ x = F.leaky_relu_(x, self.leaky_relu_slope)
+
+ if self.training and self.checkpointing:
+ x = checkpoint(ups, x, use_reentrant=False)
+ x = checkpoint(flt, x, use_reentrant=False)
+ x = torch.cat([x, down(har_source)], dim=1)
+ x = checkpoint(res, x, use_reentrant=False)
+ else:
+ x = ups(x)
+ x = flt(x)
+ x = torch.cat([x, down(har_source)], dim=1)
+ x = res(x)
+
+ # in-place call
+ x = F.leaky_relu_(x, self.leaky_relu_slope)
+ x = self.conv_post(x)
+ # in-place call
+ x = torch.tanh_(x)
+
+ return x
+
+    def remove_parametrizations(self):
+        remove_parametrizations(self.pre_conv, "weight")
+        remove_parametrizations(self.mel_conv, "weight")
+        remove_parametrizations(self.conv_post, "weight")
+
+        # The downsample_blocks are plain Conv1d layers without weight_norm,
+        # so only the parallel residual blocks carry parametrizations.
+        for block in self.upsample_conv_blocks:
+            block.remove_parametrizations()
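+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): the constructor defaults are used and
+    # the mel, F0 and speaker-embedding shapes below are assumptions.
+    gen = RefineGANGenerator(sample_rate=44100, num_mels=128, gin_channels=256)
+    mel = torch.randn(1, 128, 32)        # (batch, num_mels, frames)
+    f0 = torch.full((1, 32), 220.0)      # frame-level F0 in Hz
+    g = torch.randn(1, 256, 1)           # speaker embedding
+    print(gen(mel, f0, g).shape)         # (1, 1, 32 * prod(upsample_rates)) = (1, 1, 8192)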
diff --git a/rvc/lib/algorithm/modules.py b/rvc/lib/algorithm/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..611c45d3830051923effd0319f66d39e3126d4d2
--- /dev/null
+++ b/rvc/lib/algorithm/modules.py
@@ -0,0 +1,117 @@
+import torch
+from rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply
+
+
+class WaveNet(torch.nn.Module):
+ """
+ WaveNet residual blocks as used in WaveGlow.
+
+ Args:
+ hidden_channels (int): Number of hidden channels.
+ kernel_size (int): Size of the convolutional kernel.
+ dilation_rate (int): Dilation rate of the convolution.
+ n_layers (int): Number of convolutional layers.
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ hidden_channels: int,
+ kernel_size: int,
+ dilation_rate,
+ n_layers: int,
+ gin_channels: int = 0,
+ p_dropout: int = 0,
+ ):
+ super().__init__()
+ assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding."
+
+ self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+ self.p_dropout = p_dropout
+ self.n_channels_tensor = torch.IntTensor([hidden_channels]) # Static tensor
+
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.drop = torch.nn.Dropout(p_dropout)
+
+ # Conditional layer for global conditioning
+ if gin_channels:
+ self.cond_layer = torch.nn.utils.parametrizations.weight_norm(
+ torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
+ name="weight",
+ )
+
+ # Precompute dilations and paddings
+ dilations = [dilation_rate**i for i in range(n_layers)]
+ paddings = [(kernel_size * d - d) // 2 for d in dilations]
+
+ # Initialize layers
+ for i in range(n_layers):
+ self.in_layers.append(
+ torch.nn.utils.parametrizations.weight_norm(
+ torch.nn.Conv1d(
+ hidden_channels,
+ 2 * hidden_channels,
+ kernel_size,
+ dilation=dilations[i],
+ padding=paddings[i],
+ ),
+ name="weight",
+ )
+ )
+
+ res_skip_channels = (
+ hidden_channels if i == n_layers - 1 else 2 * hidden_channels
+ )
+ self.res_skip_layers.append(
+ torch.nn.utils.parametrizations.weight_norm(
+ torch.nn.Conv1d(hidden_channels, res_skip_channels, 1),
+ name="weight",
+ )
+ )
+
+ def forward(self, x, x_mask, g=None):
+ output = x.clone().zero_()
+
+ # Apply conditional layer if global conditioning is provided
+ g = self.cond_layer(g) if g is not None else None
+
+ for i in range(self.n_layers):
+ x_in = self.in_layers[i](x)
+ g_l = (
+ g[
+ :,
+ i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels,
+ :,
+ ]
+ if g is not None
+ else 0
+ )
+
+ # Activation with fused Tanh-Sigmoid
+ acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor)
+ acts = self.drop(acts)
+
+ # Residual and skip connections
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
+ x = (x + res_acts) * x_mask
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
+ else:
+ output = output + res_skip_acts
+
+ return output * x_mask
+
+ def remove_weight_norm(self):
+ if self.gin_channels:
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
+ for layer in self.in_layers:
+ torch.nn.utils.remove_weight_norm(layer)
+ for layer in self.res_skip_layers:
+ torch.nn.utils.remove_weight_norm(layer)
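+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): channel counts, layer count and
+    # sequence length are assumptions, not values taken from a shipped config.
+    wn = WaveNet(
+        hidden_channels=192, kernel_size=5, dilation_rate=1, n_layers=16, gin_channels=256
+    )
+    x = torch.randn(2, 192, 100)
+    x_mask = torch.ones(2, 1, 100)
+    g = torch.randn(2, 256, 1)
+    print(wn(x, x_mask, g=g).shape)  # (2, 192, 100)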
diff --git a/rvc/lib/algorithm/normalization.py b/rvc/lib/algorithm/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..94a29bac9fbe9bce34f7a3fe253b8f231fcec78a
--- /dev/null
+++ b/rvc/lib/algorithm/normalization.py
@@ -0,0 +1,26 @@
+import torch
+
+
+class LayerNorm(torch.nn.Module):
+ """
+ Layer normalization module.
+
+ Args:
+ channels (int): Number of channels.
+ eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5.
+ """
+
+ def __init__(self, channels: int, eps: float = 1e-5):
+ super().__init__()
+ self.eps = eps
+ self.gamma = torch.nn.Parameter(torch.ones(channels))
+ self.beta = torch.nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ # Transpose to (batch_size, time_steps, channels) for layer_norm
+ x = x.transpose(1, -1)
+ x = torch.nn.functional.layer_norm(
+ x, (x.size(-1),), self.gamma, self.beta, self.eps
+ )
+ # Transpose back to (batch_size, channels, time_steps)
+ return x.transpose(1, -1)
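+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): shows that normalization is applied
+    # over the channel dimension of a (batch, channels, time) tensor; the shapes
+    # here are assumptions.
+    ln = LayerNorm(192)
+    x = torch.randn(2, 192, 100)
+    y = ln(x)
+    print(y.shape, y.mean(dim=1).abs().max())  # (2, 192, 100), approximately 0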
diff --git a/rvc/lib/algorithm/residuals.py b/rvc/lib/algorithm/residuals.py
new file mode 100644
index 0000000000000000000000000000000000000000..df7d544de11b022c3882e649bb688b407f9f86c2
--- /dev/null
+++ b/rvc/lib/algorithm/residuals.py
@@ -0,0 +1,267 @@
+import torch
+from itertools import chain
+from typing import Optional, Tuple
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils.parametrizations import weight_norm
+
+from rvc.lib.algorithm.modules import WaveNet
+from rvc.lib.algorithm.commons import get_padding, init_weights
+
+LRELU_SLOPE = 0.1
+
+
+def create_conv1d_layer(channels, kernel_size, dilation):
+ return weight_norm(
+ torch.nn.Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation,
+ padding=get_padding(kernel_size, dilation),
+ )
+ )
+
+
+def apply_mask(tensor: torch.Tensor, mask: Optional[torch.Tensor]):
+    return tensor * mask if mask is not None else tensor
+
+
+def apply_mask_(tensor: torch.Tensor, mask: Optional[torch.Tensor]):
+    return tensor.mul_(mask) if mask is not None else tensor
+
+
+class ResBlock(torch.nn.Module):
+ """
+ A residual block module that applies a series of 1D convolutional layers with residual connections.
+ """
+
+ def __init__(
+ self, channels: int, kernel_size: int = 3, dilations: Tuple[int] = (1, 3, 5)
+ ):
+ """
+ Initializes the ResBlock.
+
+ Args:
+ channels (int): Number of input and output channels for the convolution layers.
+ kernel_size (int): Size of the convolution kernel. Defaults to 3.
+ dilations (Tuple[int]): Tuple of dilation rates for the convolution layers in the first set.
+ """
+ super().__init__()
+ # Create convolutional layers with specified dilations and initialize weights
+ self.convs1 = self._create_convs(channels, kernel_size, dilations)
+ self.convs2 = self._create_convs(channels, kernel_size, [1] * len(dilations))
+
+ @staticmethod
+ def _create_convs(channels: int, kernel_size: int, dilations: Tuple[int]):
+ """
+ Creates a list of 1D convolutional layers with specified dilations.
+
+ Args:
+ channels (int): Number of input and output channels for the convolution layers.
+ kernel_size (int): Size of the convolution kernel.
+ dilations (Tuple[int]): Tuple of dilation rates for each convolution layer.
+ """
+ layers = torch.nn.ModuleList(
+ [create_conv1d_layer(channels, kernel_size, d) for d in dilations]
+ )
+ layers.apply(init_weights)
+ return layers
+
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None):
+ for conv1, conv2 in zip(self.convs1, self.convs2):
+ x_residual = x
+ # new tensor
+ x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
+ # in-place call
+ x = apply_mask_(x, x_mask)
+ # in-place call
+ x = torch.nn.functional.leaky_relu_(conv1(x), LRELU_SLOPE)
+ # in-place call
+ x = apply_mask_(x, x_mask)
+ x = conv2(x)
+ # in-place call
+ x += x_residual
+ # in-place call
+ return apply_mask_(x, x_mask)
+
+ def remove_weight_norm(self):
+ for conv in chain(self.convs1, self.convs2):
+ remove_weight_norm(conv)
+
+
+class Flip(torch.nn.Module):
+ """
+ Flip module for flow-based models.
+
+ This module flips the input along the time dimension.
+ """
+
+ def forward(self, x, *args, reverse=False, **kwargs):
+ x = torch.flip(x, [1])
+ if not reverse:
+ logdet = torch.zeros(x.size(0), dtype=x.dtype, device=x.device)
+ return x, logdet
+ else:
+ return x
+
+
+class ResidualCouplingBlock(torch.nn.Module):
+ """
+ Residual Coupling Block for normalizing flow.
+
+ Args:
+ channels (int): Number of channels in the input.
+ hidden_channels (int): Number of hidden channels in the coupling layer.
+ kernel_size (int): Kernel size of the convolutional layers.
+ dilation_rate (int): Dilation rate of the convolutional layers.
+ n_layers (int): Number of layers in the coupling layer.
+ n_flows (int, optional): Number of coupling layers in the block. Defaults to 4.
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ dilation_rate: int,
+ n_layers: int,
+ n_flows: int = 4,
+ gin_channels: int = 0,
+ ):
+ super(ResidualCouplingBlock, self).__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.flows = torch.nn.ModuleList()
+ for _ in range(n_flows):
+ self.flows.append(
+ ResidualCouplingLayer(
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ mean_only=True,
+ )
+ )
+ self.flows.append(Flip())
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ x_mask: torch.Tensor,
+ g: Optional[torch.Tensor] = None,
+ reverse: bool = False,
+ ):
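+        # forward: apply the flows in order; reverse: invert them in reverse order (inference path)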
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow.forward(x, x_mask, g=g, reverse=reverse)
+ return x
+
+ def remove_weight_norm(self):
+ for i in range(self.n_flows):
+ self.flows[i * 2].remove_weight_norm()
+
+ def __prepare_scriptable__(self):
+ for i in range(self.n_flows):
+ for hook in self.flows[i * 2]._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.flows[i * 2])
+
+ return self
+
+
+class ResidualCouplingLayer(torch.nn.Module):
+ """
+ Residual coupling layer for flow-based models.
+
+ Args:
+ channels (int): Number of channels.
+ hidden_channels (int): Number of hidden channels.
+ kernel_size (int): Size of the convolutional kernel.
+ dilation_rate (int): Dilation rate of the convolution.
+ n_layers (int): Number of convolutional layers.
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
+ mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False.
+ """
+
+ def __init__(
+ self,
+ channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ dilation_rate: int,
+ n_layers: int,
+ p_dropout: float = 0,
+ gin_channels: int = 0,
+ mean_only: bool = False,
+ ):
+ assert channels % 2 == 0, "channels should be divisible by 2"
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.half_channels = channels // 2
+ self.mean_only = mean_only
+
+ self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
+ self.enc = WaveNet(
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ p_dropout=p_dropout,
+ gin_channels=gin_channels,
+ )
+ self.post = torch.nn.Conv1d(
+ hidden_channels, self.half_channels * (2 - mean_only), 1
+ )
+ self.post.weight.data.zero_()
+ self.post.bias.data.zero_()
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ x_mask: torch.Tensor,
+ g: Optional[torch.Tensor] = None,
+ reverse: bool = False,
+ ):
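+        # coupling: the first half of the channels parameterizes a mean/log-scale transform of the second half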
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+ h = self.pre(x0) * x_mask
+ h = self.enc(h, x_mask, g=g)
+ stats = self.post(h) * x_mask
+ if not self.mean_only:
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+ else:
+ m = stats
+ logs = torch.zeros_like(m)
+
+ if not reverse:
+ x1 = m + x1 * torch.exp(logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ logdet = torch.sum(logs, [1, 2])
+ return x, logdet
+ else:
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ return x
+
+ def remove_weight_norm(self):
+ self.enc.remove_weight_norm()
diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f311e27deed5dd9b56027de634a91106827f2e53
--- /dev/null
+++ b/rvc/lib/algorithm/synthesizers.py
@@ -0,0 +1,244 @@
+import torch
+from typing import Optional
+from rvc.lib.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator
+from rvc.lib.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator
+from rvc.lib.algorithm.generators.hifigan import HiFiGANGenerator
+from rvc.lib.algorithm.generators.refinegan import RefineGANGenerator
+from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments
+from rvc.lib.algorithm.residuals import ResidualCouplingBlock
+from rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder
+
+
+class Synthesizer(torch.nn.Module):
+ """
+ Base Synthesizer model.
+
+ Args:
+ spec_channels (int): Number of channels in the spectrogram.
+ segment_size (int): Size of the audio segment.
+ inter_channels (int): Number of channels in the intermediate layers.
+ hidden_channels (int): Number of channels in the hidden layers.
+ filter_channels (int): Number of channels in the filter layers.
+ n_heads (int): Number of attention heads.
+ n_layers (int): Number of layers in the encoder.
+ kernel_size (int): Size of the convolution kernel.
+ p_dropout (float): Dropout probability.
+ resblock (str): Type of residual block.
+ resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
+ resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
+ upsample_rates (list): Upsampling rates for the decoder.
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
+ upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
+ spk_embed_dim (int): Dimension of the speaker embedding.
+ gin_channels (int): Number of channels in the global conditioning vector.
+ sr (int): Sampling rate of the audio.
+ use_f0 (bool): Whether to use F0 information.
+ text_enc_hidden_dim (int): Hidden dimension for the text encoder.
+        vocoder (str): Vocoder type ("HiFi-GAN", "MRF HiFi-GAN" or "RefineGAN"). Defaults to "HiFi-GAN".
+        randomized (bool): Whether to train on random segment_size slices of the latent. Defaults to True.
+        checkpointing (bool): Whether to use gradient checkpointing in the decoder. Defaults to False.
+        kwargs: Additional keyword arguments.
+ """
+
+ def __init__(
+ self,
+ spec_channels: int,
+ segment_size: int,
+ inter_channels: int,
+ hidden_channels: int,
+ filter_channels: int,
+ n_heads: int,
+ n_layers: int,
+ kernel_size: int,
+ p_dropout: float,
+ resblock: str,
+ resblock_kernel_sizes: list,
+ resblock_dilation_sizes: list,
+ upsample_rates: list,
+ upsample_initial_channel: int,
+ upsample_kernel_sizes: list,
+ spk_embed_dim: int,
+ gin_channels: int,
+ sr: int,
+ use_f0: bool,
+ text_enc_hidden_dim: int = 768,
+ vocoder: str = "HiFi-GAN",
+ randomized: bool = True,
+ checkpointing: bool = False,
+ **kwargs,
+ ):
+ super().__init__()
+ self.segment_size = segment_size
+ self.use_f0 = use_f0
+ self.randomized = randomized
+
+ self.enc_p = TextEncoder(
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ text_enc_hidden_dim,
+ f0=use_f0,
+ )
+ print(f"Using {vocoder} vocoder")
+ if use_f0:
+ if vocoder == "MRF HiFi-GAN":
+ self.dec = HiFiGANMRFGenerator(
+ in_channel=inter_channels,
+ upsample_initial_channel=upsample_initial_channel,
+ upsample_rates=upsample_rates,
+ upsample_kernel_sizes=upsample_kernel_sizes,
+ resblock_kernel_sizes=resblock_kernel_sizes,
+ resblock_dilations=resblock_dilation_sizes,
+ gin_channels=gin_channels,
+ sample_rate=sr,
+ harmonic_num=8,
+ checkpointing=checkpointing,
+ )
+ elif vocoder == "RefineGAN":
+ self.dec = RefineGANGenerator(
+ sample_rate=sr,
+ downsample_rates=upsample_rates[::-1],
+ upsample_rates=upsample_rates,
+ start_channels=16,
+ num_mels=inter_channels,
+ checkpointing=checkpointing,
+ )
+ else:
+ self.dec = HiFiGANNSFGenerator(
+ inter_channels,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ sr=sr,
+ checkpointing=checkpointing,
+ )
+ else:
+ if vocoder == "MRF HiFi-GAN":
+ print("MRF HiFi-GAN does not support training without pitch guidance.")
+ self.dec = None
+ elif vocoder == "RefineGAN":
+ print("RefineGAN does not support training without pitch guidance.")
+ self.dec = None
+ else:
+ self.dec = HiFiGANGenerator(
+ inter_channels,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ checkpointing=checkpointing,
+ )
+ self.enc_q = PosteriorEncoder(
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=gin_channels,
+ )
+ self.flow = ResidualCouplingBlock(
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 3,
+ gin_channels=gin_channels,
+ )
+ self.emb_g = torch.nn.Embedding(spk_embed_dim, gin_channels)
+
+    def _remove_weight_norm_from(self, module):
+        for hook in list(module._forward_pre_hooks.values()):
+            if hook.__class__.__name__ == "WeightNorm":
+                torch.nn.utils.remove_weight_norm(module)
+
+    def remove_weight_norm(self):
+        for module in [self.dec, self.flow, self.enc_q]:
+            if module is not None:
+                self._remove_weight_norm_from(module)
+
+ def __prepare_scriptable__(self):
+ self.remove_weight_norm()
+ return self
+
+ def forward(
+ self,
+ phone: torch.Tensor,
+ phone_lengths: torch.Tensor,
+ pitch: Optional[torch.Tensor] = None,
+ pitchf: Optional[torch.Tensor] = None,
+ y: Optional[torch.Tensor] = None,
+ y_lengths: Optional[torch.Tensor] = None,
+ ds: Optional[torch.Tensor] = None,
+ ):
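+        # ds: speaker id -> global conditioning embedding of shape (B, gin_channels, 1)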
+ g = self.emb_g(ds).unsqueeze(-1)
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+
+ if y is not None:
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+ z_p = self.flow(z, y_mask, g=g)
+ # regular old training method using random slices
+ if self.randomized:
+ z_slice, ids_slice = rand_slice_segments(
+ z, y_lengths, self.segment_size
+ )
+ if self.use_f0:
+ pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
+ o = self.dec(z_slice, pitchf, g=g)
+ else:
+ o = self.dec(z_slice, g=g)
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+ # future use for finetuning using the entire dataset each pass
+ else:
+ if self.use_f0:
+ o = self.dec(z, pitchf, g=g)
+ else:
+ o = self.dec(z, g=g)
+ return o, None, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+ else:
+ return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)
+
+ @torch.jit.export
+ def infer(
+ self,
+ phone: torch.Tensor,
+ phone_lengths: torch.Tensor,
+ pitch: Optional[torch.Tensor] = None,
+ nsff0: Optional[torch.Tensor] = None,
+        sid: Optional[torch.Tensor] = None,
+ rate: Optional[torch.Tensor] = None,
+ ):
+ """
+ Inference of the model.
+
+ Args:
+ phone (torch.Tensor): Phoneme sequence.
+ phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
+ pitch (torch.Tensor, optional): Pitch sequence.
+ nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
+ sid (torch.Tensor): Speaker embedding.
+            rate (torch.Tensor, optional): Fraction of the sequence tail to synthesize; the leading (1 - rate) portion is skipped.
+ """
+ g = self.emb_g(sid).unsqueeze(-1)
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
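+        # sample the prior with reduced temperature (std scaled by ~0.667) before inverting the flow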
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+
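+        # partial synthesis: keep only the trailing rate fraction of frames, dropping the head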
+ if rate is not None:
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
+ z_p, x_mask = z_p[:, :, head:], x_mask[:, :, head:]
+ if self.use_f0 and nsff0 is not None:
+ nsff0 = nsff0[:, head:]
+
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
+ o = (
+ self.dec(z * x_mask, nsff0, g=g)
+ if self.use_f0
+ else self.dec(z * x_mask, g=g)
+ )
+
+ return o, x_mask, (z, z_p, m_p, logs_p)
diff --git a/rvc/lib/predictors/F0Extractor.py b/rvc/lib/predictors/F0Extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..71065c1fc120c0812d464bc0b1f86a79e60d716b
--- /dev/null
+++ b/rvc/lib/predictors/F0Extractor.py
@@ -0,0 +1,99 @@
+import dataclasses
+import pathlib
+import libf0
+import librosa
+import numpy as np
+import resampy
+import torch
+import torchcrepe
+import torchfcpe
+import os
+
+# from tools.anyf0.rmvpe import RMVPE
+from rvc.lib.predictors.RMVPE import RMVPE0Predictor
+from rvc.configs.config import Config
+
+config = Config()
+
+
+@dataclasses.dataclass
+class F0Extractor:
+ wav_path: pathlib.Path
+ sample_rate: int = 44100
+ hop_length: int = 512
+ f0_min: int = 50
+ f0_max: int = 1600
+ method: str = "rmvpe"
+ x: np.ndarray = dataclasses.field(init=False)
+
+ def __post_init__(self):
+ self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)
+
+ @property
+ def hop_size(self):
+ return self.hop_length / self.sample_rate
+
+ @property
+ def wav16k(self):
+ return resampy.resample(self.x, self.sample_rate, 16000)
+
+ def extract_f0(self):
+ f0 = None
+ method = self.method
+ if method == "crepe":
+ wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device)
+ f0 = torchcrepe.predict(
+ wav16k_torch,
+ sample_rate=16000,
+ hop_length=160,
+ batch_size=512,
+ fmin=self.f0_min,
+ fmax=self.f0_max,
+ device=config.device,
+ )
+ f0 = f0[0].cpu().numpy()
+ elif method == "fcpe":
+ audio = librosa.to_mono(self.x)
+ audio_length = len(audio)
+ f0_target_length = (audio_length // self.hop_length) + 1
+ audio = (
+ torch.from_numpy(audio)
+ .float()
+ .unsqueeze(0)
+ .unsqueeze(-1)
+ .to(config.device)
+ )
+ model = torchfcpe.spawn_bundled_infer_model(device=config.device)
+
+ f0 = model.infer(
+ audio,
+ sr=self.sample_rate,
+ decoder_mode="local_argmax",
+ threshold=0.006,
+ f0_min=self.f0_min,
+ f0_max=self.f0_max,
+ interp_uv=False,
+ output_interp_target_length=f0_target_length,
+ )
+ f0 = f0.squeeze().cpu().numpy()
+ elif method == "rmvpe":
+ model_rmvpe = RMVPE0Predictor(
+ os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
+ device=config.device,
+ # hop_length=80
+ )
+ f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)
+
+ else:
+ raise ValueError(f"Unknown method: {self.method}")
+ return libf0.hz_to_cents(f0, librosa.midi_to_hz(0))
+
+ def plot_f0(self, f0):
+ from matplotlib import pyplot as plt
+
+ plt.figure(figsize=(10, 4))
+ plt.plot(f0)
+ plt.title(self.method)
+ plt.xlabel("Time (frames)")
+ plt.ylabel("F0 (cents)")
+ plt.show()
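+
+
+# Illustrative usage (hypothetical path; "rmvpe" expects rvc/models/predictors/rmvpe.pt to exist):
+#   f0_cents = F0Extractor("voice.wav", method="rmvpe").extract_f0()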
diff --git a/rvc/lib/predictors/FCPE.py b/rvc/lib/predictors/FCPE.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6a00ca18fe2e360aaf4fa6e545added815bacf2
--- /dev/null
+++ b/rvc/lib/predictors/FCPE.py
@@ -0,0 +1,918 @@
+from typing import Union
+
+import torch.nn.functional as F
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.utils.parametrizations import weight_norm
+from torchaudio.transforms import Resample
+import os
+import librosa
+import soundfile as sf
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+import math
+from functools import partial
+
+from einops import rearrange, repeat
+from local_attention import LocalAttention
+
+os.environ["LRU_CACHE_CAPACITY"] = "3"
+
+
+def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
+ """Loads wav file to torch tensor."""
+ try:
+ data, sample_rate = sf.read(full_path, always_2d=True)
+ except Exception as error:
+ print(f"An error occurred loading {full_path}: {error}")
+ if return_empty_on_exception:
+            return [], target_sr or 48000
+ else:
+ raise
+
+ data = data[:, 0] if len(data.shape) > 1 else data
+ assert len(data) > 2
+
+ # Normalize data
+ max_mag = (
+ -np.iinfo(data.dtype).min
+ if np.issubdtype(data.dtype, np.integer)
+ else max(np.amax(data), -np.amin(data))
+ )
+ max_mag = (
+ (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
+ )
+ data = torch.FloatTensor(data.astype(np.float32)) / max_mag
+
+ # Handle exceptions and resample
+ if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
+ return [], sample_rate or target_sr or 48000
+ if target_sr is not None and sample_rate != target_sr:
+ data = torch.from_numpy(
+ librosa.core.resample(
+ data.numpy(), orig_sr=sample_rate, target_sr=target_sr
+ )
+ )
+ sample_rate = target_sr
+
+ return data, sample_rate
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+ return np.exp(x) / C
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+ return torch.exp(x) / C
+
+
+class STFT:
+ def __init__(
+ self,
+ sr=22050,
+ n_mels=80,
+ n_fft=1024,
+ win_size=1024,
+ hop_length=256,
+ fmin=20,
+ fmax=11025,
+ clip_val=1e-5,
+ ):
+ self.target_sr = sr
+ self.n_mels = n_mels
+ self.n_fft = n_fft
+ self.win_size = win_size
+ self.hop_length = hop_length
+ self.fmin = fmin
+ self.fmax = fmax
+ self.clip_val = clip_val
+ self.mel_basis = {}
+ self.hann_window = {}
+
+ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
+ sample_rate = self.target_sr
+ n_mels = self.n_mels
+ n_fft = self.n_fft
+ win_size = self.win_size
+ hop_length = self.hop_length
+ fmin = self.fmin
+ fmax = self.fmax
+ clip_val = self.clip_val
+
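+        # a key shift of k semitones scales the FFT/window sizes by 2^(k/12); speed scales the hop length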
+ factor = 2 ** (keyshift / 12)
+ n_fft_new = int(np.round(n_fft * factor))
+ win_size_new = int(np.round(win_size * factor))
+ hop_length_new = int(np.round(hop_length * speed))
+
+ # Optimize mel_basis and hann_window caching
+ mel_basis = self.mel_basis if not train else {}
+ hann_window = self.hann_window if not train else {}
+
+ mel_basis_key = str(fmax) + "_" + str(y.device)
+ if mel_basis_key not in mel_basis:
+ mel = librosa_mel_fn(
+ sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
+ )
+ mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
+
+ keyshift_key = str(keyshift) + "_" + str(y.device)
+ if keyshift_key not in hann_window:
+ hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
+
+ # Padding and STFT
+ pad_left = (win_size_new - hop_length_new) // 2
+ pad_right = max(
+ (win_size_new - hop_length_new + 1) // 2,
+ win_size_new - y.size(-1) - pad_left,
+ )
+ mode = "reflect" if pad_right < y.size(-1) else "constant"
+ y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
+ y = y.squeeze(1)
+
+ spec = torch.stft(
+ y,
+ n_fft=n_fft_new,
+ hop_length=hop_length_new,
+ win_length=win_size_new,
+ window=hann_window[keyshift_key],
+ center=center,
+ pad_mode="reflect",
+ normalized=False,
+ onesided=True,
+ return_complex=True,
+ )
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
+
+ # Handle keyshift and mel conversion
+ if keyshift != 0:
+ size = n_fft // 2 + 1
+ resize = spec.size(1)
+ spec = (
+ F.pad(spec, (0, 0, 0, size - resize))
+ if resize < size
+ else spec[:, :size, :]
+ )
+ spec = spec * win_size / win_size_new
+ spec = torch.matmul(mel_basis[mel_basis_key], spec)
+ spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
+ return spec
+
+ def __call__(self, audiopath):
+ audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
+ spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
+ return spect
+
+
+stft = STFT()
+
+
+def softmax_kernel(
+ data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
+):
+ b, h, *_ = data.shape
+
+ # Normalize data
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
+
+ # Project data
+ ratio = projection_matrix.shape[0] ** -0.5
+ projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
+ projection = projection.type_as(data)
+ data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)
+
+ # Calculate diagonal data
+ diag_data = data**2
+ diag_data = torch.sum(diag_data, dim=-1)
+ diag_data = (diag_data / 2.0) * (data_normalizer**2)
+ diag_data = diag_data.unsqueeze(dim=-1)
+
+ # Apply softmax
+ if is_query:
+ data_dash = ratio * (
+ torch.exp(
+ data_dash
+ - diag_data
+ - torch.max(data_dash, dim=-1, keepdim=True).values
+ )
+ + eps
+ )
+ else:
+ data_dash = ratio * (torch.exp(data_dash - diag_data + eps))
+
+ return data_dash.type_as(data)
+
+
+def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
+ unstructured_block = torch.randn((cols, cols), device=device)
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
+ q, r = map(lambda t: t.to(device), (q, r))
+
+ if qr_uniform_q:
+ d = torch.diag(r, 0)
+ q *= d.sign()
+ return q.t()
+
+
+def exists(val):
+ return val is not None
+
+
+def empty(tensor):
+ return tensor.numel() == 0
+
+
+def default(val, d):
+ return val if exists(val) else d
+
+
+def cast_tuple(val):
+ return (val,) if not isinstance(val, tuple) else val
+
+
+class PCmer(nn.Module):
+ def __init__(
+ self,
+ num_layers,
+ num_heads,
+ dim_model,
+ dim_keys,
+ dim_values,
+ residual_dropout,
+ attention_dropout,
+ ):
+ super().__init__()
+ self.num_layers = num_layers
+ self.num_heads = num_heads
+ self.dim_model = dim_model
+ self.dim_values = dim_values
+ self.dim_keys = dim_keys
+ self.residual_dropout = residual_dropout
+ self.attention_dropout = attention_dropout
+
+ self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
+
+ def forward(self, phone, mask=None):
+ for layer in self._layers:
+ phone = layer(phone, mask)
+ return phone
+
+
+class _EncoderLayer(nn.Module):
+ def __init__(self, parent: PCmer):
+ super().__init__()
+ self.conformer = ConformerConvModule(parent.dim_model)
+ self.norm = nn.LayerNorm(parent.dim_model)
+ self.dropout = nn.Dropout(parent.residual_dropout)
+ self.attn = SelfAttention(
+ dim=parent.dim_model, heads=parent.num_heads, causal=False
+ )
+
+ def forward(self, phone, mask=None):
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
+ phone = phone + (self.conformer(phone))
+ return phone
+
+
+def calc_same_padding(kernel_size):
+ pad = kernel_size // 2
+ return (pad, pad - (kernel_size + 1) % 2)
+
+
+class Swish(nn.Module):
+ def forward(self, x):
+ return x * x.sigmoid()
+
+
+class Transpose(nn.Module):
+ def __init__(self, dims):
+ super().__init__()
+ assert len(dims) == 2, "dims must be a tuple of two dimensions"
+ self.dims = dims
+
+ def forward(self, x):
+ return x.transpose(*self.dims)
+
+
+class GLU(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x):
+ out, gate = x.chunk(2, dim=self.dim)
+ return out * gate.sigmoid()
+
+
+class DepthWiseConv1d(nn.Module):
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
+ super().__init__()
+ self.padding = padding
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
+
+ def forward(self, x):
+ x = F.pad(x, self.padding)
+ return self.conv(x)
+
+
+class ConformerConvModule(nn.Module):
+ def __init__(
+ self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
+ ):
+ super().__init__()
+
+ inner_dim = dim * expansion_factor
+ padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
+
+ self.net = nn.Sequential(
+ nn.LayerNorm(dim),
+ Transpose((1, 2)),
+ nn.Conv1d(dim, inner_dim * 2, 1),
+ GLU(dim=1),
+ DepthWiseConv1d(
+ inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
+ ),
+ Swish(),
+ nn.Conv1d(inner_dim, dim, 1),
+ Transpose((1, 2)),
+ nn.Dropout(dropout),
+ )
+
+ def forward(self, x):
+ return self.net(x)
+
+
+def linear_attention(q, k, v):
+ if v is None:
+ out = torch.einsum("...ed,...nd->...ne", k, q)
+ return out
+ else:
+ k_cumsum = k.sum(dim=-2)
+ D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
+ context = torch.einsum("...nd,...ne->...de", k, v)
+ out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
+ return out
+
+
+def gaussian_orthogonal_random_matrix(
+ nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
+):
+ nb_full_blocks = int(nb_rows / nb_columns)
+ block_list = []
+
+ for _ in range(nb_full_blocks):
+ q = orthogonal_matrix_chunk(
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
+ )
+ block_list.append(q)
+
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
+ if remaining_rows > 0:
+ q = orthogonal_matrix_chunk(
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
+ )
+ block_list.append(q[:remaining_rows])
+
+ final_matrix = torch.cat(block_list)
+
+ if scaling == 0:
+ multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
+ elif scaling == 1:
+ multiplier = math.sqrt((float(nb_columns))) * torch.ones(
+ (nb_rows,), device=device
+ )
+ else:
+ raise ValueError(f"Invalid scaling {scaling}")
+
+ return torch.diag(multiplier) @ final_matrix
+
+
+class FastAttention(nn.Module):
+ def __init__(
+ self,
+ dim_heads,
+ nb_features=None,
+ ortho_scaling=0,
+ causal=False,
+ generalized_attention=False,
+ kernel_fn=nn.ReLU(),
+ qr_uniform_q=False,
+ no_projection=False,
+ ):
+ super().__init__()
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
+
+ self.dim_heads = dim_heads
+ self.nb_features = nb_features
+ self.ortho_scaling = ortho_scaling
+
+ self.create_projection = partial(
+ gaussian_orthogonal_random_matrix,
+ nb_rows=self.nb_features,
+ nb_columns=dim_heads,
+ scaling=ortho_scaling,
+ qr_uniform_q=qr_uniform_q,
+ )
+ projection_matrix = self.create_projection()
+ self.register_buffer("projection_matrix", projection_matrix)
+
+ self.generalized_attention = generalized_attention
+ self.kernel_fn = kernel_fn
+ self.no_projection = no_projection
+ self.causal = causal
+
+ @torch.no_grad()
+ def redraw_projection_matrix(self):
+ projections = self.create_projection()
+ self.projection_matrix.copy_(projections)
+ del projections
+
+ def forward(self, q, k, v):
+ device = q.device
+
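+        # Performer-style attention: map q/k through a random-feature softmax kernel, then apply linear attention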
+ if self.no_projection:
+ q = q.softmax(dim=-1)
+ k = torch.exp(k) if self.causal else k.softmax(dim=-2)
+ else:
+ create_kernel = partial(
+ softmax_kernel, projection_matrix=self.projection_matrix, device=device
+ )
+ q = create_kernel(q, is_query=True)
+ k = create_kernel(k, is_query=False)
+
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn
+
+ if v is None:
+ out = attn_fn(q, k, None)
+ return out
+ else:
+ out = attn_fn(q, k, v)
+ return out
+
+
+class SelfAttention(nn.Module):
+ def __init__(
+ self,
+ dim,
+ causal=False,
+ heads=8,
+ dim_head=64,
+ local_heads=0,
+ local_window_size=256,
+ nb_features=None,
+ feature_redraw_interval=1000,
+ generalized_attention=False,
+ kernel_fn=nn.ReLU(),
+ qr_uniform_q=False,
+ dropout=0.0,
+ no_projection=False,
+ ):
+ super().__init__()
+ assert dim % heads == 0, "dimension must be divisible by number of heads"
+ dim_head = default(dim_head, dim // heads)
+ inner_dim = dim_head * heads
+ self.fast_attention = FastAttention(
+ dim_head,
+ nb_features,
+ causal=causal,
+ generalized_attention=generalized_attention,
+ kernel_fn=kernel_fn,
+ qr_uniform_q=qr_uniform_q,
+ no_projection=no_projection,
+ )
+
+ self.heads = heads
+ self.global_heads = heads - local_heads
+ self.local_attn = (
+ LocalAttention(
+ window_size=local_window_size,
+ causal=causal,
+ autopad=True,
+ dropout=dropout,
+ look_forward=int(not causal),
+ rel_pos_emb_config=(dim_head, local_heads),
+ )
+ if local_heads > 0
+ else None
+ )
+
+ self.to_q = nn.Linear(dim, inner_dim)
+ self.to_k = nn.Linear(dim, inner_dim)
+ self.to_v = nn.Linear(dim, inner_dim)
+ self.to_out = nn.Linear(inner_dim, dim)
+ self.dropout = nn.Dropout(dropout)
+
+ @torch.no_grad()
+ def redraw_projection_matrix(self):
+ self.fast_attention.redraw_projection_matrix()
+
+ def forward(
+ self,
+ x,
+ context=None,
+ mask=None,
+ context_mask=None,
+ name=None,
+ inference=False,
+ **kwargs,
+ ):
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
+
+ cross_attend = exists(context)
+ context = default(context, x)
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
+ q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
+
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
+
+ attn_outs = []
+ if not empty(q):
+ if exists(context_mask):
+ global_mask = context_mask[:, None, :, None]
+ v.masked_fill_(~global_mask, 0.0)
+ if cross_attend:
+ pass # TODO: Implement cross-attention
+ else:
+ out = self.fast_attention(q, k, v)
+ attn_outs.append(out)
+
+ if not empty(lq):
+ assert (
+ not cross_attend
+ ), "local attention is not compatible with cross attention"
+ out = self.local_attn(lq, lk, lv, input_mask=mask)
+ attn_outs.append(out)
+
+ out = torch.cat(attn_outs, dim=1)
+ out = rearrange(out, "b h n d -> b n (h d)")
+ out = self.to_out(out)
+ return self.dropout(out)
+
+
+def l2_regularization(model, l2_alpha):
+ l2_loss = []
+ for module in model.modules():
+ if type(module) is nn.Conv2d:
+ l2_loss.append((module.weight**2).sum() / 2.0)
+ return l2_alpha * sum(l2_loss)
+
+
+class FCPE(nn.Module):
+ def __init__(
+ self,
+ input_channel=128,
+ out_dims=360,
+ n_layers=12,
+ n_chans=512,
+ use_siren=False,
+ use_full=False,
+ loss_mse_scale=10,
+ loss_l2_regularization=False,
+ loss_l2_regularization_scale=1,
+ loss_grad1_mse=False,
+ loss_grad1_mse_scale=1,
+ f0_max=1975.5,
+ f0_min=32.70,
+ confidence=False,
+ threshold=0.05,
+ use_input_conv=True,
+ ):
+ super().__init__()
+ if use_siren is True:
+ raise ValueError("Siren is not supported yet.")
+ if use_full is True:
+ raise ValueError("Full model is not supported yet.")
+
+ self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
+ self.loss_l2_regularization = (
+ loss_l2_regularization if (loss_l2_regularization is not None) else False
+ )
+ self.loss_l2_regularization_scale = (
+ loss_l2_regularization_scale
+ if (loss_l2_regularization_scale is not None)
+ else 1
+ )
+ self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
+ self.loss_grad1_mse_scale = (
+ loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
+ )
+ self.f0_max = f0_max if (f0_max is not None) else 1975.5
+ self.f0_min = f0_min if (f0_min is not None) else 32.70
+ self.confidence = confidence if (confidence is not None) else False
+ self.threshold = threshold if (threshold is not None) else 0.05
+ self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
+
+ self.cent_table_b = torch.Tensor(
+ np.linspace(
+ self.f0_to_cent(torch.Tensor([f0_min]))[0],
+ self.f0_to_cent(torch.Tensor([f0_max]))[0],
+ out_dims,
+ )
+ )
+ self.register_buffer("cent_table", self.cent_table_b)
+
+ # conv in stack
+ _leaky = nn.LeakyReLU()
+ self.stack = nn.Sequential(
+ nn.Conv1d(input_channel, n_chans, 3, 1, 1),
+ nn.GroupNorm(4, n_chans),
+ _leaky,
+ nn.Conv1d(n_chans, n_chans, 3, 1, 1),
+ )
+
+ # transformer
+ self.decoder = PCmer(
+ num_layers=n_layers,
+ num_heads=8,
+ dim_model=n_chans,
+ dim_keys=n_chans,
+ dim_values=n_chans,
+ residual_dropout=0.1,
+ attention_dropout=0.1,
+ )
+ self.norm = nn.LayerNorm(n_chans)
+
+ # out
+ self.n_out = out_dims
+ self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
+
+ def forward(
+ self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
+ ):
+ if cdecoder == "argmax":
+ self.cdecoder = self.cents_decoder
+ elif cdecoder == "local_argmax":
+ self.cdecoder = self.cents_local_decoder
+
+ x = (
+ self.stack(mel.transpose(1, 2)).transpose(1, 2)
+ if self.use_input_conv
+ else mel
+ )
+ x = self.decoder(x)
+ x = self.norm(x)
+ x = self.dense_out(x)
+ x = torch.sigmoid(x)
+
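+        # training: BCE against a Gaussian-blurred target over cent bins; inference: decode bins back to cents/Hz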
+ if not infer:
+ gt_cent_f0 = self.f0_to_cent(gt_f0)
+ gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
+ loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
+ if self.loss_l2_regularization:
+ loss_all = loss_all + l2_regularization(
+ model=self, l2_alpha=self.loss_l2_regularization_scale
+ )
+ x = loss_all
+ if infer:
+ x = self.cdecoder(x)
+ x = self.cent_to_f0(x)
+ x = (1 + x / 700).log() if not return_hz_f0 else x
+
+ return x
+
+ def cents_decoder(self, y, mask=True):
+ B, N, _ = y.size()
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
+ rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
+ y, dim=-1, keepdim=True
+ )
+ if mask:
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
+ confident_mask = torch.ones_like(confident)
+ confident_mask[confident <= self.threshold] = float("-INF")
+ rtn = rtn * confident_mask
+ return (rtn, confident) if self.confidence else rtn
+
+ def cents_local_decoder(self, y, mask=True):
+ B, N, _ = y.size()
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
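+        # weighted average of cents over the 9 bins centered on the argmax bin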
+ local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
+ local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
+ ci_l = torch.gather(ci, -1, local_argmax_index)
+ y_l = torch.gather(y, -1, local_argmax_index)
+ rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
+ y_l, dim=-1, keepdim=True
+ )
+ if mask:
+ confident_mask = torch.ones_like(confident)
+ confident_mask[confident <= self.threshold] = float("-INF")
+ rtn = rtn * confident_mask
+ return (rtn, confident) if self.confidence else rtn
+
+ def cent_to_f0(self, cent):
+ return 10.0 * 2 ** (cent / 1200.0)
+
+ def f0_to_cent(self, f0):
+ return 1200.0 * torch.log2(f0 / 10.0)
+
+ def gaussian_blurred_cent(self, cents):
+ mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
+ B, N, _ = cents.size()
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
+ return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
+
+
+class FCPEInfer:
+ def __init__(self, model_path, device=None, dtype=torch.float32):
+ if device is None:
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ self.device = device
+        ckpt = torch.load(
+            model_path, map_location=torch.device(self.device), weights_only=True
+        )
+ self.args = DotDict(ckpt["config"])
+ self.dtype = dtype
+ model = FCPE(
+ input_channel=self.args.model.input_channel,
+ out_dims=self.args.model.out_dims,
+ n_layers=self.args.model.n_layers,
+ n_chans=self.args.model.n_chans,
+ use_siren=self.args.model.use_siren,
+ use_full=self.args.model.use_full,
+ loss_mse_scale=self.args.loss.loss_mse_scale,
+ loss_l2_regularization=self.args.loss.loss_l2_regularization,
+ loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
+ loss_grad1_mse=self.args.loss.loss_grad1_mse,
+ loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
+ f0_max=self.args.model.f0_max,
+ f0_min=self.args.model.f0_min,
+ confidence=self.args.model.confidence,
+ )
+ model.to(self.device).to(self.dtype)
+ model.load_state_dict(ckpt["model"])
+ model.eval()
+ self.model = model
+ self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
+
+ @torch.no_grad()
+ def __call__(self, audio, sr, threshold=0.05):
+ self.model.threshold = threshold
+ audio = audio[None, :]
+ mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
+ f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
+ return f0
+
+
+class Wav2Mel:
+ def __init__(self, args, device=None, dtype=torch.float32):
+ self.sample_rate = args.mel.sampling_rate
+ self.hop_size = args.mel.hop_size
+ if device is None:
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ self.device = device
+ self.dtype = dtype
+ self.stft = STFT(
+ args.mel.sampling_rate,
+ args.mel.num_mels,
+ args.mel.n_fft,
+ args.mel.win_size,
+ args.mel.hop_size,
+ args.mel.fmin,
+ args.mel.fmax,
+ )
+ self.resample_kernel = {}
+
+ def extract_nvstft(self, audio, keyshift=0, train=False):
+ mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)
+ return mel
+
+ def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
+ audio = audio.to(self.dtype).to(self.device)
+ if sample_rate == self.sample_rate:
+ audio_res = audio
+ else:
+ key_str = str(sample_rate)
+ if key_str not in self.resample_kernel:
+ self.resample_kernel[key_str] = Resample(
+ sample_rate, self.sample_rate, lowpass_filter_width=128
+ )
+ self.resample_kernel[key_str] = (
+ self.resample_kernel[key_str].to(self.dtype).to(self.device)
+ )
+ audio_res = self.resample_kernel[key_str](audio)
+
+ mel = self.extract_nvstft(
+ audio_res, keyshift=keyshift, train=train
+ ) # B, n_frames, bins
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
+ mel = (
+ torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel
+ )
+ mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel
+ return mel
+
+ def __call__(self, audio, sample_rate, keyshift=0, train=False):
+ return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
+
+
+class DotDict(dict):
+ def __getattr__(*args):
+ val = dict.get(*args)
+ return DotDict(val) if type(val) is dict else val
+
+ __setattr__ = dict.__setitem__
+ __delattr__ = dict.__delitem__
+
+
+class F0Predictor(object):
+ def compute_f0(self, wav, p_len):
+ pass
+
+ def compute_f0_uv(self, wav, p_len):
+ pass
+
+
+class FCPEF0Predictor(F0Predictor):
+ def __init__(
+ self,
+ model_path,
+ hop_length=512,
+ f0_min=50,
+ f0_max=1100,
+ dtype=torch.float32,
+ device=None,
+ sample_rate=44100,
+ threshold=0.05,
+ ):
+ self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
+ self.hop_length = hop_length
+ self.f0_min = f0_min
+ self.f0_max = f0_max
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+ self.threshold = threshold
+ self.sample_rate = sample_rate
+ self.dtype = dtype
+ self.name = "fcpe"
+
+ def repeat_expand(
+ self,
+ content: Union[torch.Tensor, np.ndarray],
+ target_len: int,
+ mode: str = "nearest",
+ ):
+ ndim = content.ndim
+ content = (
+ content[None, None]
+ if ndim == 1
+ else content[None] if ndim == 2 else content
+ )
+ assert content.ndim == 3
+ is_np = isinstance(content, np.ndarray)
+ content = torch.from_numpy(content) if is_np else content
+ results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
+ results = results.numpy() if is_np else results
+ return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results
+
+ def post_process(self, x, sample_rate, f0, pad_to):
+ f0 = (
+ torch.from_numpy(f0).float().to(x.device)
+ if isinstance(f0, np.ndarray)
+ else f0
+ )
+ f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0
+
+ vuv_vector = torch.zeros_like(f0)
+ vuv_vector[f0 > 0.0] = 1.0
+ vuv_vector[f0 <= 0.0] = 0.0
+
+ nzindex = torch.nonzero(f0).squeeze()
+ f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
+ time_org = self.hop_length / sample_rate * nzindex.cpu().numpy()
+ time_frame = np.arange(pad_to) * self.hop_length / sample_rate
+
+ vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
+
+ if f0.shape[0] <= 0:
+ return np.zeros(pad_to), vuv_vector.cpu().numpy()
+ if f0.shape[0] == 1:
+ return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy()
+
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
+ return f0, vuv_vector.cpu().numpy()
+
+ def compute_f0(self, wav, p_len=None):
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
+ if torch.all(f0 == 0):
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len)
+ return self.post_process(x, self.sample_rate, f0, p_len)[0]
+
+ def compute_f0_uv(self, wav, p_len=None):
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
+ if torch.all(f0 == 0):
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
+ f0.cpu().numpy() if p_len is None else np.zeros(p_len)
+ )
+ return self.post_process(x, self.sample_rate, f0, p_len)
diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py
new file mode 100644
index 0000000000000000000000000000000000000000..4197a3ce1ec97d8e431f667c387d0a310190439b
--- /dev/null
+++ b/rvc/lib/predictors/RMVPE.py
@@ -0,0 +1,537 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+
+from librosa.filters import mel
+from typing import List
+
+N_MELS = 128
+N_CLASS = 360
+
+
+class ConvBlockRes(nn.Module):
+ """
+ A convolutional block with residual connection.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ momentum (float): Momentum for batch normalization.
+ """
+
+ def __init__(self, in_channels, out_channels, momentum=0.01):
+ super(ConvBlockRes, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3),
+ stride=(1, 1),
+ padding=(1, 1),
+ bias=False,
+ ),
+ nn.BatchNorm2d(out_channels, momentum=momentum),
+ nn.ReLU(),
+ nn.Conv2d(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3),
+ stride=(1, 1),
+ padding=(1, 1),
+ bias=False,
+ ),
+ nn.BatchNorm2d(out_channels, momentum=momentum),
+ nn.ReLU(),
+ )
+ if in_channels != out_channels:
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
+ self.is_shortcut = True
+ else:
+ self.is_shortcut = False
+
+ def forward(self, x):
+ if self.is_shortcut:
+ return self.conv(x) + self.shortcut(x)
+ else:
+ return self.conv(x) + x
+
+
+class ResEncoderBlock(nn.Module):
+ """
+ A residual encoder block.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ kernel_size (tuple): Size of the average pooling kernel.
+ n_blocks (int): Number of convolutional blocks in the block.
+ momentum (float): Momentum for batch normalization.
+ """
+
+ def __init__(
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
+ ):
+ super(ResEncoderBlock, self).__init__()
+ self.n_blocks = n_blocks
+ self.conv = nn.ModuleList()
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
+ for _ in range(n_blocks - 1):
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
+ self.kernel_size = kernel_size
+ if self.kernel_size is not None:
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
+
+ def forward(self, x):
+ for i in range(self.n_blocks):
+ x = self.conv[i](x)
+ if self.kernel_size is not None:
+ return x, self.pool(x)
+ else:
+ return x
+
+
+class Encoder(nn.Module):
+ """
+ The encoder part of the DeepUnet.
+
+ Args:
+ in_channels (int): Number of input channels.
+ in_size (int): Size of the input tensor.
+ n_encoders (int): Number of encoder blocks.
+ kernel_size (tuple): Size of the average pooling kernel.
+ n_blocks (int): Number of convolutional blocks in each encoder block.
+ out_channels (int): Number of output channels for the first encoder block.
+ momentum (float): Momentum for batch normalization.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ in_size,
+ n_encoders,
+ kernel_size,
+ n_blocks,
+ out_channels=16,
+ momentum=0.01,
+ ):
+ super(Encoder, self).__init__()
+ self.n_encoders = n_encoders
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
+ self.layers = nn.ModuleList()
+ self.latent_channels = []
+ for i in range(self.n_encoders):
+ self.layers.append(
+ ResEncoderBlock(
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
+ )
+ )
+ self.latent_channels.append([out_channels, in_size])
+ in_channels = out_channels
+ out_channels *= 2
+ in_size //= 2
+ self.out_size = in_size
+ self.out_channel = out_channels
+
+ def forward(self, x: torch.Tensor):
+ concat_tensors: List[torch.Tensor] = []
+ x = self.bn(x)
+ for i in range(self.n_encoders):
+ t, x = self.layers[i](x)
+ concat_tensors.append(t)
+ return x, concat_tensors
+
+
+class Intermediate(nn.Module):
+ """
+ The intermediate layer of the DeepUnet.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ n_inters (int): Number of convolutional blocks in the intermediate layer.
+ n_blocks (int): Number of convolutional blocks in each intermediate block.
+ momentum (float): Momentum for batch normalization.
+ """
+
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
+ super(Intermediate, self).__init__()
+ self.n_inters = n_inters
+ self.layers = nn.ModuleList()
+ self.layers.append(
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
+ )
+ for _ in range(self.n_inters - 1):
+ self.layers.append(
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
+ )
+
+ def forward(self, x):
+ for i in range(self.n_inters):
+ x = self.layers[i](x)
+ return x
+
+
+class ResDecoderBlock(nn.Module):
+ """
+ A residual decoder block.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ stride (tuple): Stride for transposed convolution.
+ n_blocks (int): Number of convolutional blocks in the block.
+ momentum (float): Momentum for batch normalization.
+ """
+
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
+ super(ResDecoderBlock, self).__init__()
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
+ self.n_blocks = n_blocks
+ self.conv1 = nn.Sequential(
+ nn.ConvTranspose2d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(3, 3),
+ stride=stride,
+ padding=(1, 1),
+ output_padding=out_padding,
+ bias=False,
+ ),
+ nn.BatchNorm2d(out_channels, momentum=momentum),
+ nn.ReLU(),
+ )
+ self.conv2 = nn.ModuleList()
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
+ for _ in range(n_blocks - 1):
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
+
+ def forward(self, x, concat_tensor):
+ x = self.conv1(x)
+ x = torch.cat((x, concat_tensor), dim=1)
+ for i in range(self.n_blocks):
+ x = self.conv2[i](x)
+ return x
+
+
+class Decoder(nn.Module):
+ """
+ The decoder part of the DeepUnet.
+
+ Args:
+ in_channels (int): Number of input channels.
+ n_decoders (int): Number of decoder blocks.
+ stride (tuple): Stride for transposed convolution.
+ n_blocks (int): Number of convolutional blocks in each decoder block.
+ momentum (float): Momentum for batch normalization.
+ """
+
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
+ super(Decoder, self).__init__()
+ self.layers = nn.ModuleList()
+ self.n_decoders = n_decoders
+ for _ in range(self.n_decoders):
+ out_channels = in_channels // 2
+ self.layers.append(
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
+ )
+ in_channels = out_channels
+
+ def forward(self, x, concat_tensors):
+ for i in range(self.n_decoders):
+ x = self.layers[i](x, concat_tensors[-1 - i])
+ return x
+
+
+class DeepUnet(nn.Module):
+ """
+ The DeepUnet architecture.
+
+ Args:
+ kernel_size (tuple): Size of the average pooling kernel.
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
+ en_de_layers (int): Number of encoder/decoder layers.
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
+ in_channels (int): Number of input channels.
+ en_out_channels (int): Number of output channels for the first encoder block.
+ """
+
+ def __init__(
+ self,
+ kernel_size,
+ n_blocks,
+ en_de_layers=5,
+ inter_layers=4,
+ in_channels=1,
+ en_out_channels=16,
+ ):
+ super(DeepUnet, self).__init__()
+ self.encoder = Encoder(
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
+ )
+ self.intermediate = Intermediate(
+ self.encoder.out_channel // 2,
+ self.encoder.out_channel,
+ inter_layers,
+ n_blocks,
+ )
+ self.decoder = Decoder(
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
+ )
+
+ def forward(self, x):
+ x, concat_tensors = self.encoder(x)
+ x = self.intermediate(x)
+ x = self.decoder(x, concat_tensors)
+ return x
+
+
+class E2E(nn.Module):
+ """
+ The end-to-end model.
+
+ Args:
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
+ n_gru (int): Number of GRU layers.
+ kernel_size (tuple): Size of the average pooling kernel.
+ en_de_layers (int): Number of encoder/decoder layers.
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
+ in_channels (int): Number of input channels.
+ en_out_channels (int): Number of output channels for the first encoder block.
+ """
+
+ def __init__(
+ self,
+ n_blocks,
+ n_gru,
+ kernel_size,
+ en_de_layers=5,
+ inter_layers=4,
+ in_channels=1,
+ en_out_channels=16,
+ ):
+ super(E2E, self).__init__()
+ self.unet = DeepUnet(
+ kernel_size,
+ n_blocks,
+ en_de_layers,
+ inter_layers,
+ in_channels,
+ en_out_channels,
+ )
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
+ if n_gru:
+ self.fc = nn.Sequential(
+ BiGRU(3 * 128, 256, n_gru),
+ nn.Linear(512, N_CLASS),
+ nn.Dropout(0.25),
+ nn.Sigmoid(),
+ )
+ else:
+ self.fc = nn.Sequential(
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
+ )
+
+ def forward(self, mel):
+ mel = mel.transpose(-1, -2).unsqueeze(1)
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
+ x = self.fc(x)
+ return x
+
+
+class MelSpectrogram(torch.nn.Module):
+ """
+ Extracts Mel-spectrogram features from audio.
+
+ Args:
+ n_mel_channels (int): Number of Mel-frequency bands.
+ sample_rate (int): Sampling rate of the audio.
+ win_length (int): Length of the window function in samples.
+ hop_length (int): Hop size between frames in samples.
+ n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length.
+ mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0.
+ mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None.
+ clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5.
+ """
+
+ def __init__(
+ self,
+ n_mel_channels,
+ sample_rate,
+ win_length,
+ hop_length,
+ n_fft=None,
+ mel_fmin=0,
+ mel_fmax=None,
+ clamp=1e-5,
+ ):
+ super().__init__()
+ n_fft = win_length if n_fft is None else n_fft
+ self.hann_window = {}
+ mel_basis = mel(
+ sr=sample_rate,
+ n_fft=n_fft,
+ n_mels=n_mel_channels,
+ fmin=mel_fmin,
+ fmax=mel_fmax,
+ htk=True,
+ )
+ mel_basis = torch.from_numpy(mel_basis).float()
+ self.register_buffer("mel_basis", mel_basis)
+ self.n_fft = win_length if n_fft is None else n_fft
+ self.hop_length = hop_length
+ self.win_length = win_length
+ self.sample_rate = sample_rate
+ self.n_mel_channels = n_mel_channels
+ self.clamp = clamp
+
+ def forward(self, audio, keyshift=0, speed=1, center=True):
+ factor = 2 ** (keyshift / 12)
+ n_fft_new = int(np.round(self.n_fft * factor))
+ win_length_new = int(np.round(self.win_length * factor))
+ hop_length_new = int(np.round(self.hop_length * speed))
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
+ if keyshift_key not in self.hann_window:
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
+ audio.device
+ )
+ fft = torch.stft(
+ audio,
+ n_fft=n_fft_new,
+ hop_length=hop_length_new,
+ win_length=win_length_new,
+ window=self.hann_window[keyshift_key],
+ center=center,
+ return_complex=True,
+ )
+
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
+ if keyshift != 0:
+ size = self.n_fft // 2 + 1
+ resize = magnitude.size(1)
+ if resize < size:
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
+ mel_output = torch.matmul(self.mel_basis, magnitude)
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
+ return log_mel_spec
+
+
+class RMVPE0Predictor:
+ """
+ A predictor for fundamental frequency (F0) based on the RMVPE0 model.
+
+ Args:
+ model_path (str): Path to the RMVPE0 model file.
+ device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
+ """
+
+ def __init__(self, model_path, device=None):
+ self.resample_kernel = {}
+ model = E2E(4, 1, (2, 2))
+ ckpt = torch.load(model_path, map_location="cpu", weights_only=True)
+ model.load_state_dict(ckpt)
+ model.eval()
+ self.model = model
+ self.device = device
+ self.mel_extractor = MelSpectrogram(
+ N_MELS, 16000, 1024, 160, None, 30, 8000
+ ).to(device)
+ self.model = self.model.to(device)
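+        # 360 pitch bins spaced 20 cents apart; bin 0 sits ~1997 cents above 10 Hz (~31.7 Hz)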
+ cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
+ self.cents_mapping = np.pad(cents_mapping, (4, 4))
+
+ def mel2hidden(self, mel):
+ """
+ Converts Mel-spectrogram features to hidden representation.
+
+ Args:
+ mel (torch.Tensor): Mel-spectrogram features.
+ """
+ with torch.no_grad():
+ n_frames = mel.shape[-1]
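+            # pad the time axis to a multiple of 32 so the U-Net's pooling/upsampling shapes line up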
+ mel = F.pad(
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
+ )
+ hidden = self.model(mel)
+ return hidden[:, :n_frames]
+
+ def decode(self, hidden, thred=0.03):
+ """
+ Decodes hidden representation to F0.
+
+ Args:
+ hidden (np.ndarray): Hidden representation.
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
+ """
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
+ f0 = 10 * (2 ** (cents_pred / 1200))
+ f0[f0 == 10] = 0
+ return f0
+
+ def infer_from_audio(self, audio, thred=0.03):
+ """
+ Infers F0 from audio.
+
+ Args:
+ audio (np.ndarray): Audio signal.
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
+ """
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
+ mel = self.mel_extractor(audio, center=True)
+ hidden = self.mel2hidden(mel)
+ hidden = hidden.squeeze(0).cpu().numpy()
+ f0 = self.decode(hidden, thred=thred)
+ return f0
+
+ def to_local_average_cents(self, salience, thred=0.05):
+ """
+ Converts salience to local average cents.
+
+ Args:
+ salience (np.ndarray): Salience values.
+ thred (float, optional): Threshold for salience. Defaults to 0.05.
+ """
+ center = np.argmax(salience, axis=1)
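+        # pad by 4 bins per side so the 9-bin window around each frame's peak stays in range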
+ salience = np.pad(salience, ((0, 0), (4, 4)))
+ center += 4
+ todo_salience = []
+ todo_cents_mapping = []
+ starts = center - 4
+ ends = center + 5
+ for idx in range(salience.shape[0]):
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
+ todo_salience = np.array(todo_salience)
+ todo_cents_mapping = np.array(todo_cents_mapping)
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
+ weight_sum = np.sum(todo_salience, 1)
+        divided = product_sum / weight_sum
+        maxx = np.max(salience, axis=1)
+        divided[maxx <= thred] = 0
+        return divided
+
+
+class BiGRU(nn.Module):
+ """
+ A bidirectional GRU layer.
+
+ Args:
+ input_features (int): Number of input features.
+ hidden_features (int): Number of hidden features.
+ num_layers (int): Number of GRU layers.
+ """
+
+ def __init__(self, input_features, hidden_features, num_layers):
+ super(BiGRU, self).__init__()
+ self.gru = nn.GRU(
+ input_features,
+ hidden_features,
+ num_layers=num_layers,
+ batch_first=True,
+ bidirectional=True,
+ )
+
+ def forward(self, x):
+ return self.gru(x)[0]
diff --git a/rvc/lib/tools/analyzer.py b/rvc/lib/tools/analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4b794348082b168132dda0a23933c6d633f0097
--- /dev/null
+++ b/rvc/lib/tools/analyzer.py
@@ -0,0 +1,76 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import librosa.display
+import librosa
+
+
+def calculate_features(y, sr):
+ stft = np.abs(librosa.stft(y))
+ duration = librosa.get_duration(y=y, sr=sr)
+ cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
+ bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
+ rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
+ return stft, duration, cent, bw, rolloff
+
+
+def plot_title(title):
+ plt.suptitle(title, fontsize=16, fontweight="bold")
+
+
+def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
+ plt.subplot(3, 1, 1)
+ plt.imshow(
+ librosa.amplitude_to_db(stft, ref=np.max),
+ origin="lower",
+ extent=[0, duration, 0, sr / 1000],
+ aspect="auto",
+        cmap=cmap,
+ )
+ plt.colorbar(format="%+2.0f dB")
+ plt.xlabel("Time (s)")
+ plt.ylabel("Frequency (kHz)")
+ plt.title("Spectrogram")
+
+
+def plot_waveform(y, sr, duration):
+ plt.subplot(3, 1, 2)
+ librosa.display.waveshow(y, sr=sr)
+ plt.xlabel("Time (s)")
+ plt.ylabel("Amplitude")
+ plt.title("Waveform")
+
+
+def plot_features(times, cent, bw, rolloff, duration):
+ plt.subplot(3, 1, 3)
+    plt.plot(times, cent, label="Spectral Centroid (Hz)", color="b")
+    plt.plot(times, bw, label="Spectral Bandwidth (Hz)", color="g")
+    plt.plot(times, rolloff, label="Spectral Rolloff (Hz)", color="r")
+ plt.xlabel("Time (s)")
+ plt.title("Spectral Features")
+ plt.legend()
+
+
+def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
+ y, sr = librosa.load(audio_file)
+ stft, duration, cent, bw, rolloff = calculate_features(y, sr)
+
+ plt.figure(figsize=(12, 10))
+
+ plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
+ plot_spectrogram(y, sr, stft, duration)
+ plot_waveform(y, sr, duration)
+ plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)
+
+ plt.tight_layout()
+
+ if save_plot_path:
+ plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
+ plt.close()
+
+ audio_info = f"""Sample Rate: {sr}\nDuration: {(
+ str(round(duration, 2)) + " seconds"
+ if duration < 60
+ else str(round(duration / 60, 2)) + " minutes"
+ )}\nNumber of Samples: {len(y)}\nBits per Sample: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""
+
+ return audio_info, save_plot_path
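A minimal way to exercise analyze_audio from this module; "sample.wav" is a placeholder for any audio file librosa can read, and the logs directory is created beforehand so the default plot path is writable:

    import os
    from rvc.lib.tools.analyzer import analyze_audio

    os.makedirs("logs", exist_ok=True)
    # "sample.wav" is a placeholder path.
    info, plot_path = analyze_audio("sample.wav", save_plot_path="logs/audio_analysis.png")
    print(info)       # sample rate, duration, sample count, channel layout
    print(plot_path)  # where the spectrogram/waveform/feature figure was saved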
diff --git a/rvc/lib/tools/gdown.py b/rvc/lib/tools/gdown.py
new file mode 100644
index 0000000000000000000000000000000000000000..35f2dc310c3ef2c69a3485da8810b69687c13264
--- /dev/null
+++ b/rvc/lib/tools/gdown.py
@@ -0,0 +1,285 @@
+import os
+import re
+import sys
+import json
+import time
+import shutil
+import tempfile
+import warnings
+from typing import Optional, Union, IO
+import requests
+from urllib.parse import urlparse, unquote
+from tqdm import tqdm
+
+CHUNK_SIZE = 512 * 1024
+HOME = os.path.expanduser("~")
+
+
+def indent(text: str, prefix: str):
+ """Indent each non-empty line of text with the given prefix."""
+ return "".join(
+ (prefix + line if line.strip() else line) for line in text.splitlines(True)
+ )
+
+
+class FileURLRetrievalError(Exception):
+ """Custom exception for issues retrieving file URLs."""
+
+
+def _extract_download_url_from_confirmation(contents: str, url_origin: str):
+ """Extract the download URL from a Google Drive confirmation page."""
+ patterns = [
+ r'href="(\/uc\?export=download[^"]+)',
+ r'href="/open\?id=([^"]+)"',
+ r'"downloadUrl":"([^"]+)',
+ ]
+ for pattern in patterns:
+ match = re.search(pattern, contents)
+ if match:
+ url = match.group(1)
+            if pattern == r'href="/open\?id=([^"]+)"':
+                uuid_match = re.search(
+                    r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents
+                )
+                if uuid_match:
+                    return (
+                        "https://drive.usercontent.google.com/download?id="
+                        + url
+                        + "&confirm=t&uuid="
+                        + uuid_match.group(1)
+                    )
+            if pattern == r'"downloadUrl":"([^"]+)':
+                return url.replace("\\u003d", "=").replace("\\u0026", "&")
+            return "https://docs.google.com" + url.replace("&amp;", "&")
+
+    error_match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
+    if error_match:
+        error = error_match.group(1)
+        raise FileURLRetrievalError(error)
+
+ raise FileURLRetrievalError(
+ "Cannot retrieve the public link of the file. "
+ "You may need to change the permission to "
+ "'Anyone with the link', or have had many accesses."
+ )
+
+
+def _create_session(
+ proxy: Optional[str] = None,
+ use_cookies: bool = True,
+ return_cookies_file: bool = False,
+):
+ """Create a requests session with optional proxy and cookie handling."""
+ sess = requests.session()
+ sess.headers.update(
+ {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}
+ )
+
+ if proxy:
+ sess.proxies = {"http": proxy, "https": proxy}
+
+ cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
+ if os.path.exists(cookies_file) and use_cookies:
+ try:
+ with open(cookies_file) as f:
+ cookies = json.load(f)
+ for k, v in cookies:
+ sess.cookies[k] = v
+ except json.JSONDecodeError:
+ warnings.warn("Corrupted Cookies file")
+
+ return (sess, cookies_file) if return_cookies_file else sess
+
+
+def download(
+ output: Optional[str] = None,
+ quiet: bool = False,
+ proxy: Optional[str] = None,
+ speed: Optional[float] = None,
+ use_cookies: bool = True,
+ verify: Union[bool, str] = True,
+ id: Optional[str] = None,
+ fuzzy: bool = True,
+ resume: bool = False,
+ format: Optional[str] = None,
+ url: Optional[str] = None,
+):
+ """Download a file from a URL, supporting Google Drive links.
+
+ Args:
+ output: Output filepath. Default is basename of URL.
+ quiet: Suppress terminal output.
+ proxy: HTTP/HTTPS proxy.
+ speed: Download speed limit (bytes per second).
+ use_cookies: Flag to use cookies.
+ verify: Verify TLS certificates.
+ id: Google Drive's file ID.
+ fuzzy: Fuzzy Google Drive ID extraction.
+ resume: Resume download from a tmp file.
+ format: Format for Google Docs/Sheets/Slides.
+ url: URL to download from.
+
+ Returns:
+ Output filename, or None on error.
+ """
+ if not (id is None) ^ (url is None):
+ raise ValueError("Either url or id has to be specified")
+
+ if id is not None:
+ url = f"https://drive.google.com/uc?id={id}"
+
+ url_origin = url
+ sess, cookies_file = _create_session(
+ proxy=proxy, use_cookies=use_cookies, return_cookies_file=True
+ )
+
+ while True:
+ res = sess.get(url, stream=True, verify=verify)
+ res.raise_for_status()
+
+ if url == url_origin and res.status_code == 500:
+ url = f"https://drive.google.com/open?id={id}"
+ continue
+
+ if res.headers.get("Content-Type", "").startswith("text/html"):
+ title_match = re.search("<title>(.+)</title>", res.text)
+ if title_match:
+ title = title_match.group(1)
+ if title.endswith(" - Google Docs"):
+ url = f"https://docs.google.com/document/d/{id}/export?format={'docx' if format is None else format}"
+ continue
+ if title.endswith(" - Google Sheets"):
+ url = f"https://docs.google.com/spreadsheets/d/{id}/export?format={'xlsx' if format is None else format}"
+ continue
+ if title.endswith(" - Google Slides"):
+ url = f"https://docs.google.com/presentation/d/{id}/export?format={'pptx' if format is None else format}"
+ continue
+ if (
+ "Content-Disposition" in res.headers
+ and res.headers["Content-Disposition"].endswith("pptx")
+ and format not in (None, "pptx")
+ ):
+ url = f"https://docs.google.com/presentation/d/{id}/export?format={'pptx' if format is None else format}"
+ continue
+
+ if use_cookies:
+ os.makedirs(os.path.dirname(cookies_file), exist_ok=True)
+ cookies = [
+ (k, v)
+ for k, v in sess.cookies.items()
+ if not k.startswith("download_warning_")
+ ]
+ with open(cookies_file, "w") as f:
+ json.dump(cookies, f, indent=2)
+
+ if "Content-Disposition" in res.headers:
+ break
+
+ parsed_url = urlparse(url)
+ is_gdrive = parsed_url.hostname in ("drive.google.com", "docs.google.com")
+ is_download_link = parsed_url.path.endswith("/uc")
+
+ if not (is_gdrive and is_download_link and fuzzy):
+ break
+
+ try:
+ url = _extract_download_url_from_confirmation(res.text, url_origin)
+ except FileURLRetrievalError as e:
+ raise FileURLRetrievalError(e)
+
+ content_disposition = res.headers.get("Content-Disposition", "")
+ filename_match = re.search(
+ r"filename\*=UTF-8''(.*)", content_disposition
+ ) or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)
+ filename_from_url = (
+ unquote(filename_match.group(1)) if filename_match else os.path.basename(url)
+ )
+ download_path = output or filename_from_url
+
+ if isinstance(download_path, str) and download_path.endswith(os.path.sep):
+ os.makedirs(download_path, exist_ok=True)
+ download_path = os.path.join(download_path, filename_from_url)
+
+ temp_dir = os.path.dirname(download_path) or "."
+ prefix = os.path.basename(download_path)
+
+ if isinstance(download_path, str):
+ existing_tmp_files = [
+ os.path.join(temp_dir, file)
+ for file in os.listdir(temp_dir)
+ if file.startswith(prefix)
+ ]
+ if resume and existing_tmp_files:
+ if len(existing_tmp_files) > 1:
+ print(
+ "There are multiple temporary files to resume:",
+ file=sys.stderr,
+ )
+ for file in existing_tmp_files:
+ print(f"\t{file}", file=sys.stderr)
+ print(
+ "Please remove them except one to resume downloading.",
+ file=sys.stderr,
+ )
+ return None
+ temp_file_path = existing_tmp_files[0]
+ else:
+ resume = False
+ temp_file_path = tempfile.mktemp(
+ suffix=tempfile.template, prefix=prefix, dir=temp_dir
+ )
+
+ try:
+ file_obj: IO = open(temp_file_path, "ab")
+ except Exception as e:
+ print(
+ f"Could not open the temporary file {temp_file_path}: {e}",
+ file=sys.stderr,
+ )
+ return None
+ else:
+ temp_file_path = None
+ file_obj = download_path
+
+ if temp_file_path is not None and file_obj.tell() != 0:
+ headers = {"Range": f"bytes={file_obj.tell()}-"}
+ res = sess.get(url, headers=headers, stream=True, verify=verify)
+ res.raise_for_status()
+
+ try:
+ total = int(res.headers.get("Content-Length", 0))
+ if total > 0:
+ if not quiet:
+ pbar = tqdm(
+ total=total, unit="B", unit_scale=True, desc=filename_from_url
+ )
+ else:
+ if not quiet:
+ pbar = tqdm(unit="B", unit_scale=True, desc=filename_from_url)
+
+ t_start = time.time()
+ for chunk in res.iter_content(chunk_size=CHUNK_SIZE):
+ file_obj.write(chunk)
+ if not quiet:
+ pbar.update(len(chunk))
+ if speed is not None:
+ elapsed_time_expected = 1.0 * pbar.n / speed
+ elapsed_time = time.time() - t_start
+ if elapsed_time < elapsed_time_expected:
+ time.sleep(elapsed_time_expected - elapsed_time)
+ if not quiet:
+ pbar.close()
+
+ if temp_file_path:
+ file_obj.close()
+ shutil.move(temp_file_path, download_path)
+ finally:
+ sess.close()
+
+ return download_path
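The download() helper keeps a gdown-style interface: pass exactly one of url or id, and the Google Drive confirmation-page handling above is applied automatically. A minimal sketch; the file ID is a placeholder:

    from rvc.lib.tools import gdown

    # Placeholder Drive ID; equivalent to url="https://drive.google.com/uc?id=<ID>".
    path = gdown.download(id="FILE_ID_PLACEHOLDER", output="model.zip", quiet=False)
    if path:
        print(f"Saved to {path}")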
diff --git a/rvc/lib/tools/launch_tensorboard.py b/rvc/lib/tools/launch_tensorboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f74e316762b737037f7b8e4448a1042553d5651
--- /dev/null
+++ b/rvc/lib/tools/launch_tensorboard.py
@@ -0,0 +1,21 @@
+import time
+import logging
+from tensorboard import program
+
+log_path = "logs"
+
+
+def launch_tensorboard_pipeline():
+ logging.getLogger("root").setLevel(logging.WARNING)
+ logging.getLogger("tensorboard").setLevel(logging.WARNING)
+
+ tb = program.TensorBoard()
+ tb.configure(argv=[None, "--logdir", log_path])
+ url = tb.launch()
+
+ print(
+ f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
+ )
+
+ while True:
+ time.sleep(600)
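Because launch_tensorboard_pipeline ends in an infinite sleep loop, it is meant to keep the TensorBoard server alive for the lifetime of a process; one way to use it without blocking the caller is a separate process, sketched below:

    from multiprocessing import Process
    from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline

    # Daemon process: the TensorBoard server dies together with the main program.
    tb_process = Process(target=launch_tensorboard_pipeline, daemon=True)
    tb_process.start()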
diff --git a/rvc/lib/tools/model_download.py b/rvc/lib/tools/model_download.py
new file mode 100644
index 0000000000000000000000000000000000000000..f795a201864b9702f93da7bb07117a0fcfcee90f
--- /dev/null
+++ b/rvc/lib/tools/model_download.py
@@ -0,0 +1,226 @@
+import os
+import re
+import sys
+import shutil
+import zipfile
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+from tqdm import tqdm
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from rvc.lib.utils import format_title
+from rvc.lib.tools import gdown
+
+
+file_path = os.path.join(now_dir, "logs")
+zips_path = os.path.join(file_path, "zips")
+os.makedirs(zips_path, exist_ok=True)
+
+
+def search_pth_index(folder):
+ pth_paths = [
+ os.path.join(folder, file)
+ for file in os.listdir(folder)
+ if os.path.isfile(os.path.join(folder, file)) and file.endswith(".pth")
+ ]
+ index_paths = [
+ os.path.join(folder, file)
+ for file in os.listdir(folder)
+ if os.path.isfile(os.path.join(folder, file)) and file.endswith(".index")
+ ]
+ return pth_paths, index_paths
+
+
+def download_from_url(url):
+ os.chdir(zips_path)
+
+ try:
+ if "drive.google.com" in url:
+ file_id = extract_google_drive_id(url)
+ if file_id:
+ gdown.download(
+ url=f"https://drive.google.com/uc?id={file_id}",
+ quiet=False,
+ fuzzy=True,
+ )
+ elif "/blob/" in url or "/resolve/" in url:
+ download_blob_or_resolve(url)
+ elif "/tree/main" in url:
+ download_from_huggingface(url)
+ else:
+ download_file(url)
+
+ rename_downloaded_files()
+ return "downloaded"
+ except Exception as error:
+ print(f"An error occurred downloading the file: {error}")
+ return None
+ finally:
+ os.chdir(now_dir)
+
+
+def extract_google_drive_id(url):
+ if "file/d/" in url:
+ return url.split("file/d/")[1].split("/")[0]
+ if "id=" in url:
+ return url.split("id=")[1].split("&")[0]
+ return None
+
+
+def download_blob_or_resolve(url):
+ if "/blob/" in url:
+ url = url.replace("/blob/", "/resolve/")
+ response = requests.get(url, stream=True)
+ if response.status_code == 200:
+ save_response_content(response)
+ else:
+ raise ValueError(
+ "Download failed with status code: " + str(response.status_code)
+ )
+
+
+def save_response_content(response):
+ content_disposition = unquote(response.headers.get("Content-Disposition", ""))
+ file_name = (
+ re.search(r'filename="([^"]+)"', content_disposition)
+ .groups()[0]
+ .replace(os.path.sep, "_")
+ if content_disposition
+ else "downloaded_file"
+ )
+
+ total_size = int(response.headers.get("Content-Length", 0))
+ chunk_size = 1024
+
+ with open(os.path.join(zips_path, file_name), "wb") as file, tqdm(
+ total=total_size, unit="B", unit_scale=True, desc=file_name
+ ) as progress_bar:
+ for data in response.iter_content(chunk_size):
+ file.write(data)
+ progress_bar.update(len(data))
+
+
+def download_from_huggingface(url):
+ response = requests.get(url)
+ soup = BeautifulSoup(response.content, "html.parser")
+ temp_url = next(
+ (
+ link["href"]
+ for link in soup.find_all("a", href=True)
+ if link["href"].endswith(".zip")
+ ),
+ None,
+ )
+ if temp_url:
+ url = temp_url.replace("blob", "resolve")
+ if "huggingface.co" not in url:
+ url = "https://huggingface.co" + url
+ download_file(url)
+ else:
+ raise ValueError("No zip file found in Huggingface URL")
+
+
+def download_file(url):
+ response = requests.get(url, stream=True)
+ if response.status_code == 200:
+ save_response_content(response)
+ else:
+ raise ValueError(
+ "Download failed with status code: " + str(response.status_code)
+ )
+
+
+def rename_downloaded_files():
+ for currentPath, _, zipFiles in os.walk(zips_path):
+ for file in zipFiles:
+ file_name, extension = os.path.splitext(file)
+ real_path = os.path.join(currentPath, file)
+ os.rename(real_path, file_name.replace(os.path.sep, "_") + extension)
+
+
+def extract(zipfile_path, unzips_path):
+ try:
+ with zipfile.ZipFile(zipfile_path, "r") as zip_ref:
+ zip_ref.extractall(unzips_path)
+ os.remove(zipfile_path)
+ return True
+ except Exception as error:
+ print(f"An error occurred extracting the zip file: {error}")
+ return False
+
+
+def unzip_file(zip_path, zip_file_name):
+ zip_file_path = os.path.join(zip_path, zip_file_name + ".zip")
+ extract_path = os.path.join(file_path, zip_file_name)
+ with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
+ zip_ref.extractall(extract_path)
+ os.remove(zip_file_path)
+
+
+def model_download_pipeline(url: str):
+ try:
+ result = download_from_url(url)
+ if result == "downloaded":
+ return handle_extraction_process()
+ else:
+ return "Error"
+ except Exception as error:
+ print(f"An unexpected error occurred: {error}")
+ return "Error"
+
+
+def handle_extraction_process():
+ extract_folder_path = ""
+ for filename in os.listdir(zips_path):
+ if filename.endswith(".zip"):
+ zipfile_path = os.path.join(zips_path, filename)
+ model_name = format_title(os.path.basename(zipfile_path).split(".zip")[0])
+ extract_folder_path = os.path.join("logs", os.path.normpath(model_name))
+ success = extract(zipfile_path, extract_folder_path)
+ clean_extracted_files(extract_folder_path, model_name)
+
+ if success:
+ print(f"Model {model_name} downloaded!")
+ else:
+ print(f"Error downloading {model_name}")
+ return "Error"
+ if not extract_folder_path:
+ print("Zip file was not found.")
+ return "Error"
+ return search_pth_index(extract_folder_path)
+
+
+def clean_extracted_files(extract_folder_path, model_name):
+ macosx_path = os.path.join(extract_folder_path, "__MACOSX")
+ if os.path.exists(macosx_path):
+ shutil.rmtree(macosx_path)
+
+ subfolders = [
+ f
+ for f in os.listdir(extract_folder_path)
+ if os.path.isdir(os.path.join(extract_folder_path, f))
+ ]
+ if len(subfolders) == 1:
+ subfolder_path = os.path.join(extract_folder_path, subfolders[0])
+ for item in os.listdir(subfolder_path):
+ shutil.move(
+ os.path.join(subfolder_path, item),
+ os.path.join(extract_folder_path, item),
+ )
+ os.rmdir(subfolder_path)
+
+ for item in os.listdir(extract_folder_path):
+ source_path = os.path.join(extract_folder_path, item)
+ if ".pth" in item:
+ new_file_name = model_name + ".pth"
+ elif ".index" in item:
+ new_file_name = model_name + ".index"
+ else:
+ continue
+
+ destination_path = os.path.join(extract_folder_path, new_file_name)
+ if not os.path.exists(destination_path):
+ os.rename(source_path, destination_path)
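model_download_pipeline ties the pieces above together: it downloads into logs/zips, extracts the archive into logs/<model_name>, flattens a single nested folder, renames the .pth/.index files after the model, and returns the discovered paths. A minimal sketch; the URL is a placeholder:

    from rvc.lib.tools.model_download import model_download_pipeline

    # Placeholder URL; Google Drive, Hugging Face blob/resolve and /tree/main links are handled.
    result = model_download_pipeline("https://huggingface.co/user/model/resolve/main/model.zip")
    if result == "Error":
        print("Download or extraction failed")
    else:
        pth_paths, index_paths = result
        print(pth_paths, index_paths)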
diff --git a/rvc/lib/tools/prerequisites_download.py b/rvc/lib/tools/prerequisites_download.py
new file mode 100644
index 0000000000000000000000000000000000000000..fce714272099d6fd98a0ba72b6dbcebf9daa5948
--- /dev/null
+++ b/rvc/lib/tools/prerequisites_download.py
@@ -0,0 +1,153 @@
+import os
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+import requests
+
+url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"
+
+pretraineds_hifigan_list = [
+ (
+ "pretrained_v2/",
+ [
+ "f0D32k.pth",
+ "f0D40k.pth",
+ "f0D48k.pth",
+ "f0G32k.pth",
+ "f0G40k.pth",
+ "f0G48k.pth",
+ ],
+ )
+]
+models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])]
+embedders_list = [("embedders/contentvec/", ["pytorch_model.bin", "config.json"])]
+executables_list = [
+ ("", ["ffmpeg.exe", "ffprobe.exe"]),
+]
+
+folder_mapping_list = {
+ "pretrained_v2/": "rvc/models/pretraineds/hifi-gan/",
+ "embedders/contentvec/": "rvc/models/embedders/contentvec/",
+ "predictors/": "rvc/models/predictors/",
+ "formant/": "rvc/models/formant/",
+}
+
+
+def get_file_size_if_missing(file_list):
+ """
+ Calculate the total size of files to be downloaded only if they do not exist locally.
+ """
+ total_size = 0
+ for remote_folder, files in file_list:
+ local_folder = folder_mapping_list.get(remote_folder, "")
+ for file in files:
+ destination_path = os.path.join(local_folder, file)
+ if not os.path.exists(destination_path):
+ url = f"{url_base}/{remote_folder}{file}"
+ response = requests.head(url)
+ total_size += int(response.headers.get("content-length", 0))
+ return total_size
+
+
+def download_file(url, destination_path, global_bar):
+ """
+ Download a file from the given URL to the specified destination path,
+ updating the global progress bar as data is downloaded.
+ """
+
+ dir_name = os.path.dirname(destination_path)
+ if dir_name:
+ os.makedirs(dir_name, exist_ok=True)
+ response = requests.get(url, stream=True)
+ block_size = 1024
+ with open(destination_path, "wb") as file:
+ for data in response.iter_content(block_size):
+ file.write(data)
+ global_bar.update(len(data))
+
+
+def download_mapping_files(file_mapping_list, global_bar):
+ """
+ Download all files in the provided file mapping list using a thread pool executor,
+ and update the global progress bar as downloads progress.
+ """
+ with ThreadPoolExecutor() as executor:
+ futures = []
+ for remote_folder, file_list in file_mapping_list:
+ local_folder = folder_mapping_list.get(remote_folder, "")
+ for file in file_list:
+ destination_path = os.path.join(local_folder, file)
+ if not os.path.exists(destination_path):
+ url = f"{url_base}/{remote_folder}{file}"
+ futures.append(
+ executor.submit(
+ download_file, url, destination_path, global_bar
+ )
+ )
+ for future in futures:
+ future.result()
+
+
+def split_pretraineds(pretrained_list):
+ f0_list = []
+ non_f0_list = []
+ for folder, files in pretrained_list:
+ f0_files = [f for f in files if f.startswith("f0")]
+ non_f0_files = [f for f in files if not f.startswith("f0")]
+ if f0_files:
+ f0_list.append((folder, f0_files))
+ if non_f0_files:
+ non_f0_list.append((folder, non_f0_files))
+ return f0_list, non_f0_list
+
+
+pretraineds_hifigan_list, _ = split_pretraineds(pretraineds_hifigan_list)
+
+
+def calculate_total_size(
+ pretraineds_hifigan,
+ models,
+ exe,
+):
+ """
+ Calculate the total size of all files to be downloaded based on selected categories.
+ """
+ total_size = 0
+ if models:
+ total_size += get_file_size_if_missing(models_list)
+ total_size += get_file_size_if_missing(embedders_list)
+ if exe and os.name == "nt":
+ total_size += get_file_size_if_missing(executables_list)
+ total_size += get_file_size_if_missing(pretraineds_hifigan)
+ return total_size
+
+
+def prequisites_download_pipeline(
+ pretraineds_hifigan,
+ models,
+ exe,
+):
+ """
+ Manage the download pipeline for different categories of files.
+ """
+ total_size = calculate_total_size(
+ pretraineds_hifigan_list if pretraineds_hifigan else [],
+ models,
+ exe,
+ )
+
+ if total_size > 0:
+ with tqdm(
+ total=total_size, unit="iB", unit_scale=True, desc="Downloading all files"
+ ) as global_bar:
+ if models:
+ download_mapping_files(models_list, global_bar)
+ download_mapping_files(embedders_list, global_bar)
+ if exe:
+ if os.name == "nt":
+ download_mapping_files(executables_list, global_bar)
+ else:
+ print("No executables needed")
+ if pretraineds_hifigan:
+ download_mapping_files(pretraineds_hifigan_list, global_bar)
+ else:
+ pass
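The pipeline only downloads what is missing on disk (sizes are summed with HEAD requests first), and the ffmpeg/ffprobe executables are fetched only on Windows. A minimal invocation, keeping the function name exactly as defined above:

    from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline

    # Fetch predictors, embedders and the HiFi-GAN pretraineds; skip the Windows executables.
    prequisites_download_pipeline(pretraineds_hifigan=True, models=True, exe=False)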
diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py
new file mode 100644
index 0000000000000000000000000000000000000000..94d67e9687cb0842325c2984e1aeb8847f2d7c6f
--- /dev/null
+++ b/rvc/lib/tools/pretrained_selector.py
@@ -0,0 +1,13 @@
+import os
+
+
+def pretrained_selector(vocoder, sample_rate):
+ base_path = os.path.join("rvc", "models", "pretraineds", f"{vocoder.lower()}")
+
+ path_g = os.path.join(base_path, f"f0G{str(sample_rate)[:2]}k.pth")
+ path_d = os.path.join(base_path, f"f0D{str(sample_rate)[:2]}k.pth")
+
+ if os.path.exists(path_g) and os.path.exists(path_d):
+ return path_g, path_d
+ else:
+ return "", ""
diff --git a/rvc/lib/tools/split_audio.py b/rvc/lib/tools/split_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7d29420dfade0884b9fd98beb0f7991c4a94d90
--- /dev/null
+++ b/rvc/lib/tools/split_audio.py
@@ -0,0 +1,79 @@
+import numpy as np
+import librosa
+
+
+def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
+ """
+ Splits an audio signal into non-silent segments, using a frame size and hop size derived from the minimum silence length.
+
+ Parameters:
+ - audio (np.ndarray): The audio signal to split.
+ - sr (int): The sample rate of the input audio (default is 16000).
+ - silence_thresh (int): Silence threshold in dB (default -60 dB).
+ - min_silence_len (int): Minimum silence duration in ms (default 250 ms).
+
+ Returns:
+ - list of np.ndarray: A list of audio segments.
+ - np.ndarray: The intervals where the audio was split.
+ """
+ frame_length = int(min_silence_len / 1000 * sr)
+ hop_length = frame_length // 2
+ intervals = librosa.effects.split(
+ audio, top_db=-silence_thresh, frame_length=frame_length, hop_length=hop_length
+ )
+ audio_segments = [audio[start:end] for start, end in intervals]
+
+ return audio_segments, intervals
+
+
+def merge_audio(audio_segments_org, audio_segments_new, intervals, sr_orig, sr_new):
+ """
+ Merges audio segments back into a single audio signal, filling gaps with silence.
+ Assumes audio segments are already at sr_new.
+
+ Parameters:
+ - audio_segments_org (list of np.ndarray): The non-silent audio segments (at sr_orig).
+ - audio_segments_new (list of np.ndarray): The non-silent audio segments (at sr_new).
+ - intervals (np.ndarray): The intervals used for splitting the original audio.
+ - sr_orig (int): The sample rate of the original audio
+ - sr_new (int): The sample rate of the model
+ Returns:
+ - np.ndarray: The merged audio signal with silent gaps restored.
+ """
+ merged_audio = np.array([], dtype=audio_segments_new[0].dtype)
+ sr_ratio = sr_new / sr_orig
+
+ for i, (start, end) in enumerate(intervals):
+
+ start_new = int(start * sr_ratio)
+ end_new = int(end * sr_ratio)
+
+ original_duration = len(audio_segments_org[i]) / sr_orig
+ new_duration = len(audio_segments_new[i]) / sr_new
+ duration_diff = new_duration - original_duration
+
+ silence_samples = int(abs(duration_diff) * sr_new)
+ silence_compensation = np.zeros(
+ silence_samples, dtype=audio_segments_new[0].dtype
+ )
+
+ if i == 0 and start_new > 0:
+ initial_silence = np.zeros(start_new, dtype=audio_segments_new[0].dtype)
+ merged_audio = np.concatenate((merged_audio, initial_silence))
+
+ if duration_diff > 0:
+ merged_audio = np.concatenate((merged_audio, silence_compensation))
+
+ merged_audio = np.concatenate((merged_audio, audio_segments_new[i]))
+
+ if duration_diff < 0:
+ merged_audio = np.concatenate((merged_audio, silence_compensation))
+
+ if i < len(intervals) - 1:
+ next_start_new = int(intervals[i + 1][0] * sr_ratio)
+ silence_duration = next_start_new - end_new
+ if silence_duration > 0:
+ silence = np.zeros(silence_duration, dtype=audio_segments_new[0].dtype)
+ merged_audio = np.concatenate((merged_audio, silence))
+
+ return merged_audio
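process_audio finds the non-silent intervals at the input rate and merge_audio stitches the converted segments back together at the model rate, re-inserting the skipped silence and compensating for any duration drift. A self-contained sketch with a synthetic signal; the naive np.interp resampling stands in for the actual voice conversion step:

    import numpy as np
    from rvc.lib.tools.split_audio import process_audio, merge_audio

    sr_in, sr_out = 16000, 40000
    tone = 0.5 * np.sin(2 * np.pi * 220 * np.arange(sr_in) / sr_in).astype(np.float32)
    audio = np.concatenate([np.zeros(sr_in, np.float32), tone, np.zeros(sr_in, np.float32)])

    segments, intervals = process_audio(audio, sr=sr_in)
    # Stand-in for conversion: resample each non-silent segment to the model rate.
    converted = [
        np.interp(
            np.linspace(0, len(s), int(len(s) * sr_out / sr_in), endpoint=False),
            np.arange(len(s)),
            s,
        ).astype(np.float32)
        for s in segments
    ]
    merged = merge_audio(segments, converted, intervals, sr_orig=sr_in, sr_new=sr_out)
    print(f"{len(segments)} segment(s), merged length {len(merged) / sr_out:.2f}s at {sr_out} Hz")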
diff --git a/rvc/lib/tools/tts.py b/rvc/lib/tools/tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3252369cb12c10a6c7d11e0e10f754ee78e947f
--- /dev/null
+++ b/rvc/lib/tools/tts.py
@@ -0,0 +1,29 @@
+import sys
+import asyncio
+import edge_tts
+import os
+
+
+async def main():
+ # Parse command line arguments
+ tts_file = str(sys.argv[1])
+ text = str(sys.argv[2])
+ voice = str(sys.argv[3])
+ rate = int(sys.argv[4])
+ output_file = str(sys.argv[5])
+
+ rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
+ if tts_file and os.path.exists(tts_file):
+ text = ""
+ try:
+ with open(tts_file, "r", encoding="utf-8") as file:
+ text = file.read()
+ except UnicodeDecodeError:
+ with open(tts_file, "r") as file:
+ text = file.read()
+ await edge_tts.Communicate(text, voice, rate=rates).save(output_file)
+ # print(f"TTS with {voice} completed. Output TTS file: '{output_file}'")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
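tts.py is written to be driven as a subprocess with five positional arguments: an optional text file, the literal text, the edge-tts voice ShortName (see tts_voices.json below), a rate offset in percent, and the output path. A minimal sketch; it needs network access to the Edge TTS service, and the voice and file names here are placeholders:

    import subprocess
    import sys

    subprocess.run(
        [
            sys.executable,
            "rvc/lib/tools/tts.py",
            "",                    # no input text file, so the literal text is spoken
            "Hello from Applio.",
            "en-US-AriaNeural",    # any ShortName from tts_voices.json
            "0",                   # rate offset in percent
            "tts_output.wav",
        ],
        check=True,
    )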
diff --git a/rvc/lib/tools/tts_voices.json b/rvc/lib/tools/tts_voices.json
new file mode 100644
index 0000000000000000000000000000000000000000..b76cf447ccfacff86e844360caeac6c8e0b27e95
--- /dev/null
+++ b/rvc/lib/tools/tts_voices.json
@@ -0,0 +1,5748 @@
+[
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)",
+ "ShortName": "af-ZA-AdriNeural",
+ "Gender": "Female",
+ "Locale": "af-ZA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Adri Online (Natural) - Afrikaans (South Africa)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, WillemNeural)",
+ "ShortName": "af-ZA-WillemNeural",
+ "Gender": "Male",
+ "Locale": "af-ZA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Willem Online (Natural) - Afrikaans (South Africa)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, AnilaNeural)",
+ "ShortName": "sq-AL-AnilaNeural",
+ "Gender": "Female",
+ "Locale": "sq-AL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Anila Online (Natural) - Albanian (Albania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, IlirNeural)",
+ "ShortName": "sq-AL-IlirNeural",
+ "Gender": "Male",
+ "Locale": "sq-AL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ilir Online (Natural) - Albanian (Albania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, AmehaNeural)",
+ "ShortName": "am-ET-AmehaNeural",
+ "Gender": "Male",
+ "Locale": "am-ET",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ameha Online (Natural) - Amharic (Ethiopia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, MekdesNeural)",
+ "ShortName": "am-ET-MekdesNeural",
+ "Gender": "Female",
+ "Locale": "am-ET",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Mekdes Online (Natural) - Amharic (Ethiopia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, AminaNeural)",
+ "ShortName": "ar-DZ-AminaNeural",
+ "Gender": "Female",
+ "Locale": "ar-DZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Amina Online (Natural) - Arabic (Algeria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, IsmaelNeural)",
+ "ShortName": "ar-DZ-IsmaelNeural",
+ "Gender": "Male",
+ "Locale": "ar-DZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ismael Online (Natural) - Arabic (Algeria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, AliNeural)",
+ "ShortName": "ar-BH-AliNeural",
+ "Gender": "Male",
+ "Locale": "ar-BH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ali Online (Natural) - Arabic (Bahrain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, LailaNeural)",
+ "ShortName": "ar-BH-LailaNeural",
+ "Gender": "Female",
+ "Locale": "ar-BH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Laila Online (Natural) - Arabic (Bahrain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, SalmaNeural)",
+ "ShortName": "ar-EG-SalmaNeural",
+ "Gender": "Female",
+ "Locale": "ar-EG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Salma Online (Natural) - Arabic (Egypt)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, ShakirNeural)",
+ "ShortName": "ar-EG-ShakirNeural",
+ "Gender": "Male",
+ "Locale": "ar-EG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Shakir Online (Natural) - Arabic (Egypt)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, BasselNeural)",
+ "ShortName": "ar-IQ-BasselNeural",
+ "Gender": "Male",
+ "Locale": "ar-IQ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Bassel Online (Natural) - Arabic (Iraq)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, RanaNeural)",
+ "ShortName": "ar-IQ-RanaNeural",
+ "Gender": "Female",
+ "Locale": "ar-IQ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Rana Online (Natural) - Arabic (Iraq)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, SanaNeural)",
+ "ShortName": "ar-JO-SanaNeural",
+ "Gender": "Female",
+ "Locale": "ar-JO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sana Online (Natural) - Arabic (Jordan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, TaimNeural)",
+ "ShortName": "ar-JO-TaimNeural",
+ "Gender": "Male",
+ "Locale": "ar-JO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Taim Online (Natural) - Arabic (Jordan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, FahedNeural)",
+ "ShortName": "ar-KW-FahedNeural",
+ "Gender": "Male",
+ "Locale": "ar-KW",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Fahed Online (Natural) - Arabic (Kuwait)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, NouraNeural)",
+ "ShortName": "ar-KW-NouraNeural",
+ "Gender": "Female",
+ "Locale": "ar-KW",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Noura Online (Natural) - Arabic (Kuwait)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, LaylaNeural)",
+ "ShortName": "ar-LB-LaylaNeural",
+ "Gender": "Female",
+ "Locale": "ar-LB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Layla Online (Natural) - Arabic (Lebanon)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, RamiNeural)",
+ "ShortName": "ar-LB-RamiNeural",
+ "Gender": "Male",
+ "Locale": "ar-LB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Rami Online (Natural) - Arabic (Lebanon)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, ImanNeural)",
+ "ShortName": "ar-LY-ImanNeural",
+ "Gender": "Female",
+ "Locale": "ar-LY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Iman Online (Natural) - Arabic (Libya)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, OmarNeural)",
+ "ShortName": "ar-LY-OmarNeural",
+ "Gender": "Male",
+ "Locale": "ar-LY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Omar Online (Natural) - Arabic (Libya)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, JamalNeural)",
+ "ShortName": "ar-MA-JamalNeural",
+ "Gender": "Male",
+ "Locale": "ar-MA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jamal Online (Natural) - Arabic (Morocco)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, MounaNeural)",
+ "ShortName": "ar-MA-MounaNeural",
+ "Gender": "Female",
+ "Locale": "ar-MA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Mouna Online (Natural) - Arabic (Morocco)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AbdullahNeural)",
+ "ShortName": "ar-OM-AbdullahNeural",
+ "Gender": "Male",
+ "Locale": "ar-OM",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Abdullah Online (Natural) - Arabic (Oman)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AyshaNeural)",
+ "ShortName": "ar-OM-AyshaNeural",
+ "Gender": "Female",
+ "Locale": "ar-OM",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Aysha Online (Natural) - Arabic (Oman)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, AmalNeural)",
+ "ShortName": "ar-QA-AmalNeural",
+ "Gender": "Female",
+ "Locale": "ar-QA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Amal Online (Natural) - Arabic (Qatar)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, MoazNeural)",
+ "ShortName": "ar-QA-MoazNeural",
+ "Gender": "Male",
+ "Locale": "ar-QA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Moaz Online (Natural) - Arabic (Qatar)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, HamedNeural)",
+ "ShortName": "ar-SA-HamedNeural",
+ "Gender": "Male",
+ "Locale": "ar-SA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Hamed Online (Natural) - Arabic (Saudi Arabia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, ZariyahNeural)",
+ "ShortName": "ar-SA-ZariyahNeural",
+ "Gender": "Female",
+ "Locale": "ar-SA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Zariyah Online (Natural) - Arabic (Saudi Arabia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, AmanyNeural)",
+ "ShortName": "ar-SY-AmanyNeural",
+ "Gender": "Female",
+ "Locale": "ar-SY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Amany Online (Natural) - Arabic (Syria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, LaithNeural)",
+ "ShortName": "ar-SY-LaithNeural",
+ "Gender": "Male",
+ "Locale": "ar-SY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Laith Online (Natural) - Arabic (Syria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, HediNeural)",
+ "ShortName": "ar-TN-HediNeural",
+ "Gender": "Male",
+ "Locale": "ar-TN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Hedi Online (Natural) - Arabic (Tunisia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, ReemNeural)",
+ "ShortName": "ar-TN-ReemNeural",
+ "Gender": "Female",
+ "Locale": "ar-TN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Reem Online (Natural) - Arabic (Tunisia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, FatimaNeural)",
+ "ShortName": "ar-AE-FatimaNeural",
+ "Gender": "Female",
+ "Locale": "ar-AE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Fatima Online (Natural) - Arabic (United Arab Emirates)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, HamdanNeural)",
+ "ShortName": "ar-AE-HamdanNeural",
+ "Gender": "Male",
+ "Locale": "ar-AE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Hamdan Online (Natural) - Arabic (United Arab Emirates)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, MaryamNeural)",
+ "ShortName": "ar-YE-MaryamNeural",
+ "Gender": "Female",
+ "Locale": "ar-YE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Maryam Online (Natural) - Arabic (Yemen)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, SalehNeural)",
+ "ShortName": "ar-YE-SalehNeural",
+ "Gender": "Male",
+ "Locale": "ar-YE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Saleh Online (Natural) - Arabic (Yemen)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BabekNeural)",
+ "ShortName": "az-AZ-BabekNeural",
+ "Gender": "Male",
+ "Locale": "az-AZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Babek Online (Natural) - Azerbaijani (Azerbaijan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BanuNeural)",
+ "ShortName": "az-AZ-BanuNeural",
+ "Gender": "Female",
+ "Locale": "az-AZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Banu Online (Natural) - Azerbaijani (Azerbaijan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, NabanitaNeural)",
+ "ShortName": "bn-BD-NabanitaNeural",
+ "Gender": "Female",
+ "Locale": "bn-BD",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Nabanita Online (Natural) - Bangla (Bangladesh)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, PradeepNeural)",
+ "ShortName": "bn-BD-PradeepNeural",
+ "Gender": "Male",
+ "Locale": "bn-BD",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Pradeep Online (Natural) - Bangla (Bangladesh)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, BashkarNeural)",
+ "ShortName": "bn-IN-BashkarNeural",
+ "Gender": "Male",
+ "Locale": "bn-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Bashkar Online (Natural) - Bangla (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, TanishaaNeural)",
+ "ShortName": "bn-IN-TanishaaNeural",
+ "Gender": "Female",
+ "Locale": "bn-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Tanishaa Online (Natural) - Bengali (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, GoranNeural)",
+ "ShortName": "bs-BA-GoranNeural",
+ "Gender": "Male",
+ "Locale": "bs-BA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Goran Online (Natural) - Bosnian (Bosnia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, VesnaNeural)",
+ "ShortName": "bs-BA-VesnaNeural",
+ "Gender": "Female",
+ "Locale": "bs-BA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Vesna Online (Natural) - Bosnian (Bosnia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, BorislavNeural)",
+ "ShortName": "bg-BG-BorislavNeural",
+ "Gender": "Male",
+ "Locale": "bg-BG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Borislav Online (Natural) - Bulgarian (Bulgaria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, KalinaNeural)",
+ "ShortName": "bg-BG-KalinaNeural",
+ "Gender": "Female",
+ "Locale": "bg-BG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Kalina Online (Natural) - Bulgarian (Bulgaria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, NilarNeural)",
+ "ShortName": "my-MM-NilarNeural",
+ "Gender": "Female",
+ "Locale": "my-MM",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Nilar Online (Natural) - Burmese (Myanmar)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, ThihaNeural)",
+ "ShortName": "my-MM-ThihaNeural",
+ "Gender": "Male",
+ "Locale": "my-MM",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Thiha Online (Natural) - Burmese (Myanmar)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, EnricNeural)",
+ "ShortName": "ca-ES-EnricNeural",
+ "Gender": "Male",
+ "Locale": "ca-ES",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Enric Online (Natural) - Catalan (Spain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, JoanaNeural)",
+ "ShortName": "ca-ES-JoanaNeural",
+ "Gender": "Female",
+ "Locale": "ca-ES",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Joana Online (Natural) - Catalan (Spain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuGaaiNeural)",
+ "ShortName": "zh-HK-HiuGaaiNeural",
+ "Gender": "Female",
+ "Locale": "zh-HK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft HiuGaai Online (Natural) - Chinese (Cantonese Traditional)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)",
+ "ShortName": "zh-HK-HiuMaanNeural",
+ "Gender": "Female",
+ "Locale": "zh-HK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft HiuMaan Online (Natural) - Chinese (Hong Kong)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, WanLungNeural)",
+ "ShortName": "zh-HK-WanLungNeural",
+ "Gender": "Male",
+ "Locale": "zh-HK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft WanLung Online (Natural) - Chinese (Hong Kong)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)",
+ "ShortName": "zh-CN-XiaoxiaoNeural",
+ "Gender": "Female",
+ "Locale": "zh-CN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Warm"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoyiNeural)",
+ "ShortName": "zh-CN-XiaoyiNeural",
+ "Gender": "Female",
+ "Locale": "zh-CN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Xiaoyi Online (Natural) - Chinese (Mainland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Cartoon",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Lively"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunjianNeural)",
+ "ShortName": "zh-CN-YunjianNeural",
+ "Gender": "Male",
+ "Locale": "zh-CN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yunjian Online (Natural) - Chinese (Mainland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Sports",
+ " Novel"
+ ],
+ "VoicePersonalities": [
+ "Passion"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiNeural)",
+ "ShortName": "zh-CN-YunxiNeural",
+ "Gender": "Male",
+ "Locale": "zh-CN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yunxi Online (Natural) - Chinese (Mainland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Lively",
+ "Sunshine"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiaNeural)",
+ "ShortName": "zh-CN-YunxiaNeural",
+ "Gender": "Male",
+ "Locale": "zh-CN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yunxia Online (Natural) - Chinese (Mainland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Cartoon",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Cute"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunyangNeural)",
+ "ShortName": "zh-CN-YunyangNeural",
+ "Gender": "Male",
+ "Locale": "zh-CN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yunyang Online (Natural) - Chinese (Mainland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News"
+ ],
+ "VoicePersonalities": [
+ "Professional",
+ "Reliable"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-liaoning, XiaobeiNeural)",
+ "ShortName": "zh-CN-liaoning-XiaobeiNeural",
+ "Gender": "Female",
+ "Locale": "zh-CN-liaoning",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Xiaobei Online (Natural) - Chinese (Northeastern Mandarin)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Dialect"
+ ],
+ "VoicePersonalities": [
+ "Humorous"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoChenNeural)",
+ "ShortName": "zh-TW-HsiaoChenNeural",
+ "Gender": "Female",
+ "Locale": "zh-TW",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft HsiaoChen Online (Natural) - Chinese (Taiwan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, YunJheNeural)",
+ "ShortName": "zh-TW-YunJheNeural",
+ "Gender": "Male",
+ "Locale": "zh-TW",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft YunJhe Online (Natural) - Chinese (Taiwan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoYuNeural)",
+ "ShortName": "zh-TW-HsiaoYuNeural",
+ "Gender": "Female",
+ "Locale": "zh-TW",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft HsiaoYu Online (Natural) - Chinese (Taiwanese Mandarin)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-shaanxi, XiaoniNeural)",
+ "ShortName": "zh-CN-shaanxi-XiaoniNeural",
+ "Gender": "Female",
+ "Locale": "zh-CN-shaanxi",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Xiaoni Online (Natural) - Chinese (Zhongyuan Mandarin Shaanxi)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Dialect"
+ ],
+ "VoicePersonalities": [
+ "Bright"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, GabrijelaNeural)",
+ "ShortName": "hr-HR-GabrijelaNeural",
+ "Gender": "Female",
+ "Locale": "hr-HR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gabrijela Online (Natural) - Croatian (Croatia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, SreckoNeural)",
+ "ShortName": "hr-HR-SreckoNeural",
+ "Gender": "Male",
+ "Locale": "hr-HR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Srecko Online (Natural) - Croatian (Croatia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, AntoninNeural)",
+ "ShortName": "cs-CZ-AntoninNeural",
+ "Gender": "Male",
+ "Locale": "cs-CZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Antonin Online (Natural) - Czech (Czech)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, VlastaNeural)",
+ "ShortName": "cs-CZ-VlastaNeural",
+ "Gender": "Female",
+ "Locale": "cs-CZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Vlasta Online (Natural) - Czech (Czech)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, ChristelNeural)",
+ "ShortName": "da-DK-ChristelNeural",
+ "Gender": "Female",
+ "Locale": "da-DK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Christel Online (Natural) - Danish (Denmark)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, JeppeNeural)",
+ "ShortName": "da-DK-JeppeNeural",
+ "Gender": "Male",
+ "Locale": "da-DK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jeppe Online (Natural) - Danish (Denmark)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, ArnaudNeural)",
+ "ShortName": "nl-BE-ArnaudNeural",
+ "Gender": "Male",
+ "Locale": "nl-BE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Arnaud Online (Natural) - Dutch (Belgium)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, DenaNeural)",
+ "ShortName": "nl-BE-DenaNeural",
+ "Gender": "Female",
+ "Locale": "nl-BE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Dena Online (Natural) - Dutch (Belgium)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, ColetteNeural)",
+ "ShortName": "nl-NL-ColetteNeural",
+ "Gender": "Female",
+ "Locale": "nl-NL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Colette Online (Natural) - Dutch (Netherlands)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, FennaNeural)",
+ "ShortName": "nl-NL-FennaNeural",
+ "Gender": "Female",
+ "Locale": "nl-NL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Fenna Online (Natural) - Dutch (Netherlands)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, MaartenNeural)",
+ "ShortName": "nl-NL-MaartenNeural",
+ "Gender": "Male",
+ "Locale": "nl-NL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Maarten Online (Natural) - Dutch (Netherlands)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, NatashaNeural)",
+ "ShortName": "en-AU-NatashaNeural",
+ "Gender": "Female",
+ "Locale": "en-AU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Natasha Online (Natural) - English (Australia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, WilliamNeural)",
+ "ShortName": "en-AU-WilliamNeural",
+ "Gender": "Male",
+ "Locale": "en-AU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft William Online (Natural) - English (Australia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, ClaraNeural)",
+ "ShortName": "en-CA-ClaraNeural",
+ "Gender": "Female",
+ "Locale": "en-CA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Clara Online (Natural) - English (Canada)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, LiamNeural)",
+ "ShortName": "en-CA-LiamNeural",
+ "Gender": "Male",
+ "Locale": "en-CA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Liam Online (Natural) - English (Canada)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, SamNeural)",
+ "ShortName": "en-HK-SamNeural",
+ "Gender": "Male",
+ "Locale": "en-HK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sam Online (Natural) - English (Hongkong)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, YanNeural)",
+ "ShortName": "en-HK-YanNeural",
+ "Gender": "Female",
+ "Locale": "en-HK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yan Online (Natural) - English (Hongkong)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaExpressiveNeural)",
+ "ShortName": "en-IN-NeerjaExpressiveNeural",
+ "Gender": "Female",
+ "Locale": "en-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Neerja Online (Natural) - English (India) (Preview)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaNeural)",
+ "ShortName": "en-IN-NeerjaNeural",
+ "Gender": "Female",
+ "Locale": "en-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Neerja Online (Natural) - English (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, PrabhatNeural)",
+ "ShortName": "en-IN-PrabhatNeural",
+ "Gender": "Male",
+ "Locale": "en-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Prabhat Online (Natural) - English (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, ConnorNeural)",
+ "ShortName": "en-IE-ConnorNeural",
+ "Gender": "Male",
+ "Locale": "en-IE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Connor Online (Natural) - English (Ireland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, EmilyNeural)",
+ "ShortName": "en-IE-EmilyNeural",
+ "Gender": "Female",
+ "Locale": "en-IE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Emily Online (Natural) - English (Ireland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, AsiliaNeural)",
+ "ShortName": "en-KE-AsiliaNeural",
+ "Gender": "Female",
+ "Locale": "en-KE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Asilia Online (Natural) - English (Kenya)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, ChilembaNeural)",
+ "ShortName": "en-KE-ChilembaNeural",
+ "Gender": "Male",
+ "Locale": "en-KE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Chilemba Online (Natural) - English (Kenya)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MitchellNeural)",
+ "ShortName": "en-NZ-MitchellNeural",
+ "Gender": "Male",
+ "Locale": "en-NZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Mitchell Online (Natural) - English (New Zealand)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MollyNeural)",
+ "ShortName": "en-NZ-MollyNeural",
+ "Gender": "Female",
+ "Locale": "en-NZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Molly Online (Natural) - English (New Zealand)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, AbeoNeural)",
+ "ShortName": "en-NG-AbeoNeural",
+ "Gender": "Male",
+ "Locale": "en-NG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Abeo Online (Natural) - English (Nigeria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, EzinneNeural)",
+ "ShortName": "en-NG-EzinneNeural",
+ "Gender": "Female",
+ "Locale": "en-NG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ezinne Online (Natural) - English (Nigeria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, JamesNeural)",
+ "ShortName": "en-PH-JamesNeural",
+ "Gender": "Male",
+ "Locale": "en-PH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft James Online (Natural) - English (Philippines)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, RosaNeural)",
+ "ShortName": "en-PH-RosaNeural",
+ "Gender": "Female",
+ "Locale": "en-PH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Rosa Online (Natural) - English (Philippines)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, LunaNeural)",
+ "ShortName": "en-SG-LunaNeural",
+ "Gender": "Female",
+ "Locale": "en-SG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Luna Online (Natural) - English (Singapore)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, WayneNeural)",
+ "ShortName": "en-SG-WayneNeural",
+ "Gender": "Male",
+ "Locale": "en-SG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Wayne Online (Natural) - English (Singapore)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LeahNeural)",
+ "ShortName": "en-ZA-LeahNeural",
+ "Gender": "Female",
+ "Locale": "en-ZA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Leah Online (Natural) - English (South Africa)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LukeNeural)",
+ "ShortName": "en-ZA-LukeNeural",
+ "Gender": "Male",
+ "Locale": "en-ZA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Luke Online (Natural) - English (South Africa)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ElimuNeural)",
+ "ShortName": "en-TZ-ElimuNeural",
+ "Gender": "Male",
+ "Locale": "en-TZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Elimu Online (Natural) - English (Tanzania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ImaniNeural)",
+ "ShortName": "en-TZ-ImaniNeural",
+ "Gender": "Female",
+ "Locale": "en-TZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Imani Online (Natural) - English (Tanzania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, LibbyNeural)",
+ "ShortName": "en-GB-LibbyNeural",
+ "Gender": "Female",
+ "Locale": "en-GB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Libby Online (Natural) - English (United Kingdom)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, MaisieNeural)",
+ "ShortName": "en-GB-MaisieNeural",
+ "Gender": "Female",
+ "Locale": "en-GB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Maisie Online (Natural) - English (United Kingdom)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)",
+ "ShortName": "en-GB-RyanNeural",
+ "Gender": "Male",
+ "Locale": "en-GB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ryan Online (Natural) - English (United Kingdom)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, SoniaNeural)",
+ "ShortName": "en-GB-SoniaNeural",
+ "Gender": "Female",
+ "Locale": "en-GB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sonia Online (Natural) - English (United Kingdom)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, ThomasNeural)",
+ "ShortName": "en-GB-ThomasNeural",
+ "Gender": "Male",
+ "Locale": "en-GB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Thomas Online (Natural) - English (United Kingdom)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaMultilingualNeural)",
+ "ShortName": "en-US-AvaMultilingualNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft AvaMultilingual Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Expressive",
+ "Caring",
+ "Pleasant",
+ "Friendly"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewMultilingualNeural)",
+ "ShortName": "en-US-AndrewMultilingualNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft AndrewMultilingual Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Warm",
+ "Confident",
+ "Authentic",
+ "Honest"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)",
+ "ShortName": "en-US-EmmaMultilingualNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft EmmaMultilingual Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Cheerful",
+ "Clear",
+ "Conversational"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianMultilingualNeural)",
+ "ShortName": "en-US-BrianMultilingualNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft BrianMultilingual Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Approachable",
+ "Casual",
+ "Sincere"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)",
+ "ShortName": "en-US-AvaNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ava Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Expressive",
+ "Caring",
+ "Pleasant",
+ "Friendly"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewNeural)",
+ "ShortName": "en-US-AndrewNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Andrew Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Warm",
+ "Confident",
+ "Authentic",
+ "Honest"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaNeural)",
+ "ShortName": "en-US-EmmaNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Emma Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Cheerful",
+ "Clear",
+ "Conversational"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianNeural)",
+ "ShortName": "en-US-BrianNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Brian Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Conversation",
+ "Copilot"
+ ],
+ "VoicePersonalities": [
+ "Approachable",
+ "Casual",
+ "Sincere"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AnaNeural)",
+ "ShortName": "en-US-AnaNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ana Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "Cartoon",
+ "Conversation"
+ ],
+ "VoicePersonalities": [
+ "Cute"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)",
+ "ShortName": "en-US-AriaNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Aria Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Positive",
+ "Confident"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, ChristopherNeural)",
+ "ShortName": "en-US-ChristopherNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Christopher Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Reliable",
+ "Authority"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EricNeural)",
+ "ShortName": "en-US-EricNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Eric Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Rational"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)",
+ "ShortName": "en-US-GuyNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Guy Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Passion"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)",
+ "ShortName": "en-US-JennyNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jenny Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Considerate",
+ "Comfort"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, MichelleNeural)",
+ "ShortName": "en-US-MichelleNeural",
+ "Gender": "Female",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Michelle Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Pleasant"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, RogerNeural)",
+ "ShortName": "en-US-RogerNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Roger Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Lively"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (en-US, SteffanNeural)",
+ "ShortName": "en-US-SteffanNeural",
+ "Gender": "Male",
+ "Locale": "en-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Steffan Online (Natural) - English (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "News",
+ "Novel"
+ ],
+ "VoicePersonalities": [
+ "Rational"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, AnuNeural)",
+ "ShortName": "et-EE-AnuNeural",
+ "Gender": "Female",
+ "Locale": "et-EE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Anu Online (Natural) - Estonian (Estonia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, KertNeural)",
+ "ShortName": "et-EE-KertNeural",
+ "Gender": "Male",
+ "Locale": "et-EE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Kert Online (Natural) - Estonian (Estonia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, AngeloNeural)",
+ "ShortName": "fil-PH-AngeloNeural",
+ "Gender": "Male",
+ "Locale": "fil-PH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Angelo Online (Natural) - Filipino (Philippines)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, BlessicaNeural)",
+ "ShortName": "fil-PH-BlessicaNeural",
+ "Gender": "Female",
+ "Locale": "fil-PH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Blessica Online (Natural) - Filipino (Philippines)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, HarriNeural)",
+ "ShortName": "fi-FI-HarriNeural",
+ "Gender": "Male",
+ "Locale": "fi-FI",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Harri Online (Natural) - Finnish (Finland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, NooraNeural)",
+ "ShortName": "fi-FI-NooraNeural",
+ "Gender": "Female",
+ "Locale": "fi-FI",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Noora Online (Natural) - Finnish (Finland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, CharlineNeural)",
+ "ShortName": "fr-BE-CharlineNeural",
+ "Gender": "Female",
+ "Locale": "fr-BE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Charline Online (Natural) - French (Belgium)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, GerardNeural)",
+ "ShortName": "fr-BE-GerardNeural",
+ "Gender": "Male",
+ "Locale": "fr-BE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gerard Online (Natural) - French (Belgium)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, ThierryNeural)",
+ "ShortName": "fr-CA-ThierryNeural",
+ "Gender": "Male",
+ "Locale": "fr-CA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Thierry Online (Natural) - French (Canada)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, AntoineNeural)",
+ "ShortName": "fr-CA-AntoineNeural",
+ "Gender": "Male",
+ "Locale": "fr-CA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Antoine Online (Natural) - French (Canada)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, JeanNeural)",
+ "ShortName": "fr-CA-JeanNeural",
+ "Gender": "Male",
+ "Locale": "fr-CA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jean Online (Natural) - French (Canada)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, SylvieNeural)",
+ "ShortName": "fr-CA-SylvieNeural",
+ "Gender": "Female",
+ "Locale": "fr-CA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sylvie Online (Natural) - French (Canada)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, VivienneMultilingualNeural)",
+ "ShortName": "fr-FR-VivienneMultilingualNeural",
+ "Gender": "Female",
+ "Locale": "fr-FR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft VivienneMultilingual Online (Natural) - French (France)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, RemyMultilingualNeural)",
+ "ShortName": "fr-FR-RemyMultilingualNeural",
+ "Gender": "Male",
+ "Locale": "fr-FR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft RemyMultilingual Online (Natural) - French (France)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)",
+ "ShortName": "fr-FR-DeniseNeural",
+ "Gender": "Female",
+ "Locale": "fr-FR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Denise Online (Natural) - French (France)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, EloiseNeural)",
+ "ShortName": "fr-FR-EloiseNeural",
+ "Gender": "Female",
+ "Locale": "fr-FR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Eloise Online (Natural) - French (France)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, HenriNeural)",
+ "ShortName": "fr-FR-HenriNeural",
+ "Gender": "Male",
+ "Locale": "fr-FR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Henri Online (Natural) - French (France)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, ArianeNeural)",
+ "ShortName": "fr-CH-ArianeNeural",
+ "Gender": "Female",
+ "Locale": "fr-CH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ariane Online (Natural) - French (Switzerland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, FabriceNeural)",
+ "ShortName": "fr-CH-FabriceNeural",
+ "Gender": "Male",
+ "Locale": "fr-CH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Fabrice Online (Natural) - French (Switzerland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, RoiNeural)",
+ "ShortName": "gl-ES-RoiNeural",
+ "Gender": "Male",
+ "Locale": "gl-ES",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Roi Online (Natural) - Galician (Spain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, SabelaNeural)",
+ "ShortName": "gl-ES-SabelaNeural",
+ "Gender": "Female",
+ "Locale": "gl-ES",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sabela Online (Natural) - Galician (Spain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, EkaNeural)",
+ "ShortName": "ka-GE-EkaNeural",
+ "Gender": "Female",
+ "Locale": "ka-GE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Eka Online (Natural) - Georgian (Georgia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, GiorgiNeural)",
+ "ShortName": "ka-GE-GiorgiNeural",
+ "Gender": "Male",
+ "Locale": "ka-GE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Giorgi Online (Natural) - Georgian (Georgia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, IngridNeural)",
+ "ShortName": "de-AT-IngridNeural",
+ "Gender": "Female",
+ "Locale": "de-AT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ingrid Online (Natural) - German (Austria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, JonasNeural)",
+ "ShortName": "de-AT-JonasNeural",
+ "Gender": "Male",
+ "Locale": "de-AT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jonas Online (Natural) - German (Austria)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, SeraphinaMultilingualNeural)",
+ "ShortName": "de-DE-SeraphinaMultilingualNeural",
+ "Gender": "Female",
+ "Locale": "de-DE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft SeraphinaMultilingual Online (Natural) - German (Germany)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, FlorianMultilingualNeural)",
+ "ShortName": "de-DE-FlorianMultilingualNeural",
+ "Gender": "Male",
+ "Locale": "de-DE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft FlorianMultilingual Online (Natural) - German (Germany)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, AmalaNeural)",
+ "ShortName": "de-DE-AmalaNeural",
+ "Gender": "Female",
+ "Locale": "de-DE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Amala Online (Natural) - German (Germany)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, ConradNeural)",
+ "ShortName": "de-DE-ConradNeural",
+ "Gender": "Male",
+ "Locale": "de-DE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Conrad Online (Natural) - German (Germany)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KatjaNeural)",
+ "ShortName": "de-DE-KatjaNeural",
+ "Gender": "Female",
+ "Locale": "de-DE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Katja Online (Natural) - German (Germany)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KillianNeural)",
+ "ShortName": "de-DE-KillianNeural",
+ "Gender": "Male",
+ "Locale": "de-DE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Killian Online (Natural) - German (Germany)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, JanNeural)",
+ "ShortName": "de-CH-JanNeural",
+ "Gender": "Male",
+ "Locale": "de-CH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jan Online (Natural) - German (Switzerland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, LeniNeural)",
+ "ShortName": "de-CH-LeniNeural",
+ "Gender": "Female",
+ "Locale": "de-CH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Leni Online (Natural) - German (Switzerland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, AthinaNeural)",
+ "ShortName": "el-GR-AthinaNeural",
+ "Gender": "Female",
+ "Locale": "el-GR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Athina Online (Natural) - Greek (Greece)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, NestorasNeural)",
+ "ShortName": "el-GR-NestorasNeural",
+ "Gender": "Male",
+ "Locale": "el-GR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Nestoras Online (Natural) - Greek (Greece)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, DhwaniNeural)",
+ "ShortName": "gu-IN-DhwaniNeural",
+ "Gender": "Female",
+ "Locale": "gu-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Dhwani Online (Natural) - Gujarati (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, NiranjanNeural)",
+ "ShortName": "gu-IN-NiranjanNeural",
+ "Gender": "Male",
+ "Locale": "gu-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Niranjan Online (Natural) - Gujarati (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, AvriNeural)",
+ "ShortName": "he-IL-AvriNeural",
+ "Gender": "Male",
+ "Locale": "he-IL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Avri Online (Natural) - Hebrew (Israel)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, HilaNeural)",
+ "ShortName": "he-IL-HilaNeural",
+ "Gender": "Female",
+ "Locale": "he-IL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Hila Online (Natural) - Hebrew (Israel)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, MadhurNeural)",
+ "ShortName": "hi-IN-MadhurNeural",
+ "Gender": "Male",
+ "Locale": "hi-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Madhur Online (Natural) - Hindi (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, SwaraNeural)",
+ "ShortName": "hi-IN-SwaraNeural",
+ "Gender": "Female",
+ "Locale": "hi-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Swara Online (Natural) - Hindi (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, NoemiNeural)",
+ "ShortName": "hu-HU-NoemiNeural",
+ "Gender": "Female",
+ "Locale": "hu-HU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Noemi Online (Natural) - Hungarian (Hungary)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, TamasNeural)",
+ "ShortName": "hu-HU-TamasNeural",
+ "Gender": "Male",
+ "Locale": "hu-HU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Tamas Online (Natural) - Hungarian (Hungary)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GudrunNeural)",
+ "ShortName": "is-IS-GudrunNeural",
+ "Gender": "Female",
+ "Locale": "is-IS",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gudrun Online (Natural) - Icelandic (Iceland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GunnarNeural)",
+ "ShortName": "is-IS-GunnarNeural",
+ "Gender": "Male",
+ "Locale": "is-IS",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gunnar Online (Natural) - Icelandic (Iceland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, ArdiNeural)",
+ "ShortName": "id-ID-ArdiNeural",
+ "Gender": "Male",
+ "Locale": "id-ID",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ardi Online (Natural) - Indonesian (Indonesia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, GadisNeural)",
+ "ShortName": "id-ID-GadisNeural",
+ "Gender": "Female",
+ "Locale": "id-ID",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gadis Online (Natural) - Indonesian (Indonesia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, ColmNeural)",
+ "ShortName": "ga-IE-ColmNeural",
+ "Gender": "Male",
+ "Locale": "ga-IE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Colm Online (Natural) - Irish (Ireland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, OrlaNeural)",
+ "ShortName": "ga-IE-OrlaNeural",
+ "Gender": "Female",
+ "Locale": "ga-IE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Orla Online (Natural) - Irish (Ireland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, GiuseppeNeural)",
+ "ShortName": "it-IT-GiuseppeNeural",
+ "Gender": "Male",
+ "Locale": "it-IT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Giuseppe Online (Natural) - Italian (Italy)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, DiegoNeural)",
+ "ShortName": "it-IT-DiegoNeural",
+ "Gender": "Male",
+ "Locale": "it-IT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Diego Online (Natural) - Italian (Italy)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)",
+ "ShortName": "it-IT-ElsaNeural",
+ "Gender": "Female",
+ "Locale": "it-IT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Elsa Online (Natural) - Italian (Italy)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, IsabellaNeural)",
+ "ShortName": "it-IT-IsabellaNeural",
+ "Gender": "Female",
+ "Locale": "it-IT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Isabella Online (Natural) - Italian (Italy)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, KeitaNeural)",
+ "ShortName": "ja-JP-KeitaNeural",
+ "Gender": "Male",
+ "Locale": "ja-JP",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Keita Online (Natural) - Japanese (Japan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, NanamiNeural)",
+ "ShortName": "ja-JP-NanamiNeural",
+ "Gender": "Female",
+ "Locale": "ja-JP",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Nanami Online (Natural) - Japanese (Japan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, DimasNeural)",
+ "ShortName": "jv-ID-DimasNeural",
+ "Gender": "Male",
+ "Locale": "jv-ID",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Dimas Online (Natural) - Javanese (Indonesia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, SitiNeural)",
+ "ShortName": "jv-ID-SitiNeural",
+ "Gender": "Female",
+ "Locale": "jv-ID",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Siti Online (Natural) - Javanese (Indonesia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, GaganNeural)",
+ "ShortName": "kn-IN-GaganNeural",
+ "Gender": "Male",
+ "Locale": "kn-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gagan Online (Natural) - Kannada (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, SapnaNeural)",
+ "ShortName": "kn-IN-SapnaNeural",
+ "Gender": "Female",
+ "Locale": "kn-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sapna Online (Natural) - Kannada (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, AigulNeural)",
+ "ShortName": "kk-KZ-AigulNeural",
+ "Gender": "Female",
+ "Locale": "kk-KZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Aigul Online (Natural) - Kazakh (Kazakhstan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, DauletNeural)",
+ "ShortName": "kk-KZ-DauletNeural",
+ "Gender": "Male",
+ "Locale": "kk-KZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Daulet Online (Natural) - Kazakh (Kazakhstan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, PisethNeural)",
+ "ShortName": "km-KH-PisethNeural",
+ "Gender": "Male",
+ "Locale": "km-KH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Piseth Online (Natural) - Khmer (Cambodia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, SreymomNeural)",
+ "ShortName": "km-KH-SreymomNeural",
+ "Gender": "Female",
+ "Locale": "km-KH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sreymom Online (Natural) - Khmer (Cambodia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, HyunsuNeural)",
+ "ShortName": "ko-KR-HyunsuNeural",
+ "Gender": "Male",
+ "Locale": "ko-KR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Hyunsu Online (Natural) - Korean (Korea)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, InJoonNeural)",
+ "ShortName": "ko-KR-InJoonNeural",
+ "Gender": "Male",
+ "Locale": "ko-KR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft InJoon Online (Natural) - Korean (Korea)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, SunHiNeural)",
+ "ShortName": "ko-KR-SunHiNeural",
+ "Gender": "Female",
+ "Locale": "ko-KR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft SunHi Online (Natural) - Korean (Korea)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, ChanthavongNeural)",
+ "ShortName": "lo-LA-ChanthavongNeural",
+ "Gender": "Male",
+ "Locale": "lo-LA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Chanthavong Online (Natural) - Lao (Laos)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, KeomanyNeural)",
+ "ShortName": "lo-LA-KeomanyNeural",
+ "Gender": "Female",
+ "Locale": "lo-LA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Keomany Online (Natural) - Lao (Laos)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, EveritaNeural)",
+ "ShortName": "lv-LV-EveritaNeural",
+ "Gender": "Female",
+ "Locale": "lv-LV",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Everita Online (Natural) - Latvian (Latvia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, NilsNeural)",
+ "ShortName": "lv-LV-NilsNeural",
+ "Gender": "Male",
+ "Locale": "lv-LV",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Nils Online (Natural) - Latvian (Latvia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, LeonasNeural)",
+ "ShortName": "lt-LT-LeonasNeural",
+ "Gender": "Male",
+ "Locale": "lt-LT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Leonas Online (Natural) - Lithuanian (Lithuania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, OnaNeural)",
+ "ShortName": "lt-LT-OnaNeural",
+ "Gender": "Female",
+ "Locale": "lt-LT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ona Online (Natural) - Lithuanian (Lithuania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, AleksandarNeural)",
+ "ShortName": "mk-MK-AleksandarNeural",
+ "Gender": "Male",
+ "Locale": "mk-MK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Aleksandar Online (Natural) - Macedonian (Republic of North Macedonia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, MarijaNeural)",
+ "ShortName": "mk-MK-MarijaNeural",
+ "Gender": "Female",
+ "Locale": "mk-MK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Marija Online (Natural) - Macedonian (Republic of North Macedonia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, OsmanNeural)",
+ "ShortName": "ms-MY-OsmanNeural",
+ "Gender": "Male",
+ "Locale": "ms-MY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Osman Online (Natural) - Malay (Malaysia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, YasminNeural)",
+ "ShortName": "ms-MY-YasminNeural",
+ "Gender": "Female",
+ "Locale": "ms-MY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yasmin Online (Natural) - Malay (Malaysia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, MidhunNeural)",
+ "ShortName": "ml-IN-MidhunNeural",
+ "Gender": "Male",
+ "Locale": "ml-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Midhun Online (Natural) - Malayalam (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, SobhanaNeural)",
+ "ShortName": "ml-IN-SobhanaNeural",
+ "Gender": "Female",
+ "Locale": "ml-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sobhana Online (Natural) - Malayalam (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, GraceNeural)",
+ "ShortName": "mt-MT-GraceNeural",
+ "Gender": "Female",
+ "Locale": "mt-MT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Grace Online (Natural) - Maltese (Malta)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, JosephNeural)",
+ "ShortName": "mt-MT-JosephNeural",
+ "Gender": "Male",
+ "Locale": "mt-MT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Joseph Online (Natural) - Maltese (Malta)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, AarohiNeural)",
+ "ShortName": "mr-IN-AarohiNeural",
+ "Gender": "Female",
+ "Locale": "mr-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Aarohi Online (Natural) - Marathi (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, ManoharNeural)",
+ "ShortName": "mr-IN-ManoharNeural",
+ "Gender": "Male",
+ "Locale": "mr-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Manohar Online (Natural) - Marathi (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, BataaNeural)",
+ "ShortName": "mn-MN-BataaNeural",
+ "Gender": "Male",
+ "Locale": "mn-MN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Bataa Online (Natural) - Mongolian (Mongolia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, YesuiNeural)",
+ "ShortName": "mn-MN-YesuiNeural",
+ "Gender": "Female",
+ "Locale": "mn-MN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yesui Online (Natural) - Mongolian (Mongolia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, HemkalaNeural)",
+ "ShortName": "ne-NP-HemkalaNeural",
+ "Gender": "Female",
+ "Locale": "ne-NP",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Hemkala Online (Natural) - Nepali (Nepal)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, SagarNeural)",
+ "ShortName": "ne-NP-SagarNeural",
+ "Gender": "Male",
+ "Locale": "ne-NP",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sagar Online (Natural) - Nepali (Nepal)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, FinnNeural)",
+ "ShortName": "nb-NO-FinnNeural",
+ "Gender": "Male",
+ "Locale": "nb-NO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Finn Online (Natural) - Norwegian (Bokmål Norway)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, PernilleNeural)",
+ "ShortName": "nb-NO-PernilleNeural",
+ "Gender": "Female",
+ "Locale": "nb-NO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Pernille Online (Natural) - Norwegian (Bokmål, Norway)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, GulNawazNeural)",
+ "ShortName": "ps-AF-GulNawazNeural",
+ "Gender": "Male",
+ "Locale": "ps-AF",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft GulNawaz Online (Natural) - Pashto (Afghanistan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, LatifaNeural)",
+ "ShortName": "ps-AF-LatifaNeural",
+ "Gender": "Female",
+ "Locale": "ps-AF",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Latifa Online (Natural) - Pashto (Afghanistan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, DilaraNeural)",
+ "ShortName": "fa-IR-DilaraNeural",
+ "Gender": "Female",
+ "Locale": "fa-IR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Dilara Online (Natural) - Persian (Iran)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, FaridNeural)",
+ "ShortName": "fa-IR-FaridNeural",
+ "Gender": "Male",
+ "Locale": "fa-IR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Farid Online (Natural) - Persian (Iran)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, MarekNeural)",
+ "ShortName": "pl-PL-MarekNeural",
+ "Gender": "Male",
+ "Locale": "pl-PL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Marek Online (Natural) - Polish (Poland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, ZofiaNeural)",
+ "ShortName": "pl-PL-ZofiaNeural",
+ "Gender": "Female",
+ "Locale": "pl-PL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Zofia Online (Natural) - Polish (Poland)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, ThalitaNeural)",
+ "ShortName": "pt-BR-ThalitaNeural",
+ "Gender": "Female",
+ "Locale": "pt-BR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Thalita Online (Natural) - Portuguese (Brazil)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, AntonioNeural)",
+ "ShortName": "pt-BR-AntonioNeural",
+ "Gender": "Male",
+ "Locale": "pt-BR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Antonio Online (Natural) - Portuguese (Brazil)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, FranciscaNeural)",
+ "ShortName": "pt-BR-FranciscaNeural",
+ "Gender": "Female",
+ "Locale": "pt-BR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Francisca Online (Natural) - Portuguese (Brazil)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, DuarteNeural)",
+ "ShortName": "pt-PT-DuarteNeural",
+ "Gender": "Male",
+ "Locale": "pt-PT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Duarte Online (Natural) - Portuguese (Portugal)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, RaquelNeural)",
+ "ShortName": "pt-PT-RaquelNeural",
+ "Gender": "Female",
+ "Locale": "pt-PT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Raquel Online (Natural) - Portuguese (Portugal)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, AlinaNeural)",
+ "ShortName": "ro-RO-AlinaNeural",
+ "Gender": "Female",
+ "Locale": "ro-RO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Alina Online (Natural) - Romanian (Romania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, EmilNeural)",
+ "ShortName": "ro-RO-EmilNeural",
+ "Gender": "Male",
+ "Locale": "ro-RO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Emil Online (Natural) - Romanian (Romania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, DmitryNeural)",
+ "ShortName": "ru-RU-DmitryNeural",
+ "Gender": "Male",
+ "Locale": "ru-RU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Dmitry Online (Natural) - Russian (Russia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, SvetlanaNeural)",
+ "ShortName": "ru-RU-SvetlanaNeural",
+ "Gender": "Female",
+ "Locale": "ru-RU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Svetlana Online (Natural) - Russian (Russia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, NicholasNeural)",
+ "ShortName": "sr-RS-NicholasNeural",
+ "Gender": "Male",
+ "Locale": "sr-RS",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Nicholas Online (Natural) - Serbian (Serbia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, SophieNeural)",
+ "ShortName": "sr-RS-SophieNeural",
+ "Gender": "Female",
+ "Locale": "sr-RS",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sophie Online (Natural) - Serbian (Serbia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, SameeraNeural)",
+ "ShortName": "si-LK-SameeraNeural",
+ "Gender": "Male",
+ "Locale": "si-LK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sameera Online (Natural) - Sinhala (Sri Lanka)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, ThiliniNeural)",
+ "ShortName": "si-LK-ThiliniNeural",
+ "Gender": "Female",
+ "Locale": "si-LK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Thilini Online (Natural) - Sinhala (Sri Lanka)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, LukasNeural)",
+ "ShortName": "sk-SK-LukasNeural",
+ "Gender": "Male",
+ "Locale": "sk-SK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Lukas Online (Natural) - Slovak (Slovakia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, ViktoriaNeural)",
+ "ShortName": "sk-SK-ViktoriaNeural",
+ "Gender": "Female",
+ "Locale": "sk-SK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Viktoria Online (Natural) - Slovak (Slovakia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, PetraNeural)",
+ "ShortName": "sl-SI-PetraNeural",
+ "Gender": "Female",
+ "Locale": "sl-SI",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Petra Online (Natural) - Slovenian (Slovenia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, RokNeural)",
+ "ShortName": "sl-SI-RokNeural",
+ "Gender": "Male",
+ "Locale": "sl-SI",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Rok Online (Natural) - Slovenian (Slovenia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, MuuseNeural)",
+ "ShortName": "so-SO-MuuseNeural",
+ "Gender": "Male",
+ "Locale": "so-SO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Muuse Online (Natural) - Somali (Somalia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, UbaxNeural)",
+ "ShortName": "so-SO-UbaxNeural",
+ "Gender": "Female",
+ "Locale": "so-SO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ubax Online (Natural) - Somali (Somalia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, ElenaNeural)",
+ "ShortName": "es-AR-ElenaNeural",
+ "Gender": "Female",
+ "Locale": "es-AR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Elena Online (Natural) - Spanish (Argentina)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, TomasNeural)",
+ "ShortName": "es-AR-TomasNeural",
+ "Gender": "Male",
+ "Locale": "es-AR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Tomas Online (Natural) - Spanish (Argentina)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, MarceloNeural)",
+ "ShortName": "es-BO-MarceloNeural",
+ "Gender": "Male",
+ "Locale": "es-BO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Marcelo Online (Natural) - Spanish (Bolivia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, SofiaNeural)",
+ "ShortName": "es-BO-SofiaNeural",
+ "Gender": "Female",
+ "Locale": "es-BO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sofia Online (Natural) - Spanish (Bolivia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, CatalinaNeural)",
+ "ShortName": "es-CL-CatalinaNeural",
+ "Gender": "Female",
+ "Locale": "es-CL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Catalina Online (Natural) - Spanish (Chile)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, LorenzoNeural)",
+ "ShortName": "es-CL-LorenzoNeural",
+ "Gender": "Male",
+ "Locale": "es-CL",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Lorenzo Online (Natural) - Spanish (Chile)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, XimenaNeural)",
+ "ShortName": "es-ES-XimenaNeural",
+ "Gender": "Female",
+ "Locale": "es-ES",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ximena Online (Natural) - Spanish (Colombia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, GonzaloNeural)",
+ "ShortName": "es-CO-GonzaloNeural",
+ "Gender": "Male",
+ "Locale": "es-CO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gonzalo Online (Natural) - Spanish (Colombia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, SalomeNeural)",
+ "ShortName": "es-CO-SalomeNeural",
+ "Gender": "Female",
+ "Locale": "es-CO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Salome Online (Natural) - Spanish (Colombia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, JuanNeural)",
+ "ShortName": "es-CR-JuanNeural",
+ "Gender": "Male",
+ "Locale": "es-CR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Juan Online (Natural) - Spanish (Costa Rica)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, MariaNeural)",
+ "ShortName": "es-CR-MariaNeural",
+ "Gender": "Female",
+ "Locale": "es-CR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Maria Online (Natural) - Spanish (Costa Rica)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, BelkysNeural)",
+ "ShortName": "es-CU-BelkysNeural",
+ "Gender": "Female",
+ "Locale": "es-CU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Belkys Online (Natural) - Spanish (Cuba)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, ManuelNeural)",
+ "ShortName": "es-CU-ManuelNeural",
+ "Gender": "Male",
+ "Locale": "es-CU",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Manuel Online (Natural) - Spanish (Cuba)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, EmilioNeural)",
+ "ShortName": "es-DO-EmilioNeural",
+ "Gender": "Male",
+ "Locale": "es-DO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Emilio Online (Natural) - Spanish (Dominican Republic)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, RamonaNeural)",
+ "ShortName": "es-DO-RamonaNeural",
+ "Gender": "Female",
+ "Locale": "es-DO",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ramona Online (Natural) - Spanish (Dominican Republic)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, AndreaNeural)",
+ "ShortName": "es-EC-AndreaNeural",
+ "Gender": "Female",
+ "Locale": "es-EC",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Andrea Online (Natural) - Spanish (Ecuador)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, LuisNeural)",
+ "ShortName": "es-EC-LuisNeural",
+ "Gender": "Male",
+ "Locale": "es-EC",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Luis Online (Natural) - Spanish (Ecuador)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, LorenaNeural)",
+ "ShortName": "es-SV-LorenaNeural",
+ "Gender": "Female",
+ "Locale": "es-SV",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Lorena Online (Natural) - Spanish (El Salvador)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, RodrigoNeural)",
+ "ShortName": "es-SV-RodrigoNeural",
+ "Gender": "Male",
+ "Locale": "es-SV",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Rodrigo Online (Natural) - Spanish (El Salvador)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, JavierNeural)",
+ "ShortName": "es-GQ-JavierNeural",
+ "Gender": "Male",
+ "Locale": "es-GQ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Javier Online (Natural) - Spanish (Equatorial Guinea)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, TeresaNeural)",
+ "ShortName": "es-GQ-TeresaNeural",
+ "Gender": "Female",
+ "Locale": "es-GQ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Teresa Online (Natural) - Spanish (Equatorial Guinea)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, AndresNeural)",
+ "ShortName": "es-GT-AndresNeural",
+ "Gender": "Male",
+ "Locale": "es-GT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Andres Online (Natural) - Spanish (Guatemala)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, MartaNeural)",
+ "ShortName": "es-GT-MartaNeural",
+ "Gender": "Female",
+ "Locale": "es-GT",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Marta Online (Natural) - Spanish (Guatemala)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, CarlosNeural)",
+ "ShortName": "es-HN-CarlosNeural",
+ "Gender": "Male",
+ "Locale": "es-HN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Carlos Online (Natural) - Spanish (Honduras)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, KarlaNeural)",
+ "ShortName": "es-HN-KarlaNeural",
+ "Gender": "Female",
+ "Locale": "es-HN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Karla Online (Natural) - Spanish (Honduras)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)",
+ "ShortName": "es-MX-DaliaNeural",
+ "Gender": "Female",
+ "Locale": "es-MX",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Dalia Online (Natural) - Spanish (Mexico)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)",
+ "ShortName": "es-MX-JorgeNeural",
+ "Gender": "Male",
+ "Locale": "es-MX",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jorge Online (Natural) - Spanish (Mexico)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, FedericoNeural)",
+ "ShortName": "es-NI-FedericoNeural",
+ "Gender": "Male",
+ "Locale": "es-NI",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Federico Online (Natural) - Spanish (Nicaragua)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, YolandaNeural)",
+ "ShortName": "es-NI-YolandaNeural",
+ "Gender": "Female",
+ "Locale": "es-NI",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Yolanda Online (Natural) - Spanish (Nicaragua)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, MargaritaNeural)",
+ "ShortName": "es-PA-MargaritaNeural",
+ "Gender": "Female",
+ "Locale": "es-PA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Margarita Online (Natural) - Spanish (Panama)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, RobertoNeural)",
+ "ShortName": "es-PA-RobertoNeural",
+ "Gender": "Male",
+ "Locale": "es-PA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Roberto Online (Natural) - Spanish (Panama)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, MarioNeural)",
+ "ShortName": "es-PY-MarioNeural",
+ "Gender": "Male",
+ "Locale": "es-PY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Mario Online (Natural) - Spanish (Paraguay)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, TaniaNeural)",
+ "ShortName": "es-PY-TaniaNeural",
+ "Gender": "Female",
+ "Locale": "es-PY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Tania Online (Natural) - Spanish (Paraguay)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, AlexNeural)",
+ "ShortName": "es-PE-AlexNeural",
+ "Gender": "Male",
+ "Locale": "es-PE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Alex Online (Natural) - Spanish (Peru)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, CamilaNeural)",
+ "ShortName": "es-PE-CamilaNeural",
+ "Gender": "Female",
+ "Locale": "es-PE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Camila Online (Natural) - Spanish (Peru)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, KarinaNeural)",
+ "ShortName": "es-PR-KarinaNeural",
+ "Gender": "Female",
+ "Locale": "es-PR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Karina Online (Natural) - Spanish (Puerto Rico)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, VictorNeural)",
+ "ShortName": "es-PR-VictorNeural",
+ "Gender": "Male",
+ "Locale": "es-PR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Victor Online (Natural) - Spanish (Puerto Rico)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, AlvaroNeural)",
+ "ShortName": "es-ES-AlvaroNeural",
+ "Gender": "Male",
+ "Locale": "es-ES",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Alvaro Online (Natural) - Spanish (Spain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)",
+ "ShortName": "es-ES-ElviraNeural",
+ "Gender": "Female",
+ "Locale": "es-ES",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Elvira Online (Natural) - Spanish (Spain)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-US, AlonsoNeural)",
+ "ShortName": "es-US-AlonsoNeural",
+ "Gender": "Male",
+ "Locale": "es-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Alonso Online (Natural) - Spanish (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-US, PalomaNeural)",
+ "ShortName": "es-US-PalomaNeural",
+ "Gender": "Female",
+ "Locale": "es-US",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Paloma Online (Natural) - Spanish (United States)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, MateoNeural)",
+ "ShortName": "es-UY-MateoNeural",
+ "Gender": "Male",
+ "Locale": "es-UY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Mateo Online (Natural) - Spanish (Uruguay)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, ValentinaNeural)",
+ "ShortName": "es-UY-ValentinaNeural",
+ "Gender": "Female",
+ "Locale": "es-UY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Valentina Online (Natural) - Spanish (Uruguay)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, PaolaNeural)",
+ "ShortName": "es-VE-PaolaNeural",
+ "Gender": "Female",
+ "Locale": "es-VE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Paola Online (Natural) - Spanish (Venezuela)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, SebastianNeural)",
+ "ShortName": "es-VE-SebastianNeural",
+ "Gender": "Male",
+ "Locale": "es-VE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sebastian Online (Natural) - Spanish (Venezuela)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, JajangNeural)",
+ "ShortName": "su-ID-JajangNeural",
+ "Gender": "Male",
+ "Locale": "su-ID",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Jajang Online (Natural) - Sundanese (Indonesia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, TutiNeural)",
+ "ShortName": "su-ID-TutiNeural",
+ "Gender": "Female",
+ "Locale": "su-ID",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Tuti Online (Natural) - Sundanese (Indonesia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, RafikiNeural)",
+ "ShortName": "sw-KE-RafikiNeural",
+ "Gender": "Male",
+ "Locale": "sw-KE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Rafiki Online (Natural) - Swahili (Kenya)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, ZuriNeural)",
+ "ShortName": "sw-KE-ZuriNeural",
+ "Gender": "Female",
+ "Locale": "sw-KE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Zuri Online (Natural) - Swahili (Kenya)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, DaudiNeural)",
+ "ShortName": "sw-TZ-DaudiNeural",
+ "Gender": "Male",
+ "Locale": "sw-TZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Daudi Online (Natural) - Swahili (Tanzania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, RehemaNeural)",
+ "ShortName": "sw-TZ-RehemaNeural",
+ "Gender": "Female",
+ "Locale": "sw-TZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Rehema Online (Natural) - Swahili (Tanzania)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, MattiasNeural)",
+ "ShortName": "sv-SE-MattiasNeural",
+ "Gender": "Male",
+ "Locale": "sv-SE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Mattias Online (Natural) - Swedish (Sweden)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, SofieNeural)",
+ "ShortName": "sv-SE-SofieNeural",
+ "Gender": "Female",
+ "Locale": "sv-SE",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sofie Online (Natural) - Swedish (Sweden)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, PallaviNeural)",
+ "ShortName": "ta-IN-PallaviNeural",
+ "Gender": "Female",
+ "Locale": "ta-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Pallavi Online (Natural) - Tamil (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, ValluvarNeural)",
+ "ShortName": "ta-IN-ValluvarNeural",
+ "Gender": "Male",
+ "Locale": "ta-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Valluvar Online (Natural) - Tamil (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, KaniNeural)",
+ "ShortName": "ta-MY-KaniNeural",
+ "Gender": "Female",
+ "Locale": "ta-MY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Kani Online (Natural) - Tamil (Malaysia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, SuryaNeural)",
+ "ShortName": "ta-MY-SuryaNeural",
+ "Gender": "Male",
+ "Locale": "ta-MY",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Surya Online (Natural) - Tamil (Malaysia)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, AnbuNeural)",
+ "ShortName": "ta-SG-AnbuNeural",
+ "Gender": "Male",
+ "Locale": "ta-SG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Anbu Online (Natural) - Tamil (Singapore)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, VenbaNeural)",
+ "ShortName": "ta-SG-VenbaNeural",
+ "Gender": "Female",
+ "Locale": "ta-SG",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Venba Online (Natural) - Tamil (Singapore)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, KumarNeural)",
+ "ShortName": "ta-LK-KumarNeural",
+ "Gender": "Male",
+ "Locale": "ta-LK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Kumar Online (Natural) - Tamil (Sri Lanka)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, SaranyaNeural)",
+ "ShortName": "ta-LK-SaranyaNeural",
+ "Gender": "Female",
+ "Locale": "ta-LK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Saranya Online (Natural) - Tamil (Sri Lanka)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, MohanNeural)",
+ "ShortName": "te-IN-MohanNeural",
+ "Gender": "Male",
+ "Locale": "te-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Mohan Online (Natural) - Telugu (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, ShrutiNeural)",
+ "ShortName": "te-IN-ShrutiNeural",
+ "Gender": "Female",
+ "Locale": "te-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Shruti Online (Natural) - Telugu (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, NiwatNeural)",
+ "ShortName": "th-TH-NiwatNeural",
+ "Gender": "Male",
+ "Locale": "th-TH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Niwat Online (Natural) - Thai (Thailand)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, PremwadeeNeural)",
+ "ShortName": "th-TH-PremwadeeNeural",
+ "Gender": "Female",
+ "Locale": "th-TH",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Premwadee Online (Natural) - Thai (Thailand)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, AhmetNeural)",
+ "ShortName": "tr-TR-AhmetNeural",
+ "Gender": "Male",
+ "Locale": "tr-TR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ahmet Online (Natural) - Turkish (Turkey)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, EmelNeural)",
+ "ShortName": "tr-TR-EmelNeural",
+ "Gender": "Female",
+ "Locale": "tr-TR",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Emel Online (Natural) - Turkish (Turkey)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, OstapNeural)",
+ "ShortName": "uk-UA-OstapNeural",
+ "Gender": "Male",
+ "Locale": "uk-UA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Ostap Online (Natural) - Ukrainian (Ukraine)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, PolinaNeural)",
+ "ShortName": "uk-UA-PolinaNeural",
+ "Gender": "Female",
+ "Locale": "uk-UA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Polina Online (Natural) - Ukrainian (Ukraine)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, GulNeural)",
+ "ShortName": "ur-IN-GulNeural",
+ "Gender": "Female",
+ "Locale": "ur-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Gul Online (Natural) - Urdu (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, SalmanNeural)",
+ "ShortName": "ur-IN-SalmanNeural",
+ "Gender": "Male",
+ "Locale": "ur-IN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Salman Online (Natural) - Urdu (India)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, AsadNeural)",
+ "ShortName": "ur-PK-AsadNeural",
+ "Gender": "Male",
+ "Locale": "ur-PK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Asad Online (Natural) - Urdu (Pakistan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, UzmaNeural)",
+ "ShortName": "ur-PK-UzmaNeural",
+ "Gender": "Female",
+ "Locale": "ur-PK",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Uzma Online (Natural) - Urdu (Pakistan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, MadinaNeural)",
+ "ShortName": "uz-UZ-MadinaNeural",
+ "Gender": "Female",
+ "Locale": "uz-UZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Madina Online (Natural) - Uzbek (Uzbekistan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, SardorNeural)",
+ "ShortName": "uz-UZ-SardorNeural",
+ "Gender": "Male",
+ "Locale": "uz-UZ",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Sardor Online (Natural) - Uzbek (Uzbekistan)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, HoaiMyNeural)",
+ "ShortName": "vi-VN-HoaiMyNeural",
+ "Gender": "Female",
+ "Locale": "vi-VN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft HoaiMy Online (Natural) - Vietnamese (Vietnam)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, NamMinhNeural)",
+ "ShortName": "vi-VN-NamMinhNeural",
+ "Gender": "Male",
+ "Locale": "vi-VN",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft NamMinh Online (Natural) - Vietnamese (Vietnam)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, AledNeural)",
+ "ShortName": "cy-GB-AledNeural",
+ "Gender": "Male",
+ "Locale": "cy-GB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Aled Online (Natural) - Welsh (United Kingdom)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, NiaNeural)",
+ "ShortName": "cy-GB-NiaNeural",
+ "Gender": "Female",
+ "Locale": "cy-GB",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Nia Online (Natural) - Welsh (United Kingdom)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThandoNeural)",
+ "ShortName": "zu-ZA-ThandoNeural",
+ "Gender": "Female",
+ "Locale": "zu-ZA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Thando Online (Natural) - Zulu (South Africa)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ },
+ {
+ "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThembaNeural)",
+ "ShortName": "zu-ZA-ThembaNeural",
+ "Gender": "Male",
+ "Locale": "zu-ZA",
+ "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+ "FriendlyName": "Microsoft Themba Online (Natural) - Zulu (South Africa)",
+ "Status": "GA",
+ "VoiceTag": {
+ "ContentCategories": [
+ "General"
+ ],
+ "VoicePersonalities": [
+ "Friendly",
+ "Positive"
+ ]
+ }
+ }
+]
\ No newline at end of file
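The catalog above is a flat JSON array of voice descriptors, so downstream code can filter it with ordinary list comprehensions. A minimal sketch of that pattern, assuming the array is saved as rvc/lib/tools/tts_voices.json (the storage path is an assumption, not something this patch specifies):

import json

# Hypothetical location of the catalog shown above; adjust to the real path.
VOICES_FILE = "rvc/lib/tools/tts_voices.json"

def list_voices(locale=None, gender=None):
    # Return the ShortName of every voice matching the optional filters.
    with open(VOICES_FILE, encoding="utf-8") as f:
        voices = json.load(f)
    return [
        v["ShortName"]
        for v in voices
        if (locale is None or v["Locale"] == locale)
        and (gender is None or v["Gender"] == gender)
    ]

# e.g. list_voices(locale="pt-BR") returns
# ["pt-BR-ThalitaNeural", "pt-BR-AntonioNeural", "pt-BR-FranciscaNeural"]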
diff --git a/rvc/lib/utils.py b/rvc/lib/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20286a2c7af337bde1b1abc924edaaa74835c6f
--- /dev/null
+++ b/rvc/lib/utils.py
@@ -0,0 +1,142 @@
+import os
+import sys
+import soxr
+import librosa
+import soundfile as sf
+import numpy as np
+import re
+import unicodedata
+import wget
+from torch import nn
+
+import logging
+from transformers import HubertModel
+import warnings
+
+# Remove this to see warnings about transformers models
+warnings.filterwarnings("ignore")
+
+logging.getLogger("fairseq").setLevel(logging.ERROR)
+logging.getLogger("faiss.loader").setLevel(logging.ERROR)
+logging.getLogger("transformers").setLevel(logging.ERROR)
+logging.getLogger("torch").setLevel(logging.ERROR)
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+base_path = os.path.join(now_dir, "rvc", "models", "formant", "stftpitchshift")
+stft = base_path + ".exe" if sys.platform == "win32" else base_path
+
+
+class HubertModelWithFinalProj(HubertModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+def load_audio(file, sample_rate):
+ try:
+ file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+ audio, sr = sf.read(file)
+ if len(audio.shape) > 1:
+ audio = librosa.to_mono(audio.T)
+ if sr != sample_rate:
+ audio = librosa.resample(
+ audio, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq"
+ )
+ except Exception as error:
+ raise RuntimeError(f"An error occurred loading the audio: {error}")
+
+ return audio.flatten()
+
+
+def load_audio_infer(
+ file,
+ sample_rate,
+ **kwargs,
+):
+ formant_shifting = kwargs.get("formant_shifting", False)
+ try:
+ file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+ if not os.path.isfile(file):
+ raise FileNotFoundError(f"File not found: {file}")
+ audio, sr = sf.read(file)
+ if len(audio.shape) > 1:
+ audio = librosa.to_mono(audio.T)
+ if sr != sample_rate:
+ audio = librosa.resample(
+ audio, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq"
+ )
+ if formant_shifting:
+ formant_qfrency = kwargs.get("formant_qfrency", 0.8)
+ formant_timbre = kwargs.get("formant_timbre", 0.8)
+
+ from stftpitchshift import StftPitchShift
+
+ pitchshifter = StftPitchShift(1024, 32, sample_rate)
+ audio = pitchshifter.shiftpitch(
+ audio,
+ factors=1,
+ quefrency=formant_qfrency * 1e-3,
+ distortion=formant_timbre,
+ )
+ except Exception as error:
+ raise RuntimeError(f"An error occurred loading the audio: {error}")
+ return np.array(audio).flatten()
+
+
+def format_title(title):
+ formatted_title = (
+ unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
+ )
+ formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
+ formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
+ formatted_title = re.sub(r"\s+", "_", formatted_title)
+ return formatted_title
+
+
+def load_embedding(embedder_model, custom_embedder=None):
+ embedder_root = os.path.join(now_dir, "rvc", "models", "embedders")
+ embedding_list = {
+ "contentvec": os.path.join(embedder_root, "contentvec"),
+ "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"),
+ "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"),
+ "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"),
+ }
+
+ online_embedders = {
+ "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin",
+ "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin",
+ "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin",
+ "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin",
+ }
+
+ config_files = {
+ "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json",
+ "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json",
+ "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json",
+ "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json",
+ }
+
+ if embedder_model == "custom":
+ if os.path.exists(custom_embedder):
+ model_path = custom_embedder
+ else:
+ print(f"Custom embedder not found: {custom_embedder}, using contentvec")
+ model_path = embedding_list["contentvec"]
+ else:
+ model_path = embedding_list[embedder_model]
+ bin_file = os.path.join(model_path, "pytorch_model.bin")
+ json_file = os.path.join(model_path, "config.json")
+ os.makedirs(model_path, exist_ok=True)
+ if not os.path.exists(bin_file):
+ url = online_embedders[embedder_model]
+ print(f"Downloading {url} to {model_path}...")
+ wget.download(url, out=bin_file)
+ if not os.path.exists(json_file):
+ url = config_files[embedder_model]
+ print(f"Downloading {url} to {model_path}...")
+ wget.download(url, out=json_file)
+
+ models = HubertModelWithFinalProj.from_pretrained(model_path)
+ return models
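A short usage sketch for the helpers in rvc/lib/utils.py. The import path and the sample file name are assumptions; load_embedding downloads the ContentVec weights from Hugging Face on first use, and the formant-shifting branch additionally requires the stftpitchshift package:

from rvc.lib.utils import format_title, load_audio_infer, load_embedding

# Resample a clip to 16 kHz mono, optionally applying formant shifting.
audio = load_audio_infer(
    "voice_sample.wav",   # illustrative file name
    16000,
    formant_shifting=True,
    formant_qfrency=1.0,
    formant_timbre=1.0,
)

# Fetch (and download if missing) the ContentVec embedder.
embedder = load_embedding("contentvec")

# Sanitize a title for use in file names.
print(format_title("Mi Canción – demo #1"))  # -> Mi_Cancion_demo_1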
diff --git a/rvc/lib/zluda.py b/rvc/lib/zluda.py
new file mode 100644
index 0000000000000000000000000000000000000000..45d59b78cb5804b11db649bdb0855063410cdb7f
--- /dev/null
+++ b/rvc/lib/zluda.py
@@ -0,0 +1,76 @@
+import torch
+
+if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"):
+
+ class STFT:
+ def __init__(self):
+ self.device = "cuda"
+ self.fourier_bases = {} # Cache for Fourier bases
+
+ def _get_fourier_basis(self, n_fft):
+ # Check if the basis for this n_fft is already cached
+ if n_fft in self.fourier_bases:
+ return self.fourier_bases[n_fft]
+ fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to(
+ self.device
+ )
+ # keep the first cutoff frequency bins and concatenate their real and imaginary parts
+ cutoff = n_fft // 2 + 1
+ fourier_basis = torch.cat(
+ [fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]], dim=0
+ )
+ # cache the tensor and return
+ self.fourier_bases[n_fft] = fourier_basis
+ return fourier_basis
+
+ def transform(self, input, n_fft, hop_length, window):
+ # fetch cached Fourier basis
+ fourier_basis = self._get_fourier_basis(n_fft)
+ # apply hann window to Fourier basis
+ fourier_basis = fourier_basis * window
+ # pad input to center with reflect
+ pad_amount = n_fft // 2
+ input = torch.nn.functional.pad(
+ input, (pad_amount, pad_amount), mode="reflect"
+ )
+ # separate input into n_fft-sized frames
+ input_frames = input.unfold(1, n_fft, hop_length).permute(0, 2, 1)
+ # apply fft to each frame
+ fourier_transform = torch.matmul(fourier_basis, input_frames)
+ cutoff = n_fft // 2 + 1
+ return torch.complex(
+ fourier_transform[:, :cutoff, :], fourier_transform[:, cutoff:, :]
+ )
+
+ stft = STFT()
+ _torch_stft = torch.stft
+
+ def z_stft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs):
+ # only optimizing a specific call from rvc.train.mel_processing.MultiScaleMelSpectrogramLoss
+        if (
+            kwargs.get("win_length") is None
+            and kwargs.get("center") is None
+            and kwargs.get("return_complex") is True
+        ):
+ # use GPU accelerated calculation
+ return stft.transform(
+ input, kwargs.get("n_fft"), kwargs.get("hop_length"), window
+ )
+ else:
+            # fall back to torch.stft on the CPU and move the result back to the input device
+ return _torch_stft(
+ input=input.cpu(), window=window.cpu(), *args, **kwargs
+ ).to(input.device)
+
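+    # No-op replacement for torch.jit.script: it attaches an empty graph and returns the function
+    # unscripted, so modules run eagerly.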
+ def z_jit(f, *_, **__):
+ f.graph = torch._C.Graph()
+ return f
+
+ # hijacks
+ torch.stft = z_stft
+ torch.jit.script = z_jit
+ # disabling unsupported cudnn
+ torch.backends.cudnn.enabled = False
+ torch.backends.cuda.enable_flash_sdp(False)
+ torch.backends.cuda.enable_math_sdp(True)
+ torch.backends.cuda.enable_mem_efficient_sdp(False)
diff --git a/rvc/models/embedders/.gitkeep b/rvc/models/embedders/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/rvc/models/embedders/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/rvc/models/embedders/embedders_custom/.gitkeep b/rvc/models/embedders/embedders_custom/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/rvc/models/embedders/embedders_custom/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/rvc/models/formant/.gitkeep b/rvc/models/formant/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/rvc/models/formant/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/rvc/models/predictors/.gitkeep b/rvc/models/predictors/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/rvc/models/pretraineds/.gitkeep b/rvc/models/pretraineds/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/rvc/models/pretraineds/custom/.gitkeep b/rvc/models/pretraineds/custom/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/rvc/models/pretraineds/custom/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/rvc/models/pretraineds/hifi-gan/.gitkeep b/rvc/models/pretraineds/hifi-gan/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/rvc/train/data_utils.py b/rvc/train/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b33d718030a702c347766ab8619f77859790c2c
--- /dev/null
+++ b/rvc/train/data_utils.py
@@ -0,0 +1,379 @@
+import os
+import numpy as np
+import torch
+import torch.utils.data
+
+from mel_processing import spectrogram_torch
+from utils import load_filepaths_and_text, load_wav_to_torch
+
+
+class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
+ """
+ Dataset that loads text and audio pairs.
+
+ Args:
+ hparams: Hyperparameters.
+ """
+
+ def __init__(self, hparams):
+ self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files)
+ self.max_wav_value = hparams.max_wav_value
+ self.sample_rate = hparams.sample_rate
+ self.filter_length = hparams.filter_length
+ self.hop_length = hparams.hop_length
+ self.win_length = hparams.win_length
+ self.min_text_len = getattr(hparams, "min_text_len", 1)
+ self.max_text_len = getattr(hparams, "max_text_len", 5000)
+ self._filter()
+
+ def _filter(self):
+ """
+ Filters audio paths and text pairs based on text length.
+ """
+ audiopaths_and_text_new = []
+ lengths = []
+ for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
+ if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
+ audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv])
+ lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
+ self.audiopaths_and_text = audiopaths_and_text_new
+ self.lengths = lengths
+
+ def get_sid(self, sid):
+ """
+ Converts speaker ID to a LongTensor.
+
+ Args:
+ sid (str): Speaker ID.
+ """
+ try:
+ sid = torch.LongTensor([int(sid)])
+ except ValueError as error:
+ print(f"Error converting speaker ID '{sid}' to integer. Exception: {error}")
+ sid = torch.LongTensor([0])
+ return sid
+
+ def get_audio_text_pair(self, audiopath_and_text):
+ """
+ Loads and processes audio and text data for a single pair.
+
+ Args:
+ audiopath_and_text (list): List containing audio path, text, pitch, pitchf, and speaker ID.
+ """
+ file = audiopath_and_text[0]
+ phone = audiopath_and_text[1]
+ pitch = audiopath_and_text[2]
+ pitchf = audiopath_and_text[3]
+ dv = audiopath_and_text[4]
+
+ phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
+ spec, wav = self.get_audio(file)
+ dv = self.get_sid(dv)
+
+ len_phone = phone.size()[0]
+ len_spec = spec.size()[-1]
+ if len_phone != len_spec:
+ len_min = min(len_phone, len_spec)
+ len_wav = len_min * self.hop_length
+
+ spec = spec[:, :len_min]
+ wav = wav[:, :len_wav]
+
+ phone = phone[:len_min, :]
+ pitch = pitch[:len_min]
+ pitchf = pitchf[:len_min]
+
+ return (spec, wav, phone, pitch, pitchf, dv)
+
+ def get_labels(self, phone, pitch, pitchf):
+ """
+ Loads and processes phoneme, pitch, and pitchf labels.
+
+ Args:
+ phone (str): Path to phoneme label file.
+ pitch (str): Path to pitch label file.
+ pitchf (str): Path to pitchf label file.
+ """
+ phone = np.load(phone)
+ phone = np.repeat(phone, 2, axis=0)
+ pitch = np.load(pitch)
+ pitchf = np.load(pitchf)
+ n_num = min(phone.shape[0], 900)
+ phone = phone[:n_num, :]
+ pitch = pitch[:n_num]
+ pitchf = pitchf[:n_num]
+ phone = torch.FloatTensor(phone)
+ pitch = torch.LongTensor(pitch)
+ pitchf = torch.FloatTensor(pitchf)
+ return phone, pitch, pitchf
+
+ def get_audio(self, filename):
+ """
+ Loads and processes audio data.
+
+ Args:
+ filename (str): Path to audio file.
+ """
+ audio, sample_rate = load_wav_to_torch(filename)
+ if sample_rate != self.sample_rate:
+ raise ValueError(
+ f"{sample_rate} SR doesn't match target {self.sample_rate} SR"
+ )
+ audio_norm = audio
+ audio_norm = audio_norm.unsqueeze(0)
+ spec_filename = filename.replace(".wav", ".spec.pt")
+ if os.path.exists(spec_filename):
+ try:
+ spec = torch.load(spec_filename, weights_only=True)
+ except Exception as error:
+ print(f"An error occurred getting spec from {spec_filename}: {error}")
+ spec = spectrogram_torch(
+ audio_norm,
+ self.filter_length,
+ self.hop_length,
+ self.win_length,
+ center=False,
+ )
+ spec = torch.squeeze(spec, 0)
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
+ else:
+ spec = spectrogram_torch(
+ audio_norm,
+ self.filter_length,
+ self.hop_length,
+ self.win_length,
+ center=False,
+ )
+ spec = torch.squeeze(spec, 0)
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
+ return spec, audio_norm
+
+ def __getitem__(self, index):
+ """
+ Returns a single audio-text pair.
+
+ Args:
+ index (int): Index of the data sample.
+ """
+ return self.get_audio_text_pair(self.audiopaths_and_text[index])
+
+ def __len__(self):
+ """
+ Returns the length of the dataset.
+ """
+ return len(self.audiopaths_and_text)
+
+
+class TextAudioCollateMultiNSFsid:
+ """
+ Collates text and audio data for training.
+
+ Args:
+ return_ids (bool, optional): Whether to return sample IDs. Defaults to False.
+ """
+
+ def __init__(self, return_ids=False):
+ self.return_ids = return_ids
+
+ def __call__(self, batch):
+ """
+ Collates a batch of data samples.
+
+ Args:
+ batch (list): List of data samples.
+ """
+ _, ids_sorted_decreasing = torch.sort(
+ torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
+ )
+
+ max_spec_len = max([x[0].size(1) for x in batch])
+ max_wave_len = max([x[1].size(1) for x in batch])
+ spec_lengths = torch.LongTensor(len(batch))
+ wave_lengths = torch.LongTensor(len(batch))
+ spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
+ wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
+ spec_padded.zero_()
+ wave_padded.zero_()
+
+ max_phone_len = max([x[2].size(0) for x in batch])
+ phone_lengths = torch.LongTensor(len(batch))
+ phone_padded = torch.FloatTensor(
+ len(batch), max_phone_len, batch[0][2].shape[1]
+ )
+ pitch_padded = torch.LongTensor(len(batch), max_phone_len)
+ pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
+ phone_padded.zero_()
+ pitch_padded.zero_()
+ pitchf_padded.zero_()
+ sid = torch.LongTensor(len(batch))
+
+ for i in range(len(ids_sorted_decreasing)):
+ row = batch[ids_sorted_decreasing[i]]
+
+ spec = row[0]
+ spec_padded[i, :, : spec.size(1)] = spec
+ spec_lengths[i] = spec.size(1)
+
+ wave = row[1]
+ wave_padded[i, :, : wave.size(1)] = wave
+ wave_lengths[i] = wave.size(1)
+
+ phone = row[2]
+ phone_padded[i, : phone.size(0), :] = phone
+ phone_lengths[i] = phone.size(0)
+
+ pitch = row[3]
+ pitch_padded[i, : pitch.size(0)] = pitch
+ pitchf = row[4]
+ pitchf_padded[i, : pitchf.size(0)] = pitchf
+
+ sid[i] = row[5]
+
+ return (
+ phone_padded,
+ phone_lengths,
+ pitch_padded,
+ pitchf_padded,
+ spec_padded,
+ spec_lengths,
+ wave_padded,
+ wave_lengths,
+ sid,
+ )
+
+
+class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
+ """
+ Distributed sampler that groups data into buckets based on length.
+
+ Args:
+ dataset (torch.utils.data.Dataset): Dataset to sample from.
+ batch_size (int): Batch size.
+ boundaries (list): List of length boundaries for buckets.
+ num_replicas (int, optional): Number of processes participating in distributed training. Defaults to None.
+ rank (int, optional): Rank of the current process. Defaults to None.
+ shuffle (bool, optional): Whether to shuffle the data. Defaults to True.
+ """
+
+ def __init__(
+ self,
+ dataset,
+ batch_size,
+ boundaries,
+ num_replicas=None,
+ rank=None,
+ shuffle=True,
+ ):
+ super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+ self.lengths = dataset.lengths
+ self.batch_size = batch_size
+ self.boundaries = boundaries
+
+ self.buckets, self.num_samples_per_bucket = self._create_buckets()
+ self.total_size = sum(self.num_samples_per_bucket)
+ self.num_samples = self.total_size // self.num_replicas
+
+ def _create_buckets(self):
+ """
+ Creates buckets of data samples based on length.
+ """
+ buckets = [[] for _ in range(len(self.boundaries) - 1)]
+ for i in range(len(self.lengths)):
+ length = self.lengths[i]
+ idx_bucket = self._bisect(length)
+ if idx_bucket != -1:
+ buckets[idx_bucket].append(i)
+
+        # Remove empty buckets, iterating in reverse so earlier indices stay valid
+        for i in range(len(buckets) - 1, -1, -1):
+ if len(buckets[i]) == 0:
+ buckets.pop(i)
+ self.boundaries.pop(i + 1)
+
+ num_samples_per_bucket = []
+ for i in range(len(buckets)):
+ len_bucket = len(buckets[i])
+ total_batch_size = self.num_replicas * self.batch_size
+ rem = (
+ total_batch_size - (len_bucket % total_batch_size)
+ ) % total_batch_size
+ num_samples_per_bucket.append(len_bucket + rem)
+ return buckets, num_samples_per_bucket
+
+ def __iter__(self):
+ """
+ Iterates over batches of data samples.
+ """
+ g = torch.Generator()
+ g.manual_seed(self.epoch)
+
+ indices = []
+ if self.shuffle:
+ for bucket in self.buckets:
+ indices.append(torch.randperm(len(bucket), generator=g).tolist())
+ else:
+ for bucket in self.buckets:
+ indices.append(list(range(len(bucket))))
+
+ batches = []
+ for i in range(len(self.buckets)):
+ bucket = self.buckets[i]
+ len_bucket = len(bucket)
+ ids_bucket = indices[i]
+ num_samples_bucket = self.num_samples_per_bucket[i]
+
+ rem = num_samples_bucket - len_bucket
+ ids_bucket = (
+ ids_bucket
+ + ids_bucket * (rem // len_bucket)
+ + ids_bucket[: (rem % len_bucket)]
+ )
+
+ ids_bucket = ids_bucket[self.rank :: self.num_replicas]
+
+ # batching
+ for j in range(len(ids_bucket) // self.batch_size):
+ batch = [
+ bucket[idx]
+ for idx in ids_bucket[
+ j * self.batch_size : (j + 1) * self.batch_size
+ ]
+ ]
+ batches.append(batch)
+
+ if self.shuffle:
+ batch_ids = torch.randperm(len(batches), generator=g).tolist()
+ batches = [batches[i] for i in batch_ids]
+ self.batches = batches
+
+ assert len(self.batches) * self.batch_size == self.num_samples
+ return iter(self.batches)
+
+ def _bisect(self, x, lo=0, hi=None):
+ """
+ Performs binary search to find the bucket index for a given length.
+
+ Args:
+ x (int): Length to find the bucket for.
+ lo (int, optional): Lower bound of the search range. Defaults to 0.
+ hi (int, optional): Upper bound of the search range. Defaults to None.
+ """
+ if hi is None:
+ hi = len(self.boundaries) - 1
+
+ if hi > lo:
+ mid = (hi + lo) // 2
+ if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
+ return mid
+ elif x <= self.boundaries[mid]:
+ return self._bisect(x, lo, mid)
+ else:
+ return self._bisect(x, mid + 1, hi)
+ else:
+ return -1
+
+ def __len__(self):
+ """
+ Returns the length of the sampler.
+ """
+ return self.num_samples // self.batch_size
diff --git a/rvc/train/extract/extract.py b/rvc/train/extract/extract.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3d524027fbc3945eff8f1d82f7905f2c2f57ffb
--- /dev/null
+++ b/rvc/train/extract/extract.py
@@ -0,0 +1,248 @@
+import os
+import sys
+import glob
+import time
+import tqdm
+import torch
+import torchcrepe
+import numpy as np
+import concurrent.futures
+import multiprocessing as mp
+import json
+
+now_dir = os.getcwd()
+sys.path.append(os.path.join(now_dir))
+
+# Zluda hijack
+import rvc.lib.zluda
+
+from rvc.lib.utils import load_audio, load_embedding
+from rvc.train.extract.preparing_files import generate_config, generate_filelist
+from rvc.lib.predictors.RMVPE import RMVPE0Predictor
+from rvc.configs.config import Config
+
+# Load config
+config = Config()
+mp.set_start_method("spawn", force=True)
+
+
+class FeatureInput:
+ def __init__(self, sample_rate=16000, hop_size=160, device="cpu"):
+ self.fs = sample_rate
+ self.hop = hop_size
+ self.f0_bin = 256
+ self.f0_max = 1100.0
+ self.f0_min = 50.0
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+ self.device = device
+ self.model_rmvpe = None
+
+ def compute_f0(self, audio_array, method, hop_length):
+ if method == "crepe":
+ return self._get_crepe(audio_array, hop_length, type="full")
+ elif method == "crepe-tiny":
+ return self._get_crepe(audio_array, hop_length, type="tiny")
+ elif method == "rmvpe":
+ return self.model_rmvpe.infer_from_audio(audio_array, thred=0.03)
+
+ def _get_crepe(self, x, hop_length, type):
+ audio = torch.from_numpy(x.astype(np.float32)).to(self.device)
+ audio /= torch.quantile(torch.abs(audio), 0.999)
+ audio = audio.unsqueeze(0)
+ pitch = torchcrepe.predict(
+ audio,
+ self.fs,
+ hop_length,
+ self.f0_min,
+ self.f0_max,
+ type,
+ batch_size=hop_length * 2,
+ device=audio.device,
+ pad=True,
+ )
+ source = pitch.squeeze(0).cpu().float().numpy()
+ source[source < 0.001] = np.nan
+ return np.nan_to_num(
+ np.interp(
+ np.arange(0, len(source) * (x.size // self.hop), len(source))
+ / (x.size // self.hop),
+ np.arange(0, len(source)),
+ source,
+ )
+ )
+
+ def coarse_f0(self, f0):
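+        # Quantize f0 (Hz) to integer mel-scale bins clipped to [1, f0_bin - 1]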
+ f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0)
+ f0_mel = np.clip(
+ (f0_mel - self.f0_mel_min)
+ * (self.f0_bin - 2)
+ / (self.f0_mel_max - self.f0_mel_min)
+ + 1,
+ 1,
+ self.f0_bin - 1,
+ )
+ return np.rint(f0_mel).astype(int)
+
+ def process_file(self, file_info, f0_method, hop_length):
+ inp_path, opt_path_coarse, opt_path_full, _ = file_info
+ if os.path.exists(opt_path_coarse) and os.path.exists(opt_path_full):
+ return
+
+ try:
+ np_arr = load_audio(inp_path, self.fs)
+ feature_pit = self.compute_f0(np_arr, f0_method, hop_length)
+ np.save(opt_path_full, feature_pit, allow_pickle=False)
+ coarse_pit = self.coarse_f0(feature_pit)
+ np.save(opt_path_coarse, coarse_pit, allow_pickle=False)
+ except Exception as error:
+ print(
+ f"An error occurred extracting file {inp_path} on {self.device}: {error}"
+ )
+
+ def process_files(self, files, f0_method, hop_length, device, threads):
+ self.device = device
+ if f0_method == "rmvpe":
+ self.model_rmvpe = RMVPE0Predictor(
+ os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
+ device=device,
+ )
+
+ def worker(file_info):
+ self.process_file(file_info, f0_method, hop_length)
+
+ with tqdm.tqdm(total=len(files), leave=True) as pbar:
+ with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
+ futures = [executor.submit(worker, f) for f in files]
+ for _ in concurrent.futures.as_completed(futures):
+ pbar.update(1)
+
+
+def run_pitch_extraction(files, devices, f0_method, hop_length, threads):
+ devices_str = ", ".join(devices)
+ print(
+        f"Starting pitch extraction with {threads} cores on {devices_str} using {f0_method}..."
+ )
+ start_time = time.time()
+ fe = FeatureInput()
+ with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor:
+ tasks = [
+ executor.submit(
+ fe.process_files,
+ files[i :: len(devices)],
+ f0_method,
+ hop_length,
+ devices[i],
+ threads // len(devices),
+ )
+ for i in range(len(devices))
+ ]
+ concurrent.futures.wait(tasks)
+
+ print(f"Pitch extraction completed in {time.time() - start_time:.2f} seconds.")
+
+
+def process_file_embedding(
+ files, embedder_model, embedder_model_custom, device_num, device, n_threads
+):
+ model = load_embedding(embedder_model, embedder_model_custom).to(device).float()
+ model.eval()
+ n_threads = max(1, n_threads)
+
+ def worker(file_info):
+ wav_file_path, _, _, out_file_path = file_info
+ if os.path.exists(out_file_path):
+ return
+ feats = torch.from_numpy(load_audio(wav_file_path, 16000)).to(device).float()
+ feats = feats.view(1, -1)
+ with torch.no_grad():
+ result = model(feats)["last_hidden_state"]
+ feats_out = result.squeeze(0).float().cpu().numpy()
+ if not np.isnan(feats_out).any():
+ np.save(out_file_path, feats_out, allow_pickle=False)
+ else:
+ print(f"{wav_file_path} produced NaN values; skipping.")
+
+ with tqdm.tqdm(total=len(files), leave=True, position=device_num) as pbar:
+ with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
+ futures = [executor.submit(worker, f) for f in files]
+ for _ in concurrent.futures.as_completed(futures):
+ pbar.update(1)
+
+
+def run_embedding_extraction(
+ files, devices, embedder_model, embedder_model_custom, threads
+):
+ devices_str = ", ".join(devices)
+ print(
+        f"Starting embedding extraction with {threads} cores on {devices_str}..."
+ )
+ start_time = time.time()
+ with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor:
+ tasks = [
+ executor.submit(
+ process_file_embedding,
+ files[i :: len(devices)],
+ embedder_model,
+ embedder_model_custom,
+ i,
+ devices[i],
+ threads // len(devices),
+ )
+ for i in range(len(devices))
+ ]
+ concurrent.futures.wait(tasks)
+
+ print(f"Embedding extraction completed in {time.time() - start_time:.2f} seconds.")
+
+
+if __name__ == "__main__":
+ exp_dir = sys.argv[1]
+ f0_method = sys.argv[2]
+ hop_length = int(sys.argv[3])
+ num_processes = int(sys.argv[4])
+ gpus = sys.argv[5]
+ sample_rate = sys.argv[6]
+ embedder_model = sys.argv[7]
+ embedder_model_custom = sys.argv[8] if len(sys.argv) > 8 else None
+ include_mutes = int(sys.argv[9]) if len(sys.argv) > 9 else 2
+
+ wav_path = os.path.join(exp_dir, "sliced_audios_16k")
+ os.makedirs(os.path.join(exp_dir, "f0"), exist_ok=True)
+ os.makedirs(os.path.join(exp_dir, "f0_voiced"), exist_ok=True)
+ os.makedirs(os.path.join(exp_dir, "extracted"), exist_ok=True)
+
+ chosen_embedder_model = (
+ embedder_model_custom if embedder_model == "custom" else embedder_model
+ )
+ file_path = os.path.join(exp_dir, "model_info.json")
+ if os.path.exists(file_path):
+ with open(file_path, "r") as f:
+ data = json.load(f)
+ else:
+ data = {}
+ data["embedder_model"] = chosen_embedder_model
+ with open(file_path, "w") as f:
+ json.dump(data, f, indent=4)
+
+ files = []
+ for file in glob.glob(os.path.join(wav_path, "*.wav")):
+ file_name = os.path.basename(file)
+ file_info = [
+ file,
+ os.path.join(exp_dir, "f0", file_name + ".npy"),
+ os.path.join(exp_dir, "f0_voiced", file_name + ".npy"),
+ os.path.join(exp_dir, "extracted", file_name.replace("wav", "npy")),
+ ]
+ files.append(file_info)
+
+ devices = ["cpu"] if gpus == "-" else [f"cuda:{idx}" for idx in gpus.split("-")]
+
+ run_pitch_extraction(files, devices, f0_method, hop_length, num_processes)
+
+ run_embedding_extraction(
+ files, devices, embedder_model, embedder_model_custom, num_processes
+ )
+
+ generate_config(sample_rate, exp_dir)
+ generate_filelist(exp_dir, sample_rate, include_mutes)
diff --git a/rvc/train/extract/preparing_files.py b/rvc/train/extract/preparing_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae4fc08dfb1ed0edbcda348b0f2f841176abd08
--- /dev/null
+++ b/rvc/train/extract/preparing_files.py
@@ -0,0 +1,75 @@
+import os
+import shutil
+from random import shuffle
+from rvc.configs.config import Config
+import json
+
+config = Config()
+current_directory = os.getcwd()
+
+
+def generate_config(sample_rate: int, model_path: str):
+ config_path = os.path.join("rvc", "configs", f"{sample_rate}.json")
+ config_save_path = os.path.join(model_path, "config.json")
+ if not os.path.exists(config_save_path):
+ shutil.copyfile(config_path, config_save_path)
+
+
+def generate_filelist(model_path: str, sample_rate: int, include_mutes: int = 2):
+ gt_wavs_dir = os.path.join(model_path, "sliced_audios")
+    feature_dir = os.path.join(model_path, "extracted")
+
+    f0_dir = os.path.join(model_path, "f0")
+    f0nsf_dir = os.path.join(model_path, "f0_voiced")
+
+ gt_wavs_files = set(name.split(".")[0] for name in os.listdir(gt_wavs_dir))
+ feature_files = set(name.split(".")[0] for name in os.listdir(feature_dir))
+
+ f0_files = set(name.split(".")[0] for name in os.listdir(f0_dir))
+ f0nsf_files = set(name.split(".")[0] for name in os.listdir(f0nsf_dir))
+ names = gt_wavs_files & feature_files & f0_files & f0nsf_files
+
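+    # Each filelist row: <gt_wav>.wav|<features>.npy|<coarse f0>.npy|<continuous f0>.npy|<speaker id>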
+ options = []
+ mute_base_path = os.path.join(current_directory, "logs", "mute")
+ sids = []
+ for name in names:
+ sid = name.split("_")[0]
+ if sid not in sids:
+ sids.append(sid)
+ options.append(
+ f"{os.path.join(gt_wavs_dir, name)}.wav|{os.path.join(feature_dir, name)}.npy|{os.path.join(f0_dir, name)}.wav.npy|{os.path.join(f0nsf_dir, name)}.wav.npy|{sid}"
+ )
+
+ if include_mutes > 0:
+ mute_audio_path = os.path.join(
+ mute_base_path, "sliced_audios", f"mute{sample_rate}.wav"
+ )
+        mute_feature_path = os.path.join(mute_base_path, "extracted", "mute.npy")
+ mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy")
+ mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy")
+
+ # adding x files per sid
+ for sid in sids * include_mutes:
+ options.append(
+ f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}"
+ )
+
+ file_path = os.path.join(model_path, "model_info.json")
+ if os.path.exists(file_path):
+ with open(file_path, "r") as f:
+ data = json.load(f)
+ else:
+ data = {}
+ data.update(
+ {
+ "speakers_id": len(sids),
+ }
+ )
+ with open(file_path, "w") as f:
+ json.dump(data, f, indent=4)
+
+ shuffle(options)
+
+ with open(os.path.join(model_path, "filelist.txt"), "w") as f:
+ f.write("\n".join(options))
diff --git a/rvc/train/losses.py b/rvc/train/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef3a2eb27b139a43a54ce5e074c557af01a0f417
--- /dev/null
+++ b/rvc/train/losses.py
@@ -0,0 +1,132 @@
+import torch
+
+
+def feature_loss(fmap_r, fmap_g):
+ """
+ Compute the feature loss between reference and generated feature maps.
+
+ Args:
+ fmap_r (list of torch.Tensor): List of reference feature maps.
+ fmap_g (list of torch.Tensor): List of generated feature maps.
+ """
+ return 2 * sum(
+ torch.mean(torch.abs(rl - gl))
+ for dr, dg in zip(fmap_r, fmap_g)
+ for rl, gl in zip(dr, dg)
+ )
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+ """
+ Compute the discriminator loss for real and generated outputs.
+
+ Args:
+ disc_real_outputs (list of torch.Tensor): List of discriminator outputs for real samples.
+ disc_generated_outputs (list of torch.Tensor): List of discriminator outputs for generated samples.
+ """
+ loss = 0
+ r_losses = []
+ g_losses = []
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+ r_loss = torch.mean((1 - dr.float()) ** 2)
+ g_loss = torch.mean(dg.float() ** 2)
+
+ # r_losses.append(r_loss.item())
+ # g_losses.append(g_loss.item())
+ loss += r_loss + g_loss
+
+ return loss, r_losses, g_losses
+
+
+def generator_loss(disc_outputs):
+ """
+ Compute the generator loss based on discriminator outputs.
+
+ Args:
+ disc_outputs (list of torch.Tensor): List of discriminator outputs for generated samples.
+ """
+ loss = 0
+ gen_losses = []
+ for dg in disc_outputs:
+ l = torch.mean((1 - dg.float()) ** 2)
+ # gen_losses.append(l.item())
+ loss += l
+
+ return loss, gen_losses
+
+
+def discriminator_loss_scaled(disc_real, disc_fake, scale=1.0):
+ """
+ Compute the scaled discriminator loss for real and generated outputs.
+
+ Args:
+ disc_real (list of torch.Tensor): List of discriminator outputs for real samples.
+ disc_fake (list of torch.Tensor): List of discriminator outputs for generated samples.
+ scale (float, optional): Scaling factor applied to losses beyond the midpoint. Default is 1.0.
+ """
+ midpoint = len(disc_real) // 2
+ losses = []
+ for i, (d_real, d_fake) in enumerate(zip(disc_real, disc_fake)):
+ real_loss = (1 - d_real).pow(2).mean()
+ fake_loss = d_fake.pow(2).mean()
+ total_loss = real_loss + fake_loss
+ if i >= midpoint:
+ total_loss *= scale
+ losses.append(total_loss)
+ loss = sum(losses)
+ return loss, None, None
+
+
+def generator_loss_scaled(disc_outputs, scale=1.0):
+ """
+ Compute the scaled generator loss based on discriminator outputs.
+
+ Args:
+ disc_outputs (list of torch.Tensor): List of discriminator outputs for generated samples.
+ scale (float, optional): Scaling factor applied to losses beyond the midpoint. Default is 1.0.
+ """
+ midpoint = len(disc_outputs) // 2
+ losses = []
+ for i, d_fake in enumerate(disc_outputs):
+ loss_value = (1 - d_fake).pow(2).mean()
+ if i >= midpoint:
+ loss_value *= scale
+ losses.append(loss_value)
+ loss = sum(losses)
+ return loss, None, None
+
+
+def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
+ """
+ Compute the Kullback-Leibler divergence loss.
+
+ Args:
+ z_p (torch.Tensor): Latent variable z_p [b, h, t_t].
+ logs_q (torch.Tensor): Log variance of q [b, h, t_t].
+ m_p (torch.Tensor): Mean of p [b, h, t_t].
+ logs_p (torch.Tensor): Log variance of p [b, h, t_t].
+ z_mask (torch.Tensor): Mask for the latent variables [b, h, t_t].
+ """
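+    # KL term between the posterior q (sample z_p, log-scale logs_q) and the prior p (m_p, logs_p),
+    # masked by z_mask and averaged over the valid elements.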
+ kl = logs_p - logs_q - 0.5 + 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2 * logs_p)
+ kl = (kl * z_mask).sum()
+ loss = kl / z_mask.sum()
+ return loss
diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4d51cbc102e8201ece9d7a1352256c54b263dde
--- /dev/null
+++ b/rvc/train/mel_processing.py
@@ -0,0 +1,234 @@
+import torch
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+ """
+    Dynamic range compression using the natural logarithm.
+
+ Args:
+ x (torch.Tensor): Input tensor.
+ C (float, optional): Scaling factor. Defaults to 1.
+ clip_val (float, optional): Minimum value for clamping. Defaults to 1e-5.
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+ """
+ Dynamic range decompression using exp.
+
+ Args:
+ x (torch.Tensor): Input tensor.
+ C (float, optional): Scaling factor. Defaults to 1.
+ """
+ return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):
+ """
+ Spectral normalization using dynamic range compression.
+
+ Args:
+ magnitudes (torch.Tensor): Magnitude spectrogram.
+ """
+ return dynamic_range_compression_torch(magnitudes)
+
+
+def spectral_de_normalize_torch(magnitudes):
+ """
+ Spectral de-normalization using dynamic range decompression.
+
+ Args:
+ magnitudes (torch.Tensor): Normalized spectrogram.
+ """
+ return dynamic_range_decompression_torch(magnitudes)
+
+
+mel_basis = {}
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, hop_size, win_size, center=False):
+ """
+ Compute the spectrogram of a signal using STFT.
+
+ Args:
+ y (torch.Tensor): Input signal.
+ n_fft (int): FFT window size.
+ hop_size (int): Hop size between frames.
+ win_size (int): Window size.
+ center (bool, optional): Whether to center the window. Defaults to False.
+ """
+ global hann_window
+ dtype_device = str(y.dtype) + "_" + str(y.device)
+ wnsize_dtype_device = str(win_size) + "_" + dtype_device
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
+ dtype=y.dtype, device=y.device
+ )
+
+ y = torch.nn.functional.pad(
+ y.unsqueeze(1),
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+ mode="reflect",
+ )
+ y = y.squeeze(1)
+
+ spec = torch.stft(
+ y,
+ n_fft=n_fft,
+ hop_length=hop_size,
+ win_length=win_size,
+ window=hann_window[wnsize_dtype_device],
+ center=center,
+ pad_mode="reflect",
+ normalized=False,
+ onesided=True,
+ return_complex=True,
+ )
+
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
+
+ return spec
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax):
+ """
+ Convert a spectrogram to a mel-spectrogram.
+
+ Args:
+ spec (torch.Tensor): Magnitude spectrogram.
+ n_fft (int): FFT window size.
+ num_mels (int): Number of mel frequency bins.
+ sample_rate (int): Sampling rate of the audio signal.
+ fmin (float): Minimum frequency.
+ fmax (float): Maximum frequency.
+ """
+ global mel_basis
+ dtype_device = str(spec.dtype) + "_" + str(spec.device)
+ fmax_dtype_device = str(fmax) + "_" + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(
+ sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
+ )
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
+ dtype=spec.dtype, device=spec.device
+ )
+
+ melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ melspec = spectral_normalize_torch(melspec)
+ return melspec
+
+
+def mel_spectrogram_torch(
+ y, n_fft, num_mels, sample_rate, hop_size, win_size, fmin, fmax, center=False
+):
+ """
+ Compute the mel-spectrogram of a signal.
+
+ Args:
+ y (torch.Tensor): Input signal.
+ n_fft (int): FFT window size.
+ num_mels (int): Number of mel frequency bins.
+ sample_rate (int): Sampling rate of the audio signal.
+ hop_size (int): Hop size between frames.
+ win_size (int): Window size.
+ fmin (float): Minimum frequency.
+ fmax (float): Maximum frequency.
+ center (bool, optional): Whether to center the window. Defaults to False.
+ """
+ spec = spectrogram_torch(y, n_fft, hop_size, win_size, center)
+
+ melspec = spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax)
+
+ return melspec
+
+
+def compute_window_length(n_mels: int, sample_rate: int):
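+    # Pick an FFT window roughly proportional to the mel-band count, rounded down to a power of two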
+ f_min = 0
+ f_max = sample_rate / 2
+ window_length_seconds = 8 * n_mels / (f_max - f_min)
+ window_length = int(window_length_seconds * sample_rate)
+ return 2 ** (window_length.bit_length() - 1)
+
+
+class MultiScaleMelSpectrogramLoss(torch.nn.Module):
+
+ def __init__(
+ self,
+ sample_rate: int = 24000,
+ n_mels: list[int] = [5, 10, 20, 40, 80, 160, 320, 480],
+ loss_fn=torch.nn.L1Loss(),
+ ):
+ super().__init__()
+ self.sample_rate = sample_rate
+ self.loss_fn = loss_fn
+ self.log_base = torch.log(torch.tensor(10.0))
+ self.stft_params: list[tuple] = []
+ self.hann_window: dict[int, torch.Tensor] = {}
+ self.mel_banks: dict[int, torch.Tensor] = {}
+
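+        # One (n_mels, win_length, hop_length) triple per scale; the hop is fixed at 10 ms of audio
+        # (sample_rate // 100 samples).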
+ self.stft_params = [
+ (mel, compute_window_length(mel, sample_rate), self.sample_rate // 100)
+ for mel in n_mels
+ ]
+
+ def mel_spectrogram(
+ self,
+ wav: torch.Tensor,
+ n_mels: int,
+ window_length: int,
+ hop_length: int,
+ ):
+ # IDs for caching
+ dtype_device = str(wav.dtype) + "_" + str(wav.device)
+ win_dtype_device = str(window_length) + "_" + dtype_device
+ mel_dtype_device = str(n_mels) + "_" + dtype_device
+ # caching hann window
+ if win_dtype_device not in self.hann_window:
+ self.hann_window[win_dtype_device] = torch.hann_window(
+ window_length, device=wav.device, dtype=torch.float32
+ )
+
+ wav = wav.squeeze(1) # -> torch(B, T)
+
+ stft = torch.stft(
+ wav.float(),
+ n_fft=window_length,
+ hop_length=hop_length,
+ window=self.hann_window[win_dtype_device],
+ return_complex=True,
+ ) # -> torch (B, window_length // 2 + 1, (T - window_length)/hop_length + 1)
+
+ magnitude = torch.sqrt(stft.real.pow(2) + stft.imag.pow(2) + 1e-6)
+
+ # caching mel filter
+ if mel_dtype_device not in self.mel_banks:
+ self.mel_banks[mel_dtype_device] = torch.from_numpy(
+ librosa_mel_fn(
+ sr=self.sample_rate,
+ n_mels=n_mels,
+ n_fft=window_length,
+ fmin=0,
+ fmax=None,
+ )
+ ).to(device=wav.device, dtype=torch.float32)
+
+ mel_spectrogram = torch.matmul(
+ self.mel_banks[mel_dtype_device], magnitude
+ ) # torch(B, n_mels, stft.frames)
+ return mel_spectrogram
+
+ def forward(
+ self, real: torch.Tensor, fake: torch.Tensor
+ ): # real: torch(B, 1, T) , fake: torch(B, 1, T)
+ loss = 0.0
+ for p in self.stft_params:
+ real_mels = self.mel_spectrogram(real, *p)
+ fake_mels = self.mel_spectrogram(fake, *p)
+ real_logmels = torch.log(real_mels.clamp(min=1e-5)) / self.log_base
+ fake_logmels = torch.log(fake_mels.clamp(min=1e-5)) / self.log_base
+ loss += self.loss_fn(real_logmels, fake_logmels)
+ return loss
diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..efd563a1b86ac1bcd18a27622bae420b9d67f881
--- /dev/null
+++ b/rvc/train/preprocess/preprocess.py
@@ -0,0 +1,345 @@
+import os
+import sys
+import time
+from scipy import signal
+from scipy.io import wavfile
+import numpy as np
+import concurrent.futures
+from tqdm import tqdm
+import json
+from distutils.util import strtobool
+import librosa
+import multiprocessing
+import noisereduce as nr
+import soxr
+
+now_directory = os.getcwd()
+sys.path.append(now_directory)
+
+from rvc.lib.utils import load_audio
+from rvc.train.preprocess.slicer import Slicer
+
+import logging
+
+logging.getLogger("numba.core.byteflow").setLevel(logging.WARNING)
+logging.getLogger("numba.core.ssa").setLevel(logging.WARNING)
+logging.getLogger("numba.core.interpreter").setLevel(logging.WARNING)
+
+OVERLAP = 0.3
+PERCENTAGE = 3.0
+MAX_AMPLITUDE = 0.9
+ALPHA = 0.75
+HIGH_PASS_CUTOFF = 48
+SAMPLE_RATE_16K = 16000
+RES_TYPE = "soxr_vhq"
+
+
+class PreProcess:
+ def __init__(self, sr: int, exp_dir: str):
+ self.slicer = Slicer(
+ sr=sr,
+ threshold=-42,
+ min_length=1500,
+ min_interval=400,
+ hop_size=15,
+ max_sil_kept=500,
+ )
+ self.sr = sr
+ self.b_high, self.a_high = signal.butter(
+ N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr
+ )
+ self.exp_dir = exp_dir
+ self.device = "cpu"
+ self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
+ self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
+ os.makedirs(self.gt_wavs_dir, exist_ok=True)
+ os.makedirs(self.wavs16k_dir, exist_ok=True)
+
+ def _normalize_audio(self, audio: np.ndarray):
+ tmp_max = np.abs(audio).max()
+ if tmp_max > 2.5:
+ return None
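+        # Mix peak-normalized audio (scaled to MAX_AMPLITUDE * ALPHA) with the original signal
+        # weighted by (1 - ALPHA)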
+ return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio
+
+ def process_audio_segment(
+ self,
+ normalized_audio: np.ndarray,
+ sid: int,
+ idx0: int,
+ idx1: int,
+ ):
+ if normalized_audio is None:
+ print(f"{sid}-{idx0}-{idx1}-filtered")
+ return
+ wavfile.write(
+ os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{idx1}.wav"),
+ self.sr,
+ normalized_audio.astype(np.float32),
+ )
+ audio_16k = librosa.resample(
+ normalized_audio,
+ orig_sr=self.sr,
+ target_sr=SAMPLE_RATE_16K,
+ res_type=RES_TYPE,
+ )
+ wavfile.write(
+ os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"),
+ SAMPLE_RATE_16K,
+ audio_16k.astype(np.float32),
+ )
+
+ def simple_cut(
+ self,
+ audio: np.ndarray,
+ sid: int,
+ idx0: int,
+ chunk_len: float,
+ overlap_len: float,
+ ):
+ chunk_length = int(self.sr * chunk_len)
+ overlap_length = int(self.sr * overlap_len)
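+        # Slide a fixed-size window with stride (chunk_length - overlap_length); trailing audio
+        # shorter than a full chunk is discarded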
+ i = 0
+ while i < len(audio):
+ chunk = audio[i : i + chunk_length]
+ if len(chunk) == chunk_length:
+ # full SR for training
+ wavfile.write(
+ os.path.join(
+ self.gt_wavs_dir,
+ f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav",
+ ),
+ self.sr,
+ chunk.astype(np.float32),
+ )
+ # 16KHz for feature extraction
+ chunk_16k = librosa.resample(
+ chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type=RES_TYPE
+ )
+ wavfile.write(
+ os.path.join(
+ self.wavs16k_dir,
+ f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav",
+ ),
+ SAMPLE_RATE_16K,
+ chunk_16k.astype(np.float32),
+ )
+ i += chunk_length - overlap_length
+
+ def process_audio(
+ self,
+ path: str,
+ idx0: int,
+ sid: int,
+ cut_preprocess: str,
+ process_effects: bool,
+ noise_reduction: bool,
+ reduction_strength: float,
+ chunk_len: float,
+ overlap_len: float,
+ ):
+ audio_length = 0
+ try:
+ audio = load_audio(path, self.sr)
+ audio_length = librosa.get_duration(y=audio, sr=self.sr)
+
+ if process_effects:
+ audio = signal.lfilter(self.b_high, self.a_high, audio)
+ audio = self._normalize_audio(audio)
+ if noise_reduction:
+ audio = nr.reduce_noise(
+ y=audio, sr=self.sr, prop_decrease=reduction_strength
+ )
+ if cut_preprocess == "Skip":
+ # no cutting
+ self.process_audio_segment(
+ audio,
+ sid,
+ idx0,
+ 0,
+ )
+ elif cut_preprocess == "Simple":
+ # simple
+ self.simple_cut(audio, sid, idx0, chunk_len, overlap_len)
+ elif cut_preprocess == "Automatic":
+ idx1 = 0
+ # legacy
+ for audio_segment in self.slicer.slice(audio):
+ i = 0
+ while True:
+ start = int(self.sr * (PERCENTAGE - OVERLAP) * i)
+ i += 1
+ if (
+ len(audio_segment[start:])
+ > (PERCENTAGE + OVERLAP) * self.sr
+ ):
+ tmp_audio = audio_segment[
+ start : start + int(PERCENTAGE * self.sr)
+ ]
+ self.process_audio_segment(
+ tmp_audio,
+ sid,
+ idx0,
+ idx1,
+ )
+ idx1 += 1
+ else:
+ tmp_audio = audio_segment[start:]
+ self.process_audio_segment(
+ tmp_audio,
+ sid,
+ idx0,
+ idx1,
+ )
+ idx1 += 1
+ break
+
+ except Exception as error:
+ print(f"Error processing audio: {error}")
+ return audio_length
+
+
+def format_duration(seconds):
+ hours = int(seconds // 3600)
+ minutes = int((seconds % 3600) // 60)
+ seconds = int(seconds % 60)
+ return f"{hours:02}:{minutes:02}:{seconds:02}"
+
+
+def save_dataset_duration(file_path, dataset_duration):
+ try:
+ with open(file_path, "r") as f:
+ data = json.load(f)
+ except FileNotFoundError:
+ data = {}
+
+ formatted_duration = format_duration(dataset_duration)
+ new_data = {
+ "total_dataset_duration": formatted_duration,
+ "total_seconds": dataset_duration,
+ }
+ data.update(new_data)
+
+ with open(file_path, "w") as f:
+ json.dump(data, f, indent=4)
+
+
+def process_audio_wrapper(args):
+ (
+ pp,
+ file,
+ cut_preprocess,
+ process_effects,
+ noise_reduction,
+ reduction_strength,
+ chunk_len,
+ overlap_len,
+ ) = args
+ file_path, idx0, sid = file
+ return pp.process_audio(
+ file_path,
+ idx0,
+ sid,
+ cut_preprocess,
+ process_effects,
+ noise_reduction,
+ reduction_strength,
+ chunk_len,
+ overlap_len,
+ )
+
+
+def preprocess_training_set(
+ input_root: str,
+ sr: int,
+ num_processes: int,
+ exp_dir: str,
+ cut_preprocess: str,
+ process_effects: bool,
+ noise_reduction: bool,
+ reduction_strength: float,
+ chunk_len: float,
+ overlap_len: float,
+):
+ start_time = time.time()
+ pp = PreProcess(sr, exp_dir)
+ print(f"Starting preprocess with {num_processes} processes...")
+
+ files = []
+ idx = 0
+
+ for root, _, filenames in os.walk(input_root):
+ try:
+ sid = 0 if root == input_root else int(os.path.basename(root))
+ for f in filenames:
+ if f.lower().endswith((".wav", ".mp3", ".flac", ".ogg")):
+ files.append((os.path.join(root, f), idx, sid))
+ idx += 1
+ except ValueError:
+ print(
+                f'Speaker ID folder name is expected to be an integer, got "{os.path.basename(root)}" instead.'
+ )
+
+ # print(f"Number of files: {len(files)}")
+ audio_length = []
+ with tqdm(total=len(files)) as pbar:
+ with concurrent.futures.ProcessPoolExecutor(
+ max_workers=num_processes
+ ) as executor:
+ futures = [
+ executor.submit(
+ process_audio_wrapper,
+ (
+ pp,
+ file,
+ cut_preprocess,
+ process_effects,
+ noise_reduction,
+ reduction_strength,
+ chunk_len,
+ overlap_len,
+ ),
+ )
+ for file in files
+ ]
+ for future in concurrent.futures.as_completed(futures):
+ audio_length.append(future.result())
+ pbar.update(1)
+
+ audio_length = sum(audio_length)
+ save_dataset_duration(
+ os.path.join(exp_dir, "model_info.json"), dataset_duration=audio_length
+ )
+ elapsed_time = time.time() - start_time
+ print(
+        f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(audio_length)} of audio."
+ )
+
+
+if __name__ == "__main__":
+ experiment_directory = str(sys.argv[1])
+ input_root = str(sys.argv[2])
+ sample_rate = int(sys.argv[3])
+ num_processes = sys.argv[4]
+ if num_processes.lower() == "none":
+ num_processes = multiprocessing.cpu_count()
+ else:
+ num_processes = int(num_processes)
+ cut_preprocess = str(sys.argv[5])
+ process_effects = strtobool(sys.argv[6])
+ noise_reduction = strtobool(sys.argv[7])
+ reduction_strength = float(sys.argv[8])
+ chunk_len = float(sys.argv[9])
+ overlap_len = float(sys.argv[10])
+
+ preprocess_training_set(
+ input_root,
+ sample_rate,
+ num_processes,
+ experiment_directory,
+ cut_preprocess,
+ process_effects,
+ noise_reduction,
+ reduction_strength,
+ chunk_len,
+ overlap_len,
+ )
diff --git a/rvc/train/preprocess/slicer.py b/rvc/train/preprocess/slicer.py
new file mode 100644
index 0000000000000000000000000000000000000000..09c4f9a556dee5e8ef506115ccf3ace328ffaaa6
--- /dev/null
+++ b/rvc/train/preprocess/slicer.py
@@ -0,0 +1,235 @@
+import numpy as np
+
+
+class Slicer:
+ """
+ A class for slicing audio waveforms into segments based on silence detection.
+
+ Attributes:
+ sr (int): Sampling rate of the audio waveform.
+ threshold (float): RMS threshold for silence detection, in dB.
+ min_length (int): Minimum length of a segment, in milliseconds.
+ min_interval (int): Minimum interval between segments, in milliseconds.
+ hop_size (int): Hop size for RMS calculation, in milliseconds.
+ max_sil_kept (int): Maximum length of silence to keep at the beginning or end of a segment, in milliseconds.
+
+ Methods:
+ slice(waveform): Slices the given waveform into segments.
+ """
+
+ def __init__(
+ self,
+ sr: int,
+ threshold: float = -40.0,
+ min_length: int = 5000,
+ min_interval: int = 300,
+ hop_size: int = 20,
+ max_sil_kept: int = 5000,
+ ):
+ """
+ Initializes a Slicer object.
+
+ Args:
+ sr (int): Sampling rate of the audio waveform.
+ threshold (float, optional): RMS threshold for silence detection, in dB. Defaults to -40.0.
+ min_length (int, optional): Minimum length of a segment, in milliseconds. Defaults to 5000.
+ min_interval (int, optional): Minimum interval between segments, in milliseconds. Defaults to 300.
+ hop_size (int, optional): Hop size for RMS calculation, in milliseconds. Defaults to 20.
+ max_sil_kept (int, optional): Maximum length of silence to keep at the beginning or end of a segment, in milliseconds. Defaults to 5000.
+
+ Raises:
+ ValueError: If the input parameters are not valid.
+ """
+ if not min_length >= min_interval >= hop_size:
+ raise ValueError("min_length >= min_interval >= hop_size is required")
+ if not max_sil_kept >= hop_size:
+ raise ValueError("max_sil_kept >= hop_size is required")
+
+ # Convert time-based parameters to sample-based parameters
+ min_interval = sr * min_interval / 1000
+ self.threshold = 10 ** (threshold / 20.0)
+ self.hop_size = round(sr * hop_size / 1000)
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
+ self.min_interval = round(min_interval / self.hop_size)
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+
+ def _apply_slice(self, waveform, begin, end):
+ """
+ Applies a slice to the waveform.
+
+ Args:
+ waveform (numpy.ndarray): The waveform to slice.
+ begin (int): Start frame index.
+ end (int): End frame index.
+ """
+ start_idx = begin * self.hop_size
+ if len(waveform.shape) > 1:
+ end_idx = min(waveform.shape[1], end * self.hop_size)
+ return waveform[:, start_idx:end_idx]
+ else:
+ end_idx = min(waveform.shape[0], end * self.hop_size)
+ return waveform[start_idx:end_idx]
+
+ def slice(self, waveform):
+ """
+ Slices the given waveform into segments.
+
+ Args:
+ waveform (numpy.ndarray): The waveform to slice.
+ """
+ # Calculate RMS for each frame
+ samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform
+ if samples.shape[0] <= self.min_length:
+ return [waveform]
+
+ rms_list = get_rms(
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
+ ).squeeze(0)
+
+ # Detect silence segments and mark them
+ sil_tags = []
+ silence_start, clip_start = None, 0
+ for i, rms in enumerate(rms_list):
+ # If current frame is silent
+ if rms < self.threshold:
+ if silence_start is None:
+ silence_start = i
+ continue
+
+ # If current frame is not silent
+ if silence_start is None:
+ continue
+
+ # Check if current silence segment is leading silence or need to slice
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+ need_slice_middle = (
+ i - silence_start >= self.min_interval
+ and i - clip_start >= self.min_length
+ )
+
+ # If not leading silence and not need to slice middle
+ if not is_leading_silence and not need_slice_middle:
+ silence_start = None
+ continue
+
+ # Handle different cases of silence segments
+ if i - silence_start <= self.max_sil_kept:
+ # Short silence
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
+ if silence_start == 0:
+ sil_tags.append((0, pos))
+ else:
+ sil_tags.append((pos, pos))
+ clip_start = pos
+ elif i - silence_start <= self.max_sil_kept * 2:
+ # Medium silence
+ pos = rms_list[
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
+ ].argmin()
+ pos += i - self.max_sil_kept
+ pos_l = (
+ rms_list[
+ silence_start : silence_start + self.max_sil_kept + 1
+ ].argmin()
+ + silence_start
+ )
+ pos_r = (
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
+ + i
+ - self.max_sil_kept
+ )
+ if silence_start == 0:
+ sil_tags.append((0, pos_r))
+ clip_start = pos_r
+ else:
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+ clip_start = max(pos_r, pos)
+ else:
+ # Long silence
+ pos_l = (
+ rms_list[
+ silence_start : silence_start + self.max_sil_kept + 1
+ ].argmin()
+ + silence_start
+ )
+ pos_r = (
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
+ + i
+ - self.max_sil_kept
+ )
+ if silence_start == 0:
+ sil_tags.append((0, pos_r))
+ else:
+ sil_tags.append((pos_l, pos_r))
+ clip_start = pos_r
+ silence_start = None
+
+ # Handle trailing silence
+ total_frames = rms_list.shape[0]
+ if (
+ silence_start is not None
+ and total_frames - silence_start >= self.min_interval
+ ):
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
+ sil_tags.append((pos, total_frames + 1))
+
+ # Extract segments based on silence tags
+ if not sil_tags:
+ return [waveform]
+ else:
+ chunks = []
+ if sil_tags[0][0] > 0:
+ chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
+
+ for i in range(len(sil_tags) - 1):
+ chunks.append(
+ self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
+ )
+
+ if sil_tags[-1][1] < total_frames:
+ chunks.append(
+ self._apply_slice(waveform, sil_tags[-1][1], total_frames)
+ )
+
+ return chunks
+
+
+def get_rms(
+ y,
+ frame_length=2048,
+ hop_length=512,
+ pad_mode="constant",
+):
+ """
+ Calculates the root mean square (RMS) of a waveform.
+
+ Args:
+ y (numpy.ndarray): The waveform.
+ frame_length (int, optional): The length of the frame in samples. Defaults to 2048.
+ hop_length (int, optional): The hop length between frames in samples. Defaults to 512.
+ pad_mode (str, optional): The padding mode used for the waveform. Defaults to "constant".
+ """
+ padding = (int(frame_length // 2), int(frame_length // 2))
+ y = np.pad(y, padding, mode=pad_mode)
+
+ axis = -1
+ out_strides = y.strides + tuple([y.strides[axis]])
+ x_shape_trimmed = list(y.shape)
+ x_shape_trimmed[axis] -= frame_length - 1
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
+
+ if axis < 0:
+ target_axis = axis - 1
+ else:
+ target_axis = axis + 1
+
+ xw = np.moveaxis(xw, -1, target_axis)
+ slices = [slice(None)] * xw.ndim
+ slices[axis] = slice(0, None, hop_length)
+ x = xw[tuple(slices)]
+
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
+ return np.sqrt(power)
diff --git a/rvc/train/process/change_info.py b/rvc/train/process/change_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a58c2024cf53b0f6b861e896ac0a6d3b8afcf23
--- /dev/null
+++ b/rvc/train/process/change_info.py
@@ -0,0 +1,22 @@
+import os
+import torch
+
+
+def change_info(path, info, name):
+ try:
+ ckpt = torch.load(path, map_location="cpu", weights_only=True)
+ ckpt["info"] = info
+
+ if not name:
+ name = os.path.splitext(os.path.basename(path))[0]
+
+ target_dir = os.path.join("logs", name)
+ os.makedirs(target_dir, exist_ok=True)
+
+ torch.save(ckpt, os.path.join(target_dir, f"{name}.pth"))
+
+ return "Success."
+
+ except Exception as error:
+ print(f"An error occurred while changing the info: {error}")
+ return f"Error: {error}"
diff --git a/rvc/train/process/extract_index.py b/rvc/train/process/extract_index.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab89578ba071332dbfaf02afcb54b5b19ecf8320
--- /dev/null
+++ b/rvc/train/process/extract_index.py
@@ -0,0 +1,70 @@
+import os
+import sys
+import faiss
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+from multiprocessing import cpu_count
+
+# Parse command line arguments
+exp_dir = str(sys.argv[1])
+index_algorithm = str(sys.argv[2])
+
+try:
+    feature_dir = os.path.join(exp_dir, "extracted")
+ model_name = os.path.basename(exp_dir)
+
+ index_filename_added = f"{model_name}.index"
+ index_filepath_added = os.path.join(exp_dir, index_filename_added)
+
+ if os.path.exists(index_filepath_added):
+ pass
+ else:
+ npys = []
+ listdir_res = sorted(os.listdir(feature_dir))
+
+ for name in listdir_res:
+ file_path = os.path.join(feature_dir, name)
+ phone = np.load(file_path)
+ npys.append(phone)
+
+ big_npy = np.concatenate(npys, axis=0)
+
+ big_npy_idx = np.arange(big_npy.shape[0])
+ np.random.shuffle(big_npy_idx)
+ big_npy = big_npy[big_npy_idx]
+
+ if big_npy.shape[0] > 2e5 and (
+ index_algorithm == "Auto" or index_algorithm == "KMeans"
+ ):
+ big_npy = (
+ MiniBatchKMeans(
+ n_clusters=10000,
+ verbose=True,
+ batch_size=256 * cpu_count(),
+ compute_labels=False,
+ init="random",
+ )
+ .fit(big_npy)
+ .cluster_centers_
+ )
+
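+        # IVF cluster count: roughly 16 * sqrt(N), capped so each cluster keeps at least ~39 vectors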
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+
+ # index_added
+ index_added = faiss.index_factory(768, f"IVF{n_ivf},Flat")
+ index_ivf_added = faiss.extract_index_ivf(index_added)
+ index_ivf_added.nprobe = 1
+ index_added.train(big_npy)
+
+ batch_size_add = 8192
+ for i in range(0, big_npy.shape[0], batch_size_add):
+ index_added.add(big_npy[i : i + batch_size_add])
+
+ faiss.write_index(index_added, index_filepath_added)
+ print(f"Saved index file '{index_filepath_added}'")
+
+except Exception as error:
+ print(f"An error occurred extracting the index: {error}")
+    print(
+        "If you are running this code in a virtual environment, make sure you have enough memory available to generate the index file."
+    )
diff --git a/rvc/train/process/extract_model.py b/rvc/train/process/extract_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..09bef31d903e97f135bb8f33910859aa5a2a0e52
--- /dev/null
+++ b/rvc/train/process/extract_model.py
@@ -0,0 +1,114 @@
+import datetime
+import hashlib
+import json
+import os
+import sys
+from collections import OrderedDict
+
+import torch
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+
+def replace_keys_in_dict(d, old_key_part, new_key_part):
+ if isinstance(d, OrderedDict):
+ updated_dict = OrderedDict()
+ else:
+ updated_dict = {}
+ for key, value in d.items():
+ new_key = key.replace(old_key_part, new_key_part)
+ if isinstance(value, dict):
+ value = replace_keys_in_dict(value, old_key_part, new_key_part)
+ updated_dict[new_key] = value
+ return updated_dict
+
+
+def extract_model(
+ ckpt,
+ sr,
+ name,
+ model_path,
+ epoch,
+ step,
+ hps,
+ overtrain_info,
+ vocoder,
+ pitch_guidance=True,
+ version="v2",
+):
+ try:
+ model_dir = os.path.dirname(model_path)
+ os.makedirs(model_dir, exist_ok=True)
+
+ if os.path.exists(os.path.join(model_dir, "model_info.json")):
+ with open(os.path.join(model_dir, "model_info.json"), "r") as f:
+ data = json.load(f)
+ dataset_length = data.get("total_dataset_duration", None)
+ embedder_model = data.get("embedder_model", None)
+ speakers_id = data.get("speakers_id", 1)
+ else:
+            dataset_length = None
+            embedder_model = None
+            speakers_id = 1
+
+ with open(os.path.join(now_dir, "assets", "config.json"), "r") as f:
+ data = json.load(f)
+ model_author = data.get("model_author", None)
+
+ opt = OrderedDict(
+ weight={
+ key: value.half() for key, value in ckpt.items() if "enc_q" not in key
+ }
+ )
+ opt["config"] = [
+ hps.data.filter_length // 2 + 1,
+ 32,
+ hps.model.inter_channels,
+ hps.model.hidden_channels,
+ hps.model.filter_channels,
+ hps.model.n_heads,
+ hps.model.n_layers,
+ hps.model.kernel_size,
+ hps.model.p_dropout,
+ hps.model.resblock,
+ hps.model.resblock_kernel_sizes,
+ hps.model.resblock_dilation_sizes,
+ hps.model.upsample_rates,
+ hps.model.upsample_initial_channel,
+ hps.model.upsample_kernel_sizes,
+ hps.model.spk_embed_dim,
+ hps.model.gin_channels,
+ hps.data.sample_rate,
+ ]
+
+ opt["epoch"] = epoch
+ opt["step"] = step
+ opt["sr"] = sr
+ opt["f0"] = pitch_guidance
+ opt["version"] = version
+ opt["creation_date"] = datetime.datetime.now().isoformat()
+
+ hash_input = f"{name}-{epoch}-{step}-{sr}-{version}-{opt['config']}"
+ opt["model_hash"] = hashlib.sha256(hash_input.encode()).hexdigest()
+ opt["overtrain_info"] = overtrain_info
+ opt["dataset_length"] = dataset_length
+ opt["model_name"] = name
+ opt["author"] = model_author
+ opt["embedder_model"] = embedder_model
+ opt["speakers_id"] = speakers_id
+ opt["vocoder"] = vocoder
+
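+        # Rename torch's parametrized weight-norm keys back to the legacy .weight_v / .weight_g
+        # naming before saving the stripped checkpoint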
+ torch.save(
+ replace_keys_in_dict(
+ replace_keys_in_dict(
+ opt, ".parametrizations.weight.original1", ".weight_v"
+ ),
+ ".parametrizations.weight.original0",
+ ".weight_g",
+ ),
+ model_path,
+ )
+
+ print(f"Saved model '{model_path}' (epoch {epoch} and step {step})")
+
+ except Exception as error:
+ print(f"An error occurred extracting the model: {error}")
diff --git a/rvc/train/process/model_blender.py b/rvc/train/process/model_blender.py
new file mode 100644
index 0000000000000000000000000000000000000000..52d2d256ef61bc12d072708e5a12bb8205e63f6d
--- /dev/null
+++ b/rvc/train/process/model_blender.py
@@ -0,0 +1,68 @@
+import os
+import torch
+from collections import OrderedDict
+
+
+def extract(ckpt):
+ a = ckpt["model"]
+ opt = OrderedDict()
+ opt["weight"] = {}
+ for key in a.keys():
+ if "enc_q" in key:
+ continue
+ opt["weight"][key] = a[key]
+ return opt
+
+
+def model_blender(name, path1, path2, ratio):
+ try:
+ message = f"Model {path1} and {path2} are merged with alpha {ratio}."
+ ckpt1 = torch.load(path1, map_location="cpu", weights_only=True)
+ ckpt2 = torch.load(path2, map_location="cpu", weights_only=True)
+
+ if ckpt1["sr"] != ckpt2["sr"]:
+ return "The sample rates of the two models are not the same."
+
+ cfg = ckpt1["config"]
+ cfg_f0 = ckpt1["f0"]
+ cfg_version = ckpt1["version"]
+ cfg_sr = ckpt1["sr"]
+
+ if "model" in ckpt1:
+ ckpt1 = extract(ckpt1)
+ else:
+ ckpt1 = ckpt1["weight"]
+ if "model" in ckpt2:
+ ckpt2 = extract(ckpt2)
+ else:
+ ckpt2 = ckpt2["weight"]
+
+ if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
+ return "Fail to merge the models. The model architectures are not the same."
+
+ opt = OrderedDict()
+ opt["weight"] = {}
+ for key in ckpt1.keys():
+ if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
+ min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
+ opt["weight"][key] = (
+ ratio * (ckpt1[key][:min_shape0].float())
+ + (1 - ratio) * (ckpt2[key][:min_shape0].float())
+ ).half()
+ else:
+ opt["weight"][key] = (
+ ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())
+ ).half()
+
+ opt["config"] = cfg
+ opt["sr"] = cfg_sr
+ opt["f0"] = cfg_f0
+ opt["version"] = cfg_version
+ opt["info"] = message
+
+ torch.save(opt, os.path.join("logs", f"{name}.pth"))
+ print(message)
+ return message, os.path.join("logs", f"{name}.pth")
+ except Exception as error:
+ print(f"An error occurred blending the models: {error}")
+ return str(error)
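+
+
+# Hypothetical usage (paths are placeholders, not shipped with the repo): blends
+# two exported voices 50/50 and writes logs/blend.pth; ratio weights the first
+# model and (1 - ratio) the second.
+#   message, output_path = model_blender("blend", "logs/voiceA.pth", "logs/voiceB.pth", 0.5)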
diff --git a/rvc/train/process/model_information.py b/rvc/train/process/model_information.py
new file mode 100644
index 0000000000000000000000000000000000000000..e05d1a15042070e23b37252fc6ff51f0ee4da110
--- /dev/null
+++ b/rvc/train/process/model_information.py
@@ -0,0 +1,49 @@
+import torch
+from datetime import datetime
+
+
+def prettify_date(date_str):
+ if date_str is None:
+ return "None"
+ try:
+ date_time_obj = datetime.fromisoformat(date_str)
+ return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")
+ except ValueError:
+ return "Invalid date format"
+
+
+def model_information(path):
+ model_data = torch.load(path, map_location="cpu", weights_only=True)
+
+ print(f"Loaded model from {path}")
+
+ model_name = model_data.get("model_name", "None")
+ epochs = model_data.get("epoch", "None")
+ steps = model_data.get("step", "None")
+ sr = model_data.get("sr", "None")
+ f0 = model_data.get("f0", "None")
+ dataset_length = model_data.get("dataset_length", "None")
+ vocoder = model_data.get("vocoder", "None")
+ creation_date = model_data.get("creation_date", "None")
+ model_hash = model_data.get("model_hash", None)
+ overtrain_info = model_data.get("overtrain_info", "None")
+ model_author = model_data.get("author", "None")
+ embedder_model = model_data.get("embedder_model", "None")
+ speakers_id = model_data.get("speakers_id", 0)
+
+ creation_date_str = prettify_date(creation_date) if creation_date else "None"
+
+ return (
+ f"Model Name: {model_name}\n"
+ f"Model Creator: {model_author}\n"
+ f"Epochs: {epochs}\n"
+ f"Steps: {steps}\n"
+ f"Vocoder: {vocoder}\n"
+ f"Sampling Rate: {sr}\n"
+ f"Dataset Length: {dataset_length}\n"
+ f"Creation Date: {creation_date_str}\n"
+ f"Overtrain Info: {overtrain_info}\n"
+ f"Embedder Model: {embedder_model}\n"
+ f"Max Speakers ID: {speakers_id}"
+ f"Hash: {model_hash}\n"
+ )
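+
+
+# Example (hypothetical path to a model exported by extract_model):
+#   print(model_information("logs/my_model/my_model_100e_2500s.pth"))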
diff --git a/rvc/train/train.py b/rvc/train/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..40b5782a65c92400ec6303dea0215cb92053b157
--- /dev/null
+++ b/rvc/train/train.py
@@ -0,0 +1,1066 @@
+import os
+import sys
+
+os.environ["USE_LIBUV"] = "0" if sys.platform == "win32" else "1"
+import glob
+import json
+import torch
+import datetime
+
+from collections import deque
+from distutils.util import strtobool
+from random import randint, shuffle
+from time import time as ttime
+from tqdm import tqdm
+import numpy as np
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from torch.cuda.amp import GradScaler, autocast
+from torch.utils.data import DataLoader
+from torch.nn import functional as F
+
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+now_dir = os.getcwd()
+sys.path.append(os.path.join(now_dir))
+
+# Zluda hijack
+import rvc.lib.zluda
+
+from utils import (
+ HParams,
+ plot_spectrogram_to_numpy,
+ summarize,
+ load_checkpoint,
+ save_checkpoint,
+ latest_checkpoint_path,
+ load_wav_to_torch,
+)
+
+from losses import (
+ discriminator_loss,
+ feature_loss,
+ generator_loss,
+ kl_loss,
+)
+from mel_processing import (
+ mel_spectrogram_torch,
+ spec_to_mel_torch,
+ MultiScaleMelSpectrogramLoss,
+)
+
+from rvc.train.process.extract_model import extract_model
+
+from rvc.lib.algorithm import commons
+
+# Parse command line arguments
+model_name = sys.argv[1]
+save_every_epoch = int(sys.argv[2])
+total_epoch = int(sys.argv[3])
+pretrainG = sys.argv[4]
+pretrainD = sys.argv[5]
+gpus = sys.argv[6]
+batch_size = int(sys.argv[7])
+sample_rate = int(sys.argv[8])
+save_only_latest = strtobool(sys.argv[9])
+save_every_weights = strtobool(sys.argv[10])
+cache_data_in_gpu = strtobool(sys.argv[11])
+overtraining_detector = strtobool(sys.argv[12])
+overtraining_threshold = int(sys.argv[13])
+cleanup = strtobool(sys.argv[14])
+vocoder = sys.argv[15]
+checkpointing = strtobool(sys.argv[16])
+randomized = True
+optimizer = "RAdam" # "AdamW"
+
+current_dir = os.getcwd()
+experiment_dir = os.path.join(current_dir, "logs", model_name)
+config_save_path = os.path.join(experiment_dir, "config.json")
+dataset_path = os.path.join(experiment_dir, "sliced_audios")
+
+with open(config_save_path, "r") as f:
+ config = json.load(f)
+config = HParams(**config)
+config.data.training_files = os.path.join(experiment_dir, "filelist.txt")
+
+torch.backends.cudnn.deterministic = False
+torch.backends.cudnn.benchmark = True
+
+global_step = 0
+last_loss_gen_all = 0
+overtrain_save_epoch = 0
+loss_gen_history = []
+smoothed_loss_gen_history = []
+loss_disc_history = []
+smoothed_loss_disc_history = []
+lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
+training_file_path = os.path.join(experiment_dir, "training_data.json")
+
+avg_losses = {
+ "gen_loss_queue": deque(maxlen=10),
+ "disc_loss_queue": deque(maxlen=10),
+ "disc_loss_50": deque(maxlen=50),
+ "fm_loss_50": deque(maxlen=50),
+ "kl_loss_50": deque(maxlen=50),
+ "mel_loss_50": deque(maxlen=50),
+ "gen_loss_50": deque(maxlen=50),
+}
+
+import logging
+
+logging.getLogger("torch").setLevel(logging.ERROR)
+
+
+class EpochRecorder:
+ """
+ Records the time elapsed per epoch.
+ """
+
+ def __init__(self):
+ self.last_time = ttime()
+
+ def record(self):
+ """
+ Records the elapsed time and returns a formatted string.
+ """
+ now_time = ttime()
+ elapsed_time = now_time - self.last_time
+ self.last_time = now_time
+ elapsed_time = round(elapsed_time, 1)
+ elapsed_time_str = str(datetime.timedelta(seconds=int(elapsed_time)))
+ current_time = datetime.datetime.now().strftime("%H:%M:%S")
+ return f"time={current_time} | training_speed={elapsed_time_str}"
+
+
+def verify_checkpoint_shapes(checkpoint_path, model):
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
+ checkpoint_state_dict = checkpoint["model"]
+ try:
+ if hasattr(model, "module"):
+ model_state_dict = model.module.load_state_dict(checkpoint_state_dict)
+ else:
+ model_state_dict = model.load_state_dict(checkpoint_state_dict)
+ except RuntimeError:
+ print(
+ "The parameters of the pretrain model such as the sample rate or architecture do not match the selected model."
+ )
+ sys.exit(1)
+ else:
+ del checkpoint
+ del checkpoint_state_dict
+ del model_state_dict
+
+
+def main():
+ """
+ Main function to start the training process.
+ """
+ global training_file_path, last_loss_gen_all, smoothed_loss_gen_history, loss_gen_history, loss_disc_history, smoothed_loss_disc_history, overtrain_save_epoch, gpus
+
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = str(randint(20000, 55555))
+ # Check sample rate
+ wavs = glob.glob(
+ os.path.join(os.path.join(experiment_dir, "sliced_audios"), "*.wav")
+ )
+ if wavs:
+ _, sr = load_wav_to_torch(wavs[0])
+ if sr != sample_rate:
+ print(
+ f"Error: Pretrained model sample rate ({sample_rate} Hz) does not match dataset audio sample rate ({sr} Hz)."
+ )
+ os._exit(1)
+ else:
+ print("No wav file found.")
+
+ if torch.cuda.is_available():
+ device = torch.device("cuda")
+ gpus = [int(item) for item in gpus.split("-")]
+ n_gpus = len(gpus)
+ elif torch.backends.mps.is_available():
+ device = torch.device("mps")
+ gpus = [0]
+ n_gpus = 1
+ else:
+ device = torch.device("cpu")
+ gpus = [0]
+ n_gpus = 1
+ print("Training with CPU, this will take a long time.")
+
+ def start():
+ """
+ Starts the training process with multi-GPU support or CPU.
+ """
+ children = []
+ pid_data = {"process_pids": []}
+ with open(config_save_path, "r") as pid_file:
+ try:
+ existing_data = json.load(pid_file)
+ pid_data.update(existing_data)
+ except json.JSONDecodeError:
+ pass
+ with open(config_save_path, "w") as pid_file:
+ for rank, device_id in enumerate(gpus):
+ subproc = mp.Process(
+ target=run,
+ args=(
+ rank,
+ n_gpus,
+ experiment_dir,
+ pretrainG,
+ pretrainD,
+ total_epoch,
+ save_every_weights,
+ config,
+ device,
+ device_id,
+ ),
+ )
+ children.append(subproc)
+ subproc.start()
+ pid_data["process_pids"].append(subproc.pid)
+ json.dump(pid_data, pid_file, indent=4)
+
+ for i in range(n_gpus):
+ children[i].join()
+
+ def load_from_json(file_path):
+ """
+ Load data from a JSON file.
+
+ Args:
+ file_path (str): The path to the JSON file.
+ """
+ if os.path.exists(file_path):
+ with open(file_path, "r") as f:
+ data = json.load(f)
+ return (
+ data.get("loss_disc_history", []),
+ data.get("smoothed_loss_disc_history", []),
+ data.get("loss_gen_history", []),
+ data.get("smoothed_loss_gen_history", []),
+ )
+ return [], [], [], []
+
+ def continue_overtrain_detector(training_file_path):
+ """
+ Continues the overtrain detector by loading the training history from a JSON file.
+
+ Args:
+ training_file_path (str): The file path of the JSON file containing the training history.
+ """
+ global loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history
+ if overtraining_detector:
+ if os.path.exists(training_file_path):
+ (
+ loss_disc_history,
+ smoothed_loss_disc_history,
+ loss_gen_history,
+ smoothed_loss_gen_history,
+ ) = load_from_json(training_file_path)
+
+ if cleanup:
+ print("Removing files from the prior training attempt...")
+
+ # Clean up unnecessary files
+ for root, dirs, files in os.walk(
+ os.path.join(now_dir, "logs", model_name), topdown=False
+ ):
+ for name in files:
+ file_path = os.path.join(root, name)
+ file_name, file_extension = os.path.splitext(name)
+ if (
+ file_extension == ".0"
+ or (file_name.startswith("D_") and file_extension == ".pth")
+ or (file_name.startswith("G_") and file_extension == ".pth")
+ or (file_name.startswith("added") and file_extension == ".index")
+ ):
+ os.remove(file_path)
+ for name in dirs:
+ if name == "eval":
+ folder_path = os.path.join(root, name)
+ for item in os.listdir(folder_path):
+ item_path = os.path.join(folder_path, item)
+ if os.path.isfile(item_path):
+ os.remove(item_path)
+ os.rmdir(folder_path)
+
+ print("Cleanup done!")
+
+ continue_overtrain_detector(training_file_path)
+ start()
+
+
+def run(
+ rank,
+ n_gpus,
+ experiment_dir,
+ pretrainG,
+ pretrainD,
+ custom_total_epoch,
+ custom_save_every_weights,
+ config,
+ device,
+ device_id,
+):
+ """
+ Runs the training loop on a specific GPU or CPU.
+
+ Args:
+ rank (int): The rank of the current process within the distributed training setup.
+ n_gpus (int): The total number of GPUs available for training.
+ experiment_dir (str): The directory where experiment logs and checkpoints will be saved.
+ pretrainG (str): Path to the pre-trained generator model.
+ pretrainD (str): Path to the pre-trained discriminator model.
+ custom_total_epoch (int): The total number of epochs for training.
+ custom_save_every_weights (bool): Whether to save a full weight snapshot at every save epoch.
+ config (object): Configuration object containing training parameters.
+ device (torch.device): The device to use for training (CPU or GPU).
+ device_id (int): Index of the device assigned to this process.
+ """
+ global global_step, smoothed_value_gen, smoothed_value_disc, optimizer
+
+ smoothed_value_gen = 0
+ smoothed_value_disc = 0
+
+ if rank == 0:
+ writer_eval = SummaryWriter(log_dir=os.path.join(experiment_dir, "eval"))
+ else:
+ writer_eval = None
+
+ dist.init_process_group(
+ backend="gloo" if sys.platform == "win32" or device.type != "cuda" else "nccl",
+ init_method="env://",
+ world_size=n_gpus if device.type == "cuda" else 1,
+ rank=rank if device.type == "cuda" else 0,
+ )
+
+ torch.manual_seed(config.train.seed)
+
+ if torch.cuda.is_available():
+ torch.cuda.set_device(device_id)
+
+ # Create datasets and dataloaders
+ from data_utils import (
+ DistributedBucketSampler,
+ TextAudioCollateMultiNSFsid,
+ TextAudioLoaderMultiNSFsid,
+ )
+
+ train_dataset = TextAudioLoaderMultiNSFsid(config.data)
+ collate_fn = TextAudioCollateMultiNSFsid()
+ train_sampler = DistributedBucketSampler(
+ train_dataset,
+ batch_size * n_gpus,
+ [50, 100, 200, 300, 400, 500, 600, 700, 800, 900],
+ num_replicas=n_gpus,
+ rank=rank,
+ shuffle=True,
+ )
+
+ train_loader = DataLoader(
+ train_dataset,
+ num_workers=4,
+ shuffle=False,
+ pin_memory=True,
+ collate_fn=collate_fn,
+ batch_sampler=train_sampler,
+ persistent_workers=True,
+ prefetch_factor=8,
+ )
+
+ # Validations
+ if len(train_loader) < 3:
+ print(
+ "Not enough data present in the training set. Perhaps you forgot to slice the audio files in preprocess?"
+ )
+ os._exit(2333333)
+ else:
+ g_file = latest_checkpoint_path(experiment_dir, "G_*.pth")
+ if g_file is not None:
+ print("Checking saved weights...")
+ g = torch.load(g_file, map_location="cpu")
+ if (
+ optimizer == "RAdam"
+ and "amsgrad" in g["optimizer"]["param_groups"][0].keys()
+ ):
+ optimizer = "AdamW"
+ print(
+ f"Optimizer choice has been reverted to {optimizer} to match the saved D/G weights."
+ )
+ elif (
+ optimizer == "AdamW"
+ and "decoupled_weight_decay" in g["optimizer"]["param_groups"][0].keys()
+ ):
+ optimizer = "RAdam"
+ print(
+ f"Optimizer choice has been reverted to {optimizer} to match the saved D/G weights."
+ )
+ del g
+
+ # Initialize models and optimizers
+ from rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator
+ from rvc.lib.algorithm.synthesizers import Synthesizer
+
+ net_g = Synthesizer(
+ config.data.filter_length // 2 + 1,
+ config.train.segment_size // config.data.hop_length,
+ **config.model,
+ use_f0=True,
+ sr=sample_rate,
+ vocoder=vocoder,
+ checkpointing=checkpointing,
+ randomized=randomized,
+ )
+
+ net_d = MultiPeriodDiscriminator(
+ config.model.use_spectral_norm, checkpointing=checkpointing
+ )
+
+ if torch.cuda.is_available():
+ net_g = net_g.cuda(device_id)
+ net_d = net_d.cuda(device_id)
+ else:
+ net_g = net_g.to(device)
+ net_d = net_d.to(device)
+
+ if optimizer == "AdamW":
+ optimizer = torch.optim.AdamW
+ elif optimizer == "RAdam":
+ optimizer = torch.optim.RAdam
+
+ optim_g = optimizer(
+ net_g.parameters(),
+ config.train.learning_rate,
+ betas=config.train.betas,
+ eps=config.train.eps,
+ )
+ optim_d = optimizer(
+ net_d.parameters(),
+ config.train.learning_rate,
+ betas=config.train.betas,
+ eps=config.train.eps,
+ )
+
+ fn_mel_loss = MultiScaleMelSpectrogramLoss(sample_rate=sample_rate)
+
+ # Wrap models with DDP for multi-gpu processing
+ if n_gpus > 1 and device.type == "cuda":
+ net_g = DDP(net_g, device_ids=[device_id])
+ net_d = DDP(net_d, device_ids=[device_id])
+
+ # Load checkpoint if available
+ try:
+ print("Starting training...")
+ _, _, _, epoch_str = load_checkpoint(
+ latest_checkpoint_path(experiment_dir, "D_*.pth"), net_d, optim_d
+ )
+ _, _, _, epoch_str = load_checkpoint(
+ latest_checkpoint_path(experiment_dir, "G_*.pth"), net_g, optim_g
+ )
+ epoch_str += 1
+ global_step = (epoch_str - 1) * len(train_loader)
+
+ except:
+ epoch_str = 1
+ global_step = 0
+ if pretrainG != "" and pretrainG != "None":
+ if rank == 0:
+ verify_checkpoint_shapes(pretrainG, net_g)
+ print(f"Loaded pretrained (G) '{pretrainG}'")
+ if hasattr(net_g, "module"):
+ net_g.module.load_state_dict(
+ torch.load(pretrainG, map_location="cpu", weights_only=True)["model"]
+ )
+ else:
+ net_g.load_state_dict(
+ torch.load(pretrainG, map_location="cpu", weights_only=True)["model"]
+ )
+
+ if pretrainD != "" and pretrainD != "None":
+ if rank == 0:
+ print(f"Loaded pretrained (D) '{pretrainD}'")
+ if hasattr(net_d, "module"):
+ net_d.module.load_state_dict(
+ torch.load(pretrainD, map_location="cpu", weights_only=True)["model"]
+ )
+ else:
+ net_d.load_state_dict(
+ torch.load(pretrainD, map_location="cpu", weights_only=True)["model"]
+ )
+
+ # Initialize schedulers
+ scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
+ optim_g, gamma=config.train.lr_decay, last_epoch=epoch_str - 2
+ )
+ scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
+ optim_d, gamma=config.train.lr_decay, last_epoch=epoch_str - 2
+ )
+
+ cache = []
+ # get the first sample as reference for tensorboard evaluation
+ # custom reference temporarily disabled
+ if False and os.path.isfile(
+ os.path.join("logs", "reference", f"ref{sample_rate}.wav")
+ ):
+ phone = np.load(
+ os.path.join("logs", "reference", f"ref{sample_rate}_feats.npy")
+ )
+ # expanding x2 to match pitch size
+ phone = np.repeat(phone, 2, axis=0)
+ phone = torch.FloatTensor(phone).unsqueeze(0).to(device)
+ phone_lengths = torch.LongTensor([phone.size(0)]).to(device)
+ pitch = np.load(os.path.join("logs", "reference", f"ref{sample_rate}_f0c.npy"))
+ # removed last frame to match features
+ pitch = torch.LongTensor(pitch[:-1]).unsqueeze(0).to(device)
+ pitchf = np.load(os.path.join("logs", "reference", f"ref{sample_rate}_f0f.npy"))
+ # removed last frame to match features
+ pitchf = torch.FloatTensor(pitchf[:-1]).unsqueeze(0).to(device)
+ sid = torch.LongTensor([0]).to(device)
+ reference = (
+ phone,
+ phone_lengths,
+ pitch,
+ pitchf,
+ sid,
+ )
+ else:
+ for info in train_loader:
+ phone, phone_lengths, pitch, pitchf, _, _, _, _, sid = info
+ if device.type == "cuda":
+ reference = (
+ phone.cuda(device_id, non_blocking=True),
+ phone_lengths.cuda(device_id, non_blocking=True),
+ pitch.cuda(device_id, non_blocking=True),
+ pitchf.cuda(device_id, non_blocking=True),
+ sid.cuda(device_id, non_blocking=True),
+ )
+ else:
+ reference = (
+ phone.to(device),
+ phone_lengths.to(device),
+ pitch.to(device),
+ pitchf.to(device),
+ sid.to(device),
+ )
+ break
+
+ for epoch in range(epoch_str, total_epoch + 1):
+ train_and_evaluate(
+ rank,
+ epoch,
+ config,
+ [net_g, net_d],
+ [optim_g, optim_d],
+ [train_loader, None],
+ [writer_eval],
+ cache,
+ custom_save_every_weights,
+ custom_total_epoch,
+ device,
+ device_id,
+ reference,
+ fn_mel_loss,
+ )
+
+ scheduler_g.step()
+ scheduler_d.step()
+
+
+def train_and_evaluate(
+ rank,
+ epoch,
+ hps,
+ nets,
+ optims,
+ loaders,
+ writers,
+ cache,
+ custom_save_every_weights,
+ custom_total_epoch,
+ device,
+ device_id,
+ reference,
+ fn_mel_loss,
+):
+ """
+ Trains and evaluates the model for one epoch.
+
+ Args:
+ rank (int): Rank of the current process.
+ epoch (int): Current epoch number.
+ hps (Namespace): Hyperparameters.
+ nets (list): List of models [net_g, net_d].
+ optims (list): List of optimizers [optim_g, optim_d].
+ loaders (list): List of dataloaders [train_loader, eval_loader].
+ writers (list): List of TensorBoard writers [writer_eval].
+ cache (list): List to cache data in GPU memory.
+ custom_save_every_weights (bool): Whether to save a full weight snapshot at every save epoch.
+ custom_total_epoch (int): The total number of epochs for training.
+ device (torch.device): The device to use for training (CPU or GPU).
+ device_id (int): Index of the device assigned to this process.
+ reference (tuple): Reference batch used for TensorBoard audio previews.
+ fn_mel_loss (callable): Multi-scale mel-spectrogram loss function.
+ """
+ global global_step, lowest_value, loss_disc, consecutive_increases_gen, consecutive_increases_disc, smoothed_value_gen, smoothed_value_disc
+
+ if epoch == 1:
+ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
+ consecutive_increases_gen = 0
+ consecutive_increases_disc = 0
+
+ epoch_disc_sum = 0.0
+ epoch_gen_sum = 0.0
+
+ net_g, net_d = nets
+ optim_g, optim_d = optims
+ train_loader = loaders[0] if loaders is not None else None
+ if writers is not None:
+ writer = writers[0]
+
+ train_loader.batch_sampler.set_epoch(epoch)
+
+ net_g.train()
+ net_d.train()
+
+ # Data caching
+ if device.type == "cuda" and cache_data_in_gpu:
+ data_iterator = cache
+ if cache == []:
+ for batch_idx, info in enumerate(train_loader):
+ # phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid
+ info = [tensor.cuda(device_id, non_blocking=True) for tensor in info]
+ cache.append((batch_idx, info))
+ else:
+ shuffle(cache)
+ else:
+ data_iterator = enumerate(train_loader)
+
+ epoch_recorder = EpochRecorder()
+ with tqdm(total=len(train_loader), leave=False) as pbar:
+ for batch_idx, info in data_iterator:
+ if device.type == "cuda" and not cache_data_in_gpu:
+ info = [tensor.cuda(device_id, non_blocking=True) for tensor in info]
+ elif device.type != "cuda":
+ info = [tensor.to(device) for tensor in info]
+ # else iterator is going thru a cached list with a device already assigned
+
+ (
+ phone,
+ phone_lengths,
+ pitch,
+ pitchf,
+ spec,
+ spec_lengths,
+ wave,
+ wave_lengths,
+ sid,
+ ) = info
+
+ # Forward pass
+ model_output = net_g(
+ phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
+ )
+ y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = (
+ model_output
+ )
+ # slice of the original waveform to match a generate slice
+ if randomized:
+ wave = commons.slice_segments(
+ wave,
+ ids_slice * config.data.hop_length,
+ config.train.segment_size,
+ dim=3,
+ )
+ y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
+ loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
+ # Discriminator backward and update
+ epoch_disc_sum += loss_disc.item()
+ optim_d.zero_grad()
+ loss_disc.backward()
+ grad_norm_d = torch.nn.utils.clip_grad_norm_(
+ net_d.parameters(), max_norm=1000.0
+ )
+ optim_d.step()
+
+ # Generator backward and update
+ _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
+
+ loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0
+ loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
+ loss_fm = feature_loss(fmap_r, fmap_g)
+ loss_gen, _ = generator_loss(y_d_hat_g)
+ loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
+
+ if loss_gen_all < lowest_value["value"]:
+ lowest_value = {
+ "step": global_step,
+ "value": loss_gen_all,
+ "epoch": epoch,
+ }
+ epoch_gen_sum += loss_gen_all.item()
+ optim_g.zero_grad()
+ loss_gen_all.backward()
+ grad_norm_g = torch.nn.utils.clip_grad_norm_(
+ net_g.parameters(), max_norm=1000.0
+ )
+ optim_g.step()
+
+ global_step += 1
+
+ # queue for rolling losses over 50 steps
+ avg_losses["disc_loss_50"].append(loss_disc.detach())
+ avg_losses["fm_loss_50"].append(loss_fm.detach())
+ avg_losses["kl_loss_50"].append(loss_kl.detach())
+ avg_losses["mel_loss_50"].append(loss_mel.detach())
+ avg_losses["gen_loss_50"].append(loss_gen_all.detach())
+
+ if rank == 0 and global_step % 50 == 0:
+ # logging rolling averages
+ scalar_dict = {
+ "loss_avg_50/d/total": torch.mean(
+ torch.stack(list(avg_losses["disc_loss_50"]))
+ ),
+ "loss_avg_50/g/fm": torch.mean(
+ torch.stack(list(avg_losses["fm_loss_50"]))
+ ),
+ "loss_avg_50/g/kl": torch.mean(
+ torch.stack(list(avg_losses["kl_loss_50"]))
+ ),
+ "loss_avg_50/g/mel": torch.mean(
+ torch.stack(list(avg_losses["mel_loss_50"]))
+ ),
+ "loss_avg_50/g/total": torch.mean(
+ torch.stack(list(avg_losses["gen_loss_50"]))
+ ),
+ }
+ summarize(
+ writer=writer,
+ global_step=global_step,
+ scalars=scalar_dict,
+ )
+
+ pbar.update(1)
+ # end of batch train
+ # end of tqdm
+ with torch.no_grad():
+ torch.cuda.empty_cache()
+
+ # Logging and checkpointing
+ if rank == 0:
+
+ avg_losses["disc_loss_queue"].append(epoch_disc_sum / len(train_loader))
+ avg_losses["gen_loss_queue"].append(epoch_gen_sum / len(train_loader))
+
+ # used for tensorboard chart - all/mel
+ mel = spec_to_mel_torch(
+ spec,
+ config.data.filter_length,
+ config.data.n_mel_channels,
+ config.data.sample_rate,
+ config.data.mel_fmin,
+ config.data.mel_fmax,
+ )
+ # used for tensorboard chart - slice/mel_org
+ if randomized:
+ y_mel = commons.slice_segments(
+ mel,
+ ids_slice,
+ config.train.segment_size // config.data.hop_length,
+ dim=3,
+ )
+ else:
+ y_mel = mel
+ # used for tensorboard chart - slice/mel_gen
+ y_hat_mel = mel_spectrogram_torch(
+ y_hat.float().squeeze(1),
+ config.data.filter_length,
+ config.data.n_mel_channels,
+ config.data.sample_rate,
+ config.data.hop_length,
+ config.data.win_length,
+ config.data.mel_fmin,
+ config.data.mel_fmax,
+ )
+
+ lr = optim_g.param_groups[0]["lr"]
+
+ scalar_dict = {
+ "loss/g/total": loss_gen_all,
+ "loss/d/total": loss_disc,
+ "learning_rate": lr,
+ "grad/norm_d": grad_norm_d.item(),
+ "grad/norm_g": grad_norm_g.item(),
+ "loss/g/fm": loss_fm,
+ "loss/g/mel": loss_mel,
+ "loss/g/kl": loss_kl,
+ "loss_avg_epoch/disc": np.mean(avg_losses["disc_loss_queue"]),
+ "loss_avg_epoch/gen": np.mean(avg_losses["gen_loss_queue"]),
+ }
+
+ image_dict = {
+ "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
+ "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
+ "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
+ }
+
+ if epoch % save_every_epoch == 0:
+ with torch.no_grad():
+ if hasattr(net_g, "module"):
+ o, *_ = net_g.module.infer(*reference)
+ else:
+ o, *_ = net_g.infer(*reference)
+ audio_dict = {f"gen/audio_{global_step:07d}": o[0, :, :]}
+ summarize(
+ writer=writer,
+ global_step=global_step,
+ images=image_dict,
+ scalars=scalar_dict,
+ audios=audio_dict,
+ audio_sample_rate=config.data.sample_rate,
+ )
+ else:
+ summarize(
+ writer=writer,
+ global_step=global_step,
+ images=image_dict,
+ scalars=scalar_dict,
+ )
+
+ # Save checkpoint
+ model_add = []
+ model_del = []
+ done = False
+
+ if rank == 0:
+ overtrain_info = ""
+ # Check overtraining
+ if overtraining_detector and rank == 0 and epoch > 1:
+ # Add the current loss to the history
+ current_loss_disc = float(loss_disc)
+ loss_disc_history.append(current_loss_disc)
+ # Update smoothed loss history with loss_disc
+ smoothed_value_disc = update_exponential_moving_average(
+ smoothed_loss_disc_history, current_loss_disc
+ )
+ # Check overtraining with smoothed loss_disc
+ is_overtraining_disc = check_overtraining(
+ smoothed_loss_disc_history, overtraining_threshold * 2
+ )
+ if is_overtraining_disc:
+ consecutive_increases_disc += 1
+ else:
+ consecutive_increases_disc = 0
+ # Add the current loss_gen to the history
+ current_loss_gen = float(lowest_value["value"])
+ loss_gen_history.append(current_loss_gen)
+ # Update the smoothed loss_gen history
+ smoothed_value_gen = update_exponential_moving_average(
+ smoothed_loss_gen_history, current_loss_gen
+ )
+ # Check for overtraining with the smoothed loss_gen
+ is_overtraining_gen = check_overtraining(
+ smoothed_loss_gen_history, overtraining_threshold, 0.01
+ )
+ if is_overtraining_gen:
+ consecutive_increases_gen += 1
+ else:
+ consecutive_increases_gen = 0
+ overtrain_info = f"Smoothed loss_g {smoothed_value_gen:.3f} and loss_d {smoothed_value_disc:.3f}"
+ # Save the data in the JSON file if the epoch is divisible by save_every_epoch
+ if epoch % save_every_epoch == 0:
+ save_to_json(
+ training_file_path,
+ loss_disc_history,
+ smoothed_loss_disc_history,
+ loss_gen_history,
+ smoothed_loss_gen_history,
+ )
+
+ if (
+ is_overtraining_gen
+ and consecutive_increases_gen == overtraining_threshold
+ ) or (
+ is_overtraining_disc
+ and consecutive_increases_disc == overtraining_threshold * 2
+ ):
+ print(
+ f"Overtraining detected at epoch {epoch} with smoothed loss_g {smoothed_value_gen:.3f} and loss_d {smoothed_value_disc:.3f}"
+ )
+ done = True
+ else:
+ print(
+ f"New best epoch {epoch} with smoothed loss_g {smoothed_value_gen:.3f} and loss_d {smoothed_value_disc:.3f}"
+ )
+ old_model_files = glob.glob(
+ os.path.join(experiment_dir, f"{model_name}_*e_*s_best_epoch.pth")
+ )
+ for file in old_model_files:
+ model_del.append(file)
+ model_add.append(
+ os.path.join(
+ experiment_dir,
+ f"{model_name}_{epoch}e_{global_step}s_best_epoch.pth",
+ )
+ )
+
+ # Print training progress
+ lowest_value_rounded = float(lowest_value["value"])
+ lowest_value_rounded = round(lowest_value_rounded, 3)
+
+ record = f"{model_name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()}"
+ if epoch > 1:
+ record = (
+ record
+ + f" | lowest_value={lowest_value_rounded} (epoch {lowest_value['epoch']} and step {lowest_value['step']})"
+ )
+
+ if overtraining_detector:
+ remaining_epochs_gen = overtraining_threshold - consecutive_increases_gen
+ remaining_epochs_disc = (
+ overtraining_threshold * 2 - consecutive_increases_disc
+ )
+ record = (
+ record
+ + f" | Number of epochs remaining for overtraining: g/total: {remaining_epochs_gen} d/total: {remaining_epochs_disc} | smoothed_loss_gen={smoothed_value_gen:.3f} | smoothed_loss_disc={smoothed_value_disc:.3f}"
+ )
+ print(record)
+
+ # Save weights every N epochs
+ if epoch % save_every_epoch == 0:
+ checkpoint_suffix = f"{2333333 if save_only_latest else global_step}.pth"
+ save_checkpoint(
+ net_g,
+ optim_g,
+ config.train.learning_rate,
+ epoch,
+ os.path.join(experiment_dir, "G_" + checkpoint_suffix),
+ )
+ save_checkpoint(
+ net_d,
+ optim_d,
+ config.train.learning_rate,
+ epoch,
+ os.path.join(experiment_dir, "D_" + checkpoint_suffix),
+ )
+ if custom_save_every_weights:
+ model_add.append(
+ os.path.join(
+ experiment_dir, f"{model_name}_{epoch}e_{global_step}s.pth"
+ )
+ )
+
+ # Clean-up old best epochs
+ for m in model_del:
+ os.remove(m)
+
+ if model_add:
+ ckpt = (
+ net_g.module.state_dict()
+ if hasattr(net_g, "module")
+ else net_g.state_dict()
+ )
+ for m in model_add:
+ if os.path.exists(m):
+ print(f"{m} already exists. Overwriting.")
+ extract_model(
+ ckpt=ckpt,
+ sr=sample_rate,
+ name=model_name,
+ model_path=m,
+ epoch=epoch,
+ step=global_step,
+ hps=hps,
+ overtrain_info=overtrain_info,
+ vocoder=vocoder,
+ )
+
+ # Check completion
+ if epoch >= custom_total_epoch:
+ lowest_value_rounded = float(lowest_value["value"])
+ lowest_value_rounded = round(lowest_value_rounded, 3)
+ print(
+ f"Training has been successfully completed with {epoch} epoch, {global_step} steps and {round(loss_gen_all.item(), 3)} loss gen."
+ )
+ print(
+ f"Lowest generator loss: {lowest_value_rounded} at epoch {lowest_value['epoch']}, step {lowest_value['step']}"
+ )
+
+ pid_file_path = os.path.join(experiment_dir, "config.json")
+ with open(pid_file_path, "r") as pid_file:
+ pid_data = json.load(pid_file)
+ with open(pid_file_path, "w") as pid_file:
+ pid_data.pop("process_pids", None)
+ json.dump(pid_data, pid_file, indent=4)
+ # Final model
+ model_add.append(
+ os.path.join(
+ experiment_dir, f"{model_name}_{epoch}e_{global_step}s.pth"
+ )
+ )
+ done = True
+
+ if done:
+ os._exit(2333333)
+
+ with torch.no_grad():
+ torch.cuda.empty_cache()
+
+
+def check_overtraining(smoothed_loss_history, threshold, epsilon=0.004):
+ """
+ Checks for overtraining based on the smoothed loss history.
+
+ Args:
+ smoothed_loss_history (list): List of smoothed losses for each epoch.
+ threshold (int): Number of consecutive epochs with insignificant changes or increases to consider overtraining.
+ epsilon (float): The maximum change considered insignificant.
+ """
+ if len(smoothed_loss_history) < threshold + 1:
+ return False
+
+ for i in range(-threshold, -1):
+ if smoothed_loss_history[i + 1] > smoothed_loss_history[i]:
+ return True
+ if abs(smoothed_loss_history[i + 1] - smoothed_loss_history[i]) >= epsilon:
+ return False
+ return True
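+
+
+# Illustrative behaviour with threshold=3 and the default epsilon=0.004
+# (the check walks the most recent consecutive pairs of smoothed losses):
+#   [1.00, 0.80, 0.85, 0.90]    -> True  (the smoothed loss rose inside the window)
+#   [1.00, 0.90, 0.80, 0.70]    -> False (still improving by more than epsilon)
+#   [1.00, 0.999, 0.998, 0.997] -> True  (plateau: every change is below epsilon)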
+
+
+def update_exponential_moving_average(
+ smoothed_loss_history, new_value, smoothing=0.987
+):
+ """
+ Updates the exponential moving average with a new value.
+
+ Args:
+ smoothed_loss_history (list): List of smoothed values.
+ new_value (float): New value to be added.
+ smoothing (float): Smoothing factor.
+ """
+ if smoothed_loss_history:
+ smoothed_value = (
+ smoothing * smoothed_loss_history[-1] + (1 - smoothing) * new_value
+ )
+ else:
+ smoothed_value = new_value
+ smoothed_loss_history.append(smoothed_value)
+ return smoothed_value
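+
+
+# Worked example: with smoothing=0.987, a history ending in 2.0 followed by a new
+# value of 1.0 yields 0.987 * 2.0 + 0.013 * 1.0 = 1.987, so a single-epoch spike
+# barely moves the smoothed curve.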
+
+
+def save_to_json(
+ file_path,
+ loss_disc_history,
+ smoothed_loss_disc_history,
+ loss_gen_history,
+ smoothed_loss_gen_history,
+):
+ """
+ Save the training history to a JSON file.
+ """
+ data = {
+ "loss_disc_history": loss_disc_history,
+ "smoothed_loss_disc_history": smoothed_loss_disc_history,
+ "loss_gen_history": loss_gen_history,
+ "smoothed_loss_gen_history": smoothed_loss_gen_history,
+ }
+ with open(file_path, "w") as f:
+ json.dump(data, f)
+
+
+if __name__ == "__main__":
+ torch.multiprocessing.set_start_method("spawn")
+ main()
diff --git a/rvc/train/utils.py b/rvc/train/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c43d6b3b930e6e0708a890f36baa990e40b4ac96
--- /dev/null
+++ b/rvc/train/utils.py
@@ -0,0 +1,247 @@
+import os
+import glob
+import torch
+import numpy as np
+import soundfile as sf
+from collections import OrderedDict
+import matplotlib.pyplot as plt
+
+MATPLOTLIB_FLAG = False
+
+
+def replace_keys_in_dict(d, old_key_part, new_key_part):
+ """
+ Recursively replace parts of the keys in a dictionary.
+
+ Args:
+ d (dict or OrderedDict): The dictionary to update.
+ old_key_part (str): The part of the key to replace.
+ new_key_part (str): The new part of the key.
+ """
+ updated_dict = OrderedDict() if isinstance(d, OrderedDict) else {}
+ for key, value in d.items():
+ new_key = (
+ key.replace(old_key_part, new_key_part) if isinstance(key, str) else key
+ )
+ updated_dict[new_key] = (
+ replace_keys_in_dict(value, old_key_part, new_key_part)
+ if isinstance(value, dict)
+ else value
+ )
+ return updated_dict
+
+
+def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
+ """
+ Load a checkpoint into a model and optionally the optimizer.
+
+ Args:
+ checkpoint_path (str): Path to the checkpoint file.
+ model (torch.nn.Module): The model to load the checkpoint into.
+ optimizer (torch.optim.Optimizer, optional): The optimizer to load the state from. Defaults to None.
+ load_opt (int, optional): Whether to load the optimizer state. Defaults to 1.
+ """
+ assert os.path.isfile(
+ checkpoint_path
+ ), f"Checkpoint file not found: {checkpoint_path}"
+
+ checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
+ checkpoint_dict = replace_keys_in_dict(
+ replace_keys_in_dict(
+ checkpoint_dict, ".weight_v", ".parametrizations.weight.original1"
+ ),
+ ".weight_g",
+ ".parametrizations.weight.original0",
+ )
+
+ # Update model state_dict
+ model_state_dict = (
+ model.module.state_dict() if hasattr(model, "module") else model.state_dict()
+ )
+ new_state_dict = {
+ k: checkpoint_dict["model"].get(k, v) for k, v in model_state_dict.items()
+ }
+
+ # Load state_dict into model
+ if hasattr(model, "module"):
+ model.module.load_state_dict(new_state_dict, strict=False)
+ else:
+ model.load_state_dict(new_state_dict, strict=False)
+
+ if optimizer and load_opt == 1:
+ optimizer.load_state_dict(checkpoint_dict.get("optimizer", {}))
+
+ print(
+ f"Loaded checkpoint '{checkpoint_path}' (epoch {checkpoint_dict['iteration']})"
+ )
+ return (
+ model,
+ optimizer,
+ checkpoint_dict.get("learning_rate", 0),
+ checkpoint_dict["iteration"],
+ )
+
+
+def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
+ """
+ Save the model and optimizer state to a checkpoint file.
+
+ Args:
+ model (torch.nn.Module): The model to save.
+ optimizer (torch.optim.Optimizer): The optimizer to save the state of.
+ learning_rate (float): The current learning rate.
+ iteration (int): The current iteration.
+ checkpoint_path (str): The path to save the checkpoint to.
+ """
+ state_dict = (
+ model.module.state_dict() if hasattr(model, "module") else model.state_dict()
+ )
+ checkpoint_data = {
+ "model": state_dict,
+ "iteration": iteration,
+ "optimizer": optimizer.state_dict(),
+ "learning_rate": learning_rate,
+ }
+
+ # Create a backwards-compatible checkpoint
+ torch.save(
+ replace_keys_in_dict(
+ replace_keys_in_dict(
+ checkpoint_data, ".parametrizations.weight.original1", ".weight_v"
+ ),
+ ".parametrizations.weight.original0",
+ ".weight_g",
+ ),
+ checkpoint_path,
+ )
+
+ print(f"Saved model '{checkpoint_path}' (epoch {iteration})")
+
+
+def summarize(
+ writer,
+ global_step,
+ scalars={},
+ histograms={},
+ images={},
+ audios={},
+ audio_sample_rate=22050,
+):
+ """
+ Log various summaries to a TensorBoard writer.
+
+ Args:
+ writer (SummaryWriter): The TensorBoard writer.
+ global_step (int): The current global step.
+ scalars (dict, optional): Dictionary of scalar values to log.
+ histograms (dict, optional): Dictionary of histogram values to log.
+ images (dict, optional): Dictionary of image values to log.
+ audios (dict, optional): Dictionary of audio values to log.
+ audio_sample_rate (int, optional): Sampling rate of the audio data.
+ """
+ for k, v in scalars.items():
+ writer.add_scalar(k, v, global_step)
+ for k, v in histograms.items():
+ writer.add_histogram(k, v, global_step)
+ for k, v in images.items():
+ writer.add_image(k, v, global_step, dataformats="HWC")
+ for k, v in audios.items():
+ writer.add_audio(k, v, global_step, audio_sample_rate)
+
+
+def latest_checkpoint_path(dir_path, regex="G_*.pth"):
+ """
+ Get the latest checkpoint file in a directory.
+
+ Args:
+ dir_path (str): The directory to search for checkpoints.
+ regex (str, optional): The glob pattern used to match checkpoint files.
+ """
+ checkpoints = sorted(
+ glob.glob(os.path.join(dir_path, regex)),
+ key=lambda f: int("".join(filter(str.isdigit, f))),
+ )
+ return checkpoints[-1] if checkpoints else None
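+
+
+# Example with hypothetical files G_100.pth and G_200.pth in dir_path: the numeric
+# sort key picks the checkpoint with the highest step count.
+#   latest_checkpoint_path("logs/my_model", "G_*.pth")  # -> "logs/my_model/G_200.pth"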
+
+
+def plot_spectrogram_to_numpy(spectrogram):
+ """
+ Convert a spectrogram to a NumPy array for visualization.
+
+ Args:
+ spectrogram (numpy.ndarray): The spectrogram to plot.
+ """
+ global MATPLOTLIB_FLAG
+ if not MATPLOTLIB_FLAG:
+ plt.switch_backend("Agg")
+ MATPLOTLIB_FLAG = True
+
+ fig, ax = plt.subplots(figsize=(10, 2))
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
+ plt.colorbar(im, ax=ax)
+ plt.xlabel("Frames")
+ plt.ylabel("Channels")
+ plt.tight_layout()
+
+ fig.canvas.draw()
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ plt.close(fig)
+ return data
+
+
+def load_wav_to_torch(full_path):
+ """
+ Load a WAV file into a PyTorch tensor.
+
+ Args:
+ full_path (str): The path to the WAV file.
+ """
+ data, sample_rate = sf.read(full_path, dtype="float32")
+ return torch.FloatTensor(data), sample_rate
+
+
+def load_filepaths_and_text(filename, split="|"):
+ """
+ Load filepaths and associated text from a file.
+
+ Args:
+ filename (str): The path to the file.
+ split (str, optional): The delimiter used to split the lines.
+ """
+ with open(filename, encoding="utf-8") as f:
+ return [line.strip().split(split) for line in f]
+
+
+class HParams:
+ """
+ A class for storing and accessing hyperparameters.
+ """
+
+ def __init__(self, **kwargs):
+ for k, v in kwargs.items():
+ self[k] = HParams(**v) if isinstance(v, dict) else v
+
+ def keys(self):
+ return self.__dict__.keys()
+
+ def items(self):
+ return self.__dict__.items()
+
+ def values(self):
+ return self.__dict__.values()
+
+ def __len__(self):
+ return len(self.__dict__)
+
+ def __getitem__(self, key):
+ return self.__dict__[key]
+
+ def __setitem__(self, key, value):
+ self.__dict__[key] = value
+
+ def __contains__(self, key):
+ return key in self.__dict__
+
+ def __repr__(self):
+ return repr(self.__dict__)
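+
+
+# Example: nested dicts become attribute-accessible, which is how the training
+# code reads values such as config.train.learning_rate from the JSON config.
+#   hp = HParams(**{"train": {"learning_rate": 1e-4}})
+#   hp.train.learning_rate  # -> 0.0001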
diff --git a/tabs/download/download.py b/tabs/download/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..d93f179dade19032af6d179f79d93c477985cdaa
--- /dev/null
+++ b/tabs/download/download.py
@@ -0,0 +1,231 @@
+import os
+import sys
+import json
+import shutil
+import requests
+import tempfile
+import gradio as gr
+import pandas as pd
+
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from core import run_download_script
+from rvc.lib.utils import format_title
+
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+gradio_temp_dir = os.path.join(tempfile.gettempdir(), "gradio")
+
+if os.path.exists(gradio_temp_dir):
+ shutil.rmtree(gradio_temp_dir)
+
+
+def save_drop_model(dropbox):
+ if "pth" not in dropbox and "index" not in dropbox:
+ raise gr.Error(
+ message="The file you dropped is not a valid model file. Please try again."
+ )
+
+ file_name = format_title(os.path.basename(dropbox))
+ model_name = file_name
+
+ if ".pth" in model_name:
+ model_name = model_name.split(".pth")[0]
+ elif ".index" in model_name:
+ replacements = ["nprobe_1_", "_v1", "_v2", "added_"]
+ for rep in replacements:
+ model_name = model_name.replace(rep, "")
+ model_name = model_name.split(".index")[0]
+
+ model_path = os.path.join(now_dir, "logs", model_name)
+ if not os.path.exists(model_path):
+ os.makedirs(model_path)
+ if os.path.exists(os.path.join(model_path, file_name)):
+ os.remove(os.path.join(model_path, file_name))
+ shutil.move(dropbox, os.path.join(model_path, file_name))
+ print(f"{file_name} saved in {model_path}")
+ gr.Info(f"{file_name} saved in {model_path}")
+
+ return None
+
+
+json_url = "https://huggingface.co/IAHispano/Applio/raw/main/pretrains.json"
+
+
+def fetch_pretrained_data():
+ pretraineds_custom_path = os.path.join(
+ "rvc", "models", "pretraineds", "pretraineds_custom"
+ )
+ os.makedirs(pretraineds_custom_path, exist_ok=True)
+ try:
+ with open(
+ os.path.join(pretraineds_custom_path, json_url.split("/")[-1]), "r"
+ ) as f:
+ data = json.load(f)
+ except:
+ try:
+ response = requests.get(json_url)
+ response.raise_for_status()
+ data = response.json()
+ with open(
+ os.path.join(pretraineds_custom_path, json_url.split("/")[-1]),
+ "w",
+ encoding="utf-8",
+ ) as f:
+ json.dump(
+ data,
+ f,
+ indent=2,
+ separators=(",", ": "),
+ ensure_ascii=False,
+ )
+ except:
+ data = {
+ "Titan": {
+ "32k": {"D": "null", "G": "null"},
+ },
+ }
+ return data
+
+
+def get_pretrained_list():
+ data = fetch_pretrained_data()
+ return list(data.keys())
+
+
+def get_pretrained_sample_rates(model):
+ data = fetch_pretrained_data()
+ return list(data[model].keys())
+
+
+def get_file_size(url):
+ response = requests.head(url)
+ return int(response.headers.get("content-length", 0))
+
+
+def download_file(url, destination_path, progress_bar):
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+ response = requests.get(url, stream=True)
+ block_size = 1024
+ with open(destination_path, "wb") as file:
+ for data in response.iter_content(block_size):
+ file.write(data)
+ progress_bar.update(len(data))
+
+
+def download_pretrained_model(model, sample_rate):
+ data = fetch_pretrained_data()
+ paths = data[model][sample_rate]
+ pretraineds_custom_path = os.path.join(
+ "rvc", "models", "pretraineds", "pretraineds_custom"
+ )
+ os.makedirs(pretraineds_custom_path, exist_ok=True)
+
+ d_url = f"https://huggingface.co/{paths['D']}"
+ g_url = f"https://huggingface.co/{paths['G']}"
+
+ total_size = get_file_size(d_url) + get_file_size(g_url)
+
+ gr.Info("Downloading pretrained model...")
+
+ with tqdm(
+ total=total_size, unit="iB", unit_scale=True, desc="Downloading files"
+ ) as progress_bar:
+ with ThreadPoolExecutor(max_workers=2) as executor:
+ futures = [
+ executor.submit(
+ download_file,
+ d_url,
+ os.path.join(pretraineds_custom_path, os.path.basename(paths["D"])),
+ progress_bar,
+ ),
+ executor.submit(
+ download_file,
+ g_url,
+ os.path.join(pretraineds_custom_path, os.path.basename(paths["G"])),
+ progress_bar,
+ ),
+ ]
+ for future in futures:
+ future.result()
+
+ gr.Info("Pretrained model downloaded successfully!")
+ print("Pretrained model downloaded successfully!")
+
+
+def update_sample_rate_dropdown(model):
+ return {
+ "choices": get_pretrained_sample_rates(model),
+ "value": get_pretrained_sample_rates(model)[0],
+ "__type__": "update",
+ }
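+
+
+# The dict above is the raw Gradio update payload; it is roughly equivalent to
+# returning gr.update(choices=..., value=...) for the sample-rate dropdown.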
+
+
+def download_tab():
+ with gr.Column():
+ gr.Markdown(value=i18n("## Download Model"))
+ model_link = gr.Textbox(
+ label=i18n("Model Link"),
+ placeholder=i18n("Introduce the model link"),
+ interactive=True,
+ )
+ model_download_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=8,
+ interactive=False,
+ )
+ model_download_button = gr.Button(i18n("Download Model"))
+ model_download_button.click(
+ fn=run_download_script,
+ inputs=[model_link],
+ outputs=[model_download_output_info],
+ )
+ gr.Markdown(value=i18n("## Drop files"))
+ dropbox = gr.File(
+ label=i18n(
+ "Drag your .pth file and .index file into this space. Drag one and then the other."
+ ),
+ type="filepath",
+ )
+
+ dropbox.upload(
+ fn=save_drop_model,
+ inputs=[dropbox],
+ outputs=[dropbox],
+ )
+ gr.Markdown(value=i18n("## Download Pretrained Models"))
+ pretrained_model = gr.Dropdown(
+ label=i18n("Pretrained"),
+ info=i18n("Select the pretrained model you want to download."),
+ choices=get_pretrained_list(),
+ value="Titan",
+ interactive=True,
+ )
+ pretrained_sample_rate = gr.Dropdown(
+ label=i18n("Sampling Rate"),
+ info=i18n("And select the sampling rate."),
+ choices=get_pretrained_sample_rates(pretrained_model.value),
+ value="40k",
+ interactive=True,
+ allow_custom_value=True,
+ )
+ pretrained_model.change(
+ update_sample_rate_dropdown,
+ inputs=[pretrained_model],
+ outputs=[pretrained_sample_rate],
+ )
+ download_pretrained = gr.Button(i18n("Download"))
+ download_pretrained.click(
+ fn=download_pretrained_model,
+ inputs=[pretrained_model, pretrained_sample_rate],
+ outputs=[],
+ )
diff --git a/tabs/extra/extra.py b/tabs/extra/extra.py
new file mode 100644
index 0000000000000000000000000000000000000000..99426015f1f7dcf316209e7a2e9707dc94176df8
--- /dev/null
+++ b/tabs/extra/extra.py
@@ -0,0 +1,25 @@
+import os
+import sys
+import gradio as gr
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from tabs.extra.sections.processing import processing_tab
+from tabs.extra.sections.analyzer import analyzer_tab
+from tabs.extra.sections.f0_extractor import f0_extractor_tab
+
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def extra_tab():
+ with gr.TabItem(i18n("Model information")):
+ processing_tab()
+
+ with gr.TabItem(i18n("F0 Curve")):
+ f0_extractor_tab()
+
+ with gr.TabItem(i18n("Audio Analyzer")):
+ analyzer_tab()
diff --git a/tabs/extra/model_information.py b/tabs/extra/model_information.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd39110c5288cb2d49c2ad999fa8e06519168688
--- /dev/null
+++ b/tabs/extra/model_information.py
@@ -0,0 +1,29 @@
+import gradio as gr
+from core import run_model_information_script
+
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def model_information_tab():
+ with gr.Column():
+ model_name = gr.Textbox(
+ label=i18n("Path to Model"),
+ info=i18n("Introduce the model pth path"),
+ placeholder=i18n("Introduce the model pth path"),
+ interactive=True,
+ )
+ model_information_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=12,
+ interactive=False,
+ )
+ model_information_button = gr.Button(i18n("See Model Information"))
+ model_information_button.click(
+ fn=run_model_information_script,
+ inputs=[model_name],
+ outputs=[model_information_output_info],
+ )
diff --git a/tabs/extra/sections/analyzer.py b/tabs/extra/sections/analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..371d8aa952adfca71ac265219bd94c23152e2a86
--- /dev/null
+++ b/tabs/extra/sections/analyzer.py
@@ -0,0 +1,30 @@
+import os, sys
+import gradio as gr
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from core import run_audio_analyzer_script
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def analyzer_tab():
+ with gr.Column():
+ audio_input = gr.Audio(type="filepath")
+ output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=8,
+ interactive=False,
+ )
+ get_info_button = gr.Button(value=i18n("Get information about the audio"))
+ image_output = gr.Image(type="filepath", interactive=False)
+
+ get_info_button.click(
+ fn=run_audio_analyzer_script,
+ inputs=[audio_input],
+ outputs=[output_info, image_output],
+ )
diff --git a/tabs/extra/sections/f0_extractor.py b/tabs/extra/sections/f0_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a6c546c9cb19c63cc3b5fd48d0b245d598b6ac3
--- /dev/null
+++ b/tabs/extra/sections/f0_extractor.py
@@ -0,0 +1,66 @@
+import os
+import librosa
+import gradio as gr
+from matplotlib import pyplot as plt
+
+from rvc.lib.predictors.F0Extractor import F0Extractor
+
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def extract_f0_curve(audio_path: str, method: str):
+ print("Extracting F0 Curve...")
+ image_path = os.path.join("logs", "f0_plot.png")
+ txt_path = os.path.join("logs", "f0_curve.txt")
+ y, sr = librosa.load(audio_path, sr=None)
+ hop_length = 160
+
+ librosa.note_to_hz("C1")
+ librosa.note_to_hz("C8")
+
+ f0_extractor = F0Extractor(audio_path, sample_rate=sr, method=method)
+ f0 = f0_extractor.extract_f0()
+
+ plt.figure(figsize=(10, 4))
+ plt.plot(f0)
+ plt.title(method)
+ plt.xlabel("Time (frames)")
+ plt.ylabel("Frequency (Hz)")
+ plt.savefig(image_path)
+ plt.close()
+
+ with open(txt_path, "w") as txtfile:
+ for i, f0_value in enumerate(f0):
+ frequency = i * sr / hop_length
+ txtfile.write(f"{frequency},{f0_value}\n")
+
+ print("F0 Curve extracted successfully!")
+ return image_path, txt_path
+
+
+def f0_extractor_tab():
+ audio = gr.Audio(label=i18n("Upload Audio"), type="filepath")
+ f0_method = gr.Radio(
+ label=i18n("Pitch extraction algorithm"),
+ info=i18n(
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
+ ),
+ choices=["crepe", "fcpe", "rmvpe"],
+ value="rmvpe",
+ )
+ button = gr.Button(i18n("Extract F0 Curve"))
+
+ with gr.Row():
+ txt_output = gr.File(label="F0 Curve", type="filepath")
+ image_output = gr.Image(type="filepath", interactive=False)
+
+ button.click(
+ fn=extract_f0_curve,
+ inputs=[
+ audio,
+ f0_method,
+ ],
+ outputs=[image_output, txt_output],
+ )
diff --git a/tabs/extra/sections/processing.py b/tabs/extra/sections/processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d12640e9bc7768227f9b492d9599b790fde17ee
--- /dev/null
+++ b/tabs/extra/sections/processing.py
@@ -0,0 +1,34 @@
+import os
+import sys
+import gradio as gr
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from core import run_model_information_script
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def processing_tab():
+ model_view_model_path = gr.Textbox(
+ label=i18n("Path to Model"),
+ info=i18n("Introduce the model pth path"),
+ value="",
+ interactive=True,
+ placeholder=i18n("Enter path to model"),
+ )
+
+ model_view_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=11,
+ )
+ model_view_button = gr.Button(i18n("View"))
+ model_view_button.click(
+ fn=run_model_information_script,
+ inputs=[model_view_model_path],
+ outputs=[model_view_output_info],
+ )
diff --git a/tabs/inference/inference.py b/tabs/inference/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6fac000c266b714f96f00b60c22ca4feb085639
--- /dev/null
+++ b/tabs/inference/inference.py
@@ -0,0 +1,2185 @@
+import os, sys
+import gradio as gr
+import regex as re
+import shutil
+import datetime
+import json
+import torch
+
+from core import (
+ run_infer_script,
+ run_batch_infer_script,
+)
+
+from assets.i18n.i18n import I18nAuto
+
+from rvc.lib.utils import format_title
+from tabs.settings.sections.restart import stop_infer
+
+i18n = I18nAuto()
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+model_root = os.path.join(now_dir, "logs")
+audio_root = os.path.join(now_dir, "assets", "audios")
+custom_embedder_root = os.path.join(
+ now_dir, "rvc", "models", "embedders", "embedders_custom"
+)
+
+PRESETS_DIR = os.path.join(now_dir, "assets", "presets")
+FORMANTSHIFT_DIR = os.path.join(now_dir, "assets", "formant_shift")
+
+os.makedirs(custom_embedder_root, exist_ok=True)
+
+custom_embedder_root_relative = os.path.relpath(custom_embedder_root, now_dir)
+model_root_relative = os.path.relpath(model_root, now_dir)
+audio_root_relative = os.path.relpath(audio_root, now_dir)
+
+sup_audioext = {
+ "wav",
+ "mp3",
+ "flac",
+ "ogg",
+ "opus",
+ "m4a",
+ "mp4",
+ "aac",
+ "alac",
+ "wma",
+ "aiff",
+ "webm",
+ "ac3",
+}
+
+names = [
+ os.path.join(root, file)
+ for root, _, files in os.walk(model_root_relative, topdown=False)
+ for file in files
+ if (
+ file.endswith((".pth", ".onnx"))
+ and not (file.startswith("G_") or file.startswith("D_"))
+ )
+]
+
+default_weight = names[0] if names else None
+
+indexes_list = [
+ os.path.join(root, name)
+ for root, _, files in os.walk(model_root_relative, topdown=False)
+ for name in files
+ if name.endswith(".index") and "trained" not in name
+]
+
+audio_paths = [
+ os.path.join(root, name)
+ for root, _, files in os.walk(audio_root_relative, topdown=False)
+ for name in files
+ if name.endswith(tuple(sup_audioext))
+ and root == audio_root_relative
+ and "_output" not in name
+]
+
+custom_embedders = [
+ os.path.join(dirpath, dirname)
+ for dirpath, dirnames, _ in os.walk(custom_embedder_root_relative)
+ for dirname in dirnames
+]
+
+
+def update_sliders(preset):
+ with open(
+ os.path.join(PRESETS_DIR, f"{preset}.json"), "r", encoding="utf-8"
+ ) as json_file:
+ values = json.load(json_file)
+ return (
+ values["pitch"],
+ values["filter_radius"],
+ values["index_rate"],
+ values["rms_mix_rate"],
+ values["protect"],
+ )
+
+
+def update_sliders_formant(preset):
+ with open(
+ os.path.join(FORMANTSHIFT_DIR, f"{preset}.json"), "r", encoding="utf-8"
+ ) as json_file:
+ values = json.load(json_file)
+ return (
+ values["formant_qfrency"],
+ values["formant_timbre"],
+ )
+
+
+def export_presets(presets, file_path):
+ with open(file_path, "w", encoding="utf-8") as json_file:
+ json.dump(presets, json_file, ensure_ascii=False, indent=4)
+
+
+def import_presets(file_path):
+ with open(file_path, "r", encoding="utf-8") as json_file:
+ presets = json.load(json_file)
+ return presets
+
+
+def get_presets_data(pitch, filter_radius, index_rate, rms_mix_rate, protect):
+ return {
+ "pitch": pitch,
+ "filter_radius": filter_radius,
+ "index_rate": index_rate,
+ "rms_mix_rate": rms_mix_rate,
+ "protect": protect,
+ }
+
+
+def export_presets_button(
+ preset_name, pitch, filter_radius, index_rate, rms_mix_rate, protect
+):
+ if preset_name:
+ file_path = os.path.join(PRESETS_DIR, f"{preset_name}.json")
+ presets_data = get_presets_data(
+ pitch, filter_radius, index_rate, rms_mix_rate, protect
+ )
+ with open(file_path, "w", encoding="utf-8") as json_file:
+ json.dump(presets_data, json_file, ensure_ascii=False, indent=4)
+ return "Export successful"
+ return "Export cancelled"
+
+
+def import_presets_button(file_path):
+ if file_path:
+ imported_presets = import_presets(file_path.name)
+ return (
+ list(imported_presets.keys()),
+ imported_presets,
+ "Presets imported successfully!",
+ )
+ return [], {}, "No file selected for import."
+
+
+def list_json_files(directory):
+ return [f.rsplit(".", 1)[0] for f in os.listdir(directory) if f.endswith(".json")]
+
+
+def refresh_presets():
+ json_files = list_json_files(PRESETS_DIR)
+ return gr.update(choices=json_files)
+
+
+def output_path_fn(input_audio_path):
+ original_name_without_extension = os.path.basename(input_audio_path).rsplit(".", 1)[
+ 0
+ ]
+ new_name = original_name_without_extension + "_output.wav"
+ output_path = os.path.join(os.path.dirname(input_audio_path), new_name)
+ return output_path
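+
+
+# Example: output_path_fn("assets/audios/test.wav") -> "assets/audios/test_output.wav"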
+
+
+def change_choices(model):
+ if model:
+ speakers = get_speakers_id(model)
+ else:
+ speakers = [0]
+ names = [
+ os.path.join(root, file)
+ for root, _, files in os.walk(model_root_relative, topdown=False)
+ for file in files
+ if (
+ file.endswith((".pth", ".onnx"))
+ and not (file.startswith("G_") or file.startswith("D_"))
+ )
+ ]
+
+ indexes_list = [
+ os.path.join(root, name)
+ for root, _, files in os.walk(model_root_relative, topdown=False)
+ for name in files
+ if name.endswith(".index") and "trained" not in name
+ ]
+
+ audio_paths = [
+ os.path.join(root, name)
+ for root, _, files in os.walk(audio_root_relative, topdown=False)
+ for name in files
+ if name.endswith(tuple(sup_audioext))
+ and root == audio_root_relative
+ and "_output" not in name
+ ]
+
+ return (
+ {"choices": sorted(names), "__type__": "update"},
+ {"choices": sorted(indexes_list), "__type__": "update"},
+ {"choices": sorted(audio_paths), "__type__": "update"},
+ {
+ "choices": (
+ sorted(speakers)
+ if speakers is not None and isinstance(speakers, (list, tuple))
+ else [0]
+ ),
+ "__type__": "update",
+ },
+ {
+ "choices": (
+ sorted(speakers)
+ if speakers is not None and isinstance(speakers, (list, tuple))
+ else [0]
+ ),
+ "__type__": "update",
+ },
+ )
+
+
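+# Collects every ".index" file under the model folder, skipping the "trained" ones.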
+def get_indexes():
+ indexes_list = [
+ os.path.join(dirpath, filename)
+ for dirpath, _, filenames in os.walk(model_root_relative)
+ for filename in filenames
+ if filename.endswith(".index") and "trained" not in filename
+ ]
+
+    return indexes_list
+
+
+def extract_model_and_epoch(path):
+ base_name = os.path.basename(path)
+ match = re.match(r"(.+?)_(\d+)e_", base_name)
+ if match:
+ model, epoch = match.groups()
+ return model, int(epoch)
+ return "", 0
+
+
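+# Recorded or uploaded audio is placed in the audio root so it appears in the
+# "Select Audio" dropdown, together with its derived "<name>_output.wav" path.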
+def save_to_wav(record_button):
+    if record_button is None:
+        return None, None
+    new_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".wav"
+    target_path = os.path.join(audio_root_relative, new_name)
+
+    shutil.move(record_button, target_path)
+    return target_path, output_path_fn(target_path)
+
+
+def save_to_wav2(upload_audio):
+ file_path = upload_audio
+    formatted_name = format_title(os.path.basename(file_path))
+    target_path = os.path.join(audio_root_relative, formatted_name)
+
+ if os.path.exists(target_path):
+ os.remove(target_path)
+
+ shutil.copy(file_path, target_path)
+ return target_path, output_path_fn(target_path)
+
+
+def delete_outputs():
+    for root, _, files in os.walk(audio_root_relative, topdown=False):
+        for name in files:
+            if name.endswith(tuple(sup_audioext)) and "_output" in name:
+                os.remove(os.path.join(root, name))
+    gr.Info("Outputs cleared!")
+
+
+def match_index(model_file_value):
+ if model_file_value:
+ model_folder = os.path.dirname(model_file_value)
+ model_name = os.path.basename(model_file_value)
+ index_files = get_indexes()
+ pattern = r"^(.*?)_"
+ match = re.match(pattern, model_name)
+ for index_file in index_files:
+ if os.path.dirname(index_file) == model_folder:
+ return index_file
+ elif match and match.group(1) in os.path.basename(index_file):
+ return index_file
+ elif model_name in os.path.basename(index_file):
+ return index_file
+ return ""
+
+
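+# Custom embedder management: uploaded .bin/.json pairs are stored in a dedicated
+# folder under the custom embedder root and picked up by refresh_embedders_folders().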
+def create_folder_and_move_files(folder_name, bin_file, config_file):
+ if not folder_name:
+ return "Folder name must not be empty."
+
+ folder_name = os.path.join(custom_embedder_root, folder_name)
+ os.makedirs(folder_name, exist_ok=True)
+
+ if bin_file:
+ bin_file_path = os.path.join(folder_name, os.path.basename(bin_file))
+ shutil.copy(bin_file, bin_file_path)
+
+ if config_file:
+ config_file_path = os.path.join(folder_name, os.path.basename(config_file))
+ shutil.copy(config_file, config_file_path)
+
+    return f"Files copied to folder {folder_name}"
+
+
+def refresh_formant():
+ json_files = list_json_files(FORMANTSHIFT_DIR)
+ return gr.update(choices=json_files)
+
+
+def refresh_embedders_folders():
+ custom_embedders = [
+ os.path.join(dirpath, dirname)
+ for dirpath, dirnames, _ in os.walk(custom_embedder_root_relative)
+ for dirname in dirnames
+ ]
+ return custom_embedders
+
+
+def get_speakers_id(model):
+    if not model:
+        return [0]
+    try:
+        model_data = torch.load(os.path.join(now_dir, model), map_location="cpu")
+        speakers_id = model_data.get("speakers_id")
+        return list(range(speakers_id)) if speakers_id else [0]
+    except Exception:
+        return [0]
+
+
+# Inference tab
+def inference_tab():
+ with gr.Column():
+ with gr.Row():
+ model_file = gr.Dropdown(
+ label=i18n("Voice Model"),
+ info=i18n("Select the voice model to use for the conversion."),
+ choices=sorted(names, key=lambda x: extract_model_and_epoch(x)),
+ interactive=True,
+ value=default_weight,
+ allow_custom_value=True,
+ )
+
+ index_file = gr.Dropdown(
+ label=i18n("Index File"),
+ info=i18n("Select the index file to use for the conversion."),
+ choices=get_indexes(),
+ value=match_index(default_weight) if default_weight else "",
+ interactive=True,
+ allow_custom_value=True,
+ )
+ with gr.Row():
+ unload_button = gr.Button(i18n("Unload Voice"))
+ refresh_button = gr.Button(i18n("Refresh"))
+
+ unload_button.click(
+ fn=lambda: (
+ {"value": "", "__type__": "update"},
+ {"value": "", "__type__": "update"},
+ ),
+ inputs=[],
+ outputs=[model_file, index_file],
+ )
+ model_file.select(
+ fn=lambda model_file_value: match_index(model_file_value),
+ inputs=[model_file],
+ outputs=[index_file],
+ )
+
+ # Single inference tab
+ with gr.Tab(i18n("Single")):
+ with gr.Column():
+ upload_audio = gr.Audio(
+ label=i18n("Upload Audio"), type="filepath", editable=False
+ )
+ with gr.Row():
+ audio = gr.Dropdown(
+ label=i18n("Select Audio"),
+ info=i18n("Select the audio to convert."),
+ choices=sorted(audio_paths),
+ value=audio_paths[0] if audio_paths else "",
+ interactive=True,
+ allow_custom_value=True,
+ )
+
+ with gr.Accordion(i18n("Advanced Settings"), open=False):
+ with gr.Column():
+ clear_outputs_infer = gr.Button(
+ i18n("Clear Outputs (Deletes all audios in assets/audios)")
+ )
+ output_path = gr.Textbox(
+ label=i18n("Output Path"),
+ placeholder=i18n("Enter output path"),
+ info=i18n(
+ "The path where the output audio will be saved, by default in assets/audios/output.wav"
+ ),
+ value=(
+ output_path_fn(audio_paths[0])
+ if audio_paths
+ else os.path.join(now_dir, "assets", "audios", "output.wav")
+ ),
+ interactive=True,
+ )
+ export_format = gr.Radio(
+ label=i18n("Export Format"),
+ info=i18n("Select the format to export the audio."),
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+ value="WAV",
+ interactive=True,
+ )
+ sid = gr.Dropdown(
+ label=i18n("Speaker ID"),
+ info=i18n("Select the speaker ID to use for the conversion."),
+ choices=get_speakers_id(model_file.value),
+ value=0,
+ interactive=True,
+ )
+ split_audio = gr.Checkbox(
+ label=i18n("Split Audio"),
+ info=i18n(
+ "Split the audio into chunks for inference to obtain better results in some cases."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ autotune = gr.Checkbox(
+ label=i18n("Autotune"),
+ info=i18n(
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ autotune_strength = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Autotune Strength"),
+ info=i18n(
+ "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
+ ),
+ visible=False,
+ value=1,
+ interactive=True,
+ )
+ clean_audio = gr.Checkbox(
+ label=i18n("Clean Audio"),
+ info=i18n(
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ clean_strength = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Clean Strength"),
+ info=i18n(
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
+ ),
+ visible=False,
+ value=0.5,
+ interactive=True,
+ )
+ formant_shifting = gr.Checkbox(
+ label=i18n("Formant Shifting"),
+                    info=i18n(
+                        "Enable formant shifting. Used for male to female and vice-versa conversions."
+                    ),
+ value=False,
+ visible=True,
+ interactive=True,
+ )
+ post_process = gr.Checkbox(
+ label=i18n("Post-Process"),
+ info=i18n("Post-process the audio to apply effects to the output."),
+ value=False,
+ interactive=True,
+ )
+ with gr.Row(visible=False) as formant_row:
+ formant_preset = gr.Dropdown(
+ label=i18n("Browse presets for formanting"),
+ info=i18n(
+ "Presets are located in /assets/formant_shift folder"
+ ),
+ choices=list_json_files(FORMANTSHIFT_DIR),
+ visible=False,
+ interactive=True,
+ )
+ formant_refresh_button = gr.Button(
+ value="Refresh",
+ visible=False,
+ )
+ formant_qfrency = gr.Slider(
+ value=1.0,
+ info=i18n("Default value is 1.0"),
+ label=i18n("Quefrency for formant shifting"),
+ minimum=0.0,
+ maximum=16.0,
+ step=0.1,
+ visible=False,
+ interactive=True,
+ )
+ formant_timbre = gr.Slider(
+ value=1.0,
+ info=i18n("Default value is 1.0"),
+ label=i18n("Timbre for formant shifting"),
+ minimum=0.0,
+ maximum=16.0,
+ step=0.1,
+ visible=False,
+ interactive=True,
+ )
+ reverb = gr.Checkbox(
+ label=i18n("Reverb"),
+ info=i18n("Apply reverb to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ reverb_room_size = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Room Size"),
+ info=i18n("Set the room size of the reverb."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ reverb_damping = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Damping"),
+ info=i18n("Set the damping of the reverb."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ reverb_wet_gain = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Wet Gain"),
+ info=i18n("Set the wet gain of the reverb."),
+ value=0.33,
+ interactive=True,
+ visible=False,
+ )
+ reverb_dry_gain = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Dry Gain"),
+ info=i18n("Set the dry gain of the reverb."),
+ value=0.4,
+ interactive=True,
+ visible=False,
+ )
+ reverb_width = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Width"),
+ info=i18n("Set the width of the reverb."),
+ value=1.0,
+ interactive=True,
+ visible=False,
+ )
+ reverb_freeze_mode = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Freeze Mode"),
+ info=i18n("Set the freeze mode of the reverb."),
+ value=0.0,
+ interactive=True,
+ visible=False,
+ )
+ pitch_shift = gr.Checkbox(
+ label=i18n("Pitch Shift"),
+ info=i18n("Apply pitch shift to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ pitch_shift_semitones = gr.Slider(
+ minimum=-12,
+ maximum=12,
+ label=i18n("Pitch Shift Semitones"),
+ info=i18n("Set the pitch shift semitones."),
+ value=0,
+ interactive=True,
+ visible=False,
+ )
+ limiter = gr.Checkbox(
+ label=i18n("Limiter"),
+ info=i18n("Apply limiter to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ limiter_threshold = gr.Slider(
+ minimum=-60,
+ maximum=0,
+ label=i18n("Limiter Threshold dB"),
+ info=i18n("Set the limiter threshold dB."),
+ value=-6,
+ interactive=True,
+ visible=False,
+ )
+ limiter_release_time = gr.Slider(
+ minimum=0.01,
+ maximum=1,
+ label=i18n("Limiter Release Time"),
+ info=i18n("Set the limiter release time."),
+ value=0.05,
+ interactive=True,
+ visible=False,
+ )
+ gain = gr.Checkbox(
+ label=i18n("Gain"),
+ info=i18n("Apply gain to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ gain_db = gr.Slider(
+ minimum=-60,
+ maximum=60,
+ label=i18n("Gain dB"),
+ info=i18n("Set the gain dB."),
+ value=0,
+ interactive=True,
+ visible=False,
+ )
+ distortion = gr.Checkbox(
+ label=i18n("Distortion"),
+ info=i18n("Apply distortion to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ distortion_gain = gr.Slider(
+ minimum=-60,
+ maximum=60,
+ label=i18n("Distortion Gain"),
+ info=i18n("Set the distortion gain."),
+ value=25,
+ interactive=True,
+ visible=False,
+ )
+ chorus = gr.Checkbox(
+                    label=i18n("Chorus"),
+ info=i18n("Apply chorus to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ chorus_rate = gr.Slider(
+ minimum=0,
+ maximum=100,
+ label=i18n("Chorus Rate Hz"),
+ info=i18n("Set the chorus rate Hz."),
+ value=1.0,
+ interactive=True,
+ visible=False,
+ )
+ chorus_depth = gr.Slider(
+ minimum=0,
+ maximum=1,
+                    label=i18n("Chorus Depth"),
+ info=i18n("Set the chorus depth."),
+ value=0.25,
+ interactive=True,
+ visible=False,
+ )
+ chorus_center_delay = gr.Slider(
+ minimum=7,
+ maximum=8,
+                    label=i18n("Chorus Center Delay ms"),
+ info=i18n("Set the chorus center delay ms."),
+ value=7,
+ interactive=True,
+ visible=False,
+ )
+ chorus_feedback = gr.Slider(
+ minimum=0,
+ maximum=1,
+                    label=i18n("Chorus Feedback"),
+ info=i18n("Set the chorus feedback."),
+ value=0.0,
+ interactive=True,
+ visible=False,
+ )
+ chorus_mix = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Chorus Mix"),
+ info=i18n("Set the chorus mix."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ bitcrush = gr.Checkbox(
+ label=i18n("Bitcrush"),
+ info=i18n("Apply bitcrush to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ bitcrush_bit_depth = gr.Slider(
+ minimum=1,
+ maximum=32,
+ label=i18n("Bitcrush Bit Depth"),
+ info=i18n("Set the bitcrush bit depth."),
+ value=8,
+ interactive=True,
+ visible=False,
+ )
+ clipping = gr.Checkbox(
+ label=i18n("Clipping"),
+ info=i18n("Apply clipping to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ clipping_threshold = gr.Slider(
+ minimum=-60,
+ maximum=0,
+ label=i18n("Clipping Threshold"),
+ info=i18n("Set the clipping threshold."),
+ value=-6,
+ interactive=True,
+ visible=False,
+ )
+ compressor = gr.Checkbox(
+ label=i18n("Compressor"),
+ info=i18n("Apply compressor to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ compressor_threshold = gr.Slider(
+ minimum=-60,
+ maximum=0,
+ label=i18n("Compressor Threshold dB"),
+ info=i18n("Set the compressor threshold dB."),
+ value=0,
+ interactive=True,
+ visible=False,
+ )
+ compressor_ratio = gr.Slider(
+ minimum=1,
+ maximum=20,
+ label=i18n("Compressor Ratio"),
+ info=i18n("Set the compressor ratio."),
+ value=1,
+ interactive=True,
+ visible=False,
+ )
+ compressor_attack = gr.Slider(
+ minimum=0.0,
+ maximum=100,
+ label=i18n("Compressor Attack ms"),
+ info=i18n("Set the compressor attack ms."),
+ value=1.0,
+ interactive=True,
+ visible=False,
+ )
+ compressor_release = gr.Slider(
+ minimum=0.01,
+ maximum=100,
+ label=i18n("Compressor Release ms"),
+ info=i18n("Set the compressor release ms."),
+ value=100,
+ interactive=True,
+ visible=False,
+ )
+ delay = gr.Checkbox(
+ label=i18n("Delay"),
+ info=i18n("Apply delay to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ delay_seconds = gr.Slider(
+ minimum=0.0,
+ maximum=5.0,
+ label=i18n("Delay Seconds"),
+ info=i18n("Set the delay seconds."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ delay_feedback = gr.Slider(
+ minimum=0.0,
+ maximum=1.0,
+ label=i18n("Delay Feedback"),
+ info=i18n("Set the delay feedback."),
+ value=0.0,
+ interactive=True,
+ visible=False,
+ )
+ delay_mix = gr.Slider(
+ minimum=0.0,
+ maximum=1.0,
+ label=i18n("Delay Mix"),
+ info=i18n("Set the delay mix."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ with gr.Accordion(i18n("Preset Settings"), open=False):
+ with gr.Row():
+ preset_dropdown = gr.Dropdown(
+ label=i18n("Select Custom Preset"),
+ choices=list_json_files(PRESETS_DIR),
+ interactive=True,
+ )
+ presets_refresh_button = gr.Button(i18n("Refresh Presets"))
+ import_file = gr.File(
+ label=i18n("Select file to import"),
+ file_count="single",
+ type="filepath",
+ interactive=True,
+ )
+ import_file.change(
+ import_presets_button,
+ inputs=import_file,
+ outputs=[preset_dropdown],
+ )
+ presets_refresh_button.click(
+ refresh_presets, outputs=preset_dropdown
+ )
+ with gr.Row():
+ preset_name_input = gr.Textbox(
+ label=i18n("Preset Name"),
+ placeholder=i18n("Enter preset name"),
+ )
+ export_button = gr.Button(i18n("Export Preset"))
+ pitch = gr.Slider(
+ minimum=-24,
+ maximum=24,
+ step=1,
+ label=i18n("Pitch"),
+ info=i18n(
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
+ ),
+ value=0,
+ interactive=True,
+ )
+ filter_radius = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n("Filter Radius"),
+                    info=i18n(
+                        "If the value is three or higher, applying median filtering to the extracted pitch can reduce breathiness in the output."
+                    ),
+ value=3,
+ step=1,
+ interactive=True,
+ )
+ index_rate = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Search Feature Ratio"),
+ info=i18n(
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
+ ),
+ value=0.75,
+ interactive=True,
+ )
+ rms_mix_rate = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Volume Envelope"),
+ info=i18n(
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
+ ),
+ value=1,
+ interactive=True,
+ )
+ protect = gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label=i18n("Protect Voiceless Consonants"),
+ info=i18n(
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
+ ),
+ value=0.5,
+ interactive=True,
+ )
+ preset_dropdown.change(
+ update_sliders,
+ inputs=preset_dropdown,
+ outputs=[
+ pitch,
+ filter_radius,
+ index_rate,
+ rms_mix_rate,
+ protect,
+ ],
+ )
+ export_button.click(
+ export_presets_button,
+ inputs=[
+ preset_name_input,
+ pitch,
+ filter_radius,
+ index_rate,
+ rms_mix_rate,
+ protect,
+ ],
+ )
+ hop_length = gr.Slider(
+ minimum=1,
+ maximum=512,
+ step=1,
+ label=i18n("Hop Length"),
+ info=i18n(
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
+ ),
+ visible=False,
+ value=128,
+ interactive=True,
+ )
+ f0_method = gr.Radio(
+ label=i18n("Pitch extraction algorithm"),
+ info=i18n(
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
+ ),
+ choices=[
+ "crepe",
+ "crepe-tiny",
+ "rmvpe",
+ "fcpe",
+ "hybrid[rmvpe+fcpe]",
+ ],
+ value="rmvpe",
+ interactive=True,
+ )
+ embedder_model = gr.Radio(
+ label=i18n("Embedder Model"),
+ info=i18n("Model used for learning speaker embedding."),
+ choices=[
+ "contentvec",
+ "chinese-hubert-base",
+ "japanese-hubert-base",
+ "korean-hubert-base",
+ "custom",
+ ],
+ value="contentvec",
+ interactive=True,
+ )
+ with gr.Column(visible=False) as embedder_custom:
+ with gr.Accordion(i18n("Custom Embedder"), open=True):
+ with gr.Row():
+ embedder_model_custom = gr.Dropdown(
+ label=i18n("Select Custom Embedder"),
+ choices=refresh_embedders_folders(),
+ interactive=True,
+ allow_custom_value=True,
+ )
+ refresh_embedders_button = gr.Button(
+ i18n("Refresh embedders")
+ )
+ folder_name_input = gr.Textbox(
+ label=i18n("Folder Name"), interactive=True
+ )
+ with gr.Row():
+ bin_file_upload = gr.File(
+ label=i18n("Upload .bin"),
+ type="filepath",
+ interactive=True,
+ )
+ config_file_upload = gr.File(
+ label=i18n("Upload .json"),
+ type="filepath",
+ interactive=True,
+ )
+ move_files_button = gr.Button(
+ i18n("Move files to custom embedder folder")
+ )
+
+ f0_file = gr.File(
+ label=i18n(
+ "The f0 curve represents the variations in the base frequency of a voice over time, showing how pitch rises and falls."
+ ),
+ visible=True,
+ )
+
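+            # Terms-of-use gate: inference only runs after the checkbox below is ticked.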
+ def enforce_terms(terms_accepted, *args):
+ if not terms_accepted:
+ message = "You must agree to the Terms of Use to proceed."
+ gr.Info(message)
+ return message, None
+ return run_infer_script(*args)
+
+ def enforce_terms_batch(terms_accepted, *args):
+ if not terms_accepted:
+ message = "You must agree to the Terms of Use to proceed."
+ gr.Info(message)
+ return message, None
+ return run_batch_infer_script(*args)
+
+ terms_checkbox = gr.Checkbox(
+ label=i18n("I agree to the terms of use"),
+ info=i18n(
+ "Please ensure compliance with the terms and conditions detailed in [this document](https://github.com/IAHispano/Applio/blob/main/TERMS_OF_USE.md) before proceeding with your inference."
+ ),
+ value=False,
+ interactive=True,
+ )
+
+ convert_button1 = gr.Button(i18n("Convert"))
+
+ with gr.Row():
+ vc_output1 = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ )
+ vc_output2 = gr.Audio(label=i18n("Export Audio"))
+
+ # Batch inference tab
+ with gr.Tab(i18n("Batch")):
+ with gr.Row():
+ with gr.Column():
+ input_folder_batch = gr.Textbox(
+ label=i18n("Input Folder"),
+ info=i18n("Select the folder containing the audios to convert."),
+ placeholder=i18n("Enter input path"),
+ value=os.path.join(now_dir, "assets", "audios"),
+ interactive=True,
+ )
+ output_folder_batch = gr.Textbox(
+ label=i18n("Output Folder"),
+ info=i18n(
+ "Select the folder where the output audios will be saved."
+ ),
+ placeholder=i18n("Enter output path"),
+ value=os.path.join(now_dir, "assets", "audios"),
+ interactive=True,
+ )
+ with gr.Accordion(i18n("Advanced Settings"), open=False):
+ with gr.Column():
+ clear_outputs_batch = gr.Button(
+ i18n("Clear Outputs (Deletes all audios in assets/audios)")
+ )
+ export_format_batch = gr.Radio(
+ label=i18n("Export Format"),
+ info=i18n("Select the format to export the audio."),
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+ value="WAV",
+ interactive=True,
+ )
+ sid_batch = gr.Dropdown(
+ label=i18n("Speaker ID"),
+ info=i18n("Select the speaker ID to use for the conversion."),
+ choices=get_speakers_id(model_file.value),
+ value=0,
+ interactive=True,
+ )
+ split_audio_batch = gr.Checkbox(
+ label=i18n("Split Audio"),
+ info=i18n(
+ "Split the audio into chunks for inference to obtain better results in some cases."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ autotune_batch = gr.Checkbox(
+ label=i18n("Autotune"),
+ info=i18n(
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ autotune_strength_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Autotune Strength"),
+ info=i18n(
+ "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
+ ),
+ visible=False,
+ value=1,
+ interactive=True,
+ )
+ clean_audio_batch = gr.Checkbox(
+ label=i18n("Clean Audio"),
+ info=i18n(
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ clean_strength_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Clean Strength"),
+ info=i18n(
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
+ ),
+ visible=False,
+ value=0.5,
+ interactive=True,
+ )
+ formant_shifting_batch = gr.Checkbox(
+ label=i18n("Formant Shifting"),
+                        info=i18n(
+                            "Enable formant shifting. Used for male to female and vice-versa conversions."
+                        ),
+ value=False,
+ visible=True,
+ interactive=True,
+ )
+ post_process_batch = gr.Checkbox(
+ label=i18n("Post-Process"),
+ info=i18n("Post-process the audio to apply effects to the output."),
+ value=False,
+ interactive=True,
+ )
+ with gr.Row(visible=False) as formant_row_batch:
+ formant_preset_batch = gr.Dropdown(
+ label=i18n("Browse presets for formanting"),
+ info=i18n(
+ "Presets are located in /assets/formant_shift folder"
+ ),
+ choices=list_json_files(FORMANTSHIFT_DIR),
+ visible=False,
+ interactive=True,
+ )
+ formant_refresh_button_batch = gr.Button(
+ value="Refresh",
+ visible=False,
+ )
+ formant_qfrency_batch = gr.Slider(
+ value=1.0,
+ info=i18n("Default value is 1.0"),
+ label=i18n("Quefrency for formant shifting"),
+ minimum=0.0,
+ maximum=16.0,
+ step=0.1,
+ visible=False,
+ interactive=True,
+ )
+ formant_timbre_batch = gr.Slider(
+ value=1.0,
+ info=i18n("Default value is 1.0"),
+ label=i18n("Timbre for formant shifting"),
+ minimum=0.0,
+ maximum=16.0,
+ step=0.1,
+ visible=False,
+ interactive=True,
+ )
+ reverb_batch = gr.Checkbox(
+ label=i18n("Reverb"),
+ info=i18n("Apply reverb to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ reverb_room_size_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Room Size"),
+ info=i18n("Set the room size of the reverb."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ reverb_damping_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Damping"),
+ info=i18n("Set the damping of the reverb."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ reverb_wet_gain_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Wet Gain"),
+ info=i18n("Set the wet gain of the reverb."),
+ value=0.33,
+ interactive=True,
+ visible=False,
+ )
+ reverb_dry_gain_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Dry Gain"),
+ info=i18n("Set the dry gain of the reverb."),
+ value=0.4,
+ interactive=True,
+ visible=False,
+ )
+ reverb_width_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Width"),
+ info=i18n("Set the width of the reverb."),
+ value=1.0,
+ interactive=True,
+ visible=False,
+ )
+ reverb_freeze_mode_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Reverb Freeze Mode"),
+ info=i18n("Set the freeze mode of the reverb."),
+ value=0.0,
+ interactive=True,
+ visible=False,
+ )
+ pitch_shift_batch = gr.Checkbox(
+ label=i18n("Pitch Shift"),
+ info=i18n("Apply pitch shift to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ pitch_shift_semitones_batch = gr.Slider(
+ minimum=-12,
+ maximum=12,
+ label=i18n("Pitch Shift Semitones"),
+ info=i18n("Set the pitch shift semitones."),
+ value=0,
+ interactive=True,
+ visible=False,
+ )
+ limiter_batch = gr.Checkbox(
+ label=i18n("Limiter"),
+ info=i18n("Apply limiter to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ limiter_threshold_batch = gr.Slider(
+ minimum=-60,
+ maximum=0,
+ label=i18n("Limiter Threshold dB"),
+ info=i18n("Set the limiter threshold dB."),
+ value=-6,
+ interactive=True,
+ visible=False,
+ )
+ limiter_release_time_batch = gr.Slider(
+ minimum=0.01,
+ maximum=1,
+ label=i18n("Limiter Release Time"),
+ info=i18n("Set the limiter release time."),
+ value=0.05,
+ interactive=True,
+ visible=False,
+ )
+ gain_batch = gr.Checkbox(
+ label=i18n("Gain"),
+ info=i18n("Apply gain to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ gain_db_batch = gr.Slider(
+ minimum=-60,
+ maximum=60,
+ label=i18n("Gain dB"),
+ info=i18n("Set the gain dB."),
+ value=0,
+ interactive=True,
+ visible=False,
+ )
+ distortion_batch = gr.Checkbox(
+ label=i18n("Distortion"),
+ info=i18n("Apply distortion to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ distortion_gain_batch = gr.Slider(
+ minimum=-60,
+ maximum=60,
+ label=i18n("Distortion Gain"),
+ info=i18n("Set the distortion gain."),
+ value=25,
+ interactive=True,
+ visible=False,
+ )
+ chorus_batch = gr.Checkbox(
+                        label=i18n("Chorus"),
+ info=i18n("Apply chorus to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ chorus_rate_batch = gr.Slider(
+ minimum=0,
+ maximum=100,
+ label=i18n("Chorus Rate Hz"),
+ info=i18n("Set the chorus rate Hz."),
+ value=1.0,
+ interactive=True,
+ visible=False,
+ )
+ chorus_depth_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+                        label=i18n("Chorus Depth"),
+ info=i18n("Set the chorus depth."),
+ value=0.25,
+ interactive=True,
+ visible=False,
+ )
+ chorus_center_delay_batch = gr.Slider(
+ minimum=7,
+ maximum=8,
+                        label=i18n("Chorus Center Delay ms"),
+ info=i18n("Set the chorus center delay ms."),
+ value=7,
+ interactive=True,
+ visible=False,
+ )
+ chorus_feedback_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+                        label=i18n("Chorus Feedback"),
+ info=i18n("Set the chorus feedback."),
+ value=0.0,
+ interactive=True,
+ visible=False,
+ )
+ chorus_mix_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Chorus Mix"),
+ info=i18n("Set the chorus mix."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ bitcrush_batch = gr.Checkbox(
+ label=i18n("Bitcrush"),
+ info=i18n("Apply bitcrush to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ bitcrush_bit_depth_batch = gr.Slider(
+ minimum=1,
+ maximum=32,
+ label=i18n("Bitcrush Bit Depth"),
+ info=i18n("Set the bitcrush bit depth."),
+ value=8,
+ interactive=True,
+ visible=False,
+ )
+ clipping_batch = gr.Checkbox(
+ label=i18n("Clipping"),
+ info=i18n("Apply clipping to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ clipping_threshold_batch = gr.Slider(
+ minimum=-60,
+ maximum=0,
+ label=i18n("Clipping Threshold"),
+ info=i18n("Set the clipping threshold."),
+ value=-6,
+ interactive=True,
+ visible=False,
+ )
+ compressor_batch = gr.Checkbox(
+ label=i18n("Compressor"),
+ info=i18n("Apply compressor to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ compressor_threshold_batch = gr.Slider(
+ minimum=-60,
+ maximum=0,
+ label=i18n("Compressor Threshold dB"),
+ info=i18n("Set the compressor threshold dB."),
+ value=0,
+ interactive=True,
+ visible=False,
+ )
+ compressor_ratio_batch = gr.Slider(
+ minimum=1,
+ maximum=20,
+ label=i18n("Compressor Ratio"),
+ info=i18n("Set the compressor ratio."),
+ value=1,
+ interactive=True,
+ visible=False,
+ )
+ compressor_attack_batch = gr.Slider(
+ minimum=0.0,
+ maximum=100,
+ label=i18n("Compressor Attack ms"),
+ info=i18n("Set the compressor attack ms."),
+ value=1.0,
+ interactive=True,
+ visible=False,
+ )
+ compressor_release_batch = gr.Slider(
+ minimum=0.01,
+ maximum=100,
+ label=i18n("Compressor Release ms"),
+ info=i18n("Set the compressor release ms."),
+ value=100,
+ interactive=True,
+ visible=False,
+ )
+ delay_batch = gr.Checkbox(
+ label=i18n("Delay"),
+ info=i18n("Apply delay to the audio."),
+ value=False,
+ interactive=True,
+ visible=False,
+ )
+ delay_seconds_batch = gr.Slider(
+ minimum=0.0,
+ maximum=5.0,
+ label=i18n("Delay Seconds"),
+ info=i18n("Set the delay seconds."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ delay_feedback_batch = gr.Slider(
+ minimum=0.0,
+ maximum=1.0,
+ label=i18n("Delay Feedback"),
+ info=i18n("Set the delay feedback."),
+ value=0.0,
+ interactive=True,
+ visible=False,
+ )
+ delay_mix_batch = gr.Slider(
+ minimum=0.0,
+ maximum=1.0,
+ label=i18n("Delay Mix"),
+ info=i18n("Set the delay mix."),
+ value=0.5,
+ interactive=True,
+ visible=False,
+ )
+ with gr.Accordion(i18n("Preset Settings"), open=False):
+ with gr.Row():
+ preset_dropdown = gr.Dropdown(
+                            label=i18n("Select Custom Preset"),
+                            choices=list_json_files(PRESETS_DIR),
+ interactive=True,
+ )
+ presets_batch_refresh_button = gr.Button(
+ i18n("Refresh Presets")
+ )
+ import_file = gr.File(
+ label=i18n("Select file to import"),
+ file_count="single",
+ type="filepath",
+ interactive=True,
+ )
+ import_file.change(
+ import_presets_button,
+ inputs=import_file,
+ outputs=[preset_dropdown],
+ )
+ presets_batch_refresh_button.click(
+ refresh_presets, outputs=preset_dropdown
+ )
+ with gr.Row():
+ preset_name_input = gr.Textbox(
+ label=i18n("Preset Name"),
+ placeholder=i18n("Enter preset name"),
+ )
+ export_button = gr.Button(i18n("Export Preset"))
+ pitch_batch = gr.Slider(
+ minimum=-24,
+ maximum=24,
+ step=1,
+ label=i18n("Pitch"),
+ info=i18n(
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
+ ),
+ value=0,
+ interactive=True,
+ )
+ filter_radius_batch = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n("Filter Radius"),
+                        info=i18n(
+                            "If the value is three or higher, applying median filtering to the extracted pitch can reduce breathiness in the output."
+                        ),
+ value=3,
+ step=1,
+ interactive=True,
+ )
+ index_rate_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Search Feature Ratio"),
+ info=i18n(
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
+ ),
+ value=0.75,
+ interactive=True,
+ )
+ rms_mix_rate_batch = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Volume Envelope"),
+ info=i18n(
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
+ ),
+ value=1,
+ interactive=True,
+ )
+ protect_batch = gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label=i18n("Protect Voiceless Consonants"),
+ info=i18n(
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
+ ),
+ value=0.5,
+ interactive=True,
+ )
+ preset_dropdown.change(
+ update_sliders,
+ inputs=preset_dropdown,
+ outputs=[
+ pitch_batch,
+ filter_radius_batch,
+ index_rate_batch,
+ rms_mix_rate_batch,
+ protect_batch,
+ ],
+ )
+ export_button.click(
+ export_presets_button,
+ inputs=[
+ preset_name_input,
+                            pitch_batch,
+                            filter_radius_batch,
+                            index_rate_batch,
+                            rms_mix_rate_batch,
+                            protect_batch,
+ ],
+ outputs=[],
+ )
+ hop_length_batch = gr.Slider(
+ minimum=1,
+ maximum=512,
+ step=1,
+ label=i18n("Hop Length"),
+ info=i18n(
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
+ ),
+ visible=False,
+ value=128,
+ interactive=True,
+ )
+ f0_method_batch = gr.Radio(
+ label=i18n("Pitch extraction algorithm"),
+ info=i18n(
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
+ ),
+ choices=[
+ "crepe",
+ "crepe-tiny",
+ "rmvpe",
+ "fcpe",
+ "hybrid[rmvpe+fcpe]",
+ ],
+ value="rmvpe",
+ interactive=True,
+ )
+ embedder_model_batch = gr.Radio(
+ label=i18n("Embedder Model"),
+ info=i18n("Model used for learning speaker embedding."),
+ choices=[
+ "contentvec",
+ "chinese-hubert-base",
+ "japanese-hubert-base",
+ "korean-hubert-base",
+ "custom",
+ ],
+ value="contentvec",
+ interactive=True,
+ )
+ f0_file_batch = gr.File(
+ label=i18n(
+ "The f0 curve represents the variations in the base frequency of a voice over time, showing how pitch rises and falls."
+ ),
+ visible=True,
+ )
+ with gr.Column(visible=False) as embedder_custom_batch:
+ with gr.Accordion(i18n("Custom Embedder"), open=True):
+ with gr.Row():
+ embedder_model_custom_batch = gr.Dropdown(
+ label=i18n("Select Custom Embedder"),
+ choices=refresh_embedders_folders(),
+ interactive=True,
+ allow_custom_value=True,
+ )
+ refresh_embedders_button_batch = gr.Button(
+ i18n("Refresh embedders")
+ )
+ folder_name_input_batch = gr.Textbox(
+ label=i18n("Folder Name"), interactive=True
+ )
+ with gr.Row():
+ bin_file_upload_batch = gr.File(
+ label=i18n("Upload .bin"),
+ type="filepath",
+ interactive=True,
+ )
+ config_file_upload_batch = gr.File(
+ label=i18n("Upload .json"),
+ type="filepath",
+ interactive=True,
+ )
+ move_files_button_batch = gr.Button(
+ i18n("Move files to custom embedder folder")
+ )
+
+ terms_checkbox_batch = gr.Checkbox(
+ label=i18n("I agree to the terms of use"),
+ info=i18n(
+ "Please ensure compliance with the terms and conditions detailed in [this document](https://github.com/IAHispano/Applio/blob/main/TERMS_OF_USE.md) before proceeding with your inference."
+ ),
+ value=False,
+ interactive=True,
+ )
+ convert_button_batch = gr.Button(i18n("Convert"))
+ stop_button = gr.Button(i18n("Stop convert"), visible=False)
+ stop_button.click(fn=stop_infer, inputs=[], outputs=[])
+
+ with gr.Row():
+ vc_output3 = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ )
+
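+    # Visibility callbacks: these show or hide the advanced controls that belong to
+    # each checkbox or pitch-extraction choice above.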
+ def toggle_visible(checkbox):
+ return {"visible": checkbox, "__type__": "update"}
+
+ def toggle_visible_hop_length(f0_method):
+        if f0_method in ("crepe", "crepe-tiny"):
+ return {"visible": True, "__type__": "update"}
+ return {"visible": False, "__type__": "update"}
+
+ def toggle_visible_embedder_custom(embedder_model):
+ if embedder_model == "custom":
+ return {"visible": True, "__type__": "update"}
+ return {"visible": False, "__type__": "update"}
+
+ def enable_stop_convert_button():
+ return {"visible": False, "__type__": "update"}, {
+ "visible": True,
+ "__type__": "update",
+ }
+
+ def disable_stop_convert_button():
+ return {"visible": True, "__type__": "update"}, {
+ "visible": False,
+ "__type__": "update",
+ }
+
+ def toggle_visible_formant_shifting(checkbox):
+ if checkbox:
+ return (
+ gr.update(visible=True),
+ gr.update(visible=True),
+ gr.update(visible=True),
+ gr.update(visible=True),
+ gr.update(visible=True),
+ )
+ else:
+ return (
+ gr.update(visible=False),
+ gr.update(visible=False),
+ gr.update(visible=False),
+ gr.update(visible=False),
+ gr.update(visible=False),
+ )
+
+ def update_visibility(checkbox, count):
+ return [gr.update(visible=checkbox) for _ in range(count)]
+
+ def post_process_visible(checkbox):
+ return update_visibility(checkbox, 10)
+
+ def reverb_visible(checkbox):
+ return update_visibility(checkbox, 6)
+
+ def limiter_visible(checkbox):
+ return update_visibility(checkbox, 2)
+
+ def chorus_visible(checkbox):
+ return update_visibility(checkbox, 6)
+
+ def bitcrush_visible(checkbox):
+ return update_visibility(checkbox, 1)
+
+ def compress_visible(checkbox):
+ return update_visibility(checkbox, 4)
+
+ def delay_visible(checkbox):
+ return update_visibility(checkbox, 3)
+
+ autotune.change(
+ fn=toggle_visible,
+ inputs=[autotune],
+ outputs=[autotune_strength],
+ )
+ clean_audio.change(
+ fn=toggle_visible,
+ inputs=[clean_audio],
+ outputs=[clean_strength],
+ )
+ formant_shifting.change(
+ fn=toggle_visible_formant_shifting,
+ inputs=[formant_shifting],
+ outputs=[
+ formant_row,
+ formant_preset,
+ formant_refresh_button,
+ formant_qfrency,
+ formant_timbre,
+ ],
+ )
+ formant_shifting_batch.change(
+ fn=toggle_visible_formant_shifting,
+            inputs=[formant_shifting_batch],
+ outputs=[
+ formant_row_batch,
+ formant_preset_batch,
+ formant_refresh_button_batch,
+ formant_qfrency_batch,
+ formant_timbre_batch,
+ ],
+ )
+ formant_refresh_button.click(
+ fn=refresh_formant,
+ inputs=[],
+ outputs=[formant_preset],
+ )
+ formant_preset.change(
+ fn=update_sliders_formant,
+ inputs=[formant_preset],
+ outputs=[
+ formant_qfrency,
+ formant_timbre,
+ ],
+ )
+ formant_preset_batch.change(
+ fn=update_sliders_formant,
+ inputs=[formant_preset_batch],
+ outputs=[
+                formant_qfrency_batch,
+                formant_timbre_batch,
+ ],
+ )
+ post_process.change(
+ fn=post_process_visible,
+ inputs=[post_process],
+ outputs=[
+ reverb,
+ pitch_shift,
+ limiter,
+ gain,
+ distortion,
+ chorus,
+ bitcrush,
+ clipping,
+ compressor,
+ delay,
+ ],
+ )
+ reverb.change(
+ fn=reverb_visible,
+ inputs=[reverb],
+ outputs=[
+ reverb_room_size,
+ reverb_damping,
+ reverb_wet_gain,
+ reverb_dry_gain,
+ reverb_width,
+ reverb_freeze_mode,
+ ],
+ )
+ pitch_shift.change(
+ fn=toggle_visible,
+ inputs=[pitch_shift],
+ outputs=[pitch_shift_semitones],
+ )
+ limiter.change(
+ fn=limiter_visible,
+ inputs=[limiter],
+ outputs=[limiter_threshold, limiter_release_time],
+ )
+ gain.change(
+ fn=toggle_visible,
+ inputs=[gain],
+ outputs=[gain_db],
+ )
+ distortion.change(
+ fn=toggle_visible,
+ inputs=[distortion],
+ outputs=[distortion_gain],
+ )
+ chorus.change(
+ fn=chorus_visible,
+ inputs=[chorus],
+ outputs=[
+ chorus_rate,
+ chorus_depth,
+ chorus_center_delay,
+ chorus_feedback,
+ chorus_mix,
+ ],
+ )
+ bitcrush.change(
+ fn=bitcrush_visible,
+ inputs=[bitcrush],
+ outputs=[bitcrush_bit_depth],
+ )
+ clipping.change(
+ fn=toggle_visible,
+ inputs=[clipping],
+ outputs=[clipping_threshold],
+ )
+ compressor.change(
+ fn=compress_visible,
+ inputs=[compressor],
+ outputs=[
+ compressor_threshold,
+ compressor_ratio,
+ compressor_attack,
+ compressor_release,
+ ],
+ )
+ delay.change(
+ fn=delay_visible,
+ inputs=[delay],
+ outputs=[delay_seconds, delay_feedback, delay_mix],
+ )
+ post_process_batch.change(
+ fn=post_process_visible,
+ inputs=[post_process_batch],
+ outputs=[
+ reverb_batch,
+ pitch_shift_batch,
+ limiter_batch,
+ gain_batch,
+ distortion_batch,
+ chorus_batch,
+ bitcrush_batch,
+ clipping_batch,
+ compressor_batch,
+ delay_batch,
+ ],
+ )
+ reverb_batch.change(
+ fn=reverb_visible,
+ inputs=[reverb_batch],
+ outputs=[
+ reverb_room_size_batch,
+ reverb_damping_batch,
+ reverb_wet_gain_batch,
+ reverb_dry_gain_batch,
+ reverb_width_batch,
+ reverb_freeze_mode_batch,
+ ],
+ )
+ pitch_shift_batch.change(
+ fn=toggle_visible,
+ inputs=[pitch_shift_batch],
+ outputs=[pitch_shift_semitones_batch],
+ )
+ limiter_batch.change(
+ fn=limiter_visible,
+ inputs=[limiter_batch],
+ outputs=[limiter_threshold_batch, limiter_release_time_batch],
+ )
+ gain_batch.change(
+ fn=toggle_visible,
+ inputs=[gain_batch],
+ outputs=[gain_db_batch],
+ )
+ distortion_batch.change(
+ fn=toggle_visible,
+ inputs=[distortion_batch],
+ outputs=[distortion_gain_batch],
+ )
+ chorus_batch.change(
+ fn=chorus_visible,
+ inputs=[chorus_batch],
+ outputs=[
+ chorus_rate_batch,
+ chorus_depth_batch,
+ chorus_center_delay_batch,
+ chorus_feedback_batch,
+ chorus_mix_batch,
+ ],
+ )
+ bitcrush_batch.change(
+ fn=bitcrush_visible,
+ inputs=[bitcrush_batch],
+ outputs=[bitcrush_bit_depth_batch],
+ )
+ clipping_batch.change(
+ fn=toggle_visible,
+ inputs=[clipping_batch],
+ outputs=[clipping_threshold_batch],
+ )
+ compressor_batch.change(
+ fn=compress_visible,
+ inputs=[compressor_batch],
+ outputs=[
+ compressor_threshold_batch,
+ compressor_ratio_batch,
+ compressor_attack_batch,
+ compressor_release_batch,
+ ],
+ )
+ delay_batch.change(
+ fn=delay_visible,
+ inputs=[delay_batch],
+ outputs=[delay_seconds_batch, delay_feedback_batch, delay_mix_batch],
+ )
+ autotune_batch.change(
+ fn=toggle_visible,
+ inputs=[autotune_batch],
+ outputs=[autotune_strength_batch],
+ )
+ clean_audio_batch.change(
+ fn=toggle_visible,
+ inputs=[clean_audio_batch],
+ outputs=[clean_strength_batch],
+ )
+ f0_method.change(
+ fn=toggle_visible_hop_length,
+ inputs=[f0_method],
+ outputs=[hop_length],
+ )
+ f0_method_batch.change(
+ fn=toggle_visible_hop_length,
+ inputs=[f0_method_batch],
+ outputs=[hop_length_batch],
+ )
+ refresh_button.click(
+ fn=change_choices,
+ inputs=[model_file],
+ outputs=[model_file, index_file, audio, sid, sid_batch],
+ )
+ audio.change(
+ fn=output_path_fn,
+ inputs=[audio],
+ outputs=[output_path],
+ )
+ upload_audio.upload(
+ fn=save_to_wav2,
+ inputs=[upload_audio],
+ outputs=[audio, output_path],
+ )
+ upload_audio.stop_recording(
+ fn=save_to_wav,
+ inputs=[upload_audio],
+ outputs=[audio, output_path],
+ )
+ clear_outputs_infer.click(
+ fn=delete_outputs,
+ inputs=[],
+ outputs=[],
+ )
+ clear_outputs_batch.click(
+ fn=delete_outputs,
+ inputs=[],
+ outputs=[],
+ )
+ embedder_model.change(
+ fn=toggle_visible_embedder_custom,
+ inputs=[embedder_model],
+ outputs=[embedder_custom],
+ )
+ embedder_model_batch.change(
+ fn=toggle_visible_embedder_custom,
+ inputs=[embedder_model_batch],
+ outputs=[embedder_custom_batch],
+ )
+ move_files_button.click(
+ fn=create_folder_and_move_files,
+ inputs=[folder_name_input, bin_file_upload, config_file_upload],
+ outputs=[],
+ )
+ refresh_embedders_button.click(
+ fn=lambda: gr.update(choices=refresh_embedders_folders()),
+ inputs=[],
+ outputs=[embedder_model_custom],
+ )
+ move_files_button_batch.click(
+ fn=create_folder_and_move_files,
+ inputs=[
+ folder_name_input_batch,
+ bin_file_upload_batch,
+ config_file_upload_batch,
+ ],
+ outputs=[],
+ )
+ refresh_embedders_button_batch.click(
+ fn=lambda: gr.update(choices=refresh_embedders_folders()),
+ inputs=[],
+ outputs=[embedder_model_custom_batch],
+ )
+ convert_button1.click(
+ fn=enforce_terms,
+ inputs=[
+ terms_checkbox,
+ pitch,
+ filter_radius,
+ index_rate,
+ rms_mix_rate,
+ protect,
+ hop_length,
+ f0_method,
+ audio,
+ output_path,
+ model_file,
+ index_file,
+ split_audio,
+ autotune,
+ autotune_strength,
+ clean_audio,
+ clean_strength,
+ export_format,
+ f0_file,
+ embedder_model,
+ embedder_model_custom,
+ formant_shifting,
+ formant_qfrency,
+ formant_timbre,
+ post_process,
+ reverb,
+ pitch_shift,
+ limiter,
+ gain,
+ distortion,
+ chorus,
+ bitcrush,
+ clipping,
+ compressor,
+ delay,
+ reverb_room_size,
+ reverb_damping,
+ reverb_wet_gain,
+ reverb_dry_gain,
+ reverb_width,
+ reverb_freeze_mode,
+ pitch_shift_semitones,
+ limiter_threshold,
+ limiter_release_time,
+ gain_db,
+ distortion_gain,
+ chorus_rate,
+ chorus_depth,
+ chorus_center_delay,
+ chorus_feedback,
+ chorus_mix,
+ bitcrush_bit_depth,
+ clipping_threshold,
+ compressor_threshold,
+ compressor_ratio,
+ compressor_attack,
+ compressor_release,
+ delay_seconds,
+ delay_feedback,
+ delay_mix,
+ sid,
+ ],
+ outputs=[vc_output1, vc_output2],
+ )
+ convert_button_batch.click(
+ fn=enforce_terms_batch,
+ inputs=[
+ terms_checkbox_batch,
+ pitch_batch,
+ filter_radius_batch,
+ index_rate_batch,
+ rms_mix_rate_batch,
+ protect_batch,
+ hop_length_batch,
+ f0_method_batch,
+ input_folder_batch,
+ output_folder_batch,
+ model_file,
+ index_file,
+ split_audio_batch,
+ autotune_batch,
+ autotune_strength_batch,
+ clean_audio_batch,
+ clean_strength_batch,
+ export_format_batch,
+ f0_file_batch,
+ embedder_model_batch,
+ embedder_model_custom_batch,
+ formant_shifting_batch,
+ formant_qfrency_batch,
+ formant_timbre_batch,
+ post_process_batch,
+ reverb_batch,
+ pitch_shift_batch,
+ limiter_batch,
+ gain_batch,
+ distortion_batch,
+ chorus_batch,
+ bitcrush_batch,
+ clipping_batch,
+ compressor_batch,
+ delay_batch,
+ reverb_room_size_batch,
+ reverb_damping_batch,
+ reverb_wet_gain_batch,
+ reverb_dry_gain_batch,
+ reverb_width_batch,
+ reverb_freeze_mode_batch,
+ pitch_shift_semitones_batch,
+ limiter_threshold_batch,
+ limiter_release_time_batch,
+ gain_db_batch,
+ distortion_gain_batch,
+ chorus_rate_batch,
+ chorus_depth_batch,
+ chorus_center_delay_batch,
+ chorus_feedback_batch,
+ chorus_mix_batch,
+ bitcrush_bit_depth_batch,
+ clipping_threshold_batch,
+ compressor_threshold_batch,
+ compressor_ratio_batch,
+ compressor_attack_batch,
+ compressor_release_batch,
+ delay_seconds_batch,
+ delay_feedback_batch,
+ delay_mix_batch,
+ sid_batch,
+ ],
+ outputs=[vc_output3],
+ )
+ convert_button_batch.click(
+ fn=enable_stop_convert_button,
+ inputs=[],
+ outputs=[convert_button_batch, stop_button],
+ )
+ stop_button.click(
+ fn=disable_stop_convert_button,
+ inputs=[],
+ outputs=[convert_button_batch, stop_button],
+ )
diff --git a/tabs/plugins/plugins.py b/tabs/plugins/plugins.py
new file mode 100644
index 0000000000000000000000000000000000000000..afa1d5c92a7fda40405bb219307751ed8f2fc45b
--- /dev/null
+++ b/tabs/plugins/plugins.py
@@ -0,0 +1,34 @@
+import os, sys
+import gradio as gr
+import importlib.util
+import tabs.plugins.plugins_core as plugins_core
+
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+plugins_core.check_new_folders()
+
+
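+# Builds the plugin installer tab plus one tab per installed plugin; every plugin
+# package is expected to expose an applio_plugin() function in its "plugin" module.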
+def plugins_tab():
+ with gr.TabItem(i18n("Plugin Installer")):
+ dropbox = gr.File(
+ label=i18n("Drag your plugin.zip to install it"),
+ type="filepath",
+ )
+
+ dropbox.upload(
+ fn=plugins_core.save_plugin_dropbox,
+ inputs=[dropbox],
+ outputs=[dropbox],
+ )
+
+    installed_path = os.path.join(now_dir, "tabs", "plugins", "installed")
+    for plugin in os.listdir(installed_path):
+        plugin_path = os.path.join(installed_path, plugin)
+        # Skip stray files and cache folders such as __pycache__.
+        if not os.path.isdir(plugin_path) or plugin.startswith("__"):
+            continue
+        plugin_main = f"tabs.plugins.installed.{plugin}.plugin"
+        plugin_import = importlib.import_module(plugin_main)
+
+        with gr.TabItem(plugin):
+            plugin_import.applio_plugin()
diff --git a/tabs/plugins/plugins_core.py b/tabs/plugins/plugins_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..51f133789f7efeab1a2ef42b581c1e560a8c93b3
--- /dev/null
+++ b/tabs/plugins/plugins_core.py
@@ -0,0 +1,134 @@
+import os, sys, shutil
+import json
+import gradio as gr
+import zipfile
+import subprocess
+
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from tabs.settings.sections.restart import restart_applio
+
+plugins_path = os.path.join(now_dir, "tabs", "plugins", "installed")
+if not os.path.exists(plugins_path):
+ os.makedirs(plugins_path)
+json_file_path = os.path.join(now_dir, "assets", "config.json")
+current_folders = os.listdir(plugins_path)
+
+
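+# Installed plugins are tracked under the "plugins" key of assets/config.json so that
+# folders dropped in manually can be detected and set up on the next start.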
+def get_existing_folders():
+    if os.path.exists(json_file_path):
+        with open(json_file_path, "r") as file:
+            config = json.load(file)
+        return config.get("plugins", [])
+    return []
+
+
+def save_existing_folders(existing_folders):
+ with open(json_file_path, "r") as file:
+ config = json.load(file)
+ config["plugins"] = existing_folders
+ with open(json_file_path, "w") as file:
+ json.dump(config, file, indent=2)
+
+
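+# Handles a plugin .zip dropped into the UI: extract it into the installed folder,
+# install its requirements if present, register it and restart Applio.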
+def save_plugin_dropbox(dropbox):
+    if not dropbox.endswith(".zip"):
+ raise gr.Error(
+ message="The file you dropped is not a valid plugin.zip. Please try again."
+ )
+ else:
+ file_name = os.path.basename(dropbox)
+ folder_name = file_name.split(".zip")[0]
+ folder_path = os.path.join(plugins_path, folder_name)
+ zip_file_path = os.path.join(plugins_path, file_name)
+
+        if os.path.exists(folder_path):
+            shutil.rmtree(folder_path)
+
+ shutil.move(dropbox, os.path.join(plugins_path, file_name))
+ print("Proceeding with the extraction...")
+
+ with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
+ zip_ref.extractall(plugins_path)
+ os.remove(zip_file_path)
+
+ if os.path.exists(os.path.join(folder_path, "requirements.txt")):
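+            # On Windows the bundled ./env interpreter is used; elsewhere fall back to
+            # the system python on PATH.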
+ if os.name == "nt":
+ subprocess.run(
+ [
+ os.path.join("env", "python.exe"),
+ "-m",
+ "pip",
+ "install",
+ "-r",
+ os.path.join(folder_path, "requirements.txt"),
+ ]
+ )
+ else:
+ subprocess.run(
+ [
+ "python",
+ "-m",
+ "pip",
+ "install",
+ "-r",
+ os.path.join(folder_path, "requirements.txt"),
+ ]
+ )
+ else:
+ print("No requirements.txt file found in the plugin folder.")
+
+ save_existing_folders(get_existing_folders() + [folder_name])
+
+    message = (
+        f"{folder_name} plugin installed in {plugins_path}! "
+        "Restarting Applio to apply the changes."
+    )
+    print(message)
+    gr.Info(message)
+ restart_applio()
+ return None
+
+
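+# Startup check: install requirements for plugin folders that exist on disk but are
+# not yet registered in config.json, then restart so their tabs get loaded.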
+def check_new_folders():
+ existing_folders = get_existing_folders()
+ new_folders = set(current_folders) - set(existing_folders)
+ save_existing_folders(current_folders)
+ if new_folders:
+ for new_folder in new_folders:
+ complete_path = os.path.join(plugins_path, new_folder)
+ print(f"New plugin {new_folder} found, installing it...")
+
+ if os.path.exists(os.path.join(complete_path, "requirements.txt")):
+ if os.name == "nt":
+ subprocess.run(
+ [
+ os.path.join("env", "python.exe"),
+ "-m",
+ "pip",
+ "install",
+ "-r",
+ os.path.join(complete_path, "requirements.txt"),
+ ]
+ )
+ else:
+ subprocess.run(
+ [
+ "python",
+ "-m",
+ "pip",
+ "install",
+ "-r",
+ os.path.join(complete_path, "requirements.txt"),
+ ]
+ )
+ else:
+ print("No requirements.txt file found in the plugin folder.")
+        print("Plugins checked and installed! Restarting Applio to apply the changes.")
+ restart_applio()
diff --git a/tabs/report/main.js b/tabs/report/main.js
new file mode 100644
index 0000000000000000000000000000000000000000..755cb9ab442c247cab0ab647e1599481bff491aa
--- /dev/null
+++ b/tabs/report/main.js
@@ -0,0 +1,74 @@
+// main.js
+if (!ScreenCastRecorder.isSupportedBrowser()) {
+ console.error("Screen Recording not supported in this browser");
+}
+let recorder;
+let outputBlob;
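+// stopRecording finalizes the capture and triggers a download of the recorded .webm file.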
+const stopRecording = () => __awaiter(void 0, void 0, void 0, function* () {
+ let currentState = "RECORDING";
+    // Do nothing if the user tries to stop recording when it has not been started
+ if (currentState === "OFF" || recorder == null) {
+ return;
+ }
+ // if (currentState === "COUNTDOWN") {
+ // this.setState({
+ // currentState: "OFF",
+ // })
+ // }
+ if (currentState === "RECORDING") {
+ if (recorder.getState() === "inactive") {
+ // this.setState({
+ // currentState: "OFF",
+ // })
+ console.log("Inactive");
+ }
+ else {
+ outputBlob = yield recorder.stop();
+ console.log("Done recording");
+ // this.setState({
+ // outputBlob,
+ // currentState: "PREVIEW_FILE",
+ // })
+ window.currentState = "PREVIEW_FILE";
+ const videoSource = URL.createObjectURL(outputBlob);
+ window.videoSource = videoSource;
+ const fileName = "recording";
+ const link = document.createElement("a");
+ link.setAttribute("href", videoSource);
+ link.setAttribute("download", `${fileName}.webm`);
+ link.click();
+ }
+ }
+});
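+// startRecording asks for screen-capture permission and starts the recorder (audio capture disabled).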
+const startRecording = () => __awaiter(void 0, void 0, void 0, function* () {
+ const recordAudio = false;
+ recorder = new ScreenCastRecorder({
+ recordAudio,
+ onErrorOrStop: () => stopRecording(),
+ });
+ try {
+ yield recorder.initialize();
+ }
+ catch (e) {
+ console.warn(`ScreenCastRecorder.initialize error: ${e}`);
+ // this.setState({ currentState: "UNSUPPORTED" })
+ window.currentState = "UNSUPPORTED";
+ return;
+ }
+ // this.setState({ currentState: "COUNTDOWN" })
+ const hasStarted = recorder.start();
+ if (hasStarted) {
+ // this.setState({
+ // currentState: "RECORDING",
+ // })
+ console.log("Started recording");
+ window.currentState = "RECORDING";
+ }
+ else {
+ stopRecording().catch(err => console.warn(`withScreencast.stopRecording threw an error: ${err}`));
+ }
+});
+
+// Set global functions to window.
+window.startRecording = startRecording;
+window.stopRecording = stopRecording;
\ No newline at end of file
diff --git a/tabs/report/record_button.js b/tabs/report/record_button.js
new file mode 100644
index 0000000000000000000000000000000000000000..aa4fbf33fdaee2635cefc931ef0a786d5b06824a
--- /dev/null
+++ b/tabs/report/record_button.js
@@ -0,0 +1,40 @@
+// Setup if needed and start recording.
+async () => {
+ // Set up recording functions if not already initialized
+ if (!window.startRecording) {
+ let recorder_js = null;
+ let main_js = null;
+ }
+
+ // Function to fetch and convert video blob to base64 using async/await without explicit Promise
+ async function getVideoBlobAsBase64(objectURL) {
+ const response = await fetch(objectURL);
+ if (!response.ok) {
+ throw new Error('Failed to fetch video blob.');
+ }
+
+ const blob = await response.blob();
+
+ const reader = new FileReader();
+ reader.readAsDataURL(blob);
+
+ return new Promise((resolve, reject) => {
+ reader.onloadend = () => {
+ if (reader.result) {
+ resolve(reader.result.split(',')[1]); // Return the base64 string (without data URI prefix)
+ } else {
+ reject('Failed to convert blob to base64.');
+ }
+ };
+ });
+ }
+
+ if (window.currentState === "RECORDING") {
+ await window.stopRecording();
+ const base64String = await getVideoBlobAsBase64(window.videoSource);
+ return base64String;
+ } else {
+ window.startRecording();
+ return "Record";
+ }
+}
diff --git a/tabs/report/recorder.js b/tabs/report/recorder.js
new file mode 100644
index 0000000000000000000000000000000000000000..d054437c04bacb705425f9cd7c6783e3895fade1
--- /dev/null
+++ b/tabs/report/recorder.js
@@ -0,0 +1,112 @@
+// recorder.js
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+};
+const BLOB_TYPE = "video/webm";
+class ScreenCastRecorder {
+ /** True if the current browser likely supports screencasts. */
+ static isSupportedBrowser() {
+ return (navigator.mediaDevices != null &&
+ navigator.mediaDevices.getUserMedia != null &&
+ navigator.mediaDevices.getDisplayMedia != null &&
+ MediaRecorder.isTypeSupported(BLOB_TYPE));
+ }
+ constructor({ recordAudio, onErrorOrStop }) {
+ this.recordAudio = recordAudio;
+ this.onErrorOrStopCallback = onErrorOrStop;
+ this.inputStream = null;
+ this.recordedChunks = [];
+ this.mediaRecorder = null;
+ }
+ /**
+ * This asynchronous method will initialize the screen recording object asking
+ * for permissions to the user which are needed to start recording.
+ */
+ initialize() {
+ return __awaiter(this, void 0, void 0, function* () {
+ const desktopStream = yield navigator.mediaDevices.getDisplayMedia({
+ video: true,
+ });
+ let tracks = desktopStream.getTracks();
+ if (this.recordAudio) {
+ const voiceStream = yield navigator.mediaDevices.getUserMedia({
+ video: false,
+ audio: true,
+ });
+ tracks = tracks.concat(voiceStream.getAudioTracks());
+ }
+ this.recordedChunks = [];
+ this.inputStream = new MediaStream(tracks);
+ this.mediaRecorder = new MediaRecorder(this.inputStream, {
+ mimeType: BLOB_TYPE,
+ });
+ this.mediaRecorder.ondataavailable = e => this.recordedChunks.push(e.data);
+ });
+ }
+ getState() {
+ if (this.mediaRecorder) {
+ return this.mediaRecorder.state;
+ }
+ return "inactive";
+ }
+ /**
+     * Starts the screen recording, provided the user has granted permissions
+     * and the mediaRecorder has been initialized.
+ *
+ * @returns {boolean}
+ */
+ start() {
+ if (!this.mediaRecorder) {
+ console.warn(`ScreenCastRecorder.start: mediaRecorder is null`);
+ return false;
+ }
+ const logRecorderError = (e) => {
+ console.warn(`mediaRecorder.start threw an error: ${e}`);
+ };
+ this.mediaRecorder.onerror = (e) => {
+ logRecorderError(e);
+ this.onErrorOrStopCallback();
+ };
+ this.mediaRecorder.onstop = () => this.onErrorOrStopCallback();
+ try {
+ this.mediaRecorder.start();
+ }
+ catch (e) {
+ logRecorderError(e);
+ return false;
+ }
+ return true;
+ }
+ /**
+ * This method will stop recording and then return the generated Blob
+ *
+ * @returns {(Promise|undefined)}
+ * A Promise which will return the generated Blob
+ * Undefined if the MediaRecorder could not initialize
+ */
+ stop() {
+ if (!this.mediaRecorder) {
+ return undefined;
+ }
+ let resolver;
+ const promise = new Promise(r => {
+ resolver = r;
+ });
+ this.mediaRecorder.onstop = () => resolver();
+ this.mediaRecorder.stop();
+ if (this.inputStream) {
+ this.inputStream.getTracks().forEach(s => s.stop());
+ this.inputStream = null;
+ }
+ return promise.then(() => this.buildOutputBlob());
+ }
+ buildOutputBlob() {
+ return new Blob(this.recordedChunks, { type: BLOB_TYPE });
+ }
+}
\ No newline at end of file
diff --git a/tabs/report/report.py b/tabs/report/report.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f13e14c487fe45e739bd92495fb96572af001c7
--- /dev/null
+++ b/tabs/report/report.py
@@ -0,0 +1,80 @@
+import os
+import sys
+import base64
+import pathlib
+import tempfile
+import gradio as gr
+
+from assets.i18n.i18n import I18nAuto
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+i18n = I18nAuto()
+
+recorder_js_path = os.path.join(now_dir, "tabs", "report", "recorder.js")
+main_js_path = os.path.join(now_dir, "tabs", "report", "main.js")
+record_button_js_path = os.path.join(now_dir, "tabs", "report", "record_button.js")
+
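+# Inline recorder.js and main.js into the record button snippet so a single self-contained script can be passed to Gradio's js hook.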
+recorder_js = pathlib.Path(recorder_js_path).read_text()
+main_js = pathlib.Path(main_js_path).read_text()
+record_button_js = (
+ pathlib.Path(record_button_js_path)
+ .read_text()
+ .replace("let recorder_js = null;", recorder_js)
+ .replace("let main_js = null;", main_js)
+)
+
+
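+# Decode the base64-encoded recording returned by the JS snippet and write it to a temporary .mp4 file.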
+def save_base64_video(base64_string):
+ base64_video = base64_string
+ video_data = base64.b64decode(base64_video)
+ with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+ temp_filename = temp_file.name
+ temp_file.write(video_data)
+ print(f"Temporary MP4 file saved as: {temp_filename}")
+ return temp_filename
+
+
+def report_tab():
+ instructions = [
+ i18n("# How to Report an Issue on GitHub"),
+ i18n(
+ "1. Click on the 'Record Screen' button below to start recording the issue you are experiencing."
+ ),
+ i18n(
+ "2. Once you have finished recording the issue, click on the 'Stop Recording' button (the same button, but the label changes depending on whether you are actively recording or not)."
+ ),
+ i18n(
+ "3. Go to [GitHub Issues](https://github.com/IAHispano/Applio/issues) and click on the 'New Issue' button."
+ ),
+ i18n(
+ "4. Complete the provided issue template, ensuring to include details as needed, and utilize the assets section to upload the recorded file from the previous step."
+ ),
+ ]
+ components = [gr.Markdown(value=instruction) for instruction in instructions]
+
+ start_button = gr.Button("Record Screen")
+ video_component = gr.Video(interactive=False)
+
+ def toggle_button_label(returned_string):
+ if returned_string.startswith("Record"):
+ return gr.Button(value="Stop Recording"), None
+ else:
+ try:
+ temp_filename = save_base64_video(returned_string)
+ except Exception as error:
+ print(f"An error occurred converting video to mp4: {error}")
+ return gr.Button(value="Record Screen"), gr.Warning(
+ f"Failed to convert video to mp4:\n{error}"
+ )
+ return gr.Button(value="Record Screen"), gr.Video(
+ value=temp_filename, interactive=False
+ )
+
+ start_button.click(
+ fn=toggle_button_label,
+ inputs=[start_button],
+ outputs=[start_button, video_component],
+ js=record_button_js,
+ )
diff --git a/tabs/settings/sections/lang.py b/tabs/settings/sections/lang.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dc8d7b8b4f2423a709d53f2ef90ebe1577f08b4
--- /dev/null
+++ b/tabs/settings/sections/lang.py
@@ -0,0 +1,57 @@
+import os, sys
+import json
+import gradio as gr
+from assets.i18n.i18n import I18nAuto
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+i18n = I18nAuto()
+
+config_file = os.path.join(now_dir, "assets", "config.json")
+
+
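+# Return either the automatic-detection sentinel or the language currently selected in assets/config.json.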
+def get_language_settings():
+ with open(config_file, "r", encoding="utf8") as file:
+ config = json.load(file)
+
+ if config["lang"]["override"] == False:
+ return "Language automatically detected in the system"
+ else:
+ return config["lang"]["selected_lang"]
+
+
+def save_lang_settings(selected_language):
+ with open(config_file, "r", encoding="utf8") as file:
+ config = json.load(file)
+
+ if selected_language == "Language automatically detected in the system":
+ config["lang"]["override"] = False
+ else:
+ config["lang"]["override"] = True
+ config["lang"]["selected_lang"] = selected_language
+
+ gr.Info("Language have been saved. Restart Applio to apply the changes.")
+
+ with open(config_file, "w", encoding="utf8") as file:
+ json.dump(config, file, indent=2)
+
+
+def lang_tab():
+ with gr.Column():
+ selected_language = gr.Dropdown(
+ label=i18n("Language"),
+ info=i18n(
+ "Select the language you want to use. (Requires restarting Applio)"
+ ),
+ value=get_language_settings(),
+ choices=["Language automatically detected in the system"]
+ + i18n._get_available_languages(),
+ interactive=True,
+ )
+
+ selected_language.change(
+ fn=save_lang_settings,
+ inputs=[selected_language],
+ outputs=[],
+ )
diff --git a/tabs/settings/sections/model_author.py b/tabs/settings/sections/model_author.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d527803cb881becd12bf522dbb6fa1d64dda703
--- /dev/null
+++ b/tabs/settings/sections/model_author.py
@@ -0,0 +1,54 @@
+import os
+import sys
+import json
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+import gradio as gr
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def set_model_author(model_author: str):
+ with open(os.path.join(now_dir, "assets", "config.json"), "r") as f:
+ config = json.load(f)
+
+ config["model_author"] = model_author
+
+ with open(os.path.join(now_dir, "assets", "config.json"), "w") as f:
+ json.dump(config, f, indent=4)
+
+ print(f"Model author set to {model_author}.")
+ return f"Model author set to {model_author}."
+
+
+def get_model_author():
+ with open(os.path.join(now_dir, "assets", "config.json"), "r") as f:
+ config = json.load(f)
+
+ return config["model_author"] if "model_author" in config else None
+
+
+def model_author_tab():
+ model_author_name = gr.Textbox(
+ label=i18n("Model Author Name"),
+ info=i18n("The name that will appear in the model information."),
+ value=get_model_author(),
+ placeholder=i18n("Enter your nickname"),
+ interactive=True,
+ )
+ model_author_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=1,
+ )
+ button = gr.Button(i18n("Set name"))
+
+ button.click(
+ fn=set_model_author,
+ inputs=[model_author_name],
+ outputs=[model_author_output_info],
+ )
diff --git a/tabs/settings/sections/presence.py b/tabs/settings/sections/presence.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4dc2e6a0f6aad731194c30c1a79cdd46544579f
--- /dev/null
+++ b/tabs/settings/sections/presence.py
@@ -0,0 +1,55 @@
+import os
+import sys
+import gradio as gr
+import json
+from assets.i18n.i18n import I18nAuto
+from assets.discord_presence import RPCManager
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+i18n = I18nAuto()
+config_file = os.path.join(now_dir, "assets", "config.json")
+
+
+def load_config_presence():
+ with open(config_file, "r", encoding="utf8") as file:
+ config = json.load(file)
+ return config["discord_presence"]
+
+
+def save_config(value):
+ with open(config_file, "r", encoding="utf8") as file:
+ config = json.load(file)
+ config["discord_presence"] = value
+ with open(config_file, "w", encoding="utf8") as file:
+ json.dump(config, file, indent=2)
+
+
+def presence_tab():
+ with gr.Row():
+ with gr.Column():
+ presence = gr.Checkbox(
+ label=i18n("Enable Applio integration with Discord presence"),
+ info=i18n(
+ "It will activate the possibility of displaying the current Applio activity in Discord."
+ ),
+ interactive=True,
+ value=load_config_presence(),
+ )
+ presence.change(
+ fn=toggle,
+ inputs=[presence],
+ outputs=[],
+ )
+
+
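+# Persist the checkbox state and start or stop the Discord Rich Presence client accordingly.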
+def toggle(checkbox):
+ save_config(bool(checkbox))
+    if load_config_presence():
+ try:
+ RPCManager.start_presence()
+ except KeyboardInterrupt:
+ RPCManager.stop_presence()
+ else:
+ RPCManager.stop_presence()
diff --git a/tabs/settings/sections/restart.py b/tabs/settings/sections/restart.py
new file mode 100644
index 0000000000000000000000000000000000000000..50c3bce5850fe9dac6c11255f58cfb2f1f77951f
--- /dev/null
+++ b/tabs/settings/sections/restart.py
@@ -0,0 +1,58 @@
+import os
+import sys
+import json
+
+import gradio as gr
+
+from assets.i18n.i18n import I18nAuto
+
+now_dir = os.getcwd()
+i18n = I18nAuto()
+
+
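+# Read the training PIDs recorded in logs/<model_name>/config.json, clear them from the file, and terminate the processes.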
+def stop_train(model_name: str):
+ pid_file_path = os.path.join(now_dir, "logs", model_name, "config.json")
+ try:
+ with open(pid_file_path, "r") as pid_file:
+ pid_data = json.load(pid_file)
+ pids = pid_data.get("process_pids", [])
+ with open(pid_file_path, "w") as pid_file:
+ pid_data.pop("process_pids", None)
+ json.dump(pid_data, pid_file, indent=4)
+ for pid in pids:
+ os.kill(pid, 9)
+    except Exception:
+ pass
+
+
+def stop_infer():
+ pid_file_path = os.path.join(now_dir, "assets", "infer_pid.txt")
+ try:
+ with open(pid_file_path, "r") as pid_file:
+ pids = [int(pid) for pid in pid_file.readlines()]
+ for pid in pids:
+ os.kill(pid, 9)
+ os.remove(pid_file_path)
+    except Exception:
+ pass
+
+
+def restart_applio():
+ if os.name != "nt":
+ os.system("clear")
+ else:
+ os.system("cls")
+ python = sys.executable
+ os.execl(python, python, *sys.argv)
+
+
+def restart_tab():
+ with gr.Row():
+ with gr.Column():
+ restart_button = gr.Button(i18n("Restart Applio"))
+ restart_button.click(
+ fn=restart_applio,
+ inputs=[],
+ outputs=[],
+ )
diff --git a/tabs/settings/sections/themes.py b/tabs/settings/sections/themes.py
new file mode 100644
index 0000000000000000000000000000000000000000..33ca373d0e0a7dd11269a22c46ac33e60ced1cec
--- /dev/null
+++ b/tabs/settings/sections/themes.py
@@ -0,0 +1,30 @@
+import os
+import sys
+import gradio as gr
+
+from assets.i18n.i18n import I18nAuto
+import assets.themes.loadThemes as loadThemes
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+i18n = I18nAuto()
+
+
+def theme_tab():
+ with gr.Row():
+ with gr.Column():
+ themes_select = gr.Dropdown(
+ loadThemes.get_theme_list(),
+ value=loadThemes.load_theme(),
+ label=i18n("Theme"),
+ info=i18n(
+ "Select the theme you want to use. (Requires restarting Applio)"
+ ),
+ visible=True,
+ )
+ themes_select.change(
+ fn=loadThemes.select_theme,
+ inputs=themes_select,
+ outputs=[],
+ )
diff --git a/tabs/settings/sections/version.py b/tabs/settings/sections/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bd83884c11e878923627c56d61a58e4b4d8cecf
--- /dev/null
+++ b/tabs/settings/sections/version.py
@@ -0,0 +1,24 @@
+import gradio as gr
+
+from assets.version_checker import compare_version
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def version_tab():
+ with gr.Row():
+ with gr.Column():
+ version_check = gr.Textbox(
+ label=i18n("Version Checker"),
+ info=i18n(
+ "Check which version of Applio is the latest to see if you need to update."
+ ),
+ interactive=False,
+ )
+ version_button = gr.Button(i18n("Check for updates"))
+ version_button.click(
+ fn=compare_version,
+ inputs=[],
+ outputs=[version_check],
+ )
diff --git a/tabs/settings/settings.py b/tabs/settings/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b54cf105379cde2eaa495264dae88bdfaad8196
--- /dev/null
+++ b/tabs/settings/settings.py
@@ -0,0 +1,24 @@
+import os
+import sys
+import gradio as gr
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from tabs.settings.sections.presence import presence_tab
+from tabs.settings.sections.themes import theme_tab
+from tabs.settings.sections.version import version_tab
+from tabs.settings.sections.lang import lang_tab
+from tabs.settings.sections.restart import restart_tab
+from tabs.settings.sections.model_author import model_author_tab
+
+
+def settings_tab():
+ with gr.TabItem(label="General"):
+ presence_tab()
+ theme_tab()
+ version_tab()
+ lang_tab()
+ restart_tab()
+ with gr.TabItem(label="Training"):
+ model_author_tab()
diff --git a/tabs/train/train.py b/tabs/train/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ada1d391fb3a67b2549437c14997d106724b0d0
--- /dev/null
+++ b/tabs/train/train.py
@@ -0,0 +1,1008 @@
+import os
+import shutil
+import sys
+from multiprocessing import cpu_count
+
+import gradio as gr
+
+from assets.i18n.i18n import I18nAuto
+from core import (
+ run_extract_script,
+ run_index_script,
+ run_preprocess_script,
+ run_prerequisites_script,
+ run_train_script,
+)
+from rvc.configs.config import get_gpu_info, get_number_of_gpus, max_vram_gpu
+from rvc.lib.utils import format_title
+from tabs.settings.sections.restart import stop_train
+
+i18n = I18nAuto()
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+
+sup_audioext = {
+ "wav",
+ "mp3",
+ "flac",
+ "ogg",
+ "opus",
+ "m4a",
+ "mp4",
+ "aac",
+ "alac",
+ "wma",
+ "aiff",
+ "webm",
+ "ac3",
+}
+
+# Custom Pretraineds
+pretraineds_custom_path = os.path.join(
+ now_dir, "rvc", "models", "pretraineds", "pretraineds_custom"
+)
+
+pretraineds_custom_path_relative = os.path.relpath(pretraineds_custom_path, now_dir)
+
+custom_embedder_root = os.path.join(
+ now_dir, "rvc", "models", "embedders", "embedders_custom"
+)
+custom_embedder_root_relative = os.path.relpath(custom_embedder_root, now_dir)
+
+os.makedirs(custom_embedder_root, exist_ok=True)
+os.makedirs(pretraineds_custom_path_relative, exist_ok=True)
+
+
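+# Collect custom pretrained .pth checkpoints whose filename contains the given suffix ("G" or "D").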
+def get_pretrained_list(suffix):
+ return [
+ os.path.join(dirpath, filename)
+ for dirpath, _, filenames in os.walk(pretraineds_custom_path_relative)
+ for filename in filenames
+ if filename.endswith(".pth") and suffix in filename
+ ]
+
+
+pretraineds_list_d = get_pretrained_list("D")
+pretraineds_list_g = get_pretrained_list("G")
+
+
+def refresh_custom_pretraineds():
+ return (
+ {"choices": sorted(get_pretrained_list("G")), "__type__": "update"},
+ {"choices": sorted(get_pretrained_list("D")), "__type__": "update"},
+ )
+
+
+# Dataset Creator
+datasets_path = os.path.join(now_dir, "assets", "datasets")
+
+if not os.path.exists(datasets_path):
+ os.makedirs(datasets_path)
+
+datasets_path_relative = os.path.relpath(datasets_path, now_dir)
+
+
+def get_datasets_list():
+ return [
+ dirpath
+ for dirpath, _, filenames in os.walk(datasets_path_relative)
+ if any(filename.endswith(tuple(sup_audioext)) for filename in filenames)
+ ]
+
+
+def refresh_datasets():
+ return {"choices": sorted(get_datasets_list()), "__type__": "update"}
+
+
+# Model Names
+models_path = os.path.join(now_dir, "logs")
+
+
+def get_models_list():
+ return [
+ os.path.basename(dirpath)
+ for dirpath in os.listdir(models_path)
+ if os.path.isdir(os.path.join(models_path, dirpath))
+ and all(excluded not in dirpath for excluded in ["zips", "mute", "reference"])
+ ]
+
+
+def refresh_models():
+ return {"choices": sorted(get_models_list()), "__type__": "update"}
+
+
+# Refresh Models and Datasets
+def refresh_models_and_datasets():
+ return (
+ {"choices": sorted(get_models_list()), "__type__": "update"},
+ {"choices": sorted(get_datasets_list()), "__type__": "update"},
+ )
+
+
+# Refresh Custom Embedders
+def get_embedder_custom_list():
+ return [
+ os.path.join(dirpath, dirname)
+ for dirpath, dirnames, _ in os.walk(custom_embedder_root_relative)
+ for dirname in dirnames
+ ]
+
+
+def refresh_custom_embedder_list():
+ return {"choices": sorted(get_embedder_custom_list()), "__type__": "update"}
+
+
+# Drop Model
+def save_drop_model(dropbox):
+ if ".pth" not in dropbox:
+ gr.Info(
+ i18n(
+ "The file you dropped is not a valid pretrained file. Please try again."
+ )
+ )
+ else:
+ file_name = os.path.basename(dropbox)
+ pretrained_path = os.path.join(pretraineds_custom_path_relative, file_name)
+ if os.path.exists(pretrained_path):
+ os.remove(pretrained_path)
+ shutil.copy(dropbox, pretrained_path)
+ gr.Info(
+ i18n(
+ "Click the refresh button to see the pretrained file in the dropdown menu."
+ )
+ )
+ return None
+
+
+# Drop Dataset
+def save_drop_dataset_audio(dropbox, dataset_name):
+ if not dataset_name:
+ gr.Info("Please enter a valid dataset name. Please try again.")
+ return None, None
+ else:
+ file_extension = os.path.splitext(dropbox)[1][1:].lower()
+ if file_extension not in sup_audioext:
+ gr.Info("The file you dropped is not a valid audio file. Please try again.")
+ else:
+ dataset_name = format_title(dataset_name)
+ audio_file = format_title(os.path.basename(dropbox))
+ dataset_path = os.path.join(now_dir, "assets", "datasets", dataset_name)
+ if not os.path.exists(dataset_path):
+ os.makedirs(dataset_path)
+ destination_path = os.path.join(dataset_path, audio_file)
+ if os.path.exists(destination_path):
+ os.remove(destination_path)
+ shutil.copy(dropbox, destination_path)
+ gr.Info(
+ i18n(
+ "The audio file has been successfully added to the dataset. Please click the preprocess button."
+ )
+ )
+ dataset_path = os.path.dirname(destination_path)
+ relative_dataset_path = os.path.relpath(dataset_path, now_dir)
+
+ return None, relative_dataset_path
+
+
+# Drop Custom Embedder
+def create_folder_and_move_files(folder_name, bin_file, config_file):
+ if not folder_name:
+ return "Folder name must not be empty."
+
+ folder_name = os.path.join(custom_embedder_root, folder_name)
+ os.makedirs(folder_name, exist_ok=True)
+
+ if bin_file:
+ bin_file_path = os.path.join(folder_name, os.path.basename(bin_file))
+ shutil.copy(bin_file, bin_file_path)
+
+ if config_file:
+ config_file_path = os.path.join(folder_name, os.path.basename(config_file))
+ shutil.copy(config_file, config_file_path)
+
+ return f"Files moved to folder {folder_name}"
+
+
+def refresh_embedders_folders():
+ custom_embedders = [
+ os.path.join(dirpath, dirname)
+ for dirpath, dirnames, _ in os.walk(custom_embedder_root_relative)
+ for dirname in dirnames
+ ]
+ return custom_embedders
+
+
+# Export
+## Get Pth and Index Files
+def get_pth_list():
+ return [
+ os.path.relpath(os.path.join(dirpath, filename), now_dir)
+ for dirpath, _, filenames in os.walk(models_path)
+ for filename in filenames
+ if filename.endswith(".pth")
+ ]
+
+
+def get_index_list():
+ return [
+ os.path.relpath(os.path.join(dirpath, filename), now_dir)
+ for dirpath, _, filenames in os.walk(models_path)
+ for filename in filenames
+ if filename.endswith(".index") and "trained" not in filename
+ ]
+
+
+def refresh_pth_and_index_list():
+ return (
+ {"choices": sorted(get_pth_list()), "__type__": "update"},
+ {"choices": sorted(get_index_list()), "__type__": "update"},
+ )
+
+
+## Export Pth and Index Files
+def export_pth(pth_path):
+ if pth_path and os.path.exists(pth_path):
+ return pth_path
+ return None
+
+
+def export_index(index_path):
+ if index_path and os.path.exists(index_path):
+ return index_path
+ return None
+
+
+## Upload to Google Drive
+def upload_to_google_drive(pth_path, index_path):
+ def upload_file(file_path):
+ if file_path:
+ try:
+ gr.Info(f"Uploading {pth_path} to Google Drive...")
+ google_drive_folder = "/content/drive/MyDrive/ApplioExported"
+ if not os.path.exists(google_drive_folder):
+ os.makedirs(google_drive_folder)
+ google_drive_file_path = os.path.join(
+ google_drive_folder, os.path.basename(file_path)
+ )
+ if os.path.exists(google_drive_file_path):
+ os.remove(google_drive_file_path)
+ shutil.copy2(file_path, google_drive_file_path)
+ gr.Info("File uploaded successfully.")
+ except Exception as error:
+ print(f"An error occurred uploading to Google Drive: {error}")
+ gr.Info("Error uploading to Google Drive")
+
+ upload_file(pth_path)
+ upload_file(index_path)
+
+
+# Train Tab
+def train_tab():
+ # Model settings section
+ with gr.Accordion(i18n("Model Settings")):
+ with gr.Row():
+ with gr.Column():
+ model_name = gr.Dropdown(
+ label=i18n("Model Name"),
+ info=i18n("Name of the new model."),
+ choices=get_models_list(),
+ value="my-project",
+ interactive=True,
+ allow_custom_value=True,
+ )
+ architecture = gr.Radio(
+ label=i18n("Architecture"),
+ info=i18n(
+ "Choose the model architecture:\n- **RVC (V2)**: Default option, compatible with all clients.\n- **Applio**: Advanced quality with improved vocoders and higher sample rates, Applio-only."
+ ),
+ choices=["RVC", "Applio"],
+ value="RVC",
+ interactive=True,
+ visible=True,
+ )
+ with gr.Column():
+ sampling_rate = gr.Radio(
+ label=i18n("Sampling Rate"),
+ info=i18n("The sampling rate of the audio files."),
+ choices=["32000", "40000", "48000"],
+ value="40000",
+ interactive=True,
+ )
+ vocoder = gr.Radio(
+ label=i18n("Vocoder"),
+ info=i18n(
+ "Choose the vocoder for audio synthesis:\n- **HiFi-GAN**: Default option, compatible with all clients.\n- **MRF HiFi-GAN**: Higher fidelity, Applio-only.\n- **RefineGAN**: Superior audio quality, Applio-only."
+ ),
+ choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
+ value="HiFi-GAN",
+ interactive=False,
+ visible=True,
+ )
+ with gr.Accordion(
+ i18n("Advanced Settings"),
+ open=False,
+ ):
+ with gr.Row():
+ with gr.Column():
+ cpu_cores = gr.Slider(
+ 1,
+ min(cpu_count(), 32), # max 32 parallel processes
+ min(cpu_count(), 32),
+ step=1,
+ label=i18n("CPU Cores"),
+ info=i18n(
+ "The number of CPU cores to use in the extraction process. The default setting are your cpu cores, which is recommended for most cases."
+ ),
+ interactive=True,
+ )
+
+ with gr.Column():
+ gpu = gr.Textbox(
+ label=i18n("GPU Number"),
+ info=i18n(
+ "Specify the number of GPUs you wish to utilize for extracting by entering them separated by hyphens (-)."
+ ),
+ placeholder=i18n("0 to ∞ separated by -"),
+ value=str(get_number_of_gpus()),
+ interactive=True,
+ )
+ gr.Textbox(
+ label=i18n("GPU Information"),
+ info=i18n("The GPU information will be displayed here."),
+ value=get_gpu_info(),
+ interactive=False,
+ )
+ # Preprocess section
+ with gr.Accordion(i18n("Preprocess")):
+ dataset_path = gr.Dropdown(
+ label=i18n("Dataset Path"),
+ info=i18n("Path to the dataset folder."),
+ # placeholder=i18n("Enter dataset path"),
+ choices=get_datasets_list(),
+ allow_custom_value=True,
+ interactive=True,
+ )
+ dataset_creator = gr.Checkbox(
+ label=i18n("Dataset Creator"),
+ value=False,
+ interactive=True,
+ visible=True,
+ )
+ with gr.Column(visible=False) as dataset_creator_settings:
+ with gr.Accordion(i18n("Dataset Creator")):
+ dataset_name = gr.Textbox(
+ label=i18n("Dataset Name"),
+ info=i18n("Name of the new dataset."),
+ placeholder=i18n("Enter dataset name"),
+ interactive=True,
+ )
+ upload_audio_dataset = gr.File(
+ label=i18n("Upload Audio Dataset"),
+ type="filepath",
+ interactive=True,
+ )
+ refresh = gr.Button(i18n("Refresh"))
+
+ with gr.Accordion(i18n("Advanced Settings"), open=False):
+ cut_preprocess = gr.Radio(
+ label=i18n("Audio cutting"),
+ info=i18n(
+ "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it."
+ ),
+ choices=["Skip", "Simple", "Automatic"],
+ value="Automatic",
+ interactive=True,
+ )
+ with gr.Row():
+ chunk_len = gr.Slider(
+ 0.5,
+ 5.0,
+ 3.0,
+ step=0.1,
+ label=i18n("Chunk length (sec)"),
+ info=i18n("Length of the audio slice for 'Simple' method."),
+ interactive=True,
+ )
+ overlap_len = gr.Slider(
+ 0.0,
+ 0.4,
+ 0.3,
+ step=0.1,
+ label=i18n("Overlap length (sec)"),
+ info=i18n(
+ "Length of the overlap between slices for 'Simple' method."
+ ),
+ interactive=True,
+ )
+
+ with gr.Row():
+ process_effects = gr.Checkbox(
+ label=i18n("Process effects"),
+ info=i18n(
+ "It's recommended to deactivate this option if your dataset has already been processed."
+ ),
+ value=True,
+ interactive=True,
+ visible=True,
+ )
+ noise_reduction = gr.Checkbox(
+ label=i18n("Noise Reduction"),
+ info=i18n(
+ "It's recommended keep deactivate this option if your dataset has already been processed."
+ ),
+ value=False,
+ interactive=True,
+ visible=True,
+ )
+ clean_strength = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Noise Reduction Strength"),
+ info=i18n(
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
+ ),
+ visible=False,
+ value=0.5,
+ interactive=True,
+ )
+ preprocess_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=8,
+ interactive=False,
+ )
+
+ with gr.Row():
+ preprocess_button = gr.Button(i18n("Preprocess Dataset"))
+ preprocess_button.click(
+ fn=run_preprocess_script,
+ inputs=[
+ model_name,
+ dataset_path,
+ sampling_rate,
+ cpu_cores,
+ cut_preprocess,
+ process_effects,
+ noise_reduction,
+ clean_strength,
+ chunk_len,
+ overlap_len,
+ ],
+ outputs=[preprocess_output_info],
+ )
+
+ # Extract section
+ with gr.Accordion(i18n("Extract")):
+ with gr.Row():
+ f0_method = gr.Radio(
+ label=i18n("Pitch extraction algorithm"),
+ info=i18n(
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
+ ),
+ choices=["crepe", "crepe-tiny", "rmvpe"],
+ value="rmvpe",
+ interactive=True,
+ )
+
+ embedder_model = gr.Radio(
+ label=i18n("Embedder Model"),
+ info=i18n("Model used for learning speaker embedding."),
+ choices=[
+ "contentvec",
+ "chinese-hubert-base",
+ "japanese-hubert-base",
+ "korean-hubert-base",
+ "custom",
+ ],
+ value="contentvec",
+ interactive=True,
+ )
+ include_mutes = gr.Slider(
+ 0,
+ 10,
+ 2,
+ step=1,
+ label=i18n("Silent training files"),
+ info=i18n(
+ "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence."
+ ),
+ interactive=True,
+ )
+ hop_length = gr.Slider(
+ 1,
+ 512,
+ 128,
+ step=1,
+ label=i18n("Hop Length"),
+ info=i18n(
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
+ ),
+ visible=False,
+ interactive=True,
+ )
+ with gr.Row(visible=False) as embedder_custom:
+ with gr.Accordion("Custom Embedder", open=True):
+ with gr.Row():
+ embedder_model_custom = gr.Dropdown(
+ label="Select Custom Embedder",
+ choices=refresh_embedders_folders(),
+ interactive=True,
+ allow_custom_value=True,
+ )
+ refresh_embedders_button = gr.Button("Refresh embedders")
+ folder_name_input = gr.Textbox(label="Folder Name", interactive=True)
+ with gr.Row():
+ bin_file_upload = gr.File(
+ label="Upload .bin", type="filepath", interactive=True
+ )
+ config_file_upload = gr.File(
+ label="Upload .json", type="filepath", interactive=True
+ )
+ move_files_button = gr.Button("Move files to custom embedder folder")
+
+ extract_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=8,
+ interactive=False,
+ )
+ extract_button = gr.Button(i18n("Extract Features"))
+ extract_button.click(
+ fn=run_extract_script,
+ inputs=[
+ model_name,
+ f0_method,
+ hop_length,
+ cpu_cores,
+ gpu,
+ sampling_rate,
+ embedder_model,
+ embedder_model_custom,
+ include_mutes,
+ ],
+ outputs=[extract_output_info],
+ )
+
+ # Training section
+ with gr.Accordion(i18n("Training")):
+ with gr.Row():
+ batch_size = gr.Slider(
+ 1,
+ 50,
+ max_vram_gpu(0),
+ step=1,
+ label=i18n("Batch Size"),
+ info=i18n(
+ "It's advisable to align it with the available VRAM of your GPU. A setting of 4 offers improved accuracy but slower processing, while 8 provides faster and standard results."
+ ),
+ interactive=True,
+ )
+ save_every_epoch = gr.Slider(
+ 1,
+ 100,
+ 10,
+ step=1,
+ label=i18n("Save Every Epoch"),
+ info=i18n("Determine at how many epochs the model will saved at."),
+ interactive=True,
+ )
+ total_epoch = gr.Slider(
+ 1,
+ 10000,
+ 500,
+ step=1,
+ label=i18n("Total Epoch"),
+ info=i18n(
+ "Specifies the overall quantity of epochs for the model training process."
+ ),
+ interactive=True,
+ )
+ with gr.Accordion(i18n("Advanced Settings"), open=False):
+ with gr.Row():
+ with gr.Column():
+ save_only_latest = gr.Checkbox(
+ label=i18n("Save Only Latest"),
+ info=i18n(
+ "Enabling this setting will result in the G and D files saving only their most recent versions, effectively conserving storage space."
+ ),
+ value=True,
+ interactive=True,
+ )
+ save_every_weights = gr.Checkbox(
+ label=i18n("Save Every Weights"),
+ info=i18n(
+ "This setting enables you to save the weights of the model at the conclusion of each epoch."
+ ),
+ value=True,
+ interactive=True,
+ )
+ pretrained = gr.Checkbox(
+ label=i18n("Pretrained"),
+ info=i18n(
+ "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality."
+ ),
+ value=True,
+ interactive=True,
+ )
+ with gr.Column():
+ cleanup = gr.Checkbox(
+ label=i18n("Fresh Training"),
+ info=i18n(
+ "Enable this setting only if you are training a new model from scratch or restarting the training. Deletes all previously generated weights and tensorboard logs."
+ ),
+ value=False,
+ interactive=True,
+ )
+ cache_dataset_in_gpu = gr.Checkbox(
+ label=i18n("Cache Dataset in GPU"),
+ info=i18n(
+ "Cache the dataset in GPU memory to speed up the training process."
+ ),
+ value=False,
+ interactive=True,
+ )
+ checkpointing = gr.Checkbox(
+ label=i18n("Checkpointing"),
+ info=i18n(
+ "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate."
+ ),
+ value=False,
+ interactive=True,
+ )
+ with gr.Row():
+ custom_pretrained = gr.Checkbox(
+ label=i18n("Custom Pretrained"),
+ info=i18n(
+ "Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance."
+ ),
+ value=False,
+ interactive=True,
+ )
+ overtraining_detector = gr.Checkbox(
+ label=i18n("Overtraining Detector"),
+ info=i18n(
+ "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data."
+ ),
+ value=False,
+ interactive=True,
+ )
+ with gr.Row():
+ with gr.Column(visible=False) as pretrained_custom_settings:
+ with gr.Accordion(i18n("Pretrained Custom Settings")):
+ upload_pretrained = gr.File(
+ label=i18n("Upload Pretrained Model"),
+ type="filepath",
+ interactive=True,
+ )
+ refresh_custom_pretaineds_button = gr.Button(
+ i18n("Refresh Custom Pretraineds")
+ )
+ g_pretrained_path = gr.Dropdown(
+ label=i18n("Custom Pretrained G"),
+ info=i18n(
+ "Select the custom pretrained model for the generator."
+ ),
+ choices=sorted(pretraineds_list_g),
+ interactive=True,
+ allow_custom_value=True,
+ )
+ d_pretrained_path = gr.Dropdown(
+ label=i18n("Custom Pretrained D"),
+ info=i18n(
+ "Select the custom pretrained model for the discriminator."
+ ),
+ choices=sorted(pretraineds_list_d),
+ interactive=True,
+ allow_custom_value=True,
+ )
+
+ with gr.Column(visible=False) as overtraining_settings:
+ with gr.Accordion(i18n("Overtraining Detector Settings")):
+ overtraining_threshold = gr.Slider(
+ 1,
+ 100,
+ 50,
+ step=1,
+ label=i18n("Overtraining Threshold"),
+ info=i18n(
+ "Set the maximum number of epochs you want your model to stop training if no improvement is detected."
+ ),
+ interactive=True,
+ )
+ index_algorithm = gr.Radio(
+ label=i18n("Index Algorithm"),
+ info=i18n(
+ "KMeans is a clustering algorithm that divides the dataset into K clusters. This setting is particularly useful for large datasets."
+ ),
+ choices=["Auto", "Faiss", "KMeans"],
+ value="Auto",
+ interactive=True,
+ )
+
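+    # Gate training behind the Terms of Use checkbox: the remaining arguments are forwarded to run_train_script only once the terms are accepted.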
+ def enforce_terms(terms_accepted, *args):
+ if not terms_accepted:
+ message = "You must agree to the Terms of Use to proceed."
+ gr.Info(message)
+ return message
+ return run_train_script(*args)
+
+ terms_checkbox = gr.Checkbox(
+ label=i18n("I agree to the terms of use"),
+ info=i18n(
+ "Please ensure compliance with the terms and conditions detailed in [this document](https://github.com/IAHispano/Applio/blob/main/TERMS_OF_USE.md) before proceeding with your training."
+ ),
+ value=False,
+ interactive=True,
+ )
+ train_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ max_lines=8,
+ interactive=False,
+ )
+
+ with gr.Row():
+ train_button = gr.Button(i18n("Start Training"))
+ train_button.click(
+ fn=enforce_terms,
+ inputs=[
+ terms_checkbox,
+ model_name,
+ save_every_epoch,
+ save_only_latest,
+ save_every_weights,
+ total_epoch,
+ sampling_rate,
+ batch_size,
+ gpu,
+ overtraining_detector,
+ overtraining_threshold,
+ pretrained,
+ cleanup,
+ index_algorithm,
+ cache_dataset_in_gpu,
+ custom_pretrained,
+ g_pretrained_path,
+ d_pretrained_path,
+ vocoder,
+ checkpointing,
+ ],
+ outputs=[train_output_info],
+ )
+
+ stop_train_button = gr.Button(i18n("Stop Training"), visible=False)
+ stop_train_button.click(
+ fn=stop_train,
+ inputs=[model_name],
+ outputs=[],
+ )
+
+ index_button = gr.Button(i18n("Generate Index"))
+ index_button.click(
+ fn=run_index_script,
+ inputs=[model_name, index_algorithm],
+ outputs=[train_output_info],
+ )
+
+ # Export Model section
+ with gr.Accordion(i18n("Export Model"), open=False):
+        if os.name != "nt":
+ gr.Markdown(
+ i18n(
+ "The button 'Upload' is only for google colab: Uploads the exported files to the ApplioExported folder in your Google Drive."
+ )
+ )
+ with gr.Row():
+ with gr.Column():
+ pth_file_export = gr.File(
+ label=i18n("Exported Pth file"),
+ type="filepath",
+ value=None,
+ interactive=False,
+ )
+ pth_dropdown_export = gr.Dropdown(
+ label=i18n("Pth file"),
+ info=i18n("Select the pth file to be exported"),
+ choices=get_pth_list(),
+ value=None,
+ interactive=True,
+ allow_custom_value=True,
+ )
+ with gr.Column():
+ index_file_export = gr.File(
+ label=i18n("Exported Index File"),
+ type="filepath",
+ value=None,
+ interactive=False,
+ )
+ index_dropdown_export = gr.Dropdown(
+ label=i18n("Index File"),
+ info=i18n("Select the index file to be exported"),
+ choices=get_index_list(),
+ value=None,
+ interactive=True,
+ allow_custom_value=True,
+ )
+ with gr.Row():
+ with gr.Column():
+ refresh_export = gr.Button(i18n("Refresh"))
+                if os.name != "nt":
+ upload_exported = gr.Button(i18n("Upload"))
+ upload_exported.click(
+ fn=upload_to_google_drive,
+ inputs=[pth_dropdown_export, index_dropdown_export],
+ outputs=[],
+ )
+
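+    # UI callbacks: each returns a Gradio update dictionary that toggles the visibility or interactivity of the related components.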
+ def toggle_visible(checkbox):
+ return {"visible": checkbox, "__type__": "update"}
+
+ def toggle_visible_hop_length(f0_method):
+ if f0_method == "crepe" or f0_method == "crepe-tiny":
+ return {"visible": True, "__type__": "update"}
+ return {"visible": False, "__type__": "update"}
+
+ def toggle_pretrained(pretrained, custom_pretrained):
+        if not custom_pretrained:
+ return {"visible": pretrained, "__type__": "update"}, {
+ "visible": False,
+ "__type__": "update",
+ }
+ else:
+ return {"visible": pretrained, "__type__": "update"}, {
+ "visible": pretrained,
+ "__type__": "update",
+ }
+
+ def enable_stop_train_button():
+ return {"visible": False, "__type__": "update"}, {
+ "visible": True,
+ "__type__": "update",
+ }
+
+ def disable_stop_train_button():
+ return {"visible": True, "__type__": "update"}, {
+ "visible": False,
+ "__type__": "update",
+ }
+
+ def download_prerequisites():
+ gr.Info(
+ "Checking for prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped."
+ )
+ run_prerequisites_script(
+ pretraineds_hifigan=True,
+ models=False,
+ exe=False,
+ )
+ gr.Info(
+ "Prerequisites check complete. Missing files were downloaded, and you may now start preprocessing."
+ )
+
+ def toggle_visible_embedder_custom(embedder_model):
+ if embedder_model == "custom":
+ return {"visible": True, "__type__": "update"}
+ return {"visible": False, "__type__": "update"}
+
+ def toggle_architecture(architecture):
+ if architecture == "Applio":
+ return {
+ "choices": ["32000", "40000", "44100", "48000"],
+ "__type__": "update",
+ }, {
+ "interactive": True,
+ "__type__": "update",
+ }
+ else:
+ return {
+ "choices": ["32000", "40000", "48000"],
+ "__type__": "update",
+ "value": "40000",
+ }, {"interactive": False, "__type__": "update", "value": "HiFi-GAN"}
+
+ def update_slider_visibility(noise_reduction):
+ return gr.update(visible=noise_reduction)
+
+ noise_reduction.change(
+ fn=update_slider_visibility,
+ inputs=noise_reduction,
+ outputs=clean_strength,
+ )
+ architecture.change(
+ fn=toggle_architecture,
+ inputs=[architecture],
+ outputs=[sampling_rate, vocoder],
+ )
+ refresh.click(
+ fn=refresh_models_and_datasets,
+ inputs=[],
+ outputs=[model_name, dataset_path],
+ )
+ dataset_creator.change(
+ fn=toggle_visible,
+ inputs=[dataset_creator],
+ outputs=[dataset_creator_settings],
+ )
+ upload_audio_dataset.upload(
+ fn=save_drop_dataset_audio,
+ inputs=[upload_audio_dataset, dataset_name],
+ outputs=[upload_audio_dataset, dataset_path],
+ )
+ f0_method.change(
+ fn=toggle_visible_hop_length,
+ inputs=[f0_method],
+ outputs=[hop_length],
+ )
+ embedder_model.change(
+ fn=toggle_visible_embedder_custom,
+ inputs=[embedder_model],
+ outputs=[embedder_custom],
+ )
+ move_files_button.click(
+ fn=create_folder_and_move_files,
+ inputs=[folder_name_input, bin_file_upload, config_file_upload],
+ outputs=[],
+ )
+ refresh_embedders_button.click(
+ fn=refresh_embedders_folders, inputs=[], outputs=[embedder_model_custom]
+ )
+ pretrained.change(
+ fn=toggle_pretrained,
+ inputs=[pretrained, custom_pretrained],
+ outputs=[custom_pretrained, pretrained_custom_settings],
+ )
+ custom_pretrained.change(
+ fn=toggle_visible,
+ inputs=[custom_pretrained],
+ outputs=[pretrained_custom_settings],
+ )
+ refresh_custom_pretaineds_button.click(
+ fn=refresh_custom_pretraineds,
+ inputs=[],
+ outputs=[g_pretrained_path, d_pretrained_path],
+ )
+ upload_pretrained.upload(
+ fn=save_drop_model,
+ inputs=[upload_pretrained],
+ outputs=[upload_pretrained],
+ )
+ overtraining_detector.change(
+ fn=toggle_visible,
+ inputs=[overtraining_detector],
+ outputs=[overtraining_settings],
+ )
+ train_button.click(
+ fn=enable_stop_train_button,
+ inputs=[],
+ outputs=[train_button, stop_train_button],
+ )
+ train_output_info.change(
+ fn=disable_stop_train_button,
+ inputs=[],
+ outputs=[train_button, stop_train_button],
+ )
+ pth_dropdown_export.change(
+ fn=export_pth,
+ inputs=[pth_dropdown_export],
+ outputs=[pth_file_export],
+ )
+ index_dropdown_export.change(
+ fn=export_index,
+ inputs=[index_dropdown_export],
+ outputs=[index_file_export],
+ )
+ refresh_export.click(
+ fn=refresh_pth_and_index_list,
+ inputs=[],
+ outputs=[pth_dropdown_export, index_dropdown_export],
+ )
diff --git a/tabs/tts/tts.py b/tabs/tts/tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..f700fbbc6a07b54169e2438ce162d90f34618da6
--- /dev/null
+++ b/tabs/tts/tts.py
@@ -0,0 +1,432 @@
+import json
+import os
+import random
+import sys
+
+import gradio as gr
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from assets.i18n.i18n import I18nAuto
+from core import run_tts_script
+from tabs.inference.inference import (
+ change_choices,
+ create_folder_and_move_files,
+ get_indexes,
+ get_speakers_id,
+ match_index,
+ refresh_embedders_folders,
+ extract_model_and_epoch,
+ names,
+ default_weight,
+)
+
+i18n = I18nAuto()
+
+
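+# Load the catalogue of EdgeTTS voices bundled with the project (rvc/lib/tools/tts_voices.json).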
+with open(
+ os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r", encoding="utf-8"
+) as file:
+ tts_voices_data = json.load(file)
+
+short_names = [voice.get("ShortName", "") for voice in tts_voices_data]
+
+
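+# Validate that the uploaded text file is readable as UTF-8 before handing its path to the TTS pipeline.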
+def process_input(file_path):
+ try:
+ with open(file_path, "r", encoding="utf-8") as file:
+ file.read()
+ gr.Info(f"The file has been loaded!")
+ return file_path, file_path
+ except UnicodeDecodeError:
+ gr.Info(f"The file has to be in UTF-8 encoding.")
+ return None, None
+
+
+# TTS tab
+def tts_tab():
+ with gr.Column():
+ with gr.Row():
+ model_file = gr.Dropdown(
+ label=i18n("Voice Model"),
+ info=i18n("Select the voice model to use for the conversion."),
+ choices=sorted(names, key=lambda x: extract_model_and_epoch(x)),
+ interactive=True,
+ value=default_weight,
+ allow_custom_value=True,
+ )
+ best_default_index_path = match_index(model_file.value)
+ index_file = gr.Dropdown(
+ label=i18n("Index File"),
+ info=i18n("Select the index file to use for the conversion."),
+ choices=get_indexes(),
+ value=best_default_index_path,
+ interactive=True,
+ allow_custom_value=True,
+ )
+ with gr.Row():
+ unload_button = gr.Button(i18n("Unload Voice"))
+ refresh_button = gr.Button(i18n("Refresh"))
+
+ unload_button.click(
+ fn=lambda: (
+ {"value": "", "__type__": "update"},
+ {"value": "", "__type__": "update"},
+ ),
+ inputs=[],
+ outputs=[model_file, index_file],
+ )
+
+ model_file.select(
+ fn=lambda model_file_value: match_index(model_file_value),
+ inputs=[model_file],
+ outputs=[index_file],
+ )
+
+ gr.Markdown(
+ i18n(
+ f"Applio is a Speech-to-Speech conversion software, utilizing EdgeTTS as middleware for running the Text-to-Speech (TTS) component. Read more about it [here!](https://docs.applio.org/applio/getting-started/tts)"
+ )
+ )
+ tts_voice = gr.Dropdown(
+ label=i18n("TTS Voices"),
+ info=i18n("Select the TTS voice to use for the conversion."),
+ choices=short_names,
+ interactive=True,
+ value=random.choice(short_names),
+ )
+
+ tts_rate = gr.Slider(
+ minimum=-100,
+ maximum=100,
+ step=1,
+ label=i18n("TTS Speed"),
+ info=i18n("Increase or decrease TTS speed."),
+ value=0,
+ interactive=True,
+ )
+
+ with gr.Tabs():
+ with gr.Tab(label="Text to Speech"):
+ tts_text = gr.Textbox(
+ label=i18n("Text to Synthesize"),
+ info=i18n("Enter the text to synthesize."),
+ placeholder=i18n("Enter text to synthesize"),
+ lines=3,
+ )
+ with gr.Tab(label="File to Speech"):
+ txt_file = gr.File(
+ label=i18n("Upload a .txt file"),
+ type="filepath",
+ )
+ input_tts_path = gr.Textbox(
+ label=i18n("Input path for text file"),
+ placeholder=i18n(
+ "The path to the text file that contains content for text to speech."
+ ),
+ value="",
+ interactive=True,
+ )
+
+ with gr.Accordion(i18n("Advanced Settings"), open=False):
+ with gr.Column():
+ output_tts_path = gr.Textbox(
+ label=i18n("Output Path for TTS Audio"),
+ placeholder=i18n("Enter output path"),
+ value=os.path.join(now_dir, "assets", "audios", "tts_output.wav"),
+ interactive=True,
+ )
+ output_rvc_path = gr.Textbox(
+ label=i18n("Output Path for RVC Audio"),
+ placeholder=i18n("Enter output path"),
+ value=os.path.join(now_dir, "assets", "audios", "tts_rvc_output.wav"),
+ interactive=True,
+ )
+ export_format = gr.Radio(
+ label=i18n("Export Format"),
+ info=i18n("Select the format to export the audio."),
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+ value="WAV",
+ interactive=True,
+ )
+ sid = gr.Dropdown(
+ label=i18n("Speaker ID"),
+ info=i18n("Select the speaker ID to use for the conversion."),
+ choices=get_speakers_id(model_file.value),
+ value=0,
+ interactive=True,
+ )
+ split_audio = gr.Checkbox(
+ label=i18n("Split Audio"),
+ info=i18n(
+ "Split the audio into chunks for inference to obtain better results in some cases."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ autotune = gr.Checkbox(
+ label=i18n("Autotune"),
+ info=i18n(
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
+ ),
+ visible=True,
+ value=False,
+ interactive=True,
+ )
+ autotune_strength = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Autotune Strength"),
+ info=i18n(
+ "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
+ ),
+ visible=False,
+ value=1,
+ interactive=True,
+ )
+ clean_audio = gr.Checkbox(
+ label=i18n("Clean Audio"),
+ info=i18n(
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
+ ),
+ visible=True,
+ value=True,
+ interactive=True,
+ )
+ clean_strength = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Clean Strength"),
+ info=i18n(
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
+ ),
+ visible=True,
+ value=0.5,
+ interactive=True,
+ )
+ pitch = gr.Slider(
+ minimum=-24,
+ maximum=24,
+ step=1,
+ label=i18n("Pitch"),
+ info=i18n(
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
+ ),
+ value=0,
+ interactive=True,
+ )
+ filter_radius = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n("Filter Radius"),
+ info=i18n(
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
+ ),
+ value=3,
+ step=1,
+ interactive=True,
+ )
+ index_rate = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Search Feature Ratio"),
+ info=i18n(
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
+ ),
+ value=0.75,
+ interactive=True,
+ )
+ rms_mix_rate = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Volume Envelope"),
+ info=i18n(
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
+ ),
+ value=1,
+ interactive=True,
+ )
+ protect = gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label=i18n("Protect Voiceless Consonants"),
+ info=i18n(
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
+ ),
+ value=0.5,
+ interactive=True,
+ )
+ hop_length = gr.Slider(
+ minimum=1,
+ maximum=512,
+ step=1,
+ label=i18n("Hop Length"),
+ info=i18n(
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
+ ),
+ value=128,
+ interactive=True,
+ )
+ f0_method = gr.Radio(
+ label=i18n("Pitch extraction algorithm"),
+ info=i18n(
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
+ ),
+ choices=[
+ "crepe",
+ "crepe-tiny",
+ "rmvpe",
+ "fcpe",
+ "hybrid[rmvpe+fcpe]",
+ ],
+ value="rmvpe",
+ interactive=True,
+ )
+ embedder_model = gr.Radio(
+ label=i18n("Embedder Model"),
+ info=i18n("Model used for learning speaker embedding."),
+ choices=[
+ "contentvec",
+ "chinese-hubert-base",
+ "japanese-hubert-base",
+ "korean-hubert-base",
+ "custom",
+ ],
+ value="contentvec",
+ interactive=True,
+ )
+ with gr.Column(visible=False) as embedder_custom:
+ with gr.Accordion(i18n("Custom Embedder"), open=True):
+ with gr.Row():
+ embedder_model_custom = gr.Dropdown(
+ label=i18n("Select Custom Embedder"),
+ choices=refresh_embedders_folders(),
+ interactive=True,
+ allow_custom_value=True,
+ )
+ refresh_embedders_button = gr.Button(i18n("Refresh embedders"))
+ folder_name_input = gr.Textbox(
+ label=i18n("Folder Name"), interactive=True
+ )
+ with gr.Row():
+ bin_file_upload = gr.File(
+ label=i18n("Upload .bin"),
+ type="filepath",
+ interactive=True,
+ )
+ config_file_upload = gr.File(
+ label=i18n("Upload .json"),
+ type="filepath",
+ interactive=True,
+ )
+ move_files_button = gr.Button(
+ i18n("Move files to custom embedder folder")
+ )
+ f0_file = gr.File(
+ label=i18n(
+ "The f0 curve represents the variations in the base frequency of a voice over time, showing how pitch rises and falls."
+ ),
+ visible=True,
+ )
+
+ def enforce_terms(terms_accepted, *args):
+ if not terms_accepted:
+ message = "You must agree to the Terms of Use to proceed."
+ gr.Info(message)
+ return message, None
+ return run_tts_script(*args)
+
+ terms_checkbox = gr.Checkbox(
+ label=i18n("I agree to the terms of use"),
+ info=i18n(
+ "Please ensure compliance with the terms and conditions detailed in [this document](https://github.com/IAHispano/Applio/blob/main/TERMS_OF_USE.md) before proceeding with your inference."
+ ),
+ value=False,
+ interactive=True,
+ )
+ convert_button = gr.Button(i18n("Convert"))
+
+ with gr.Row():
+ vc_output1 = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ )
+ vc_output2 = gr.Audio(label=i18n("Export Audio"))
+
+ def toggle_visible(checkbox):
+ return {"visible": checkbox, "__type__": "update"}
+
+ def toggle_visible_embedder_custom(embedder_model):
+ if embedder_model == "custom":
+ return {"visible": True, "__type__": "update"}
+ return {"visible": False, "__type__": "update"}
+
+ autotune.change(
+ fn=toggle_visible,
+ inputs=[autotune],
+ outputs=[autotune_strength],
+ )
+ clean_audio.change(
+ fn=toggle_visible,
+ inputs=[clean_audio],
+ outputs=[clean_strength],
+ )
+ refresh_button.click(
+ fn=change_choices,
+ inputs=[model_file],
+ outputs=[model_file, index_file, sid, sid],
+ )
+ txt_file.upload(
+ fn=process_input,
+ inputs=[txt_file],
+ outputs=[input_tts_path, txt_file],
+ )
+ embedder_model.change(
+ fn=toggle_visible_embedder_custom,
+ inputs=[embedder_model],
+ outputs=[embedder_custom],
+ )
+ move_files_button.click(
+ fn=create_folder_and_move_files,
+ inputs=[folder_name_input, bin_file_upload, config_file_upload],
+ outputs=[],
+ )
+ refresh_embedders_button.click(
+ fn=lambda: gr.update(choices=refresh_embedders_folders()),
+ inputs=[],
+ outputs=[embedder_model_custom],
+ )
+ convert_button.click(
+ fn=enforce_terms,
+ inputs=[
+ terms_checkbox,
+ input_tts_path,
+ tts_text,
+ tts_voice,
+ tts_rate,
+ pitch,
+ filter_radius,
+ index_rate,
+ rms_mix_rate,
+ protect,
+ hop_length,
+ f0_method,
+ output_tts_path,
+ output_rvc_path,
+ model_file,
+ index_file,
+ split_audio,
+ autotune,
+ autotune_strength,
+ clean_audio,
+ clean_strength,
+ export_format,
+ f0_file,
+ embedder_model,
+ embedder_model_custom,
+ sid,
+ ],
+ outputs=[vc_output1, vc_output2],
+ )
diff --git a/tabs/voice_blender/voice_blender.py b/tabs/voice_blender/voice_blender.py
new file mode 100644
index 0000000000000000000000000000000000000000..954332a803b0de6fdffbc87e58466f3adbefc70f
--- /dev/null
+++ b/tabs/voice_blender/voice_blender.py
@@ -0,0 +1,98 @@
+import os, sys
+import gradio as gr
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from assets.i18n.i18n import I18nAuto
+from core import run_model_blender_script
+
+i18n = I18nAuto()
+
+
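+# When a model file is dropped, copy its path into the corresponding textbox and clear the file widget.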
+def update_model_fusion(dropbox):
+ return dropbox, None
+
+
+def voice_blender_tab():
+ gr.Markdown(i18n("## Voice Blender"))
+ gr.Markdown(
+ i18n(
+ "Select two voice models, set your desired blend percentage, and blend them into an entirely new voice."
+ )
+ )
+ with gr.Column():
+ model_fusion_name = gr.Textbox(
+ label=i18n("Model Name"),
+ info=i18n("Name of the new model."),
+ value="",
+ max_lines=1,
+ interactive=True,
+ placeholder=i18n("Enter model name"),
+ )
+ with gr.Row():
+ with gr.Column():
+ model_fusion_a_dropbox = gr.File(
+ label=i18n("Drag and drop your model here"), type="filepath"
+ )
+ model_fusion_a = gr.Textbox(
+ label=i18n("Path to Model"),
+ value="",
+ interactive=True,
+ placeholder=i18n("Enter path to model"),
+ info=i18n("You can also use a custom path."),
+ )
+ with gr.Column():
+ model_fusion_b_dropbox = gr.File(
+ label=i18n("Drag and drop your model here"), type="filepath"
+ )
+ model_fusion_b = gr.Textbox(
+ label=i18n("Path to Model"),
+ value="",
+ interactive=True,
+ placeholder=i18n("Enter path to model"),
+ info=i18n("You can also use a custom path."),
+ )
+ alpha_a = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("Blend Ratio"),
+ value=0.5,
+ interactive=True,
+ info=i18n(
+ "Adjusting the position more towards one side or the other will make the model more similar to the first or second."
+ ),
+ )
+ model_fusion_button = gr.Button(i18n("Fusion"))
+ with gr.Row():
+ model_fusion_output_info = gr.Textbox(
+ label=i18n("Output Information"),
+ info=i18n("The output information will be displayed here."),
+ value="",
+ )
+ model_fusion_pth_output = gr.File(
+ label=i18n("Download Model"), type="filepath", interactive=False
+ )
+
+ model_fusion_button.click(
+ fn=run_model_blender_script,
+ inputs=[
+ model_fusion_name,
+ model_fusion_a,
+ model_fusion_b,
+ alpha_a,
+ ],
+ outputs=[model_fusion_output_info, model_fusion_pth_output],
+ )
+
+ model_fusion_a_dropbox.upload(
+ fn=update_model_fusion,
+ inputs=model_fusion_a_dropbox,
+ outputs=[model_fusion_a, model_fusion_a_dropbox],
+ )
+
+ model_fusion_b_dropbox.upload(
+ fn=update_model_fusion,
+ inputs=model_fusion_b_dropbox,
+ outputs=[model_fusion_b, model_fusion_b_dropbox],
+ )