kiuuiro commited on
Commit
e08ed7a
·
verified ·
1 Parent(s): ebe5cbc

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +150 -0
  2. core.py +2432 -0
app.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import sys
3
+ import os
4
+ import logging
5
+
6
+ from typing import Any
7
+
8
+ DEFAULT_SERVER_NAME = "127.0.0.1"
9
+ DEFAULT_PORT = 6969
10
+ MAX_PORT_ATTEMPTS = 10
11
+
12
+ # Set up logging
13
+ logging.getLogger("uvicorn").setLevel(logging.WARNING)
14
+ logging.getLogger("httpx").setLevel(logging.WARNING)
15
+
16
+ # Add current directory to sys.path
17
+ now_dir = os.getcwd()
18
+ sys.path.append(now_dir)
19
+
20
+ # Zluda hijack
21
+ import rvc.lib.zluda
22
+
23
+ # Import Tabs
24
+ from tabs.inference.inference import inference_tab
25
+ from tabs.train.train import train_tab
26
+ from tabs.extra.extra import extra_tab
27
+ from tabs.report.report import report_tab
28
+ from tabs.download.download import download_tab
29
+ from tabs.tts.tts import tts_tab
30
+ from tabs.voice_blender.voice_blender import voice_blender_tab
31
+ from tabs.plugins.plugins import plugins_tab
32
+ from tabs.settings.settings import settings_tab
33
+
34
+ # Run prerequisites
35
+ from core import run_prerequisites_script
36
+
37
+ run_prerequisites_script(
38
+ pretraineds_hifigan=True,
39
+ models=True,
40
+ exe=True,
41
+ )
42
+
43
+ # Initialize i18n
44
+ from assets.i18n.i18n import I18nAuto
45
+
46
+ i18n = I18nAuto()
47
+
48
+ # Start Discord presence if enabled
49
+ from tabs.settings.sections.presence import load_config_presence
50
+
51
+ if load_config_presence():
52
+ from assets.discord_presence import RPCManager
53
+
54
+ RPCManager.start_presence()
55
+
56
+ # Check installation
57
+ import assets.installation_checker as installation_checker
58
+
59
+ installation_checker.check_installation()
60
+
61
+ # Load theme
62
+ import assets.themes.loadThemes as loadThemes
63
+
64
+ my_applio = loadThemes.load_theme() or "ParityError/Interstellar"
65
+
66
+ # Define Gradio interface
67
# Build the Gradio UI: a title and tagline, one tab per feature area, and a
# usage disclaimer at the bottom. The default Gradio footer is hidden via CSS.
with gr.Blocks(
    theme=my_applio, title="Applio", css="footer{display:none !important}"
) as Applio:
    gr.Markdown("# Applio")
    gr.Markdown(
        i18n(
            "A simple, high-quality voice conversion tool focused on ease of use and performance."
        )
    )
    gr.Markdown(
        i18n(
            "[Support](https://discord.gg/urxFjYmYYh) — [GitHub](https://github.com/IAHispano/Applio)"
        )
    )
    with gr.Tab(i18n("Inference")):
        inference_tab()

    with gr.Tab(i18n("Training")):
        train_tab()

    with gr.Tab(i18n("TTS")):
        tts_tab()

    with gr.Tab(i18n("Voice Blender")):
        voice_blender_tab()

    with gr.Tab(i18n("Plugins")):
        plugins_tab()

    with gr.Tab(i18n("Download")):
        download_tab()

    with gr.Tab(i18n("Report a Bug")):
        report_tab()

    with gr.Tab(i18n("Extra")):
        extra_tab()

    with gr.Tab(i18n("Settings")):
        settings_tab()

    # `color` (with a leading `#` on the hex value) is the valid CSS property;
    # the previous `text-color: a3a3a3` declaration is not valid CSS.
    gr.Markdown(
        """
    <div style="text-align: center; font-size: 0.9em; color: #a3a3a3;">
    By using Applio, you agree to comply with ethical and legal standards, respect intellectual property and privacy rights, avoid harmful or prohibited uses, and accept full responsibility for any outcomes, while Applio disclaims liability and reserves the right to amend these terms.
    </div>
    """
    )
115
+
116
+
117
def launch_gradio(server_name: str, server_port: int) -> None:
    """Launch the ``Applio`` Gradio app on the given host and port.

    The ``share`` and ``inbrowser`` launch options are enabled when
    ``--share`` / ``--open`` respectively appear anywhere in ``sys.argv``.
    """
    cli_arguments = sys.argv
    launch_options = {
        "favicon_path": "assets/ICON.ico",
        "share": "--share" in cli_arguments,
        "inbrowser": "--open" in cli_arguments,
        "server_name": server_name,
        "server_port": server_port,
    }
    Applio.launch(**launch_options)
125
+
126
+
127
def get_value_from_args(key: str, default: Any = None) -> Any:
    """Return the command-line token that follows ``key`` in ``sys.argv``.

    Args:
        key: The flag to look for (e.g. ``"--port"``).
        default: Value returned when ``key`` is absent, or when it is the
            last token and therefore has nothing after it.

    Returns:
        The token right after the first occurrence of ``key`` (always a
        string taken from ``sys.argv``), or ``default``.
    """
    try:
        key_position = sys.argv.index(key)
    except ValueError:
        # The flag was not passed at all.
        return default
    # Slicing never raises, so a trailing flag simply yields an empty list.
    following = sys.argv[key_position + 1 : key_position + 2]
    return following[0] if following else default
133
+
134
+
135
if __name__ == "__main__":
    # Host and port come from the CLI when given, otherwise from the defaults.
    port = int(get_value_from_args("--port", DEFAULT_PORT))
    server = get_value_from_args("--server-name", DEFAULT_SERVER_NAME)

    # Try up to MAX_PORT_ATTEMPTS ports, stepping down by one after each
    # OSError raised by the launch call; stop on success or any other error.
    attempts_left = MAX_PORT_ATTEMPTS
    while attempts_left > 0:
        attempts_left -= 1
        try:
            launch_gradio(server, port)
        except OSError:
            print(
                f"Failed to launch on port {port}, trying again on port {port - 1}..."
            )
            port -= 1
        except Exception as error:
            print(f"An error occurred launching Gradio: {error}")
            break
        else:
            break
core.py ADDED
@@ -0,0 +1,2432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import argparse
5
+ import subprocess
6
+ from functools import lru_cache
7
+ from distutils.util import strtobool
8
+
9
+ now_dir = os.getcwd()
10
+ sys.path.append(now_dir)
11
+
12
+ current_script_directory = os.path.dirname(os.path.realpath(__file__))
13
+ logs_path = os.path.join(current_script_directory, "logs")
14
+
15
+ from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
16
+ from rvc.train.process.model_blender import model_blender
17
+ from rvc.train.process.model_information import model_information
18
+ from rvc.lib.tools.analyzer import analyze_audio
19
+ from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
20
+ from rvc.lib.tools.model_download import model_download_pipeline
21
+
22
+ python = sys.executable
23
+
24
+
25
+ # Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
26
@lru_cache(maxsize=1)  # Cache only one result since the file is static
def load_voices_data():
    """Parse and return the contents of ``rvc/lib/tools/tts_voices.json``.

    The path is relative to the current working directory. The parsed
    result is cached, so the file is read only on the first call.
    """
    voices_file_path = os.path.join("rvc", "lib", "tools", "tts_voices.json")
    with open(voices_file_path, "r", encoding="utf-8") as voices_file:
        raw_json = voices_file.read()
    return json.loads(raw_json)
32
+
33
+
34
# Load the voice list once at import time and keep the distinct "ShortName"
# values (a set is used for de-duplication, so the list order is arbitrary).
voices_data = load_voices_data()
locales = list({voice_entry["ShortName"] for voice_entry in voices_data})
36
+
37
+
38
@lru_cache(maxsize=None)
def import_voice_converter():
    """Create the ``VoiceConverter`` instance on first use and reuse it afterwards.

    The import happens inside the function so ``rvc.infer.infer`` is only
    loaded when a converter is first requested; the cache then returns the
    same instance on every later call.
    """
    from rvc.infer import infer as infer_module

    return infer_module.VoiceConverter()
43
+
44
+
45
@lru_cache(maxsize=1)
def get_config():
    """Create the ``Config`` object on first use and return the cached one afterwards.

    ``rvc.configs.config`` is imported inside the function, so it is only
    loaded when the configuration is first requested.
    """
    from rvc.configs import config as config_module

    return config_module.Config()
50
+
51
+
52
+ # Infer
53
def run_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_path: str,
    output_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a single audio file by forwarding all settings to ``convert_audio``.

    Every argument is passed as a keyword argument to the cached voice
    converter's ``convert_audio`` method. Most keep their name; the
    exceptions are:

    * ``input_path`` -> ``audio_input_path``
    * ``output_path`` -> ``audio_output_path``
    * ``pth_path`` -> sent as both ``model_path`` and ``pth_path``
    * ``reverb_wet_gain`` / ``reverb_dry_gain`` -> ``reverb_wet_level`` /
      ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    Returns:
        A ``(message, path)`` tuple, where ``path`` is ``output_path`` with
        every ``".wav"`` replaced by ``"."`` plus the lower-cased
        ``export_format``.
    """
    kwargs = {
        "audio_input_path": input_path,
        "audio_output_path": output_path,
        "model_path": pth_path,
        "index_path": index_path,
        "pitch": pitch,
        "filter_radius": filter_radius,
        "index_rate": index_rate,
        "volume_envelope": volume_envelope,
        "protect": protect,
        "hop_length": hop_length,
        "f0_method": f0_method,
        # The model path is sent under both "model_path" and "pth_path".
        "pth_path": pth_path,
        "split_audio": split_audio,
        "f0_autotune": f0_autotune,
        "f0_autotune_strength": f0_autotune_strength,
        "clean_audio": clean_audio,
        "clean_strength": clean_strength,
        "export_format": export_format,
        "f0_file": f0_file,
        "embedder_model": embedder_model,
        "embedder_model_custom": embedder_model_custom,
        "post_process": post_process,
        "formant_shifting": formant_shifting,
        "formant_qfrency": formant_qfrency,
        "formant_timbre": formant_timbre,
        "reverb": reverb,
        "pitch_shift": pitch_shift,
        "limiter": limiter,
        "gain": gain,
        "distortion": distortion,
        "chorus": chorus,
        "bitcrush": bitcrush,
        "clipping": clipping,
        "compressor": compressor,
        "delay": delay,
        "reverb_room_size": reverb_room_size,
        "reverb_damping": reverb_damping,
        "reverb_wet_level": reverb_wet_gain,
        "reverb_dry_level": reverb_dry_gain,
        "reverb_width": reverb_width,
        "reverb_freeze_mode": reverb_freeze_mode,
        "pitch_shift_semitones": pitch_shift_semitones,
        "limiter_threshold": limiter_threshold,
        "limiter_release": limiter_release_time,
        "gain_db": gain_db,
        "distortion_gain": distortion_gain,
        "chorus_rate": chorus_rate,
        "chorus_depth": chorus_depth,
        "chorus_delay": chorus_center_delay,
        "chorus_feedback": chorus_feedback,
        "chorus_mix": chorus_mix,
        "bitcrush_bit_depth": bitcrush_bit_depth,
        "clipping_threshold": clipping_threshold,
        "compressor_threshold": compressor_threshold,
        "compressor_ratio": compressor_ratio,
        "compressor_attack": compressor_attack,
        "compressor_release": compressor_release,
        "delay_seconds": delay_seconds,
        "delay_feedback": delay_feedback,
        "delay_mix": delay_mix,
        "sid": sid,
    }
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(
        **kwargs,
    )
    return f"File {input_path} inferred successfully.", output_path.replace(
        ".wav", f".{export_format.lower()}"
    )
186
+
187
+
188
+ # Batch infer
189
def run_batch_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_folder: str,
    output_folder: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a folder of audio by forwarding all settings to ``convert_audio_batch``.

    Every argument is passed as a keyword argument to the cached voice
    converter's ``convert_audio_batch`` method. Most keep their name; the
    exceptions are:

    * ``input_folder`` -> ``audio_input_paths``
    * ``output_folder`` -> ``audio_output_path``
    * ``pth_path`` -> sent as both ``model_path`` and ``pth_path``
    * ``reverb_wet_gain`` / ``reverb_dry_gain`` -> ``reverb_wet_level`` /
      ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    Returns:
        A status message naming ``input_folder``.
    """
    kwargs = {
        "audio_input_paths": input_folder,
        "audio_output_path": output_folder,
        "model_path": pth_path,
        "index_path": index_path,
        "pitch": pitch,
        "filter_radius": filter_radius,
        "index_rate": index_rate,
        "volume_envelope": volume_envelope,
        "protect": protect,
        "hop_length": hop_length,
        "f0_method": f0_method,
        # The model path is sent under both "model_path" and "pth_path".
        "pth_path": pth_path,
        "split_audio": split_audio,
        "f0_autotune": f0_autotune,
        "f0_autotune_strength": f0_autotune_strength,
        "clean_audio": clean_audio,
        "clean_strength": clean_strength,
        "export_format": export_format,
        "f0_file": f0_file,
        "embedder_model": embedder_model,
        "embedder_model_custom": embedder_model_custom,
        "post_process": post_process,
        "formant_shifting": formant_shifting,
        "formant_qfrency": formant_qfrency,
        "formant_timbre": formant_timbre,
        "reverb": reverb,
        "pitch_shift": pitch_shift,
        "limiter": limiter,
        "gain": gain,
        "distortion": distortion,
        "chorus": chorus,
        "bitcrush": bitcrush,
        "clipping": clipping,
        "compressor": compressor,
        "delay": delay,
        "reverb_room_size": reverb_room_size,
        "reverb_damping": reverb_damping,
        "reverb_wet_level": reverb_wet_gain,
        "reverb_dry_level": reverb_dry_gain,
        "reverb_width": reverb_width,
        "reverb_freeze_mode": reverb_freeze_mode,
        "pitch_shift_semitones": pitch_shift_semitones,
        "limiter_threshold": limiter_threshold,
        "limiter_release": limiter_release_time,
        "gain_db": gain_db,
        "distortion_gain": distortion_gain,
        "chorus_rate": chorus_rate,
        "chorus_depth": chorus_depth,
        "chorus_delay": chorus_center_delay,
        "chorus_feedback": chorus_feedback,
        "chorus_mix": chorus_mix,
        "bitcrush_bit_depth": bitcrush_bit_depth,
        "clipping_threshold": clipping_threshold,
        "compressor_threshold": compressor_threshold,
        "compressor_ratio": compressor_ratio,
        "compressor_attack": compressor_attack,
        "compressor_release": compressor_release,
        "delay_seconds": delay_seconds,
        "delay_feedback": delay_feedback,
        "delay_mix": delay_mix,
        "sid": sid,
    }
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio_batch(
        **kwargs,
    )

    return f"Files from {input_folder} inferred successfully."
321
+
322
+
323
+ # TTS
324
def run_tts_script(
    tts_file: str,
    tts_text: str,
    tts_voice: str,
    tts_rate: int,
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    output_tts_path: str,
    output_rvc_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    sid: int = 0,
):
    """Run the TTS script, then convert its output with the voice converter.

    Steps:
      1. Delete any existing file at ``output_tts_path``.
      2. Run ``rvc/lib/tools/tts.py`` in a subprocess with the TTS
         arguments. Its exit status is not checked.
      3. Call ``convert_audio`` with ``output_tts_path`` as input and
         ``output_rvc_path`` as output. All formant / post-processing
         options are passed as ``None``.

    Returns:
        A ``(message, path)`` tuple, where ``path`` is ``output_rvc_path``
        with every ``".wav"`` replaced by ``"."`` plus the lower-cased
        ``export_format``.
    """
    # Remove a stale TTS output before the subprocess runs.
    if os.path.exists(output_tts_path):
        os.remove(output_tts_path)

    tts_arguments = (
        python,
        os.path.join("rvc", "lib", "tools", "tts.py"),
        tts_file,
        tts_text,
        tts_voice,
        tts_rate,
        output_tts_path,
    )
    # Every element, including the interpreter and script path, is stringified.
    command_tts = [str(argument) for argument in tts_arguments]
    subprocess.run(command_tts)

    converter = import_voice_converter()
    converter.convert_audio(
        pitch=pitch,
        filter_radius=filter_radius,
        index_rate=index_rate,
        volume_envelope=volume_envelope,
        protect=protect,
        hop_length=hop_length,
        f0_method=f0_method,
        audio_input_path=output_tts_path,
        audio_output_path=output_rvc_path,
        model_path=pth_path,
        index_path=index_path,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        f0_file=f0_file,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        sid=sid,
        formant_shifting=None,
        formant_qfrency=None,
        formant_timbre=None,
        post_process=None,
        reverb=None,
        pitch_shift=None,
        limiter=None,
        gain=None,
        distortion=None,
        chorus=None,
        bitcrush=None,
        clipping=None,
        compressor=None,
        delay=None,
        sliders=None,
    )

    exported_path = output_rvc_path.replace(".wav", f".{export_format.lower()}")
    return f"Text {tts_text} synthesized successfully.", exported_path
415
+
416
+
417
+ # Preprocess
418
def run_preprocess_script(
    model_name: str,
    dataset_path: str,
    sample_rate: int,
    cpu_cores: int,
    cut_preprocess: str,
    process_effects: bool,
    noise_reduction: bool,
    clean_strength: float,
    chunk_len: float,
    overlap_len: float,
):
    """Run ``rvc/train/preprocess/preprocess.py`` in a subprocess.

    The script receives, in order: the model's directory under
    ``logs_path``, then ``dataset_path``, ``sample_rate``, ``cpu_cores``,
    ``cut_preprocess``, ``process_effects``, ``noise_reduction``,
    ``clean_strength``, ``chunk_len`` and ``overlap_len``, each converted
    with ``str``. The subprocess exit status is not checked.

    Returns:
        A status message naming ``model_name``.
    """
    script_arguments = (
        os.path.join(logs_path, model_name),
        dataset_path,
        sample_rate,
        cpu_cores,
        cut_preprocess,
        process_effects,
        noise_reduction,
        clean_strength,
        chunk_len,
        overlap_len,
    )
    command = [python, os.path.join("rvc", "train", "preprocess", "preprocess.py")]
    command.extend(str(argument) for argument in script_arguments)
    subprocess.run(command)
    return f"Model {model_name} preprocessed successfully."
452
+
453
+
454
+ # Extract
455
def run_extract_script(
    model_name: str,
    f0_method: str,
    hop_length: int,
    cpu_cores: int,
    gpu: int,
    sample_rate: int,
    embedder_model: str,
    embedder_model_custom: str = None,
    include_mutes: int = 2,
):
    """Run ``rvc/train/extract/extract.py`` in a subprocess.

    The script receives, in order: the model's directory under
    ``logs_path``, then ``f0_method``, ``hop_length``, ``cpu_cores``,
    ``gpu``, ``sample_rate``, ``embedder_model``, ``embedder_model_custom``
    and ``include_mutes``, each converted with ``str`` (so a ``None`` custom
    embedder is passed as the text ``"None"``). The subprocess exit status
    is not checked.

    Returns:
        A status message naming ``model_name``.
    """
    script_arguments = (
        os.path.join(logs_path, model_name),
        f0_method,
        hop_length,
        cpu_cores,
        gpu,
        sample_rate,
        embedder_model,
        embedder_model_custom,
        include_mutes,
    )
    command_1 = [python, os.path.join("rvc", "train", "extract", "extract.py")]
    command_1.extend(str(argument) for argument in script_arguments)

    subprocess.run(command_1)

    return f"Model {model_name} extracted successfully."
492
+
493
+
494
+ # Train
495
def run_train_script(
    model_name: str,
    save_every_epoch: int,
    save_only_latest: bool,
    save_every_weights: bool,
    total_epoch: int,
    sample_rate: int,
    batch_size: int,
    gpu: int,
    overtraining_detector: bool,
    overtraining_threshold: int,
    pretrained: bool,
    cleanup: bool,
    index_algorithm: str = "Auto",
    cache_data_in_gpu: bool = False,
    custom_pretrained: bool = False,
    g_pretrained_path: str = None,
    d_pretrained_path: str = None,
    vocoder: str = "HiFi-GAN",
    checkpointing: bool = False,
):
    """Run ``rvc/train/train.py`` in a subprocess, then build the model's index.

    Pretrained G/D paths are chosen as follows:

    * ``pretrained`` falsy: both paths are empty strings.
    * ``pretrained`` truthy and ``custom_pretrained`` falsy: paths come
      from ``pretrained_selector(str(vocoder), int(sample_rate))``.
    * both truthy: ``g_pretrained_path`` / ``d_pretrained_path`` are used.

    After the training subprocess returns (its exit status is not
    checked), ``run_index_script(model_name, index_algorithm)`` is called.

    Raises:
        ValueError: If custom pretraineds are requested but either path is
            ``None``.

    Returns:
        A status message naming ``model_name``.
    """
    # Flags are tested by truthiness rather than compared to True/False.
    if pretrained:
        from rvc.lib.tools.pretrained_selector import pretrained_selector

        if custom_pretrained:
            if g_pretrained_path is None or d_pretrained_path is None:
                raise ValueError(
                    "Please provide the path to the pretrained G and D models."
                )
            pg, pd = g_pretrained_path, d_pretrained_path
        else:
            pg, pd = pretrained_selector(str(vocoder), int(sample_rate))
    else:
        pg, pd = "", ""

    train_script_path = os.path.join("rvc", "train", "train.py")
    command = [
        python,
        train_script_path,
        *map(
            str,
            [
                model_name,
                save_every_epoch,
                total_epoch,
                pg,
                pd,
                gpu,
                batch_size,
                sample_rate,
                save_only_latest,
                save_every_weights,
                cache_data_in_gpu,
                overtraining_detector,
                overtraining_threshold,
                cleanup,
                vocoder,
                checkpointing,
            ],
        ),
    ]
    subprocess.run(command)
    run_index_script(model_name, index_algorithm)
    return f"Model {model_name} trained successfully."
560
+
561
+
562
+ # Index
563
def run_index_script(model_name: str, index_algorithm: str):
    """Run ``rvc/train/process/extract_index.py`` in a subprocess.

    The script receives the model's directory under ``logs_path`` followed
    by ``index_algorithm``. The subprocess exit status is not checked.

    Returns:
        A status message naming ``model_name``.
    """
    model_logs_dir = os.path.join(logs_path, model_name)
    script_path = os.path.join("rvc", "train", "process", "extract_index.py")

    subprocess.run([python, script_path, model_logs_dir, index_algorithm])
    return f"Index file for {model_name} generated successfully."
574
+
575
+
576
+ # Model information
577
def run_model_information_script(pth_path: str):
    """Print and return the result of ``model_information`` for ``pth_path``.

    ``model_information`` is called once and its result is both printed and
    returned, rather than being computed separately for each use.
    """
    info = model_information(pth_path)
    print(info)
    return info
580
+
581
+
582
+ # Model blender
583
def run_model_blender_script(
    model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
):
    """Call ``model_blender`` and return its two results as a tuple.

    Returns:
        The ``(message, model_blended)`` pair produced by
        ``model_blender(model_name, pth_path_1, pth_path_2, ratio)``.
    """
    blend_message, blended_model = model_blender(
        model_name, pth_path_1, pth_path_2, ratio
    )
    return (blend_message, blended_model)
588
+
589
+
590
+ # Tensorboard
591
def run_tensorboard_script():
    """Invoke ``launch_tensorboard_pipeline``; its result is discarded."""
    launch_tensorboard_pipeline()
    return None
593
+
594
+
595
+ # Download
596
def run_download_script(model_link: str):
    """Pass ``model_link`` to ``model_download_pipeline`` and return a status message."""
    model_download_pipeline(model_link)
    # Plain literal: the message has no placeholders, so no f-string is needed.
    return "Model downloaded successfully."
599
+
600
+
601
+ # Prerequisites
602
def run_prerequisites_script(
    pretraineds_hifigan: bool,
    models: bool,
    exe: bool,
):
    """Forward the three flags positionally to ``prequisites_download_pipeline``.

    Returns:
        A fixed status message.
    """
    download_flags = (pretraineds_hifigan, models, exe)
    prequisites_download_pipeline(*download_flags)
    return "Prerequisites installed successfully."
613
+
614
+
615
+ # Audio analyzer
616
def run_audio_analyzer_script(
    input_path: str, save_plot_path: str = "logs/audio_analysis.png"
):
    """Run ``analyze_audio`` on ``input_path`` and print a short summary.

    Returns:
        The ``(audio_info, plot_path)`` pair returned by ``analyze_audio``.
    """
    audio_info, plot_path = analyze_audio(input_path, save_plot_path)
    info_line = f"Audio info of {input_path}: {audio_info}"
    status_line = (
        f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}"
    )
    # Both parts go through a single print call, so they share one line
    # separated by print's default space.
    print(info_line, status_line)
    return audio_info, plot_path
625
+
626
+
627
+ # Parse arguments
628
+ def parse_arguments():
629
+ parser = argparse.ArgumentParser(
630
+ description="Run the main.py script with specific parameters."
631
+ )
632
+ subparsers = parser.add_subparsers(
633
+ title="subcommands", dest="mode", help="Choose a mode"
634
+ )
635
+
636
+ # Parser for 'infer' mode
637
+ infer_parser = subparsers.add_parser("infer", help="Run inference")
638
+ pitch_description = (
639
+ "Set the pitch of the audio. Higher values result in a higher pitch."
640
+ )
641
+ infer_parser.add_argument(
642
+ "--pitch",
643
+ type=int,
644
+ help=pitch_description,
645
+ choices=range(-24, 25),
646
+ default=0,
647
+ )
648
+ filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio."
649
+ infer_parser.add_argument(
650
+ "--filter_radius",
651
+ type=int,
652
+ help=filter_radius_description,
653
+ choices=range(11),
654
+ default=3,
655
+ )
656
+ index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
657
+ infer_parser.add_argument(
658
+ "--index_rate",
659
+ type=float,
660
+ help=index_rate_description,
661
+ choices=[i / 100.0 for i in range(0, 101)],
662
+ default=0.3,
663
+ )
664
+ volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
665
+ infer_parser.add_argument(
666
+ "--volume_envelope",
667
+ type=float,
668
+ help=volume_envelope_description,
669
+ choices=[i / 100.0 for i in range(0, 101)],
670
+ default=1,
671
+ )
672
+ protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
673
+ infer_parser.add_argument(
674
+ "--protect",
675
+ type=float,
676
+ help=protect_description,
677
+ choices=[i / 1000.0 for i in range(0, 501)],
678
+ default=0.33,
679
+ )
680
+ hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy."
681
+ infer_parser.add_argument(
682
+ "--hop_length",
683
+ type=int,
684
+ help=hop_length_description,
685
+ choices=range(1, 513),
686
+ default=128,
687
+ )
688
+ f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
689
+ infer_parser.add_argument(
690
+ "--f0_method",
691
+ type=str,
692
+ help=f0_method_description,
693
+ choices=[
694
+ "crepe",
695
+ "crepe-tiny",
696
+ "rmvpe",
697
+ "fcpe",
698
+ "hybrid[crepe+rmvpe]",
699
+ "hybrid[crepe+fcpe]",
700
+ "hybrid[rmvpe+fcpe]",
701
+ "hybrid[crepe+rmvpe+fcpe]",
702
+ ],
703
+ default="rmvpe",
704
+ )
705
+ infer_parser.add_argument(
706
+ "--input_path",
707
+ type=str,
708
+ help="Full path to the input audio file.",
709
+ required=True,
710
+ )
711
+ infer_parser.add_argument(
712
+ "--output_path",
713
+ type=str,
714
+ help="Full path to the output audio file.",
715
+ required=True,
716
+ )
717
+ pth_path_description = "Full path to the RVC model file (.pth)."
718
+ infer_parser.add_argument(
719
+ "--pth_path", type=str, help=pth_path_description, required=True
720
+ )
721
+ index_path_description = "Full path to the index file (.index)."
722
+ infer_parser.add_argument(
723
+ "--index_path", type=str, help=index_path_description, required=True
724
+ )
725
+ split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
726
+ infer_parser.add_argument(
727
+ "--split_audio",
728
+ type=lambda x: bool(strtobool(x)),
729
+ choices=[True, False],
730
+ help=split_audio_description,
731
+ default=False,
732
+ )
733
+ f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
734
+ infer_parser.add_argument(
735
+ "--f0_autotune",
736
+ type=lambda x: bool(strtobool(x)),
737
+ choices=[True, False],
738
+ help=f0_autotune_description,
739
+ default=False,
740
+ )
741
+ f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
742
+ infer_parser.add_argument(
743
+ "--f0_autotune_strength",
744
+ type=float,
745
+ help=f0_autotune_strength_description,
746
+ choices=[(i / 10) for i in range(11)],
747
+ default=1.0,
748
+ )
749
+ clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
750
+ infer_parser.add_argument(
751
+ "--clean_audio",
752
+ type=lambda x: bool(strtobool(x)),
753
+ choices=[True, False],
754
+ help=clean_audio_description,
755
+ default=False,
756
+ )
757
+ clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
758
+ infer_parser.add_argument(
759
+ "--clean_strength",
760
+ type=float,
761
+ help=clean_strength_description,
762
+ choices=[(i / 10) for i in range(11)],
763
+ default=0.7,
764
+ )
765
+ export_format_description = "Select the desired output audio format."
766
+ infer_parser.add_argument(
767
+ "--export_format",
768
+ type=str,
769
+ help=export_format_description,
770
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
771
+ default="WAV",
772
+ )
773
+ embedder_model_description = (
774
+ "Choose the model used for generating speaker embeddings."
775
+ )
776
+ infer_parser.add_argument(
777
+ "--embedder_model",
778
+ type=str,
779
+ help=embedder_model_description,
780
+ choices=[
781
+ "contentvec",
782
+ "chinese-hubert-base",
783
+ "japanese-hubert-base",
784
+ "korean-hubert-base",
785
+ "custom",
786
+ ],
787
+ default="contentvec",
788
+ )
789
+ embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
790
+ infer_parser.add_argument(
791
+ "--embedder_model_custom",
792
+ type=str,
793
+ help=embedder_model_custom_description,
794
+ default=None,
795
+ )
796
+ f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
797
+ infer_parser.add_argument(
798
+ "--f0_file",
799
+ type=str,
800
+ help=f0_file_description,
801
+ default=None,
802
+ )
803
+ formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
804
+ infer_parser.add_argument(
805
+ "--formant_shifting",
806
+ type=lambda x: bool(strtobool(x)),
807
+ choices=[True, False],
808
+ help=formant_shifting_description,
809
+ default=False,
810
+ required=False,
811
+ )
812
+ formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
813
+ infer_parser.add_argument(
814
+ "--formant_qfrency",
815
+ type=float,
816
+ help=formant_qfrency_description,
817
+ default=1.0,
818
+ required=False,
819
+ )
820
+ formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
821
+ infer_parser.add_argument(
822
+ "--formant_timbre",
823
+ type=float,
824
+ help=formant_timbre_description,
825
+ default=1.0,
826
+ required=False,
827
+ )
828
+ sid_description = "Speaker ID for multi-speaker models."
829
+ infer_parser.add_argument(
830
+ "--sid",
831
+ type=int,
832
+ help=sid_description,
833
+ default=0,
834
+ required=False,
835
+ )
836
+ post_process_description = "Apply post-processing effects to the output audio."
837
+ infer_parser.add_argument(
838
+ "--post_process",
839
+ type=lambda x: bool(strtobool(x)),
840
+ choices=[True, False],
841
+ help=post_process_description,
842
+ default=False,
843
+ required=False,
844
+ )
845
+ reverb_description = "Apply reverb effect to the output audio."
846
+ infer_parser.add_argument(
847
+ "--reverb",
848
+ type=lambda x: bool(strtobool(x)),
849
+ choices=[True, False],
850
+ help=reverb_description,
851
+ default=False,
852
+ required=False,
853
+ )
854
+
855
+ pitch_shift_description = "Apply pitch shifting effect to the output audio."
856
+ infer_parser.add_argument(
857
+ "--pitch_shift",
858
+ type=lambda x: bool(strtobool(x)),
859
+ choices=[True, False],
860
+ help=pitch_shift_description,
861
+ default=False,
862
+ required=False,
863
+ )
864
+
865
+ limiter_description = "Apply limiter effect to the output audio."
866
+ infer_parser.add_argument(
867
+ "--limiter",
868
+ type=lambda x: bool(strtobool(x)),
869
+ choices=[True, False],
870
+ help=limiter_description,
871
+ default=False,
872
+ required=False,
873
+ )
874
+
875
+ gain_description = "Apply gain effect to the output audio."
876
+ infer_parser.add_argument(
877
+ "--gain",
878
+ type=lambda x: bool(strtobool(x)),
879
+ choices=[True, False],
880
+ help=gain_description,
881
+ default=False,
882
+ required=False,
883
+ )
884
+
885
+ distortion_description = "Apply distortion effect to the output audio."
886
+ infer_parser.add_argument(
887
+ "--distortion",
888
+ type=lambda x: bool(strtobool(x)),
889
+ choices=[True, False],
890
+ help=distortion_description,
891
+ default=False,
892
+ required=False,
893
+ )
894
+
895
+ chorus_description = "Apply chorus effect to the output audio."
896
+ infer_parser.add_argument(
897
+ "--chorus",
898
+ type=lambda x: bool(strtobool(x)),
899
+ choices=[True, False],
900
+ help=chorus_description,
901
+ default=False,
902
+ required=False,
903
+ )
904
+
905
+ bitcrush_description = "Apply bitcrush effect to the output audio."
906
+ infer_parser.add_argument(
907
+ "--bitcrush",
908
+ type=lambda x: bool(strtobool(x)),
909
+ choices=[True, False],
910
+ help=bitcrush_description,
911
+ default=False,
912
+ required=False,
913
+ )
914
+
915
+ clipping_description = "Apply clipping effect to the output audio."
916
+ infer_parser.add_argument(
917
+ "--clipping",
918
+ type=lambda x: bool(strtobool(x)),
919
+ choices=[True, False],
920
+ help=clipping_description,
921
+ default=False,
922
+ required=False,
923
+ )
924
+
925
+ compressor_description = "Apply compressor effect to the output audio."
926
+ infer_parser.add_argument(
927
+ "--compressor",
928
+ type=lambda x: bool(strtobool(x)),
929
+ choices=[True, False],
930
+ help=compressor_description,
931
+ default=False,
932
+ required=False,
933
+ )
934
+
935
+ delay_description = "Apply delay effect to the output audio."
936
+ infer_parser.add_argument(
937
+ "--delay",
938
+ type=lambda x: bool(strtobool(x)),
939
+ choices=[True, False],
940
+ help=delay_description,
941
+ default=False,
942
+ required=False,
943
+ )
944
+
945
+ reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
946
+ infer_parser.add_argument(
947
+ "--reverb_room_size",
948
+ type=float,
949
+ help=reverb_room_size_description,
950
+ default=0.5,
951
+ required=False,
952
+ )
953
+
954
+ reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
955
+ infer_parser.add_argument(
956
+ "--reverb_damping",
957
+ type=float,
958
+ help=reverb_damping_description,
959
+ default=0.5,
960
+ required=False,
961
+ )
962
+
963
+ reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
964
+ infer_parser.add_argument(
965
+ "--reverb_wet_gain",
966
+ type=float,
967
+ help=reverb_wet_gain_description,
968
+ default=0.5,
969
+ required=False,
970
+ )
971
+
972
+ reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
973
+ infer_parser.add_argument(
974
+ "--reverb_dry_gain",
975
+ type=float,
976
+ help=reverb_dry_gain_description,
977
+ default=0.5,
978
+ required=False,
979
+ )
980
+
981
+ reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
982
+ infer_parser.add_argument(
983
+ "--reverb_width",
984
+ type=float,
985
+ help=reverb_width_description,
986
+ default=0.5,
987
+ required=False,
988
+ )
989
+
990
+ reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
991
+ infer_parser.add_argument(
992
+ "--reverb_freeze_mode",
993
+ type=float,
994
+ help=reverb_freeze_mode_description,
995
+ default=0.5,
996
+ required=False,
997
+ )
998
+
999
+ pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
1000
+ infer_parser.add_argument(
1001
+ "--pitch_shift_semitones",
1002
+ type=float,
1003
+ help=pitch_shift_semitones_description,
1004
+ default=0.0,
1005
+ required=False,
1006
+ )
1007
+
1008
+ limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect."
1009
+ infer_parser.add_argument(
1010
+ "--limiter_threshold",
1011
+ type=float,
1012
+ help=limiter_threshold_description,
1013
+ default=-6,
1014
+ required=False,
1015
+ )
1016
+
1017
+ limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
1018
+ infer_parser.add_argument(
1019
+ "--limiter_release_time",
1020
+ type=float,
1021
+ help=limiter_release_time_description,
1022
+ default=0.01,
1023
+ required=False,
1024
+ )
1025
+
1026
+ gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
1027
+ infer_parser.add_argument(
1028
+ "--gain_db",
1029
+ type=float,
1030
+ help=gain_db_description,
1031
+ default=0.0,
1032
+ required=False,
1033
+ )
1034
+
1035
+ distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
1036
+ infer_parser.add_argument(
1037
+ "--distortion_gain",
1038
+ type=float,
1039
+ help=distortion_gain_description,
1040
+ default=25,
1041
+ required=False,
1042
+ )
1043
+
1044
+ chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
1045
+ infer_parser.add_argument(
1046
+ "--chorus_rate",
1047
+ type=float,
1048
+ help=chorus_rate_description,
1049
+ default=1.0,
1050
+ required=False,
1051
+ )
1052
+
1053
+ chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
1054
+ infer_parser.add_argument(
1055
+ "--chorus_depth",
1056
+ type=float,
1057
+ help=chorus_depth_description,
1058
+ default=0.25,
1059
+ required=False,
1060
+ )
1061
+
1062
+ chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
1063
+ infer_parser.add_argument(
1064
+ "--chorus_center_delay",
1065
+ type=float,
1066
+ help=chorus_center_delay_description,
1067
+ default=7,
1068
+ required=False,
1069
+ )
1070
+
1071
+ chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
1072
+ infer_parser.add_argument(
1073
+ "--chorus_feedback",
1074
+ type=float,
1075
+ help=chorus_feedback_description,
1076
+ default=0.0,
1077
+ required=False,
1078
+ )
1079
+
1080
+ chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
1081
+ infer_parser.add_argument(
1082
+ "--chorus_mix",
1083
+ type=float,
1084
+ help=chorus_mix_description,
1085
+ default=0.5,
1086
+ required=False,
1087
+ )
1088
+
1089
+ bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect."
1090
+ infer_parser.add_argument(
1091
+ "--bitcrush_bit_depth",
1092
+ type=int,
1093
+ help=bitcrush_bit_depth_description,
1094
+ default=8,
1095
+ required=False,
1096
+ )
1097
+
1098
+ clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect."
1099
+ infer_parser.add_argument(
1100
+ "--clipping_threshold",
1101
+ type=float,
1102
+ help=clipping_threshold_description,
1103
+ default=-6,
1104
+ required=False,
1105
+ )
1106
+
1107
+ compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect."
1108
+ infer_parser.add_argument(
1109
+ "--compressor_threshold",
1110
+ type=float,
1111
+ help=compressor_threshold_description,
1112
+ default=0,
1113
+ required=False,
1114
+ )
1115
+
1116
+ compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
1117
+ infer_parser.add_argument(
1118
+ "--compressor_ratio",
1119
+ type=float,
1120
+ help=compressor_ratio_description,
1121
+ default=1,
1122
+ required=False,
1123
+ )
1124
+
1125
+ compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect."
1126
+ infer_parser.add_argument(
1127
+ "--compressor_attack",
1128
+ type=float,
1129
+ help=compressor_attack_description,
1130
+ default=1.0,
1131
+ required=False,
1132
+ )
1133
+
1134
+ compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect."
1135
+ infer_parser.add_argument(
1136
+ "--compressor_release",
1137
+ type=float,
1138
+ help=compressor_release_description,
1139
+ default=100,
1140
+ required=False,
1141
+ )
1142
+
1143
+ delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
1144
+ infer_parser.add_argument(
1145
+ "--delay_seconds",
1146
+ type=float,
1147
+ help=delay_seconds_description,
1148
+ default=0.5,
1149
+ required=False,
1150
+ )
1151
+ delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
1152
+ infer_parser.add_argument(
1153
+ "--delay_feedback",
1154
+ type=float,
1155
+ help=delay_feedback_description,
1156
+ default=0.0,
1157
+ required=False,
1158
+ )
1159
+ delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
1160
+ infer_parser.add_argument(
1161
+ "--delay_mix",
1162
+ type=float,
1163
+ help=delay_mix_description,
1164
+ default=0.5,
1165
+ required=False,
1166
+ )
1167
+
1168
+ # Parser for 'batch_infer' mode
1169
+ batch_infer_parser = subparsers.add_parser(
1170
+ "batch_infer",
1171
+ help="Run batch inference",
1172
+ )
1173
+ batch_infer_parser.add_argument(
1174
+ "--pitch",
1175
+ type=int,
1176
+ help=pitch_description,
1177
+ choices=range(-24, 25),
1178
+ default=0,
1179
+ )
1180
+ batch_infer_parser.add_argument(
1181
+ "--filter_radius",
1182
+ type=int,
1183
+ help=filter_radius_description,
1184
+ choices=range(11),
1185
+ default=3,
1186
+ )
1187
+ batch_infer_parser.add_argument(
1188
+ "--index_rate",
1189
+ type=float,
1190
+ help=index_rate_description,
1191
+ choices=[i / 100.0 for i in range(0, 101)],
1192
+ default=0.3,
1193
+ )
1194
+ batch_infer_parser.add_argument(
1195
+ "--volume_envelope",
1196
+ type=float,
1197
+ help=volume_envelope_description,
1198
+ choices=[i / 100.0 for i in range(0, 101)],
1199
+ default=1,
1200
+ )
1201
+ batch_infer_parser.add_argument(
1202
+ "--protect",
1203
+ type=float,
1204
+ help=protect_description,
1205
+ choices=[i / 1000.0 for i in range(0, 501)],
1206
+ default=0.33,
1207
+ )
1208
+ batch_infer_parser.add_argument(
1209
+ "--hop_length",
1210
+ type=int,
1211
+ help=hop_length_description,
1212
+ choices=range(1, 513),
1213
+ default=128,
1214
+ )
1215
+ batch_infer_parser.add_argument(
1216
+ "--f0_method",
1217
+ type=str,
1218
+ help=f0_method_description,
1219
+ choices=[
1220
+ "crepe",
1221
+ "crepe-tiny",
1222
+ "rmvpe",
1223
+ "fcpe",
1224
+ "hybrid[crepe+rmvpe]",
1225
+ "hybrid[crepe+fcpe]",
1226
+ "hybrid[rmvpe+fcpe]",
1227
+ "hybrid[crepe+rmvpe+fcpe]",
1228
+ ],
1229
+ default="rmvpe",
1230
+ )
1231
+ batch_infer_parser.add_argument(
1232
+ "--input_folder",
1233
+ type=str,
1234
+ help="Path to the folder containing input audio files.",
1235
+ required=True,
1236
+ )
1237
+ batch_infer_parser.add_argument(
1238
+ "--output_folder",
1239
+ type=str,
1240
+ help="Path to the folder for saving output audio files.",
1241
+ required=True,
1242
+ )
1243
+ batch_infer_parser.add_argument(
1244
+ "--pth_path", type=str, help=pth_path_description, required=True
1245
+ )
1246
+ batch_infer_parser.add_argument(
1247
+ "--index_path", type=str, help=index_path_description, required=True
1248
+ )
1249
+ batch_infer_parser.add_argument(
1250
+ "--split_audio",
1251
+ type=lambda x: bool(strtobool(x)),
1252
+ choices=[True, False],
1253
+ help=split_audio_description,
1254
+ default=False,
1255
+ )
1256
+ batch_infer_parser.add_argument(
1257
+ "--f0_autotune",
1258
+ type=lambda x: bool(strtobool(x)),
1259
+ choices=[True, False],
1260
+ help=f0_autotune_description,
1261
+ default=False,
1262
+ )
1263
+ batch_infer_parser.add_argument(
1264
+ "--f0_autotune_strength",
1265
+ type=float,
1266
+ help=clean_strength_description,
1267
+ choices=[(i / 10) for i in range(11)],
1268
+ default=1.0,
1269
+ )
1270
+ batch_infer_parser.add_argument(
1271
+ "--clean_audio",
1272
+ type=lambda x: bool(strtobool(x)),
1273
+ choices=[True, False],
1274
+ help=clean_audio_description,
1275
+ default=False,
1276
+ )
1277
+ batch_infer_parser.add_argument(
1278
+ "--clean_strength",
1279
+ type=float,
1280
+ help=clean_strength_description,
1281
+ choices=[(i / 10) for i in range(11)],
1282
+ default=0.7,
1283
+ )
1284
+ batch_infer_parser.add_argument(
1285
+ "--export_format",
1286
+ type=str,
1287
+ help=export_format_description,
1288
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
1289
+ default="WAV",
1290
+ )
1291
+ batch_infer_parser.add_argument(
1292
+ "--embedder_model",
1293
+ type=str,
1294
+ help=embedder_model_description,
1295
+ choices=[
1296
+ "contentvec",
1297
+ "chinese-hubert-base",
1298
+ "japanese-hubert-base",
1299
+ "korean-hubert-base",
1300
+ "custom",
1301
+ ],
1302
+ default="contentvec",
1303
+ )
1304
+ batch_infer_parser.add_argument(
1305
+ "--embedder_model_custom",
1306
+ type=str,
1307
+ help=embedder_model_custom_description,
1308
+ default=None,
1309
+ )
1310
+ batch_infer_parser.add_argument(
1311
+ "--f0_file",
1312
+ type=str,
1313
+ help=f0_file_description,
1314
+ default=None,
1315
+ )
1316
+ batch_infer_parser.add_argument(
1317
+ "--formant_shifting",
1318
+ type=lambda x: bool(strtobool(x)),
1319
+ choices=[True, False],
1320
+ help=formant_shifting_description,
1321
+ default=False,
1322
+ required=False,
1323
+ )
1324
+ batch_infer_parser.add_argument(
1325
+ "--formant_qfrency",
1326
+ type=float,
1327
+ help=formant_qfrency_description,
1328
+ default=1.0,
1329
+ required=False,
1330
+ )
1331
+ batch_infer_parser.add_argument(
1332
+ "--formant_timbre",
1333
+ type=float,
1334
+ help=formant_timbre_description,
1335
+ default=1.0,
1336
+ required=False,
1337
+ )
1338
+ batch_infer_parser.add_argument(
1339
+ "--sid",
1340
+ type=int,
1341
+ help=sid_description,
1342
+ default=0,
1343
+ required=False,
1344
+ )
1345
+ batch_infer_parser.add_argument(
1346
+ "--post_process",
1347
+ type=lambda x: bool(strtobool(x)),
1348
+ choices=[True, False],
1349
+ help=post_process_description,
1350
+ default=False,
1351
+ required=False,
1352
+ )
1353
+ batch_infer_parser.add_argument(
1354
+ "--reverb",
1355
+ type=lambda x: bool(strtobool(x)),
1356
+ choices=[True, False],
1357
+ help=reverb_description,
1358
+ default=False,
1359
+ required=False,
1360
+ )
1361
+
1362
+ batch_infer_parser.add_argument(
1363
+ "--pitch_shift",
1364
+ type=lambda x: bool(strtobool(x)),
1365
+ choices=[True, False],
1366
+ help=pitch_shift_description,
1367
+ default=False,
1368
+ required=False,
1369
+ )
1370
+
1371
+ batch_infer_parser.add_argument(
1372
+ "--limiter",
1373
+ type=lambda x: bool(strtobool(x)),
1374
+ choices=[True, False],
1375
+ help=limiter_description,
1376
+ default=False,
1377
+ required=False,
1378
+ )
1379
+
1380
+ batch_infer_parser.add_argument(
1381
+ "--gain",
1382
+ type=lambda x: bool(strtobool(x)),
1383
+ choices=[True, False],
1384
+ help=gain_description,
1385
+ default=False,
1386
+ required=False,
1387
+ )
1388
+
1389
+ batch_infer_parser.add_argument(
1390
+ "--distortion",
1391
+ type=lambda x: bool(strtobool(x)),
1392
+ choices=[True, False],
1393
+ help=distortion_description,
1394
+ default=False,
1395
+ required=False,
1396
+ )
1397
+
1398
+ batch_infer_parser.add_argument(
1399
+ "--chorus",
1400
+ type=lambda x: bool(strtobool(x)),
1401
+ choices=[True, False],
1402
+ help=chorus_description,
1403
+ default=False,
1404
+ required=False,
1405
+ )
1406
+
1407
+ batch_infer_parser.add_argument(
1408
+ "--bitcrush",
1409
+ type=lambda x: bool(strtobool(x)),
1410
+ choices=[True, False],
1411
+ help=bitcrush_description,
1412
+ default=False,
1413
+ required=False,
1414
+ )
1415
+
1416
+ batch_infer_parser.add_argument(
1417
+ "--clipping",
1418
+ type=lambda x: bool(strtobool(x)),
1419
+ choices=[True, False],
1420
+ help=clipping_description,
1421
+ default=False,
1422
+ required=False,
1423
+ )
1424
+
1425
+ batch_infer_parser.add_argument(
1426
+ "--compressor",
1427
+ type=lambda x: bool(strtobool(x)),
1428
+ choices=[True, False],
1429
+ help=compressor_description,
1430
+ default=False,
1431
+ required=False,
1432
+ )
1433
+
1434
+ batch_infer_parser.add_argument(
1435
+ "--delay",
1436
+ type=lambda x: bool(strtobool(x)),
1437
+ choices=[True, False],
1438
+ help=delay_description,
1439
+ default=False,
1440
+ required=False,
1441
+ )
1442
+
1443
+ batch_infer_parser.add_argument(
1444
+ "--reverb_room_size",
1445
+ type=float,
1446
+ help=reverb_room_size_description,
1447
+ default=0.5,
1448
+ required=False,
1449
+ )
1450
+
1451
+ batch_infer_parser.add_argument(
1452
+ "--reverb_damping",
1453
+ type=float,
1454
+ help=reverb_damping_description,
1455
+ default=0.5,
1456
+ required=False,
1457
+ )
1458
+
1459
+ batch_infer_parser.add_argument(
1460
+ "--reverb_wet_gain",
1461
+ type=float,
1462
+ help=reverb_wet_gain_description,
1463
+ default=0.5,
1464
+ required=False,
1465
+ )
1466
+
1467
+ batch_infer_parser.add_argument(
1468
+ "--reverb_dry_gain",
1469
+ type=float,
1470
+ help=reverb_dry_gain_description,
1471
+ default=0.5,
1472
+ required=False,
1473
+ )
1474
+
1475
+ batch_infer_parser.add_argument(
1476
+ "--reverb_width",
1477
+ type=float,
1478
+ help=reverb_width_description,
1479
+ default=0.5,
1480
+ required=False,
1481
+ )
1482
+
1483
+ batch_infer_parser.add_argument(
1484
+ "--reverb_freeze_mode",
1485
+ type=float,
1486
+ help=reverb_freeze_mode_description,
1487
+ default=0.5,
1488
+ required=False,
1489
+ )
1490
+
1491
+ batch_infer_parser.add_argument(
1492
+ "--pitch_shift_semitones",
1493
+ type=float,
1494
+ help=pitch_shift_semitones_description,
1495
+ default=0.0,
1496
+ required=False,
1497
+ )
1498
+
1499
+ batch_infer_parser.add_argument(
1500
+ "--limiter_threshold",
1501
+ type=float,
1502
+ help=limiter_threshold_description,
1503
+ default=-6,
1504
+ required=False,
1505
+ )
1506
+
1507
+ batch_infer_parser.add_argument(
1508
+ "--limiter_release_time",
1509
+ type=float,
1510
+ help=limiter_release_time_description,
1511
+ default=0.01,
1512
+ required=False,
1513
+ )
1514
+ batch_infer_parser.add_argument(
1515
+ "--gain_db",
1516
+ type=float,
1517
+ help=gain_db_description,
1518
+ default=0.0,
1519
+ required=False,
1520
+ )
1521
+
1522
+ batch_infer_parser.add_argument(
1523
+ "--distortion_gain",
1524
+ type=float,
1525
+ help=distortion_gain_description,
1526
+ default=25,
1527
+ required=False,
1528
+ )
1529
+
1530
+ batch_infer_parser.add_argument(
1531
+ "--chorus_rate",
1532
+ type=float,
1533
+ help=chorus_rate_description,
1534
+ default=1.0,
1535
+ required=False,
1536
+ )
1537
+
1538
+ batch_infer_parser.add_argument(
1539
+ "--chorus_depth",
1540
+ type=float,
1541
+ help=chorus_depth_description,
1542
+ default=0.25,
1543
+ required=False,
1544
+ )
1545
+ batch_infer_parser.add_argument(
1546
+ "--chorus_center_delay",
1547
+ type=float,
1548
+ help=chorus_center_delay_description,
1549
+ default=7,
1550
+ required=False,
1551
+ )
1552
+
1553
+ batch_infer_parser.add_argument(
1554
+ "--chorus_feedback",
1555
+ type=float,
1556
+ help=chorus_feedback_description,
1557
+ default=0.0,
1558
+ required=False,
1559
+ )
1560
+
1561
+ batch_infer_parser.add_argument(
1562
+ "--chorus_mix",
1563
+ type=float,
1564
+ help=chorus_mix_description,
1565
+ default=0.5,
1566
+ required=False,
1567
+ )
1568
+
1569
+ batch_infer_parser.add_argument(
1570
+ "--bitcrush_bit_depth",
1571
+ type=int,
1572
+ help=bitcrush_bit_depth_description,
1573
+ default=8,
1574
+ required=False,
1575
+ )
1576
+
1577
+ batch_infer_parser.add_argument(
1578
+ "--clipping_threshold",
1579
+ type=float,
1580
+ help=clipping_threshold_description,
1581
+ default=-6,
1582
+ required=False,
1583
+ )
1584
+
1585
+ batch_infer_parser.add_argument(
1586
+ "--compressor_threshold",
1587
+ type=float,
1588
+ help=compressor_threshold_description,
1589
+ default=0,
1590
+ required=False,
1591
+ )
1592
+
1593
+ batch_infer_parser.add_argument(
1594
+ "--compressor_ratio",
1595
+ type=float,
1596
+ help=compressor_ratio_description,
1597
+ default=1,
1598
+ required=False,
1599
+ )
1600
+
1601
+ batch_infer_parser.add_argument(
1602
+ "--compressor_attack",
1603
+ type=float,
1604
+ help=compressor_attack_description,
1605
+ default=1.0,
1606
+ required=False,
1607
+ )
1608
+
1609
+ batch_infer_parser.add_argument(
1610
+ "--compressor_release",
1611
+ type=float,
1612
+ help=compressor_release_description,
1613
+ default=100,
1614
+ required=False,
1615
+ )
1616
+ batch_infer_parser.add_argument(
1617
+ "--delay_seconds",
1618
+ type=float,
1619
+ help=delay_seconds_description,
1620
+ default=0.5,
1621
+ required=False,
1622
+ )
1623
+ batch_infer_parser.add_argument(
1624
+ "--delay_feedback",
1625
+ type=float,
1626
+ help=delay_feedback_description,
1627
+ default=0.0,
1628
+ required=False,
1629
+ )
1630
+ batch_infer_parser.add_argument(
1631
+ "--delay_mix",
1632
+ type=float,
1633
+ help=delay_mix_description,
1634
+ default=0.5,
1635
+ required=False,
1636
+ )
1637
+
1638
+ # Parser for 'tts' mode
1639
+ tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
1640
+ tts_parser.add_argument(
1641
+ "--tts_file", type=str, help="File with a text to be synthesized", required=True
1642
+ )
1643
+ tts_parser.add_argument(
1644
+ "--tts_text", type=str, help="Text to be synthesized", required=True
1645
+ )
1646
+ tts_parser.add_argument(
1647
+ "--tts_voice",
1648
+ type=str,
1649
+ help="Voice to be used for TTS synthesis.",
1650
+ choices=locales,
1651
+ required=True,
1652
+ )
1653
+ tts_parser.add_argument(
1654
+ "--tts_rate",
1655
+ type=int,
1656
+ help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
1657
+ choices=range(-100, 101),
1658
+ default=0,
1659
+ )
1660
+ tts_parser.add_argument(
1661
+ "--pitch",
1662
+ type=int,
1663
+ help=pitch_description,
1664
+ choices=range(-24, 25),
1665
+ default=0,
1666
+ )
1667
+ tts_parser.add_argument(
1668
+ "--filter_radius",
1669
+ type=int,
1670
+ help=filter_radius_description,
1671
+ choices=range(11),
1672
+ default=3,
1673
+ )
1674
+ tts_parser.add_argument(
1675
+ "--index_rate",
1676
+ type=float,
1677
+ help=index_rate_description,
1678
+ choices=[(i / 10) for i in range(11)],
1679
+ default=0.3,
1680
+ )
1681
+ tts_parser.add_argument(
1682
+ "--volume_envelope",
1683
+ type=float,
1684
+ help=volume_envelope_description,
1685
+ choices=[(i / 10) for i in range(11)],
1686
+ default=1,
1687
+ )
1688
+ tts_parser.add_argument(
1689
+ "--protect",
1690
+ type=float,
1691
+ help=protect_description,
1692
+ choices=[(i / 10) for i in range(6)],
1693
+ default=0.33,
1694
+ )
1695
+ tts_parser.add_argument(
1696
+ "--hop_length",
1697
+ type=int,
1698
+ help=hop_length_description,
1699
+ choices=range(1, 513),
1700
+ default=128,
1701
+ )
1702
+ tts_parser.add_argument(
1703
+ "--f0_method",
1704
+ type=str,
1705
+ help=f0_method_description,
1706
+ choices=[
1707
+ "crepe",
1708
+ "crepe-tiny",
1709
+ "rmvpe",
1710
+ "fcpe",
1711
+ "hybrid[crepe+rmvpe]",
1712
+ "hybrid[crepe+fcpe]",
1713
+ "hybrid[rmvpe+fcpe]",
1714
+ "hybrid[crepe+rmvpe+fcpe]",
1715
+ ],
1716
+ default="rmvpe",
1717
+ )
1718
+ tts_parser.add_argument(
1719
+ "--output_tts_path",
1720
+ type=str,
1721
+ help="Full path to save the synthesized TTS audio.",
1722
+ required=True,
1723
+ )
1724
+ tts_parser.add_argument(
1725
+ "--output_rvc_path",
1726
+ type=str,
1727
+ help="Full path to save the voice-converted audio using the synthesized TTS.",
1728
+ required=True,
1729
+ )
1730
+ tts_parser.add_argument(
1731
+ "--pth_path", type=str, help=pth_path_description, required=True
1732
+ )
1733
+ tts_parser.add_argument(
1734
+ "--index_path", type=str, help=index_path_description, required=True
1735
+ )
1736
+ tts_parser.add_argument(
1737
+ "--split_audio",
1738
+ type=lambda x: bool(strtobool(x)),
1739
+ choices=[True, False],
1740
+ help=split_audio_description,
1741
+ default=False,
1742
+ )
1743
+ tts_parser.add_argument(
1744
+ "--f0_autotune",
1745
+ type=lambda x: bool(strtobool(x)),
1746
+ choices=[True, False],
1747
+ help=f0_autotune_description,
1748
+ default=False,
1749
+ )
1750
+ tts_parser.add_argument(
1751
+ "--f0_autotune_strength",
1752
+ type=float,
1753
+ help=clean_strength_description,
1754
+ choices=[(i / 10) for i in range(11)],
1755
+ default=1.0,
1756
+ )
1757
+ tts_parser.add_argument(
1758
+ "--clean_audio",
1759
+ type=lambda x: bool(strtobool(x)),
1760
+ choices=[True, False],
1761
+ help=clean_audio_description,
1762
+ default=False,
1763
+ )
1764
+ tts_parser.add_argument(
1765
+ "--clean_strength",
1766
+ type=float,
1767
+ help=clean_strength_description,
1768
+ choices=[(i / 10) for i in range(11)],
1769
+ default=0.7,
1770
+ )
1771
+ tts_parser.add_argument(
1772
+ "--export_format",
1773
+ type=str,
1774
+ help=export_format_description,
1775
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
1776
+ default="WAV",
1777
+ )
1778
+ tts_parser.add_argument(
1779
+ "--embedder_model",
1780
+ type=str,
1781
+ help=embedder_model_description,
1782
+ choices=[
1783
+ "contentvec",
1784
+ "chinese-hubert-base",
1785
+ "japanese-hubert-base",
1786
+ "korean-hubert-base",
1787
+ "custom",
1788
+ ],
1789
+ default="contentvec",
1790
+ )
1791
+ tts_parser.add_argument(
1792
+ "--embedder_model_custom",
1793
+ type=str,
1794
+ help=embedder_model_custom_description,
1795
+ default=None,
1796
+ )
1797
+ tts_parser.add_argument(
1798
+ "--f0_file",
1799
+ type=str,
1800
+ help=f0_file_description,
1801
+ default=None,
1802
+ )
1803
+
1804
+ # Parser for 'preprocess' mode
1805
+ preprocess_parser = subparsers.add_parser(
1806
+ "preprocess", help="Preprocess a dataset for training."
1807
+ )
1808
+ preprocess_parser.add_argument(
1809
+ "--model_name", type=str, help="Name of the model to be trained.", required=True
1810
+ )
1811
+ preprocess_parser.add_argument(
1812
+ "--dataset_path", type=str, help="Path to the dataset directory.", required=True
1813
+ )
1814
+ preprocess_parser.add_argument(
1815
+ "--sample_rate",
1816
+ type=int,
1817
+ help="Target sampling rate for the audio data.",
1818
+ choices=[32000, 40000, 44100, 48000],
1819
+ required=True,
1820
+ )
1821
+ preprocess_parser.add_argument(
1822
+ "--cpu_cores",
1823
+ type=int,
1824
+ help="Number of CPU cores to use for preprocessing.",
1825
+ choices=range(1, 65),
1826
+ )
1827
+ preprocess_parser.add_argument(
1828
+ "--cut_preprocess",
1829
+ type=str,
1830
+ choices=["Skip", "Simple", "Automatic"],
1831
+ help="Cut the dataset into smaller segments for faster preprocessing.",
1832
+ default="Automatic",
1833
+ required=True,
1834
+ )
1835
+ preprocess_parser.add_argument(
1836
+ "--process_effects",
1837
+ type=lambda x: bool(strtobool(x)),
1838
+ choices=[True, False],
1839
+ help="Disable all filters during preprocessing.",
1840
+ default=False,
1841
+ required=False,
1842
+ )
1843
+ preprocess_parser.add_argument(
1844
+ "--noise_reduction",
1845
+ type=lambda x: bool(strtobool(x)),
1846
+ choices=[True, False],
1847
+ help="Enable noise reduction during preprocessing.",
1848
+ default=False,
1849
+ required=False,
1850
+ )
1851
+ preprocess_parser.add_argument(
1852
+ "--noise_reduction_strength",
1853
+ type=float,
1854
+ help="Strength of the noise reduction filter.",
1855
+ choices=[(i / 10) for i in range(11)],
1856
+ default=0.7,
1857
+ required=False,
1858
+ )
1859
+ preprocess_parser.add_argument(
1860
+ "--chunk_len",
1861
+ type=float,
1862
+ help="Chunk length.",
1863
+ choices=[i * 0.5 for i in range(1, 11)],
1864
+ default=3.0,
1865
+ required=False,
1866
+ )
1867
+ preprocess_parser.add_argument(
1868
+ "--overlap_len",
1869
+ type=float,
1870
+ help="Overlap length.",
1871
+ choices=[0.0, 0.1, 0.2, 0.3, 0.4],
1872
+ default=0.3,
1873
+ required=False,
1874
+ )
1875
+
1876
+ # Parser for 'extract' mode
1877
+ extract_parser = subparsers.add_parser(
1878
+ "extract", help="Extract features from a dataset."
1879
+ )
1880
+ extract_parser.add_argument(
1881
+ "--model_name", type=str, help="Name of the model.", required=True
1882
+ )
1883
+ extract_parser.add_argument(
1884
+ "--f0_method",
1885
+ type=str,
1886
+ help="Pitch extraction method to use.",
1887
+ choices=[
1888
+ "crepe",
1889
+ "crepe-tiny",
1890
+ "rmvpe",
1891
+ ],
1892
+ default="rmvpe",
1893
+ )
1894
+ extract_parser.add_argument(
1895
+ "--hop_length",
1896
+ type=int,
1897
+ help="Hop length for feature extraction. Only applicable for Crepe pitch extraction.",
1898
+ choices=range(1, 513),
1899
+ default=128,
1900
+ )
1901
+ extract_parser.add_argument(
1902
+ "--cpu_cores",
1903
+ type=int,
1904
+ help="Number of CPU cores to use for feature extraction (optional).",
1905
+ choices=range(1, 65),
1906
+ default=None,
1907
+ )
1908
+ extract_parser.add_argument(
1909
+ "--gpu",
1910
+ type=str,
1911
+ help="GPU device to use for feature extraction (optional).",
1912
+ default="-",
1913
+ )
1914
+ extract_parser.add_argument(
1915
+ "--sample_rate",
1916
+ type=int,
1917
+ help="Target sampling rate for the audio data.",
1918
+ choices=[32000, 40000, 44100, 48000],
1919
+ required=True,
1920
+ )
1921
+ extract_parser.add_argument(
1922
+ "--embedder_model",
1923
+ type=str,
1924
+ help=embedder_model_description,
1925
+ choices=[
1926
+ "contentvec",
1927
+ "chinese-hubert-base",
1928
+ "japanese-hubert-base",
1929
+ "korean-hubert-base",
1930
+ "custom",
1931
+ ],
1932
+ default="contentvec",
1933
+ )
1934
+ extract_parser.add_argument(
1935
+ "--embedder_model_custom",
1936
+ type=str,
1937
+ help=embedder_model_custom_description,
1938
+ default=None,
1939
+ )
1940
+ extract_parser.add_argument(
1941
+ "--include_mutes",
1942
+ type=int,
1943
+ help="Number of silent files to include.",
1944
+ choices=range(0, 11),
1945
+ default=2,
1946
+ required=True,
1947
+ )
1948
+
1949
+ # Parser for 'train' mode
1950
+ train_parser = subparsers.add_parser("train", help="Train an RVC model.")
1951
+ train_parser.add_argument(
1952
+ "--model_name", type=str, help="Name of the model to be trained.", required=True
1953
+ )
1954
+ train_parser.add_argument(
1955
+ "--vocoder",
1956
+ type=str,
1957
+ help="Vocoder name",
1958
+ choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
1959
+ default="HiFi-GAN",
1960
+ )
1961
+ train_parser.add_argument(
1962
+ "--checkpointing",
1963
+ type=lambda x: bool(strtobool(x)),
1964
+ choices=[True, False],
1965
+ help="Enables memory-efficient training.",
1966
+ default=False,
1967
+ required=False,
1968
+ )
1969
+ train_parser.add_argument(
1970
+ "--save_every_epoch",
1971
+ type=int,
1972
+ help="Save the model every specified number of epochs.",
1973
+ choices=range(1, 101),
1974
+ required=True,
1975
+ )
1976
+ train_parser.add_argument(
1977
+ "--save_only_latest",
1978
+ type=lambda x: bool(strtobool(x)),
1979
+ choices=[True, False],
1980
+ help="Save only the latest model checkpoint.",
1981
+ default=False,
1982
+ )
1983
+ train_parser.add_argument(
1984
+ "--save_every_weights",
1985
+ type=lambda x: bool(strtobool(x)),
1986
+ choices=[True, False],
1987
+ help="Save model weights every epoch.",
1988
+ default=True,
1989
+ )
1990
+ train_parser.add_argument(
1991
+ "--total_epoch",
1992
+ type=int,
1993
+ help="Total number of epochs to train for.",
1994
+ choices=range(1, 10001),
1995
+ default=1000,
1996
+ )
1997
+ train_parser.add_argument(
1998
+ "--sample_rate",
1999
+ type=int,
2000
+ help="Sampling rate of the training data.",
2001
+ choices=[32000, 40000, 48000],
2002
+ required=True,
2003
+ )
2004
+ train_parser.add_argument(
2005
+ "--batch_size",
2006
+ type=int,
2007
+ help="Batch size for training.",
2008
+ choices=range(1, 51),
2009
+ default=8,
2010
+ )
2011
+ train_parser.add_argument(
2012
+ "--gpu",
2013
+ type=str,
2014
+ help="GPU device to use for training (e.g., '0').",
2015
+ default="0",
2016
+ )
2017
+ train_parser.add_argument(
2018
+ "--pretrained",
2019
+ type=lambda x: bool(strtobool(x)),
2020
+ choices=[True, False],
2021
+ help="Use a pretrained model for initialization.",
2022
+ default=True,
2023
+ )
2024
+ train_parser.add_argument(
2025
+ "--custom_pretrained",
2026
+ type=lambda x: bool(strtobool(x)),
2027
+ choices=[True, False],
2028
+ help="Use a custom pretrained model.",
2029
+ default=False,
2030
+ )
2031
+ train_parser.add_argument(
2032
+ "--g_pretrained_path",
2033
+ type=str,
2034
+ nargs="?",
2035
+ default=None,
2036
+ help="Path to the pretrained generator model file.",
2037
+ )
2038
+ train_parser.add_argument(
2039
+ "--d_pretrained_path",
2040
+ type=str,
2041
+ nargs="?",
2042
+ default=None,
2043
+ help="Path to the pretrained discriminator model file.",
2044
+ )
2045
+ train_parser.add_argument(
2046
+ "--overtraining_detector",
2047
+ type=lambda x: bool(strtobool(x)),
2048
+ choices=[True, False],
2049
+ help="Enable overtraining detection.",
2050
+ default=False,
2051
+ )
2052
+ train_parser.add_argument(
2053
+ "--overtraining_threshold",
2054
+ type=int,
2055
+ help="Threshold for overtraining detection.",
2056
+ choices=range(1, 101),
2057
+ default=50,
2058
+ )
2059
+ train_parser.add_argument(
2060
+ "--cleanup",
2061
+ type=lambda x: bool(strtobool(x)),
2062
+ choices=[True, False],
2063
+ help="Cleanup previous training attempt.",
2064
+ default=False,
2065
+ )
2066
+ train_parser.add_argument(
2067
+ "--cache_data_in_gpu",
2068
+ type=lambda x: bool(strtobool(x)),
2069
+ choices=[True, False],
2070
+ help="Cache training data in GPU memory.",
2071
+ default=False,
2072
+ )
2073
+ train_parser.add_argument(
2074
+ "--index_algorithm",
2075
+ type=str,
2076
+ choices=["Auto", "Faiss", "KMeans"],
2077
+ help="Choose the method for generating the index file.",
2078
+ default="Auto",
2079
+ required=False,
2080
+ )
2081
+
2082
+ # Parser for 'index' mode
2083
+ index_parser = subparsers.add_parser(
2084
+ "index", help="Generate an index file for an RVC model."
2085
+ )
2086
+ index_parser.add_argument(
2087
+ "--model_name", type=str, help="Name of the model.", required=True
2088
+ )
2089
+ index_parser.add_argument(
2090
+ "--index_algorithm",
2091
+ type=str,
2092
+ choices=["Auto", "Faiss", "KMeans"],
2093
+ help="Choose the method for generating the index file.",
2094
+ default="Auto",
2095
+ required=False,
2096
+ )
2097
+
2098
+ # Parser for 'model_information' mode
2099
+ model_information_parser = subparsers.add_parser(
2100
+ "model_information", help="Display information about a trained model."
2101
+ )
2102
+ model_information_parser.add_argument(
2103
+ "--pth_path", type=str, help="Path to the .pth model file.", required=True
2104
+ )
2105
+
2106
+ # Parser for 'model_blender' mode
2107
+ model_blender_parser = subparsers.add_parser(
2108
+ "model_blender", help="Fuse two RVC models together."
2109
+ )
2110
+ model_blender_parser.add_argument(
2111
+ "--model_name", type=str, help="Name of the new fused model.", required=True
2112
+ )
2113
+ model_blender_parser.add_argument(
2114
+ "--pth_path_1",
2115
+ type=str,
2116
+ help="Path to the first .pth model file.",
2117
+ required=True,
2118
+ )
2119
+ model_blender_parser.add_argument(
2120
+ "--pth_path_2",
2121
+ type=str,
2122
+ help="Path to the second .pth model file.",
2123
+ required=True,
2124
+ )
2125
+ model_blender_parser.add_argument(
2126
+ "--ratio",
2127
+ type=float,
2128
+ help="Ratio for blending the two models (0.0 to 1.0).",
2129
+ choices=[(i / 10) for i in range(11)],
2130
+ default=0.5,
2131
+ )
2132
+
2133
+ # Parser for 'tensorboard' mode
2134
+ subparsers.add_parser(
2135
+ "tensorboard", help="Launch TensorBoard for monitoring training progress."
2136
+ )
2137
+
2138
+ # Parser for 'download' mode
2139
+ download_parser = subparsers.add_parser(
2140
+ "download", help="Download a model from a provided link."
2141
+ )
2142
+ download_parser.add_argument(
2143
+ "--model_link", type=str, help="Direct link to the model file.", required=True
2144
+ )
2145
+
2146
+ # Parser for 'prerequisites' mode
2147
+ prerequisites_parser = subparsers.add_parser(
2148
+ "prerequisites", help="Install prerequisites for RVC."
2149
+ )
2150
+ prerequisites_parser.add_argument(
2151
+ "--pretraineds_hifigan",
2152
+ type=lambda x: bool(strtobool(x)),
2153
+ choices=[True, False],
2154
+ default=True,
2155
+ help="Download pretrained models for RVC v2.",
2156
+ )
2157
+ prerequisites_parser.add_argument(
2158
+ "--models",
2159
+ type=lambda x: bool(strtobool(x)),
2160
+ choices=[True, False],
2161
+ default=True,
2162
+ help="Download additional models.",
2163
+ )
2164
+ prerequisites_parser.add_argument(
2165
+ "--exe",
2166
+ type=lambda x: bool(strtobool(x)),
2167
+ choices=[True, False],
2168
+ default=True,
2169
+ help="Download required executables.",
2170
+ )
2171
+
2172
+ # Parser for 'audio_analyzer' mode
2173
+ audio_analyzer = subparsers.add_parser(
2174
+ "audio_analyzer", help="Analyze an audio file."
2175
+ )
2176
+ audio_analyzer.add_argument(
2177
+ "--input_path", type=str, help="Path to the input audio file.", required=True
2178
+ )
2179
+
2180
+ return parser.parse_args()
2181
+
2182
+
2183
# Options accepted by every voice-conversion mode ('infer', 'batch_infer' and
# 'tts'). Each entry is both the attribute name on the parsed arguments and
# the keyword name expected by the corresponding run_*_script function.
_CLI_CONVERSION_OPTIONS = (
    "pitch",
    "filter_radius",
    "index_rate",
    "volume_envelope",
    "protect",
    "hop_length",
    "f0_method",
    "pth_path",
    "index_path",
    "split_audio",
    "f0_autotune",
    "f0_autotune_strength",
    "clean_audio",
    "clean_strength",
    "export_format",
    "embedder_model",
    "embedder_model_custom",
    "f0_file",
)

# Additional options forwarded only by 'infer' and 'batch_infer': formant
# shifting, speaker id and the post-processing effect switches/parameters.
_CLI_POST_PROCESS_OPTIONS = (
    "formant_shifting",
    "formant_qfrency",
    "formant_timbre",
    "sid",
    "post_process",
    "reverb",
    "pitch_shift",
    "limiter",
    "gain",
    "distortion",
    "chorus",
    "bitcrush",
    "clipping",
    "compressor",
    "delay",
    "reverb_room_size",
    "reverb_damping",
    "reverb_wet_gain",
    "reverb_dry_gain",
    "reverb_width",
    "reverb_freeze_mode",
    "pitch_shift_semitones",
    "limiter_threshold",
    "limiter_release_time",
    "gain_db",
    "distortion_gain",
    "chorus_rate",
    "chorus_depth",
    "chorus_center_delay",
    "chorus_feedback",
    "chorus_mix",
    "bitcrush_bit_depth",
    "clipping_threshold",
    "compressor_threshold",
    "compressor_ratio",
    "compressor_attack",
    "compressor_release",
    "delay_seconds",
    "delay_feedback",
    "delay_mix",
)


def _cli_options(args, names):
    """Return ``{name: args.<name>}`` for every option name in *names*.

    Raises:
        AttributeError: if *args* lacks one of the requested options.
    """
    return {name: getattr(args, name) for name in names}


def _dispatch_cli_mode(args):
    """Call the run_*_script function that matches ``args.mode``.

    Every parsed option is forwarded as a keyword argument of the same name,
    except ``--noise_reduction_strength`` which the preprocess runner receives
    as ``clean_strength``. A mode that matches none of the branches is
    ignored, exactly as an unmatched if/elif chain would be.
    """
    mode = args.mode

    if mode == "infer":
        run_infer_script(
            input_path=args.input_path,
            output_path=args.output_path,
            **_cli_options(
                args, _CLI_CONVERSION_OPTIONS + _CLI_POST_PROCESS_OPTIONS
            ),
        )
    elif mode == "batch_infer":
        run_batch_infer_script(
            input_folder=args.input_folder,
            output_folder=args.output_folder,
            **_cli_options(
                args, _CLI_CONVERSION_OPTIONS + _CLI_POST_PROCESS_OPTIONS
            ),
        )
    elif mode == "tts":
        run_tts_script(
            tts_file=args.tts_file,
            tts_text=args.tts_text,
            tts_voice=args.tts_voice,
            tts_rate=args.tts_rate,
            output_tts_path=args.output_tts_path,
            output_rvc_path=args.output_rvc_path,
            **_cli_options(args, _CLI_CONVERSION_OPTIONS),
        )
    elif mode == "preprocess":
        run_preprocess_script(
            # The CLI flag and the runner keyword are named differently.
            clean_strength=args.noise_reduction_strength,
            **_cli_options(
                args,
                (
                    "model_name",
                    "dataset_path",
                    "sample_rate",
                    "cpu_cores",
                    "cut_preprocess",
                    "process_effects",
                    "noise_reduction",
                    "chunk_len",
                    "overlap_len",
                ),
            ),
        )
    elif mode == "extract":
        run_extract_script(
            **_cli_options(
                args,
                (
                    "model_name",
                    "f0_method",
                    "hop_length",
                    "cpu_cores",
                    "gpu",
                    "sample_rate",
                    "embedder_model",
                    "embedder_model_custom",
                    "include_mutes",
                ),
            )
        )
    elif mode == "train":
        run_train_script(
            **_cli_options(
                args,
                (
                    "model_name",
                    "save_every_epoch",
                    "save_only_latest",
                    "save_every_weights",
                    "total_epoch",
                    "sample_rate",
                    "batch_size",
                    "gpu",
                    "overtraining_detector",
                    "overtraining_threshold",
                    "pretrained",
                    "custom_pretrained",
                    "cleanup",
                    "index_algorithm",
                    "cache_data_in_gpu",
                    "g_pretrained_path",
                    "d_pretrained_path",
                    "vocoder",
                    "checkpointing",
                ),
            )
        )
    elif mode == "index":
        run_index_script(
            model_name=args.model_name,
            index_algorithm=args.index_algorithm,
        )
    elif mode == "model_information":
        run_model_information_script(pth_path=args.pth_path)
    elif mode == "model_blender":
        run_model_blender_script(
            model_name=args.model_name,
            pth_path_1=args.pth_path_1,
            pth_path_2=args.pth_path_2,
            ratio=args.ratio,
        )
    elif mode == "tensorboard":
        run_tensorboard_script()
    elif mode == "download":
        run_download_script(model_link=args.model_link)
    elif mode == "prerequisites":
        run_prerequisites_script(
            pretraineds_hifigan=args.pretraineds_hifigan,
            models=args.models,
            exe=args.exe,
        )
    elif mode == "audio_analyzer":
        run_audio_analyzer_script(input_path=args.input_path)


def main():
    """Command-line entry point: parse ``sys.argv`` and run the chosen mode.

    Prints a usage hint and exits with status 1 when the script is started
    without any argument. Otherwise the arguments are parsed and the matching
    run_*_script function is invoked. If that function raises, the error and
    its traceback are printed and the process exits with status 1 so that
    shells, notebooks and CI jobs can detect the failure.
    """
    if len(sys.argv) == 1:
        print("Please run the script with '-h' for more information.")
        sys.exit(1)

    args = parse_arguments()

    try:
        _dispatch_cli_mode(args)
    except Exception as error:
        print(f"An error occurred during execution: {error}")

        import traceback

        traceback.print_exc()
        # Report the failure to the caller instead of finishing with status 0.
        sys.exit(1)
2429
+
2430
+
2431
# Run the command-line interface only when this file is executed as a script;
# importing the module must not trigger argument parsing.
if __name__ == "__main__":
    main()