Upload 2 files
Browse files
app.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import sys
|
3 |
+
import os
|
4 |
+
import logging
|
5 |
+
|
6 |
+
from typing import Any
|
7 |
+
|
8 |
+
DEFAULT_SERVER_NAME = "127.0.0.1"
|
9 |
+
DEFAULT_PORT = 6969
|
10 |
+
MAX_PORT_ATTEMPTS = 10
|
11 |
+
|
12 |
+
# Set up logging
|
13 |
+
logging.getLogger("uvicorn").setLevel(logging.WARNING)
|
14 |
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
15 |
+
|
16 |
+
# Add current directory to sys.path
|
17 |
+
now_dir = os.getcwd()
|
18 |
+
sys.path.append(now_dir)
|
19 |
+
|
20 |
+
# Zluda hijack
|
21 |
+
import rvc.lib.zluda
|
22 |
+
|
23 |
+
# Import Tabs
|
24 |
+
from tabs.inference.inference import inference_tab
|
25 |
+
from tabs.train.train import train_tab
|
26 |
+
from tabs.extra.extra import extra_tab
|
27 |
+
from tabs.report.report import report_tab
|
28 |
+
from tabs.download.download import download_tab
|
29 |
+
from tabs.tts.tts import tts_tab
|
30 |
+
from tabs.voice_blender.voice_blender import voice_blender_tab
|
31 |
+
from tabs.plugins.plugins import plugins_tab
|
32 |
+
from tabs.settings.settings import settings_tab
|
33 |
+
|
34 |
+
# Run prerequisites
|
35 |
+
from core import run_prerequisites_script
|
36 |
+
|
37 |
+
run_prerequisites_script(
|
38 |
+
pretraineds_hifigan=True,
|
39 |
+
models=True,
|
40 |
+
exe=True,
|
41 |
+
)
|
42 |
+
|
43 |
+
# Initialize i18n
|
44 |
+
from assets.i18n.i18n import I18nAuto
|
45 |
+
|
46 |
+
i18n = I18nAuto()
|
47 |
+
|
48 |
+
# Start Discord presence if enabled
|
49 |
+
from tabs.settings.sections.presence import load_config_presence
|
50 |
+
|
51 |
+
if load_config_presence():
|
52 |
+
from assets.discord_presence import RPCManager
|
53 |
+
|
54 |
+
RPCManager.start_presence()
|
55 |
+
|
56 |
+
# Check installation
|
57 |
+
import assets.installation_checker as installation_checker
|
58 |
+
|
59 |
+
installation_checker.check_installation()
|
60 |
+
|
61 |
+
# Load theme
|
62 |
+
import assets.themes.loadThemes as loadThemes
|
63 |
+
|
64 |
+
my_applio = loadThemes.load_theme() or "ParityError/Interstellar"
|
65 |
+
|
66 |
+
# Define Gradio interface
|
67 |
+
# Build the whole UI inside one Blocks context: a header, one tab per feature
# (each tab body is rendered by the builder imported from ``tabs.*``), and a
# footer disclaimer. The ``i18n("...")`` calls are kept as explicit string
# literals rather than being driven from a loop.
with gr.Blocks(
    theme=my_applio, title="Applio", css="footer{display:none !important}"
) as Applio:
    # Header: title, tagline and support links.
    gr.Markdown("# Applio")
    gr.Markdown(
        i18n(
            "A simple, high-quality voice conversion tool focused on ease of use and performance."
        )
    )
    gr.Markdown(
        i18n(
            "[Support](https://discord.gg/urxFjYmYYh) — [GitHub](https://github.com/IAHispano/Applio)"
        )
    )

    # Feature tabs, in display order.
    with gr.Tab(i18n("Inference")):
        inference_tab()

    with gr.Tab(i18n("Training")):
        train_tab()

    with gr.Tab(i18n("TTS")):
        tts_tab()

    with gr.Tab(i18n("Voice Blender")):
        voice_blender_tab()

    with gr.Tab(i18n("Plugins")):
        plugins_tab()

    with gr.Tab(i18n("Download")):
        download_tab()

    with gr.Tab(i18n("Report a Bug")):
        report_tab()

    with gr.Tab(i18n("Extra")):
        extra_tab()

    with gr.Tab(i18n("Settings")):
        settings_tab()

    # Footer disclaimer. The text colour is set with the CSS ``color``
    # property and a ``#``-prefixed hex value; the previous
    # ``text-color: a3a3a3`` is not valid CSS, so the grey was never applied.
    gr.Markdown(
        """
    <div style="text-align: center; font-size: 0.9em; color: #a3a3a3;">
    By using Applio, you agree to comply with ethical and legal standards, respect intellectual property and privacy rights, avoid harmful or prohibited uses, and accept full responsibility for any outcomes, while Applio disclaims liability and reserves the right to amend these terms.
    </div>
    """
    )
|
115 |
+
|
116 |
+
|
117 |
+
def launch_gradio(server_name: str, server_port: int) -> None:
    """Launch the ``Applio`` Gradio app on the given host and port.

    Args:
        server_name: Host name or address passed to ``Applio.launch``.
        server_port: Port passed to ``Applio.launch``.

    The ``share`` and ``inbrowser`` options are switched on when the
    ``--share`` / ``--open`` flags are present in ``sys.argv``.
    """
    cli_flags = sys.argv
    launch_options = {
        "favicon_path": "assets/ICON.ico",
        "share": "--share" in cli_flags,
        "inbrowser": "--open" in cli_flags,
        "server_name": server_name,
        "server_port": server_port,
    }
    Applio.launch(**launch_options)
|
125 |
+
|
126 |
+
|
127 |
+
def get_value_from_args(key: str, default: Any = None) -> Any:
    """Look up the value of a command-line option in ``sys.argv``.

    Two spellings are recognised:

    * ``--key value`` -- the argument following the first exact occurrence of
      ``key`` is returned. If ``key`` is the last argument, ``default`` is
      returned.
    * ``--key=value`` -- consulted only when ``key`` does not appear on its
      own; the text after the first ``=`` is returned.

    Args:
        key: Option name including its dashes, e.g. ``"--port"``.
        default: Value returned when the option is absent or has no value.

    Returns:
        The raw string taken from the command line, or ``default``.
    """
    argv = sys.argv

    # Space-separated form: behaves exactly as before.
    if key in argv:
        value_index = argv.index(key) + 1
        if value_index < len(argv):
            return argv[value_index]
        return default

    # ``--key=value`` form.
    prefix = f"{key}="
    for token in argv:
        if token.startswith(prefix):
            return token[len(prefix):]

    return default
|
133 |
+
|
134 |
+
|
135 |
+
if __name__ == "__main__":
    # Values read from the command line are strings, so the port is converted
    # to int; the default is already an int and passes through unchanged.
    port = int(get_value_from_args("--port", DEFAULT_PORT))
    server = get_value_from_args("--server-name", DEFAULT_SERVER_NAME)

    # Try up to MAX_PORT_ATTEMPTS ports, stepping downwards from the
    # requested one each time the launch raises OSError.
    for _ in range(MAX_PORT_ATTEMPTS):
        try:
            launch_gradio(server, port)
            break
        except OSError:
            print(
                f"Failed to launch on port {port}, trying again on port {port - 1}..."
            )
            port -= 1
        except Exception as error:
            # Any other failure is not retried.
            print(f"An error occurred launching Gradio: {error}")
            break
    else:
        # The loop ended without ``break``: every attempt raised OSError.
        # Say so explicitly instead of exiting silently.
        print(
            f"Could not launch Gradio after {MAX_PORT_ATTEMPTS} attempts; giving up."
        )
|
core.py
ADDED
@@ -0,0 +1,2432 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import json
|
4 |
+
import argparse
|
5 |
+
import subprocess
|
6 |
+
from functools import lru_cache
|
7 |
+
from distutils.util import strtobool
|
8 |
+
|
9 |
+
now_dir = os.getcwd()
|
10 |
+
sys.path.append(now_dir)
|
11 |
+
|
12 |
+
current_script_directory = os.path.dirname(os.path.realpath(__file__))
|
13 |
+
logs_path = os.path.join(current_script_directory, "logs")
|
14 |
+
|
15 |
+
from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
|
16 |
+
from rvc.train.process.model_blender import model_blender
|
17 |
+
from rvc.train.process.model_information import model_information
|
18 |
+
from rvc.lib.tools.analyzer import analyze_audio
|
19 |
+
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
|
20 |
+
from rvc.lib.tools.model_download import model_download_pipeline
|
21 |
+
|
22 |
+
python = sys.executable
|
23 |
+
|
24 |
+
|
25 |
+
# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
|
26 |
+
@lru_cache(maxsize=1)  # Cache only one result since the file is static
def load_voices_data():
    """Load the TTS voice list from ``rvc/lib/tools/tts_voices.json``.

    The path is relative to the current working directory. The file is read
    as UTF-8 JSON; the parsed object is cached, so later calls return the
    same object without touching the disk again.

    Returns:
        The parsed JSON content of the voices file.
    """
    voices_file = os.path.join("rvc", "lib", "tools", "tts_voices.json")
    with open(voices_file, "r", encoding="utf-8") as handle:
        parsed_voices = json.load(handle)
    return parsed_voices
|
32 |
+
|
33 |
+
|
34 |
+
voices_data = load_voices_data()
# Unique voice short names. A set is used for de-duplication, then sorted so
# the resulting list has the same order on every run (plain set iteration
# order for strings depends on the per-process hash seed).
locales = sorted({voice["ShortName"] for voice in voices_data})
|
36 |
+
|
37 |
+
|
38 |
+
@lru_cache(maxsize=None)
def import_voice_converter():
    """Return the shared ``VoiceConverter`` instance.

    ``rvc.infer.infer`` is imported inside the function, so the import only
    happens on the first call. Because the function takes no arguments and
    is cached, every later call returns the instance created the first time.
    """
    from rvc.infer.infer import VoiceConverter as converter_class

    converter = converter_class()
    return converter
|
43 |
+
|
44 |
+
|
45 |
+
@lru_cache(maxsize=1)
def get_config():
    """Return the shared ``Config`` instance.

    ``rvc.configs.config`` is imported on the first call only; the cached
    object created then is handed back on every subsequent call.
    """
    from rvc.configs.config import Config as config_class

    config = config_class()
    return config
|
50 |
+
|
51 |
+
|
52 |
+
# Infer
|
53 |
+
def run_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_path: str,
    output_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a single audio file with the shared ``VoiceConverter``.

    Every argument is forwarded as a keyword to
    ``VoiceConverter.convert_audio``. Most keep their own name; the
    exceptions are:

    ================================  ==========================
    parameter                         forwarded as
    ================================  ==========================
    ``input_path``                    ``audio_input_path``
    ``output_path``                   ``audio_output_path``
    ``pth_path``                      ``model_path`` and ``pth_path``
    ``reverb_wet_gain``               ``reverb_wet_level``
    ``reverb_dry_gain``               ``reverb_dry_level``
    ``limiter_release_time``          ``limiter_release``
    ``chorus_center_delay``           ``chorus_delay``
    ================================  ==========================

    Returns:
        A ``(message, path)`` tuple: a status message naming ``input_path``
        and ``output_path`` with every ``".wav"`` replaced by
        ``"." + export_format.lower()``.
    """
    kwargs = {
        # Paths and model selection.
        "audio_input_path": input_path,
        "audio_output_path": output_path,
        "model_path": pth_path,
        # NOTE(review): the model path is also forwarded as ``pth_path``,
        # as before -- confirm whether convert_audio still reads it.
        "pth_path": pth_path,
        # Previously this key appeared twice in the literal with the same
        # value; it is now listed once.
        "index_path": index_path,
        # Core conversion settings.
        "pitch": pitch,
        "filter_radius": filter_radius,
        "index_rate": index_rate,
        "volume_envelope": volume_envelope,
        "protect": protect,
        "hop_length": hop_length,
        "f0_method": f0_method,
        "split_audio": split_audio,
        "f0_autotune": f0_autotune,
        "f0_autotune_strength": f0_autotune_strength,
        "clean_audio": clean_audio,
        "clean_strength": clean_strength,
        "export_format": export_format,
        "f0_file": f0_file,
        "embedder_model": embedder_model,
        "embedder_model_custom": embedder_model_custom,
        "sid": sid,
        # Formant shifting.
        "formant_shifting": formant_shifting,
        "formant_qfrency": formant_qfrency,
        "formant_timbre": formant_timbre,
        # Post-processing switches.
        "post_process": post_process,
        "reverb": reverb,
        "pitch_shift": pitch_shift,
        "limiter": limiter,
        "gain": gain,
        "distortion": distortion,
        "chorus": chorus,
        "bitcrush": bitcrush,
        "clipping": clipping,
        "compressor": compressor,
        "delay": delay,
        # Post-processing parameters (some are renamed, see docstring).
        "reverb_room_size": reverb_room_size,
        "reverb_damping": reverb_damping,
        "reverb_wet_level": reverb_wet_gain,
        "reverb_dry_level": reverb_dry_gain,
        "reverb_width": reverb_width,
        "reverb_freeze_mode": reverb_freeze_mode,
        "pitch_shift_semitones": pitch_shift_semitones,
        "limiter_threshold": limiter_threshold,
        "limiter_release": limiter_release_time,
        "gain_db": gain_db,
        "distortion_gain": distortion_gain,
        "chorus_rate": chorus_rate,
        "chorus_depth": chorus_depth,
        "chorus_delay": chorus_center_delay,
        "chorus_feedback": chorus_feedback,
        "chorus_mix": chorus_mix,
        "bitcrush_bit_depth": bitcrush_bit_depth,
        "clipping_threshold": clipping_threshold,
        "compressor_threshold": compressor_threshold,
        "compressor_ratio": compressor_ratio,
        "compressor_attack": compressor_attack,
        "compressor_release": compressor_release,
        "delay_seconds": delay_seconds,
        "delay_feedback": delay_feedback,
        "delay_mix": delay_mix,
    }
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(**kwargs)

    # The returned path mirrors the requested export format by swapping the
    # ".wav" extension text for the lower-cased format name.
    exported_path = output_path.replace(".wav", f".{export_format.lower()}")
    return f"File {input_path} inferred successfully.", exported_path
|
186 |
+
|
187 |
+
|
188 |
+
# Batch infer
|
189 |
+
def run_batch_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_folder: str,
    output_folder: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a folder of audio files with the shared ``VoiceConverter``.

    Every argument is forwarded as a keyword to
    ``VoiceConverter.convert_audio_batch``. Most keep their own name; the
    exceptions are:

    ================================  ==========================
    parameter                         forwarded as
    ================================  ==========================
    ``input_folder``                  ``audio_input_paths``
    ``output_folder``                 ``audio_output_path``
    ``pth_path``                      ``model_path`` and ``pth_path``
    ``reverb_wet_gain``               ``reverb_wet_level``
    ``reverb_dry_gain``               ``reverb_dry_level``
    ``limiter_release_time``          ``limiter_release``
    ``chorus_center_delay``           ``chorus_delay``
    ================================  ==========================

    Returns:
        A status message naming ``input_folder``.
    """
    kwargs = {
        # Paths and model selection.
        "audio_input_paths": input_folder,
        "audio_output_path": output_folder,
        "model_path": pth_path,
        # NOTE(review): the model path is also forwarded as ``pth_path``,
        # as before -- confirm whether convert_audio_batch still reads it.
        "pth_path": pth_path,
        # Previously this key appeared twice in the literal with the same
        # value; it is now listed once.
        "index_path": index_path,
        # Core conversion settings.
        "pitch": pitch,
        "filter_radius": filter_radius,
        "index_rate": index_rate,
        "volume_envelope": volume_envelope,
        "protect": protect,
        "hop_length": hop_length,
        "f0_method": f0_method,
        "split_audio": split_audio,
        "f0_autotune": f0_autotune,
        "f0_autotune_strength": f0_autotune_strength,
        "clean_audio": clean_audio,
        "clean_strength": clean_strength,
        "export_format": export_format,
        "f0_file": f0_file,
        "embedder_model": embedder_model,
        "embedder_model_custom": embedder_model_custom,
        "sid": sid,
        # Formant shifting.
        "formant_shifting": formant_shifting,
        "formant_qfrency": formant_qfrency,
        "formant_timbre": formant_timbre,
        # Post-processing switches.
        "post_process": post_process,
        "reverb": reverb,
        "pitch_shift": pitch_shift,
        "limiter": limiter,
        "gain": gain,
        "distortion": distortion,
        "chorus": chorus,
        "bitcrush": bitcrush,
        "clipping": clipping,
        "compressor": compressor,
        "delay": delay,
        # Post-processing parameters (some are renamed, see docstring).
        "reverb_room_size": reverb_room_size,
        "reverb_damping": reverb_damping,
        "reverb_wet_level": reverb_wet_gain,
        "reverb_dry_level": reverb_dry_gain,
        "reverb_width": reverb_width,
        "reverb_freeze_mode": reverb_freeze_mode,
        "pitch_shift_semitones": pitch_shift_semitones,
        "limiter_threshold": limiter_threshold,
        "limiter_release": limiter_release_time,
        "gain_db": gain_db,
        "distortion_gain": distortion_gain,
        "chorus_rate": chorus_rate,
        "chorus_depth": chorus_depth,
        "chorus_delay": chorus_center_delay,
        "chorus_feedback": chorus_feedback,
        "chorus_mix": chorus_mix,
        "bitcrush_bit_depth": bitcrush_bit_depth,
        "clipping_threshold": clipping_threshold,
        "compressor_threshold": compressor_threshold,
        "compressor_ratio": compressor_ratio,
        "compressor_attack": compressor_attack,
        "compressor_release": compressor_release,
        "delay_seconds": delay_seconds,
        "delay_feedback": delay_feedback,
        "delay_mix": delay_mix,
    }
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio_batch(**kwargs)

    return f"Files from {input_folder} inferred successfully."
|
321 |
+
|
322 |
+
|
323 |
+
# TTS
|
324 |
+
def run_tts_script(
    tts_file: str,
    tts_text: str,
    tts_voice: str,
    tts_rate: int,
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    output_tts_path: str,
    output_rvc_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    sid: int = 0,
):
    """Synthesize speech with the TTS helper script, then voice-convert it.

    Steps:
        1. Delete ``output_tts_path`` if it already exists.
        2. Run ``rvc/lib/tools/tts.py`` in a subprocess with ``tts_file``,
           ``tts_text``, ``tts_voice``, ``tts_rate`` and ``output_tts_path``
           (all stringified). The exit status is not inspected.
        3. Feed ``output_tts_path`` to ``VoiceConverter.convert_audio``,
           writing to ``output_rvc_path``. Formant shifting, every
           post-processing effect and ``sliders`` are passed as ``None``.

    Returns:
        A ``(message, path)`` tuple: a status message quoting ``tts_text``
        and ``output_rvc_path`` with every ``".wav"`` replaced by
        ``"." + export_format.lower()``.
    """
    tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")

    # Drop any leftover TTS output from an earlier run.
    if os.path.exists(output_tts_path):
        os.remove(output_tts_path)

    tts_arguments = (
        python,
        tts_script_path,
        tts_file,
        tts_text,
        tts_voice,
        tts_rate,
        output_tts_path,
    )
    command_tts = [str(argument) for argument in tts_arguments]
    subprocess.run(command_tts)

    conversion_options = {
        "pitch": pitch,
        "filter_radius": filter_radius,
        "index_rate": index_rate,
        "volume_envelope": volume_envelope,
        "protect": protect,
        "hop_length": hop_length,
        "f0_method": f0_method,
        "audio_input_path": output_tts_path,
        "audio_output_path": output_rvc_path,
        "model_path": pth_path,
        "index_path": index_path,
        "split_audio": split_audio,
        "f0_autotune": f0_autotune,
        "f0_autotune_strength": f0_autotune_strength,
        "clean_audio": clean_audio,
        "clean_strength": clean_strength,
        "export_format": export_format,
        "f0_file": f0_file,
        "embedder_model": embedder_model,
        "embedder_model_custom": embedder_model_custom,
        "sid": sid,
        # Formant shifting and post-processing are passed as None here.
        "formant_shifting": None,
        "formant_qfrency": None,
        "formant_timbre": None,
        "post_process": None,
        "reverb": None,
        "pitch_shift": None,
        "limiter": None,
        "gain": None,
        "distortion": None,
        "chorus": None,
        "bitcrush": None,
        "clipping": None,
        "compressor": None,
        "delay": None,
        "sliders": None,
    }
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(**conversion_options)

    converted_path = output_rvc_path.replace(".wav", f".{export_format.lower()}")
    return f"Text {tts_text} synthesized successfully.", converted_path
|
415 |
+
|
416 |
+
|
417 |
+
# Preprocess
|
418 |
+
def run_preprocess_script(
    model_name: str,
    dataset_path: str,
    sample_rate: int,
    cpu_cores: int,
    cut_preprocess: str,
    process_effects: bool,
    noise_reduction: bool,
    clean_strength: float,
    chunk_len: float,
    overlap_len: float,
):
    """Run the dataset preprocessing script in a subprocess.

    ``rvc/train/preprocess/preprocess.py`` is started with the current
    Python interpreter. Its positional arguments are, in order: the model
    directory (``logs_path/model_name``), then ``dataset_path``,
    ``sample_rate``, ``cpu_cores``, ``cut_preprocess``, ``process_effects``,
    ``noise_reduction``, ``clean_strength``, ``chunk_len`` and
    ``overlap_len``, each converted with ``str``.

    Returns:
        A status message naming ``model_name``. The subprocess exit status
        is not inspected, so the message is returned either way.
    """
    preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py")
    model_directory = os.path.join(logs_path, model_name)

    script_arguments = (
        model_directory,
        dataset_path,
        sample_rate,
        cpu_cores,
        cut_preprocess,
        process_effects,
        noise_reduction,
        clean_strength,
        chunk_len,
        overlap_len,
    )
    command = [python, preprocess_script_path]
    command.extend(str(argument) for argument in script_arguments)

    subprocess.run(command)
    return f"Model {model_name} preprocessed successfully."
|
452 |
+
|
453 |
+
|
454 |
+
# Extract
|
455 |
+
def run_extract_script(
    model_name: str,
    f0_method: str,
    hop_length: int,
    cpu_cores: int,
    gpu: int,
    sample_rate: int,
    embedder_model: str,
    embedder_model_custom: str = None,
    include_mutes: int = 2,
):
    """Run the feature extraction script in a subprocess.

    ``rvc/train/extract/extract.py`` is started with the current Python
    interpreter. Its positional arguments are, in order: the model directory
    (``logs_path/model_name``), then ``f0_method``, ``hop_length``,
    ``cpu_cores``, ``gpu``, ``sample_rate``, ``embedder_model``,
    ``embedder_model_custom`` and ``include_mutes``, each converted with
    ``str`` (so an unset ``embedder_model_custom`` is sent as ``"None"``).

    Returns:
        A status message naming ``model_name``. The subprocess exit status
        is not inspected, so the message is returned either way.
    """
    model_path = os.path.join(logs_path, model_name)
    extract = os.path.join("rvc", "train", "extract", "extract.py")

    script_arguments = (
        model_path,
        f0_method,
        hop_length,
        cpu_cores,
        gpu,
        sample_rate,
        embedder_model,
        embedder_model_custom,
        include_mutes,
    )
    command_1 = [python, extract]
    command_1.extend(str(argument) for argument in script_arguments)

    subprocess.run(command_1)

    return f"Model {model_name} extracted successfully."
|
492 |
+
|
493 |
+
|
494 |
+
# Train
|
495 |
+
def run_train_script(
    model_name: str,
    save_every_epoch: int,
    save_only_latest: bool,
    save_every_weights: bool,
    total_epoch: int,
    sample_rate: int,
    batch_size: int,
    gpu: int,
    overtraining_detector: bool,
    overtraining_threshold: int,
    pretrained: bool,
    cleanup: bool,
    index_algorithm: str = "Auto",
    cache_data_in_gpu: bool = False,
    custom_pretrained: bool = False,
    g_pretrained_path: str = None,
    d_pretrained_path: str = None,
    vocoder: str = "HiFi-GAN",
    checkpointing: bool = False,
):
    """Run the training script in a child process, then build the index.

    The pretrained generator/discriminator paths forwarded to the script
    are chosen as follows:

    * ``pretrained`` falsy: two empty strings.
    * ``pretrained`` truthy and ``custom_pretrained`` truthy:
      ``g_pretrained_path`` and ``d_pretrained_path`` as given.
    * ``pretrained`` truthy and ``custom_pretrained`` falsy: whatever
      ``pretrained_selector(str(vocoder), int(sample_rate))`` returns.

    All values are converted with ``str`` and passed positionally to
    ``rvc/train/train.py``. Afterwards ``run_index_script`` is called with
    ``model_name`` and ``index_algorithm``.

    Returns:
        A fixed confirmation message naming the model.

    Raises:
        ValueError: If custom pretrained models are requested but either
            path is ``None``.
    """
    if not pretrained:
        g_path, d_path = "", ""
    elif custom_pretrained:
        if g_pretrained_path is None or d_pretrained_path is None:
            raise ValueError(
                "Please provide the path to the pretrained G and D models."
            )
        g_path, d_path = g_pretrained_path, d_pretrained_path
    else:
        # Imported lazily and only on the branch that needs it, so the
        # custom-path validation above does not depend on this module.
        from rvc.lib.tools.pretrained_selector import pretrained_selector

        g_path, d_path = pretrained_selector(str(vocoder), int(sample_rate))

    train_script_path = os.path.join("rvc", "train", "train.py")
    command = [
        python,
        train_script_path,
        *map(
            str,
            [
                model_name,
                save_every_epoch,
                total_epoch,
                g_path,
                d_path,
                gpu,
                batch_size,
                sample_rate,
                save_only_latest,
                save_every_weights,
                cache_data_in_gpu,
                overtraining_detector,
                overtraining_threshold,
                cleanup,
                vocoder,
                checkpointing,
            ],
        ),
    ]
    # NOTE(review): the child's exit status is not inspected; indexing runs
    # and the success message is returned even when training fails.
    subprocess.run(command)
    run_index_script(model_name, index_algorithm)
    return f"Model {model_name} trained successfully."
|
560 |
+
|
561 |
+
|
562 |
+
# Index
|
563 |
+
def run_index_script(model_name: str, index_algorithm: str):
    """Run the index extraction script in a child process.

    ``rvc/train/process/extract_index.py`` receives two positional
    arguments: ``logs_path`` joined with ``model_name``, and
    ``index_algorithm`` unchanged.

    Returns:
        A fixed confirmation message naming the model.
    """
    script = os.path.join("rvc", "train", "process", "extract_index.py")
    command = [python, script]
    command.append(os.path.join(logs_path, model_name))
    command.append(index_algorithm)

    # NOTE(review): the child's exit status is not inspected, so the message
    # below is returned even when the script fails.
    subprocess.run(command)
    return f"Index file for {model_name} generated successfully."
|
574 |
+
|
575 |
+
|
576 |
+
# Model information
|
577 |
+
def run_model_information_script(pth_path: str):
    """Print and return what ``model_information`` reports for ``pth_path``.

    ``model_information`` is called once; the same result is printed and
    returned, rather than evaluating the call twice.
    """
    info = model_information(pth_path)
    print(info)
    return info
|
580 |
+
|
581 |
+
|
582 |
+
# Model blender
|
583 |
+
def run_model_blender_script(
    model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
):
    """Call ``model_blender`` and return its two results as a tuple.

    The arguments are forwarded positionally and unchanged. The result is
    unpacked into exactly two values, which are returned in the same order.
    """
    blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
    status_message, blended_model = blended
    return status_message, blended_model
|
588 |
+
|
589 |
+
|
590 |
+
# Tensorboard
|
591 |
+
def run_tensorboard_script() -> None:
    """Call ``launch_tensorboard_pipeline`` with no arguments."""
    launch_tensorboard_pipeline()
    return None
|
593 |
+
|
594 |
+
|
595 |
+
# Download
|
596 |
+
def run_download_script(model_link: str):
    """Pass ``model_link`` to ``model_download_pipeline``.

    Returns:
        A fixed confirmation message. The pipeline's return value is not
        used.
    """
    model_download_pipeline(model_link)
    # Plain literal: the original f-string had no placeholders.
    return "Model downloaded successfully."
|
599 |
+
|
600 |
+
|
601 |
+
# Prerequisites
|
602 |
+
def run_prerequisites_script(
    pretraineds_hifigan: bool,
    models: bool,
    exe: bool,
):
    """Forward the three flags to ``prequisites_download_pipeline``.

    The flags are passed positionally in signature order.

    Returns:
        A fixed confirmation message. The pipeline's return value is not
        used.
    """
    flags = (pretraineds_hifigan, models, exe)
    prequisites_download_pipeline(*flags)
    return "Prerequisites installed successfully."
|
613 |
+
|
614 |
+
|
615 |
+
# Audio analyzer
|
616 |
+
def run_audio_analyzer_script(
    input_path: str, save_plot_path: str = "logs/audio_analysis.png"
):
    """Call ``analyze_audio``, print a summary, and return its two results.

    ``analyze_audio(input_path, save_plot_path)`` is unpacked into an info
    value and a plot path. Both summary strings are passed to one ``print``
    call, so they appear on one line separated by a space.

    Returns:
        The ``(audio_info, plot_path)`` pair obtained from ``analyze_audio``.
    """
    audio_info, plot_path = analyze_audio(input_path, save_plot_path)
    info_line = f"Audio info of {input_path}: {audio_info}"
    saved_line = (
        f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}"
    )
    print(info_line, saved_line)
    return audio_info, plot_path
|
625 |
+
|
626 |
+
|
627 |
+
# Parse arguments
|
628 |
+
def parse_arguments():
|
629 |
+
parser = argparse.ArgumentParser(
|
630 |
+
description="Run the main.py script with specific parameters."
|
631 |
+
)
|
632 |
+
subparsers = parser.add_subparsers(
|
633 |
+
title="subcommands", dest="mode", help="Choose a mode"
|
634 |
+
)
|
635 |
+
|
636 |
+
# Parser for 'infer' mode
|
637 |
+
infer_parser = subparsers.add_parser("infer", help="Run inference")
|
638 |
+
pitch_description = (
|
639 |
+
"Set the pitch of the audio. Higher values result in a higher pitch."
|
640 |
+
)
|
641 |
+
infer_parser.add_argument(
|
642 |
+
"--pitch",
|
643 |
+
type=int,
|
644 |
+
help=pitch_description,
|
645 |
+
choices=range(-24, 25),
|
646 |
+
default=0,
|
647 |
+
)
|
648 |
+
filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio."
|
649 |
+
infer_parser.add_argument(
|
650 |
+
"--filter_radius",
|
651 |
+
type=int,
|
652 |
+
help=filter_radius_description,
|
653 |
+
choices=range(11),
|
654 |
+
default=3,
|
655 |
+
)
|
656 |
+
index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
|
657 |
+
infer_parser.add_argument(
|
658 |
+
"--index_rate",
|
659 |
+
type=float,
|
660 |
+
help=index_rate_description,
|
661 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
662 |
+
default=0.3,
|
663 |
+
)
|
664 |
+
volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
|
665 |
+
infer_parser.add_argument(
|
666 |
+
"--volume_envelope",
|
667 |
+
type=float,
|
668 |
+
help=volume_envelope_description,
|
669 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
670 |
+
default=1,
|
671 |
+
)
|
672 |
+
protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
|
673 |
+
infer_parser.add_argument(
|
674 |
+
"--protect",
|
675 |
+
type=float,
|
676 |
+
help=protect_description,
|
677 |
+
choices=[i / 1000.0 for i in range(0, 501)],
|
678 |
+
default=0.33,
|
679 |
+
)
|
680 |
+
hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy."
|
681 |
+
infer_parser.add_argument(
|
682 |
+
"--hop_length",
|
683 |
+
type=int,
|
684 |
+
help=hop_length_description,
|
685 |
+
choices=range(1, 513),
|
686 |
+
default=128,
|
687 |
+
)
|
688 |
+
f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
|
689 |
+
infer_parser.add_argument(
|
690 |
+
"--f0_method",
|
691 |
+
type=str,
|
692 |
+
help=f0_method_description,
|
693 |
+
choices=[
|
694 |
+
"crepe",
|
695 |
+
"crepe-tiny",
|
696 |
+
"rmvpe",
|
697 |
+
"fcpe",
|
698 |
+
"hybrid[crepe+rmvpe]",
|
699 |
+
"hybrid[crepe+fcpe]",
|
700 |
+
"hybrid[rmvpe+fcpe]",
|
701 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
702 |
+
],
|
703 |
+
default="rmvpe",
|
704 |
+
)
|
705 |
+
infer_parser.add_argument(
|
706 |
+
"--input_path",
|
707 |
+
type=str,
|
708 |
+
help="Full path to the input audio file.",
|
709 |
+
required=True,
|
710 |
+
)
|
711 |
+
infer_parser.add_argument(
|
712 |
+
"--output_path",
|
713 |
+
type=str,
|
714 |
+
help="Full path to the output audio file.",
|
715 |
+
required=True,
|
716 |
+
)
|
717 |
+
pth_path_description = "Full path to the RVC model file (.pth)."
|
718 |
+
infer_parser.add_argument(
|
719 |
+
"--pth_path", type=str, help=pth_path_description, required=True
|
720 |
+
)
|
721 |
+
index_path_description = "Full path to the index file (.index)."
|
722 |
+
infer_parser.add_argument(
|
723 |
+
"--index_path", type=str, help=index_path_description, required=True
|
724 |
+
)
|
725 |
+
split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
|
726 |
+
infer_parser.add_argument(
|
727 |
+
"--split_audio",
|
728 |
+
type=lambda x: bool(strtobool(x)),
|
729 |
+
choices=[True, False],
|
730 |
+
help=split_audio_description,
|
731 |
+
default=False,
|
732 |
+
)
|
733 |
+
f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
|
734 |
+
infer_parser.add_argument(
|
735 |
+
"--f0_autotune",
|
736 |
+
type=lambda x: bool(strtobool(x)),
|
737 |
+
choices=[True, False],
|
738 |
+
help=f0_autotune_description,
|
739 |
+
default=False,
|
740 |
+
)
|
741 |
+
f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
|
742 |
+
infer_parser.add_argument(
|
743 |
+
"--f0_autotune_strength",
|
744 |
+
type=float,
|
745 |
+
help=f0_autotune_strength_description,
|
746 |
+
choices=[(i / 10) for i in range(11)],
|
747 |
+
default=1.0,
|
748 |
+
)
|
749 |
+
clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
|
750 |
+
infer_parser.add_argument(
|
751 |
+
"--clean_audio",
|
752 |
+
type=lambda x: bool(strtobool(x)),
|
753 |
+
choices=[True, False],
|
754 |
+
help=clean_audio_description,
|
755 |
+
default=False,
|
756 |
+
)
|
757 |
+
clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
|
758 |
+
infer_parser.add_argument(
|
759 |
+
"--clean_strength",
|
760 |
+
type=float,
|
761 |
+
help=clean_strength_description,
|
762 |
+
choices=[(i / 10) for i in range(11)],
|
763 |
+
default=0.7,
|
764 |
+
)
|
765 |
+
export_format_description = "Select the desired output audio format."
|
766 |
+
infer_parser.add_argument(
|
767 |
+
"--export_format",
|
768 |
+
type=str,
|
769 |
+
help=export_format_description,
|
770 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
771 |
+
default="WAV",
|
772 |
+
)
|
773 |
+
embedder_model_description = (
|
774 |
+
"Choose the model used for generating speaker embeddings."
|
775 |
+
)
|
776 |
+
infer_parser.add_argument(
|
777 |
+
"--embedder_model",
|
778 |
+
type=str,
|
779 |
+
help=embedder_model_description,
|
780 |
+
choices=[
|
781 |
+
"contentvec",
|
782 |
+
"chinese-hubert-base",
|
783 |
+
"japanese-hubert-base",
|
784 |
+
"korean-hubert-base",
|
785 |
+
"custom",
|
786 |
+
],
|
787 |
+
default="contentvec",
|
788 |
+
)
|
789 |
+
embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
|
790 |
+
infer_parser.add_argument(
|
791 |
+
"--embedder_model_custom",
|
792 |
+
type=str,
|
793 |
+
help=embedder_model_custom_description,
|
794 |
+
default=None,
|
795 |
+
)
|
796 |
+
f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
|
797 |
+
infer_parser.add_argument(
|
798 |
+
"--f0_file",
|
799 |
+
type=str,
|
800 |
+
help=f0_file_description,
|
801 |
+
default=None,
|
802 |
+
)
|
803 |
+
formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
|
804 |
+
infer_parser.add_argument(
|
805 |
+
"--formant_shifting",
|
806 |
+
type=lambda x: bool(strtobool(x)),
|
807 |
+
choices=[True, False],
|
808 |
+
help=formant_shifting_description,
|
809 |
+
default=False,
|
810 |
+
required=False,
|
811 |
+
)
|
812 |
+
formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
|
813 |
+
infer_parser.add_argument(
|
814 |
+
"--formant_qfrency",
|
815 |
+
type=float,
|
816 |
+
help=formant_qfrency_description,
|
817 |
+
default=1.0,
|
818 |
+
required=False,
|
819 |
+
)
|
820 |
+
formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
|
821 |
+
infer_parser.add_argument(
|
822 |
+
"--formant_timbre",
|
823 |
+
type=float,
|
824 |
+
help=formant_timbre_description,
|
825 |
+
default=1.0,
|
826 |
+
required=False,
|
827 |
+
)
|
828 |
+
sid_description = "Speaker ID for multi-speaker models."
|
829 |
+
infer_parser.add_argument(
|
830 |
+
"--sid",
|
831 |
+
type=int,
|
832 |
+
help=sid_description,
|
833 |
+
default=0,
|
834 |
+
required=False,
|
835 |
+
)
|
836 |
+
post_process_description = "Apply post-processing effects to the output audio."
|
837 |
+
infer_parser.add_argument(
|
838 |
+
"--post_process",
|
839 |
+
type=lambda x: bool(strtobool(x)),
|
840 |
+
choices=[True, False],
|
841 |
+
help=post_process_description,
|
842 |
+
default=False,
|
843 |
+
required=False,
|
844 |
+
)
|
845 |
+
reverb_description = "Apply reverb effect to the output audio."
|
846 |
+
infer_parser.add_argument(
|
847 |
+
"--reverb",
|
848 |
+
type=lambda x: bool(strtobool(x)),
|
849 |
+
choices=[True, False],
|
850 |
+
help=reverb_description,
|
851 |
+
default=False,
|
852 |
+
required=False,
|
853 |
+
)
|
854 |
+
|
855 |
+
pitch_shift_description = "Apply pitch shifting effect to the output audio."
|
856 |
+
infer_parser.add_argument(
|
857 |
+
"--pitch_shift",
|
858 |
+
type=lambda x: bool(strtobool(x)),
|
859 |
+
choices=[True, False],
|
860 |
+
help=pitch_shift_description,
|
861 |
+
default=False,
|
862 |
+
required=False,
|
863 |
+
)
|
864 |
+
|
865 |
+
limiter_description = "Apply limiter effect to the output audio."
|
866 |
+
infer_parser.add_argument(
|
867 |
+
"--limiter",
|
868 |
+
type=lambda x: bool(strtobool(x)),
|
869 |
+
choices=[True, False],
|
870 |
+
help=limiter_description,
|
871 |
+
default=False,
|
872 |
+
required=False,
|
873 |
+
)
|
874 |
+
|
875 |
+
gain_description = "Apply gain effect to the output audio."
|
876 |
+
infer_parser.add_argument(
|
877 |
+
"--gain",
|
878 |
+
type=lambda x: bool(strtobool(x)),
|
879 |
+
choices=[True, False],
|
880 |
+
help=gain_description,
|
881 |
+
default=False,
|
882 |
+
required=False,
|
883 |
+
)
|
884 |
+
|
885 |
+
distortion_description = "Apply distortion effect to the output audio."
|
886 |
+
infer_parser.add_argument(
|
887 |
+
"--distortion",
|
888 |
+
type=lambda x: bool(strtobool(x)),
|
889 |
+
choices=[True, False],
|
890 |
+
help=distortion_description,
|
891 |
+
default=False,
|
892 |
+
required=False,
|
893 |
+
)
|
894 |
+
|
895 |
+
chorus_description = "Apply chorus effect to the output audio."
|
896 |
+
infer_parser.add_argument(
|
897 |
+
"--chorus",
|
898 |
+
type=lambda x: bool(strtobool(x)),
|
899 |
+
choices=[True, False],
|
900 |
+
help=chorus_description,
|
901 |
+
default=False,
|
902 |
+
required=False,
|
903 |
+
)
|
904 |
+
|
905 |
+
bitcrush_description = "Apply bitcrush effect to the output audio."
|
906 |
+
infer_parser.add_argument(
|
907 |
+
"--bitcrush",
|
908 |
+
type=lambda x: bool(strtobool(x)),
|
909 |
+
choices=[True, False],
|
910 |
+
help=bitcrush_description,
|
911 |
+
default=False,
|
912 |
+
required=False,
|
913 |
+
)
|
914 |
+
|
915 |
+
clipping_description = "Apply clipping effect to the output audio."
|
916 |
+
infer_parser.add_argument(
|
917 |
+
"--clipping",
|
918 |
+
type=lambda x: bool(strtobool(x)),
|
919 |
+
choices=[True, False],
|
920 |
+
help=clipping_description,
|
921 |
+
default=False,
|
922 |
+
required=False,
|
923 |
+
)
|
924 |
+
|
925 |
+
compressor_description = "Apply compressor effect to the output audio."
|
926 |
+
infer_parser.add_argument(
|
927 |
+
"--compressor",
|
928 |
+
type=lambda x: bool(strtobool(x)),
|
929 |
+
choices=[True, False],
|
930 |
+
help=compressor_description,
|
931 |
+
default=False,
|
932 |
+
required=False,
|
933 |
+
)
|
934 |
+
|
935 |
+
delay_description = "Apply delay effect to the output audio."
|
936 |
+
infer_parser.add_argument(
|
937 |
+
"--delay",
|
938 |
+
type=lambda x: bool(strtobool(x)),
|
939 |
+
choices=[True, False],
|
940 |
+
help=delay_description,
|
941 |
+
default=False,
|
942 |
+
required=False,
|
943 |
+
)
|
944 |
+
|
945 |
+
reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
|
946 |
+
infer_parser.add_argument(
|
947 |
+
"--reverb_room_size",
|
948 |
+
type=float,
|
949 |
+
help=reverb_room_size_description,
|
950 |
+
default=0.5,
|
951 |
+
required=False,
|
952 |
+
)
|
953 |
+
|
954 |
+
reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
|
955 |
+
infer_parser.add_argument(
|
956 |
+
"--reverb_damping",
|
957 |
+
type=float,
|
958 |
+
help=reverb_damping_description,
|
959 |
+
default=0.5,
|
960 |
+
required=False,
|
961 |
+
)
|
962 |
+
|
963 |
+
reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
|
964 |
+
infer_parser.add_argument(
|
965 |
+
"--reverb_wet_gain",
|
966 |
+
type=float,
|
967 |
+
help=reverb_wet_gain_description,
|
968 |
+
default=0.5,
|
969 |
+
required=False,
|
970 |
+
)
|
971 |
+
|
972 |
+
reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
|
973 |
+
infer_parser.add_argument(
|
974 |
+
"--reverb_dry_gain",
|
975 |
+
type=float,
|
976 |
+
help=reverb_dry_gain_description,
|
977 |
+
default=0.5,
|
978 |
+
required=False,
|
979 |
+
)
|
980 |
+
|
981 |
+
reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
|
982 |
+
infer_parser.add_argument(
|
983 |
+
"--reverb_width",
|
984 |
+
type=float,
|
985 |
+
help=reverb_width_description,
|
986 |
+
default=0.5,
|
987 |
+
required=False,
|
988 |
+
)
|
989 |
+
|
990 |
+
reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
|
991 |
+
infer_parser.add_argument(
|
992 |
+
"--reverb_freeze_mode",
|
993 |
+
type=float,
|
994 |
+
help=reverb_freeze_mode_description,
|
995 |
+
default=0.5,
|
996 |
+
required=False,
|
997 |
+
)
|
998 |
+
|
999 |
+
pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
|
1000 |
+
infer_parser.add_argument(
|
1001 |
+
"--pitch_shift_semitones",
|
1002 |
+
type=float,
|
1003 |
+
help=pitch_shift_semitones_description,
|
1004 |
+
default=0.0,
|
1005 |
+
required=False,
|
1006 |
+
)
|
1007 |
+
|
1008 |
+
limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect."
|
1009 |
+
infer_parser.add_argument(
|
1010 |
+
"--limiter_threshold",
|
1011 |
+
type=float,
|
1012 |
+
help=limiter_threshold_description,
|
1013 |
+
default=-6,
|
1014 |
+
required=False,
|
1015 |
+
)
|
1016 |
+
|
1017 |
+
limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
|
1018 |
+
infer_parser.add_argument(
|
1019 |
+
"--limiter_release_time",
|
1020 |
+
type=float,
|
1021 |
+
help=limiter_release_time_description,
|
1022 |
+
default=0.01,
|
1023 |
+
required=False,
|
1024 |
+
)
|
1025 |
+
|
1026 |
+
gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
|
1027 |
+
infer_parser.add_argument(
|
1028 |
+
"--gain_db",
|
1029 |
+
type=float,
|
1030 |
+
help=gain_db_description,
|
1031 |
+
default=0.0,
|
1032 |
+
required=False,
|
1033 |
+
)
|
1034 |
+
|
1035 |
+
distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
|
1036 |
+
infer_parser.add_argument(
|
1037 |
+
"--distortion_gain",
|
1038 |
+
type=float,
|
1039 |
+
help=distortion_gain_description,
|
1040 |
+
default=25,
|
1041 |
+
required=False,
|
1042 |
+
)
|
1043 |
+
|
1044 |
+
chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
|
1045 |
+
infer_parser.add_argument(
|
1046 |
+
"--chorus_rate",
|
1047 |
+
type=float,
|
1048 |
+
help=chorus_rate_description,
|
1049 |
+
default=1.0,
|
1050 |
+
required=False,
|
1051 |
+
)
|
1052 |
+
|
1053 |
+
chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
|
1054 |
+
infer_parser.add_argument(
|
1055 |
+
"--chorus_depth",
|
1056 |
+
type=float,
|
1057 |
+
help=chorus_depth_description,
|
1058 |
+
default=0.25,
|
1059 |
+
required=False,
|
1060 |
+
)
|
1061 |
+
|
1062 |
+
chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
|
1063 |
+
infer_parser.add_argument(
|
1064 |
+
"--chorus_center_delay",
|
1065 |
+
type=float,
|
1066 |
+
help=chorus_center_delay_description,
|
1067 |
+
default=7,
|
1068 |
+
required=False,
|
1069 |
+
)
|
1070 |
+
|
1071 |
+
chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
|
1072 |
+
infer_parser.add_argument(
|
1073 |
+
"--chorus_feedback",
|
1074 |
+
type=float,
|
1075 |
+
help=chorus_feedback_description,
|
1076 |
+
default=0.0,
|
1077 |
+
required=False,
|
1078 |
+
)
|
1079 |
+
|
1080 |
+
chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
|
1081 |
+
infer_parser.add_argument(
|
1082 |
+
"--chorus_mix",
|
1083 |
+
type=float,
|
1084 |
+
help=chorus_mix_description,
|
1085 |
+
default=0.5,
|
1086 |
+
required=False,
|
1087 |
+
)
|
1088 |
+
|
1089 |
+
bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect."
|
1090 |
+
infer_parser.add_argument(
|
1091 |
+
"--bitcrush_bit_depth",
|
1092 |
+
type=int,
|
1093 |
+
help=bitcrush_bit_depth_description,
|
1094 |
+
default=8,
|
1095 |
+
required=False,
|
1096 |
+
)
|
1097 |
+
|
1098 |
+
clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect."
|
1099 |
+
infer_parser.add_argument(
|
1100 |
+
"--clipping_threshold",
|
1101 |
+
type=float,
|
1102 |
+
help=clipping_threshold_description,
|
1103 |
+
default=-6,
|
1104 |
+
required=False,
|
1105 |
+
)
|
1106 |
+
|
1107 |
+
compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect."
|
1108 |
+
infer_parser.add_argument(
|
1109 |
+
"--compressor_threshold",
|
1110 |
+
type=float,
|
1111 |
+
help=compressor_threshold_description,
|
1112 |
+
default=0,
|
1113 |
+
required=False,
|
1114 |
+
)
|
1115 |
+
|
1116 |
+
compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
|
1117 |
+
infer_parser.add_argument(
|
1118 |
+
"--compressor_ratio",
|
1119 |
+
type=float,
|
1120 |
+
help=compressor_ratio_description,
|
1121 |
+
default=1,
|
1122 |
+
required=False,
|
1123 |
+
)
|
1124 |
+
|
1125 |
+
compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect."
|
1126 |
+
infer_parser.add_argument(
|
1127 |
+
"--compressor_attack",
|
1128 |
+
type=float,
|
1129 |
+
help=compressor_attack_description,
|
1130 |
+
default=1.0,
|
1131 |
+
required=False,
|
1132 |
+
)
|
1133 |
+
|
1134 |
+
compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect."
|
1135 |
+
infer_parser.add_argument(
|
1136 |
+
"--compressor_release",
|
1137 |
+
type=float,
|
1138 |
+
help=compressor_release_description,
|
1139 |
+
default=100,
|
1140 |
+
required=False,
|
1141 |
+
)
|
1142 |
+
|
1143 |
+
delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
|
1144 |
+
infer_parser.add_argument(
|
1145 |
+
"--delay_seconds",
|
1146 |
+
type=float,
|
1147 |
+
help=delay_seconds_description,
|
1148 |
+
default=0.5,
|
1149 |
+
required=False,
|
1150 |
+
)
|
1151 |
+
delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
|
1152 |
+
infer_parser.add_argument(
|
1153 |
+
"--delay_feedback",
|
1154 |
+
type=float,
|
1155 |
+
help=delay_feedback_description,
|
1156 |
+
default=0.0,
|
1157 |
+
required=False,
|
1158 |
+
)
|
1159 |
+
delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
|
1160 |
+
infer_parser.add_argument(
|
1161 |
+
"--delay_mix",
|
1162 |
+
type=float,
|
1163 |
+
help=delay_mix_description,
|
1164 |
+
default=0.5,
|
1165 |
+
required=False,
|
1166 |
+
)
|
1167 |
+
|
1168 |
+
# Parser for 'batch_infer' mode
|
1169 |
+
batch_infer_parser = subparsers.add_parser(
|
1170 |
+
"batch_infer",
|
1171 |
+
help="Run batch inference",
|
1172 |
+
)
|
1173 |
+
batch_infer_parser.add_argument(
|
1174 |
+
"--pitch",
|
1175 |
+
type=int,
|
1176 |
+
help=pitch_description,
|
1177 |
+
choices=range(-24, 25),
|
1178 |
+
default=0,
|
1179 |
+
)
|
1180 |
+
batch_infer_parser.add_argument(
|
1181 |
+
"--filter_radius",
|
1182 |
+
type=int,
|
1183 |
+
help=filter_radius_description,
|
1184 |
+
choices=range(11),
|
1185 |
+
default=3,
|
1186 |
+
)
|
1187 |
+
batch_infer_parser.add_argument(
|
1188 |
+
"--index_rate",
|
1189 |
+
type=float,
|
1190 |
+
help=index_rate_description,
|
1191 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
1192 |
+
default=0.3,
|
1193 |
+
)
|
1194 |
+
batch_infer_parser.add_argument(
|
1195 |
+
"--volume_envelope",
|
1196 |
+
type=float,
|
1197 |
+
help=volume_envelope_description,
|
1198 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
1199 |
+
default=1,
|
1200 |
+
)
|
1201 |
+
batch_infer_parser.add_argument(
|
1202 |
+
"--protect",
|
1203 |
+
type=float,
|
1204 |
+
help=protect_description,
|
1205 |
+
choices=[i / 1000.0 for i in range(0, 501)],
|
1206 |
+
default=0.33,
|
1207 |
+
)
|
1208 |
+
batch_infer_parser.add_argument(
|
1209 |
+
"--hop_length",
|
1210 |
+
type=int,
|
1211 |
+
help=hop_length_description,
|
1212 |
+
choices=range(1, 513),
|
1213 |
+
default=128,
|
1214 |
+
)
|
1215 |
+
batch_infer_parser.add_argument(
|
1216 |
+
"--f0_method",
|
1217 |
+
type=str,
|
1218 |
+
help=f0_method_description,
|
1219 |
+
choices=[
|
1220 |
+
"crepe",
|
1221 |
+
"crepe-tiny",
|
1222 |
+
"rmvpe",
|
1223 |
+
"fcpe",
|
1224 |
+
"hybrid[crepe+rmvpe]",
|
1225 |
+
"hybrid[crepe+fcpe]",
|
1226 |
+
"hybrid[rmvpe+fcpe]",
|
1227 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
1228 |
+
],
|
1229 |
+
default="rmvpe",
|
1230 |
+
)
|
1231 |
+
batch_infer_parser.add_argument(
|
1232 |
+
"--input_folder",
|
1233 |
+
type=str,
|
1234 |
+
help="Path to the folder containing input audio files.",
|
1235 |
+
required=True,
|
1236 |
+
)
|
1237 |
+
batch_infer_parser.add_argument(
|
1238 |
+
"--output_folder",
|
1239 |
+
type=str,
|
1240 |
+
help="Path to the folder for saving output audio files.",
|
1241 |
+
required=True,
|
1242 |
+
)
|
1243 |
+
batch_infer_parser.add_argument(
|
1244 |
+
"--pth_path", type=str, help=pth_path_description, required=True
|
1245 |
+
)
|
1246 |
+
batch_infer_parser.add_argument(
|
1247 |
+
"--index_path", type=str, help=index_path_description, required=True
|
1248 |
+
)
|
1249 |
+
batch_infer_parser.add_argument(
|
1250 |
+
"--split_audio",
|
1251 |
+
type=lambda x: bool(strtobool(x)),
|
1252 |
+
choices=[True, False],
|
1253 |
+
help=split_audio_description,
|
1254 |
+
default=False,
|
1255 |
+
)
|
1256 |
+
batch_infer_parser.add_argument(
|
1257 |
+
"--f0_autotune",
|
1258 |
+
type=lambda x: bool(strtobool(x)),
|
1259 |
+
choices=[True, False],
|
1260 |
+
help=f0_autotune_description,
|
1261 |
+
default=False,
|
1262 |
+
)
|
1263 |
+
batch_infer_parser.add_argument(
|
1264 |
+
"--f0_autotune_strength",
|
1265 |
+
type=float,
|
1266 |
+
help=clean_strength_description,
|
1267 |
+
choices=[(i / 10) for i in range(11)],
|
1268 |
+
default=1.0,
|
1269 |
+
)
|
1270 |
+
batch_infer_parser.add_argument(
|
1271 |
+
"--clean_audio",
|
1272 |
+
type=lambda x: bool(strtobool(x)),
|
1273 |
+
choices=[True, False],
|
1274 |
+
help=clean_audio_description,
|
1275 |
+
default=False,
|
1276 |
+
)
|
1277 |
+
batch_infer_parser.add_argument(
|
1278 |
+
"--clean_strength",
|
1279 |
+
type=float,
|
1280 |
+
help=clean_strength_description,
|
1281 |
+
choices=[(i / 10) for i in range(11)],
|
1282 |
+
default=0.7,
|
1283 |
+
)
|
1284 |
+
batch_infer_parser.add_argument(
|
1285 |
+
"--export_format",
|
1286 |
+
type=str,
|
1287 |
+
help=export_format_description,
|
1288 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
1289 |
+
default="WAV",
|
1290 |
+
)
|
1291 |
+
batch_infer_parser.add_argument(
|
1292 |
+
"--embedder_model",
|
1293 |
+
type=str,
|
1294 |
+
help=embedder_model_description,
|
1295 |
+
choices=[
|
1296 |
+
"contentvec",
|
1297 |
+
"chinese-hubert-base",
|
1298 |
+
"japanese-hubert-base",
|
1299 |
+
"korean-hubert-base",
|
1300 |
+
"custom",
|
1301 |
+
],
|
1302 |
+
default="contentvec",
|
1303 |
+
)
|
1304 |
+
batch_infer_parser.add_argument(
|
1305 |
+
"--embedder_model_custom",
|
1306 |
+
type=str,
|
1307 |
+
help=embedder_model_custom_description,
|
1308 |
+
default=None,
|
1309 |
+
)
|
1310 |
+
batch_infer_parser.add_argument(
|
1311 |
+
"--f0_file",
|
1312 |
+
type=str,
|
1313 |
+
help=f0_file_description,
|
1314 |
+
default=None,
|
1315 |
+
)
|
1316 |
+
batch_infer_parser.add_argument(
|
1317 |
+
"--formant_shifting",
|
1318 |
+
type=lambda x: bool(strtobool(x)),
|
1319 |
+
choices=[True, False],
|
1320 |
+
help=formant_shifting_description,
|
1321 |
+
default=False,
|
1322 |
+
required=False,
|
1323 |
+
)
|
1324 |
+
batch_infer_parser.add_argument(
|
1325 |
+
"--formant_qfrency",
|
1326 |
+
type=float,
|
1327 |
+
help=formant_qfrency_description,
|
1328 |
+
default=1.0,
|
1329 |
+
required=False,
|
1330 |
+
)
|
1331 |
+
batch_infer_parser.add_argument(
|
1332 |
+
"--formant_timbre",
|
1333 |
+
type=float,
|
1334 |
+
help=formant_timbre_description,
|
1335 |
+
default=1.0,
|
1336 |
+
required=False,
|
1337 |
+
)
|
1338 |
+
batch_infer_parser.add_argument(
|
1339 |
+
"--sid",
|
1340 |
+
type=int,
|
1341 |
+
help=sid_description,
|
1342 |
+
default=0,
|
1343 |
+
required=False,
|
1344 |
+
)
|
1345 |
+
batch_infer_parser.add_argument(
|
1346 |
+
"--post_process",
|
1347 |
+
type=lambda x: bool(strtobool(x)),
|
1348 |
+
choices=[True, False],
|
1349 |
+
help=post_process_description,
|
1350 |
+
default=False,
|
1351 |
+
required=False,
|
1352 |
+
)
|
1353 |
+
batch_infer_parser.add_argument(
|
1354 |
+
"--reverb",
|
1355 |
+
type=lambda x: bool(strtobool(x)),
|
1356 |
+
choices=[True, False],
|
1357 |
+
help=reverb_description,
|
1358 |
+
default=False,
|
1359 |
+
required=False,
|
1360 |
+
)
|
1361 |
+
|
1362 |
+
batch_infer_parser.add_argument(
|
1363 |
+
"--pitch_shift",
|
1364 |
+
type=lambda x: bool(strtobool(x)),
|
1365 |
+
choices=[True, False],
|
1366 |
+
help=pitch_shift_description,
|
1367 |
+
default=False,
|
1368 |
+
required=False,
|
1369 |
+
)
|
1370 |
+
|
1371 |
+
batch_infer_parser.add_argument(
|
1372 |
+
"--limiter",
|
1373 |
+
type=lambda x: bool(strtobool(x)),
|
1374 |
+
choices=[True, False],
|
1375 |
+
help=limiter_description,
|
1376 |
+
default=False,
|
1377 |
+
required=False,
|
1378 |
+
)
|
1379 |
+
|
1380 |
+
batch_infer_parser.add_argument(
|
1381 |
+
"--gain",
|
1382 |
+
type=lambda x: bool(strtobool(x)),
|
1383 |
+
choices=[True, False],
|
1384 |
+
help=gain_description,
|
1385 |
+
default=False,
|
1386 |
+
required=False,
|
1387 |
+
)
|
1388 |
+
|
1389 |
+
batch_infer_parser.add_argument(
|
1390 |
+
"--distortion",
|
1391 |
+
type=lambda x: bool(strtobool(x)),
|
1392 |
+
choices=[True, False],
|
1393 |
+
help=distortion_description,
|
1394 |
+
default=False,
|
1395 |
+
required=False,
|
1396 |
+
)
|
1397 |
+
|
1398 |
+
batch_infer_parser.add_argument(
|
1399 |
+
"--chorus",
|
1400 |
+
type=lambda x: bool(strtobool(x)),
|
1401 |
+
choices=[True, False],
|
1402 |
+
help=chorus_description,
|
1403 |
+
default=False,
|
1404 |
+
required=False,
|
1405 |
+
)
|
1406 |
+
|
1407 |
+
batch_infer_parser.add_argument(
|
1408 |
+
"--bitcrush",
|
1409 |
+
type=lambda x: bool(strtobool(x)),
|
1410 |
+
choices=[True, False],
|
1411 |
+
help=bitcrush_description,
|
1412 |
+
default=False,
|
1413 |
+
required=False,
|
1414 |
+
)
|
1415 |
+
|
1416 |
+
batch_infer_parser.add_argument(
|
1417 |
+
"--clipping",
|
1418 |
+
type=lambda x: bool(strtobool(x)),
|
1419 |
+
choices=[True, False],
|
1420 |
+
help=clipping_description,
|
1421 |
+
default=False,
|
1422 |
+
required=False,
|
1423 |
+
)
|
1424 |
+
|
1425 |
+
batch_infer_parser.add_argument(
|
1426 |
+
"--compressor",
|
1427 |
+
type=lambda x: bool(strtobool(x)),
|
1428 |
+
choices=[True, False],
|
1429 |
+
help=compressor_description,
|
1430 |
+
default=False,
|
1431 |
+
required=False,
|
1432 |
+
)
|
1433 |
+
|
1434 |
+
batch_infer_parser.add_argument(
|
1435 |
+
"--delay",
|
1436 |
+
type=lambda x: bool(strtobool(x)),
|
1437 |
+
choices=[True, False],
|
1438 |
+
help=delay_description,
|
1439 |
+
default=False,
|
1440 |
+
required=False,
|
1441 |
+
)
|
1442 |
+
|
1443 |
+
batch_infer_parser.add_argument(
|
1444 |
+
"--reverb_room_size",
|
1445 |
+
type=float,
|
1446 |
+
help=reverb_room_size_description,
|
1447 |
+
default=0.5,
|
1448 |
+
required=False,
|
1449 |
+
)
|
1450 |
+
|
1451 |
+
batch_infer_parser.add_argument(
|
1452 |
+
"--reverb_damping",
|
1453 |
+
type=float,
|
1454 |
+
help=reverb_damping_description,
|
1455 |
+
default=0.5,
|
1456 |
+
required=False,
|
1457 |
+
)
|
1458 |
+
|
1459 |
+
batch_infer_parser.add_argument(
|
1460 |
+
"--reverb_wet_gain",
|
1461 |
+
type=float,
|
1462 |
+
help=reverb_wet_gain_description,
|
1463 |
+
default=0.5,
|
1464 |
+
required=False,
|
1465 |
+
)
|
1466 |
+
|
1467 |
+
batch_infer_parser.add_argument(
|
1468 |
+
"--reverb_dry_gain",
|
1469 |
+
type=float,
|
1470 |
+
help=reverb_dry_gain_description,
|
1471 |
+
default=0.5,
|
1472 |
+
required=False,
|
1473 |
+
)
|
1474 |
+
|
1475 |
+
batch_infer_parser.add_argument(
|
1476 |
+
"--reverb_width",
|
1477 |
+
type=float,
|
1478 |
+
help=reverb_width_description,
|
1479 |
+
default=0.5,
|
1480 |
+
required=False,
|
1481 |
+
)
|
1482 |
+
|
1483 |
+
batch_infer_parser.add_argument(
|
1484 |
+
"--reverb_freeze_mode",
|
1485 |
+
type=float,
|
1486 |
+
help=reverb_freeze_mode_description,
|
1487 |
+
default=0.5,
|
1488 |
+
required=False,
|
1489 |
+
)
|
1490 |
+
|
1491 |
+
batch_infer_parser.add_argument(
|
1492 |
+
"--pitch_shift_semitones",
|
1493 |
+
type=float,
|
1494 |
+
help=pitch_shift_semitones_description,
|
1495 |
+
default=0.0,
|
1496 |
+
required=False,
|
1497 |
+
)
|
1498 |
+
|
1499 |
+
batch_infer_parser.add_argument(
|
1500 |
+
"--limiter_threshold",
|
1501 |
+
type=float,
|
1502 |
+
help=limiter_threshold_description,
|
1503 |
+
default=-6,
|
1504 |
+
required=False,
|
1505 |
+
)
|
1506 |
+
|
1507 |
+
batch_infer_parser.add_argument(
|
1508 |
+
"--limiter_release_time",
|
1509 |
+
type=float,
|
1510 |
+
help=limiter_release_time_description,
|
1511 |
+
default=0.01,
|
1512 |
+
required=False,
|
1513 |
+
)
|
1514 |
+
batch_infer_parser.add_argument(
|
1515 |
+
"--gain_db",
|
1516 |
+
type=float,
|
1517 |
+
help=gain_db_description,
|
1518 |
+
default=0.0,
|
1519 |
+
required=False,
|
1520 |
+
)
|
1521 |
+
|
1522 |
+
batch_infer_parser.add_argument(
|
1523 |
+
"--distortion_gain",
|
1524 |
+
type=float,
|
1525 |
+
help=distortion_gain_description,
|
1526 |
+
default=25,
|
1527 |
+
required=False,
|
1528 |
+
)
|
1529 |
+
|
1530 |
+
batch_infer_parser.add_argument(
|
1531 |
+
"--chorus_rate",
|
1532 |
+
type=float,
|
1533 |
+
help=chorus_rate_description,
|
1534 |
+
default=1.0,
|
1535 |
+
required=False,
|
1536 |
+
)
|
1537 |
+
|
1538 |
+
batch_infer_parser.add_argument(
|
1539 |
+
"--chorus_depth",
|
1540 |
+
type=float,
|
1541 |
+
help=chorus_depth_description,
|
1542 |
+
default=0.25,
|
1543 |
+
required=False,
|
1544 |
+
)
|
1545 |
+
batch_infer_parser.add_argument(
|
1546 |
+
"--chorus_center_delay",
|
1547 |
+
type=float,
|
1548 |
+
help=chorus_center_delay_description,
|
1549 |
+
default=7,
|
1550 |
+
required=False,
|
1551 |
+
)
|
1552 |
+
|
1553 |
+
batch_infer_parser.add_argument(
|
1554 |
+
"--chorus_feedback",
|
1555 |
+
type=float,
|
1556 |
+
help=chorus_feedback_description,
|
1557 |
+
default=0.0,
|
1558 |
+
required=False,
|
1559 |
+
)
|
1560 |
+
|
1561 |
+
batch_infer_parser.add_argument(
|
1562 |
+
"--chorus_mix",
|
1563 |
+
type=float,
|
1564 |
+
help=chorus_mix_description,
|
1565 |
+
default=0.5,
|
1566 |
+
required=False,
|
1567 |
+
)
|
1568 |
+
|
1569 |
+
batch_infer_parser.add_argument(
|
1570 |
+
"--bitcrush_bit_depth",
|
1571 |
+
type=int,
|
1572 |
+
help=bitcrush_bit_depth_description,
|
1573 |
+
default=8,
|
1574 |
+
required=False,
|
1575 |
+
)
|
1576 |
+
|
1577 |
+
batch_infer_parser.add_argument(
|
1578 |
+
"--clipping_threshold",
|
1579 |
+
type=float,
|
1580 |
+
help=clipping_threshold_description,
|
1581 |
+
default=-6,
|
1582 |
+
required=False,
|
1583 |
+
)
|
1584 |
+
|
1585 |
+
batch_infer_parser.add_argument(
|
1586 |
+
"--compressor_threshold",
|
1587 |
+
type=float,
|
1588 |
+
help=compressor_threshold_description,
|
1589 |
+
default=0,
|
1590 |
+
required=False,
|
1591 |
+
)
|
1592 |
+
|
1593 |
+
batch_infer_parser.add_argument(
|
1594 |
+
"--compressor_ratio",
|
1595 |
+
type=float,
|
1596 |
+
help=compressor_ratio_description,
|
1597 |
+
default=1,
|
1598 |
+
required=False,
|
1599 |
+
)
|
1600 |
+
|
1601 |
+
batch_infer_parser.add_argument(
|
1602 |
+
"--compressor_attack",
|
1603 |
+
type=float,
|
1604 |
+
help=compressor_attack_description,
|
1605 |
+
default=1.0,
|
1606 |
+
required=False,
|
1607 |
+
)
|
1608 |
+
|
1609 |
+
batch_infer_parser.add_argument(
|
1610 |
+
"--compressor_release",
|
1611 |
+
type=float,
|
1612 |
+
help=compressor_release_description,
|
1613 |
+
default=100,
|
1614 |
+
required=False,
|
1615 |
+
)
|
1616 |
+
batch_infer_parser.add_argument(
|
1617 |
+
"--delay_seconds",
|
1618 |
+
type=float,
|
1619 |
+
help=delay_seconds_description,
|
1620 |
+
default=0.5,
|
1621 |
+
required=False,
|
1622 |
+
)
|
1623 |
+
batch_infer_parser.add_argument(
|
1624 |
+
"--delay_feedback",
|
1625 |
+
type=float,
|
1626 |
+
help=delay_feedback_description,
|
1627 |
+
default=0.0,
|
1628 |
+
required=False,
|
1629 |
+
)
|
1630 |
+
batch_infer_parser.add_argument(
|
1631 |
+
"--delay_mix",
|
1632 |
+
type=float,
|
1633 |
+
help=delay_mix_description,
|
1634 |
+
default=0.5,
|
1635 |
+
required=False,
|
1636 |
+
)
|
1637 |
+
|
1638 |
+
# Parser for 'tts' mode
|
1639 |
+
tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
|
1640 |
+
tts_parser.add_argument(
|
1641 |
+
"--tts_file", type=str, help="File with a text to be synthesized", required=True
|
1642 |
+
)
|
1643 |
+
tts_parser.add_argument(
|
1644 |
+
"--tts_text", type=str, help="Text to be synthesized", required=True
|
1645 |
+
)
|
1646 |
+
tts_parser.add_argument(
|
1647 |
+
"--tts_voice",
|
1648 |
+
type=str,
|
1649 |
+
help="Voice to be used for TTS synthesis.",
|
1650 |
+
choices=locales,
|
1651 |
+
required=True,
|
1652 |
+
)
|
1653 |
+
tts_parser.add_argument(
|
1654 |
+
"--tts_rate",
|
1655 |
+
type=int,
|
1656 |
+
help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
|
1657 |
+
choices=range(-100, 101),
|
1658 |
+
default=0,
|
1659 |
+
)
|
1660 |
+
tts_parser.add_argument(
|
1661 |
+
"--pitch",
|
1662 |
+
type=int,
|
1663 |
+
help=pitch_description,
|
1664 |
+
choices=range(-24, 25),
|
1665 |
+
default=0,
|
1666 |
+
)
|
1667 |
+
tts_parser.add_argument(
|
1668 |
+
"--filter_radius",
|
1669 |
+
type=int,
|
1670 |
+
help=filter_radius_description,
|
1671 |
+
choices=range(11),
|
1672 |
+
default=3,
|
1673 |
+
)
|
1674 |
+
tts_parser.add_argument(
|
1675 |
+
"--index_rate",
|
1676 |
+
type=float,
|
1677 |
+
help=index_rate_description,
|
1678 |
+
choices=[(i / 10) for i in range(11)],
|
1679 |
+
default=0.3,
|
1680 |
+
)
|
1681 |
+
tts_parser.add_argument(
|
1682 |
+
"--volume_envelope",
|
1683 |
+
type=float,
|
1684 |
+
help=volume_envelope_description,
|
1685 |
+
choices=[(i / 10) for i in range(11)],
|
1686 |
+
default=1,
|
1687 |
+
)
|
1688 |
+
tts_parser.add_argument(
|
1689 |
+
"--protect",
|
1690 |
+
type=float,
|
1691 |
+
help=protect_description,
|
1692 |
+
choices=[(i / 10) for i in range(6)],
|
1693 |
+
default=0.33,
|
1694 |
+
)
|
1695 |
+
tts_parser.add_argument(
|
1696 |
+
"--hop_length",
|
1697 |
+
type=int,
|
1698 |
+
help=hop_length_description,
|
1699 |
+
choices=range(1, 513),
|
1700 |
+
default=128,
|
1701 |
+
)
|
1702 |
+
tts_parser.add_argument(
|
1703 |
+
"--f0_method",
|
1704 |
+
type=str,
|
1705 |
+
help=f0_method_description,
|
1706 |
+
choices=[
|
1707 |
+
"crepe",
|
1708 |
+
"crepe-tiny",
|
1709 |
+
"rmvpe",
|
1710 |
+
"fcpe",
|
1711 |
+
"hybrid[crepe+rmvpe]",
|
1712 |
+
"hybrid[crepe+fcpe]",
|
1713 |
+
"hybrid[rmvpe+fcpe]",
|
1714 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
1715 |
+
],
|
1716 |
+
default="rmvpe",
|
1717 |
+
)
|
1718 |
+
tts_parser.add_argument(
|
1719 |
+
"--output_tts_path",
|
1720 |
+
type=str,
|
1721 |
+
help="Full path to save the synthesized TTS audio.",
|
1722 |
+
required=True,
|
1723 |
+
)
|
1724 |
+
tts_parser.add_argument(
|
1725 |
+
"--output_rvc_path",
|
1726 |
+
type=str,
|
1727 |
+
help="Full path to save the voice-converted audio using the synthesized TTS.",
|
1728 |
+
required=True,
|
1729 |
+
)
|
1730 |
+
tts_parser.add_argument(
|
1731 |
+
"--pth_path", type=str, help=pth_path_description, required=True
|
1732 |
+
)
|
1733 |
+
tts_parser.add_argument(
|
1734 |
+
"--index_path", type=str, help=index_path_description, required=True
|
1735 |
+
)
|
1736 |
+
tts_parser.add_argument(
|
1737 |
+
"--split_audio",
|
1738 |
+
type=lambda x: bool(strtobool(x)),
|
1739 |
+
choices=[True, False],
|
1740 |
+
help=split_audio_description,
|
1741 |
+
default=False,
|
1742 |
+
)
|
1743 |
+
tts_parser.add_argument(
|
1744 |
+
"--f0_autotune",
|
1745 |
+
type=lambda x: bool(strtobool(x)),
|
1746 |
+
choices=[True, False],
|
1747 |
+
help=f0_autotune_description,
|
1748 |
+
default=False,
|
1749 |
+
)
|
1750 |
+
tts_parser.add_argument(
|
1751 |
+
"--f0_autotune_strength",
|
1752 |
+
type=float,
|
1753 |
+
help=clean_strength_description,
|
1754 |
+
choices=[(i / 10) for i in range(11)],
|
1755 |
+
default=1.0,
|
1756 |
+
)
|
1757 |
+
tts_parser.add_argument(
|
1758 |
+
"--clean_audio",
|
1759 |
+
type=lambda x: bool(strtobool(x)),
|
1760 |
+
choices=[True, False],
|
1761 |
+
help=clean_audio_description,
|
1762 |
+
default=False,
|
1763 |
+
)
|
1764 |
+
tts_parser.add_argument(
|
1765 |
+
"--clean_strength",
|
1766 |
+
type=float,
|
1767 |
+
help=clean_strength_description,
|
1768 |
+
choices=[(i / 10) for i in range(11)],
|
1769 |
+
default=0.7,
|
1770 |
+
)
|
1771 |
+
tts_parser.add_argument(
|
1772 |
+
"--export_format",
|
1773 |
+
type=str,
|
1774 |
+
help=export_format_description,
|
1775 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
1776 |
+
default="WAV",
|
1777 |
+
)
|
1778 |
+
tts_parser.add_argument(
|
1779 |
+
"--embedder_model",
|
1780 |
+
type=str,
|
1781 |
+
help=embedder_model_description,
|
1782 |
+
choices=[
|
1783 |
+
"contentvec",
|
1784 |
+
"chinese-hubert-base",
|
1785 |
+
"japanese-hubert-base",
|
1786 |
+
"korean-hubert-base",
|
1787 |
+
"custom",
|
1788 |
+
],
|
1789 |
+
default="contentvec",
|
1790 |
+
)
|
1791 |
+
tts_parser.add_argument(
|
1792 |
+
"--embedder_model_custom",
|
1793 |
+
type=str,
|
1794 |
+
help=embedder_model_custom_description,
|
1795 |
+
default=None,
|
1796 |
+
)
|
1797 |
+
tts_parser.add_argument(
|
1798 |
+
"--f0_file",
|
1799 |
+
type=str,
|
1800 |
+
help=f0_file_description,
|
1801 |
+
default=None,
|
1802 |
+
)
|
1803 |
+
|
1804 |
+
# Parser for 'preprocess' mode
|
1805 |
+
preprocess_parser = subparsers.add_parser(
|
1806 |
+
"preprocess", help="Preprocess a dataset for training."
|
1807 |
+
)
|
1808 |
+
preprocess_parser.add_argument(
|
1809 |
+
"--model_name", type=str, help="Name of the model to be trained.", required=True
|
1810 |
+
)
|
1811 |
+
preprocess_parser.add_argument(
|
1812 |
+
"--dataset_path", type=str, help="Path to the dataset directory.", required=True
|
1813 |
+
)
|
1814 |
+
preprocess_parser.add_argument(
|
1815 |
+
"--sample_rate",
|
1816 |
+
type=int,
|
1817 |
+
help="Target sampling rate for the audio data.",
|
1818 |
+
choices=[32000, 40000, 44100, 48000],
|
1819 |
+
required=True,
|
1820 |
+
)
|
1821 |
+
preprocess_parser.add_argument(
|
1822 |
+
"--cpu_cores",
|
1823 |
+
type=int,
|
1824 |
+
help="Number of CPU cores to use for preprocessing.",
|
1825 |
+
choices=range(1, 65),
|
1826 |
+
)
|
1827 |
+
preprocess_parser.add_argument(
|
1828 |
+
"--cut_preprocess",
|
1829 |
+
type=str,
|
1830 |
+
choices=["Skip", "Simple", "Automatic"],
|
1831 |
+
help="Cut the dataset into smaller segments for faster preprocessing.",
|
1832 |
+
default="Automatic",
|
1833 |
+
required=True,
|
1834 |
+
)
|
1835 |
+
preprocess_parser.add_argument(
|
1836 |
+
"--process_effects",
|
1837 |
+
type=lambda x: bool(strtobool(x)),
|
1838 |
+
choices=[True, False],
|
1839 |
+
help="Disable all filters during preprocessing.",
|
1840 |
+
default=False,
|
1841 |
+
required=False,
|
1842 |
+
)
|
1843 |
+
preprocess_parser.add_argument(
|
1844 |
+
"--noise_reduction",
|
1845 |
+
type=lambda x: bool(strtobool(x)),
|
1846 |
+
choices=[True, False],
|
1847 |
+
help="Enable noise reduction during preprocessing.",
|
1848 |
+
default=False,
|
1849 |
+
required=False,
|
1850 |
+
)
|
1851 |
+
preprocess_parser.add_argument(
|
1852 |
+
"--noise_reduction_strength",
|
1853 |
+
type=float,
|
1854 |
+
help="Strength of the noise reduction filter.",
|
1855 |
+
choices=[(i / 10) for i in range(11)],
|
1856 |
+
default=0.7,
|
1857 |
+
required=False,
|
1858 |
+
)
|
1859 |
+
preprocess_parser.add_argument(
|
1860 |
+
"--chunk_len",
|
1861 |
+
type=float,
|
1862 |
+
help="Chunk length.",
|
1863 |
+
choices=[i * 0.5 for i in range(1, 11)],
|
1864 |
+
default=3.0,
|
1865 |
+
required=False,
|
1866 |
+
)
|
1867 |
+
preprocess_parser.add_argument(
|
1868 |
+
"--overlap_len",
|
1869 |
+
type=float,
|
1870 |
+
help="Overlap length.",
|
1871 |
+
choices=[0.0, 0.1, 0.2, 0.3, 0.4],
|
1872 |
+
default=0.3,
|
1873 |
+
required=False,
|
1874 |
+
)
|
1875 |
+
|
1876 |
+
# Parser for 'extract' mode
|
1877 |
+
extract_parser = subparsers.add_parser(
|
1878 |
+
"extract", help="Extract features from a dataset."
|
1879 |
+
)
|
1880 |
+
extract_parser.add_argument(
|
1881 |
+
"--model_name", type=str, help="Name of the model.", required=True
|
1882 |
+
)
|
1883 |
+
extract_parser.add_argument(
|
1884 |
+
"--f0_method",
|
1885 |
+
type=str,
|
1886 |
+
help="Pitch extraction method to use.",
|
1887 |
+
choices=[
|
1888 |
+
"crepe",
|
1889 |
+
"crepe-tiny",
|
1890 |
+
"rmvpe",
|
1891 |
+
],
|
1892 |
+
default="rmvpe",
|
1893 |
+
)
|
1894 |
+
extract_parser.add_argument(
|
1895 |
+
"--hop_length",
|
1896 |
+
type=int,
|
1897 |
+
help="Hop length for feature extraction. Only applicable for Crepe pitch extraction.",
|
1898 |
+
choices=range(1, 513),
|
1899 |
+
default=128,
|
1900 |
+
)
|
1901 |
+
extract_parser.add_argument(
|
1902 |
+
"--cpu_cores",
|
1903 |
+
type=int,
|
1904 |
+
help="Number of CPU cores to use for feature extraction (optional).",
|
1905 |
+
choices=range(1, 65),
|
1906 |
+
default=None,
|
1907 |
+
)
|
1908 |
+
extract_parser.add_argument(
|
1909 |
+
"--gpu",
|
1910 |
+
type=str,
|
1911 |
+
help="GPU device to use for feature extraction (optional).",
|
1912 |
+
default="-",
|
1913 |
+
)
|
1914 |
+
extract_parser.add_argument(
|
1915 |
+
"--sample_rate",
|
1916 |
+
type=int,
|
1917 |
+
help="Target sampling rate for the audio data.",
|
1918 |
+
choices=[32000, 40000, 44100, 48000],
|
1919 |
+
required=True,
|
1920 |
+
)
|
1921 |
+
extract_parser.add_argument(
|
1922 |
+
"--embedder_model",
|
1923 |
+
type=str,
|
1924 |
+
help=embedder_model_description,
|
1925 |
+
choices=[
|
1926 |
+
"contentvec",
|
1927 |
+
"chinese-hubert-base",
|
1928 |
+
"japanese-hubert-base",
|
1929 |
+
"korean-hubert-base",
|
1930 |
+
"custom",
|
1931 |
+
],
|
1932 |
+
default="contentvec",
|
1933 |
+
)
|
1934 |
+
extract_parser.add_argument(
|
1935 |
+
"--embedder_model_custom",
|
1936 |
+
type=str,
|
1937 |
+
help=embedder_model_custom_description,
|
1938 |
+
default=None,
|
1939 |
+
)
|
1940 |
+
extract_parser.add_argument(
|
1941 |
+
"--include_mutes",
|
1942 |
+
type=int,
|
1943 |
+
help="Number of silent files to include.",
|
1944 |
+
choices=range(0, 11),
|
1945 |
+
default=2,
|
1946 |
+
required=True,
|
1947 |
+
)
|
1948 |
+
|
1949 |
+
# Parser for 'train' mode
|
1950 |
+
train_parser = subparsers.add_parser("train", help="Train an RVC model.")
|
1951 |
+
train_parser.add_argument(
|
1952 |
+
"--model_name", type=str, help="Name of the model to be trained.", required=True
|
1953 |
+
)
|
1954 |
+
train_parser.add_argument(
|
1955 |
+
"--vocoder",
|
1956 |
+
type=str,
|
1957 |
+
help="Vocoder name",
|
1958 |
+
choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
|
1959 |
+
default="HiFi-GAN",
|
1960 |
+
)
|
1961 |
+
train_parser.add_argument(
|
1962 |
+
"--checkpointing",
|
1963 |
+
type=lambda x: bool(strtobool(x)),
|
1964 |
+
choices=[True, False],
|
1965 |
+
help="Enables memory-efficient training.",
|
1966 |
+
default=False,
|
1967 |
+
required=False,
|
1968 |
+
)
|
1969 |
+
train_parser.add_argument(
|
1970 |
+
"--save_every_epoch",
|
1971 |
+
type=int,
|
1972 |
+
help="Save the model every specified number of epochs.",
|
1973 |
+
choices=range(1, 101),
|
1974 |
+
required=True,
|
1975 |
+
)
|
1976 |
+
train_parser.add_argument(
|
1977 |
+
"--save_only_latest",
|
1978 |
+
type=lambda x: bool(strtobool(x)),
|
1979 |
+
choices=[True, False],
|
1980 |
+
help="Save only the latest model checkpoint.",
|
1981 |
+
default=False,
|
1982 |
+
)
|
1983 |
+
train_parser.add_argument(
|
1984 |
+
"--save_every_weights",
|
1985 |
+
type=lambda x: bool(strtobool(x)),
|
1986 |
+
choices=[True, False],
|
1987 |
+
help="Save model weights every epoch.",
|
1988 |
+
default=True,
|
1989 |
+
)
|
1990 |
+
train_parser.add_argument(
|
1991 |
+
"--total_epoch",
|
1992 |
+
type=int,
|
1993 |
+
help="Total number of epochs to train for.",
|
1994 |
+
choices=range(1, 10001),
|
1995 |
+
default=1000,
|
1996 |
+
)
|
1997 |
+
train_parser.add_argument(
|
1998 |
+
"--sample_rate",
|
1999 |
+
type=int,
|
2000 |
+
help="Sampling rate of the training data.",
|
2001 |
+
choices=[32000, 40000, 48000],
|
2002 |
+
required=True,
|
2003 |
+
)
|
2004 |
+
train_parser.add_argument(
|
2005 |
+
"--batch_size",
|
2006 |
+
type=int,
|
2007 |
+
help="Batch size for training.",
|
2008 |
+
choices=range(1, 51),
|
2009 |
+
default=8,
|
2010 |
+
)
|
2011 |
+
train_parser.add_argument(
|
2012 |
+
"--gpu",
|
2013 |
+
type=str,
|
2014 |
+
help="GPU device to use for training (e.g., '0').",
|
2015 |
+
default="0",
|
2016 |
+
)
|
2017 |
+
train_parser.add_argument(
|
2018 |
+
"--pretrained",
|
2019 |
+
type=lambda x: bool(strtobool(x)),
|
2020 |
+
choices=[True, False],
|
2021 |
+
help="Use a pretrained model for initialization.",
|
2022 |
+
default=True,
|
2023 |
+
)
|
2024 |
+
train_parser.add_argument(
|
2025 |
+
"--custom_pretrained",
|
2026 |
+
type=lambda x: bool(strtobool(x)),
|
2027 |
+
choices=[True, False],
|
2028 |
+
help="Use a custom pretrained model.",
|
2029 |
+
default=False,
|
2030 |
+
)
|
2031 |
+
train_parser.add_argument(
|
2032 |
+
"--g_pretrained_path",
|
2033 |
+
type=str,
|
2034 |
+
nargs="?",
|
2035 |
+
default=None,
|
2036 |
+
help="Path to the pretrained generator model file.",
|
2037 |
+
)
|
2038 |
+
train_parser.add_argument(
|
2039 |
+
"--d_pretrained_path",
|
2040 |
+
type=str,
|
2041 |
+
nargs="?",
|
2042 |
+
default=None,
|
2043 |
+
help="Path to the pretrained discriminator model file.",
|
2044 |
+
)
|
2045 |
+
train_parser.add_argument(
|
2046 |
+
"--overtraining_detector",
|
2047 |
+
type=lambda x: bool(strtobool(x)),
|
2048 |
+
choices=[True, False],
|
2049 |
+
help="Enable overtraining detection.",
|
2050 |
+
default=False,
|
2051 |
+
)
|
2052 |
+
train_parser.add_argument(
|
2053 |
+
"--overtraining_threshold",
|
2054 |
+
type=int,
|
2055 |
+
help="Threshold for overtraining detection.",
|
2056 |
+
choices=range(1, 101),
|
2057 |
+
default=50,
|
2058 |
+
)
|
2059 |
+
train_parser.add_argument(
|
2060 |
+
"--cleanup",
|
2061 |
+
type=lambda x: bool(strtobool(x)),
|
2062 |
+
choices=[True, False],
|
2063 |
+
help="Cleanup previous training attempt.",
|
2064 |
+
default=False,
|
2065 |
+
)
|
2066 |
+
train_parser.add_argument(
|
2067 |
+
"--cache_data_in_gpu",
|
2068 |
+
type=lambda x: bool(strtobool(x)),
|
2069 |
+
choices=[True, False],
|
2070 |
+
help="Cache training data in GPU memory.",
|
2071 |
+
default=False,
|
2072 |
+
)
|
2073 |
+
train_parser.add_argument(
|
2074 |
+
"--index_algorithm",
|
2075 |
+
type=str,
|
2076 |
+
choices=["Auto", "Faiss", "KMeans"],
|
2077 |
+
help="Choose the method for generating the index file.",
|
2078 |
+
default="Auto",
|
2079 |
+
required=False,
|
2080 |
+
)
|
2081 |
+
|
2082 |
+
# Parser for 'index' mode
|
2083 |
+
index_parser = subparsers.add_parser(
|
2084 |
+
"index", help="Generate an index file for an RVC model."
|
2085 |
+
)
|
2086 |
+
index_parser.add_argument(
|
2087 |
+
"--model_name", type=str, help="Name of the model.", required=True
|
2088 |
+
)
|
2089 |
+
index_parser.add_argument(
|
2090 |
+
"--index_algorithm",
|
2091 |
+
type=str,
|
2092 |
+
choices=["Auto", "Faiss", "KMeans"],
|
2093 |
+
help="Choose the method for generating the index file.",
|
2094 |
+
default="Auto",
|
2095 |
+
required=False,
|
2096 |
+
)
|
2097 |
+
|
2098 |
+
# Parser for 'model_information' mode
|
2099 |
+
model_information_parser = subparsers.add_parser(
|
2100 |
+
"model_information", help="Display information about a trained model."
|
2101 |
+
)
|
2102 |
+
model_information_parser.add_argument(
|
2103 |
+
"--pth_path", type=str, help="Path to the .pth model file.", required=True
|
2104 |
+
)
|
2105 |
+
|
2106 |
+
# Parser for 'model_blender' mode
|
2107 |
+
model_blender_parser = subparsers.add_parser(
|
2108 |
+
"model_blender", help="Fuse two RVC models together."
|
2109 |
+
)
|
2110 |
+
model_blender_parser.add_argument(
|
2111 |
+
"--model_name", type=str, help="Name of the new fused model.", required=True
|
2112 |
+
)
|
2113 |
+
model_blender_parser.add_argument(
|
2114 |
+
"--pth_path_1",
|
2115 |
+
type=str,
|
2116 |
+
help="Path to the first .pth model file.",
|
2117 |
+
required=True,
|
2118 |
+
)
|
2119 |
+
model_blender_parser.add_argument(
|
2120 |
+
"--pth_path_2",
|
2121 |
+
type=str,
|
2122 |
+
help="Path to the second .pth model file.",
|
2123 |
+
required=True,
|
2124 |
+
)
|
2125 |
+
model_blender_parser.add_argument(
|
2126 |
+
"--ratio",
|
2127 |
+
type=float,
|
2128 |
+
help="Ratio for blending the two models (0.0 to 1.0).",
|
2129 |
+
choices=[(i / 10) for i in range(11)],
|
2130 |
+
default=0.5,
|
2131 |
+
)
|
2132 |
+
|
2133 |
+
# Parser for 'tensorboard' mode
|
2134 |
+
subparsers.add_parser(
|
2135 |
+
"tensorboard", help="Launch TensorBoard for monitoring training progress."
|
2136 |
+
)
|
2137 |
+
|
2138 |
+
# Parser for 'download' mode
|
2139 |
+
download_parser = subparsers.add_parser(
|
2140 |
+
"download", help="Download a model from a provided link."
|
2141 |
+
)
|
2142 |
+
download_parser.add_argument(
|
2143 |
+
"--model_link", type=str, help="Direct link to the model file.", required=True
|
2144 |
+
)
|
2145 |
+
|
2146 |
+
# Parser for 'prerequisites' mode
|
2147 |
+
prerequisites_parser = subparsers.add_parser(
|
2148 |
+
"prerequisites", help="Install prerequisites for RVC."
|
2149 |
+
)
|
2150 |
+
prerequisites_parser.add_argument(
|
2151 |
+
"--pretraineds_hifigan",
|
2152 |
+
type=lambda x: bool(strtobool(x)),
|
2153 |
+
choices=[True, False],
|
2154 |
+
default=True,
|
2155 |
+
help="Download pretrained models for RVC v2.",
|
2156 |
+
)
|
2157 |
+
prerequisites_parser.add_argument(
|
2158 |
+
"--models",
|
2159 |
+
type=lambda x: bool(strtobool(x)),
|
2160 |
+
choices=[True, False],
|
2161 |
+
default=True,
|
2162 |
+
help="Download additional models.",
|
2163 |
+
)
|
2164 |
+
prerequisites_parser.add_argument(
|
2165 |
+
"--exe",
|
2166 |
+
type=lambda x: bool(strtobool(x)),
|
2167 |
+
choices=[True, False],
|
2168 |
+
default=True,
|
2169 |
+
help="Download required executables.",
|
2170 |
+
)
|
2171 |
+
|
2172 |
+
# Parser for 'audio_analyzer' mode
|
2173 |
+
audio_analyzer = subparsers.add_parser(
|
2174 |
+
"audio_analyzer", help="Analyze an audio file."
|
2175 |
+
)
|
2176 |
+
audio_analyzer.add_argument(
|
2177 |
+
"--input_path", type=str, help="Path to the input audio file.", required=True
|
2178 |
+
)
|
2179 |
+
|
2180 |
+
return parser.parse_args()
|
2181 |
+
|
2182 |
+
|
2183 |
+
def main():
    """Command-line entry point: parse the arguments and run the chosen mode.

    The sub-command selected on the command line is available as
    ``args.mode``; each recognised mode forwards its parsed options, as
    keyword arguments, to the matching ``run_*_script`` function.

    Exits with status 1 when the script is started without any argument,
    and also when the selected command raises an exception, so that shells
    and other callers can detect the failure from the exit status.
    """
    if len(sys.argv) == 1:
        print("Please run the script with '-h' for more information.")
        sys.exit(1)

    args = parse_arguments()

    try:
        if args.mode == "infer":
            run_infer_script(
                pitch=args.pitch,
                filter_radius=args.filter_radius,
                index_rate=args.index_rate,
                volume_envelope=args.volume_envelope,
                protect=args.protect,
                hop_length=args.hop_length,
                f0_method=args.f0_method,
                input_path=args.input_path,
                output_path=args.output_path,
                pth_path=args.pth_path,
                index_path=args.index_path,
                split_audio=args.split_audio,
                f0_autotune=args.f0_autotune,
                f0_autotune_strength=args.f0_autotune_strength,
                clean_audio=args.clean_audio,
                clean_strength=args.clean_strength,
                export_format=args.export_format,
                embedder_model=args.embedder_model,
                embedder_model_custom=args.embedder_model_custom,
                f0_file=args.f0_file,
                formant_shifting=args.formant_shifting,
                formant_qfrency=args.formant_qfrency,
                formant_timbre=args.formant_timbre,
                sid=args.sid,
                post_process=args.post_process,
                reverb=args.reverb,
                pitch_shift=args.pitch_shift,
                limiter=args.limiter,
                gain=args.gain,
                distortion=args.distortion,
                chorus=args.chorus,
                bitcrush=args.bitcrush,
                clipping=args.clipping,
                compressor=args.compressor,
                delay=args.delay,
                reverb_room_size=args.reverb_room_size,
                reverb_damping=args.reverb_damping,
                reverb_wet_gain=args.reverb_wet_gain,
                reverb_dry_gain=args.reverb_dry_gain,
                reverb_width=args.reverb_width,
                reverb_freeze_mode=args.reverb_freeze_mode,
                pitch_shift_semitones=args.pitch_shift_semitones,
                limiter_threshold=args.limiter_threshold,
                limiter_release_time=args.limiter_release_time,
                gain_db=args.gain_db,
                distortion_gain=args.distortion_gain,
                chorus_rate=args.chorus_rate,
                chorus_depth=args.chorus_depth,
                chorus_center_delay=args.chorus_center_delay,
                chorus_feedback=args.chorus_feedback,
                chorus_mix=args.chorus_mix,
                bitcrush_bit_depth=args.bitcrush_bit_depth,
                clipping_threshold=args.clipping_threshold,
                compressor_threshold=args.compressor_threshold,
                compressor_ratio=args.compressor_ratio,
                compressor_attack=args.compressor_attack,
                compressor_release=args.compressor_release,
                delay_seconds=args.delay_seconds,
                delay_feedback=args.delay_feedback,
                delay_mix=args.delay_mix,
            )
        elif args.mode == "batch_infer":
            run_batch_infer_script(
                pitch=args.pitch,
                filter_radius=args.filter_radius,
                index_rate=args.index_rate,
                volume_envelope=args.volume_envelope,
                protect=args.protect,
                hop_length=args.hop_length,
                f0_method=args.f0_method,
                input_folder=args.input_folder,
                output_folder=args.output_folder,
                pth_path=args.pth_path,
                index_path=args.index_path,
                split_audio=args.split_audio,
                f0_autotune=args.f0_autotune,
                f0_autotune_strength=args.f0_autotune_strength,
                clean_audio=args.clean_audio,
                clean_strength=args.clean_strength,
                export_format=args.export_format,
                embedder_model=args.embedder_model,
                embedder_model_custom=args.embedder_model_custom,
                f0_file=args.f0_file,
                formant_shifting=args.formant_shifting,
                formant_qfrency=args.formant_qfrency,
                formant_timbre=args.formant_timbre,
                sid=args.sid,
                post_process=args.post_process,
                reverb=args.reverb,
                pitch_shift=args.pitch_shift,
                limiter=args.limiter,
                gain=args.gain,
                distortion=args.distortion,
                chorus=args.chorus,
                bitcrush=args.bitcrush,
                clipping=args.clipping,
                compressor=args.compressor,
                delay=args.delay,
                reverb_room_size=args.reverb_room_size,
                reverb_damping=args.reverb_damping,
                reverb_wet_gain=args.reverb_wet_gain,
                reverb_dry_gain=args.reverb_dry_gain,
                reverb_width=args.reverb_width,
                reverb_freeze_mode=args.reverb_freeze_mode,
                pitch_shift_semitones=args.pitch_shift_semitones,
                limiter_threshold=args.limiter_threshold,
                limiter_release_time=args.limiter_release_time,
                gain_db=args.gain_db,
                distortion_gain=args.distortion_gain,
                chorus_rate=args.chorus_rate,
                chorus_depth=args.chorus_depth,
                chorus_center_delay=args.chorus_center_delay,
                chorus_feedback=args.chorus_feedback,
                chorus_mix=args.chorus_mix,
                bitcrush_bit_depth=args.bitcrush_bit_depth,
                clipping_threshold=args.clipping_threshold,
                compressor_threshold=args.compressor_threshold,
                compressor_ratio=args.compressor_ratio,
                compressor_attack=args.compressor_attack,
                compressor_release=args.compressor_release,
                delay_seconds=args.delay_seconds,
                delay_feedback=args.delay_feedback,
                delay_mix=args.delay_mix,
            )
        elif args.mode == "tts":
            run_tts_script(
                tts_file=args.tts_file,
                tts_text=args.tts_text,
                tts_voice=args.tts_voice,
                tts_rate=args.tts_rate,
                pitch=args.pitch,
                filter_radius=args.filter_radius,
                index_rate=args.index_rate,
                volume_envelope=args.volume_envelope,
                protect=args.protect,
                hop_length=args.hop_length,
                f0_method=args.f0_method,
                output_tts_path=args.output_tts_path,
                output_rvc_path=args.output_rvc_path,
                pth_path=args.pth_path,
                index_path=args.index_path,
                split_audio=args.split_audio,
                f0_autotune=args.f0_autotune,
                f0_autotune_strength=args.f0_autotune_strength,
                clean_audio=args.clean_audio,
                clean_strength=args.clean_strength,
                export_format=args.export_format,
                embedder_model=args.embedder_model,
                embedder_model_custom=args.embedder_model_custom,
                f0_file=args.f0_file,
            )
        elif args.mode == "preprocess":
            run_preprocess_script(
                model_name=args.model_name,
                dataset_path=args.dataset_path,
                sample_rate=args.sample_rate,
                cpu_cores=args.cpu_cores,
                cut_preprocess=args.cut_preprocess,
                process_effects=args.process_effects,
                noise_reduction=args.noise_reduction,
                # The CLI flag is --noise_reduction_strength, but the
                # preprocess function receives it as ``clean_strength``.
                clean_strength=args.noise_reduction_strength,
                chunk_len=args.chunk_len,
                overlap_len=args.overlap_len,
            )
        elif args.mode == "extract":
            run_extract_script(
                model_name=args.model_name,
                f0_method=args.f0_method,
                hop_length=args.hop_length,
                cpu_cores=args.cpu_cores,
                gpu=args.gpu,
                sample_rate=args.sample_rate,
                embedder_model=args.embedder_model,
                embedder_model_custom=args.embedder_model_custom,
                include_mutes=args.include_mutes,
            )
        elif args.mode == "train":
            run_train_script(
                model_name=args.model_name,
                save_every_epoch=args.save_every_epoch,
                save_only_latest=args.save_only_latest,
                save_every_weights=args.save_every_weights,
                total_epoch=args.total_epoch,
                sample_rate=args.sample_rate,
                batch_size=args.batch_size,
                gpu=args.gpu,
                overtraining_detector=args.overtraining_detector,
                overtraining_threshold=args.overtraining_threshold,
                pretrained=args.pretrained,
                custom_pretrained=args.custom_pretrained,
                cleanup=args.cleanup,
                index_algorithm=args.index_algorithm,
                cache_data_in_gpu=args.cache_data_in_gpu,
                g_pretrained_path=args.g_pretrained_path,
                d_pretrained_path=args.d_pretrained_path,
                vocoder=args.vocoder,
                checkpointing=args.checkpointing,
            )
        elif args.mode == "index":
            run_index_script(
                model_name=args.model_name,
                index_algorithm=args.index_algorithm,
            )
        elif args.mode == "model_information":
            run_model_information_script(
                pth_path=args.pth_path,
            )
        elif args.mode == "model_blender":
            run_model_blender_script(
                model_name=args.model_name,
                pth_path_1=args.pth_path_1,
                pth_path_2=args.pth_path_2,
                ratio=args.ratio,
            )
        elif args.mode == "tensorboard":
            run_tensorboard_script()
        elif args.mode == "download":
            run_download_script(
                model_link=args.model_link,
            )
        elif args.mode == "prerequisites":
            run_prerequisites_script(
                pretraineds_hifigan=args.pretraineds_hifigan,
                models=args.models,
                exe=args.exe,
            )
        elif args.mode == "audio_analyzer":
            run_audio_analyzer_script(
                input_path=args.input_path,
            )
    except Exception as error:
        print(f"An error occurred during execution: {error}")

        import traceback

        traceback.print_exc()

        # Report the failure through the exit status as well; returning
        # normally here would make a failed command look successful (0).
        sys.exit(1)
|
2429 |
+
|
2430 |
+
|
2431 |
+
# Run the command-line interface only when this file is executed as a
# script; importing it as a module must not start the CLI.
if __name__ == "__main__":
    main()
|