AnhP committed
Commit 30e8419 · verified · 1 parent: b6bee41

Delete main

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. main/app/app.py +0 -87
  2. main/app/core/downloads.py +0 -187
  3. main/app/core/editing.py +0 -96
  4. main/app/core/f0_extract.py +0 -54
  5. main/app/core/inference.py +0 -387
  6. main/app/core/model_utils.py +0 -162
  7. main/app/core/presets.py +0 -165
  8. main/app/core/process.py +0 -134
  9. main/app/core/restart.py +0 -48
  10. main/app/core/separate.py +0 -35
  11. main/app/core/training.py +0 -219
  12. main/app/core/tts.py +0 -99
  13. main/app/core/ui.py +0 -179
  14. main/app/core/utils.py +0 -97
  15. main/app/parser.py +0 -319
  16. main/app/run_tensorboard.py +0 -33
  17. main/app/tabs/downloads/downloads.py +0 -119
  18. main/app/tabs/editing/child/audio_effects.py +0 -393
  19. main/app/tabs/editing/child/quirk.py +0 -48
  20. main/app/tabs/editing/editing.py +0 -20
  21. main/app/tabs/extra/child/convert_model.py +0 -31
  22. main/app/tabs/extra/child/f0_extract.py +0 -51
  23. main/app/tabs/extra/child/fushion.py +0 -45
  24. main/app/tabs/extra/child/read_model.py +0 -29
  25. main/app/tabs/extra/child/report_bugs.py +0 -24
  26. main/app/tabs/extra/child/settings.py +0 -61
  27. main/app/tabs/extra/extra.py +0 -40
  28. main/app/tabs/inference/child/convert.py +0 -313
  29. main/app/tabs/inference/child/convert_tts.py +0 -171
  30. main/app/tabs/inference/child/convert_with_whisper.py +0 -160
  31. main/app/tabs/inference/child/separate.py +0 -108
  32. main/app/tabs/inference/inference.py +0 -30
  33. main/app/tabs/training/child/create_dataset.py +0 -71
  34. main/app/tabs/training/child/training.py +0 -237
  35. main/app/tabs/training/training.py +0 -20
  36. main/app/variables.py +0 -106
  37. main/configs/config.json +0 -584
  38. main/configs/config.py +0 -101
  39. main/configs/v1/32000.json +0 -46
  40. main/configs/v1/40000.json +0 -46
  41. main/configs/v1/48000.json +0 -46
  42. main/configs/v2/32000.json +0 -42
  43. main/configs/v2/40000.json +0 -42
  44. main/configs/v2/48000.json +0 -42
  45. main/inference/audio_effects.py +0 -185
  46. main/inference/conversion/convert.py +0 -300
  47. main/inference/conversion/pipeline.py +0 -251
  48. main/inference/conversion/utils.py +0 -66
  49. main/inference/create_dataset.py +0 -212
  50. main/inference/create_index.py +0 -73
main/app/app.py DELETED
@@ -1,87 +0,0 @@
- import os
- import io
- import ssl
- import sys
- import time
- import codecs
- import logging
- import warnings
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
- start_time = time.time()
-
- from main.app.tabs.extra.extra import extra_tab
- from main.app.tabs.editing.editing import editing_tab
- from main.app.tabs.training.training import training_tab
- from main.app.tabs.downloads.downloads import download_tab
- from main.app.tabs.inference.inference import inference_tab
- from main.app.variables import logger, config, translations, theme, font, configs, language, allow_disk
-
- ssl._create_default_https_context = ssl._create_unverified_context
-
- warnings.filterwarnings("ignore")
- for l in ["httpx", "gradio", "uvicorn", "httpcore", "urllib3"]:
-     logging.getLogger(l).setLevel(logging.ERROR)
-
- with gr.Blocks(title="📱 Vietnamese-RVC GUI BY ANH", theme=theme, css="<style> @import url('{fonts}'); * {{font-family: 'Courgette', cursive !important;}} body, html {{font-family: 'Courgette', cursive !important;}} h1, h2, h3, h4, h5, h6, p, button, input, textarea, label, span, div, select {{font-family: 'Courgette', cursive !important;}} </style>".format(fonts=font or "https://fonts.googleapis.com/css2?family=Courgette&display=swap")) as app:
-     gr.HTML("<h1 style='text-align: center;'>🎵VIETNAMESE RVC BY ANH🎵</h1>")
-     gr.HTML(f"<h3 style='text-align: center;'>{translations['title']}</h3>")
-
-     with gr.Tabs():
-         inference_tab()
-         editing_tab()
-         training_tab()
-         download_tab()
-         extra_tab(app)
-
-     with gr.Row():
-         gr.Markdown(translations["rick_roll"].format(rickroll=codecs.decode('uggcf://jjj.lbhghor.pbz/jngpu?i=qDj4j9JtKpD', 'rot13')))
-
-     with gr.Row():
-         gr.Markdown(translations["terms_of_use"])
-
-     with gr.Row():
-         gr.Markdown(translations["exemption"])
-
- logger.info(config.device)
- logger.info(translations["start_app"])
- logger.info(translations["set_lang"].format(lang=language))
-
- port = configs.get("app_port", 7860)
- server_name = configs.get("server_name", "0.0.0.0")
- share = "--share" in sys.argv
-
- original_stdout = sys.stdout
- sys.stdout = io.StringIO()
-
- for i in range(configs.get("num_of_restart", 5)):
-     try:
-         _, _, share_url = app.queue().launch(
-             favicon_path=configs["ico_path"],
-             server_name=server_name,
-             server_port=port,
-             show_error=configs.get("app_show_error", False),
-             inbrowser="--open" in sys.argv,
-             share=share,
-             allowed_paths=allow_disk,
-             prevent_thread_lock=True,
-             quiet=True
-         )
-         break
-     except OSError:
-         logger.debug(translations["port"].format(port=port))
-         port -= 1
-     except Exception as e:
-         logger.error(translations["error_occurred"].format(e=e))
-         sys.exit(1)
-
- sys.stdout = original_stdout
- logger.info(f"{translations['running_local_url']}: {server_name}:{port}")
-
- if share: logger.info(f"{translations['running_share_url']}: {share_url}")
- logger.info(f"{translations['gradio_start']}: {(time.time() - start_time):.2f}s")
-
- while 1:
-     time.sleep(5)
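
For reference, the port-fallback pattern used in the launch loop above, reduced to a standalone sketch (hypothetical helper name; assumes only the gradio package — not the deleted app.py itself):

    import gradio as gr

    def launch_with_fallback(app: gr.Blocks, port: int = 7860, attempts: int = 5):
        # Try the requested port first; on OSError (port already in use), step down.
        for _ in range(attempts):
            try:
                # prevent_thread_lock=True hands control back to the caller;
                # launch() returns (app, local_url, share_url).
                return app.queue().launch(server_port=port, prevent_thread_lock=True, quiet=True)
            except OSError:
                port -= 1
        raise RuntimeError("no free port found")
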
main/app/core/downloads.py DELETED
@@ -1,187 +0,0 @@
- import os
- import re
- import sys
- import json
- import codecs
- import shutil
- import yt_dlp
- import warnings
- import requests
-
- from bs4 import BeautifulSoup
-
- sys.path.append(os.getcwd())
-
- from main.tools import huggingface, gdown, meganz, mediafire, pixeldrain
- from main.app.core.ui import gr_info, gr_warning, gr_error, process_output
- from main.app.variables import logger, translations, model_options, configs
- from main.app.core.process import move_files_from_directory, fetch_pretrained_data, extract_name_model
-
- def download_url(url):
-     if not url: return gr_warning(translations["provide_url"])
-     if not os.path.exists(configs["audios_path"]): os.makedirs(configs["audios_path"], exist_ok=True)
-
-     with warnings.catch_warnings():
-         warnings.filterwarnings("ignore")
-         ydl_opts = {
-             "format": "bestaudio/best",
-             "postprocessors": [{
-                 "key": "FFmpegExtractAudio",
-                 "preferredcodec": "wav",
-                 "preferredquality": "192"
-             }],
-             "quiet": True,
-             "no_warnings": True,
-             "noplaylist": True,
-             "verbose": False
-         }
-
-         gr_info(translations["start"].format(start=translations["download_music"]))
-
-         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-             audio_output = os.path.join(configs["audios_path"], re.sub(r'\s+', '-', re.sub(r'[^\w\s\u4e00-\u9fff\uac00-\ud7af\u0400-\u04FF\u1100-\u11FF]', '', ydl.extract_info(url, download=False).get('title', 'video')).strip()))
-             if os.path.exists(audio_output): shutil.rmtree(audio_output, ignore_errors=True)
-
-         ydl_opts['outtmpl'] = audio_output
-
-         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-             audio_output = process_output(audio_output + ".wav")
-             ydl.download([url])
-
-     gr_info(translations["success"])
-     return [audio_output, audio_output, translations["success"]]
-
- def move_file(file, download_dir, model):
-     weights_dir = configs["weights_path"]
-     logs_dir = configs["logs_path"]
-
-     if not os.path.exists(weights_dir): os.makedirs(weights_dir, exist_ok=True)
-     if not os.path.exists(logs_dir): os.makedirs(logs_dir, exist_ok=True)
-
-     if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
-     move_files_from_directory(download_dir, weights_dir, logs_dir, model)
-
- def download_model(url=None, model=None):
-     if not url: return gr_warning(translations["provide_url"])
-
-     url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip()
-     download_dir = "download_model"
-
-     os.makedirs(download_dir, exist_ok=True)
-
-     try:
-         gr_info(translations["start"].format(start=translations["download"]))
-
-         if "huggingface.co" in url: file = huggingface.HF_download_file(url, download_dir)
-         elif "google.com" in url: file = gdown.gdown_download(url, download_dir)
-         elif "mediafire.com" in url: file = mediafire.Mediafire_Download(url, download_dir)
-         elif "pixeldrain.com" in url: file = pixeldrain.pixeldrain(url, download_dir)
-         elif "mega.nz" in url: file = meganz.mega_download_url(url, download_dir)
-         else:
-             gr_warning(translations["not_support_url"])
-             return translations["not_support_url"]
-
-         if not model:
-             modelname = os.path.basename(file)
-             model = extract_name_model(modelname) if modelname.endswith(".index") else os.path.splitext(modelname)[0]
-             if model is None: model = os.path.splitext(modelname)[0]
-
-         model = model.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", "").replace(" ", "_").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("{", "").replace("}", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip()
-
-         move_file(file, download_dir, model)
-         gr_info(translations["success"])
-
-         return translations["success"]
-     except Exception as e:
-         gr_error(message=translations["error_occurred"].format(e=e))
-         return translations["error_occurred"].format(e=e)
-     finally:
-         shutil.rmtree(download_dir, ignore_errors=True)
-
- def download_pretrained_model(choices, model, sample_rate):
-     pretraineds_custom_path = configs["pretrained_custom_path"]
-
-     if choices == translations["list_model"]:
-         paths = fetch_pretrained_data()[model][sample_rate]
-
-         if not os.path.exists(pretraineds_custom_path): os.makedirs(pretraineds_custom_path, exist_ok=True)
-         url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_phfgbz/", "rot13") + paths
-
-         gr_info(translations["download_pretrain"])
-         file = huggingface.HF_download_file(url.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), os.path.join(pretraineds_custom_path, paths))
-
-         if file.endswith(".zip"):
-             shutil.unpack_archive(file, pretraineds_custom_path)
-             os.remove(file)
-
-         gr_info(translations["success"])
-         return translations["success"], None
-     elif choices == translations["download_url"]:
-         if not model: return gr_warning(translations["provide_pretrain"].format(dg="D"))
-         if not sample_rate: return gr_warning(translations["provide_pretrain"].format(dg="G"))
-
-         gr_info(translations["download_pretrain"])
-
-         for url in [model, sample_rate]:
-             url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip()
-
-             if "huggingface.co" in url: huggingface.HF_download_file(url, pretraineds_custom_path)
-             elif "google.com" in url: gdown.gdown_download(url, pretraineds_custom_path)
-             elif "mediafire.com" in url: mediafire.Mediafire_Download(url, pretraineds_custom_path)
-             elif "pixeldrain.com" in url: pixeldrain.pixeldrain(url, pretraineds_custom_path)
-             elif "mega.nz" in url: meganz.mega_download_url(url, pretraineds_custom_path)
-             else:
-                 gr_warning(translations["not_support_url"])
-                 return translations["not_support_url"], translations["not_support_url"]
-
-         gr_info(translations["success"])
-         return translations["success"], translations["success"]
-
- def fetch_models_data(search):
-     all_table_data = []
-     page = 1
-
-     while 1:
-         try:
-             response = requests.post(url=codecs.decode("uggcf://ibvpr-zbqryf.pbz/srgpu_qngn.cuc", "rot13"), data={"page": page, "search": search})
-
-             if response.status_code == 200:
-                 table_data = response.json().get("table", "")
-                 if not table_data.strip(): break
-
-                 all_table_data.append(table_data)
-                 page += 1
-             else:
-                 logger.debug(f"{translations['code_error']} {response.status_code}")
-                 break
-         except json.JSONDecodeError:
-             logger.debug(translations["json_error"])
-             break
-         except requests.RequestException as e:
-             logger.debug(translations["requests_error"].format(e=e))
-             break
-
-     return all_table_data
-
- def search_models(name):
-     if not name: return gr_warning(translations["provide_name"])
-     gr_info(translations["start"].format(start=translations["search"]))
-
-     tables = fetch_models_data(name)
-
-     if len(tables) == 0:
-         gr_info(translations["not_found"].format(name=name))
-         return [None]*2
-     else:
-         model_options.clear()
-
-         for table in tables:
-             for row in BeautifulSoup(table, "html.parser").select("tr"):
-                 name_tag, url_tag = row.find("a", {"class": "fs-5"}), row.find("a", {"class": "btn btn-sm fw-bold btn-light ms-0 p-1 ps-2 pe-2"})
-                 url = url_tag["href"].replace("https://easyaivoice.com/run?url=", "")
-                 if "huggingface" in url:
-                     if name_tag and url_tag: model_options[name_tag.text.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", "").replace(" ", "_").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "_").replace("-_-", "_").replace("_-_", "_").replace("-", "_").replace("---", "_").replace("___", "_").strip()] = url
-
-         gr_info(translations["found"].format(results=len(model_options)))
-         return [{"value": "", "choices": model_options, "interactive": True, "visible": True, "__type__": "update"}, {"value": translations["downloads"], "visible": True, "__type__": "update"}]
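
The download_url flow above probes metadata first (extract_info with download=False, to derive a safe filename from the title) and then downloads with an FFmpeg post-processor that converts to WAV. A minimal sketch of that yt_dlp pattern (placeholder URL and output name; requires FFmpeg on PATH):

    import yt_dlp

    opts = {
        "format": "bestaudio/best",
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
        "outtmpl": "my_audio",  # yt-dlp appends .wav after extraction
        "quiet": True,
        "noplaylist": True,
    }
    url = "https://www.youtube.com/watch?v=..."  # placeholder
    with yt_dlp.YoutubeDL(opts) as ydl:
        title = ydl.extract_info(url, download=False).get("title", "video")  # probe only
        ydl.download([url])  # download + convert to WAV
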
main/app/core/editing.py DELETED
@@ -1,96 +0,0 @@
- import os
- import sys
- import random
- import librosa
- import subprocess
-
- import numpy as np
- import soundfile as sf
-
- sys.path.append(os.getcwd())
-
- from main.app.core.ui import gr_info, gr_warning, process_output
- from main.app.variables import python, translations, configs, config
-
- def audio_effects(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input, main_vol, combine_vol):
-     if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path):
-         gr_warning(translations["input_not_valid"])
-         return None
-
-     if not output_path:
-         gr_warning(translations["output_not_valid"])
-         return None
-
-     if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_effects.{export_format}")
-     output_dir = os.path.dirname(output_path) or output_path
-
-     if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-     output_path = process_output(output_path)
-
-     gr_info(translations["start"].format(start=translations["apply_effect"]))
-
-     if config.debug_mode: subprocess.run([python, configs["audio_effects_path"], "--input_path", input_path, "--output_path", output_path, "--resample", str(resample), "--resample_sr", str(resample_sr), "--chorus_depth", str(chorus_depth), "--chorus_rate", str(chorus_rate), "--chorus_mix", str(chorus_mix), "--chorus_delay", str(chorus_delay), "--chorus_feedback", str(chorus_feedback), "--drive_db", str(distortion_drive), "--reverb_room_size", str(reverb_room_size), "--reverb_damping", str(reverb_damping), "--reverb_wet_level", str(reverb_wet_level), "--reverb_dry_level", str(reverb_dry_level), "--reverb_width", str(reverb_width), "--reverb_freeze_mode", str(reverb_freeze_mode), "--pitch_shift", str(pitch_shift), "--delay_seconds", str(delay_seconds), "--delay_feedback", str(delay_feedback), "--delay_mix", str(delay_mix), "--compressor_threshold", str(compressor_threshold), "--compressor_ratio", str(compressor_ratio), "--compressor_attack_ms", str(compressor_attack_ms), "--compressor_release_ms", str(compressor_release_ms), "--limiter_threshold", str(limiter_threshold), "--limiter_release", str(limiter_release), "--gain_db", str(gain_db), "--bitcrush_bit_depth", str(bitcrush_bit_depth), "--clipping_threshold", str(clipping_threshold), "--phaser_rate_hz", str(phaser_rate_hz), "--phaser_depth", str(phaser_depth), "--phaser_centre_frequency_hz", str(phaser_centre_frequency_hz), "--phaser_feedback", str(phaser_feedback), "--phaser_mix", str(phaser_mix), "--bass_boost_db", str(bass_boost_db), "--bass_boost_frequency", str(bass_boost_frequency), "--treble_boost_db", str(treble_boost_db), "--treble_boost_frequency", str(treble_boost_frequency), "--fade_in_duration", str(fade_in_duration), "--fade_out_duration", str(fade_out_duration), "--export_format", export_format, "--chorus", str(chorus), "--distortion", str(distortion), "--reverb", str(reverb), "--pitchshift", str(pitch_shift != 0), "--delay", str(delay), "--compressor", str(compressor), "--limiter", str(limiter), "--gain", str(gain), "--bitcrush", str(bitcrush), "--clipping", str(clipping), "--phaser", str(phaser), "--treble_bass_boost", str(treble_bass_boost), "--fade_in_out", str(fade_in_out), "--audio_combination", str(audio_combination), "--audio_combination_input", audio_combination_input, "--main_volume", str(main_vol), "--combination_volume", str(combine_vol)])
-     else:
-         from main.inference.audio_effects import process_audio
-
-         process_audio(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, pitch_shift != 0, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input, main_vol, combine_vol)
-
-     gr_info(translations["success"])
-     return output_path.replace("wav", export_format)
-
- def vibrato(y, sr, freq=5, depth=0.003):
-     return y[np.clip((np.arange(len(y)) + (depth * np.sin(2 * np.pi * freq * (np.arange(len(y)) / sr))) * sr).astype(int), 0, len(y) - 1)]
-
- def apply_voice_quirk(audio_path, mode, output_path, export_format):
-     if not audio_path or not os.path.exists(audio_path) or os.path.isdir(audio_path):
-         gr_warning(translations["input_not_valid"])
-         return None
-
-     if not output_path:
-         gr_warning(translations["output_not_valid"])
-         return None
-
-     if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_quirk.{export_format}")
-     output_dir = os.path.dirname(output_path) or output_path
-
-     if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-     output_path = process_output(output_path)
-
-     gr_info(translations["start"].format(start=translations["apply_effect"]))
-
-     y, sr = librosa.load(audio_path, sr=None)
-     output_path = output_path.replace("wav", export_format)
-
-     mode = translations["quirk_choice"][mode]
-     if mode == 0: mode = random.randint(1, 16)
-
-     if mode == 1: y *= np.random.uniform(0.5, 0.8, size=len(y))
-     elif mode == 2: y = librosa.effects.pitch_shift(y=y + np.random.normal(0, 0.01, y.shape), sr=sr, n_steps=np.random.uniform(-1.5, -3.5))
-     elif mode == 3: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=3), rate=1.2)
-     elif mode == 4: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=8), rate=1.3)
-     elif mode == 5: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-3), rate=0.75)
-     elif mode == 6: y *= np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.5 + 0.5
-     elif mode == 7: y = librosa.effects.time_stretch(vibrato(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-4), sr, freq=3, depth=0.004), rate=0.85)
-     elif mode == 8: y *= 0.6 + np.pad(y, (sr // 2, 0), mode='constant')[:len(y)] * 0.4
-     elif mode == 9: y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=2) + np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.02
-     elif mode == 10: y = vibrato(y, sr, freq=8, depth=0.005)
-     elif mode == 11: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=4), rate=1.25)
-     elif mode == 12: y = np.hstack([np.pad(f, (0, int(len(f)*0.3)), mode='edge') for f in librosa.util.frame(y, frame_length=2048, hop_length=512).T])
-     elif mode == 13: y = np.concatenate([y, np.sin(2 * np.pi * np.linspace(0, 1, int(0.05 * sr))) * 0.02])
-     elif mode == 14: y += np.random.normal(0, 0.005, len(y))
-     elif mode == 15:
-         frame = int(sr * 0.2)
-         chunks = [y[i:i + frame] for i in range(0, len(y), frame)]
-
-         np.random.shuffle(chunks)
-         y = np.concatenate(chunks)
-     elif mode == 16:
-         frame = int(sr * 0.3)
-
-         for i in range(0, len(y), frame * 2):
-             y[i:i+frame] = y[i:i+frame][::-1]
-
-     sf.write(output_path, y, sr, format=export_format)
-     gr_info(translations["success"])
-
-     return output_path
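
The vibrato() helper above modulates the read index of the signal rather than shifting pitch directly: each output sample is fetched from a position offset by depth * sin(2π·freq·t) seconds. A small usage sketch (assumes librosa and soundfile are installed and an existing in.wav; hypothetical filenames):

    import numpy as np
    import librosa
    import soundfile as sf

    def vibrato(y, sr, freq=5, depth=0.003):
        # Same index-modulation trick as above: depth is the maximum
        # read-offset in seconds, freq the wobble rate in Hz.
        t = np.arange(len(y))
        idx = (t + depth * np.sin(2 * np.pi * freq * t / sr) * sr).astype(int)
        return y[np.clip(idx, 0, len(y) - 1)]

    y, sr = librosa.load("in.wav", sr=None)
    sf.write("out.wav", vibrato(y, sr), sr)
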
main/app/core/f0_extract.py DELETED
@@ -1,54 +0,0 @@
- import os
- import sys
- import librosa
-
- import numpy as np
- import matplotlib.pyplot as plt
-
- sys.path.append(os.getcwd())
-
- from main.library.utils import check_assets
- from main.app.core.ui import gr_info, gr_warning
- from main.library.predictors.Generator import Generator
- from main.app.variables import config, translations, configs
-
- def f0_extract(audio, f0_method, f0_onnx):
-     if not audio or not os.path.exists(audio) or os.path.isdir(audio):
-         gr_warning(translations["input_not_valid"])
-         return [None]*2
-
-     check_assets(f0_method, None, f0_onnx, None)
-
-     f0_path = os.path.join(configs["f0_path"], os.path.splitext(os.path.basename(audio))[0])
-     image_path = os.path.join(f0_path, "f0.png")
-     txt_path = os.path.join(f0_path, "f0.txt")
-
-     gr_info(translations["start_extract"])
-
-     if not os.path.exists(f0_path): os.makedirs(f0_path, exist_ok=True)
-
-     y, sr = librosa.load(audio, sr=None)
-
-     f0_generator = Generator(sr, 160, 50, 1600, is_half=config.is_half, device=config.device, f0_onnx_mode=f0_onnx, del_onnx_model=f0_onnx)
-     _, pitchf = f0_generator.calculator(config.x_pad, f0_method, y, 0, None, 3, False, 0, None, False)
-
-     F_temp = np.array(pitchf, dtype=np.float32)
-     F_temp[F_temp == 0] = np.nan
-
-     f0 = 1200 * np.log2(F_temp / librosa.midi_to_hz(0))
-
-     plt.figure(figsize=(10, 4))
-     plt.plot(f0)
-     plt.title(f0_method)
-     plt.xlabel(translations["time_frames"])
-     plt.ylabel(translations["Frequency"])
-     plt.savefig(image_path)
-     plt.close()
-
-     with open(txt_path, "w") as f:
-         for i, f0_value in enumerate(f0):
-             f.write(f"{i * sr / 160},{f0_value}\n")
-
-     gr_info(translations["extract_done"])
-
-     return [txt_path, image_path]
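
The conversion above maps each F0 value to cents relative to MIDI note 0 (librosa.midi_to_hz(0) ≈ 8.18 Hz), after masking unvoiced frames (F0 == 0) with NaN so they drop out of the plot. The same step isolated for clarity:

    import numpy as np
    import librosa

    def hz_to_cents(f_hz):
        # 1200 cents per octave, referenced to MIDI note 0 (~8.18 Hz);
        # unvoiced frames (0 Hz) become NaN instead of -inf.
        f = np.asarray(f_hz, dtype=np.float32).copy()
        f[f == 0] = np.nan
        return 1200 * np.log2(f / librosa.midi_to_hz(0))

    print(hz_to_cents(np.array([440.0])))  # A4 -> 6900.0 cents (69 semitones above MIDI 0)
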
main/app/core/inference.py DELETED
@@ -1,387 +0,0 @@
- import os
- import re
- import sys
- import shutil
- import librosa
- import datetime
- import subprocess
-
- import numpy as np
-
- sys.path.append(os.getcwd())
-
- from main.app.core.ui import gr_info, gr_warning, gr_error, process_output
- from main.app.variables import logger, config, configs, translations, python
-
- def convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold):
-     if config.debug_mode: subprocess.run([python, configs["convert_path"], "--pitch", str(pitch), "--filter_radius", str(filter_radius), "--index_rate", str(index_rate), "--rms_mix_rate", str(rms_mix_rate), "--protect", str(protect), "--hop_length", str(hop_length), "--f0_method", f0_method, "--input_path", input_path, "--output_path", output_path, "--pth_path", pth_path, "--index_path", index_path, "--f0_autotune", str(f0_autotune), "--clean_audio", str(clean_audio), "--clean_strength", str(clean_strength), "--export_format", export_format, "--embedder_model", embedder_model, "--resample_sr", str(resample_sr), "--split_audio", str(split_audio), "--f0_autotune_strength", str(f0_autotune_strength), "--checkpointing", str(checkpointing), "--f0_onnx", str(f0_onnx), "--embedders_mode", embedders_mode, "--formant_shifting", str(formant_shifting), "--formant_qfrency", str(formant_qfrency), "--formant_timbre", str(formant_timbre), "--f0_file", f0_file, "--proposal_pitch", str(proposal_pitch), "--proposal_pitch_threshold", str(proposal_pitch_threshold)])
-     else:
-         from main.inference.conversion.convert import run_convert_script
-
-         run_convert_script(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, f0_autotune_strength, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, checkpointing, f0_file, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold)
-
- def convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, input_audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold):
-     model_path = os.path.join(configs["weights_path"], model) if not os.path.exists(model) else model
-
-     return_none = [None]*6
-     return_none[5] = {"visible": True, "__type__": "update"}
-
-     if not use_audio:
-         if merge_instrument or not_merge_backing or convert_backing or use_original:
-             gr_warning(translations["turn_on_use_audio"])
-             return return_none
-
-     if use_original:
-         if convert_backing:
-             gr_warning(translations["turn_off_convert_backup"])
-             return return_none
-         elif not_merge_backing:
-             gr_warning(translations["turn_off_merge_backup"])
-             return return_none
-
-     if not model or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
-         gr_warning(translations["provide_file"].format(filename=translations["model"]))
-         return return_none
-
-     f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
-
-     if use_audio:
-         output_audio = os.path.join(configs["audios_path"], input_audio_name)
-
-         from main.library.utils import pydub_load
-
-         def get_audio_file(label):
-             matching_files = [f for f in os.listdir(output_audio) if label in f]
-
-             if not matching_files: return translations["notfound"]
-             return os.path.join(output_audio, matching_files[0])
-
-         output_path = os.path.join(output_audio, f"Convert_Vocals.{format}")
-         output_backing = os.path.join(output_audio, f"Convert_Backing.{format}")
-         output_merge_backup = os.path.join(output_audio, f"Vocals+Backing.{format}")
-         output_merge_instrument = os.path.join(output_audio, f"Vocals+Instruments.{format}")
-
-         if not os.path.exists(output_audio): os.makedirs(output_audio, exist_ok=True)
-         output_path = process_output(output_path)
-
-         if use_original:
-             original_vocal = get_audio_file('Original_Vocals_No_Reverb.')
-
-             if original_vocal == translations["notfound"]: original_vocal = get_audio_file('Original_Vocals.')
-
-             if original_vocal == translations["notfound"]:
-                 gr_warning(translations["not_found_original_vocal"])
-                 return return_none
-
-             input_path = original_vocal
-         else:
-             main_vocal = get_audio_file('Main_Vocals_No_Reverb.')
-             backing_vocal = get_audio_file('Backing_Vocals_No_Reverb.')
-
-             if main_vocal == translations["notfound"]: main_vocal = get_audio_file('Main_Vocals.')
-             if not not_merge_backing and backing_vocal == translations["notfound"]: backing_vocal = get_audio_file('Backing_Vocals.')
-
-             if main_vocal == translations["notfound"]:
-                 gr_warning(translations["not_found_main_vocal"])
-                 return return_none
-
-             if not not_merge_backing and backing_vocal == translations["notfound"]:
-                 gr_warning(translations["not_found_backing_vocal"])
-                 return return_none
-
-             input_path = main_vocal
-             backing_path = backing_vocal
-
-         gr_info(translations["convert_vocal"])
-
-         convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input_path, output_path, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold)
-
-         gr_info(translations["convert_success"])
-
-         if convert_backing:
-             output_backing = process_output(output_backing)
-
-             gr_info(translations["convert_backup"])
-
-             convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, backing_path, output_backing, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold)
-
-             gr_info(translations["convert_backup_success"])
-
-         try:
-             if not not_merge_backing and not use_original:
-                 backing_source = output_backing if convert_backing else backing_vocal
-
-                 output_merge_backup = process_output(output_merge_backup)
-
-                 gr_info(translations["merge_backup"])
-
-                 pydub_load(output_path, volume=-4).overlay(pydub_load(backing_source, volume=-6)).export(output_merge_backup, format=format)
-
-                 gr_info(translations["merge_success"])
-
-             if merge_instrument:
-                 vocals = output_merge_backup if not not_merge_backing and not use_original else output_path
-
-                 output_merge_instrument = process_output(output_merge_instrument)
-
-                 gr_info(translations["merge_instruments_process"])
-
-                 instruments = get_audio_file('Instruments.')
-
-                 if instruments == translations["notfound"]:
-                     gr_warning(translations["not_found_instruments"])
-                     output_merge_instrument = None
-                 else: pydub_load(instruments, volume=-7).overlay(pydub_load(vocals, volume=-4 if use_original else None)).export(output_merge_instrument, format=format)
-
-                 gr_info(translations["merge_success"])
-         except:
-             return return_none
-
-         return [(None if use_original else output_path), output_backing, (None if not_merge_backing and use_original else output_merge_backup), (output_path if use_original else None), (output_merge_instrument if merge_instrument else None), {"visible": True, "__type__": "update"}]
-     else:
-         if not input or not os.path.exists(input) or os.path.isdir(input):
-             gr_warning(translations["input_not_valid"])
-             return return_none
-
-         if not output:
-             gr_warning(translations["output_not_valid"])
-             return return_none
-
-         output = output.replace("wav", format)
-
-         if os.path.isdir(input):
-             gr_info(translations["is_folder"])
-
-             if not [f for f in os.listdir(input) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]:
-                 gr_warning(translations["not_found_in_folder"])
-                 return return_none
-
-             gr_info(translations["batch_convert"])
-
-             output_dir = os.path.dirname(output) or output
-             convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output_dir, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold)
-
-             gr_info(translations["batch_convert_success"])
-
-             return return_none
-         else:
-             output_dir = os.path.dirname(output) or output
-
-             if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-             output = process_output(output)
-
-             gr_info(translations["convert_vocal"])
-
-             convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold)
-
-             gr_info(translations["convert_success"])
-
-             return_none[0] = output
-             return return_none
-
- def convert_selection(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold):
-     if use_audio:
-         gr_info(translations["search_separate"])
-         choice = [f for f in os.listdir(configs["audios_path"]) if os.path.isdir(os.path.join(configs["audios_path"], f))] if config.debug_mode else [f for f in os.listdir(configs["audios_path"]) if os.path.isdir(os.path.join(configs["audios_path"], f)) and any(file.lower().endswith((".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")) for file in os.listdir(os.path.join(configs["audios_path"], f)))]
-
-         gr_info(translations["found_choice"].format(choice=len(choice)))
-
-         if len(choice) == 0:
-             gr_warning(translations["separator==0"])
-
-             return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, None, None, None, None, None, {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]
-         elif len(choice) == 1:
-             convert_output = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, None, None, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, choice[0], checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold)
-
-             return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, convert_output[0], convert_output[1], convert_output[2], convert_output[3], convert_output[4], {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]
-         else: return [{"choices": choice, "value": choice[0], "interactive": True, "visible": True, "__type__": "update"}, None, None, None, None, None, {"visible": False, "__type__": "update"}, {"visible": True, "__type__": "update"}]
-     else:
-         main_convert = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, None, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold)
-
-         return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, main_convert[0], None, None, None, None, {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]
-
- def convert_with_whisper(num_spk, model_size, cleaner, clean_strength, autotune, f0_autotune_strength, checkpointing, model_1, model_2, model_index_1, model_index_2, pitch_1, pitch_2, index_strength_1, index_strength_2, export_format, input_audio, output_audio, onnx_f0_mode, method, hybrid_method, hop_length, embed_mode, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, formant_shifting, formant_qfrency_1, formant_timbre_1, formant_qfrency_2, formant_timbre_2, proposal_pitch, proposal_pitch_threshold):
-     from pydub import AudioSegment
-     from sklearn.cluster import AgglomerativeClustering
-
-     from main.library.speaker_diarization.audio import Audio
-     from main.library.speaker_diarization.segment import Segment
-     from main.library.speaker_diarization.whisper import load_model
-     from main.library.utils import check_spk_diarization, pydub_load
-     from main.library.speaker_diarization.embedding import SpeechBrainPretrainedSpeakerEmbedding
-
-     check_spk_diarization(model_size)
-     model_pth_1, model_pth_2 = os.path.join(configs["weights_path"], model_1) if not os.path.exists(model_1) else model_1, os.path.join(configs["weights_path"], model_2) if not os.path.exists(model_2) else model_2
-
-     if (not model_1 or not os.path.exists(model_pth_1) or os.path.isdir(model_pth_1) or not model_pth_1.endswith((".pth", ".onnx"))) and (not model_2 or not os.path.exists(model_pth_2) or os.path.isdir(model_pth_2) or not model_pth_2.endswith((".pth", ".onnx"))):
-         gr_warning(translations["provide_file"].format(filename=translations["model"]))
-         return None
-
-     if not model_1: model_pth_1 = model_pth_2
-     if not model_2: model_pth_2 = model_pth_1
-
-     if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
-         gr_warning(translations["input_not_valid"])
-         return None
-
-     if not output_audio:
-         gr_warning(translations["output_not_valid"])
-         return None
-
-     output_audio = process_output(output_audio)
-     gr_info(translations["start_whisper"])
-
-     try:
-         audio = Audio()
-
-         embedding_model = SpeechBrainPretrainedSpeakerEmbedding(embedding=os.path.join(configs["speaker_diarization_path"], "models", "speechbrain"), device=config.device)
-         segments = load_model(model_size, device=config.device).transcribe(input_audio, fp16=configs.get("fp16", False), word_timestamps=True)["segments"]
-
-         y, sr = librosa.load(input_audio, sr=None)
-         duration = len(y) / sr
-
-         def segment_embedding(segment):
-             waveform, _ = audio.crop(input_audio, Segment(segment["start"], min(duration, segment["end"])))
-             return embedding_model(waveform.mean(dim=0, keepdim=True)[None] if waveform.shape[0] == 2 else waveform[None])
-
-         def time(secs):
-             return datetime.timedelta(seconds=round(secs))
-
-         def merge_audio(files_list, time_stamps, original_file_path, output_path, format):
-             def extract_number(filename):
-                 match = re.search(r'_(\d+)', filename)
-                 return int(match.group(1)) if match else 0
-
-             total_duration = len(pydub_load(original_file_path))
-             combined = AudioSegment.empty()
-             current_position = 0
-
-             for file, (start_i, end_i) in zip(sorted(files_list, key=extract_number), time_stamps):
-                 if start_i > current_position: combined += AudioSegment.silent(duration=start_i - current_position)
-
-                 combined += pydub_load(file)
-                 current_position = end_i
-
-             if current_position < total_duration: combined += AudioSegment.silent(duration=total_duration - current_position)
-             combined.export(output_path, format=format)
-
-             return output_path
-
-         embeddings = np.zeros(shape=(len(segments), 192))
-         for i, segment in enumerate(segments):
-             embeddings[i] = segment_embedding(segment)
-
-         labels = AgglomerativeClustering(num_spk).fit(np.nan_to_num(embeddings)).labels_
-         for i in range(len(segments)):
-             segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
-         merged_segments, current_text = [], []
-         current_speaker, current_start = None, None
-
-         for i, segment in enumerate(segments):
-             speaker = segment["speaker"]
-             start_time = segment["start"]
-             text = segment["text"][1:]
-
-             if speaker == current_speaker:
-                 current_text.append(text)
-                 end_time = segment["end"]
-             else:
-                 if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})
-
-                 current_speaker = speaker
-                 current_start = start_time
-                 current_text = [text]
-                 end_time = segment["end"]
-
-         if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})
-
-         gr_info(translations["whisper_done"])
-
-         x = ""
-         for segment in merged_segments:
-             x += f"\n{segment['speaker']} {str(time(segment['start']))} - {str(time(segment['end']))}\n"
-             x += segment["text"] + "\n"
-
-         logger.info(x)
-
-         gr_info(translations["process_audio"])
-
-         audio = pydub_load(input_audio)
-         output_folder = "audios_temp"
-
-         if os.path.exists(output_folder): shutil.rmtree(output_folder, ignore_errors=True)
-         for f in [output_folder, os.path.join(output_folder, "1"), os.path.join(output_folder, "2")]:
-             os.makedirs(f, exist_ok=True)
-
-         time_stamps, processed_segments = [], []
-         for i, segment in enumerate(merged_segments):
-             start_ms = int(segment["start"] * 1000)
-             end_ms = int(segment["end"] * 1000)
-
-             index = i + 1
-
-             segment_filename = os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}.wav")
-             audio[start_ms:end_ms].export(segment_filename, format="wav")
-
-             processed_segments.append(os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}_output.wav"))
-             time_stamps.append((start_ms, end_ms))
-
-         f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
-
-         gr_info(translations["process_done_start_convert"])
-
-         convert(pitch_1, filter_radius, index_strength_1, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "1"), output_folder, model_pth_1, model_index_1, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_1, formant_timbre_1, "", proposal_pitch, proposal_pitch_threshold)
-         convert(pitch_2, filter_radius, index_strength_2, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "2"), output_folder, model_pth_2, model_index_2, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_2, formant_timbre_2, "", proposal_pitch, proposal_pitch_threshold)
-
-         gr_info(translations["convert_success"])
-         return merge_audio(processed_segments, time_stamps, input_audio, output_audio.replace("wav", export_format), export_format)
-     except Exception as e:
-         gr_error(translations["error_occurred"].format(e=e))
-         import traceback
-         logger.debug(traceback.format_exc())
-         return None
-     finally:
-         if os.path.exists("audios_temp"): shutil.rmtree("audios_temp", ignore_errors=True)
-
- def convert_tts(clean, autotune, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold):
-     model_path = os.path.join(configs["weights_path"], model) if not os.path.exists(model) else model
-
-     if not model_path or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
-         gr_warning(translations["provide_file"].format(filename=translations["model"]))
-         return None
-
-     if not input or not os.path.exists(input):
-         gr_warning(translations["input_not_valid"])
-         return None
-
-     if os.path.isdir(input):
-         input_audio = [f for f in os.listdir(input) if "tts" in f and f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]
-
-         if not input_audio:
-             gr_warning(translations["not_found_in_folder"])
-             return None
-
-         input = os.path.join(input, input_audio[0])
-
-     if not output:
-         gr_warning(translations["output_not_valid"])
-         return None
-
-     output = output.replace("wav", format)
-     if os.path.isdir(output): output = os.path.join(output, f"tts.{format}")
-
-     output_dir = os.path.dirname(output)
-     if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-
-     output = process_output(output)
-
-     f0method = method if method != "hybrid" else hybrid_method
-     embedder_model = embedders if embedders != "custom" else custom_embedders
-
-     gr_info(translations["convert_vocal"])
-
-     convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold)
-
-     gr_info(translations["convert_success"])
-     return output
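
In convert_with_whisper above, speakers are separated by embedding each Whisper segment as a 192-dim speaker vector and clustering the vectors. The clustering step in isolation (random vectors stand in for real embeddings):

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    num_spk = 2
    embeddings = np.random.rand(10, 192)  # placeholder for per-segment speaker embeddings
    labels = AgglomerativeClustering(num_spk).fit(np.nan_to_num(embeddings)).labels_
    for i, lab in enumerate(labels):
        print(f"segment {i}: SPEAKER {lab + 1}")
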
main/app/core/model_utils.py DELETED
@@ -1,162 +0,0 @@
- import os
- import sys
- import json
- import onnx
- import torch
- import datetime
-
- from collections import OrderedDict
-
- sys.path.append(os.getcwd())
-
- from main.app.core.ui import gr_info, gr_warning, gr_error
- from main.library.algorithm.onnx_export import onnx_exporter
- from main.app.variables import config, logger, translations, configs
-
- def fushion_model_pth(name, pth_1, pth_2, ratio):
-     if not name.endswith(".pth"): name = name + ".pth"
-
-     if not pth_1 or not os.path.exists(pth_1) or not pth_1.endswith(".pth"):
-         gr_warning(translations["provide_file"].format(filename=translations["model"] + " 1"))
-         return [translations["provide_file"].format(filename=translations["model"] + " 1"), None]
-
-     if not pth_2 or not os.path.exists(pth_2) or not pth_2.endswith(".pth"):
-         gr_warning(translations["provide_file"].format(filename=translations["model"] + " 2"))
-         return [translations["provide_file"].format(filename=translations["model"] + " 2"), None]
-
-     def extract(ckpt):
-         a = ckpt["model"]
-         opt = OrderedDict()
-         opt["weight"] = {}
-
-         for key in a.keys():
-             if "enc_q" in key: continue
-
-             opt["weight"][key] = a[key]
-
-         return opt
-
-     try:
-         ckpt1 = torch.load(pth_1, map_location="cpu", weights_only=True)
-         ckpt2 = torch.load(pth_2, map_location="cpu", weights_only=True)
-
-         if ckpt1["sr"] != ckpt2["sr"]:
-             gr_warning(translations["sr_not_same"])
-             return [translations["sr_not_same"], None]
-
-         cfg = ckpt1["config"]
-         cfg_f0 = ckpt1["f0"]
-         cfg_version = ckpt1["version"]
-         cfg_sr = ckpt1["sr"]
-
-         vocoder = ckpt1.get("vocoder", "Default")
-         rms_extract = ckpt1.get("energy", False)
-
-         ckpt1 = extract(ckpt1) if "model" in ckpt1 else ckpt1["weight"]
-         ckpt2 = extract(ckpt2) if "model" in ckpt2 else ckpt2["weight"]
-
-         if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
-             gr_warning(translations["architectures_not_same"])
-             return [translations["architectures_not_same"], None]
-
-         gr_info(translations["start"].format(start=translations["fushion_model"]))
-
-         opt = OrderedDict()
-         opt["weight"] = {}
-
-         for key in ckpt1.keys():
-             if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
-                 min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
-                 opt["weight"][key] = (ratio * (ckpt1[key][:min_shape0].float()) + (1 - ratio) * (ckpt2[key][:min_shape0].float())).half()
-             else: opt["weight"][key] = (ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())).half()
-
-         opt["config"] = cfg
-         opt["sr"] = cfg_sr
-         opt["f0"] = cfg_f0
-         opt["version"] = cfg_version
-         opt["infos"] = translations["model_fushion_info"].format(name=name, pth_1=pth_1, pth_2=pth_2, ratio=ratio)
-         opt["vocoder"] = vocoder
-         opt["energy"] = rms_extract
-
-         output_model = configs["weights_path"]
-         if not os.path.exists(output_model): os.makedirs(output_model, exist_ok=True)
-
-         torch.save(opt, os.path.join(output_model, name))
-
-         gr_info(translations["success"])
-         return [translations["success"], os.path.join(output_model, name)]
-     except Exception as e:
-         gr_error(message=translations["error_occurred"].format(e=e))
-         return [e, None]
-
- def fushion_model(name, path_1, path_2, ratio):
-     if not name:
-         gr_warning(translations["provide_name_is_save"])
-         return [translations["provide_name_is_save"], None]
-
-     if path_1.endswith(".pth") and path_2.endswith(".pth"): return fushion_model_pth(name.replace(".onnx", ".pth"), path_1, path_2, ratio)
-     else:
-         gr_warning(translations["format_not_valid"])
-         return [None, None]
-
- def onnx_export(model_path):
-     if not model_path.endswith(".pth"): model_path += ".pth"
-     if not model_path or not os.path.exists(model_path) or not model_path.endswith(".pth"): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
-     try:
-         gr_info(translations["start_onnx_export"])
-         output = onnx_exporter(model_path, model_path.replace(".pth", ".onnx"), is_half=config.is_half, device=config.device)
-
-         gr_info(translations["success"])
-         return output
-     except Exception as e:
-         return gr_error(e)
-
- def model_info(path):
-     if not path or not os.path.exists(path) or os.path.isdir(path) or not path.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
-     def prettify_date(date_str):
-         if date_str == translations["not_found_create_time"]: return None
-
-         try:
-             return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f").strftime("%Y-%m-%d %H:%M:%S")
-         except ValueError as e:
-             logger.debug(e)
-             return translations["format_not_valid"]
-
-     if path.endswith(".pth"): model_data = torch.load(path, map_location=torch.device("cpu"))
-     else:
-         model = onnx.load(path)
-         model_data = None
-
-         for prop in model.metadata_props:
-             if prop.key == "model_info":
-                 model_data = json.loads(prop.value)
-                 break
-
-     gr_info(translations["read_info"])
-
-     epochs = model_data.get("epoch", None)
-     if epochs is None:
-         epochs = model_data.get("info", None)
-         try:
-             epoch = epochs.replace("epoch", "").replace("e", "").isdigit()
-             if epoch and epochs is None: epochs = translations["not_found"].format(name=translations["epoch"])
-         except:
-             pass
-
-     steps = model_data.get("step", translations["not_found"].format(name=translations["step"]))
-     sr = model_data.get("sr", translations["not_found"].format(name=translations["sr"]))
-     f0 = model_data.get("f0", translations["not_found"].format(name=translations["f0"]))
-     version = model_data.get("version", translations["not_found"].format(name=translations["version"]))
-     creation_date = model_data.get("creation_date", translations["not_found_create_time"])
-     model_hash = model_data.get("model_hash", translations["not_found"].format(name="model_hash"))
-     pitch_guidance = translations["trained_f0"] if f0 else translations["not_f0"]
-     creation_date_str = prettify_date(creation_date) if creation_date else translations["not_found_create_time"]
-     model_name = model_data.get("model_name", translations["unregistered"])
-     model_author = model_data.get("author", translations["not_author"])
-     vocoder = model_data.get("vocoder", "Default")
-     rms_extract = model_data.get("energy", False)
-
-     gr_info(translations["success"])
-     return translations["model_info"].format(model_name=model_name, model_author=model_author, epochs=epochs, steps=steps, version=version, sr=sr, pitch_guidance=pitch_guidance, model_hash=model_hash, creation_date_str=creation_date_str, vocoder=vocoder, rms_extract=rms_extract)
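
fushion_model_pth above is, at its core, a linear interpolation of two state dicts, w = r·w1 + (1 − r)·w2, with a shape guard for the speaker-embedding table. Reduced to a sketch (hypothetical helper name; assumes only torch):

    import torch

    def blend_weights(w1: dict, w2: dict, ratio: float) -> dict:
        # Linear blend of two checkpoints' tensors, cast back to half precision.
        out = {}
        for key in w1:
            a, b = w1[key].float(), w2[key].float()
            if a.shape != b.shape:  # e.g. emb_g.weight with different speaker counts
                n = min(a.shape[0], b.shape[0])
                a, b = a[:n], b[:n]
            out[key] = (ratio * a + (1 - ratio) * b).half()
        return out
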
main/app/core/presets.py DELETED
@@ -1,165 +0,0 @@
- import os
- import sys
- import json
-
- sys.path.append(os.getcwd())
-
- from main.app.variables import translations, configs
- from main.app.core.ui import gr_info, gr_warning, change_preset_choices, change_effect_preset_choices
-
- def load_presets(presets, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, formant_shifting, formant_qfrency, formant_timbre):
-     if not presets: gr_warning(translations["provide_file_settings"])
-
-     file = {}
-     if presets:
-         with open(os.path.join(configs["presets_path"], presets)) as f:
-             file = json.load(f)
-
-     gr_info(translations["load_presets"].format(presets=presets))
-     return [file.get("cleaner", cleaner), file.get("autotune", autotune), file.get("pitch", pitch), file.get("clean_strength", clean_strength), file.get("index_strength", index_strength), file.get("resample_sr", resample_sr), file.get("filter_radius", filter_radius), file.get("rms_mix_rate", rms_mix_rate), file.get("protect", protect), file.get("split_audio", split_audio), file.get("f0_autotune_strength", f0_autotune_strength), file.get("formant_shifting", formant_shifting), file.get("formant_qfrency", formant_qfrency), file.get("formant_timbre", formant_timbre)]
-
- def save_presets(name, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, rms_mix_rate_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox, formant_shifting, formant_qfrency, formant_timbre):
-     if not name: return gr_warning(translations["provide_filename_settings"])
-     if not any([cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, rms_mix_rate_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox]): return gr_warning(translations["choose1"])
-
-     settings = {}
-
-     for checkbox, data in [(cleaner_chbox, {"cleaner": cleaner, "clean_strength": clean_strength}), (autotune_chbox, {"autotune": autotune, "f0_autotune_strength": f0_autotune_strength}), (pitch_chbox, {"pitch": pitch}), (index_strength_chbox, {"index_strength": index_strength}), (resample_sr_chbox, {"resample_sr": resample_sr}), (filter_radius_chbox, {"filter_radius": filter_radius}), (rms_mix_rate_chbox, {"rms_mix_rate": rms_mix_rate}), (protect_chbox, {"protect": protect}), (split_audio_chbox, {"split_audio": split_audio}), (formant_shifting_chbox, {"formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre})]:
-         if checkbox: settings.update(data)
-
-     with open(os.path.join(configs["presets_path"], name + ".conversion.json"), "w") as f:
-         json.dump(settings, f, indent=4)
-
-     gr_info(translations["export_settings"].format(name=name))
-     return change_preset_choices()
-
- def audio_effect_load_presets(presets, resample_checkbox, audio_effect_resample_sr, chorus_depth, chorus_rate_hz, chorus_mix, chorus_centre_delay_ms, chorus_feedback, distortion_drive_db, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift_semitones, delay_second, delay_feedback, delay_mix, compressor_threshold_db, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold_db, limiter_release_ms, gain_db, bitcrush_bit_depth, clipping_threshold_db, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost, bass_frequency, treble_boost, treble_frequency, fade_in, fade_out, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade):
-     if not presets: gr_warning(translations["provide_file_settings"])
-
-     file = {}
-     if presets:
-         with open(os.path.join(configs["presets_path"], presets)) as f:
-             file = json.load(f)
-
-     gr_info(translations["load_presets"].format(presets=presets))
-     return [
-         file.get("resample_checkbox", resample_checkbox), file.get("audio_effect_resample_sr", audio_effect_resample_sr),
-         file.get("chorus_depth", chorus_depth), file.get("chorus_rate_hz", chorus_rate_hz),
-         file.get("chorus_mix", chorus_mix), file.get("chorus_centre_delay_ms", chorus_centre_delay_ms),
-         file.get("chorus_feedback", chorus_feedback), file.get("distortion_drive_db", distortion_drive_db),
-         file.get("reverb_room_size", reverb_room_size), file.get("reverb_damping", reverb_damping),
-         file.get("reverb_wet_level", reverb_wet_level), file.get("reverb_dry_level", reverb_dry_level),
-         file.get("reverb_width", reverb_width), file.get("reverb_freeze_mode", reverb_freeze_mode),
-         file.get("pitch_shift_semitones", pitch_shift_semitones), file.get("delay_second", delay_second),
-         file.get("delay_feedback", delay_feedback), file.get("delay_mix", delay_mix),
-         file.get("compressor_threshold_db", compressor_threshold_db), file.get("compressor_ratio", compressor_ratio),
-         file.get("compressor_attack_ms", compressor_attack_ms), file.get("compressor_release_ms", compressor_release_ms),
-         file.get("limiter_threshold_db", limiter_threshold_db), file.get("limiter_release_ms", limiter_release_ms),
-         file.get("gain_db", gain_db), file.get("bitcrush_bit_depth", bitcrush_bit_depth),
-         file.get("clipping_threshold_db", clipping_threshold_db), file.get("phaser_rate_hz", phaser_rate_hz),
-         file.get("phaser_depth", phaser_depth), file.get("phaser_centre_frequency_hz", phaser_centre_frequency_hz),
-         file.get("phaser_feedback", phaser_feedback), file.get("phaser_mix", phaser_mix),
-         file.get("bass_boost", bass_boost), file.get("bass_frequency", bass_frequency),
-         file.get("treble_boost", treble_boost), file.get("treble_frequency", treble_frequency),
-         file.get("fade_in", fade_in), file.get("fade_out", fade_out),
-         file.get("chorus_check_box", chorus_check_box), file.get("distortion_checkbox", distortion_checkbox),
-         file.get("reverb_check_box", reverb_check_box), file.get("delay_check_box", delay_check_box),
-         file.get("compressor_check_box", compressor_check_box), file.get("limiter", limiter),
-         file.get("gain_checkbox", gain_checkbox), file.get("bitcrush_checkbox", bitcrush_checkbox),
-         file.get("clipping_checkbox", clipping_checkbox), file.get("phaser_check_box", phaser_check_box),
-         file.get("bass_or_treble", bass_or_treble), file.get("fade", fade)
-     ]
-
- def audio_effect_save_presets(name, resample_checkbox, audio_effect_resample_sr, chorus_depth, chorus_rate_hz, chorus_mix, chorus_centre_delay_ms, chorus_feedback, distortion_drive_db, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift_semitones, delay_second, delay_feedback, delay_mix, compressor_threshold_db, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold_db, limiter_release_ms, gain_db, bitcrush_bit_depth, clipping_threshold_db, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost, bass_frequency, treble_boost, treble_frequency, fade_in, fade_out, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade):
-     if not name: return gr_warning(translations["provide_filename_settings"])
-     if not any([resample_checkbox, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade, pitch_shift_semitones != 0]): return gr_warning(translations["choose1"])
-
-     settings = {}
-
-     for checkbox, data in [
-         (resample_checkbox, {
-             "resample_checkbox": resample_checkbox,
-             "audio_effect_resample_sr": audio_effect_resample_sr
-         }),
-         (chorus_check_box, {
-             "chorus_check_box": chorus_check_box,
-             "chorus_depth": chorus_depth,
-             "chorus_rate_hz": chorus_rate_hz,
-             "chorus_mix": chorus_mix,
-             "chorus_centre_delay_ms": chorus_centre_delay_ms,
-             "chorus_feedback": chorus_feedback
-         }),
-         (distortion_checkbox, {
-             "distortion_checkbox": distortion_checkbox,
-             "distortion_drive_db": distortion_drive_db
-         }),
-         (reverb_check_box, {
-             "reverb_check_box": reverb_check_box,
-             "reverb_room_size": reverb_room_size,
-             "reverb_damping": reverb_damping,
-             "reverb_wet_level": reverb_wet_level,
-             "reverb_dry_level": reverb_dry_level,
-             "reverb_width": reverb_width,
-             "reverb_freeze_mode": reverb_freeze_mode
-         }),
-         (pitch_shift_semitones != 0, {
-             "pitch_shift_semitones": pitch_shift_semitones
-         }),
-         (delay_check_box, {
-             "delay_check_box": delay_check_box,
-             "delay_second": delay_second,
-             "delay_feedback": delay_feedback,
-             "delay_mix": delay_mix
-         }),
-         (compressor_check_box, {
-             "compressor_check_box": compressor_check_box,
-             "compressor_threshold_db": compressor_threshold_db,
-             "compressor_ratio": compressor_ratio,
-             "compressor_attack_ms": compressor_attack_ms,
-             "compressor_release_ms": compressor_release_ms
-         }),
-         (limiter, {
-             "limiter": limiter,
-             "limiter_threshold_db": limiter_threshold_db,
-             "limiter_release_ms": limiter_release_ms
-         }),
-         (gain_checkbox, {
-             "gain_checkbox": gain_checkbox,
-             "gain_db": gain_db
-         }),
-         (bitcrush_checkbox, {
-             "bitcrush_checkbox": bitcrush_checkbox,
-             "bitcrush_bit_depth": bitcrush_bit_depth
-         }),
-         (clipping_checkbox, {
-             "clipping_checkbox": clipping_checkbox,
-             "clipping_threshold_db": clipping_threshold_db
-         }),
-         (phaser_check_box, {
-             "phaser_check_box": phaser_check_box,
-             "phaser_rate_hz": phaser_rate_hz,
-             "phaser_depth": phaser_depth,
-             "phaser_centre_frequency_hz": phaser_centre_frequency_hz,
-             "phaser_feedback": phaser_feedback,
-             "phaser_mix": phaser_mix
-         }),
-         (bass_or_treble, {
-             "bass_or_treble": bass_or_treble,
-             "bass_boost": bass_boost,
-             "bass_frequency": bass_frequency,
-             "treble_boost": treble_boost,
-             "treble_frequency": treble_frequency
-         }),
-         (fade, {
-             "fade": fade,
-             "fade_in": fade_in,
-             "fade_out": fade_out
-         })
-     ]:
-         if checkbox: settings.update(data)
-
-     with open(os.path.join(configs["presets_path"], name + ".effect.json"), "w") as f:
-         json.dump(settings, f, indent=4)
-
-     gr_info(translations["export_settings"].format(name=name))
-     return change_effect_preset_choices()
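Worth noting: `load_presets` only overrides values whose keys exist in the preset file, so a partial preset is valid. A minimal sketch of writing one by hand (file name and values are illustrative):

```python
import json

# Only these keys will override the UI defaults, because load_presets()
# falls back to the current value via file.get(key, current_value).
preset = {"pitch": 2, "cleaner": True, "clean_strength": 0.5}
with open("my_voice.conversion.json", "w") as f:
    json.dump(preset, f, indent=4)
```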
main/app/core/process.py DELETED
@@ -1,134 +0,0 @@
- import os
- import re
- import sys
- import shutil
- import codecs
- import zipfile
- import requests
- import xml.etree.ElementTree
-
- sys.path.append(os.getcwd())
-
- from main.app.variables import logger, translations, configs
- from main.app.core.ui import gr_info, gr_warning, gr_error, process_output
-
- def read_docx_text(path):
-     with zipfile.ZipFile(path) as docx:
-         with docx.open("word/document.xml") as document_xml:
-             xml_content = document_xml.read()
-
-     WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
-
-     paragraphs = []
-     for paragraph in xml.etree.ElementTree.XML(xml_content).iter(WORD_NAMESPACE + 'p'):
-         texts = [node.text for node in paragraph.iter(WORD_NAMESPACE + 't') if node.text]
-         if texts: paragraphs.append(''.join(texts))
-
-     return '\n'.join(paragraphs)
-
- def process_input(file_path):
-     if file_path.endswith(".srt"): file_contents = ""
-     elif file_path.endswith(".docx"): file_contents = read_docx_text(file_path)
-     else:
-         try:
-             with open(file_path, "r", encoding="utf-8") as file:
-                 file_contents = file.read()
-         except Exception as e:
-             gr_warning(translations["read_error"])
-             logger.debug(e)
-             file_contents = ""
-
-     gr_info(translations["upload_success"].format(name=translations["text"]))
-     return file_contents
-
- def move_files_from_directory(src_dir, dest_weights, dest_logs, model_name):
-     for root, _, files in os.walk(src_dir):
-         for file in files:
-             file_path = os.path.join(root, file)
-             if file.endswith(".index"):
-                 model_log_dir = os.path.join(dest_logs, model_name)
-                 os.makedirs(model_log_dir, exist_ok=True)
-
-                 filepath = process_output(os.path.join(model_log_dir, file.replace(' ', '_').replace('(', '').replace(')', '').replace('[', '').replace(']', '').replace(",", "").replace('"', "").replace("'", "").replace("|", "").replace("{", "").replace("}", "").strip()))
-
-                 shutil.move(file_path, filepath)
-             elif file.endswith(".pth") and not file.startswith("D_") and not file.startswith("G_"):
-                 pth_path = process_output(os.path.join(dest_weights, model_name + ".pth"))
-
-                 shutil.move(file_path, pth_path)
-             elif file.endswith(".onnx") and not file.startswith("D_") and not file.startswith("G_"):
-                 pth_path = process_output(os.path.join(dest_weights, model_name + ".onnx"))
-
-                 shutil.move(file_path, pth_path)
-
- def extract_name_model(filename):
-     match = re.search(r"_([A-Za-z0-9]+)(?=_v\d*)", filename.replace('-', '').replace('(', '').replace(')', '').replace('[', '').replace(']', '').replace(",", "").replace('"', "").replace("'", "").replace("|", "").replace("{", "").replace("}", "").strip())
-     return match.group(1) if match else None
-
- def save_drop_model(dropbox):
-     weight_folder = configs["weights_path"]
-     logs_folder = configs["logs_path"]
-     save_model_temp = "save_model_temp"
-
-     if not os.path.exists(weight_folder): os.makedirs(weight_folder, exist_ok=True)
-     if not os.path.exists(logs_folder): os.makedirs(logs_folder, exist_ok=True)
-     if not os.path.exists(save_model_temp): os.makedirs(save_model_temp, exist_ok=True)
-
-     shutil.move(dropbox, save_model_temp)
-
-     try:
-         file_name = os.path.basename(dropbox)
-
-         if file_name.endswith(".zip"):
-             shutil.unpack_archive(os.path.join(save_model_temp, file_name), save_model_temp)
-             move_files_from_directory(save_model_temp, weight_folder, logs_folder, file_name.replace(".zip", ""))
-         elif file_name.endswith((".pth", ".onnx")):
-             output_file = process_output(os.path.join(weight_folder, file_name))
-
-             shutil.move(os.path.join(save_model_temp, file_name), output_file)
-         elif file_name.endswith(".index"):
-             modelname = extract_name_model(file_name)
-             if modelname is None: modelname = os.path.splitext(os.path.basename(file_name))[0]
-
-             model_logs = os.path.join(logs_folder, modelname)
-             if not os.path.exists(model_logs): os.makedirs(model_logs, exist_ok=True)
-
-             shutil.move(os.path.join(save_model_temp, file_name), model_logs)
-         else:
-             gr_warning(translations["unable_analyze_model"])
-             return None
-
-         gr_info(translations["upload_success"].format(name=translations["model"]))
-         return None
-     except Exception as e:
-         gr_error(message=translations["error_occurred"].format(e=e))
-         return None
-     finally:
-         shutil.rmtree(save_model_temp, ignore_errors=True)
-
- def zip_file(name, pth, index):
-     pth_path = os.path.join(configs["weights_path"], pth)
-     if not pth or not os.path.exists(pth_path) or not pth.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-
-     zip_file_path = os.path.join(configs["logs_path"], name, name + ".zip")
-     gr_info(translations["start"].format(start=translations["zip"]))
-
-     with zipfile.ZipFile(zip_file_path, 'w') as zipf:
-         zipf.write(pth_path, os.path.basename(pth_path))
-         if index: zipf.write(index, os.path.basename(index))
-
-     gr_info(translations["success"])
-     return {"visible": True, "value": zip_file_path, "__type__": "update"}
-
- def fetch_pretrained_data():
-     try:
-         response = requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/wfba/phfgbz_cergenvarq.wfba", "rot13"))
-         response.raise_for_status()
-
-         return response.json()
-     except:
-         return {}
-
- def update_sample_rate_dropdown(model):
-     data = fetch_pretrained_data()
-     if model != translations["success"]: return {"choices": list(data[model].keys()), "value": list(data[model].keys())[0], "__type__": "update"}
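A quick check of the regex in `extract_name_model`, using a hypothetical index filename of the shape this project produces:

```python
import re

# "MyVoice" sits between "_" and "_v2", which is what the lookahead targets;
# the earlier "_IVF1024", "_Flat", "_nprobe", "_1" tokens fail the `_v` lookahead.
match = re.search(r"_([A-Za-z0-9]+)(?=_v\d*)", "added_IVF1024_Flat_nprobe_1_MyVoice_v2.index")
print(match.group(1) if match else None)  # MyVoice
```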
main/app/core/restart.py DELETED
@@ -1,48 +0,0 @@
- import os
- import sys
- import json
- import platform
- import subprocess
-
- sys.path.append(os.getcwd())
-
- from main.app.core.ui import gr_info
- from main.app.variables import python, translations, configs_json
-
- def restart_app(app):
-     gr_info(translations["30s"])
-     os.system("cls" if platform.system() == "Windows" else "clear")
-
-     app.close()
-     subprocess.run([python, os.path.join("main", "app", "app.py")] + sys.argv[1:])
-
- def change_language(lang, app):
-     configs = json.load(open(configs_json, "r"))
-
-     if lang != configs["language"]:
-         configs["language"] = lang
-
-         with open(configs_json, "w") as f:
-             json.dump(configs, f, indent=4)
-
-         restart_app(app)
-
- def change_theme(theme, app):
-     configs = json.load(open(configs_json, "r"))
-
-     if theme != configs["theme"]:
-         configs["theme"] = theme
-
-         with open(configs_json, "w") as f:
-             json.dump(configs, f, indent=4)
-
-         restart_app(app)
-
- def change_font(font, app):
-     configs = json.load(open(configs_json, "r"))
-
-     if font != configs["font"]:
-         configs["font"] = font
-
-         with open(configs_json, "w") as f:
-             json.dump(configs, f, indent=4)
-
-         restart_app(app)
main/app/core/separate.py DELETED
@@ -1,35 +0,0 @@
- import os
- import sys
- import subprocess
-
- sys.path.append(os.getcwd())
-
- from main.app.core.ui import gr_info, gr_warning
- from main.app.variables import python, translations, configs, config
-
- def separator_music(input, output_audio, format, shifts, segments_size, overlap, clean_audio, clean_strength, denoise, separator_model, kara_model, backing, reverb, backing_reverb, hop_length, batch_size, sample_rate):
-     output = os.path.dirname(output_audio) or output_audio
-
-     if not input or not os.path.exists(input) or os.path.isdir(input):
-         gr_warning(translations["input_not_valid"])
-         return [None]*4
-
-     if not os.path.exists(output):
-         gr_warning(translations["output_not_valid"])
-         return [None]*4
-
-     gr_info(translations["start"].format(start=translations["separator_music"]))
-
-     if config.debug_mode: subprocess.run([python, configs["separate_path"], "--input_path", input, "--output_path", output, "--format", format, "--shifts", str(shifts), "--segments_size", str(segments_size), "--overlap", str(overlap), "--mdx_hop_length", str(hop_length), "--mdx_batch_size", str(batch_size), "--clean_audio", str(clean_audio), "--clean_strength", str(clean_strength), "--kara_model", kara_model, "--backing", str(backing), "--mdx_denoise", str(denoise), "--reverb", str(reverb), "--backing_reverb", str(backing_reverb), "--model_name", separator_model, "--sample_rate", str(sample_rate)])
-     else:
-         from main.inference.separator_music import separate
-
-         separate(input, output, format, shifts, segments_size, overlap, hop_length, batch_size, clean_audio, clean_strength, separator_model, kara_model, backing, denoise, reverb, backing_reverb, sample_rate)
-
-     gr_info(translations["success"])
-
-     filename, _ = os.path.splitext(os.path.basename(input))
-     output = os.path.join(output, filename)
-
-     return [os.path.join(output, f"Original_Vocals_No_Reverb.{format}") if reverb else os.path.join(output, f"Original_Vocals.{format}"), os.path.join(output, f"Instruments.{format}"), (os.path.join(output, f"Main_Vocals_No_Reverb.{format}") if reverb else os.path.join(output, f"Main_Vocals.{format}") if backing else None), (os.path.join(output, f"Backing_Vocals_No_Reverb.{format}") if backing_reverb else os.path.join(output, f"Backing_Vocals.{format}") if backing else None)] if os.path.isfile(input) else [None]*4
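The return expression above encodes the stem layout. A sketch of the paths it resolves for one hypothetical run (format `wav`, `reverb=True`, `backing=True`, `backing_reverb=False`; the directory names are illustrative):

```python
import os

output = os.path.join("audios", "song")  # <output dir>/<input-file stem>
print(os.path.join(output, "Original_Vocals_No_Reverb.wav"))  # reverb=True
print(os.path.join(output, "Instruments.wav"))
print(os.path.join(output, "Main_Vocals_No_Reverb.wav"))      # reverb=True wins over plain Main_Vocals
print(os.path.join(output, "Backing_Vocals.wav"))             # backing=True, backing_reverb=False
```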
main/app/core/training.py DELETED
@@ -1,219 +0,0 @@
- import os
- import sys
- import time
- import shutil
- import codecs
- import threading
- import subprocess
-
- sys.path.append(os.getcwd())
-
- from main.tools import huggingface
- from main.app.core.ui import gr_info, gr_warning
- from main.app.variables import python, translations, configs
-
- def if_done(done, p):
-     while 1:
-         if p.poll() is None: time.sleep(0.5)
-         else: break
-
-     done[0] = True
-
- def log_read(done, name):
-     log_file = os.path.join(configs["logs_path"], "app.log")
-
-     f = open(log_file, "w", encoding="utf-8")
-     f.close()
-
-     while 1:
-         with open(log_file, "r", encoding="utf-8") as f:
-             yield "".join(line for line in f.readlines() if "DEBUG" not in line and name in line and line.strip() != "")
-
-         time.sleep(1)
-         if done[0]: break
-
-     with open(log_file, "r", encoding="utf-8") as f:
-         log = "".join(line for line in f.readlines() if "DEBUG" not in line and line.strip() != "")
-
-     yield log
-
- def create_dataset(input_audio, output_dataset, clean_dataset, clean_strength, separator_reverb, kim_vocals_version, overlap, segments_size, denoise_mdx, skip, skip_start, skip_end, hop_length, batch_size, sample_rate):
-     version = 1 if kim_vocals_version == "Version-1" else 2
-     gr_info(translations["start"].format(start=translations["create"]))
-
-     p = subprocess.Popen(f'{python} {configs["create_dataset_path"]} --input_audio "{input_audio}" --output_dataset "{output_dataset}" --clean_dataset {clean_dataset} --clean_strength {clean_strength} --separator_reverb {separator_reverb} --kim_vocal_version {version} --overlap {overlap} --segments_size {segments_size} --mdx_hop_length {hop_length} --mdx_batch_size {batch_size} --denoise_mdx {denoise_mdx} --skip {skip} --skip_start_audios "{skip_start}" --skip_end_audios "{skip_end}" --sample_rate {sample_rate}', shell=True)
-     done = [False]
-
-     threading.Thread(target=if_done, args=(done, p)).start()
-
-     for log in log_read(done, "create_dataset"):
-         yield log
-
- def preprocess(model_name, sample_rate, cpu_core, cut_preprocess, process_effects, dataset, clean_dataset, clean_strength):
-     sr = int(float(sample_rate.rstrip("k")) * 1000)
-
-     if not model_name: return gr_warning(translations["provide_name"])
-     if not os.path.exists(dataset) or not any(f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f))): return gr_warning(translations["not_found_data"])
-
-     model_dir = os.path.join(configs["logs_path"], model_name)
-     if os.path.exists(model_dir): shutil.rmtree(model_dir, ignore_errors=True)
-
-     p = subprocess.Popen(f'{python} {configs["preprocess_path"]} --model_name "{model_name}" --dataset_path "{dataset}" --sample_rate {sr} --cpu_cores {cpu_core} --cut_preprocess {cut_preprocess} --process_effects {process_effects} --clean_dataset {clean_dataset} --clean_strength {clean_strength}', shell=True)
-     done = [False]
-
-     threading.Thread(target=if_done, args=(done, p)).start()
-     os.makedirs(model_dir, exist_ok=True)
-
-     for log in log_read(done, "preprocess"):
-         yield log
-
- def extract(model_name, version, method, pitch_guidance, hop_length, cpu_cores, gpu, sample_rate, embedders, custom_embedders, onnx_f0_mode, embedders_mode, f0_autotune, f0_autotune_strength, hybrid_method, rms_extract):
-     f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
-     sr = int(float(sample_rate.rstrip("k")) * 1000)
-
-     if not model_name: return gr_warning(translations["provide_name"])
-     model_dir = os.path.join(configs["logs_path"], model_name)
-
-     try:
-         if not any(os.path.isfile(os.path.join(model_dir, "sliced_audios", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios"))) or not any(os.path.isfile(os.path.join(model_dir, "sliced_audios_16k", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios_16k"))): return gr_warning(translations["not_found_data_preprocess"])
-     except:
-         return gr_warning(translations["not_found_data_preprocess"])
-
-     p = subprocess.Popen(f'{python} {configs["extract_path"]} --model_name "{model_name}" --rvc_version {version} --f0_method {f0method} --pitch_guidance {pitch_guidance} --hop_length {hop_length} --cpu_cores {cpu_cores} --gpu {gpu} --sample_rate {sr} --embedder_model {embedder_model} --f0_onnx {onnx_f0_mode} --embedders_mode {embedders_mode} --f0_autotune {f0_autotune} --f0_autotune_strength {f0_autotune_strength} --rms_extract {rms_extract}', shell=True)
-     done = [False]
-
-     threading.Thread(target=if_done, args=(done, p)).start()
-     os.makedirs(model_dir, exist_ok=True)
-
-     for log in log_read(done, "extract"):
-         yield log
-
- def create_index(model_name, rvc_version, index_algorithm):
-     if not model_name: return gr_warning(translations["provide_name"])
-     model_dir = os.path.join(configs["logs_path"], model_name)
-
-     try:
-         if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"])
-     except:
-         return gr_warning(translations["not_found_data_extract"])
-
-     p = subprocess.Popen(f'{python} {configs["create_index_path"]} --model_name "{model_name}" --rvc_version {rvc_version} --index_algorithm {index_algorithm}', shell=True)
-     done = [False]
-
-     threading.Thread(target=if_done, args=(done, p)).start()
-     os.makedirs(model_dir, exist_ok=True)
-
-     for log in log_read(done, "create_index"):
-         yield log
-
- def training(model_name, rvc_version, save_every_epoch, save_only_latest, save_every_weights, total_epoch, sample_rate, batch_size, gpu, pitch_guidance, not_pretrain, custom_pretrained, pretrain_g, pretrain_d, detector, threshold, clean_up, cache, model_author, vocoder, checkpointing, deterministic, benchmark, optimizer, energy_use):
-     sr = int(float(sample_rate.rstrip("k")) * 1000)
-     if not model_name: return gr_warning(translations["provide_name"])
-
-     model_dir = os.path.join(configs["logs_path"], model_name)
-     if os.path.exists(os.path.join(model_dir, "train_pid.txt")): os.remove(os.path.join(model_dir, "train_pid.txt"))
-
-     try:
-         if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"])
-     except:
-         return gr_warning(translations["not_found_data_extract"])
-
-     if not not_pretrain:
-         if not custom_pretrained:
-             pretrain_dir = configs["pretrained_v2_path"] if rvc_version == 'v2' else configs["pretrained_v1_path"]
-             download_version = codecs.decode(f"uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_i{'2' if rvc_version == 'v2' else '1'}/", "rot13")
-
-             pretrained_selector = {
-                 True: {
-                     32000: ("f0G32k.pth", "f0D32k.pth"),
-                     40000: ("f0G40k.pth", "f0D40k.pth"),
-                     48000: ("f0G48k.pth", "f0D48k.pth")
-                 },
-                 False: {
-                     32000: ("G32k.pth", "D32k.pth"),
-                     40000: ("G40k.pth", "D40k.pth"),
-                     48000: ("G48k.pth", "D48k.pth")
-                 }
-             }
-
-             pg2, pd2 = "", ""
-             pg, pd = pretrained_selector[pitch_guidance][sr]
-
-             if energy_use: pg2, pd2 = pg2 + "ENERGY_", pd2 + "ENERGY_"
-             if vocoder != 'Default': pg2, pd2 = pg2 + vocoder + "_", pd2 + vocoder + "_"
-
-             pg2, pd2 = pg2 + pg, pd2 + pd
-             pretrained_G, pretrained_D = os.path.join(pretrain_dir, pg2), os.path.join(pretrain_dir, pd2)
-
-             try:
-                 if not os.path.exists(pretrained_G):
-                     gr_info(translations["download_pretrained"].format(dg="G", rvc_version=rvc_version))
-                     huggingface.HF_download_file(download_version + pg2, pretrained_G)
-
-                 if not os.path.exists(pretrained_D):
-                     gr_info(translations["download_pretrained"].format(dg="D", rvc_version=rvc_version))
-                     huggingface.HF_download_file(download_version + pd2, pretrained_D)
-             except:
-                 gr_warning(translations["not_use_pretrain_error_download"])
-                 pretrained_G = pretrained_D = None
-         else:
-             if not pretrain_g: return gr_warning(translations["provide_pretrained"].format(dg="G"))
-             if not pretrain_d: return gr_warning(translations["provide_pretrained"].format(dg="D"))
-
-             pg2, pd2 = pretrain_g, pretrain_d
-             pretrained_G, pretrained_D = (
-                 (os.path.join(configs["pretrained_custom_path"], pg2) if not os.path.exists(pg2) else pg2),
-                 (os.path.join(configs["pretrained_custom_path"], pd2) if not os.path.exists(pd2) else pd2)
-             )
-
-             if not os.path.exists(pretrained_G): return gr_warning(translations["not_found_pretrain"].format(dg="G"))
-             if not os.path.exists(pretrained_D): return gr_warning(translations["not_found_pretrain"].format(dg="D"))
-     else:
-         pretrained_G = pretrained_D = None
-         gr_warning(translations["not_use_pretrain"])
-
-     gr_info(translations["start"].format(start=translations["training"]))
-
-     p = subprocess.Popen(f'{python} {configs["train_path"]} --model_name "{model_name}" --rvc_version {rvc_version} --save_every_epoch {save_every_epoch} --save_only_latest {save_only_latest} --save_every_weights {save_every_weights} --total_epoch {total_epoch} --sample_rate {sr} --batch_size {batch_size} --gpu {gpu} --pitch_guidance {pitch_guidance} --overtraining_detector {detector} --overtraining_threshold {threshold} --cleanup {clean_up} --cache_data_in_gpu {cache} --g_pretrained_path "{pretrained_G}" --d_pretrained_path "{pretrained_D}" --model_author "{model_author}" --vocoder "{vocoder}" --checkpointing {checkpointing} --deterministic {deterministic} --benchmark {benchmark} --optimizer {optimizer} --energy_use {energy_use}', shell=True)
-     done = [False]
-
-     with open(os.path.join(model_dir, "train_pid.txt"), "w") as pid_file:
-         pid_file.write(str(p.pid))
-
-     threading.Thread(target=if_done, args=(done, p)).start()
-
-     for log in log_read(done, "train"):
-         lines = log.splitlines()
-         if len(lines) > 100: log = "\n".join(lines[-100:])
-         yield log
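A worked example of the pretrained-filename assembly above, for `sample_rate="48k"`, `pitch_guidance=True`, `energy_use=True`, and a hypothetical non-default vocoder name (the real set of vocoder values comes from the UI, not from this sketch):

```python
pg, pd = "f0G48k.pth", "f0D48k.pth"    # pretrained_selector[True][48000]
vocoder = "RefineGAN"                   # hypothetical non-default value
pg2 = "ENERGY_" + vocoder + "_" + pg    # -> ENERGY_RefineGAN_f0G48k.pth
pd2 = "ENERGY_" + vocoder + "_" + pd    # -> ENERGY_RefineGAN_f0D48k.pth
print(pg2, pd2)
```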
main/app/core/tts.py DELETED
@@ -1,99 +0,0 @@
- import os
- import sys
- import pysrt
- import codecs
- import librosa
- import asyncio
- import requests
- import tempfile
-
- import numpy as np
- import soundfile as sf
-
- from edge_tts import Communicate
-
- sys.path.append(os.getcwd())
-
- from main.app.variables import translations
- from main.app.core.ui import gr_info, gr_warning, gr_error
-
- def synthesize_tts(prompt, voice, speed, output, pitch, google):
-     if not google: asyncio.run(Communicate(text=prompt, voice=voice, rate=f"+{speed}%" if speed >= 0 else f"{speed}%", pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz").save(output))
-     else:
-         response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"), params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"}, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"})
-
-         if response.status_code == 200:
-             with open(output, "wb") as f:
-                 f.write(response.content)
-
-             if pitch != 0 or speed != 0:
-                 y, sr = librosa.load(output, sr=None)
-
-                 if pitch != 0: y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
-                 if speed != 0: y = librosa.effects.time_stretch(y, rate=speed)
-
-                 sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''))
-         else: gr_error(f"{response.status_code}, {response.text}")
-
- def time_stretch(y, sr, target_duration):
-     rate = (len(y) / sr) / target_duration
-     if rate != 1.0: y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate)
-
-     n_target = int(round(target_duration * sr))
-     return np.pad(y, (0, n_target - len(y))) if len(y) < n_target else y[:n_target]
-
- def pysrttime_to_seconds(t):
-     return (t.hours * 60 + t.minutes) * 60 + t.seconds + t.milliseconds / 1000
-
- def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False):
-     subs = pysrt.open(srt_file)
-     if not subs: raise ValueError(translations["srt"])
-
-     final_audio = np.zeros(int(round(pysrttime_to_seconds(subs[-1].end) * sr)), dtype=np.float32)
-
-     with tempfile.TemporaryDirectory() as tempdir:
-         for idx, seg in enumerate(subs):
-             wav_path = os.path.join(tempdir, f"seg_{idx}.wav")
-             synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, wav_path, rate, google)
-
-             audio, file_sr = sf.read(wav_path, dtype=np.float32)
-             if file_sr != sr: audio = np.interp(np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)), np.arange(len(audio)), audio)
-             adjusted = time_stretch(audio, sr, pysrttime_to_seconds(seg.duration))
-
-             start_sample = int(round(pysrttime_to_seconds(seg.start) * sr))
-             end_sample = start_sample + adjusted.shape[0]
-
-             if end_sample > final_audio.shape[0]:
-                 adjusted = adjusted[: final_audio.shape[0] - start_sample]
-                 end_sample = final_audio.shape[0]
-
-             final_audio[start_sample:end_sample] += adjusted
-
-     sf.write(out_file, final_audio, sr)
-
- def TTS(prompt, voice, speed, output, pitch, google, srt_input):
-     if not srt_input: srt_input = ""
-
-     if not prompt and not srt_input.endswith(".srt"):
-         gr_warning(translations["enter_the_text"])
-         return None
-
-     if not voice:
-         gr_warning(translations["choose_voice"])
-         return None
-
-     if not output:
-         gr_warning(translations["output_not_valid"])
-         return None
-
-     if os.path.isdir(output): output = os.path.join(output, "tts.wav")
-     gr_info(translations["convert"].format(name=translations["text"]))
-
-     output_dir = os.path.dirname(output) or output
-     if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
-
-     if srt_input.endswith(".srt"): srt_tts(srt_input, output, voice, 0, 24000, google)
-     else: synthesize_tts(prompt, voice, speed, output, pitch, google)
-
-     gr_info(translations["success"])
-     return output
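To make the arithmetic in `time_stretch` concrete, a small self-contained check on a synthetic buffer (no audio I/O, numbers only):

```python
import numpy as np

sr, target_duration = 24000, 1.5
y = np.zeros(2 * sr, dtype=np.float32)        # a 2.0 s segment
rate = (len(y) / sr) / target_duration        # 2.0 / 1.5 -> play ~1.33x faster
n_target = int(round(target_duration * sr))   # 36000 samples fill the subtitle slot
print(round(rate, 3), n_target)
```

After stretching, the pad-or-truncate step guarantees the segment is exactly `n_target` samples, so it drops cleanly into `final_audio` at the subtitle's start offset.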
main/app/core/ui.py DELETED
@@ -1,179 +0,0 @@
- import os
- import sys
- import json
- import torch
- import shutil
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.library import opencl
- from main.app.variables import config, configs, configs_json, logger, translations, edgetts, google_tts_voice, method_f0, method_f0_full
-
- def gr_info(message):
-     gr.Info(message, duration=2)
-     logger.info(message)
-
- def gr_warning(message):
-     gr.Warning(message, duration=2)
-     logger.warning(message)
-
- def gr_error(message):
-     gr.Error(message=message, duration=6)
-     logger.error(message)
-
- def get_gpu_info():
-     ngpu = torch.cuda.device_count()
-     gpu_infos = [f"{i}: {torch.cuda.get_device_name(i)} ({int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)} GB)" for i in range(ngpu) if torch.cuda.is_available() or ngpu != 0]
-
-     if len(gpu_infos) == 0:
-         ngpu = opencl.device_count()
-         gpu_infos = [f"{i}: {opencl.device_name(i)}" for i in range(ngpu) if opencl.is_available() or ngpu != 0]
-
-     return "\n".join(gpu_infos) if len(gpu_infos) > 0 else translations["no_support_gpu"]
-
- def gpu_number_str():
-     ngpu = torch.cuda.device_count()
-     if ngpu == 0: ngpu = opencl.device_count()
-
-     return str("-".join(map(str, range(ngpu))) if torch.cuda.is_available() or opencl.is_available() else "-")
-
- def change_f0_choices():
-     f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["f0_path"]) for f in files if f.endswith(".txt")])
-     return {"value": f0_file[0] if len(f0_file) >= 1 else "", "choices": f0_file, "__type__": "update"}
-
- def change_audios_choices(input_audio):
-     audios = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["audios_path"]) for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")])
-     return {"value": input_audio if input_audio != "" else (audios[0] if len(audios) >= 1 else ""), "choices": audios, "__type__": "update"}
-
- def change_models_choices():
-     model, index = sorted(list(model for model in os.listdir(configs["weights_path"]) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_"))), sorted([os.path.join(root, name) for root, _, files in os.walk(configs["logs_path"], topdown=False) for name in files if name.endswith(".index") and "trained" not in name])
-     return [{"value": model[0] if len(model) >= 1 else "", "choices": model, "__type__": "update"}, {"value": index[0] if len(index) >= 1 else "", "choices": index, "__type__": "update"}]
-
- def change_pretrained_choices():
-     pretrainD = sorted([model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "D" in model])
-     pretrainG = sorted([model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "G" in model])
-
-     return [{"choices": pretrainD, "value": pretrainD[0] if len(pretrainD) >= 1 else "", "__type__": "update"}, {"choices": pretrainG, "value": pretrainG[0] if len(pretrainG) >= 1 else "", "__type__": "update"}]
-
- def change_choices_del():
-     return [{"choices": sorted(list(model for model in os.listdir(configs["weights_path"]) if model.endswith(".pth") and not model.startswith("G_") and not model.startswith("D_"))), "__type__": "update"}, {"choices": sorted([os.path.join(configs["logs_path"], f) for f in os.listdir(configs["logs_path"]) if "mute" not in f and os.path.isdir(os.path.join(configs["logs_path"], f))]), "__type__": "update"}]
-
- def change_preset_choices():
-     return {"value": "", "choices": sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".conversion.json"))), "__type__": "update"}
-
- def change_effect_preset_choices():
-     return {"value": "", "choices": sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".effect.json"))), "__type__": "update"}
-
- def change_tts_voice_choices(google):
-     return {"choices": google_tts_voice if google else edgetts, "value": google_tts_voice[0] if google else edgetts[0], "__type__": "update"}
-
- def change_backing_choices(backing, merge):
-     if backing or merge: return {"value": False, "interactive": False, "__type__": "update"}
-     elif not backing or not merge: return {"interactive": True, "__type__": "update"}
-     else: gr_warning(translations["option_not_valid"])
-
- def change_download_choices(select):
-     selects = [False]*10
-
-     if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True
-     elif select == translations["download_from_csv"]: selects[3] = selects[4] = True
-     elif select == translations["search_models"]: selects[5] = selects[6] = True
-     elif select == translations["upload"]: selects[9] = True
-     else: gr_warning(translations["option_not_valid"])
-
-     return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))]
-
- def change_download_pretrained_choices(select):
-     selects = [False]*8
-
-     if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True
-     elif select == translations["list_model"]: selects[3] = selects[4] = selects[5] = True
-     elif select == translations["upload"]: selects[6] = selects[7] = True
-     else: gr_warning(translations["option_not_valid"])
-
-     return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))]
-
- def get_index(model):
-     model = os.path.basename(model).split("_")[0]
-     return {"value": next((f for f in [os.path.join(root, name) for root, _, files in os.walk(configs["logs_path"], topdown=False) for name in files if name.endswith(".index") and "trained" not in name] if model.split(".")[0] in f), ""), "__type__": "update"} if model else None
-
- def index_strength_show(index):
-     return {"visible": index != "" and os.path.exists(index), "value": 0.5, "__type__": "update"}
-
- def hoplength_show(method, hybrid_method=None):
-     visible = False
-
-     for m in ["mangio-crepe", "fcpe", "yin", "piptrack", "fcn"]:
-         if m in method or (hybrid_method and m in hybrid_method):  # guard: hybrid_method defaults to None, and `m in None` raises TypeError
-             visible = True
-             break
-
-     return {"visible": visible, "__type__": "update"}
-
- def visible(value):
-     return {"visible": value, "__type__": "update"}
-
- def valueFalse_interactive(value):
-     return {"value": False, "interactive": value, "__type__": "update"}
-
- def valueEmpty_visible1(value):
-     return {"value": "", "visible": value, "__type__": "update"}
-
- def pitch_guidance_lock(vocoders):
-     return {"value": True, "interactive": vocoders == "Default", "__type__": "update"}
-
- def vocoders_lock(pitch, vocoders):
-     return {"value": vocoders if pitch else "Default", "interactive": pitch, "__type__": "update"}
-
- def unlock_f0(value):
-     return {"choices": method_f0_full if value else method_f0, "value": "rmvpe", "__type__": "update"}
-
- def unlock_vocoder(value, vocoder):
-     return {"value": vocoder if value == "v2" else "Default", "interactive": value == "v2", "__type__": "update"}
-
- def unlock_ver(value, vocoder):
-     return {"value": "v2" if vocoder == "Default" else value, "interactive": vocoder == "Default", "__type__": "update"}
-
- def visible_embedders(value):
-     return {"visible": value != "spin", "__type__": "update"}
-
- def change_fp(fp):
-     fp16 = fp == "fp16"
-
-     if fp16 and config.device in ["cpu", "mps", "ocl:0"]:
-         gr_warning(translations["fp16_not_support"])
-         return "fp32"
-     else:
-         gr_info(translations["start_update_precision"])
-
-         configs = json.load(open(configs_json, "r"))
-         configs["fp16"] = config.is_half = fp16
-
-         with open(configs_json, "w") as f:
-             json.dump(configs, f, indent=4)
-
-         gr_info(translations["success"])
-         return "fp16" if fp16 else "fp32"
-
- def process_output(file_path):
-     if config.configs.get("delete_exists_file", True):
-         if os.path.exists(file_path): os.remove(file_path)
-         return file_path
-     else:
-         if not os.path.exists(file_path): return file_path
-         file = os.path.splitext(os.path.basename(file_path))
-
-         index = 1
-         while 1:
-             file_path = os.path.join(os.path.dirname(file_path), f"{file[0]}_{index}{file[1]}")
-             if not os.path.exists(file_path): return file_path
-             index += 1
-
- def shutil_move(input_path, output_path):
-     output_path = os.path.join(output_path, os.path.basename(input_path)) if os.path.isdir(output_path) else output_path
-
-     return shutil.move(input_path, process_output(output_path)) if os.path.exists(output_path) else shutil.move(input_path, output_path)
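When `delete_exists_file` is off, `process_output` probes `stem_1`, `stem_2`, ... until a free name appears. A standalone sketch of that loop, with a hypothetical helper name:

```python
import os

def next_free(path):
    # Same probing as process_output() above, minus the config switch.
    if not os.path.exists(path): return path
    stem, ext = os.path.splitext(os.path.basename(path))
    index = 1
    while True:
        candidate = os.path.join(os.path.dirname(path), f"{stem}_{index}{ext}")
        if not os.path.exists(candidate): return candidate
        index += 1

print(next_free(os.path.join("audios", "output.wav")))  # e.g. audios/output_1.wav if output.wav exists
```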
main/app/core/utils.py DELETED
@@ -1,97 +0,0 @@
- import os
- import sys
- import json
- import codecs
- import requests
- import platform
- import datetime
-
- sys.path.append(os.getcwd())
-
- from main.app.core.ui import gr_info, gr_warning, gr_error
- from main.app.variables import logger, translations, configs
-
- def stop_pid(pid_file, model_name=None, train=False):
-     try:
-         pid_file_path = os.path.join("assets", f"{pid_file}.txt") if model_name is None else os.path.join(configs["logs_path"], model_name, f"{pid_file}.txt")
-
-         if not os.path.exists(pid_file_path): return gr_warning(translations["not_found_pid"])
-         else:
-             with open(pid_file_path, "r") as f:
-                 pids = [int(pid) for pid in f.readlines()]
-
-             for pid in pids:
-                 os.kill(pid, 9)
-
-             if os.path.exists(pid_file_path): os.remove(pid_file_path)
-
-         config_path = os.path.join(configs["logs_path"], model_name, "config.json") if model_name else None  # guard: joining with model_name=None raised TypeError and silently skipped the rest
-
-         if train and config_path and os.path.exists(config_path):
-             with open(config_path, "r") as f:
-                 pid_data = json.load(f)
-                 pids = pid_data.get("process_pids", [])
-
-             with open(config_path, "w") as f:
-                 pid_data.pop("process_pids", None)
-                 json.dump(pid_data, f, indent=4)
-
-             for pid in pids:
-                 os.kill(pid, 9)
-
-         gr_info(translations["end_pid"])
-     except:
-         pass
-
- def report_bug(error_info, provide):
-     report_path = os.path.join(configs["logs_path"], "report_bugs.log")
-     if os.path.exists(report_path): os.remove(report_path)
-
-     report_url = codecs.decode(requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/jroubbx.gkg", "rot13")).text, "rot13")
-     if not error_info: error_info = "Không Có"  # Vietnamese: "None"
-
-     gr_info(translations["thank"])
-
-     if provide:
-         try:
-             for log in [os.path.join(root, name) for root, _, files in os.walk(os.path.join(configs["logs_path"]), topdown=False) for name in files if name.endswith(".log")]:
-                 with open(log, "r", encoding="utf-8") as r:
-                     with open(report_path, "a", encoding="utf-8") as w:
-                         w.write(str(r.read()))
-                         w.write("\n")
-         except Exception as e:
-             gr_error(translations["error_read_log"])
-             logger.debug(e)
-
-         try:
-             with open(report_path, "r", encoding="utf-8") as f:
-                 content = f.read()
-
-             # the Discord embed labels are Vietnamese ("Báo Cáo Lỗi" = "Bug Report"); the four fields count DEBUG/INFO/WARNING/ERROR lines in the collected logs
-             requests.post(report_url, json={"embeds": [{"title": "Báo Cáo Lỗi", "description": f"Mô tả lỗi: {error_info}", "color": 15158332, "author": {"name": "Vietnamese_RVC", "icon_url": codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/vpb.cat", "rot13"), "url": codecs.decode("uggcf://tvguho.pbz/CunzUhlauNau16/Ivrganzrfr-EIP/gerr/znva","rot13")}, "thumbnail": {"url": codecs.decode("uggcf://p.grabe.pbz/7dADJbv-36fNNNNq/grabe.tvs", "rot13")}, "fields": [{"name": "Số Lượng Gỡ Lỗi", "value": content.count("DEBUG")}, {"name": "Số Lượng Thông Tin", "value": content.count("INFO")}, {"name": "Số Lượng Cảnh Báo", "value": content.count("WARNING")}, {"name": "Số Lượng Lỗi", "value": content.count("ERROR")}], "footer": {"text": f"Tên Máy: {platform.uname().node} - Hệ Điều Hành: {platform.system()}-{platform.version()}\nThời Gian Báo Cáo Lỗi: {datetime.datetime.now()}."}}]})
-
-             with open(report_path, "rb") as f:
-                 requests.post(report_url, files={"file": f})
-         except Exception as e:
-             gr_error(translations["error_send"])
-         finally:
-             if os.path.exists(report_path): os.remove(report_path)
-     else: requests.post(report_url, json={"embeds": [{"title": "Báo Cáo Lỗi", "description": error_info}]})
-
- def google_translate(text, source='auto', target='vi'):
-     if text == "": return gr_warning(translations["prompt_warning"])
-
-     try:
-         import textwrap
-
-         def translate_chunk(chunk):
-             response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyrncvf.pbz/genafyngr_n/fvatyr", "rot13"), params={'client': 'gtx', 'sl': source, 'tl': target, 'dt': 't', 'q': chunk})
-             return ''.join([i[0] for i in response.json()[0]]) if response.status_code == 200 else chunk
-
-         translated_text = ''
-         for chunk in textwrap.wrap(text, 5000, break_long_words=False, break_on_hyphens=False):
-             translated_text += translate_chunk(chunk)
-
-         return translated_text
-     except:
-         return text
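`google_translate` stays under the endpoint's query-length limit by chunking with `textwrap`; a quick illustration of the chunking alone:

```python
import textwrap

text = "lorem ipsum " * 1000  # roughly 12000 characters
chunks = textwrap.wrap(text, 5000, break_long_words=False, break_on_hyphens=False)
print(len(chunks), max(len(c) for c in chunks))  # a few chunks, each at most 5000 chars
```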
main/app/parser.py DELETED
@@ -1,319 +0,0 @@
1
- import os
2
- import sys
3
-
4
- sys.path.append(os.getcwd())
5
-
6
- try:
7
- argv = sys.argv[1]
8
- except IndexError:
9
- argv = None
10
-
11
- argv_is_allows = ["--audio_effects", "--convert", "--create_dataset", "--create_index", "--extract", "--preprocess", "--separator_music", "--train", "--help_audio_effects", "--help_convert", "--help_create_dataset", "--help_create_index", "--help_extract", "--help_preprocess", "--help_separator_music", "--help_train", "--help"]
12
-
13
- if argv not in argv_is_allows:
14
- print("Cú pháp không hợp lệ! Sử dụng --help để biết thêm")
15
- quit()
16
-
17
- if argv_is_allows[0] in argv: from main.inference.audio_effects import main
18
- elif argv_is_allows[1] in argv: from main.inference.conversion.convert import main
19
- elif argv_is_allows[2] in argv: from main.inference.create_dataset import main
20
- elif argv_is_allows[3] in argv: from main.inference.create_index import main
21
- elif argv_is_allows[4] in argv: from main.inference.extracting.extract import main
22
- elif argv_is_allows[5] in argv: from main.inference.preprocess.preprocess import main
23
- elif argv_is_allows[6] in argv: from main.inference.separator_music import main
24
- elif argv_is_allows[7] in argv: from main.inference.training.train import main
25
- elif argv_is_allows[8] in argv:
26
- print("""Các tham số của `--audio_effects`:
27
- 1. Đường dẫn tệp:
28
- - `--input_path` (bắt buộc): Đường dẫn đến tệp âm thanh đầu vào.
29
- - `--output_path` (mặc định: `./audios/apply_effects.wav`): Đường dẫn lưu tệp đầu ra.
30
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`, ...).
31
-
32
- 2. Lấy mẫu lại:
33
- - `--resample` (mặc định: `False`): Có lấy mẫu lại hay không.
34
- - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (Hz).
35
-
36
- 3. Hiệu ứng chorus:
37
- - `--chorus`: Bật/tắt chorus.
38
- - `--chorus_depth`, `--chorus_rate`, `--chorus_mix`, `--chorus_delay`, `--chorus_feedback`: Các thông số điều chỉnh chorus.
39
-
40
- 4. Hiệu ứng distortion:
41
- - `--distortion`: Bật/tắt distortion.
42
- - `--drive_db`: Mức độ méo âm thanh.
43
-
44
- 5. Hiệu ứng reverb:
45
- - `--reverb`: Bật/tắt hồi âm.
46
- - `--reverb_room_size`, `--reverb_damping`, `--reverb_wet_level`, `--reverb_dry_level`, `--reverb_width`, `--reverb_freeze_mode`: Điều chỉnh hồi âm.
47
-
48
- 6. Hiệu ứng pitch shift:
49
- - `--pitchshift`: Bật/tắt thay đổi cao độ.
50
- - `--pitch_shift`: Giá trị dịch cao độ.
51
-
52
- 7. Hiệu ứng delay:
53
- - `--delay`: Bật/tắt delay.
54
- - `--delay_seconds`, `--delay_feedback`, `--delay_mix`: Điều chỉnh thời gian trễ, phản hồi và hòa trộn.
55
-
56
- 8. Compressor:
57
- - `--compressor`: Bật/tắt compressor.
58
- - `--compressor_threshold`, `--compressor_ratio`, `--compressor_attack_ms`, `--compressor_release_ms`: Các thông số nén.
59
-
60
- 9. Limiter:
61
- - `--limiter`: Bật/tắt giới hạn mức âm thanh.
62
- - `--limiter_threshold`, `--limiter_release`: Ngưỡng giới hạn và thời gian nhả.
63
-
64
- 10. Gain (Khuếch đại):
65
- - `--gain`: Bật/tắt gain.
66
- - `--gain_db`: Mức gain (dB).
67
-
68
- 11. Bitcrush:
69
- - `--bitcrush`: Bật/tắt hiệu ứng giảm độ phân giải.
70
- - `--bitcrush_bit_depth`: Số bit của bitcrush.
71
-
72
- 12. Clipping:
73
- - `--clipping`: Bật/tắt cắt âm thanh.
74
- - `--clipping_threshold`: Ngưỡng clipping.
75
-
76
- 13. Phaser:
77
- - `--phaser`: Bật/tắt hiệu ứng phaser.
78
- - `--phaser_rate_hz`, `--phaser_depth`, `--phaser_centre_frequency_hz`, `--phaser_feedback`, `--phaser_mix`: Điều chỉnh hiệu ứng phaser.
79
-
80
- 14. Boost bass & treble:
81
- - `--treble_bass_boost`: Bật/tắt tăng cường âm bass và treble.
82
- - `--bass_boost_db`, `--bass_boost_frequency`, `--treble_boost_db`, `--treble_boost_frequency`: Các thông số tăng bass và treble.
83
-
84
- 15. Fade in & fade out:
85
- - `--fade_in_out`: Bật/tắt hiệu ứng fade.
86
- - `--fade_in_duration`, `--fade_out_duration`: Thời gian fade vào/ra.
87
-
88
- 16. Kết hợp âm thanh:
89
- - `--audio_combination`: Bật/tắt ghép nhiều tệp âm thanh.
90
- - `--audio_combination_input`: Đường dẫn tệp âm thanh bổ sung.
91
- - `--main_volume`: Âm lượng của âm thanh chính.
92
- - `--combination_volume`:: Âm lượng của âm thanh cần kết hợp.
93
- """)
94
- quit()
95
- elif argv_is_allows[9] in argv:
96
- print("""Các tham số của --convert:
97
- 1. Cấu hình xử lý giọng nói:
98
- - `--pitch` (mặc định: `0`): Điều chỉnh cao độ.
99
- - `--filter_radius` (mặc định: `3`): Độ mượt của đường F0.
100
- - `--index_rate` (mặc định: `0.5`): Tỷ lệ sử dụng chỉ mục giọng nói.
101
- - `--rms_mix_rate` (mặc định: `1`): Hệ số điều chỉnh biên độ âm lượng.
102
- - `--protect` (mặc định: `0.33`): Bảo vệ phụ âm.
103
-
104
- 2. Cấu hình mẫu (frame hop):
105
- - `--hop_length` (mặc định: `64`): Bước nhảy khi xử lý âm thanh.
106
-
107
- 3. Cấu hình F0:
108
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
109
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
110
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
111
- - `--f0_file` (mặc định: ``): Đường dẫn tệp F0 có sẵn.
112
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
113
- - `--proposal_pitch` (mặc định: `False`): Đề xuất cao độ thay vì điều chỉnh thủ công.
114
- - `--proposal_pitch_threshold` (mặc định: `255.0`): Tần số ước tính cao độ.
115
-
116
- 4. Mô hình nhúng:
117
- - `--embedder_model` (mặc định: `contentvec_base`): Mô hình nhúng sử dụng.
118
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`).
119
-
120
- 5. Đường dẫn tệp:
121
- - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
122
- - `--output_path` (mặc định: `./audios/output.wav`): Đường dẫn lưu tệp đầu ra.
123
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp.
124
- - `--pth_path` (bắt buộc): Đường dẫn đến tệp mô hình `.pth`.
125
- - `--index_path` (mặc định: `None`): Đường dẫn tệp chỉ mục (nếu có).
126
-
127
- 6. Làm sạch âm thanh:
128
- - `--clean_audio` (mặc định: `False`): Có áp dụng làm sạch âm thanh không.
129
- - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch.
130
-
131
- 7. Resampling & chia nhỏ âm thanh:
132
- - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (0 nghĩa là giữ nguyên).
133
- - `--split_audio` (mặc định: `False`): Có chia nhỏ audio trước khi xử lý không.
134
-
135
- 8. Kiểm tra & tối ưu hóa:
136
- - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM.
137
-
138
- 9. Dịch formant:
139
- - `--formant_shifting` (mặc định: `False`): Có bật hiệu ứng dịch formant không.
140
- - `--formant_qfrency` (mặc định: `0.8`): Hệ số dịch formant theo tần số.
141
- - `--formant_timbre` (mặc định: `0.8`): Hệ số thay đổi màu sắc giọng.
142
- """)
143
- quit()
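
A sketch of the corresponding conversion command, using only flags documented above; the file and model names are placeholders, and the entry point is assumed from this file's `__main__` guard:

    import subprocess
    import sys

    subprocess.run([
        sys.executable, "main/app/parser.py", "--convert",
        "--input_path", "audios/input.wav",               # required, placeholder
        "--pth_path", "assets/weights/Model.pth",         # required, placeholder
        "--index_path", "assets/logs/Model/model.index",  # optional, placeholder
        "--f0_method", "rmvpe",
        "--pitch", "0",
        "--export_format", "wav",
    ])
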
144
- elif argv_is_allows[10] in argv:
145
- print("""Các tham số của --create_dataset:
146
- 1. Đường dẫn & cấu hình dataset:
147
- - `--input_audio` (bắt buộc): Đường dẫn liên kết đến âm thanh (Liên kết Youtube, có thể dùng dấu `,` để dùng nhiều liên kết).
148
- - `--output_dataset` (mặc định: `./dataset`): Thư mục xuất dữ liệu đầu ra.
149
- - `--sample_rate` (mặc định: `44100`): Tần số lấy mẫu cho âm thanh.
150
-
151
- 2. Làm sạch dữ liệu:
152
- - `--clean_dataset` (mặc định: `False`): Có áp dụng làm sạch dữ liệu hay không.
153
- - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch dữ liệu.
154
-
155
- 3. Tách giọng & hiệu ứng:
156
- - `--separator_reverb` (mặc định: `False`): Có tách vang giọng không.
157
- - `--kim_vocal_version` (mặc định: `2`): Phiên bản mô hình Kim Vocal để tách (`1`, `2`).
158
-
159
- 4. Cấu hình phân đoạn âm thanh:
160
- - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn khi tách.
161
- - `--segments_size` (mặc định: `256`): Kích thước của từng phân đoạn.
162
-
163
- 5. Cấu hình MDX (Music Demixing):
164
- - `--mdx_hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lý.
165
- - `--mdx_batch_size` (mặc định: `1`): Kích thước batch khi xử lý MDX.
166
- - `--denoise_mdx` (mặc định: `False`): Có áp dụng khử nhiễu khi tách bằng MDX không.
167
-
168
- 6. Bỏ qua phần âm thanh:
169
- - `--skip` (mặc định: `False`): Có bỏ qua một số giây ở đầu/cuối âm thanh hay không.
170
- - `--skip_start_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở đầu audio.
171
- - `--skip_end_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở cuối audio.
172
- """)
173
- quit()
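
A hypothetical dataset-creation call; the YouTube links are placeholders, comma-separated as the `--input_audio` description specifies, and the entry point is assumed:

    import subprocess
    import sys

    links = "https://www.youtube.com/watch?v=xxxx,https://www.youtube.com/watch?v=yyyy"  # placeholders

    subprocess.run([
        sys.executable, "main/app/parser.py", "--create_dataset",
        "--input_audio", links,
        "--output_dataset", "./dataset",
        "--sample_rate", "44100",
    ])
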
174
- elif argv_is_allows[11] in argv:
175
- print("""Các tham số của --create_index:
176
- 1. Thông tin mô hình:
177
- - `--model_name` (bắt buộc): Tên mô hình.
178
- - `--rvc_version` (mặc định: `v2`): Phiên bản (`v1`, `v2`).
179
- - `--index_algorithm` (mặc định: `Auto`): Thuật toán index sử dụng (`Auto`, `Faiss`, `KMeans`).
180
- """)
181
- quit()
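
A minimal index-creation sketch (the model name is a placeholder; entry point assumed as above):

    import subprocess
    import sys

    subprocess.run([
        sys.executable, "main/app/parser.py", "--create_index",
        "--model_name", "MyModel",  # placeholder
        "--rvc_version", "v2",
        "--index_algorithm", "Auto",
    ])
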
182
- elif argv_is_allows[12] in argv:
183
- print("""Các tham số của --extract:
184
- 1. Thông tin mô hình:
185
- - `--model_name` (bắt buộc): Tên mô hình.
186
- - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
187
-
188
- 2. Cấu hình F0:
189
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
190
- - `--pitch_guidance` (mặc định: `True`): Có sử dụng hướng dẫn cao độ hay không.
191
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
192
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
193
-
194
- 3. Cấu hình xử lý:
195
- - `--hop_length` (mặc định: `128`): Độ dài bước nhảy trong quá trình xử lý.
196
- - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng.
197
- - `--gpu` (mặc định: `-`): Chỉ định GPU sử dụng (ví dụ: `0` cho GPU đầu tiên, `-` để tắt GPU).
198
- - `--sample_rate` (bắt buộc): Tần số lấy mẫu của âm thanh đầu vào.
199
-
200
- 4. Cấu hình nhúng:
201
- - `--embedder_model` (mặc định: `contentvec_base`): Tên mô hình nhúng.
202
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
203
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`).
204
-
205
- 5. RMS:
206
- - `--rms_extract` (mặc định: `False`): Trích xuất thêm năng lượng RMS.
207
- """)
208
- quit()
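
A feature-extraction sketch with the defaults above (model name is a placeholder; `--sample_rate` is required and must match the preprocessed data):

    import subprocess
    import sys

    subprocess.run([
        sys.executable, "main/app/parser.py", "--extract",
        "--model_name", "MyModel",  # placeholder
        "--rvc_version", "v2",
        "--f0_method", "rmvpe",
        "--hop_length", "128",
        "--cpu_cores", "2",
        "--gpu", "0",
        "--sample_rate", "48000",   # required
    ])
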
209
- elif argv_is_allows[13] in argv:
210
- print("""Các tham số của --preprocess:
211
- 1. Thông tin mô hình:
212
- - `--model_name` (bắt buộc): Tên mô hình.
213
-
214
- 2. Cấu hình dữ liệu:
215
- - `--dataset_path` (mặc định: `./dataset`): Đường dẫn thư mục chứa tệp dữ liệu.
216
- - `--sample_rate` (bắt buộc): Tần số lấy mẫu của dữ liệu âm thanh.
217
-
218
- 3. Cấu hình xử lý:
219
- - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng.
220
- - `--cut_preprocess` (mặc định: `True`): Có cắt tệp dữ liệu hay không.
221
- - `--process_effects` (mặc định: `False`): Có áp dụng tiền xử lý hay không.
222
- - `--clean_dataset` (mặc định: `False`): Có làm sạch tệp dữ liệu hay không.
223
- - `--clean_strength` (mặc định: `0.7`): Độ mạnh của quá trình làm sạch dữ liệu.
224
- """)
225
- quit()
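
A corresponding preprocessing sketch (placeholder model name; `--sample_rate` is required):

    import subprocess
    import sys

    subprocess.run([
        sys.executable, "main/app/parser.py", "--preprocess",
        "--model_name", "MyModel",  # placeholder
        "--dataset_path", "./dataset",
        "--sample_rate", "48000",   # required
        "--cpu_cores", "2",
    ])
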
226
- elif argv_is_allows[14] in argv:
227
- print("""Các tham số của --separator_music:
228
- 1. Đường dẫn dữ liệu:
229
- - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
230
- - `--output_path` (mặc định: `./audios`): Thư mục lưu tệp đầu ra.
231
- - `--format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`,...).
232
-
233
- 2. Cấu hình xử lý âm thanh:
234
- - `--shifts` (mặc định: `2`): Số lần dự đoán với dịch ngẫu nhiên, kết quả được lấy trung bình.
235
- - `--segments_size` (mặc định: `256`): Kích thước phân đoạn âm thanh.
236
- - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn.
237
- - `--mdx_hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lý.
238
- - `--mdx_batch_size` (mặc định: `1`): Kích thước lô.
239
-
240
- 3. Xử lý làm sạch:
241
- - `--clean_audio` (mặc định: `False`): Có làm sạch âm thanh hay không.
242
- - `--clean_strength` (mặc định: `0.7`): Độ mạnh của bộ lọc làm sạch.
243
-
244
- 4. Cấu hình mô hình:
245
- - `--model_name` (mặc định: `HT-Normal`): Mô hình tách nhạc (`Main_340`, `Main_390`, `Main_406`, `Main_427`, `Main_438`, `Inst_full_292`, `Inst_HQ_1`, `Inst_HQ_2`, `Inst_HQ_3`, `Inst_HQ_4`, `Inst_HQ_5`, `Kim_Vocal_1`, `Kim_Vocal_2`, `Kim_Inst`, `Inst_187_beta`, `Inst_82_beta`, `Inst_90_beta`, `Voc_FT`, `Crowd_HQ`, `Inst_1`, `Inst_2`, `Inst_3`, `MDXNET_1_9703`, `MDXNET_2_9682`, `MDXNET_3_9662`, `Inst_Main`, `MDXNET_Main`, `MDXNET_9482`, `HT-Normal`, `HT-Tuned`, `HD_MMI`, `HT_6S`).
246
- - `--kara_model` (mặc định: `Version-1`): Phiên bản mô hình tách bè (`Version-1`, `Version-2`).
247
-
248
- 5. Hiệu ứng và xử lý hậu kỳ:
249
- - `--backing` (mặc định: `False`): Có tách bè hay không.
250
- - `--mdx_denoise` (mặc định: `False`): Có sử dụng khử nhiễu MDX hay không.
251
- - `--reverb` (mặc định: `False`): Có tách vang hay không.
252
- - `--backing_reverb` (mặc định: `False`): Có tách vang cho giọng bè không.
253
-
254
- 6. Tần số lấy mẫu:
255
- - `--sample_rate` (mặc định: `44100`): Tần số lấy mẫu của âm thanh đầu ra.
256
- """)
257
- quit()
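
A separation sketch using the default HT-Normal model (the input path is a placeholder; entry point assumed as above):

    import subprocess
    import sys

    subprocess.run([
        sys.executable, "main/app/parser.py", "--separator_music",
        "--input_path", "audios/song.wav",  # placeholder
        "--output_path", "./audios",
        "--model_name", "HT-Normal",
        "--sample_rate", "44100",
    ])
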
258
- elif argv_is_allows[15] in argv:
259
- print("""Các tham số của --train:
260
- 1. Cấu hình mô hình:
261
- - `--model_name` (bắt buộc): Tên mô hình.
262
- - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
263
- - `--model_author` (tùy chọn): Tác giả của mô hình.
264
-
265
- 2. Cấu hình lưu:
266
- - `--save_every_epoch` (bắt buộc): Số kỷ nguyên giữa mỗi lần lưu.
267
- - `--save_only_latest` (mặc định: `True`): Chỉ lưu điểm kiểm tra (checkpoint) mới nhất.
268
- - `--save_every_weights` (mặc định: `True`): Lưu tất cả trọng số của mô hình.
269
-
270
- 3. Cấu hình huấn luyện:
271
- - `--total_epoch` (mặc định: `300`): Tổng số kỷ nguyên huấn luyện.
272
- - `--batch_size` (mặc định: `8`): Kích thước lô trong quá trình huấn luyện.
273
- - `--sample_rate` (bắt buộc): Tần số lấy mẫu của âm thanh.
274
-
275
- 4. Cấu hình thiết bị:
276
- - `--gpu` (mặc định: `0`): Chỉ định GPU để sử dụng (số hoặc `-` nếu không dùng GPU).
277
- - `--cache_data_in_gpu` (mặc định: `False`): Lưu dữ liệu vào GPU để tăng tốc.
278
-
279
- 5. Cấu hình huấn luyện nâng cao:
280
- - `--pitch_guidance` (mặc định: `True`): Sử dụng hướng dẫn cao độ.
281
- - `--g_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số G đã huấn luyện trước.
282
- - `--d_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số D đã huấn luyện trước.
283
- - `--vocoder` (mặc định: `Default`): Bộ mã hóa được sử dụng (`Default`, `MRF-HiFi-GAN`, `RefineGAN`).
284
- - `--energy_use` (mặc định: `False`): Sử dụng năng lượng RMS.
285
-
286
- 6. Phát hiện huấn luyện quá mức:
287
- - `--overtraining_detector` (mặc định: `False`): Bật/tắt chế độ phát hiện huấn luyện quá mức.
288
- - `--overtraining_threshold` (mặc định: `50`): Ngưỡng để xác định huấn luyện quá mức.
289
-
290
- 7. Xử lý dữ liệu:
291
- - `--cleanup` (mặc định: `False`): Dọn dẹp tệp huấn luyện cũ để tiến hành huấn luyện lại từ đầu.
292
-
293
- 8. Tối ưu:
294
- - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM.
295
- - `--deterministic` (mặc định: `False`): Khi bật sẽ sử dụng các thuật toán có tính xác định cao, đảm bảo rằng mỗi lần chạy cùng một dữ liệu đầu vào sẽ cho kết quả giống nhau.
296
- - `--benchmark` (mặc định: `False`): Khi bật sẽ thử nghiệm và chọn thuật toán tối ưu nhất cho phần cứng và kích thước cụ thể.
297
- - `--optimizer` (mặc định: `AdamW`): Trình tối ưu hóa được sử dụng (`AdamW`, `RAdam`).
298
- """)
299
- quit()
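
A training sketch with the required flags filled in (model name is a placeholder; note that the `__main__` block below switches multiprocessing to the spawn start method when `--train` is passed):

    import subprocess
    import sys

    subprocess.run([
        sys.executable, "main/app/parser.py", "--train",
        "--model_name", "MyModel",   # placeholder
        "--rvc_version", "v2",
        "--save_every_epoch", "50",  # required
        "--total_epoch", "300",
        "--batch_size", "8",
        "--sample_rate", "48000",    # required
        "--gpu", "0",
    ])
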
300
- elif argv_is_allows[16] in argv:
301
- print("""Sử dụng:
302
- 1. `--help_audio_effects`: Trợ giúp về phần thêm hiệu ứng âm thanh.
303
- 2. `--help_convert`: Trợ giúp về chuyển đổi âm thanh.
304
- 3. `--help_create_dataset`: Trợ giúp về tạo dữ liệu huấn luyện.
305
- 4. `--help_create_index`: Trợ giúp về tạo chỉ mục.
306
- 5. `--help_extract`: Trợ giúp về trích xuất dữ liệu huấn luyện.
307
- 6. `--help_preprocess`: Trợ giúp về xử lý trước dữ liệu.
308
- 7. `--help_separator_music`: Trợ giúp về tách nhạc.
309
- 8. `--help_train`: Trợ giúp về huấn luyện mô hình.
310
- """)
311
- quit()
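
Each `--help_*` flag prints its section and exits via `quit()`, so a sketch (entry point assumed as above) is simply:

    import subprocess
    import sys

    # Prints the conversion help shown above, then exits.
    subprocess.run([sys.executable, "main/app/parser.py", "--help_convert"])
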
312
-
313
- if __name__ == "__main__":
314
- import torch.multiprocessing as mp
315
-
316
- if "--train" in argv: mp.set_start_method("spawn")
317
- if "--preprocess" in argv or "--extract" in argv: mp.set_start_method("spawn", force=True)
318
-
319
- main()
 
main/app/run_tensorboard.py DELETED
@@ -1,33 +0,0 @@
1
- import os
2
- import sys
3
- import time
4
- import logging
5
- import webbrowser
6
-
7
- from tensorboard import program
8
-
9
- sys.path.append(os.getcwd())
10
-
11
- from main.configs.config import Config
12
-
13
- config = Config()
14
- translations = config.translations
15
-
16
- def launch_tensorboard():
17
- for l in ["root", "tensorboard"]:
18
- logging.getLogger(l).setLevel(logging.ERROR)
19
-
20
- tb = program.TensorBoard()
21
- tb.configure(argv=[None, "--logdir", config.configs["logs_path"], f"--port={config.configs['tensorboard_port']}"])
22
- url = tb.launch()
23
-
24
- print(f"{translations['tensorboard_url']}: {url}")
25
- if "--open" in sys.argv: webbrowser.open(url)
26
-
27
- return f"{translations['tensorboard_url']}: {url}"
28
-
29
- if __name__ == "__main__":
30
- launch_tensorboard()
31
-
32
- while 1:
33
- time.sleep(5)
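
Since this script never exits on its own (the sleep loop above keeps it alive), launching it in the background is the natural usage; `--open` additionally opens the URL in a browser:

    import subprocess
    import sys

    # Launch the TensorBoard watcher as a background process.
    subprocess.Popen([sys.executable, "main/app/run_tensorboard.py", "--open"])
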
 
main/app/tabs/downloads/downloads.py DELETED
@@ -1,119 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import translations, configs, models, model_options
9
- from main.app.core.downloads import download_model, search_models, download_pretrained_model
10
- from main.app.core.ui import change_download_choices, change_download_pretrained_choices, shutil_move
11
- from main.app.core.process import fetch_pretrained_data, save_drop_model, update_sample_rate_dropdown
12
-
13
- def download_tab():
14
- with gr.TabItem(translations["downloads"], visible=configs.get("downloads_tab", True)):
15
- gr.Markdown(translations["download_markdown"])
16
- with gr.Row():
17
- gr.Markdown(translations["download_markdown_2"])
18
- with gr.Row():
19
- with gr.Accordion(translations["model_download"], open=True):
20
- with gr.Row():
21
- downloadmodel = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["download_from_csv"], translations["search_models"], translations["upload"]], interactive=True, value=translations["download_url"])
22
- with gr.Row():
23
- gr.Markdown("___")
24
- with gr.Column():
25
- with gr.Row():
26
- url_input = gr.Textbox(label=translations["model_url"], value="", placeholder="https://...", scale=6)
27
- download_model_name = gr.Textbox(label=translations["modelname"], value="", placeholder=translations["modelname"], scale=2)
28
- url_download = gr.Button(value=translations["downloads"], scale=2)
29
- with gr.Column():
30
- model_browser = gr.Dropdown(choices=models.keys(), label=translations["model_warehouse"], scale=8, allow_custom_value=True, visible=False)
31
- download_from_browser = gr.Button(value=translations["get_model"], scale=2, variant="primary", visible=False)
32
- with gr.Column():
33
- search_name = gr.Textbox(label=translations["name_to_search"], placeholder=translations["modelname"], interactive=True, scale=8, visible=False)
34
- search = gr.Button(translations["search_2"], scale=2, visible=False)
35
- search_dropdown = gr.Dropdown(label=translations["select_download_model"], value="", choices=[], allow_custom_value=True, interactive=False, visible=False)
36
- download = gr.Button(translations["downloads"], variant="primary", visible=False)
37
- with gr.Column():
38
- model_upload = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx", ".index", ".zip"], visible=False)
39
- with gr.Row():
40
- with gr.Accordion(translations["download_pretrained_2"], open=False):
41
- with gr.Row():
42
- pretrain_download_choices = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["list_model"], translations["upload"]], value=translations["download_url"], interactive=True)
43
- with gr.Row():
44
- gr.Markdown("___")
45
- with gr.Column():
46
- with gr.Row():
47
- pretrainD = gr.Textbox(label=translations["pretrained_url"].format(dg="D"), value="", placeholder="https://...", interactive=True, scale=4)
48
- pretrainG = gr.Textbox(label=translations["pretrained_url"].format(dg="G"), value="", placeholder="https://...", interactive=True, scale=4)
49
- download_pretrain_button = gr.Button(translations["downloads"], scale=2)
50
- with gr.Column():
51
- with gr.Row():
52
- pretrain_choices = gr.Dropdown(label=translations["select_pretrain"], info=translations["select_pretrain_info"], choices=list(fetch_pretrained_data().keys()), value="Titan_Medium", allow_custom_value=True, interactive=True, scale=6, visible=False)
53
- sample_rate_pretrain = gr.Dropdown(label=translations["pretrain_sr"], info=translations["pretrain_sr"], choices=["48k", "40k", "32k"], value="48k", interactive=True, visible=False)
54
- download_pretrain_choices_button = gr.Button(translations["downloads"], scale=2, variant="primary", visible=False)
55
- with gr.Row():
56
- pretrain_upload_g = gr.File(label=translations["drop_pretrain"].format(dg="G"), file_types=[".pth"], visible=False)
57
- pretrain_upload_d = gr.File(label=translations["drop_pretrain"].format(dg="D"), file_types=[".pth"], visible=False)
58
- with gr.Row():
59
- url_download.click(
60
- fn=download_model,
61
- inputs=[
62
- url_input,
63
- download_model_name
64
- ],
65
- outputs=[url_input],
66
- api_name="download_model"
67
- )
68
- download_from_browser.click(
69
- fn=lambda model: download_model(models[model], model),
70
- inputs=[model_browser],
71
- outputs=[model_browser],
72
- api_name="download_browser"
73
- )
74
- with gr.Row():
75
- downloadmodel.change(fn=change_download_choices, inputs=[downloadmodel], outputs=[url_input, download_model_name, url_download, model_browser, download_from_browser, search_name, search, search_dropdown, download, model_upload])
76
- search.click(fn=search_models, inputs=[search_name], outputs=[search_dropdown, download])
77
- model_upload.upload(fn=save_drop_model, inputs=[model_upload], outputs=[model_upload])
78
- download.click(
79
- fn=lambda model: download_model(model_options[model], model),
80
- inputs=[search_dropdown],
81
- outputs=[search_dropdown],
82
- api_name="search_models"
83
- )
84
- with gr.Row():
85
- pretrain_download_choices.change(fn=change_download_pretrained_choices, inputs=[pretrain_download_choices], outputs=[pretrainD, pretrainG, download_pretrain_button, pretrain_choices, sample_rate_pretrain, download_pretrain_choices_button, pretrain_upload_d, pretrain_upload_g])
86
- pretrain_choices.change(fn=update_sample_rate_dropdown, inputs=[pretrain_choices], outputs=[sample_rate_pretrain])
87
- with gr.Row():
88
- download_pretrain_button.click(
89
- fn=download_pretrained_model,
90
- inputs=[
91
- pretrain_download_choices,
92
- pretrainD,
93
- pretrainG
94
- ],
95
- outputs=[pretrainD, pretrainG],
96
- api_name="download_pretrain_link"
97
- )
98
- download_pretrain_choices_button.click(
99
- fn=download_pretrained_model,
100
- inputs=[
101
- pretrain_download_choices,
102
- pretrain_choices,
103
- sample_rate_pretrain
104
- ],
105
- outputs=[pretrain_choices],
106
- api_name="download_pretrain_choices"
107
- )
108
- pretrain_upload_g.upload(
109
- fn=lambda pretrain_upload_g: shutil_move(pretrain_upload_g.name, configs["pretrained_custom_path"]),
110
- inputs=[pretrain_upload_g],
111
- outputs=[],
112
- api_name="upload_pretrain_g"
113
- )
114
- pretrain_upload_d.upload(
115
- fn=lambda pretrain_upload_d: shutil_move(pretrain_upload_d.name, configs["pretrained_custom_path"]),
116
- inputs=[pretrain_upload_d],
117
- outputs=[],
118
- api_name="upload_pretrain_d"
119
- )
 
main/app/tabs/editing/child/audio_effects.py DELETED
@@ -1,393 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.editing import audio_effects
9
- from main.app.core.presets import audio_effect_load_presets, audio_effect_save_presets
10
- from main.app.core.ui import visible, change_audios_choices, change_effect_preset_choices, shutil_move
11
- from main.app.variables import translations, paths_for_files, sample_rate_choice, audio_effect_presets_file, configs
12
-
13
- def audio_effects_tab():
14
- with gr.Row():
15
- gr.Markdown(translations["audio_effects_edit"])
16
- with gr.Row():
17
- with gr.Column():
18
- with gr.Row():
19
- reverb_check_box = gr.Checkbox(label=translations["reverb"], value=False, interactive=True)
20
- chorus_check_box = gr.Checkbox(label=translations["chorus"], value=False, interactive=True)
21
- delay_check_box = gr.Checkbox(label=translations["delay"], value=False, interactive=True)
22
- phaser_check_box = gr.Checkbox(label=translations["phaser"], value=False, interactive=True)
23
- compressor_check_box = gr.Checkbox(label=translations["compressor"], value=False, interactive=True)
24
- more_options = gr.Checkbox(label=translations["more_option"], value=False, interactive=True)
25
- with gr.Row():
26
- with gr.Accordion(translations["input_output"], open=False):
27
- with gr.Row():
28
- upload_audio = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
29
- with gr.Row():
30
- audio_in_path = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True)
31
- audio_out_path = gr.Textbox(label=translations["output_audio"], value="audios/audio_effects.wav", placeholder="audios/audio_effects.wav", info=translations["provide_output"], interactive=True)
32
- with gr.Row():
33
- with gr.Column():
34
- audio_combination = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True)
35
- audio_combination_input = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True, visible=audio_combination.value)
36
- with gr.Row():
37
- main_vol = gr.Slider(minimum=-80, maximum=80, label=translations["main_volume"], info=translations["main_volume_info"], value=-4, step=1, interactive=True, visible=audio_combination.value)
38
- combine_vol = gr.Slider(minimum=-80, maximum=80, label=translations["combination_volume"], info=translations["combination_volume_info"], value=-7, step=1, interactive=True, visible=audio_combination.value)
39
- with gr.Row():
40
- audio_effects_refresh = gr.Button(translations["refresh"])
41
- with gr.Row():
42
- audio_output_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
43
- with gr.Row():
44
- with gr.Accordion(translations["use_presets"], open=False):
45
- with gr.Row():
46
- presets_name = gr.Dropdown(label=translations["file_preset"], choices=audio_effect_presets_file, value=audio_effect_presets_file[0] if len(audio_effect_presets_file) > 0 else '', interactive=True, allow_custom_value=True)
47
- with gr.Row():
48
- load_click = gr.Button(translations["load_file"], variant="primary")
49
- refresh_click = gr.Button(translations["refresh"])
50
- with gr.Accordion(translations["export_file"], open=False):
51
- with gr.Row():
52
- with gr.Column():
53
- name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
54
- save_file_button = gr.Button(translations["export_file"])
55
- with gr.Row():
56
- upload_presets = gr.File(label=translations["upload_presets"], file_types=[".effect.json"])
57
- with gr.Row():
58
- apply_effects_button = gr.Button(translations["apply"], variant="primary", scale=2)
59
- with gr.Row():
60
- with gr.Column():
61
- with gr.Row():
62
- with gr.Accordion(translations["reverb"], open=False, visible=reverb_check_box.value) as reverb_accordion:
63
- reverb_freeze_mode = gr.Checkbox(label=translations["reverb_freeze"], info=translations["reverb_freeze_info"], value=False, interactive=True)
64
- reverb_room_size = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.15, label=translations["room_size"], info=translations["room_size_info"], interactive=True)
65
- reverb_damping = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.7, label=translations["damping"], info=translations["damping_info"], interactive=True)
66
- reverb_wet_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label=translations["wet_level"], info=translations["wet_level_info"], interactive=True)
67
- reverb_dry_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.8, label=translations["dry_level"], info=translations["dry_level_info"], interactive=True)
68
- reverb_width = gr.Slider(minimum=0, maximum=1, step=0.01, value=1, label=translations["width"], info=translations["width_info"], interactive=True)
69
- with gr.Row():
70
- with gr.Accordion(translations["chorus"], open=False, visible=chorus_check_box.value) as chorus_accordion:
71
- chorus_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_depth"], info=translations["chorus_depth_info"], interactive=True)
72
- chorus_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.5, label=translations["chorus_rate_hz"], info=translations["chorus_rate_hz_info"], interactive=True)
73
- chorus_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_mix"], info=translations["chorus_mix_info"], interactive=True)
74
- chorus_centre_delay_ms = gr.Slider(minimum=0, maximum=50, step=1, value=10, label=translations["chorus_centre_delay_ms"], info=translations["chorus_centre_delay_ms_info"], interactive=True)
75
- chorus_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["chorus_feedback"], info=translations["chorus_feedback_info"], interactive=True)
76
- with gr.Row():
77
- with gr.Accordion(translations["delay"], open=False, visible=delay_check_box.value) as delay_accordion:
78
- delay_second = gr.Slider(minimum=0, maximum=5, step=0.01, value=0.5, label=translations["delay_seconds"], info=translations["delay_seconds_info"], interactive=True)
79
- delay_feedback = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_feedback"], info=translations["delay_feedback_info"], interactive=True)
80
- delay_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_mix"], info=translations["delay_mix_info"], interactive=True)
81
- with gr.Column():
82
- with gr.Row():
83
- with gr.Accordion(translations["more_option"], open=False, visible=more_options.value) as more_accordion:
84
- with gr.Row():
85
- fade = gr.Checkbox(label=translations["fade"], value=False, interactive=True)
86
- bass_or_treble = gr.Checkbox(label=translations["bass_or_treble"], value=False, interactive=True)
87
- limiter = gr.Checkbox(label=translations["limiter"], value=False, interactive=True)
88
- resample_checkbox = gr.Checkbox(label=translations["resample"], value=False, interactive=True)
89
- with gr.Row():
90
- distortion_checkbox = gr.Checkbox(label=translations["distortion"], value=False, interactive=True)
91
- gain_checkbox = gr.Checkbox(label=translations["gain"], value=False, interactive=True)
92
- bitcrush_checkbox = gr.Checkbox(label=translations["bitcrush"], value=False, interactive=True)
93
- clipping_checkbox = gr.Checkbox(label=translations["clipping"], value=False, interactive=True)
94
- with gr.Accordion(translations["fade"], open=True, visible=fade.value) as fade_accordion:
95
- with gr.Row():
96
- fade_in = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_in"], info=translations["fade_in_info"], interactive=True)
97
- fade_out = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_out"], info=translations["fade_out_info"], interactive=True)
98
- with gr.Accordion(translations["bass_or_treble"], open=True, visible=bass_or_treble.value) as bass_treble_accordion:
99
- with gr.Row():
100
- bass_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["bass_boost"], info=translations["bass_boost_info"], interactive=True)
101
- bass_frequency = gr.Slider(minimum=20, maximum=200, step=10, value=100, label=translations["bass_frequency"], info=translations["bass_frequency_info"], interactive=True)
102
- with gr.Row():
103
- treble_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["treble_boost"], info=translations["treble_boost_info"], interactive=True)
104
- treble_frequency = gr.Slider(minimum=1000, maximum=10000, step=500, value=3000, label=translations["treble_frequency"], info=translations["treble_frequency_info"], interactive=True)
105
- with gr.Accordion(translations["limiter"], open=True, visible=limiter.value) as limiter_accordion:
106
- with gr.Row():
107
- limiter_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["limiter_threshold_db"], info=translations["limiter_threshold_db_info"], interactive=True)
108
- limiter_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["limiter_release_ms"], info=translations["limiter_release_ms_info"], interactive=True)
109
- with gr.Column():
110
- pitch_shift_semitones = gr.Slider(minimum=-20, maximum=20, step=1, value=0, label=translations["pitch"], info=translations["pitch_info"], interactive=True)
111
- audio_effect_resample_sr = gr.Radio(choices=[0]+sample_rate_choice, value=0, label=translations["resample"], info=translations["resample_info"], interactive=True, visible=resample_checkbox.value)
112
- distortion_drive_db = gr.Slider(minimum=0, maximum=50, step=1, value=20, label=translations["distortion"], info=translations["distortion_info"], interactive=True, visible=distortion_checkbox.value)
113
- gain_db = gr.Slider(minimum=-60, maximum=60, step=1, value=0, label=translations["gain"], info=translations["gain_info"], interactive=True, visible=gain_checkbox.value)
114
- clipping_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["clipping_threshold_db"], info=translations["clipping_threshold_db_info"], interactive=True, visible=clipping_checkbox.value)
115
- bitcrush_bit_depth = gr.Slider(minimum=1, maximum=24, step=1, value=16, label=translations["bitcrush_bit_depth"], info=translations["bitcrush_bit_depth_info"], interactive=True, visible=bitcrush_checkbox.value)
116
- with gr.Row():
117
- with gr.Accordion(translations["phaser"], open=False, visible=phaser_check_box.value) as phaser_accordion:
118
- phaser_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_depth"], info=translations["phaser_depth_info"], interactive=True)
119
- phaser_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1, label=translations["phaser_rate_hz"], info=translations["phaser_rate_hz_info"], interactive=True)
120
- phaser_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_mix"], info=translations["phaser_mix_info"], interactive=True)
121
- phaser_centre_frequency_hz = gr.Slider(minimum=50, maximum=5000, step=10, value=1000, label=translations["phaser_centre_frequency_hz"], info=translations["phaser_centre_frequency_hz_info"], interactive=True)
122
- phaser_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["phaser_feedback"], info=translations["phaser_feedback_info"], interactive=True)
123
- with gr.Row():
124
- with gr.Accordion(translations["compressor"], open=False, visible=compressor_check_box.value) as compressor_accordion:
125
- compressor_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-20, label=translations["compressor_threshold_db"], info=translations["compressor_threshold_db_info"], interactive=True)
126
- compressor_ratio = gr.Slider(minimum=1, maximum=20, step=0.1, value=1, label=translations["compressor_ratio"], info=translations["compressor_ratio_info"], interactive=True)
127
- compressor_attack_ms = gr.Slider(minimum=0.1, maximum=100, step=0.1, value=10, label=translations["compressor_attack_ms"], info=translations["compressor_attack_ms_info"], interactive=True)
128
- compressor_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["compressor_release_ms"], info=translations["compressor_release_ms_info"], interactive=True)
129
- with gr.Row():
130
- gr.Markdown(translations["output_audio"])
131
- with gr.Row():
132
- audio_play_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
133
- audio_play_output = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
134
- with gr.Row():
135
- reverb_check_box.change(fn=visible, inputs=[reverb_check_box], outputs=[reverb_accordion])
136
- chorus_check_box.change(fn=visible, inputs=[chorus_check_box], outputs=[chorus_accordion])
137
- delay_check_box.change(fn=visible, inputs=[delay_check_box], outputs=[delay_accordion])
138
- with gr.Row():
139
- compressor_check_box.change(fn=visible, inputs=[compressor_check_box], outputs=[compressor_accordion])
140
- phaser_check_box.change(fn=visible, inputs=[phaser_check_box], outputs=[phaser_accordion])
141
- more_options.change(fn=visible, inputs=[more_options], outputs=[more_accordion])
142
- with gr.Row():
143
- fade.change(fn=visible, inputs=[fade], outputs=[fade_accordion])
144
- bass_or_treble.change(fn=visible, inputs=[bass_or_treble], outputs=[bass_treble_accordion])
145
- limiter.change(fn=visible, inputs=[limiter], outputs=[limiter_accordion])
146
- resample_checkbox.change(fn=visible, inputs=[resample_checkbox], outputs=[audio_effect_resample_sr])
147
- with gr.Row():
148
- distortion_checkbox.change(fn=visible, inputs=[distortion_checkbox], outputs=[distortion_drive_db])
149
- gain_checkbox.change(fn=visible, inputs=[gain_checkbox], outputs=[gain_db])
150
- clipping_checkbox.change(fn=visible, inputs=[clipping_checkbox], outputs=[clipping_threshold_db])
151
- bitcrush_checkbox.change(fn=visible, inputs=[bitcrush_checkbox], outputs=[bitcrush_bit_depth])
152
- with gr.Row():
153
- upload_audio.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["audios_path"]), inputs=[upload_audio], outputs=[audio_in_path])
154
- audio_in_path.change(fn=lambda audio: audio if audio else None, inputs=[audio_in_path], outputs=[audio_play_input])
155
- audio_effects_refresh.click(fn=lambda a, b: [change_audios_choices(a), change_audios_choices(b)], inputs=[audio_in_path, audio_combination_input], outputs=[audio_in_path, audio_combination_input])
156
- with gr.Row():
157
- more_options.change(fn=lambda: [False]*8, inputs=[], outputs=[fade, bass_or_treble, limiter, resample_checkbox, distortion_checkbox, gain_checkbox, clipping_checkbox, bitcrush_checkbox])
158
- audio_combination.change(fn=visible, inputs=[audio_combination], outputs=[audio_combination_input])
159
- audio_combination.change(fn=lambda a: [visible(a)]*2, inputs=[audio_combination], outputs=[main_vol, combine_vol])
160
- with gr.Row():
161
- upload_presets.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["presets_path"]), inputs=[upload_presets], outputs=[presets_name])
162
- refresh_click.click(fn=change_effect_preset_choices, inputs=[], outputs=[presets_name])
163
- with gr.Row():
164
- load_click.click(
165
- fn=audio_effect_load_presets,
166
- inputs=[
167
- presets_name,
168
- resample_checkbox,
169
- audio_effect_resample_sr,
170
- chorus_depth,
171
- chorus_rate_hz,
172
- chorus_mix,
173
- chorus_centre_delay_ms,
174
- chorus_feedback,
175
- distortion_drive_db,
176
- reverb_room_size,
177
- reverb_damping,
178
- reverb_wet_level,
179
- reverb_dry_level,
180
- reverb_width,
181
- reverb_freeze_mode,
182
- pitch_shift_semitones,
183
- delay_second,
184
- delay_feedback,
185
- delay_mix,
186
- compressor_threshold_db,
187
- compressor_ratio,
188
- compressor_attack_ms,
189
- compressor_release_ms,
190
- limiter_threshold_db,
191
- limiter_release_ms,
192
- gain_db,
193
- bitcrush_bit_depth,
194
- clipping_threshold_db,
195
- phaser_rate_hz,
196
- phaser_depth,
197
- phaser_centre_frequency_hz,
198
- phaser_feedback,
199
- phaser_mix,
200
- bass_boost,
201
- bass_frequency,
202
- treble_boost,
203
- treble_frequency,
204
- fade_in,
205
- fade_out,
206
- chorus_check_box,
207
- distortion_checkbox,
208
- reverb_check_box,
209
- delay_check_box,
210
- compressor_check_box,
211
- limiter,
212
- gain_checkbox,
213
- bitcrush_checkbox,
214
- clipping_checkbox,
215
- phaser_check_box,
216
- bass_or_treble,
217
- fade
218
- ],
219
- outputs=[
220
- resample_checkbox,
221
- audio_effect_resample_sr,
222
- chorus_depth,
223
- chorus_rate_hz,
224
- chorus_mix,
225
- chorus_centre_delay_ms,
226
- chorus_feedback,
227
- distortion_drive_db,
228
- reverb_room_size,
229
- reverb_damping,
230
- reverb_wet_level,
231
- reverb_dry_level,
232
- reverb_width,
233
- reverb_freeze_mode,
234
- pitch_shift_semitones,
235
- delay_second,
236
- delay_feedback,
237
- delay_mix,
238
- compressor_threshold_db,
239
- compressor_ratio,
240
- compressor_attack_ms,
241
- compressor_release_ms,
242
- limiter_threshold_db,
243
- limiter_release_ms,
244
- gain_db,
245
- bitcrush_bit_depth,
246
- clipping_threshold_db,
247
- phaser_rate_hz,
248
- phaser_depth,
249
- phaser_centre_frequency_hz,
250
- phaser_feedback,
251
- phaser_mix,
252
- bass_boost,
253
- bass_frequency,
254
- treble_boost,
255
- treble_frequency,
256
- fade_in,
257
- fade_out,
258
- chorus_check_box,
259
- distortion_checkbox,
260
- reverb_check_box,
261
- delay_check_box,
262
- compressor_check_box,
263
- limiter,
264
- gain_checkbox,
265
- bitcrush_checkbox,
266
- clipping_checkbox,
267
- phaser_check_box,
268
- bass_or_treble,
269
- fade
270
- ],
271
- )
272
- save_file_button.click(
273
- fn=audio_effect_save_presets,
274
- inputs=[
275
- name_to_save_file,
276
- resample_checkbox,
277
- audio_effect_resample_sr,
278
- chorus_depth,
279
- chorus_rate_hz,
280
- chorus_mix,
281
- chorus_centre_delay_ms,
282
- chorus_feedback,
283
- distortion_drive_db,
284
- reverb_room_size,
285
- reverb_damping,
286
- reverb_wet_level,
287
- reverb_dry_level,
288
- reverb_width,
289
- reverb_freeze_mode,
290
- pitch_shift_semitones,
291
- delay_second,
292
- delay_feedback,
293
- delay_mix,
294
- compressor_threshold_db,
295
- compressor_ratio,
296
- compressor_attack_ms,
297
- compressor_release_ms,
298
- limiter_threshold_db,
299
- limiter_release_ms,
300
- gain_db,
301
- bitcrush_bit_depth,
302
- clipping_threshold_db,
303
- phaser_rate_hz,
304
- phaser_depth,
305
- phaser_centre_frequency_hz,
306
- phaser_feedback,
307
- phaser_mix,
308
- bass_boost,
309
- bass_frequency,
310
- treble_boost,
311
- treble_frequency,
312
- fade_in,
313
- fade_out,
314
- chorus_check_box,
315
- distortion_checkbox,
316
- reverb_check_box,
317
- delay_check_box,
318
- compressor_check_box,
319
- limiter,
320
- gain_checkbox,
321
- bitcrush_checkbox,
322
- clipping_checkbox,
323
- phaser_check_box,
324
- bass_or_treble,
325
- fade
326
- ],
327
- outputs=[presets_name]
328
- )
329
- with gr.Row():
330
- apply_effects_button.click(
331
- fn=audio_effects,
332
- inputs=[
333
- audio_in_path,
334
- audio_out_path,
335
- resample_checkbox,
336
- audio_effect_resample_sr,
337
- chorus_depth,
338
- chorus_rate_hz,
339
- chorus_mix,
340
- chorus_centre_delay_ms,
341
- chorus_feedback,
342
- distortion_drive_db,
343
- reverb_room_size,
344
- reverb_damping,
345
- reverb_wet_level,
346
- reverb_dry_level,
347
- reverb_width,
348
- reverb_freeze_mode,
349
- pitch_shift_semitones,
350
- delay_second,
351
- delay_feedback,
352
- delay_mix,
353
- compressor_threshold_db,
354
- compressor_ratio,
355
- compressor_attack_ms,
356
- compressor_release_ms,
357
- limiter_threshold_db,
358
- limiter_release_ms,
359
- gain_db,
360
- bitcrush_bit_depth,
361
- clipping_threshold_db,
362
- phaser_rate_hz,
363
- phaser_depth,
364
- phaser_centre_frequency_hz,
365
- phaser_feedback,
366
- phaser_mix,
367
- bass_boost,
368
- bass_frequency,
369
- treble_boost,
370
- treble_frequency,
371
- fade_in,
372
- fade_out,
373
- audio_output_format,
374
- chorus_check_box,
375
- distortion_checkbox,
376
- reverb_check_box,
377
- delay_check_box,
378
- compressor_check_box,
379
- limiter,
380
- gain_checkbox,
381
- bitcrush_checkbox,
382
- clipping_checkbox,
383
- phaser_check_box,
384
- bass_or_treble,
385
- fade,
386
- audio_combination,
387
- audio_combination_input,
388
- main_vol,
389
- combine_vol
390
- ],
391
- outputs=[audio_play_output],
392
- api_name="audio_effects"
393
- )
 
main/app/tabs/editing/child/quirk.py DELETED
@@ -1,48 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.editing import apply_voice_quirk
9
- from main.app.core.ui import change_audios_choices, shutil_move
10
- from main.app.variables import translations, paths_for_files, configs
11
-
12
- def quirk_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["quirk_markdown"])
15
- with gr.Row():
16
- input_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
17
- with gr.Row():
18
- quirk_choice = gr.Radio(label=translations["quirk_label"], info=translations["quirk_label_info"], choices=list(translations["quirk_choice"].keys()), interactive=True, value=list(translations["quirk_choice"].keys())[0])
19
- with gr.Row():
20
- apply_quirk_button = gr.Button(translations["apply"], variant="primary")
21
- with gr.Row():
22
- with gr.Accordion(translations["input_output"], open=False):
23
- with gr.Row():
24
- quirk_upload_audio = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
25
- with gr.Column():
26
- quirk_export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
27
- quirk_input_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
28
- quirk_output_path = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
29
- with gr.Column():
30
- quirk_refresh = gr.Button(translations["refresh"])
31
- with gr.Row():
32
- output_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
33
- with gr.Row():
34
- quirk_upload_audio.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["audios_path"]), inputs=[quirk_upload_audio], outputs=[quirk_input_path])
35
- quirk_input_path.change(fn=lambda audio: audio if audio else None, inputs=[quirk_input_path], outputs=[input_audio_play])
36
- quirk_refresh.click(fn=change_audios_choices, inputs=[quirk_input_path], outputs=[quirk_input_path])
37
- with gr.Row():
38
- apply_quirk_button.click(
39
- fn=apply_voice_quirk,
40
- inputs=[
41
- quirk_input_path,
42
- quirk_choice,
43
- quirk_output_path,
44
- quirk_export_format
45
- ],
46
- outputs=[output_audio_play],
47
- api_name="quirk"
48
- )
 
main/app/tabs/editing/editing.py DELETED
@@ -1,20 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import configs, translations
9
- from main.app.tabs.editing.child.quirk import quirk_tab
10
- from main.app.tabs.editing.child.audio_effects import audio_effects_tab
11
-
12
- def editing_tab():
13
- with gr.TabItem(translations["editing"], visible=configs.get("editing_tab", True)):
14
- with gr.TabItem(translations["audio_effects"], visible=configs.get("effects_tab", True)):
15
- gr.Markdown(translations["apply_audio_effects"])
16
- audio_effects_tab()
17
-
18
- with gr.TabItem(translations["quirk"], visible=configs.get("quirk", True)):
19
- gr.Markdown(translations["quirk_info"])
20
- quirk_tab()
 
main/app/tabs/extra/child/convert_model.py DELETED
@@ -1,31 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.ui import visible, shutil_move
9
- from main.app.core.model_utils import onnx_export
10
- from main.app.variables import translations, configs
11
-
12
- def convert_model_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["pytorch2onnx_markdown"])
15
- with gr.Row():
16
- model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"])
17
- with gr.Row():
18
- convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2)
19
- with gr.Row():
20
- model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True)
21
- with gr.Row():
22
- output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False)
23
- with gr.Row():
24
- model_pth_upload.upload(fn=lambda model_pth_upload: shutil_move(model_pth_upload.name, configs["weights_path"]), inputs=[model_pth_upload], outputs=[model_pth_path])
25
- convert_onnx.click(
26
- fn=onnx_export,
27
- inputs=[model_pth_path],
28
- outputs=[output_model2],
29
- api_name="model_onnx_export"
30
- )
31
- convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2])
 
main/app/tabs/extra/child/f0_extract.py DELETED
@@ -1,51 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.f0_extract import f0_extract
9
- from main.app.core.ui import change_audios_choices, unlock_f0, shutil_move
10
- from main.app.variables import translations, paths_for_files, method_f0, configs
11
-
12
- def f0_extract_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["f0_extractor_markdown_2"])
15
- with gr.Row():
16
- extractor_button = gr.Button(translations["extract_button"].replace("2. ", ""), variant="primary")
17
- with gr.Row():
18
- with gr.Column():
19
- upload_audio_file = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
20
- audioplay = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
21
- with gr.Column():
22
- with gr.Accordion(translations["f0_method"], open=False):
23
- with gr.Group():
24
- with gr.Row():
25
- onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
26
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
27
- f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)
28
- with gr.Accordion(translations["audio_path"], open=True):
29
- input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
30
- refresh_audio_button = gr.Button(translations["refresh"])
31
- with gr.Row():
32
- gr.Markdown("___")
33
- with gr.Row():
34
- file_output = gr.File(label="", file_types=[".txt"], interactive=False)
35
- image_output = gr.Image(label="", interactive=False, show_download_button=True)
36
- with gr.Row():
37
- upload_audio_file.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["audios_path"]), inputs=[upload_audio_file], outputs=[input_audio_path])
38
- input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay])
39
- refresh_audio_button.click(fn=change_audios_choices, inputs=[input_audio_path], outputs=[input_audio_path])
40
- with gr.Row():
41
- unlock_full_method.change(fn=lambda method: [m for m in unlock_f0(method) if m != "hybrid"], inputs=[unlock_full_method], outputs=[f0_method_extract])
42
- extractor_button.click(
43
- fn=f0_extract,
44
- inputs=[
45
- input_audio_path,
46
- f0_method_extract,
47
- onnx_f0_mode3
48
- ],
49
- outputs=[file_output, image_output],
50
- api_name="f0_extract"
51
- )
 
main/app/tabs/extra/child/fushion.py DELETED
@@ -1,45 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.ui import visible, shutil_move
9
- from main.app.core.model_utils import fushion_model
10
- from main.app.variables import translations, configs
11
-
12
- def fushion_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["fushion_markdown_2"])
15
- with gr.Row():
16
- name_to_save = gr.Textbox(label=translations["modelname"], placeholder="Model.pth", value="", max_lines=1, interactive=True)
17
- with gr.Row():
18
- fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4)
19
- with gr.Column():
20
- with gr.Row():
21
- model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"])
22
- model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"])
23
- with gr.Row():
24
- model_path_a = gr.Textbox(label=f"{translations['model_path']} 1", value="", placeholder="assets/weights/Model_1.pth")
25
- model_path_b = gr.Textbox(label=f"{translations['model_path']} 2", value="", placeholder="assets/weights/Model_2.pth")
26
- with gr.Row():
27
- ratio = gr.Slider(minimum=0, maximum=1, label=translations["model_ratio"], info=translations["model_ratio_info"], value=0.5, interactive=True)
28
- with gr.Row():
29
- output_model = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False)
30
- with gr.Row():
31
- model_a.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_a], outputs=[model_path_a])
32
- model_b.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model_b], outputs=[model_path_b])
33
- with gr.Row():
34
- fushion_button.click(
35
- fn=fushion_model,
36
- inputs=[
37
- name_to_save,
38
- model_path_a,
39
- model_path_b,
40
- ratio
41
- ],
42
- outputs=[name_to_save, output_model],
43
- api_name="fushion_model"
44
- )
 
main/app/tabs/extra/child/read_model.py DELETED
@@ -1,29 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.ui import shutil_move
9
- from main.app.core.model_utils import model_info
10
- from main.app.variables import translations, configs
11
-
12
- def read_model_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["read_model_markdown_2"])
15
- with gr.Row():
16
- model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"])
17
- with gr.Row():
18
- read_button = gr.Button(translations["readmodel"], variant="primary", scale=2)
19
- with gr.Column():
20
- model_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True)
21
- output_info = gr.Textbox(label=translations["modelinfo"], value="", interactive=False, scale=6)
22
- with gr.Row():
23
- model.upload(fn=lambda model: shutil_move(model.name, configs["weights_path"]), inputs=[model], outputs=[model_path])
24
- read_button.click(
25
- fn=model_info,
26
- inputs=[model_path],
27
- outputs=[output_info],
28
- api_name="read_model"
29
- )
 
main/app/tabs/extra/child/report_bugs.py DELETED
@@ -1,24 +0,0 @@
1
- import os
2
- import sys
3
- import codecs
4
-
5
- import gradio as gr
6
-
7
- sys.path.append(os.getcwd())
8
-
9
- from main.app.core.utils import report_bug
10
- from main.app.variables import translations
11
-
12
- def report_bugs_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["report_bug_info"])
15
- with gr.Row():
16
- with gr.Column():
17
- with gr.Group():
18
- agree_log = gr.Checkbox(label=translations["agree_log"], value=True, interactive=True)
19
- report_text = gr.Textbox(label=translations["error_info"], info=translations["error_info_2"], interactive=True)
20
- report_button = gr.Button(translations["report_bugs"], variant="primary", scale=2)
21
- with gr.Row():
22
- gr.Markdown(translations["report_info"].format(github=codecs.decode("uggcf://tvguho.pbz/CunzUhlauNau16/Ivrganzrfr-EIP/vffhrf", "rot13")))
23
- with gr.Row():
24
- report_button.click(fn=report_bug, inputs=[report_text, agree_log], outputs=[])
 
main/app/tabs/extra/child/settings.py DELETED
@@ -1,61 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.core.ui import change_fp
- from main.app.core.utils import stop_pid
- from main.app.core.restart import change_font, change_language, change_theme
- from main.app.variables import translations, theme, font, configs, language, config
-
- def settings_tab(app):
-     with gr.Row():
-         gr.Markdown(translations["settings_markdown_2"])
-     with gr.Row():
-         toggle_button = gr.Button(translations["change_light_dark"], variant="secondary", scale=2)
-     with gr.Row():
-         with gr.Column():
-             language_dropdown = gr.Dropdown(label=translations["lang"], interactive=True, info=translations["lang_restart"], choices=configs.get("support_language", "vi-VN"), value=language)
-             change_lang = gr.Button(translations["change_lang"], variant="primary", scale=2)
-         with gr.Column():
-             theme_dropdown = gr.Dropdown(label=translations["theme"], interactive=True, info=translations["theme_restart"], choices=configs.get("themes", theme), value=theme, allow_custom_value=True)
-             changetheme = gr.Button(translations["theme_button"], variant="primary", scale=2)
-     with gr.Row():
-         with gr.Column():
-             fp_choice = gr.Radio(choices=["fp16", "fp32"], value="fp16" if configs.get("fp16", False) else "fp32", label=translations["precision"], info=translations["precision_info"], interactive=config.device not in ["cpu", "mps", "ocl:0"])
-             fp_button = gr.Button(translations["update_precision"], variant="secondary", scale=2)
-         with gr.Column():
-             font_choice = gr.Textbox(label=translations["font"], info=translations["font_info"], value=font, interactive=True)
-             font_button = gr.Button(translations["change_font"])
-     with gr.Row():
-         with gr.Column():
-             with gr.Accordion(translations["stop"], open=False, visible=config.debug_mode):
-                 separate_stop = gr.Button(translations["stop_separate"])
-                 convert_stop = gr.Button(translations["stop_convert"])
-                 create_dataset_stop = gr.Button(translations["stop_create_dataset"])
-             with gr.Accordion(translations["stop_training"], open=False):
-                 model_name_stop = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
-                 preprocess_stop = gr.Button(translations["stop_preprocess"])
-                 extract_stop = gr.Button(translations["stop_extract"])
-                 train_stop = gr.Button(translations["stop_training"])
-     with gr.Row():
-         toggle_button.click(fn=None, js="() => {document.body.classList.toggle('dark')}")
-         fp_button.click(fn=change_fp, inputs=[fp_choice], outputs=[fp_choice])
-     with gr.Row():
-         change_lang.click(fn=lambda a: change_language(a, app), inputs=[language_dropdown], outputs=[])
-         changetheme.click(fn=lambda a: change_theme(a, app), inputs=[theme_dropdown], outputs=[])
-         font_button.click(fn=lambda a: change_font(a, app), inputs=[font_choice], outputs=[])
-     with gr.Row():
-         change_lang.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
-         changetheme.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
-         font_button.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
-     with gr.Row():
-         separate_stop.click(fn=lambda: stop_pid("separate_pid", None, False), inputs=[], outputs=[])
-         convert_stop.click(fn=lambda: stop_pid("convert_pid", None, False), inputs=[], outputs=[])
-         create_dataset_stop.click(fn=lambda: stop_pid("create_dataset_pid", None, False), inputs=[], outputs=[])
-     with gr.Row():
-         preprocess_stop.click(fn=lambda model_name_stop: stop_pid("preprocess_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
-         extract_stop.click(fn=lambda model_name_stop: stop_pid("extract_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
-         train_stop.click(fn=lambda model_name_stop: stop_pid("train_pid", model_name_stop, True), inputs=[model_name_stop], outputs=[])
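
For context, `settings_tab` takes the `gr.Blocks` instance because the language/theme/font handlers need it to restart the app; a minimal sketch of how it is mounted, mirroring the `extra_tab(app)` wiring in extra.py below:

import gradio as gr
from main.app.tabs.extra.child.settings import settings_tab

with gr.Blocks() as app:
    settings_tab(app)  # the Blocks handle is passed through to change_language/change_theme/change_font

app.launch()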
 
main/app/tabs/extra/extra.py DELETED
@@ -1,40 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.variables import translations, configs
- from main.app.tabs.extra.child.fushion import fushion_tab
- from main.app.tabs.extra.child.settings import settings_tab
- from main.app.tabs.extra.child.read_model import read_model_tab
- from main.app.tabs.extra.child.f0_extract import f0_extract_tab
- from main.app.tabs.extra.child.report_bugs import report_bugs_tab
- from main.app.tabs.extra.child.convert_model import convert_model_tab
-
- def extra_tab(app):
-     with gr.TabItem(translations["extra"], visible=configs.get("extra_tab", True)):
-         with gr.TabItem(translations["fushion"], visible=configs.get("fushion_tab", True)):
-             gr.Markdown(translations["fushion_markdown"])
-             fushion_tab()
-
-         with gr.TabItem(translations["read_model"], visible=configs.get("read_tab", True)):
-             gr.Markdown(translations["read_model_markdown"])
-             read_model_tab()
-
-         with gr.TabItem(translations["convert_model"], visible=configs.get("onnx_tab", True)):
-             gr.Markdown(translations["pytorch2onnx"])
-             convert_model_tab()
-
-         with gr.TabItem(translations["f0_extractor_tab"], visible=configs.get("f0_extractor_tab", True)):
-             gr.Markdown(translations["f0_extractor_markdown"])
-             f0_extract_tab()
-
-         with gr.TabItem(translations["settings"], visible=configs.get("settings_tab", True)):
-             gr.Markdown(translations["settings_markdown"])
-             settings_tab(app)
-
-         with gr.TabItem(translations["report_bugs"], visible=configs.get("report_bug_tab", True)):
-             gr.Markdown(translations["report_bugs"])
-             report_bugs_tab()
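
Every tab here is gated on a `configs` key and defaults to visible when the key is absent; a one-line illustration of the pattern (the dict contents are an assumption — the real values come from main/configs/config.json):

configs = {"fushion_tab": False}          # hypothetical override
print(configs.get("fushion_tab", True))   # False -> that tab is hidden
print(configs.get("read_tab", True))      # True  -> missing keys default to shown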
 
main/app/tabs/inference/child/convert.py DELETED
@@ -1,313 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.core.presets import load_presets, save_presets
- from main.app.core.inference import convert_audio, convert_selection
- from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, presets_file, configs
- from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, change_f0_choices, unlock_f0, change_preset_choices, change_backing_choices, hoplength_show, change_models_choices, get_index, index_strength_show, visible_embedders, shutil_move
-
- def convert_tab():
-     with gr.Row():
-         gr.Markdown(translations["convert_info"])
-     with gr.Row():
-         with gr.Column():
-             with gr.Group():
-                 with gr.Row():
-                     cleaner0 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
-                     autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
-                     use_audio = gr.Checkbox(label=translations["use_audio"], value=False, interactive=True)
-                     checkpointing = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
-                 with gr.Row():
-                     use_original = gr.Checkbox(label=translations["convert_original"], value=False, interactive=True, visible=use_audio.value)
-                     convert_backing = gr.Checkbox(label=translations["convert_backing"], value=False, interactive=True, visible=use_audio.value)
-                     not_merge_backing = gr.Checkbox(label=translations["not_merge_backing"], value=False, interactive=True, visible=use_audio.value)
-                     merge_instrument = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True, visible=use_audio.value)
-                 with gr.Row():
-                     pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
-                     clean_strength0 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner0.value)
-     with gr.Row():
-         with gr.Column():
-             audio_select = gr.Dropdown(label=translations["select_separate"], choices=[], value="", interactive=True, allow_custom_value=True, visible=False)
-             convert_button_2 = gr.Button(translations["convert_audio"], visible=False)
-     with gr.Row():
-         with gr.Column():
-             convert_button = gr.Button(translations["convert_audio"], variant="primary")
-     with gr.Row():
-         with gr.Column():
-             input0 = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
-             play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
-         with gr.Column():
-             with gr.Accordion(translations["model_accordion"], open=True):
-                 with gr.Row():
-                     model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
-                     model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
-                 with gr.Row():
-                     refresh = gr.Button(translations["refresh"])
-                 with gr.Row():
-                     index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
-             with gr.Accordion(translations["input_output"], open=False):
-                 with gr.Column():
-                     export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
-                     input_audio0 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
-                     output_audio = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
-                 with gr.Column():
-                     refresh0 = gr.Button(translations["refresh"])
-             with gr.Accordion(translations["setting"], open=False):
-                 with gr.Accordion(translations["f0_method"], open=False):
-                     with gr.Group():
-                         with gr.Row():
-                             onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
-                             unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
-                         method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
-                         hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method.value == "hybrid")
-                         hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
-                 with gr.Accordion(translations["f0_file"], open=False):
-                     upload_f0_file = gr.File(label=translations["upload_f0"], file_types=[".txt"])
-                     f0_file_dropdown = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
-                     refresh_f0_file = gr.Button(translations["refresh"])
-                 with gr.Accordion(translations["hubert_model"], open=False):
-                     embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
-                     embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
-                     custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
-                 with gr.Accordion(translations["use_presets"], open=False):
-                     with gr.Row():
-                         presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True)
-                     with gr.Row():
-                         load_click = gr.Button(translations["load_file"], variant="primary")
-                         refresh_click = gr.Button(translations["refresh"])
-                     with gr.Accordion(translations["export_file"], open=False):
-                         with gr.Row():
-                             with gr.Column():
-                                 with gr.Group():
-                                     with gr.Row():
-                                         cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True)
-                                         autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True)
-                                         pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True)
-                                         index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True)
-                                         resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True)
-                                         filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True)
-                                         rms_mix_rate_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True)
-                                         protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True)
-                                         split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True)
-                                         formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True)
-                         with gr.Row():
-                             with gr.Column():
-                                 name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
-                                 save_file_button = gr.Button(translations["export_file"])
-                     with gr.Row():
-                         upload_presets = gr.File(label=translations["upload_presets"], file_types=[".conversion.json"])
-                 with gr.Column():
-                     with gr.Row():
-                         split_audio = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
-                         formant_shifting = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
-                         proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
-                     resample_sr = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
-                     proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
-                     f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
-                     filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
-                     rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
-                     protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
-                 with gr.Row():
-                     formant_qfrency = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-                     formant_timbre = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-     with gr.Row():
-         gr.Markdown(translations["output_convert"])
-     with gr.Row():
-         main_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["main_convert"])
-         backing_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_backing"], visible=convert_backing.value)
-         main_backing = gr.Audio(show_download_button=True, interactive=False, label=translations["main_or_backing"], visible=convert_backing.value)
-     with gr.Row():
-         original_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_original"], visible=use_original.value)
-         vocal_instrument = gr.Audio(show_download_button=True, interactive=False, label=translations["voice_or_instruments"], visible=merge_instrument.value)
-     with gr.Row():
-         upload_f0_file.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file], outputs=[f0_file_dropdown])
-         refresh_f0_file.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown])
-         unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[method])
-     with gr.Row():
-         load_click.click(
-             fn=load_presets,
-             inputs=[
-                 presets_name,
-                 cleaner0,
-                 autotune,
-                 pitch,
-                 clean_strength0,
-                 index_strength,
-                 resample_sr,
-                 filter_radius,
-                 rms_mix_rate,
-                 protect,
-                 split_audio,
-                 f0_autotune_strength,
-                 formant_qfrency,
-                 formant_timbre
-             ],
-             outputs=[
-                 cleaner0,
-                 autotune,
-                 pitch,
-                 clean_strength0,
-                 index_strength,
-                 resample_sr,
-                 filter_radius,
-                 rms_mix_rate,
-                 protect,
-                 split_audio,
-                 f0_autotune_strength,
-                 formant_shifting,
-                 formant_qfrency,
-                 formant_timbre
-             ]
-         )
-         refresh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
-         save_file_button.click(
-             fn=save_presets,
-             inputs=[
-                 name_to_save_file,
-                 cleaner0,
-                 autotune,
-                 pitch,
-                 clean_strength0,
-                 index_strength,
-                 resample_sr,
-                 filter_radius,
-                 rms_mix_rate,
-                 protect,
-                 split_audio,
-                 f0_autotune_strength,
-                 cleaner_chbox,
-                 autotune_chbox,
-                 pitch_chbox,
-                 index_strength_chbox,
-                 resample_sr_chbox,
-                 filter_radius_chbox,
-                 rms_mix_rate_chbox,
-                 protect_chbox,
-                 split_audio_chbox,
-                 formant_shifting_chbox,
-                 formant_shifting,
-                 formant_qfrency,
-                 formant_timbre
-             ],
-             outputs=[presets_name]
-         )
-     with gr.Row():
-         upload_presets.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["presets_path"]), inputs=[upload_presets], outputs=[presets_name])
-         autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
-         use_audio.change(fn=lambda a: [visible(a), visible(a), visible(a), visible(a), visible(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), visible(not a), visible(not a), visible(not a), visible(not a)], inputs=[use_audio], outputs=[main_backing, use_original, convert_backing, not_merge_backing, merge_instrument, use_original, convert_backing, not_merge_backing, merge_instrument, input_audio0, output_audio, input0, play_audio])
-     with gr.Row():
-         convert_backing.change(fn=lambda a,b: [change_backing_choices(a, b), visible(a)], inputs=[convert_backing, not_merge_backing], outputs=[use_original, backing_convert])
-         use_original.change(fn=lambda audio, original: [visible(original), visible(not original), visible(audio and not original), valueFalse_interactive(not original), valueFalse_interactive(not original)], inputs=[use_audio, use_original], outputs=[original_convert, main_convert, main_backing, convert_backing, not_merge_backing])
-         cleaner0.change(fn=visible, inputs=[cleaner0], outputs=[clean_strength0])
-     with gr.Row():
-         merge_instrument.change(fn=visible, inputs=[merge_instrument], outputs=[vocal_instrument])
-         not_merge_backing.change(fn=lambda audio, merge, cvb: [visible(audio and not merge), change_backing_choices(cvb, merge)], inputs=[use_audio, not_merge_backing, convert_backing], outputs=[main_backing, use_original])
-         method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method, hybrid_method], outputs=[hybrid_method, hop_length])
-     with gr.Row():
-         hybrid_method.change(fn=hoplength_show, inputs=[method, hybrid_method], outputs=[hop_length])
-         refresh.click(fn=change_models_choices, inputs=[], outputs=[model_pth, model_index])
-         model_pth.change(fn=get_index, inputs=[model_pth], outputs=[model_index])
-     with gr.Row():
-         input0.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["audios_path"]), inputs=[input0], outputs=[input_audio0])
-         input_audio0.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio0], outputs=[play_audio])
-         formant_shifting.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting], outputs=[formant_qfrency, formant_timbre])
-     with gr.Row():
-         embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[custom_embedders])
-         refresh0.click(fn=change_audios_choices, inputs=[input_audio0], outputs=[input_audio0])
-         model_index.change(fn=index_strength_show, inputs=[model_index], outputs=[index_strength])
-     with gr.Row():
-         convert_button.click(fn=lambda: visible(False), inputs=[], outputs=[convert_button])
-         convert_button_2.click(fn=lambda: [visible(False), visible(False)], inputs=[], outputs=[audio_select, convert_button_2])
-     with gr.Row():
-         embed_mode.change(fn=visible_embedders, inputs=[embed_mode], outputs=[embedders])
-         proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
-     with gr.Row():
-         convert_button.click(
-             fn=convert_selection,
-             inputs=[
-                 cleaner0,
-                 autotune,
-                 use_audio,
-                 use_original,
-                 convert_backing,
-                 not_merge_backing,
-                 merge_instrument,
-                 pitch,
-                 clean_strength0,
-                 model_pth,
-                 model_index,
-                 index_strength,
-                 input_audio0,
-                 output_audio,
-                 export_format,
-                 method,
-                 hybrid_method,
-                 hop_length,
-                 embedders,
-                 custom_embedders,
-                 resample_sr,
-                 filter_radius,
-                 rms_mix_rate,
-                 protect,
-                 split_audio,
-                 f0_autotune_strength,
-                 checkpointing,
-                 onnx_f0_mode,
-                 formant_shifting,
-                 formant_qfrency,
-                 formant_timbre,
-                 f0_file_dropdown,
-                 embed_mode,
-                 proposal_pitch,
-                 proposal_pitch_threshold
-             ],
-             outputs=[audio_select, main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button, convert_button_2],
-             api_name="convert_selection"
-         )
-         convert_button_2.click(
-             fn=convert_audio,
-             inputs=[
-                 cleaner0,
-                 autotune,
-                 use_audio,
-                 use_original,
-                 convert_backing,
-                 not_merge_backing,
-                 merge_instrument,
-                 pitch,
-                 clean_strength0,
-                 model_pth,
-                 model_index,
-                 index_strength,
-                 input_audio0,
-                 output_audio,
-                 export_format,
-                 method,
-                 hybrid_method,
-                 hop_length,
-                 embedders,
-                 custom_embedders,
-                 resample_sr,
-                 filter_radius,
-                 rms_mix_rate,
-                 protect,
-                 split_audio,
-                 f0_autotune_strength,
-                 audio_select,
-                 checkpointing,
-                 onnx_f0_mode,
-                 formant_shifting,
-                 formant_qfrency,
-                 formant_timbre,
-                 f0_file_dropdown,
-                 embed_mode,
-                 proposal_pitch,
-                 proposal_pitch_threshold
-             ],
-             outputs=[main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button],
-             api_name="convert_audio"
-         )
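
Many handlers above pipe checkbox state straight into `visible`/`valueFalse_interactive` from main.app.core.ui, whose source is not shown in this diff; a plausible sketch of what they return — an assumption for illustration, not the deleted implementation:

import gradio as gr

def visible(show: bool):
    # toggle a component's visibility from a boolean checkbox value
    return gr.update(visible=show)

def valueFalse_interactive(enabled: bool):
    # reset a checkbox to False while enabling or disabling it in one update
    return gr.update(value=False, interactive=enabled)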
 
main/app/tabs/inference/child/convert_tts.py DELETED
@@ -1,171 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.core.tts import TTS
- from main.app.core.process import process_input
- from main.app.core.inference import convert_tts
- from main.app.core.utils import google_translate
- from main.app.variables import translations, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, edgetts, google_tts_voice, configs
- from main.app.core.ui import visible, change_f0_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, visible_embedders, change_tts_voice_choices, shutil_move
-
- def convert_tts_tab():
-     with gr.Row():
-         gr.Markdown(translations["convert_text_markdown_2"])
-     with gr.Row():
-         with gr.Column():
-             with gr.Group():
-                 with gr.Row():
-                     use_txt = gr.Checkbox(label=translations["input_txt"], value=False, interactive=True)
-                     google_tts_check_box = gr.Checkbox(label=translations["googletts"], value=False, interactive=True)
-                 prompt = gr.Textbox(label=translations["text_to_speech"], value="", placeholder="Hello Words", lines=3)
-                 with gr.Column():
-                     speed = gr.Slider(label=translations["voice_speed"], info=translations["voice_speed_info"], minimum=-100, maximum=100, value=0, step=1)
-                     pitch0 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
-             with gr.Row():
-                 tts_button = gr.Button(translations["tts_1"], variant="primary", scale=2)
-                 convert_button0 = gr.Button(translations["tts_2"], variant="secondary", scale=2)
-     with gr.Row():
-         with gr.Column():
-             txt_input = gr.File(label=translations["drop_text"], file_types=[".txt", ".srt", ".docx"], visible=use_txt.value)
-             tts_voice = gr.Dropdown(label=translations["voice"], choices=edgetts, interactive=True, value="vi-VN-NamMinhNeural")
-             tts_pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info_2"], label=translations["pitch"], value=0, interactive=True)
-             with gr.Accordion(translations["translate"], open=False):
-                 with gr.Row():
-                     source_lang = gr.Dropdown(label=translations["source_lang"], choices=["auto"]+google_tts_voice, interactive=True, value="auto")
-                     target_lang = gr.Dropdown(label=translations["target_lang"], choices=google_tts_voice, interactive=True, value="en")
-                 translate_button = gr.Button(translations["translate"])
-         with gr.Column():
-             with gr.Accordion(translations["model_accordion"], open=True):
-                 with gr.Row():
-                     model_pth0 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
-                     model_index0 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
-                 with gr.Row():
-                     refresh1 = gr.Button(translations["refresh"])
-                 with gr.Row():
-                     index_strength0 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index0.value != "")
-             with gr.Accordion(translations["output_path"], open=False):
-                 export_format0 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
-                 output_audio0 = gr.Textbox(label=translations["output_tts"], value="audios/tts.wav", placeholder="audios/tts.wav", info=translations["tts_output"], interactive=True)
-                 output_audio1 = gr.Textbox(label=translations["output_tts_convert"], value="audios/tts-convert.wav", placeholder="audios/tts-convert.wav", info=translations["tts_output"], interactive=True)
-             with gr.Accordion(translations["setting"], open=False):
-                 with gr.Accordion(translations["f0_method"], open=False):
-                     with gr.Group():
-                         with gr.Row():
-                             onnx_f0_mode1 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
-                             unlock_full_method3 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
-                         method0 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
-                         hybrid_method0 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method0.value == "hybrid")
-                         hop_length0 = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
-                 with gr.Accordion(translations["f0_file"], open=False):
-                     upload_f0_file0 = gr.File(label=translations["upload_f0"], file_types=[".txt"])
-                     f0_file_dropdown0 = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
-                     refresh_f0_file0 = gr.Button(translations["refresh"])
-                 with gr.Accordion(translations["hubert_model"], open=False):
-                     embed_mode1 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
-                     embedders0 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
-                     custom_embedders0 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders0.value == "custom")
-                 with gr.Group():
-                     with gr.Row():
-                         formant_shifting1 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
-                         split_audio0 = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
-                         cleaner1 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
-                     with gr.Row():
-                         autotune3 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
-                         checkpointing0 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
-                         proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
-                 with gr.Column():
-                     resample_sr0 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
-                     proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
-                     f0_autotune_strength0 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune3.value)
-                     clean_strength1 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner1.value)
-                     filter_radius0 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
-                     rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
-                     protect0 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
-                 with gr.Row():
-                     formant_qfrency1 = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-                     formant_timbre1 = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-     with gr.Row():
-         gr.Markdown(translations["output_tts_markdown"])
-     with gr.Row():
-         tts_voice_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["output_text_to_speech"])
-         tts_voice_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
-     with gr.Row():
-         proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
-         translate_button.click(fn=google_translate, inputs=[prompt, source_lang, target_lang], outputs=[prompt], api_name="google_translate")
-     with gr.Row():
-         unlock_full_method3.change(fn=unlock_f0, inputs=[unlock_full_method3], outputs=[method0])
-         upload_f0_file0.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file0], outputs=[f0_file_dropdown0])
-         refresh_f0_file0.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown0])
-     with gr.Row():
-         embed_mode1.change(fn=visible_embedders, inputs=[embed_mode1], outputs=[embedders0])
-         autotune3.change(fn=visible, inputs=[autotune3], outputs=[f0_autotune_strength0])
-         model_pth0.change(fn=get_index, inputs=[model_pth0], outputs=[model_index0])
-     with gr.Row():
-         cleaner1.change(fn=visible, inputs=[cleaner1], outputs=[clean_strength1])
-         method0.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method0, hybrid_method0], outputs=[hybrid_method0, hop_length0])
-         hybrid_method0.change(fn=hoplength_show, inputs=[method0, hybrid_method0], outputs=[hop_length0])
-     with gr.Row():
-         refresh1.click(fn=change_models_choices, inputs=[], outputs=[model_pth0, model_index0])
-         embedders0.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders0], outputs=[custom_embedders0])
-         formant_shifting1.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting1], outputs=[formant_qfrency1, formant_timbre1])
-     with gr.Row():
-         model_index0.change(fn=index_strength_show, inputs=[model_index0], outputs=[index_strength0])
-         txt_input.upload(fn=process_input, inputs=[txt_input], outputs=[prompt])
-         use_txt.change(fn=visible, inputs=[use_txt], outputs=[txt_input])
-     with gr.Row():
-         google_tts_check_box.change(fn=change_tts_voice_choices, inputs=[google_tts_check_box], outputs=[tts_voice])
-         tts_button.click(
-             fn=TTS,
-             inputs=[
-                 prompt,
-                 tts_voice,
-                 speed,
-                 output_audio0,
-                 tts_pitch,
-                 google_tts_check_box,
-                 txt_input
-             ],
-             outputs=[tts_voice_audio],
-             api_name="text-to-speech"
-         )
-         convert_button0.click(
-             fn=convert_tts,
-             inputs=[
-                 cleaner1,
-                 autotune3,
-                 pitch0,
-                 clean_strength1,
-                 model_pth0,
-                 model_index0,
-                 index_strength0,
-                 output_audio0,
-                 output_audio1,
-                 export_format0,
-                 method0,
-                 hybrid_method0,
-                 hop_length0,
-                 embedders0,
-                 custom_embedders0,
-                 resample_sr0,
-                 filter_radius0,
-                 rms_mix_rate0,
-                 protect0,
-                 split_audio0,
-                 f0_autotune_strength0,
-                 checkpointing0,
-                 onnx_f0_mode1,
-                 formant_shifting1,
-                 formant_qfrency1,
-                 formant_timbre1,
-                 f0_file_dropdown0,
-                 embed_mode1,
-                 proposal_pitch,
-                 proposal_pitch_threshold
-             ],
-             outputs=[tts_voice_convert],
-             api_name="convert_tts"
-         )
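
Because the handlers register `api_name` endpoints, the TTS step can also be driven headlessly; a hedged sketch with gradio_client — the server address is an assumption, and the positional arguments follow the `inputs` list wired to tts_button above, using the UI's default values:

from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # assumed local instance
audio_path = client.predict(
    "Hello Words",            # prompt
    "vi-VN-NamMinhNeural",    # tts_voice
    0,                        # speed
    "audios/tts.wav",         # output_audio0
    0,                        # tts_pitch
    False,                    # google_tts_check_box (use Edge TTS)
    None,                     # txt_input (no .txt upload)
    api_name="/text-to-speech",
)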
 
main/app/tabs/inference/child/convert_with_whisper.py DELETED
@@ -1,160 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.core.inference import convert_with_whisper
- from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, embedders_mode, embedders_model, configs
- from main.app.core.ui import visible, change_audios_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, visible_embedders, shutil_move
-
- def convert_with_whisper_tab():
-     with gr.Row():
-         gr.Markdown(translations["convert_with_whisper_info"])
-     with gr.Row():
-         with gr.Column():
-             with gr.Group():
-                 with gr.Row():
-                     cleaner2 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
-                     autotune2 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
-                     checkpointing2 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
-                     formant_shifting2 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
-                     proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
-                 with gr.Row():
-                     num_spk = gr.Slider(minimum=2, maximum=8, step=1, info=translations["num_spk_info"], label=translations["num_spk"], value=2, interactive=True)
-     with gr.Row():
-         with gr.Column():
-             convert_button3 = gr.Button(translations["convert_audio"], variant="primary")
-     with gr.Row():
-         with gr.Column():
-             with gr.Accordion(translations["model_accordion"] + " 1", open=True):
-                 with gr.Row():
-                     model_pth2 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
-                     model_index2 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
-                 with gr.Row():
-                     refresh2 = gr.Button(translations["refresh"])
-                 with gr.Row():
-                     pitch3 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
-                     index_strength2 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index2.value != "")
-             with gr.Accordion(translations["input_output"], open=False):
-                 with gr.Column():
-                     export_format2 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
-                     input_audio1 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
-                     output_audio2 = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
-                 with gr.Column():
-                     refresh4 = gr.Button(translations["refresh"])
-             with gr.Row():
-                 input2 = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
-         with gr.Column():
-             with gr.Accordion(translations["model_accordion"] + " 2", open=True):
-                 with gr.Row():
-                     model_pth3 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
-                     model_index3 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
-                 with gr.Row():
-                     refresh3 = gr.Button(translations["refresh"])
-                 with gr.Row():
-                     pitch4 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
-                     index_strength3 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index3.value != "")
-             with gr.Accordion(translations["setting"], open=False):
-                 with gr.Row():
-                     model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True)
-                 with gr.Accordion(translations["f0_method"], open=False):
-                     with gr.Group():
-                         with gr.Row():
-                             onnx_f0_mode4 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
-                             unlock_full_method2 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
-                         method3 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
-                         hybrid_method3 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method3.value == "hybrid")
-                         hop_length3 = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
-                 with gr.Accordion(translations["hubert_model"], open=False):
-                     embed_mode3 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
-                     embedders3 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
-                     custom_embedders3 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders3.value == "custom")
-                 with gr.Column():
-                     resample_sr3 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
-                     proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
-                     clean_strength3 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner2.value)
-                     f0_autotune_strength3 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune2.value)
-                     filter_radius3 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
-                     rms_mix_rate3 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
-                     protect3 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
-                 with gr.Row():
-                     formant_qfrency3 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 1", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-                     formant_timbre3 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 1", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-                 with gr.Row():
-                     formant_qfrency4 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 2", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-                     formant_timbre4 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 2", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-     with gr.Row():
-         gr.Markdown(translations["input_output"])
-     with gr.Row():
-         play_audio2 = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
-         play_audio3 = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
-     with gr.Row():
-         autotune2.change(fn=visible, inputs=[autotune2], outputs=[f0_autotune_strength3])
-         cleaner2.change(fn=visible, inputs=[cleaner2], outputs=[clean_strength3])
-         method3.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method3, hybrid_method3], outputs=[hybrid_method3, hop_length3])
-     with gr.Row():
-         hybrid_method3.change(fn=hoplength_show, inputs=[method3, hybrid_method3], outputs=[hop_length3])
-         refresh2.click(fn=change_models_choices, inputs=[], outputs=[model_pth2, model_index2])
-         model_pth2.change(fn=get_index, inputs=[model_pth2], outputs=[model_index2])
-     with gr.Row():
-         refresh3.click(fn=change_models_choices, inputs=[], outputs=[model_pth3, model_index3])
-         model_pth3.change(fn=get_index, inputs=[model_pth3], outputs=[model_index3])
-         input2.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["audios_path"]), inputs=[input2], outputs=[input_audio1])
-     with gr.Row():
-         input_audio1.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio1], outputs=[play_audio2])
-         formant_shifting2.change(fn=lambda a: [visible(a)]*4, inputs=[formant_shifting2], outputs=[formant_qfrency3, formant_timbre3, formant_qfrency4, formant_timbre4])
-         embedders3.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders3], outputs=[custom_embedders3])
-     with gr.Row():
-         refresh4.click(fn=change_audios_choices, inputs=[input_audio1], outputs=[input_audio1])
-         model_index2.change(fn=index_strength_show, inputs=[model_index2], outputs=[index_strength2])
-         model_index3.change(fn=index_strength_show, inputs=[model_index3], outputs=[index_strength3])
-     with gr.Row():
-         unlock_full_method2.change(fn=unlock_f0, inputs=[unlock_full_method2], outputs=[method3])
-         embed_mode3.change(fn=visible_embedders, inputs=[embed_mode3], outputs=[embedders3])
-         proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
-     with gr.Row():
-         convert_button3.click(
-             fn=convert_with_whisper,
-             inputs=[
-                 num_spk,
-                 model_size,
-                 cleaner2,
-                 clean_strength3,
-                 autotune2,
-                 f0_autotune_strength3,
-                 checkpointing2,
-                 model_pth2,
-                 model_pth3,
-                 model_index2,
-                 model_index3,
-                 pitch3,
-                 pitch4,
-                 index_strength2,
-                 index_strength3,
-                 export_format2,
-                 input_audio1,
-                 output_audio2,
-                 onnx_f0_mode4,
-                 method3,
-                 hybrid_method3,
-                 hop_length3,
-                 embed_mode3,
-                 embedders3,
-                 custom_embedders3,
-                 resample_sr3,
-                 filter_radius3,
-                 rms_mix_rate3,
-                 protect3,
-                 formant_shifting2,
-                 formant_qfrency3,
-                 formant_timbre3,
-                 formant_qfrency4,
-                 formant_timbre4,
-                 proposal_pitch,
-                 proposal_pitch_threshold
-             ],
-             outputs=[play_audio3],
-             api_name="convert_with_whisper"
-         )
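
The hybrid f0 options above all follow the fixed `hybrid[a+b]` naming scheme; a small hypothetical helper (not from the deleted source) showing how such a value can be split back into its two extractors:

import re

def parse_hybrid(method: str):
    # "hybrid[rmvpe+harvest]" -> ("rmvpe", "harvest"); None if not a hybrid value
    m = re.fullmatch(r"hybrid\[([\w-]+)\+([\w-]+)\]", method)
    return m.groups() if m else None

print(parse_hybrid("hybrid[crepe-tiny+fcpe]"))  # ('crepe-tiny', 'fcpe')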
 
main/app/tabs/inference/child/separate.py DELETED
@@ -1,108 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.core.downloads import download_url
- from main.app.core.separate import separator_music
- from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, shutil_move
- from main.app.variables import translations, uvr_model, paths_for_files, mdx_model, sample_rate_choice, configs
-
- def separate_tab():
-     with gr.Row():
-         gr.Markdown(translations["4_part"])
-     with gr.Row():
-         with gr.Column():
-             with gr.Group():
-                 with gr.Row():
-                     cleaner = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True, min_width=140)
-                     backing = gr.Checkbox(label=translations["separator_backing"], value=False, interactive=True, min_width=140)
-                     reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True, min_width=140)
-                     backing_reverb = gr.Checkbox(label=translations["dereveb_backing"], value=False, interactive=False, min_width=140)
-                     denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False, min_width=140)
-                 with gr.Row():
-                     separator_model = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True)
-                     separator_backing_model = gr.Dropdown(label=translations["separator_backing_model"], value="Version-1", choices=["Version-1", "Version-2"], interactive=True, visible=backing.value)
-     with gr.Row():
-         with gr.Column():
-             separator_button = gr.Button(translations["separator_tab"], variant="primary")
-     with gr.Row():
-         with gr.Column():
-             with gr.Group():
-                 with gr.Row():
-                     shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True)
-                     segment_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
-                 with gr.Row():
-                     mdx_batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=backing.value or reverb.value or separator_model.value in mdx_model)
-         with gr.Column():
-             with gr.Group():
-                 with gr.Row():
-                     overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
-                 with gr.Row():
-                     mdx_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=8192, value=1024, step=1, interactive=True, visible=backing.value or reverb.value or separator_model.value in mdx_model)
-     with gr.Row():
-         with gr.Column():
-             input = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
-             with gr.Accordion(translations["use_url"], open=False):
-                 url = gr.Textbox(label=translations["url_audio"], value="", placeholder="https://www.youtube.com/...", scale=6)
-                 download_button = gr.Button(translations["downloads"])
-         with gr.Column():
-             with gr.Row():
-                 clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner.value)
-                 sample_rate1 = gr.Radio(choices=sample_rate_choice, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True)
-             with gr.Accordion(translations["input_output"], open=False):
-                 format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
-                 input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
-                 refresh_separator = gr.Button(translations["refresh"])
-                 output_separator = gr.Textbox(label=translations["output_folder"], value="audios", placeholder="audios", info=translations["output_folder_info"], interactive=True)
-             audio_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
-     with gr.Row():
-         gr.Markdown(translations["output_separator"])
-     with gr.Row():
-         instruments_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["instruments"])
-         original_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["original_vocal"])
-         main_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["main_vocal"], visible=backing.value)
-         backing_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["backing_vocal"], visible=backing.value)
-     with gr.Row():
-         separator_model.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), visible(c not in mdx_model)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, shifts])
-         backing.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), visible(a), visible(a), visible(a), valueFalse_interactive(a and b)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, separator_backing_model, main_vocals, backing_vocals, backing_reverb])
-         reverb.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), valueFalse_interactive(a and b)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, backing_reverb])
-     with gr.Row():
-         input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[audio_input])
-         cleaner.change(fn=visible, inputs=[cleaner], outputs=[clean_strength])
-     with gr.Row():
-         input.upload(fn=lambda audio_in: shutil_move(audio_in.name, configs["audios_path"]), inputs=[input], outputs=[input_audio])
-         refresh_separator.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
-     with gr.Row():
-         download_button.click(
-             fn=download_url,
-             inputs=[url],
-             outputs=[input_audio, audio_input, url],
-             api_name='download_url'
-         )
-         separator_button.click(
-             fn=separator_music,
-             inputs=[
-                 input_audio,
-                 output_separator,
-                 format,
-                 shifts,
-                 segment_size,
-                 overlap,
-                 cleaner,
-                 clean_strength,
-                 denoise,
-                 separator_model,
-                 separator_backing_model,
-                 backing,
-                 reverb,
-                 backing_reverb,
-                 mdx_hop_length,
-                 mdx_batch_size,
-                 sample_rate1
-             ],
-             outputs=[original_vocals, instruments_audio, main_vocals, backing_vocals],
-             api_name='separator_music'
-         )
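
One quirk worth noting: `overlap` is a Radio of string choices ("0.25" through "0.99"), so `separator_music` presumably casts it before use; illustratively (the cast site inside separator_music is an assumption):

overlap = float("0.25")  # the Radio delivers str values; downstream math needs a float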
 
main/app/tabs/inference/inference.py DELETED
@@ -1,30 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.variables import translations, configs
- from main.app.tabs.inference.child.convert import convert_tab
- from main.app.tabs.inference.child.separate import separate_tab
- from main.app.tabs.inference.child.convert_tts import convert_tts_tab
- from main.app.tabs.inference.child.convert_with_whisper import convert_with_whisper_tab
-
- def inference_tab():
-     with gr.TabItem(translations["inference"], visible=configs.get("inference_tab", True)):
-         with gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)):
-             gr.Markdown(f"## {translations['separator_tab']}")
-             separate_tab()
-
-         with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)):
-             gr.Markdown(f"## {translations['convert_audio']}")
-             convert_tab()
-
-         with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)):
-             gr.Markdown(f"## {translations['convert_with_whisper']}")
-             convert_with_whisper_tab()
-
-         with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)):
-             gr.Markdown(translations["convert_text_markdown"])
-             convert_tts_tab()
 
main/app/tabs/training/child/create_dataset.py DELETED
@@ -1,71 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.core.training import create_dataset
- from main.app.core.ui import visible, valueEmpty_visible1
- from main.app.variables import translations, sample_rate_choice
-
- def create_dataset_tab():
-     with gr.Row():
-         gr.Markdown(translations["create_dataset_markdown_2"])
-     with gr.Row():
-         dataset_url = gr.Textbox(label=translations["url_audio"], info=translations["create_dataset_url"], value="", placeholder="https://www.youtube.com/...", interactive=True)
-         output_dataset = gr.Textbox(label=translations["output_data"], info=translations["output_data_info"], value="dataset", placeholder="dataset", interactive=True)
-     with gr.Row():
-         with gr.Column():
-             with gr.Group():
-                 with gr.Row():
-                     separator_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True)
-                     denoise_mdx = gr.Checkbox(label=translations["denoise"], value=False, interactive=True)
-                 with gr.Row():
-                     kim_vocal_version = gr.Radio(label=translations["model_ver"], info=translations["model_ver_info"], choices=["Version-1", "Version-2"], value="Version-2", interactive=True)
-                     kim_vocal_overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
-                 with gr.Row():
-                     kim_vocal_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=8192, value=1024, step=1, interactive=True)
-                     kim_vocal_batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True)
-                 with gr.Row():
-                     kim_vocal_segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
-                 with gr.Row():
-                     sample_rate0 = gr.Radio(choices=sample_rate_choice, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True)
-         with gr.Column():
-             create_button = gr.Button(translations["createdataset"], variant="primary", scale=2, min_width=4000)
-             with gr.Group():
-                 with gr.Row():
-                     clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
-                     skip = gr.Checkbox(label=translations["skip"], value=False, interactive=True)
-                 with gr.Row():
-                     dataset_clean_strength = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.5, label=translations["clean_strength"], info=translations["clean_strength_info"], interactive=True, visible=clean_audio.value)
-                 with gr.Row():
-                     skip_start = gr.Textbox(label=translations["skip_start"], info=translations["skip_start_info"], value="", placeholder="0,...", interactive=True, visible=skip.value)
-                     skip_end = gr.Textbox(label=translations["skip_end"], info=translations["skip_end_info"], value="", placeholder="0,...", interactive=True, visible=skip.value)
-             create_dataset_info = gr.Textbox(label=translations["create_dataset_info"], value="", interactive=False)
-     with gr.Row():
-         clean_audio.change(fn=visible, inputs=[clean_audio], outputs=[dataset_clean_strength])
-         skip.change(fn=lambda a: [valueEmpty_visible1(a)]*2, inputs=[skip], outputs=[skip_start, skip_end])
-     with gr.Row():
-         create_button.click(
-             fn=create_dataset,
-             inputs=[
-                 dataset_url,
-                 output_dataset,
-                 clean_audio,
-                 dataset_clean_strength,
-                 separator_reverb,
-                 kim_vocal_version,
-                 kim_vocal_overlap,
-                 kim_vocal_segments_size,
-                 denoise_mdx,
-                 skip,
-                 skip_start,
-                 skip_end,
-                 kim_vocal_hop_length,
-                 kim_vocal_batch_size,
-                 sample_rate0
-             ],
-             outputs=[create_dataset_info],
-             api_name="create_dataset"
-         )
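The `visible` and `valueEmpty_visible1` helpers imported above from main.app.core.ui are not part of this diff; a minimal sketch of what they presumably return, inferred only from how they are wired to `.change()` events here (implementation not confirmed by this commit):

    import gradio as gr

    def visible(state: bool):
        # Toggle a component's visibility from a checkbox value.
        return gr.update(visible=state)

    def valueEmpty_visible1(state: bool):
        # Clear the textbox value and show/hide it in a single update.
        return gr.update(value="", visible=state)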
main/app/tabs/training/child/training.py DELETED
@@ -1,237 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.core.process import zip_file
- from main.app.core.training import preprocess, extract, create_index, training
- from main.app.variables import translations, model_name, index_path, method_f0, embedders_mode, embedders_model, pretrainedD, pretrainedG, config
- from main.app.core.ui import gr_warning, visible, unlock_f0, hoplength_show, change_models_choices, get_gpu_info, visible_embedders, pitch_guidance_lock, vocoders_lock, unlock_ver, unlock_vocoder, change_pretrained_choices, gpu_number_str, shutil_move
-
- def training_model_tab():
-     with gr.Row():
-         gr.Markdown(translations["training_markdown"])
-     with gr.Row():
-         with gr.Column():
-             with gr.Row():
-                 with gr.Column():
-                     training_name = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
-                     training_sr = gr.Radio(label=translations["sample_rate"], info=translations["sample_rate_info"], choices=["32k", "40k", "48k"], value="48k", interactive=True)
-                     training_ver = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True)
-                     with gr.Row():
-                         clean_dataset = gr.Checkbox(label=translations["clear_dataset"], value=False, interactive=True)
-                         preprocess_cut = gr.Checkbox(label=translations["split_audio"], value=True, interactive=True)
-                         process_effects = gr.Checkbox(label=translations["preprocess_effect"], value=False, interactive=True)
-                         checkpointing1 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
-                         training_f0 = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True)
-                         upload = gr.Checkbox(label=translations["upload_dataset"], value=False, interactive=True)
-                     with gr.Row():
-                         clean_dataset_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.7, step=0.1, interactive=True, visible=clean_dataset.value)
-                 with gr.Column():
-                     preprocess_button = gr.Button(translations["preprocess_button"], scale=2)
-                     upload_dataset = gr.Files(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"], visible=upload.value)
-                     preprocess_info = gr.Textbox(label=translations["preprocess_info"], value="", interactive=False)
-         with gr.Column():
-             with gr.Row():
-                 with gr.Column():
-                     with gr.Accordion(label=translations["f0_method"], open=False):
-                         with gr.Group():
-                             with gr.Row():
-                                 onnx_f0_mode2 = gr.Checkbox(label=translations["f0_onnx_mode"], value=False, interactive=True)
-                                 unlock_full_method4 = gr.Checkbox(label=translations["f0_unlock"], value=False, interactive=True)
-                                 autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
-                             extract_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
-                             extract_hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=extract_method.value == "hybrid")
-                             extract_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
-                             f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
-                     with gr.Accordion(label=translations["hubert_model"], open=False):
-                         with gr.Group():
-                             embed_mode2 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
-                             extract_embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
-                             with gr.Row():
-                                 extract_embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=extract_embedders.value == "custom")
-                 with gr.Column():
-                     extract_button = gr.Button(translations["extract_button"], scale=2)
-                     extract_info = gr.Textbox(label=translations["extract_info"], value="", interactive=False)
-         with gr.Column():
-             with gr.Row():
-                 with gr.Column():
-                     total_epochs = gr.Slider(label=translations["total_epoch"], info=translations["total_epoch_info"], minimum=1, maximum=10000, value=300, step=1, interactive=True)
-                     save_epochs = gr.Slider(label=translations["save_epoch"], info=translations["save_epoch_info"], minimum=1, maximum=10000, value=50, step=1, interactive=True)
-                 with gr.Column():
-                     index_button = gr.Button(f"3. {translations['create_index']}", variant="primary", scale=2)
-                     training_button = gr.Button(f"4. {translations['training_model']}", variant="primary", scale=2)
-     with gr.Row():
-         with gr.Accordion(label=translations["setting"], open=False):
-             with gr.Row():
-                 index_algorithm = gr.Radio(label=translations["index_algorithm"], info=translations["index_algorithm_info"], choices=["Auto", "Faiss", "KMeans"], value="Auto", interactive=True)
-             with gr.Row():
-                 cache_in_gpu = gr.Checkbox(label=translations["cache_in_gpu"], info=translations["cache_in_gpu_info"], value=True, interactive=True)
-                 rms_extract = gr.Checkbox(label=translations["train&energy"], info=translations["train&energy_info"], value=False, interactive=True)
-                 overtraining_detector = gr.Checkbox(label=translations["overtraining_detector"], info=translations["overtraining_detector_info"], value=False, interactive=True)
-             with gr.Row():
-                 custom_dataset = gr.Checkbox(label=translations["custom_dataset"], info=translations["custom_dataset_info"], value=False, interactive=True)
-                 save_only_latest = gr.Checkbox(label=translations["save_only_latest"], info=translations["save_only_latest_info"], value=True, interactive=True)
-                 save_every_weights = gr.Checkbox(label=translations["save_every_weights"], info=translations["save_every_weights_info"], value=True, interactive=True)
-             with gr.Row():
-                 clean_up = gr.Checkbox(label=translations["cleanup_training"], info=translations["cleanup_training_info"], value=False, interactive=True)
-                 not_use_pretrain = gr.Checkbox(label=translations["not_use_pretrain_2"], info=translations["not_use_pretrain_info"], value=False, interactive=True)
-                 custom_pretrain = gr.Checkbox(label=translations["custom_pretrain"], info=translations["custom_pretrain_info"], value=False, interactive=True)
-             with gr.Column():
-                 dataset_path = gr.Textbox(label=translations["dataset_folder"], value="dataset", interactive=True, visible=custom_dataset.value)
-             with gr.Column():
-                 threshold = gr.Slider(minimum=1, maximum=100, value=50, step=1, label=translations["threshold"], interactive=True, visible=overtraining_detector.value)
-             with gr.Accordion(translations["setting_cpu_gpu"], open=False):
-                 with gr.Column():
-                     gpu_number = gr.Textbox(label=translations["gpu_number"], value=gpu_number_str(), info=translations["gpu_number_info"], interactive=True)
-                     gpu_info = gr.Textbox(label=translations["gpu_info"], value=get_gpu_info(), info=translations["gpu_info_2"], interactive=False)
-                     cpu_core = gr.Slider(label=translations["cpu_core"], info=translations["cpu_core_info"], minimum=1, maximum=os.cpu_count(), value=os.cpu_count(), step=1, interactive=True)
-                     train_batch_size = gr.Slider(label=translations["batch_size"], info=translations["batch_size_info"], minimum=1, maximum=64, value=8, step=1, interactive=True)
-             with gr.Row():
-                 vocoders = gr.Radio(label=translations["vocoder"], info=translations["vocoder_info"], choices=["Default", "MRF-HiFi-GAN", "RefineGAN"], value="Default", interactive=True)
-             with gr.Row():
-                 deterministic = gr.Checkbox(label=translations["deterministic"], info=translations["deterministic_info"], value=False, interactive=config.device.startswith("cuda"))
-                 benchmark = gr.Checkbox(label=translations["benchmark"], info=translations["benchmark_info"], value=False, interactive=config.device.startswith("cuda"))
-             with gr.Row():
-                 optimizer = gr.Radio(label=translations["optimizer"], info=translations["optimizer_info"], value="AdamW", choices=["AdamW", "RAdam"], interactive=True)
-             with gr.Row():
-                 model_author = gr.Textbox(label=translations["training_author"], info=translations["training_author_info"], value="", placeholder=translations["training_author"], interactive=True)
-     with gr.Row():
-         with gr.Column():
-             with gr.Accordion(translations["custom_pretrain_info"], open=False, visible=custom_pretrain.value and not not_use_pretrain.value) as pretrain_setting:
-                 pretrained_D = gr.Dropdown(label=translations["pretrain_file"].format(dg="D"), choices=pretrainedD, value=pretrainedD[0] if len(pretrainedD) > 0 else '', interactive=True, allow_custom_value=True)
-                 pretrained_G = gr.Dropdown(label=translations["pretrain_file"].format(dg="G"), choices=pretrainedG, value=pretrainedG[0] if len(pretrainedG) > 0 else '', interactive=True, allow_custom_value=True)
-                 refresh_pretrain = gr.Button(translations["refresh"], scale=2)
-     with gr.Row():
-         training_info = gr.Textbox(label=translations["train_info"], value="", interactive=False)
-     with gr.Row():
-         with gr.Column():
-             with gr.Accordion(translations["export_model"], open=False):
-                 with gr.Row():
-                     model_file = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
-                     index_file = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
-                 with gr.Row():
-                     refresh_file = gr.Button(f"1. {translations['refresh']}", scale=2)
-                     zip_model = gr.Button(translations["zip_model"], variant="primary", scale=2)
-                 with gr.Row():
-                     zip_output = gr.File(label=translations["output_zip"], file_types=[".zip"], interactive=False, visible=False)
-     with gr.Row():
-         vocoders.change(fn=pitch_guidance_lock, inputs=[vocoders], outputs=[training_f0])
-         training_f0.change(fn=vocoders_lock, inputs=[training_f0, vocoders], outputs=[vocoders])
-         unlock_full_method4.change(fn=unlock_f0, inputs=[unlock_full_method4], outputs=[extract_method])
-     with gr.Row():
-         refresh_file.click(fn=change_models_choices, inputs=[], outputs=[model_file, index_file])
-         zip_model.click(fn=zip_file, inputs=[training_name, model_file, index_file], outputs=[zip_output])
-         dataset_path.change(fn=lambda folder: os.makedirs(folder, exist_ok=True), inputs=[dataset_path], outputs=[])
-     with gr.Row():
-         upload.change(fn=visible, inputs=[upload], outputs=[upload_dataset])
-         overtraining_detector.change(fn=visible, inputs=[overtraining_detector], outputs=[threshold])
-         clean_dataset.change(fn=visible, inputs=[clean_dataset], outputs=[clean_dataset_strength])
-     with gr.Row():
-         custom_dataset.change(fn=lambda custom_dataset: [visible(custom_dataset), "dataset"],inputs=[custom_dataset], outputs=[dataset_path, dataset_path])
-         training_ver.change(fn=unlock_vocoder, inputs=[training_ver, vocoders], outputs=[vocoders])
-         vocoders.change(fn=unlock_ver, inputs=[training_ver, vocoders], outputs=[training_ver])
-     with gr.Row():
-         extract_method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[extract_method, extract_hybrid_method], outputs=[extract_hybrid_method, extract_hop_length])
-         extract_hybrid_method.change(fn=hoplength_show, inputs=[extract_method, extract_hybrid_method], outputs=[extract_hop_length])
-     with gr.Row():
-         autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
-         upload_dataset.upload(
-             fn=lambda files, folder: [shutil_move(f.name, os.path.join(folder, os.path.split(f.name)[1])) for f in files] if folder != "" else gr_warning(translations["dataset_folder1"]),
-             inputs=[upload_dataset, dataset_path],
-             outputs=[],
-             api_name="upload_dataset"
-         )
-     with gr.Row():
-         not_use_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
-         custom_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
-         refresh_pretrain.click(fn=change_pretrained_choices, inputs=[], outputs=[pretrained_D, pretrained_G])
-     with gr.Row():
-         preprocess_button.click(
-             fn=preprocess,
-             inputs=[
-                 training_name,
-                 training_sr,
-                 cpu_core,
-                 preprocess_cut,
-                 process_effects,
-                 dataset_path,
-                 clean_dataset,
-                 clean_dataset_strength
-             ],
-             outputs=[preprocess_info],
-             api_name="preprocess"
-         )
-     with gr.Row():
-         embed_mode2.change(fn=visible_embedders, inputs=[embed_mode2], outputs=[extract_embedders])
-         extract_embedders.change(fn=lambda extract_embedders: visible(extract_embedders == "custom"), inputs=[extract_embedders], outputs=[extract_embedders_custom])
-     with gr.Row():
-         extract_button.click(
-             fn=extract,
-             inputs=[
-                 training_name,
-                 training_ver,
-                 extract_method,
-                 training_f0,
-                 extract_hop_length,
-                 cpu_core,
-                 gpu_number,
-                 training_sr,
-                 extract_embedders,
-                 extract_embedders_custom,
-                 onnx_f0_mode2,
-                 embed_mode2,
-                 autotune,
-                 f0_autotune_strength,
-                 extract_hybrid_method,
-                 rms_extract
-             ],
-             outputs=[extract_info],
-             api_name="extract"
-         )
-     with gr.Row():
-         index_button.click(
-             fn=create_index,
-             inputs=[
-                 training_name,
-                 training_ver,
-                 index_algorithm
-             ],
-             outputs=[training_info],
-             api_name="create_index"
-         )
-     with gr.Row():
-         training_button.click(
-             fn=training,
-             inputs=[
-                 training_name,
-                 training_ver,
-                 save_epochs,
-                 save_only_latest,
-                 save_every_weights,
-                 total_epochs,
-                 training_sr,
-                 train_batch_size,
-                 gpu_number,
-                 training_f0,
-                 not_use_pretrain,
-                 custom_pretrain,
-                 pretrained_G,
-                 pretrained_D,
-                 overtraining_detector,
-                 threshold,
-                 clean_up,
-                 cache_in_gpu,
-                 model_author,
-                 vocoders,
-                 checkpointing1,
-                 deterministic,
-                 benchmark,
-                 optimizer,
-                 rms_extract
-             ],
-             outputs=[training_info],
-             api_name="training_model"
-         )
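Because every `.click()` above registers an `api_name`, the Gradio app also exposed these handlers as API endpoints. A hedged usage sketch with gradio_client (endpoint name and argument order taken from the `index_button.click` wiring above; host/port assumed from config.json's app_port):

    from gradio_client import Client

    client = Client("http://127.0.0.1:7860")  # app_port in config.json

    # Trigger index creation for a trained model; arguments are assumed
    # to follow the inputs list [training_name, training_ver, index_algorithm].
    result = client.predict("my_model", "v2", "Auto", api_name="/create_index")
    print(result)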
main/app/tabs/training/training.py DELETED
@@ -1,20 +0,0 @@
- import os
- import sys
-
- import gradio as gr
-
- sys.path.append(os.getcwd())
-
- from main.app.variables import translations, configs
- from main.app.tabs.training.child.training import training_model_tab
- from main.app.tabs.training.child.create_dataset import create_dataset_tab
-
- def training_tab():
-     with gr.TabItem(translations["training_model"], visible=configs.get("create_and_training_tab", True)):
-         with gr.TabItem(translations["createdataset"], visible=configs.get("create_dataset_tab", True)):
-             gr.Markdown(translations["create_dataset_markdown"])
-             create_dataset_tab()
-
-         with gr.TabItem(translations["training_model"], visible=configs.get("training_tab", True)):
-             gr.Markdown(f"## {translations['training_model']}")
-             training_model_tab()
main/app/variables.py DELETED
@@ -1,106 +0,0 @@
- import os
- import sys
- import csv
- import json
- import codecs
- import logging
- import urllib.request
- import logging.handlers
-
- sys.path.append(os.getcwd())
-
- from main.configs.config import Config
-
- logger = logging.getLogger(__name__)
- logger.propagate = False
-
- config = Config()
- python = sys.executable
- translations = config.translations
- configs_json = os.path.join("main", "configs", "config.json")
- configs = json.load(open(configs_json, "r"))
-
- if not logger.hasHandlers():
-     console_handler = logging.StreamHandler()
-     console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-     console_handler.setFormatter(console_formatter)
-     console_handler.setLevel(logging.DEBUG if config.debug_mode else logging.INFO)
-     file_handler = logging.handlers.RotatingFileHandler(os.path.join(configs["logs_path"], "app.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
-     file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-     file_handler.setFormatter(file_formatter)
-     file_handler.setLevel(logging.DEBUG)
-     logger.addHandler(console_handler)
-     logger.addHandler(file_handler)
-     logger.setLevel(logging.DEBUG)
-
- if config.device in ["cpu", "mps", "ocl:0"] and configs.get("fp16", False):
-     logger.warning(translations["fp16_not_support"])
-     configs["fp16"] = config.is_half = False
-
-     with open(configs_json, "w") as f:
-         json.dump(configs, f, indent=4)
-
- models = {}
- model_options = {}
-
- method_f0 = ["mangio-crepe-full", "crepe-full", "fcpe", "rmvpe", "harvest", "pyin", "hybrid"]
- method_f0_full = ["pm-ac", "pm-cc", "pm-shs", "dio", "mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "crepe-tiny", "crepe-small", "crepe-medium", "crepe-large", "crepe-full", "fcpe", "fcpe-legacy", "rmvpe", "rmvpe-legacy", "harvest", "yin", "pyin", "swipe", "piptrack", "fcn", "hybrid"]
-
- embedders_mode = ["fairseq", "onnx", "transformers", "spin"]
- embedders_model = ["contentvec_base", "hubert_base", "vietnamese_hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "custom"]
-
- paths_for_files = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["audios_path"]) for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")])
-
- model_name = sorted(list(model for model in os.listdir(configs["weights_path"]) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_")))
- index_path = sorted([os.path.join(root, name) for root, _, files in os.walk(configs["logs_path"], topdown=False) for name in files if name.endswith(".index") and "trained" not in name])
-
- pretrainedD = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "D" in model]
- pretrainedG = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "G" in model]
-
- presets_file = sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".conversion.json")))
- audio_effect_presets_file = sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".effect.json")))
- f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["f0_path"]) for f in files if f.endswith(".txt")])
-
- language = configs.get("language", "vi-VN")
- theme = configs.get("theme", "NoCrypt/miku")
-
- edgetts = configs.get("edge_tts", ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"])
- google_tts_voice = configs.get("google_tts_voice", ["vi", "en"])
-
- mdx_model = configs.get("mdx_model", "MDXNET_Main")
- uvr_model = configs.get("demucs_model", "HD_MMI") + mdx_model
-
- font = configs.get("font", "https://fonts.googleapis.com/css2?family=Courgette&display=swap")
- sample_rate_choice = [8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000]
- csv_path = configs["csv_path"]
-
- if "--allow_all_disk" in sys.argv and sys.platform == "win32":
-     try:
-         import win32api
-     except:
-         os.system(f"{python} -m pip install pywin32")
-         import win32api
-
-     allow_disk = win32api.GetLogicalDriveStrings().split('\x00')[:-1]
- else: allow_disk = []
-
- try:
-     if os.path.exists(csv_path): reader = list(csv.DictReader(open(csv_path, newline='', encoding='utf-8')))
-     else:
-         reader = list(csv.DictReader([line.decode('utf-8') for line in urllib.request.urlopen(codecs.decode("uggcf://qbpf.tbbtyr.pbz/fcernqfurrgf/q/1gNHnDeRULtEfz1Yieaw14USUQjWJy0Oq9k0DrCrjApb/rkcbeg?sbezng=pfi&tvq=1977693859", "rot13")).readlines()]))
-         writer = csv.DictWriter(open(csv_path, mode='w', newline='', encoding='utf-8'), fieldnames=reader[0].keys())
-         writer.writeheader()
-         writer.writerows(reader)
-
-     for row in reader:
-         filename = row['Filename']
-         url = None
-
-         for value in row.values():
-             if isinstance(value, str) and "huggingface" in value:
-                 url = value
-                 break
-
-         if url: models[filename] = url
- except:
-     pass
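Note that the spreadsheet URL in this file is stored rot13-encoded and only decoded at runtime with the stdlib codecs module. A minimal sketch of just that decode step, using a hypothetical placeholder URL rather than the real one:

    import codecs

    encoded = "uggcf://rknzcyr.pbz/zbqryf.pfi"  # hypothetical rot13-encoded URL
    print(codecs.decode(encoded, "rot13"))      # -> https://example.com/models.csv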
main/configs/config.json DELETED
@@ -1,584 +0,0 @@
- {
-     "language": "vi-VN",
-     "support_language": [
-         "en-US",
-         "vi-VN"
-     ],
-     "theme": "NoCrypt/miku",
-     "themes": [
-         "NoCrypt/miku",
-         "gstaff/xkcd",
-         "JohnSmith9982/small_and_pretty",
-         "ParityError/Interstellar",
-         "earneleh/paris",
-         "shivi/calm_seafoam",
-         "Hev832/Applio",
-         "YTheme/Minecraft",
-         "gstaff/sketch",
-         "SebastianBravo/simci_css",
-         "allenai/gradio-theme",
-         "Nymbo/Nymbo_Theme_5",
-         "lone17/kotaemon",
-         "Zarkel/IBM_Carbon_Theme",
-         "SherlockRamos/Feliz",
-         "freddyaboulton/dracula_revamped",
-         "freddyaboulton/bad-theme-space",
-         "gradio/dracula_revamped",
-         "abidlabs/dracula_revamped",
-         "gradio/dracula_test",
-         "gradio/seafoam",
-         "gradio/glass",
-         "gradio/monochrome",
-         "gradio/soft",
-         "gradio/default",
-         "gradio/base",
-         "abidlabs/pakistan",
-         "dawood/microsoft_windows",
-         "ysharma/steampunk",
-         "ysharma/huggingface",
-         "abidlabs/Lime",
-         "freddyaboulton/this-theme-does-not-exist-2",
-         "aliabid94/new-theme",
-         "aliabid94/test2",
-         "aliabid94/test3",
-         "aliabid94/test4",
-         "abidlabs/banana",
-         "freddyaboulton/test-blue",
-         "gstaff/whiteboard",
-         "ysharma/llamas",
-         "abidlabs/font-test",
-         "YenLai/Superhuman",
-         "bethecloud/storj_theme",
-         "sudeepshouche/minimalist",
-         "knotdgaf/gradiotest",
-         "ParityError/Anime",
-         "Ajaxon6255/Emerald_Isle",
-         "ParityError/LimeFace",
-         "finlaymacklon/smooth_slate",
-         "finlaymacklon/boxy_violet",
-         "derekzen/stardust",
-         "EveryPizza/Cartoony-Gradio-Theme",
-         "Ifeanyi/Cyanister",
-         "Tshackelton/IBMPlex-DenseReadable",
-         "snehilsanyal/scikit-learn",
-         "Himhimhim/xkcd",
-         "nota-ai/theme",
-         "rawrsor1/Everforest",
-         "rottenlittlecreature/Moon_Goblin",
-         "abidlabs/test-yellow",
-         "abidlabs/test-yellow3",
-         "idspicQstitho/dracula_revamped",
-         "kfahn/AnimalPose",
-         "HaleyCH/HaleyCH_Theme",
-         "simulKitke/dracula_test",
-         "braintacles/CrimsonNight",
-         "wentaohe/whiteboardv2",
-         "reilnuud/polite",
-         "remilia/Ghostly",
-         "Franklisi/darkmode",
-         "coding-alt/soft",
-         "xiaobaiyuan/theme_land",
-         "step-3-profit/Midnight-Deep",
-         "xiaobaiyuan/theme_demo",
-         "Taithrah/Minimal",
-         "Insuz/SimpleIndigo",
-         "zkunn/Alipay_Gradio_theme",
-         "Insuz/Mocha",
-         "xiaobaiyuan/theme_brief",
-         "Ama434/434-base-Barlow",
-         "Ama434/def_barlow",
-         "Ama434/neutral-barlow",
-         "dawood/dracula_test",
-         "nuttea/Softblue",
-         "BlueDancer/Alien_Diffusion",
-         "naughtondale/monochrome",
-         "Dagfinn1962/standard",
-         "default"
-     ],
-     "mdx_model": [
-         "Main_340",
-         "Main_390",
-         "Main_406",
-         "Main_427",
-         "Main_438",
-         "Inst_full_292",
-         "Inst_HQ_1",
-         "Inst_HQ_2",
-         "Inst_HQ_3",
-         "Inst_HQ_4",
-         "Inst_HQ_5",
-         "Kim_Vocal_1",
-         "Kim_Vocal_2",
-         "Kim_Inst",
-         "Inst_187_beta",
-         "Inst_82_beta",
-         "Inst_90_beta",
-         "Voc_FT",
-         "Crowd_HQ",
-         "Inst_1",
-         "Inst_2",
-         "Inst_3",
-         "MDXNET_1_9703",
-         "MDXNET_2_9682",
-         "MDXNET_3_9662",
-         "Inst_Main",
-         "MDXNET_Main",
-         "MDXNET_9482"
-     ],
-     "demucs_model": [
-         "HT-Normal",
-         "HT-Tuned",
-         "HD_MMI",
-         "HT_6S"
-     ],
-     "edge_tts": [
-         "af-ZA-AdriNeural",
-         "af-ZA-WillemNeural",
-         "sq-AL-AnilaNeural",
-         "sq-AL-IlirNeural",
-         "am-ET-AmehaNeural",
-         "am-ET-MekdesNeural",
-         "ar-DZ-AminaNeural",
-         "ar-DZ-IsmaelNeural",
-         "ar-BH-AliNeural",
-         "ar-BH-LailaNeural",
-         "ar-EG-SalmaNeural",
-         "ar-EG-ShakirNeural",
-         "ar-IQ-BasselNeural",
-         "ar-IQ-RanaNeural",
-         "ar-JO-SanaNeural",
-         "ar-JO-TaimNeural",
-         "ar-KW-FahedNeural",
-         "ar-KW-NouraNeural",
-         "ar-LB-LaylaNeural",
-         "ar-LB-RamiNeural",
-         "ar-LY-ImanNeural",
-         "ar-LY-OmarNeural",
-         "ar-MA-JamalNeural",
-         "ar-MA-MounaNeural",
-         "ar-OM-AbdullahNeural",
-         "ar-OM-AyshaNeural",
-         "ar-QA-AmalNeural",
-         "ar-QA-MoazNeural",
-         "ar-SA-HamedNeural",
-         "ar-SA-ZariyahNeural",
-         "ar-SY-AmanyNeural",
-         "ar-SY-LaithNeural",
-         "ar-TN-HediNeural",
-         "ar-TN-ReemNeural",
-         "ar-AE-FatimaNeural",
-         "ar-AE-HamdanNeural",
-         "ar-YE-MaryamNeural",
-         "ar-YE-SalehNeural",
-         "az-AZ-BabekNeural",
-         "az-AZ-BanuNeural",
-         "bn-BD-NabanitaNeural",
-         "bn-BD-PradeepNeural",
-         "bn-IN-BashkarNeural",
-         "bn-IN-TanishaaNeural",
-         "bs-BA-GoranNeural",
-         "bs-BA-VesnaNeural",
-         "bg-BG-BorislavNeural",
-         "bg-BG-KalinaNeural",
-         "my-MM-NilarNeural",
-         "my-MM-ThihaNeural",
-         "ca-ES-EnricNeural",
-         "ca-ES-JoanaNeural",
-         "zh-HK-HiuGaaiNeural",
-         "zh-HK-HiuMaanNeural",
-         "zh-HK-WanLungNeural",
-         "zh-CN-XiaoxiaoNeural",
-         "zh-CN-XiaoyiNeural",
-         "zh-CN-YunjianNeural",
-         "zh-CN-YunxiNeural",
-         "zh-CN-YunxiaNeural",
-         "zh-CN-YunyangNeural",
-         "zh-CN-liaoning-XiaobeiNeural",
-         "zh-TW-HsiaoChenNeural",
-         "zh-TW-YunJheNeural",
-         "zh-TW-HsiaoYuNeural",
-         "zh-CN-shaanxi-XiaoniNeural",
-         "hr-HR-GabrijelaNeural",
-         "hr-HR-SreckoNeural",
-         "cs-CZ-AntoninNeural",
-         "cs-CZ-VlastaNeural",
-         "da-DK-ChristelNeural",
-         "da-DK-JeppeNeural",
-         "nl-BE-ArnaudNeural",
-         "nl-BE-DenaNeural",
-         "nl-NL-ColetteNeural",
-         "nl-NL-FennaNeural",
-         "nl-NL-MaartenNeural",
-         "en-AU-NatashaNeural",
-         "en-AU-WilliamNeural",
-         "en-CA-ClaraNeural",
-         "en-CA-LiamNeural",
-         "en-HK-SamNeural",
-         "en-HK-YanNeural",
-         "en-IN-NeerjaExpressiveNeural",
-         "en-IN-NeerjaNeural",
-         "en-IN-PrabhatNeural",
-         "en-IE-ConnorNeural",
-         "en-IE-EmilyNeural",
-         "en-KE-AsiliaNeural",
-         "en-KE-ChilembaNeural",
-         "en-NZ-MitchellNeural",
-         "en-NZ-MollyNeural",
-         "en-NG-AbeoNeural",
-         "en-NG-EzinneNeural",
-         "en-PH-JamesNeural",
-         "en-PH-RosaNeural",
-         "en-SG-LunaNeural",
-         "en-SG-WayneNeural",
-         "en-ZA-LeahNeural",
-         "en-ZA-LukeNeural",
-         "en-TZ-ElimuNeural",
-         "en-TZ-ImaniNeural",
-         "en-GB-LibbyNeural",
-         "en-GB-MaisieNeural",
-         "en-GB-RyanNeural",
-         "en-GB-SoniaNeural",
-         "en-GB-ThomasNeural",
-         "en-US-AvaMultilingualNeural",
-         "en-US-AndrewMultilingualNeural",
-         "en-US-EmmaMultilingualNeural",
-         "en-US-BrianMultilingualNeural",
-         "en-US-AvaNeural",
-         "en-US-AndrewNeural",
-         "en-US-EmmaNeural",
-         "en-US-BrianNeural",
-         "en-US-AnaNeural",
-         "en-US-AriaNeural",
-         "en-US-ChristopherNeural",
-         "en-US-EricNeural",
-         "en-US-GuyNeural",
-         "en-US-JennyNeural",
-         "en-US-MichelleNeural",
-         "en-US-RogerNeural",
-         "en-US-SteffanNeural",
-         "et-EE-AnuNeural",
-         "et-EE-KertNeural",
-         "fil-PH-AngeloNeural",
-         "fil-PH-BlessicaNeural",
-         "fi-FI-HarriNeural",
-         "fi-FI-NooraNeural",
-         "fr-BE-CharlineNeural",
-         "fr-BE-GerardNeural",
-         "fr-CA-ThierryNeural",
-         "fr-CA-AntoineNeural",
-         "fr-CA-JeanNeural",
-         "fr-CA-SylvieNeural",
-         "fr-FR-VivienneMultilingualNeural",
-         "fr-FR-RemyMultilingualNeural",
-         "fr-FR-DeniseNeural",
-         "fr-FR-EloiseNeural",
-         "fr-FR-HenriNeural",
-         "fr-CH-ArianeNeural",
-         "fr-CH-FabriceNeural",
-         "gl-ES-RoiNeural",
-         "gl-ES-SabelaNeural",
-         "ka-GE-EkaNeural",
-         "ka-GE-GiorgiNeural",
-         "de-AT-IngridNeural",
-         "de-AT-JonasNeural",
-         "de-DE-SeraphinaMultilingualNeural",
-         "de-DE-FlorianMultilingualNeural",
-         "de-DE-AmalaNeural",
-         "de-DE-ConradNeural",
-         "de-DE-KatjaNeural",
-         "de-DE-KillianNeural",
-         "de-CH-JanNeural",
-         "de-CH-LeniNeural",
-         "el-GR-AthinaNeural",
-         "el-GR-NestorasNeural",
-         "gu-IN-DhwaniNeural",
-         "gu-IN-NiranjanNeural",
-         "he-IL-AvriNeural",
-         "he-IL-HilaNeural",
-         "hi-IN-MadhurNeural",
-         "hi-IN-SwaraNeural",
-         "hu-HU-NoemiNeural",
-         "hu-HU-TamasNeural",
-         "is-IS-GudrunNeural",
-         "is-IS-GunnarNeural",
-         "id-ID-ArdiNeural",
-         "id-ID-GadisNeural",
-         "ga-IE-ColmNeural",
-         "ga-IE-OrlaNeural",
-         "it-IT-GiuseppeNeural",
-         "it-IT-DiegoNeural",
-         "it-IT-ElsaNeural",
-         "it-IT-IsabellaNeural",
-         "ja-JP-KeitaNeural",
-         "ja-JP-NanamiNeural",
-         "jv-ID-DimasNeural",
-         "jv-ID-SitiNeural",
-         "kn-IN-GaganNeural",
-         "kn-IN-SapnaNeural",
-         "kk-KZ-AigulNeural",
-         "kk-KZ-DauletNeural",
-         "km-KH-PisethNeural",
-         "km-KH-SreymomNeural",
-         "ko-KR-HyunsuNeural",
-         "ko-KR-InJoonNeural",
-         "ko-KR-SunHiNeural",
-         "lo-LA-ChanthavongNeural",
-         "lo-LA-KeomanyNeural",
-         "lv-LV-EveritaNeural",
-         "lv-LV-NilsNeural",
-         "lt-LT-LeonasNeural",
-         "lt-LT-OnaNeural",
-         "mk-MK-AleksandarNeural",
-         "mk-MK-MarijaNeural",
-         "ms-MY-OsmanNeural",
-         "ms-MY-YasminNeural",
-         "ml-IN-MidhunNeural",
-         "ml-IN-SobhanaNeural",
-         "mt-MT-GraceNeural",
-         "mt-MT-JosephNeural",
-         "mr-IN-AarohiNeural",
-         "mr-IN-ManoharNeural",
-         "mn-MN-BataaNeural",
-         "mn-MN-YesuiNeural",
-         "ne-NP-HemkalaNeural",
-         "ne-NP-SagarNeural",
-         "nb-NO-FinnNeural",
-         "nb-NO-PernilleNeural",
-         "ps-AF-GulNawazNeural",
-         "ps-AF-LatifaNeural",
-         "fa-IR-DilaraNeural",
-         "fa-IR-FaridNeural",
-         "pl-PL-MarekNeural",
-         "pl-PL-ZofiaNeural",
-         "pt-BR-ThalitaNeural",
-         "pt-BR-AntonioNeural",
-         "pt-BR-FranciscaNeural",
-         "pt-PT-DuarteNeural",
-         "pt-PT-RaquelNeural",
-         "ro-RO-AlinaNeural",
-         "ro-RO-EmilNeural",
-         "ru-RU-DmitryNeural",
-         "ru-RU-SvetlanaNeural",
-         "sr-RS-NicholasNeural",
-         "sr-RS-SophieNeural",
-         "si-LK-SameeraNeural",
-         "si-LK-ThiliniNeural",
-         "sk-SK-LukasNeural",
-         "sk-SK-ViktoriaNeural",
-         "sl-SI-PetraNeural",
-         "sl-SI-RokNeural",
-         "so-SO-MuuseNeural",
-         "so-SO-UbaxNeural",
-         "es-AR-ElenaNeural",
-         "es-AR-TomasNeural",
-         "es-BO-MarceloNeural",
-         "es-BO-SofiaNeural",
-         "es-CL-CatalinaNeural",
-         "es-CL-LorenzoNeural",
-         "es-ES-XimenaNeural",
-         "es-CO-GonzaloNeural",
-         "es-CO-SalomeNeural",
-         "es-CR-JuanNeural",
-         "es-CR-MariaNeural",
-         "es-CU-BelkysNeural",
-         "es-CU-ManuelNeural",
-         "es-DO-EmilioNeural",
-         "es-DO-RamonaNeural",
-         "es-EC-AndreaNeural",
-         "es-EC-LuisNeural",
-         "es-SV-LorenaNeural",
-         "es-SV-RodrigoNeural",
-         "es-GQ-JavierNeural",
-         "es-GQ-TeresaNeural",
-         "es-GT-AndresNeural",
-         "es-GT-MartaNeural",
-         "es-HN-CarlosNeural",
-         "es-HN-KarlaNeural",
-         "es-MX-DaliaNeural",
-         "es-MX-JorgeNeural",
-         "es-NI-FedericoNeural",
-         "es-NI-YolandaNeural",
-         "es-PA-MargaritaNeural",
-         "es-PA-RobertoNeural",
-         "es-PY-MarioNeural",
-         "es-PY-TaniaNeural",
-         "es-PE-AlexNeural",
-         "es-PE-CamilaNeural",
-         "es-PR-KarinaNeural",
-         "es-PR-VictorNeural",
-         "es-ES-AlvaroNeural",
-         "es-ES-ElviraNeural",
-         "es-US-AlonsoNeural",
-         "es-US-PalomaNeural",
-         "es-UY-MateoNeural",
-         "es-UY-ValentinaNeural",
-         "es-VE-PaolaNeural",
-         "es-VE-SebastianNeural",
-         "su-ID-JajangNeural",
-         "su-ID-TutiNeural",
-         "sw-KE-RafikiNeural",
-         "sw-KE-ZuriNeural",
-         "sw-TZ-DaudiNeural",
-         "sw-TZ-RehemaNeural",
-         "sv-SE-MattiasNeural",
-         "sv-SE-SofieNeural",
-         "ta-IN-PallaviNeural",
-         "ta-IN-ValluvarNeural",
-         "ta-MY-KaniNeural",
-         "ta-MY-SuryaNeural",
-         "ta-SG-AnbuNeural",
-         "ta-SG-VenbaNeural",
-         "ta-LK-KumarNeural",
-         "ta-LK-SaranyaNeural",
-         "te-IN-MohanNeural",
-         "te-IN-ShrutiNeural",
-         "th-TH-NiwatNeural",
-         "th-TH-PremwadeeNeural",
-         "tr-TR-AhmetNeural",
-         "tr-TR-EmelNeural",
-         "uk-UA-OstapNeural",
-         "uk-UA-PolinaNeural",
-         "ur-IN-GulNeural",
-         "ur-IN-SalmanNeural",
-         "ur-PK-AsadNeural",
-         "ur-PK-UzmaNeural",
-         "uz-UZ-MadinaNeural",
-         "uz-UZ-SardorNeural",
-         "vi-VN-HoaiMyNeural",
-         "vi-VN-NamMinhNeural",
-         "cy-GB-AledNeural",
-         "cy-GB-NiaNeural",
-         "zu-ZA-ThandoNeural",
-         "zu-ZA-ThembaNeural"
-     ],
-     "google_tts_voice": [
-         "af",
-         "am",
-         "ar",
-         "bg",
-         "bn",
-         "bs",
-         "ca",
-         "cs",
-         "cy",
-         "da",
-         "de",
-         "el",
-         "en",
-         "es",
-         "et",
-         "eu",
-         "fi",
-         "fr",
-         "fr-CA",
-         "gl",
-         "gu",
-         "ha",
-         "hi",
-         "hr",
-         "hu",
-         "id",
-         "is",
-         "it",
-         "iw",
-         "ja",
-         "jw",
-         "km",
-         "kn",
-         "ko",
-         "la",
-         "lt",
-         "lv",
-         "ml",
-         "mr",
-         "ms",
-         "my",
-         "ne",
-         "nl",
-         "no",
-         "pa",
-         "pl",
-         "pt",
-         "pt-PT",
-         "ro",
-         "ru",
-         "si",
-         "sk",
-         "sq",
-         "sr",
-         "su",
-         "sv",
-         "sw",
-         "ta",
-         "te",
-         "th",
-         "tl",
-         "tr",
-         "uk",
-         "ur",
-         "vi",
-         "yue",
-         "zh-CN",
-         "zh-TW",
-         "zh"
-     ],
-     "fp16": false,
-     "editing_tab": true,
-     "inference_tab": true,
-     "create_and_training_tab": true,
-     "extra_tab": true,
-     "separator_tab": true,
-     "convert_tab": true,
-     "convert_with_whisper": true,
-     "tts_tab": true,
-     "effects_tab": true,
-     "quirk": true,
-     "create_dataset_tab": true,
-     "training_tab": true,
-     "fushion_tab": true,
-     "read_tab": true,
-     "onnx_tab": true,
-     "downloads_tab": true,
-     "f0_extractor_tab": true,
-     "settings_tab": true,
-     "report_bug_tab": false,
-     "font": "https://fonts.googleapis.com/css2?family=Roboto&display=swap",
-     "app_port": 7860,
-     "tensorboard_port": 6870,
-     "num_of_restart": 5,
-     "server_name": "0.0.0.0",
-     "app_show_error": true,
-     "delete_exists_file": false,
-     "audio_effects_path": "main/inference/audio_effects.py",
-     "convert_path": "main/inference/conversion/convert.py",
-     "separate_path": "main/inference/separator_music.py",
-     "create_dataset_path": "main/inference/create_dataset.py",
-     "preprocess_path": "main/inference/preprocess/preprocess.py",
-     "extract_path": "main/inference/extracting/extract.py",
-     "create_index_path": "main/inference/create_index.py",
-     "train_path": "main/inference/training/train.py",
-     "ico_path": "assets/ico.png",
-     "csv_path": "assets/spreadsheet.csv",
-     "weights_path": "assets/weights",
-     "logs_path": "assets/logs",
-     "binary_path": "assets/binary",
-     "f0_path": "assets/f0",
-     "language_path": "assets/languages",
-     "presets_path": "assets/presets",
-     "embedders_path": "assets/models/embedders",
-     "predictors_path": "assets/models/predictors",
-     "pretrained_custom_path": "assets/models/pretrained_custom",
-     "pretrained_v1_path": "assets/models/pretrained_v1",
-     "pretrained_v2_path": "assets/models/pretrained_v2",
-     "speaker_diarization_path": "assets/models/speaker_diarization",
-     "uvr5_path": "assets/models/uvr5",
-     "audios_path": "audios",
-     "demucs_segments_enable": true,
-     "demucs_cpu_mode": false,
-     "limit_f0": 8,
-     "debug_mode": false,
-     "pretrain_verify_shape": true,
-     "pretrain_strict": true,
-     "cpu_mode": false,
-     "brain": false
- }
main/configs/config.py DELETED
@@ -1,101 +0,0 @@
- import os
- import sys
- import json
- import torch
-
- sys.path.append(os.getcwd())
-
- from main.library import opencl
-
- version_config_paths = [os.path.join(version, size) for version in ["v1", "v2"] for size in ["32000.json", "40000.json", "48000.json"]]
-
- def singleton(cls):
-     instances = {}
-
-     def get_instance(*args, **kwargs):
-         if cls not in instances: instances[cls] = cls(*args, **kwargs)
-         return instances[cls]
-
-     return get_instance
-
- @singleton
- class Config:
-     def __init__(self):
-         self.device = "cuda:0" if torch.cuda.is_available() else ("ocl:0" if opencl.is_available() else "cpu")
-         self.configs_path = os.path.join("main", "configs", "config.json")
-         self.configs = json.load(open(self.configs_path, "r"))
-         self.translations = self.multi_language()
-         self.json_config = self.load_config_json()
-         self.gpu_mem = None
-         self.per_preprocess = 3.7
-         self.is_half = self.is_fp16()
-         self.brain = self.configs.get("brain", False)
-         self.cpu_mode = self.configs.get("cpu_mode", False)
-         if self.cpu_mode: self.device = "cpu"
-         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
-         self.debug_mode = self.configs.get("debug_mode", False)
-
-     def multi_language(self):
-         try:
-             lang = self.configs.get("language", "vi-VN")
-             if len([l for l in os.listdir(self.configs["language_path"]) if l.endswith(".json")]) < 1: raise FileNotFoundError("Không tìm thấy bất cứ gói ngôn ngữ nào(No package languages found)")
-
-             if not lang: lang = "vi-VN"
-             if lang not in self.configs["support_language"]: raise ValueError("Ngôn ngữ không được hỗ trợ(Language not supported)")
-
-             lang_path = os.path.join(self.configs["language_path"], f"{lang}.json")
-             if not os.path.exists(lang_path): lang_path = os.path.join(self.configs["language_path"], "vi-VN.json")
-
-             with open(lang_path, encoding="utf-8") as f:
-                 translations = json.load(f)
-         except json.JSONDecodeError:
-             print(self.translations["empty_json"].format(file=lang))
-             pass
-
-         return translations
-
-     def is_fp16(self):
-         fp16 = self.configs.get("fp16", False)
-
-         if self.device in ["cpu", "mps"] and fp16:
-             self.configs["fp16"] = False
-             fp16 = False
-
-             with open(self.configs_path, "w") as f:
-                 json.dump(self.configs, f, indent=4)
-
-         if not fp16: self.preprocess_per = 3.0
-         return fp16
-
-     def load_config_json(self):
-         configs = {}
-
-         for config_file in version_config_paths:
-             try:
-                 with open(os.path.join("main", "configs", config_file), "r") as f:
-                     configs[config_file] = json.load(f)
-             except json.JSONDecodeError:
-                 print(self.translations["empty_json"].format(file=config_file))
-                 pass
-
-         return configs
-
-     def device_config(self):
-         if not self.cpu_mode:
-             if self.device.startswith("cuda"): self.set_cuda_config()
-             elif opencl.is_available(): self.device = "ocl:0"
-             elif self.has_mps(): self.device = "mps"
-             else: self.device = "cpu"
-
-         if self.gpu_mem is not None and self.gpu_mem <= 4:
-             self.preprocess_per = 3.0
-             return 1, 5, 30, 32
-
-         return (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
-
-     def set_cuda_config(self):
-         i_device = int(self.device.split(":")[-1])
-         self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (1024**3)
-
-     def has_mps(self):
-         return torch.backends.mps.is_available()
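The `singleton` decorator at the top of this file guarantees that every `Config()` call across the app returns the same instance, so config.json is parsed only once. A self-contained illustration of that pattern:

    def singleton(cls):
        instances = {}
        def get_instance(*args, **kwargs):
            # Construct the class at most once, then reuse the cached instance.
            if cls not in instances:
                instances[cls] = cls(*args, **kwargs)
            return instances[cls]
        return get_instance

    @singleton
    class Config:
        def __init__(self):
            print("loading config once")

    assert Config() is Config()  # the second call reuses the first instance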
main/configs/v1/32000.json DELETED
@@ -1,46 +0,0 @@
- {
-     "train": {
-         "log_interval": 200,
-         "seed": 1234,
-         "epochs": 20000,
-         "learning_rate": 0.0001,
-         "betas": [0.8, 0.99],
-         "eps": 1e-09,
-         "batch_size": 4,
-         "lr_decay": 0.999875,
-         "segment_size": 12800,
-         "init_lr_ratio": 1,
-         "warmup_epochs": 0,
-         "c_mel": 45,
-         "c_kl": 1.0
-     },
-     "data": {
-         "max_wav_value": 32768.0,
-         "sample_rate": 32000,
-         "filter_length": 1024,
-         "hop_length": 320,
-         "win_length": 1024,
-         "n_mel_channels": 80,
-         "mel_fmin": 0.0,
-         "mel_fmax": null
-     },
-     "model": {
-         "inter_channels": 192,
-         "hidden_channels": 192,
-         "filter_channels": 768,
-         "text_enc_hidden_dim": 256,
-         "n_heads": 2,
-         "n_layers": 6,
-         "kernel_size": 3,
-         "p_dropout": 0,
-         "resblock": "1",
-         "resblock_kernel_sizes": [3, 7, 11],
-         "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-         "upsample_rates": [10, 4, 2, 2, 2],
-         "upsample_initial_channel": 512,
-         "upsample_kernel_sizes": [16, 16, 4, 4, 4],
-         "use_spectral_norm": false,
-         "gin_channels": 256,
-         "spk_embed_dim": 109
-     }
- }
main/configs/v1/40000.json DELETED
@@ -1,46 +0,0 @@
- {
-     "train": {
-         "log_interval": 200,
-         "seed": 1234,
-         "epochs": 20000,
-         "learning_rate": 0.0001,
-         "betas": [0.8, 0.99],
-         "eps": 1e-09,
-         "batch_size": 4,
-         "lr_decay": 0.999875,
-         "segment_size": 12800,
-         "init_lr_ratio": 1,
-         "warmup_epochs": 0,
-         "c_mel": 45,
-         "c_kl": 1.0
-     },
-     "data": {
-         "max_wav_value": 32768.0,
-         "sample_rate": 40000,
-         "filter_length": 2048,
-         "hop_length": 400,
-         "win_length": 2048,
-         "n_mel_channels": 125,
-         "mel_fmin": 0.0,
-         "mel_fmax": null
-     },
-     "model": {
-         "inter_channels": 192,
-         "hidden_channels": 192,
-         "filter_channels": 768,
-         "text_enc_hidden_dim": 256,
-         "n_heads": 2,
-         "n_layers": 6,
-         "kernel_size": 3,
-         "p_dropout": 0,
-         "resblock": "1",
-         "resblock_kernel_sizes": [3, 7, 11],
-         "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-         "upsample_rates": [10, 10, 2, 2],
-         "upsample_initial_channel": 512,
-         "upsample_kernel_sizes": [16, 16, 4, 4],
-         "use_spectral_norm": false,
-         "gin_channels": 256,
-         "spk_embed_dim": 109
-     }
- }
main/configs/v1/48000.json DELETED
@@ -1,46 +0,0 @@
- {
-     "train": {
-         "log_interval": 200,
-         "seed": 1234,
-         "epochs": 20000,
-         "learning_rate": 0.0001,
-         "betas": [0.8, 0.99],
-         "eps": 1e-09,
-         "batch_size": 4,
-         "lr_decay": 0.999875,
-         "segment_size": 11520,
-         "init_lr_ratio": 1,
-         "warmup_epochs": 0,
-         "c_mel": 45,
-         "c_kl": 1.0
-     },
-     "data": {
-         "max_wav_value": 32768.0,
-         "sample_rate": 48000,
-         "filter_length": 2048,
-         "hop_length": 480,
-         "win_length": 2048,
-         "n_mel_channels": 128,
-         "mel_fmin": 0.0,
-         "mel_fmax": null
-     },
-     "model": {
-         "inter_channels": 192,
-         "hidden_channels": 192,
-         "filter_channels": 768,
-         "text_enc_hidden_dim": 256,
-         "n_heads": 2,
-         "n_layers": 6,
-         "kernel_size": 3,
-         "p_dropout": 0,
-         "resblock": "1",
-         "resblock_kernel_sizes": [3, 7, 11],
-         "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-         "upsample_rates": [10, 6, 2, 2, 2],
-         "upsample_initial_channel": 512,
-         "upsample_kernel_sizes": [16, 16, 4, 4, 4],
-         "use_spectral_norm": false,
-         "gin_channels": 256,
-         "spk_embed_dim": 109
-     }
- }
main/configs/v2/32000.json DELETED
@@ -1,42 +0,0 @@
- {
-     "train": {
-         "log_interval": 200,
-         "seed": 1234,
-         "learning_rate": 0.0001,
-         "betas": [0.8, 0.99],
-         "eps": 1e-09,
-         "lr_decay": 0.999875,
-         "segment_size": 12800,
-         "c_mel": 45,
-         "c_kl": 1.0
-     },
-     "data": {
-         "max_wav_value": 32768.0,
-         "sample_rate": 32000,
-         "filter_length": 1024,
-         "hop_length": 320,
-         "win_length": 1024,
-         "n_mel_channels": 80,
-         "mel_fmin": 0.0,
-         "mel_fmax": null
-     },
-     "model": {
-         "inter_channels": 192,
-         "hidden_channels": 192,
-         "filter_channels": 768,
-         "text_enc_hidden_dim": 768,
-         "n_heads": 2,
-         "n_layers": 6,
-         "kernel_size": 3,
-         "p_dropout": 0,
-         "resblock": "1",
-         "resblock_kernel_sizes": [3, 7, 11],
-         "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-         "upsample_rates": [10, 8, 2, 2],
-         "upsample_initial_channel": 512,
-         "upsample_kernel_sizes": [20, 16, 4, 4],
-         "use_spectral_norm": false,
-         "gin_channels": 256,
-         "spk_embed_dim": 109
-     }
- }
main/configs/v2/40000.json DELETED
@@ -1,42 +0,0 @@
- {
-     "train": {
-         "log_interval": 200,
-         "seed": 1234,
-         "learning_rate": 0.0001,
-         "betas": [0.8, 0.99],
-         "eps": 1e-09,
-         "lr_decay": 0.999875,
-         "segment_size": 12800,
-         "c_mel": 45,
-         "c_kl": 1.0
-     },
-     "data": {
-         "max_wav_value": 32768.0,
-         "sample_rate": 40000,
-         "filter_length": 2048,
-         "hop_length": 400,
-         "win_length": 2048,
-         "n_mel_channels": 125,
-         "mel_fmin": 0.0,
-         "mel_fmax": null
-     },
-     "model": {
-         "inter_channels": 192,
-         "hidden_channels": 192,
-         "filter_channels": 768,
-         "text_enc_hidden_dim": 768,
-         "n_heads": 2,
-         "n_layers": 6,
-         "kernel_size": 3,
-         "p_dropout": 0,
-         "resblock": "1",
-         "resblock_kernel_sizes": [3, 7, 11],
-         "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-         "upsample_rates": [10, 10, 2, 2],
-         "upsample_initial_channel": 512,
-         "upsample_kernel_sizes": [16, 16, 4, 4],
-         "use_spectral_norm": false,
-         "gin_channels": 256,
-         "spk_embed_dim": 109
-     }
- }
main/configs/v2/48000.json DELETED
@@ -1,42 +0,0 @@
- {
-     "train": {
-         "log_interval": 200,
-         "seed": 1234,
-         "learning_rate": 0.0001,
-         "betas": [0.8, 0.99],
-         "eps": 1e-09,
-         "lr_decay": 0.999875,
-         "segment_size": 17280,
-         "c_mel": 45,
-         "c_kl": 1.0
-     },
-     "data": {
-         "max_wav_value": 32768.0,
-         "sample_rate": 48000,
-         "filter_length": 2048,
-         "hop_length": 480,
-         "win_length": 2048,
-         "n_mel_channels": 128,
-         "mel_fmin": 0.0,
-         "mel_fmax": null
-     },
-     "model": {
-         "inter_channels": 192,
-         "hidden_channels": 192,
-         "filter_channels": 768,
-         "text_enc_hidden_dim": 768,
-         "n_heads": 2,
-         "n_layers": 6,
-         "kernel_size": 3,
-         "p_dropout": 0,
-         "resblock": "1",
-         "resblock_kernel_sizes": [3, 7, 11],
-         "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-         "upsample_rates": [12, 10, 2, 2],
-         "upsample_initial_channel": 512,
-         "upsample_kernel_sizes": [24, 20, 4, 4],
-         "use_spectral_norm": false,
-         "gin_channels": 256,
-         "spk_embed_dim": 109
-     }
- }
main/inference/audio_effects.py DELETED
@@ -1,185 +0,0 @@
import os
import sys
import librosa
import argparse

import numpy as np
import soundfile as sf

from distutils.util import strtobool
from scipy.signal import butter, filtfilt
from pedalboard import Pedalboard, Chorus, Distortion, Reverb, PitchShift, Delay, Limiter, Gain, Bitcrush, Clipping, Compressor, Phaser, HighpassFilter

sys.path.append(os.getcwd())

from main.library.utils import pydub_load
from main.app.variables import translations, logger

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--audio_effects", action='store_true')
    parser.add_argument("--input_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, default="./audios/apply_effects.wav")
    parser.add_argument("--export_format", type=str, default="wav")
    parser.add_argument("--resample", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--resample_sr", type=int, default=0)
    parser.add_argument("--chorus", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--chorus_depth", type=float, default=0.5)
    parser.add_argument("--chorus_rate", type=float, default=1.5)
    parser.add_argument("--chorus_mix", type=float, default=0.5)
    parser.add_argument("--chorus_delay", type=int, default=10)
    parser.add_argument("--chorus_feedback", type=float, default=0)
    parser.add_argument("--distortion", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--drive_db", type=int, default=20)
    parser.add_argument("--reverb", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--reverb_room_size", type=float, default=0.5)
    parser.add_argument("--reverb_damping", type=float, default=0.5)
    parser.add_argument("--reverb_wet_level", type=float, default=0.33)
    parser.add_argument("--reverb_dry_level", type=float, default=0.67)
    parser.add_argument("--reverb_width", type=float, default=1)
    parser.add_argument("--reverb_freeze_mode", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--pitchshift", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--pitch_shift", type=int, default=0)
    parser.add_argument("--delay", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--delay_seconds", type=float, default=0.5)
    parser.add_argument("--delay_feedback", type=float, default=0.5)
    parser.add_argument("--delay_mix", type=float, default=0.5)
    parser.add_argument("--compressor", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--compressor_threshold", type=int, default=-20)
    parser.add_argument("--compressor_ratio", type=float, default=4)
    parser.add_argument("--compressor_attack_ms", type=float, default=10)
    parser.add_argument("--compressor_release_ms", type=int, default=200)
    parser.add_argument("--limiter", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--limiter_threshold", type=int, default=0)
    parser.add_argument("--limiter_release", type=int, default=100)
    parser.add_argument("--gain", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--gain_db", type=int, default=0)
    parser.add_argument("--bitcrush", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--bitcrush_bit_depth", type=int, default=16)
    parser.add_argument("--clipping", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--clipping_threshold", type=int, default=-10)
    parser.add_argument("--phaser", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--phaser_rate_hz", type=float, default=0.5)
    parser.add_argument("--phaser_depth", type=float, default=0.5)
    parser.add_argument("--phaser_centre_frequency_hz", type=int, default=1000)
    parser.add_argument("--phaser_feedback", type=float, default=0)
    parser.add_argument("--phaser_mix", type=float, default=0.5)
    parser.add_argument("--treble_bass_boost", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--bass_boost_db", type=int, default=0)
    parser.add_argument("--bass_boost_frequency", type=int, default=100)
    parser.add_argument("--treble_boost_db", type=int, default=0)
    parser.add_argument("--treble_boost_frequency", type=int, default=3000)
    parser.add_argument("--fade_in_out", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--fade_in_duration", type=float, default=2000)
    parser.add_argument("--fade_out_duration", type=float, default=2000)
    parser.add_argument("--audio_combination", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--audio_combination_input", type=str)
    parser.add_argument("--main_volume", type=int, default=0)
    parser.add_argument("--combination_volume", type=int, default=-7)

    return parser.parse_args()

def process_audio(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, pitchshift, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input, main_volume, combination_volume):
    def _filtfilt(b, a, audio):
        padlen = 3 * max(len(a), len(b))
        original_len = len(audio)

        if original_len <= padlen:
            pad_width = padlen - original_len + 1
            audio = np.pad(audio, (pad_width, 0), mode='reflect')

        filtered = filtfilt(b, a, audio, padlen=0)
        return filtered[-original_len:]

    def bass_boost(audio, gain_db, frequency, sample_rate):
        if gain_db >= 1:
            b, a = butter(4, frequency / (0.5 * sample_rate), btype='low')
            boosted = _filtfilt(b, a, audio)
            return boosted * (10 ** (gain_db / 20))
        return audio

    def treble_boost(audio, gain_db, frequency, sample_rate):
        if gain_db >= 1:
            b, a = butter(4, frequency / (0.5 * sample_rate), btype='high')
            boosted = _filtfilt(b, a, audio)
            return boosted * (10 ** (gain_db / 20))
        return audio

    def fade_out_effect(audio, sr, duration=3.0):
        length = int(duration * sr)
        end = audio.shape[0]
        if length > end: length = end
        start = end - length
        audio[start:end] = audio[start:end] * np.linspace(1.0, 0.0, length)
        return audio

    def fade_in_effect(audio, sr, duration=3.0):
        length = int(duration * sr)
        start = 0
        if length > audio.shape[0]: length = audio.shape[0]
        end = length
        audio[start:end] = audio[start:end] * np.linspace(0.0, 1.0, length)
        return audio

    if not input_path or not os.path.exists(input_path):
        logger.warning(translations["input_not_valid"])
        sys.exit(1)

    if not output_path:
        logger.warning(translations["output_not_valid"])
        sys.exit(1)

    if os.path.exists(output_path): os.remove(output_path)

    try:
        input_path = input_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        try:
            audio, sample_rate = sf.read(input_path, dtype=np.float32)
        except:
            audio, sample_rate = librosa.load(input_path, sr=None)
    except Exception as e:
        logger.debug(f"{translations['errors_loading_audio']}: {e}")
        raise RuntimeError(f"{translations['errors_loading_audio']}: {e}")

    try:
        board = Pedalboard([HighpassFilter()])

        if chorus: board.append(Chorus(depth=chorus_depth, rate_hz=chorus_rate, mix=chorus_mix, centre_delay_ms=chorus_delay, feedback=chorus_feedback))
        if distortion: board.append(Distortion(drive_db=distortion_drive))
        if reverb: board.append(Reverb(room_size=reverb_room_size, damping=reverb_damping, wet_level=reverb_wet_level, dry_level=reverb_dry_level, width=reverb_width, freeze_mode=1 if reverb_freeze_mode else 0))
        if pitchshift: board.append(PitchShift(semitones=pitch_shift))
        if delay: board.append(Delay(delay_seconds=delay_seconds, feedback=delay_feedback, mix=delay_mix))
        if compressor: board.append(Compressor(threshold_db=compressor_threshold, ratio=compressor_ratio, attack_ms=compressor_attack_ms, release_ms=compressor_release_ms))
        if limiter: board.append(Limiter(threshold_db=limiter_threshold, release_ms=limiter_release))
        if gain: board.append(Gain(gain_db=gain_db))
        if bitcrush: board.append(Bitcrush(bit_depth=bitcrush_bit_depth))
        if clipping: board.append(Clipping(threshold_db=clipping_threshold))
        if phaser: board.append(Phaser(rate_hz=phaser_rate_hz, depth=phaser_depth, centre_frequency_hz=phaser_centre_frequency_hz, feedback=phaser_feedback, mix=phaser_mix))

        processed_audio = board(audio, sample_rate)

        if treble_bass_boost:
            processed_audio = bass_boost(processed_audio, bass_boost_db, bass_boost_frequency, sample_rate)
            processed_audio = treble_boost(processed_audio, treble_boost_db, treble_boost_frequency, sample_rate)

        if fade_in_out:
            processed_audio = fade_in_effect(processed_audio, sample_rate, fade_in_duration)
            processed_audio = fade_out_effect(processed_audio, sample_rate, fade_out_duration)

        if resample and resample_sr != sample_rate and resample_sr > 0:
            processed_audio = librosa.resample(processed_audio, orig_sr=sample_rate, target_sr=resample_sr, res_type="soxr_vhq")
            sample_rate = resample_sr

        sf.write(output_path.replace("wav", export_format), processed_audio, sample_rate, format=export_format)
        if audio_combination: pydub_load(audio_combination_input, combination_volume).overlay(pydub_load(output_path.replace("wav", export_format), main_volume)).export(output_path.replace("wav", export_format), format=export_format)
    except Exception as e:
        import traceback
        logger.debug(traceback.format_exc())
        raise RuntimeError(translations["apply_error"].format(e=e))
    return output_path

def main():
    args = parse_arguments()
    process_audio(input_path=args.input_path, output_path=args.output_path, resample=args.resample, resample_sr=args.resample_sr, chorus_depth=args.chorus_depth, chorus_rate=args.chorus_rate, chorus_mix=args.chorus_mix, chorus_delay=args.chorus_delay, chorus_feedback=args.chorus_feedback, distortion_drive=args.drive_db, reverb_room_size=args.reverb_room_size, reverb_damping=args.reverb_damping, reverb_wet_level=args.reverb_wet_level, reverb_dry_level=args.reverb_dry_level, reverb_width=args.reverb_width, reverb_freeze_mode=args.reverb_freeze_mode, pitch_shift=args.pitch_shift, delay_seconds=args.delay_seconds, delay_feedback=args.delay_feedback, delay_mix=args.delay_mix, compressor_threshold=args.compressor_threshold, compressor_ratio=args.compressor_ratio, compressor_attack_ms=args.compressor_attack_ms, compressor_release_ms=args.compressor_release_ms, limiter_threshold=args.limiter_threshold, limiter_release=args.limiter_release, gain_db=args.gain_db, bitcrush_bit_depth=args.bitcrush_bit_depth, clipping_threshold=args.clipping_threshold, phaser_rate_hz=args.phaser_rate_hz, phaser_depth=args.phaser_depth, phaser_centre_frequency_hz=args.phaser_centre_frequency_hz, phaser_feedback=args.phaser_feedback, phaser_mix=args.phaser_mix, bass_boost_db=args.bass_boost_db, bass_boost_frequency=args.bass_boost_frequency, treble_boost_db=args.treble_boost_db, treble_boost_frequency=args.treble_boost_frequency, fade_in_duration=args.fade_in_duration, fade_out_duration=args.fade_out_duration, export_format=args.export_format, chorus=args.chorus, distortion=args.distortion, reverb=args.reverb, pitchshift=args.pitchshift, delay=args.delay, compressor=args.compressor, limiter=args.limiter, gain=args.gain, bitcrush=args.bitcrush, clipping=args.clipping, phaser=args.phaser, treble_bass_boost=args.treble_bass_boost, fade_in_out=args.fade_in_out, audio_combination=args.audio_combination, audio_combination_input=args.audio_combination_input, main_volume=args.main_volume, combination_volume=args.combination_volume)

if __name__ == "__main__": main()
main/inference/conversion/convert.py DELETED
@@ -1,300 +0,0 @@
import os
import sys
import json
import onnx
import time
import torch
import librosa
import logging
import argparse
import warnings
import onnxruntime

import numpy as np
import soundfile as sf

from tqdm import tqdm
from distutils.util import strtobool

warnings.filterwarnings("ignore")
sys.path.append(os.getcwd())

from main.inference.conversion.pipeline import Pipeline
from main.app.variables import config, logger, translations
from main.library.algorithm.synthesizers import Synthesizer
from main.inference.conversion.utils import clear_gpu_cache
from main.library.utils import check_assets, load_audio, load_embedders_model, cut, restore, get_providers

for l in ["torch", "faiss", "omegaconf", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "transformers", "matplotlib"]:
    logging.getLogger(l).setLevel(logging.ERROR)

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--convert", action='store_true')
    parser.add_argument("--pitch", type=int, default=0)
    parser.add_argument("--filter_radius", type=int, default=3)
    parser.add_argument("--index_rate", type=float, default=0.5)
    parser.add_argument("--rms_mix_rate", type=float, default=1)
    parser.add_argument("--protect", type=float, default=0.33)
    parser.add_argument("--hop_length", type=int, default=64)
    parser.add_argument("--f0_method", type=str, default="rmvpe")
    parser.add_argument("--embedder_model", type=str, default="contentvec_base")
    parser.add_argument("--input_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, default="./audios/output.wav")
    parser.add_argument("--export_format", type=str, default="wav")
    parser.add_argument("--pth_path", type=str, required=True)
    parser.add_argument("--index_path", type=str, default="")
    parser.add_argument("--f0_autotune", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--f0_autotune_strength", type=float, default=1)
    parser.add_argument("--clean_audio", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--clean_strength", type=float, default=0.7)
    parser.add_argument("--resample_sr", type=int, default=0)
    parser.add_argument("--split_audio", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--checkpointing", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--f0_file", type=str, default="")
    parser.add_argument("--f0_onnx", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--embedders_mode", type=str, default="fairseq")
    parser.add_argument("--formant_shifting", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--formant_qfrency", type=float, default=0.8)
    parser.add_argument("--formant_timbre", type=float, default=0.8)
    parser.add_argument("--proposal_pitch", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--proposal_pitch_threshold", type=float, default=255.0)

    return parser.parse_args()

def main():
    args = parse_arguments()
    pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, f0_autotune_strength, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, checkpointing, f0_file, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold = args.pitch, args.filter_radius, args.index_rate, args.rms_mix_rate, args.protect, args.hop_length, args.f0_method, args.input_path, args.output_path, args.pth_path, args.index_path, args.f0_autotune, args.f0_autotune_strength, args.clean_audio, args.clean_strength, args.export_format, args.embedder_model, args.resample_sr, args.split_audio, args.checkpointing, args.f0_file, args.f0_onnx, args.embedders_mode, args.formant_shifting, args.formant_qfrency, args.formant_timbre, args.proposal_pitch, args.proposal_pitch_threshold

    run_convert_script(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, rms_mix_rate=rms_mix_rate, protect=protect, hop_length=hop_length, f0_method=f0_method, input_path=input_path, output_path=output_path, pth_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, split_audio=split_audio, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_mode=embedders_mode, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre, proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold)

def run_convert_script(pitch=0, filter_radius=3, index_rate=0.5, rms_mix_rate=1, protect=0.5, hop_length=64, f0_method="rmvpe", input_path=None, output_path="./output.wav", pth_path=None, index_path=None, f0_autotune=False, f0_autotune_strength=1, clean_audio=False, clean_strength=0.7, export_format="wav", embedder_model="contentvec_base", resample_sr=0, split_audio=False, checkpointing=False, f0_file=None, f0_onnx=False, embedders_mode="fairseq", formant_shifting=False, formant_qfrency=0.8, formant_timbre=0.8, proposal_pitch=False, proposal_pitch_threshold=255.0):
    check_assets(f0_method, embedder_model, f0_onnx=f0_onnx, embedders_mode=embedders_mode)
    log_data = {translations['pitch']: pitch, translations['filter_radius']: filter_radius, translations['index_strength']: index_rate, translations['rms_mix_rate']: rms_mix_rate, translations['protect']: protect, "Hop length": hop_length, translations['f0_method']: f0_method, translations['audio_path']: input_path, translations['output_path']: output_path.replace('wav', export_format), translations['model_path']: pth_path, translations['indexpath']: index_path, translations['autotune']: f0_autotune, translations['clear_audio']: clean_audio, translations['export_format']: export_format, translations['hubert_model']: embedder_model, translations['split_audio']: split_audio, translations['memory_efficient_training']: checkpointing, translations["f0_onnx_mode"]: f0_onnx, translations["embed_mode"]: embedders_mode, translations["proposal_pitch"]: proposal_pitch}

    if clean_audio: log_data[translations['clean_strength']] = clean_strength
    if resample_sr != 0: log_data[translations['sample_rate']] = resample_sr
    if f0_autotune: log_data[translations['autotune_rate_info']] = f0_autotune_strength
    if os.path.isfile(f0_file): log_data[translations['f0_file']] = f0_file
    if proposal_pitch: log_data[translations["proposal_pitch_threshold"]] = proposal_pitch_threshold
    if formant_shifting:
        log_data[translations['formant_qfrency']] = formant_qfrency
        log_data[translations['formant_timbre']] = formant_timbre

    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith((".pth", ".onnx")):
        logger.warning(translations["provide_file"].format(filename=translations["model"]))
        sys.exit(1)

    cvt = VoiceConverter(pth_path, 0)
    start_time = time.time()

    pid_path = os.path.join("assets", "convert_pid.txt")
    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    if os.path.isdir(input_path):
        logger.info(translations["convert_batch"])
        audio_files = [f for f in os.listdir(input_path) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]

        if not audio_files:
            logger.warning(translations["not_found_audio"])
            sys.exit(1)

        logger.info(translations["found_audio"].format(audio_files=len(audio_files)))

        for audio in audio_files:
            audio_path = os.path.join(input_path, audio)
            output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")

            logger.info(f"{translations['convert_audio']} '{audio_path}'...")
            if os.path.exists(output_audio): os.remove(output_audio)

            cvt.convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, rms_mix_rate=rms_mix_rate, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=audio_path, audio_output_path=output_audio, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_mode=embedders_mode, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre, split_audio=split_audio, proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold)

        logger.info(translations["convert_batch_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}", output_path=output_path.replace('wav', export_format)))
    else:
        if not os.path.exists(input_path):
            logger.warning(translations["not_found_audio"])
            sys.exit(1)

        logger.info(f"{translations['convert_audio']} '{input_path}'...")
        if os.path.exists(output_path): os.remove(output_path)

        cvt.convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, rms_mix_rate=rms_mix_rate, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=input_path, audio_output_path=output_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_mode=embedders_mode, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre, split_audio=split_audio, proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold)
        logger.info(translations["convert_audio_success"].format(input_path=input_path, elapsed_time=f"{(time.time() - start_time):.2f}", output_path=output_path.replace('wav', export_format)))

    if os.path.exists(pid_path): os.remove(pid_path)

class VoiceConverter:
    def __init__(self, model_path, sid = 0):
        self.config = config
        self.device = config.device
        self.hubert_model = None
        self.tgt_sr = None
        self.net_g = None
        self.vc = None
        self.cpt = None
        self.version = None
        self.n_spk = None
        self.use_f0 = None
        self.loaded_model = None
        self.vocoder = "Default"
        self.checkpointing = False
        self.sample_rate = 16000
        self.sid = sid
        self.get_vc(model_path, sid)

    def convert_audio(self, audio_input_path, audio_output_path, index_path, embedder_model, pitch, f0_method, index_rate, rms_mix_rate, protect, hop_length, f0_autotune, f0_autotune_strength, filter_radius, clean_audio, clean_strength, export_format, resample_sr = 0, checkpointing = False, f0_file = None, f0_onnx = False, embedders_mode = "fairseq", formant_shifting = False, formant_qfrency = 0.8, formant_timbre = 0.8, split_audio = False, proposal_pitch = False, proposal_pitch_threshold = 255.0):
        try:
            with tqdm(total=10, desc=translations["convert_audio"], ncols=100, unit="a", leave=not split_audio) as pbar:
                audio = load_audio(audio_input_path, self.sample_rate, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre)
                self.checkpointing = checkpointing

                audio_max = np.abs(audio).max() / 0.95
                if audio_max > 1: audio /= audio_max

                if not self.hubert_model:
                    models, embed_suffix = load_embedders_model(embedder_model, embedders_mode)
                    self.hubert_model = (models.to(self.device).half() if self.config.is_half else models.to(self.device).float()).eval() if embed_suffix in [".pt", ".safetensors"] else models
                    self.embed_suffix = embed_suffix

                pbar.update(1)
                if split_audio:
                    pbar.close()
                    chunks = cut(audio, self.sample_rate, db_thresh=-60, min_interval=500)

                    logger.info(f"{translations['split_total']}: {len(chunks)}")
                    pbar = tqdm(total=len(chunks) * 5 + 4, desc=translations["convert_audio"], ncols=100, unit="a", leave=True)
                else: chunks = [(audio, 0, 0)]

                pbar.update(1)
                converted_chunks = [(
                    start,
                    end,
                    self.vc.pipeline(
                        logger=logger,
                        model=self.hubert_model,
                        net_g=self.net_g,
                        sid=self.sid,
                        audio=waveform,
                        f0_up_key=pitch,
                        f0_method=f0_method,
                        file_index=(index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added")),
                        index_rate=index_rate,
                        pitch_guidance=self.use_f0,
                        filter_radius=filter_radius,
                        rms_mix_rate=rms_mix_rate,
                        version=self.version,
                        protect=protect,
                        hop_length=hop_length,
                        f0_autotune=f0_autotune,
                        f0_autotune_strength=f0_autotune_strength,
                        suffix=self.suffix,
                        embed_suffix=self.embed_suffix,
                        f0_file=f0_file,
                        f0_onnx=f0_onnx,
                        pbar=pbar,
                        proposal_pitch=proposal_pitch,
                        proposal_pitch_threshold=proposal_pitch_threshold,
                        energy_use=self.energy
                    )
                ) for waveform, start, end in chunks]

                pbar.update(1)

                del self.net_g, self.hubert_model
                audio_output = restore(converted_chunks, total_len=len(audio), dtype=converted_chunks[0][2].dtype) if split_audio else converted_chunks[0][2]

                if self.tgt_sr != resample_sr and resample_sr > 0:
                    audio_output = librosa.resample(audio_output, orig_sr=self.tgt_sr, target_sr=resample_sr, res_type="soxr_vhq")
                    self.tgt_sr = resample_sr

                pbar.update(1)
                if clean_audio:
                    from main.tools.noisereduce import reduce_noise
                    audio_output = reduce_noise(y=audio_output, sr=self.tgt_sr, prop_decrease=clean_strength, device=self.device)

                if len(audio) / self.sample_rate > len(audio_output) / self.tgt_sr:
                    padding = np.zeros(int(np.round(len(audio) / self.sample_rate * self.tgt_sr) - len(audio_output)), dtype=audio_output.dtype)
                    audio_output = np.concatenate([audio_output, padding])

                try:
                    sf.write(audio_output_path, audio_output, self.tgt_sr, format=export_format)
                except:
                    sf.write(audio_output_path, librosa.resample(audio_output, orig_sr=self.tgt_sr, target_sr=48000, res_type="soxr_vhq"), 48000, format=export_format)

                pbar.update(1)
        except Exception as e:
            logger.error(translations["error_convert"].format(e=e))
            import traceback
            logger.debug(traceback.format_exc())

    def get_vc(self, weight_root, sid):
        if sid == "" or sid == []:
            self.cleanup()
            clear_gpu_cache()

        if not self.loaded_model or self.loaded_model != weight_root:
            self.loaded_model = weight_root
            self.load_model()
            if self.cpt is not None: self.setup()

    def cleanup(self):
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            clear_gpu_cache()

        del self.net_g, self.cpt
        clear_gpu_cache()
        self.cpt = None

    def load_model(self):
        if os.path.isfile(self.loaded_model):
            if self.loaded_model.endswith(".pth"): self.cpt = torch.load(self.loaded_model, map_location="cpu", weights_only=True)
            else:
                sess_options = onnxruntime.SessionOptions()
                sess_options.log_severity_level = 3
                self.cpt = onnxruntime.InferenceSession(self.loaded_model, sess_options=sess_options, providers=get_providers())
        else: self.cpt = None

    def setup(self):
        if self.cpt is not None:
            if self.loaded_model.endswith(".pth"):
                self.tgt_sr = self.cpt["config"][-1]
                self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]

                self.use_f0 = self.cpt.get("f0", 1)
                self.version = self.cpt.get("version", "v1")
                self.vocoder = self.cpt.get("vocoder", "Default")
                self.energy = self.cpt.get("energy", False)

                if self.vocoder != "Default": self.config.is_half = False
                self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, checkpointing=self.checkpointing, energy=self.energy)
                del self.net_g.enc_q

                self.net_g.load_state_dict(self.cpt["weight"], strict=False)
                self.net_g.eval().to(self.device)
                self.net_g = (self.net_g.half() if self.config.is_half else self.net_g.float())
                self.n_spk = self.cpt["config"][-3]
                self.suffix = ".pth"
            else:
                metadata_dict = None
                for prop in onnx.load(self.loaded_model).metadata_props:
                    if prop.key == "model_info":
                        metadata_dict = json.loads(prop.value)
                        break

                self.net_g = self.cpt
                self.tgt_sr = metadata_dict.get("sr", 32000)
                self.use_f0 = metadata_dict.get("f0", 1)
                self.version = metadata_dict.get("version", "v1")
                self.energy = metadata_dict.get("energy", False)
                self.suffix = ".onnx"

            self.vc = Pipeline(self.tgt_sr, self.config)

if __name__ == "__main__": main()
main/inference/conversion/pipeline.py DELETED
@@ -1,251 +0,0 @@
import os
import sys
import torch
import faiss

import numpy as np
import torch.nn.functional as F

from scipy import signal

sys.path.append(os.getcwd())

from main.app.variables import translations
from main.library.utils import extract_features
from main.library.predictors.Generator import Generator
from main.inference.extracting.rms import RMSEnergyExtractor
from main.inference.conversion.utils import change_rms, clear_gpu_cache, get_onnx_argument

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

class Pipeline:
    def __init__(self, tgt_sr, config):
        self.x_pad = config.x_pad
        self.x_query = config.x_query
        self.x_center = config.x_center
        self.x_max = config.x_max
        self.sample_rate = 16000
        self.window = 160
        self.t_pad = self.sample_rate * self.x_pad
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sample_rate * self.x_query
        self.t_center = self.sample_rate * self.x_center
        self.t_max = self.sample_rate * self.x_max
        self.f0_min = 50
        self.f0_max = 1100
        self.device = config.device
        self.is_half = config.is_half

    def voice_conversion(self, model, net_g, sid, audio0, pitch, pitchf, index, big_npy, index_rate, version, protect, energy):
        pitch_guidance = pitch is not None and pitchf is not None
        energy_use = energy is not None

        feats = torch.from_numpy(audio0)
        feats = feats.half() if self.is_half else feats.float()

        feats = feats.mean(-1) if feats.dim() == 2 else feats
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)

        with torch.no_grad():
            if self.embed_suffix == ".pt":
                padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
                logits = model.extract_features(**{"source": feats.to(self.device), "padding_mask": padding_mask, "output_layer": 9 if version == "v1" else 12})
                feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
            elif self.embed_suffix == ".onnx": feats = extract_features(model, feats.to(self.device), version).to(self.device)
            elif self.embed_suffix == ".safetensors":
                logits = model(feats.to(self.device))["last_hidden_state"]
                feats = model.final_proj(logits[0]).unsqueeze(0) if version == "v1" else logits
            else: raise ValueError(translations["option_not_valid"])

        feats0 = feats.clone() if protect < 0.5 and pitch_guidance else None

        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half: npy = npy.astype(np.float32)

            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)

            npy = np.sum(big_npy[ix] * np.expand_dims(weight / weight.sum(axis=1, keepdims=True), axis=2), axis=1)
            if self.is_half: npy = npy.astype(np.float16)

            feats = (torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats)

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        p_len = min(audio0.shape[0] // self.window, feats.shape[1])

        if pitch_guidance: pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
        if energy_use: energy = energy[:, :p_len]

        if feats0 is not None:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)

            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
            feats = (feats * pitchff + feats0 * (1 - pitchff)).to(feats0.dtype)

        p_len = torch.tensor([p_len], device=self.device).long()
        feats = feats.half() if self.is_half else feats.float()

        if not pitch_guidance: pitch, pitchf = None, None
        else: pitchf = pitchf.half() if self.is_half else pitchf.float()
        if not energy_use: energy = None
        else: energy = energy.half() if self.is_half else energy.float()

        audio1 = (
            (
                net_g.infer(
                    feats,
                    p_len,
                    pitch,
                    pitchf,
                    sid,
                    energy
                )[0][0, 0]
            ).data.cpu().float().numpy()
        ) if self.suffix == ".pth" else (
            net_g.run(
                [net_g.get_outputs()[0].name], (
                    get_onnx_argument(
                        net_g,
                        feats,
                        p_len,
                        sid,
                        pitch,
                        pitchf,
                        energy,
                        pitch_guidance,
                        energy_use
                    )
                )
            )[0][0, 0]
        )

        if self.embed_suffix == ".pt": del padding_mask
        del feats, feats0, p_len

        clear_gpu_cache()
        return audio1

    def pipeline(self, logger, model, net_g, sid, audio, f0_up_key, f0_method, file_index, index_rate, pitch_guidance, filter_radius, rms_mix_rate, version, protect, hop_length, f0_autotune, f0_autotune_strength, suffix, embed_suffix, f0_file=None, f0_onnx=False, pbar=None, proposal_pitch=False, proposal_pitch_threshold=255.0, energy_use=False):
        self.suffix = suffix
        self.embed_suffix = embed_suffix

        if file_index != "" and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception as e:
                logger.error(translations["read_faiss_index_error"].format(e=e))
                index = big_npy = None
        else: index = big_npy = None

        if pbar: pbar.update(1)
        opt_ts, audio_opt = [], []
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")

        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)

            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]

            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min())[0][0])

        s = 0
        t, inp_f0 = None, None
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        p_len = audio_pad.shape[0] // self.window

        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    raw_lines = f.read()

                if len(raw_lines) > 0:
                    inp_f0 = []

                    for line in raw_lines.strip("\n").split("\n"):
                        inp_f0.append([float(i) for i in line.split(",")])

                    inp_f0 = np.array(inp_f0, dtype=np.float32)
            except:
                logger.error(translations["error_readfile"])
                inp_f0 = None

        if pbar: pbar.update(1)
        if pitch_guidance:
            if not hasattr(self, "f0_generator"): self.f0_generator = Generator(self.sample_rate, hop_length, self.f0_min, self.f0_max, self.is_half, self.device, f0_onnx, f0_onnx)
            pitch, pitchf = self.f0_generator.calculator(self.x_pad, f0_method, audio_pad, f0_up_key, p_len, filter_radius, f0_autotune, f0_autotune_strength, manual_f0=inp_f0, proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold)

            if self.device == "mps": pitchf = pitchf.astype(np.float32)
            pitch, pitchf = torch.tensor(pitch[:p_len], device=self.device).unsqueeze(0).long(), torch.tensor(pitchf[:p_len], device=self.device).unsqueeze(0).float()

        if pbar: pbar.update(1)

        if energy_use:
            if not hasattr(self, "rms_extract"): self.rms_extract = RMSEnergyExtractor(frame_length=2048, hop_length=self.window, center=True, pad_mode = "reflect").to(self.device).eval()
            energy = self.rms_extract(torch.from_numpy(audio_pad).to(self.device).unsqueeze(0)).cpu().numpy()

            if self.device == "mps": energy = energy.astype(np.float32)
            energy = torch.tensor(energy[:p_len], device=self.device).unsqueeze(0).float()

        if pbar: pbar.update(1)

        for t in opt_ts:
            t = t // self.window * self.window
            audio_opt.append(
                self.voice_conversion(
                    model,
                    net_g,
                    sid,
                    audio_pad[s : t + self.t_pad2 + self.window],
                    pitch[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None,
                    pitchf[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                    energy[:, s // self.window : (t + self.t_pad2) // self.window] if energy_use else None
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
            s = t

        audio_opt.append(
            self.voice_conversion(
                model,
                net_g,
                sid,
                audio_pad[t:],
                (pitch[:, t // self.window :] if t is not None else pitch) if pitch_guidance else None,
                (pitchf[:, t // self.window :] if t is not None else pitchf) if pitch_guidance else None,
                index,
                big_npy,
                index_rate,
                version,
                protect,
                (energy[:, t // self.window :] if t is not None else energy) if energy_use else None
            )[self.t_pad_tgt : -self.t_pad_tgt]
        )

        audio_opt = np.concatenate(audio_opt)
        if pbar: pbar.update(1)

        if rms_mix_rate != 1: audio_opt = change_rms(audio, self.sample_rate, audio_opt, self.sample_rate, rms_mix_rate)

        audio_max = np.abs(audio_opt).max() / 0.99
        if audio_max > 1: audio_opt /= audio_max

        if pitch_guidance: del pitch, pitchf
        del sid

        clear_gpu_cache()
        return audio_opt
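
The opt_ts search above is what lets long inputs be converted in windows without audible seams: each cut point lands on the quietest sample near a chunk center. A toy standalone sketch of the same idea (the window, query, and center sizes here are illustrative, not the values from the repo's config):

import numpy as np

sr, window = 16000, 160
t_query, t_center = sr * 6, sr * 38

audio = np.random.randn(sr * 120).astype(np.float32)
audio_pad = np.pad(audio, (window // 2, window // 2), mode="reflect")

# Sliding sum of the signal over one window; its magnitude approximates local energy.
audio_sum = np.zeros_like(audio)
for i in range(window):
    audio_sum += audio_pad[i : i - window]

opt_ts = []
for t in range(t_center, audio.shape[0], t_center):
    seg = np.abs(audio_sum[t - t_query : t + t_query])
    opt_ts.append(t - t_query + int(np.where(seg == seg.min())[0][0]))

print(opt_ts)  # candidate cut points, one near each multiple of t_center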
main/inference/conversion/utils.py DELETED
@@ -1,66 +0,0 @@
import os
import gc
import sys
import torch
import librosa

import numpy as np
import torch.nn.functional as F

sys.path.append(os.getcwd())

from main.library import opencl

def autotune_f0(note_dict, f0, f0_autotune_strength):
    autotuned_f0 = np.zeros_like(f0)

    for i, freq in enumerate(f0):
        autotuned_f0[i] = freq + (min(note_dict, key=lambda x: abs(x - freq)) - freq) * f0_autotune_strength

    return autotuned_f0

def change_rms(source_audio, source_rate, target_audio, target_rate, rate):
    # Frame-wise RMS envelopes of both signals, interpolated to one value per target sample.
    rms1 = F.interpolate(torch.from_numpy(librosa.feature.rms(y=source_audio, frame_length=source_rate // 2 * 2, hop_length=source_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze()
    rms2 = F.interpolate(torch.from_numpy(librosa.feature.rms(y=target_audio, frame_length=target_rate // 2 * 2, hop_length=target_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze()
    rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)
    # rate=1 keeps the converted audio's own dynamics; rate=0 imposes the source's envelope.
    return target_audio * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()

def clear_gpu_cache():
    gc.collect()

    if torch.cuda.is_available(): torch.cuda.empty_cache()
    elif torch.backends.mps.is_available(): torch.mps.empty_cache()
    elif opencl.is_available(): opencl.pytorch_ocl.empty_cache()

def extract_median_f0(f0):
    f0 = np.where(f0 == 0, np.nan, f0)
    return float(np.median(np.interp(np.arange(len(f0)), np.where(~np.isnan(f0))[0], f0[~np.isnan(f0)])))

def proposal_f0_up_key(f0, target_f0 = 155.0, limit = 12):
    return max(-limit, min(limit, int(np.round(12 * np.log2(target_f0 / extract_median_f0(f0))))))

def get_onnx_argument(net_g, feats, p_len, sid, pitch, pitchf, energy, pitch_guidance, energy_use):
    inputs = {
        net_g.get_inputs()[0].name: feats.cpu().numpy().astype(np.float32),
        net_g.get_inputs()[1].name: p_len.cpu().numpy(),
        net_g.get_inputs()[2].name: np.array([sid.cpu().item()], dtype=np.int64),
        net_g.get_inputs()[3].name: np.random.randn(1, 192, p_len).astype(np.float32)
    }

    if energy_use:
        if pitch_guidance:
            inputs.update({
                net_g.get_inputs()[4].name: pitch.cpu().numpy().astype(np.int64),
                net_g.get_inputs()[5].name: pitchf.cpu().numpy().astype(np.float32),
                net_g.get_inputs()[6].name: energy.cpu().numpy().astype(np.float32)
            })
        else:
            inputs.update({
                net_g.get_inputs()[4].name: energy.cpu().numpy().astype(np.float32)
            })
    else:
        if pitch_guidance:
            inputs.update({
                net_g.get_inputs()[4].name: pitch.cpu().numpy().astype(np.int64),
                net_g.get_inputs()[5].name: pitchf.cpu().numpy().astype(np.float32)
            })

    return inputs
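
As a worked example of proposal_f0_up_key: with a median voiced F0 of 110 Hz and the default 155 Hz target, the suggested transpose is round(12 * log2(155 / 110)) = round(5.94) = 6 semitones, clamped to the ±12 limit. A one-liner check:

import numpy as np

median_f0, target_f0 = 110.0, 155.0
print(int(np.round(12 * np.log2(target_f0 / median_f0))))  # -> 6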
main/inference/create_dataset.py DELETED
@@ -1,212 +0,0 @@
import os
import sys
import time
import yt_dlp
import shutil
import librosa
import argparse
import warnings

from soundfile import read, write
from distutils.util import strtobool

sys.path.append(os.getcwd())

from main.app.variables import config, logger, translations
from main.library.uvr5_lib.separator import Separator

dataset_temp = "dataset_temp"

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--create_dataset", action='store_true')
    parser.add_argument("--input_audio", type=str, required=True)
    parser.add_argument("--output_dataset", type=str, default="./dataset")
    parser.add_argument("--sample_rate", type=int, default=44100)
    parser.add_argument("--clean_dataset", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--clean_strength", type=float, default=0.7)
    parser.add_argument("--separator_reverb", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--kim_vocal_version", type=int, default=2)
    parser.add_argument("--overlap", type=float, default=0.25)
    parser.add_argument("--segments_size", type=int, default=256)
    parser.add_argument("--mdx_hop_length", type=int, default=1024)
    parser.add_argument("--mdx_batch_size", type=int, default=1)
    parser.add_argument("--denoise_mdx", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--skip", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--skip_start_audios", type=str, default="0")
    parser.add_argument("--skip_end_audios", type=str, default="0")

    return parser.parse_args()

def main():
    pid_path = os.path.join("assets", "create_dataset_pid.txt")
    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    args = parse_arguments()
    input_audio, output_dataset, sample_rate, clean_dataset, clean_strength, separator_reverb, kim_vocal_version, overlap, segments_size, hop_length, batch_size, denoise_mdx, skip, skip_start_audios, skip_end_audios = args.input_audio, args.output_dataset, args.sample_rate, args.clean_dataset, args.clean_strength, args.separator_reverb, args.kim_vocal_version, args.overlap, args.segments_size, args.mdx_hop_length, args.mdx_batch_size, args.denoise_mdx, args.skip, args.skip_start_audios, args.skip_end_audios
    log_data = {translations['audio_path']: input_audio, translations['output_path']: output_dataset, translations['sr']: sample_rate, translations['clear_dataset']: clean_dataset, translations['dereveb_audio']: separator_reverb, translations['segments_size']: segments_size, translations['overlap']: overlap, "Hop length": hop_length, translations['batch_size']: batch_size, translations['denoise_mdx']: denoise_mdx, translations['skip']: skip}

    if clean_dataset: log_data[translations['clean_strength']] = clean_strength
    if skip:
        log_data[translations['skip_start']] = skip_start_audios
        log_data[translations['skip_end']] = skip_end_audios

    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    if kim_vocal_version not in [1, 2]: raise ValueError(translations["version_not_valid"])
    start_time = time.time()

    try:
        paths = []

        if not os.path.exists(dataset_temp): os.makedirs(dataset_temp, exist_ok=True)
        urls = input_audio.replace(", ", ",").split(",")

        for url in urls:
            path = downloader(url, urls.index(url))
            paths.append(path)

        if skip:
            skip_start_audios, skip_end_audios = skip_start_audios.replace(", ", ",").split(","), skip_end_audios.replace(", ", ",").split(",")

            if len(skip_start_audios) < len(paths) or len(skip_end_audios) < len(paths):
                logger.warning(translations["skip<audio"])
                sys.exit(1)
            elif len(skip_start_audios) > len(paths) or len(skip_end_audios) > len(paths):
                logger.warning(translations["skip>audio"])
                sys.exit(1)
            else:
                for audio, skip_start_audio, skip_end_audio in zip(paths, skip_start_audios, skip_end_audios):
                    skip_start(audio, skip_start_audio)
                    skip_end(audio, skip_end_audio)

        separator_paths = []

        for audio in paths:
            vocals = separator_music_main(audio, dataset_temp, segments_size, overlap, denoise_mdx, kim_vocal_version, hop_length, batch_size, sample_rate)
            if separator_reverb: vocals = separator_reverb_audio(vocals, dataset_temp, segments_size, overlap, denoise_mdx, hop_length, batch_size, sample_rate)
            separator_paths.append(vocals)

        paths = separator_paths

        for audio_path in paths:
            data, sample_rate = read(audio_path)
            data = librosa.to_mono(data.T)

            if clean_dataset:
                from main.tools.noisereduce import reduce_noise
                data = reduce_noise(y=data, sr=sample_rate, prop_decrease=clean_strength, device=config.device)

            write(audio_path, data, sample_rate)
    except Exception as e:
        logger.error(f"{translations['create_dataset_error']}: {e}")
        import traceback
        logger.error(traceback.format_exc())
    finally:
        for audio in paths:
            shutil.move(audio, output_dataset)

        if os.path.exists(dataset_temp): shutil.rmtree(dataset_temp, ignore_errors=True)

    elapsed_time = time.time() - start_time
    if os.path.exists(pid_path): os.remove(pid_path)
    logger.info(translations["create_dataset_success"].format(elapsed_time=f"{elapsed_time:.2f}"))

def downloader(url, name):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        ydl_opts = {"format": "bestaudio/best", "outtmpl": os.path.join(dataset_temp, f"{name}"), "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav", "preferredquality": "192"}], "no_warnings": True, "noplaylist": True, "verbose": False}
        logger.info(f"{translations['starting_download']}: {url}...")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url)
            logger.info(f"{translations['download_success']}: {url}")

    return os.path.join(dataset_temp, f"{name}" + ".wav")

def skip_start(input_file, seconds):
    seconds = float(seconds)  # trim values arrive as strings from the comma-separated CLI list
    data, sr = read(input_file)
    total_duration = len(data) / sr

    if seconds <= 0: logger.warning(translations["=<0"])
    elif seconds >= total_duration: logger.warning(translations["skip_warning"].format(seconds=seconds, total_duration=f"{total_duration:.2f}"))
    else:
        logger.info(f"{translations['skip_start']}: {input_file}...")
        write(input_file, data[int(seconds * sr):], sr)

    logger.info(translations["skip_start_audio"].format(input_file=input_file))

def skip_end(input_file, seconds):
    seconds = float(seconds)  # trim values arrive as strings from the comma-separated CLI list
    data, sr = read(input_file)
    total_duration = len(data) / sr

    if seconds <= 0: logger.warning(translations["=<0"])
    elif seconds > total_duration: logger.warning(translations["skip_warning"].format(seconds=seconds, total_duration=f"{total_duration:.2f}"))
    else:
        logger.info(f"{translations['skip_end']}: {input_file}...")
        write(input_file, data[:-int(seconds * sr)], sr)

    logger.info(translations["skip_end_audio"].format(input_file=input_file))

def separator_music_main(input, output, segments_size, overlap, denoise, version, hop_length, batch_size, sample_rate):
    if not os.path.exists(input):
        logger.warning(translations["input_not_valid"])
        return None

    if not os.path.exists(output):
        logger.warning(translations["output_not_valid"])
        return None

    model = f"Kim_Vocal_{version}.onnx"
    output_separator = separator_main(audio_file=input, model_filename=model, output_format="wav", output_dir=output, mdx_segment_size=segments_size, mdx_overlap=overlap, mdx_batch_size=batch_size, mdx_hop_length=hop_length, mdx_enable_denoise=denoise, sample_rate=sample_rate)

    for f in output_separator:
        path = os.path.join(output, f)
        if not os.path.exists(path): logger.error(translations["not_found"].format(name=path))

        if '_(Instrumental)_' in f: os.rename(path, os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav")
        elif '_(Vocals)_' in f:
            rename_file = os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav"
            os.rename(path, rename_file)

    return rename_file

def separator_reverb_audio(input, output, segments_size, overlap, denoise, hop_length, batch_size, sample_rate):
    if not os.path.exists(input):
        logger.warning(translations["input_not_valid"])
        return None

    if not os.path.exists(output):
        logger.warning(translations["output_not_valid"])
        return None

    logger.info(f"{translations['dereverb']}: {input}...")
    output_dereverb = separator_main(audio_file=input, model_filename="Reverb_HQ_By_FoxJoy.onnx", output_format="wav", output_dir=output, mdx_segment_size=segments_size, mdx_overlap=overlap, mdx_batch_size=batch_size, mdx_hop_length=hop_length, mdx_enable_denoise=denoise, sample_rate=sample_rate)

    for f in output_dereverb:
        path = os.path.join(output, f)
        if not os.path.exists(path): logger.error(translations["not_found"].format(name=path))

        if '_(Reverb)_' in f: os.rename(path, os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav")
        elif '_(No Reverb)_' in f:
            rename_file = os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav"
            os.rename(path, rename_file)

    logger.info(f"{translations['dereverb_success']}: {rename_file}")
    return rename_file

def separator_main(audio_file=None, model_filename="Kim_Vocal_1.onnx", output_format="wav", output_dir=".", mdx_segment_size=256, mdx_overlap=0.25, mdx_batch_size=1, mdx_hop_length=1024, mdx_enable_denoise=True, sample_rate=44100):
    try:
        separator = Separator(logger=logger, output_dir=output_dir, output_format=output_format, output_bitrate=None, normalization_threshold=0.9, sample_rate=sample_rate, mdx_params={"hop_length": mdx_hop_length, "segment_size": mdx_segment_size, "overlap": mdx_overlap, "batch_size": mdx_batch_size, "enable_denoise": mdx_enable_denoise})
        separator.load_model(model_filename=model_filename)
        return separator.separate(audio_file)
    except:
        logger.debug(translations["default_setting"])
        separator = Separator(logger=logger, output_dir=output_dir, output_format=output_format, output_bitrate=None, normalization_threshold=0.9, sample_rate=44100, mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": mdx_enable_denoise})
        separator.load_model(model_filename=model_filename)
        return separator.separate(audio_file)

if __name__ == "__main__": main()
main/inference/create_index.py DELETED
@@ -1,73 +0,0 @@
import os
import sys
import faiss
import argparse

import numpy as np

from multiprocessing import cpu_count
from sklearn.cluster import MiniBatchKMeans

sys.path.append(os.getcwd())

from main.app.variables import logger, translations, configs

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--create_index", action='store_true')
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--rvc_version", type=str, default="v2")
    parser.add_argument("--index_algorithm", type=str, default="Auto")

    return parser.parse_args()

def main():
    args = parse_arguments()
    exp_dir = os.path.join(configs["logs_path"], args.model_name)
    version, index_algorithm = args.rvc_version, args.index_algorithm

    log_data = {translations['modelname']: args.model_name, translations['model_path']: exp_dir, translations['training_version']: version, translations['index_algorithm_info']: index_algorithm}
    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    try:
        npys = []
        feature_dir = os.path.join(exp_dir, f"{version}_extracted")
        model_name = os.path.basename(exp_dir)

        for name in sorted(os.listdir(feature_dir)):
            npys.append(np.load(os.path.join(feature_dir, name)))

        big_npy = np.concatenate(npys, axis=0)
        big_npy_idx = np.arange(big_npy.shape[0])
        np.random.shuffle(big_npy_idx)
        big_npy = big_npy[big_npy_idx]

        if big_npy.shape[0] > 2e5 and (index_algorithm == "Auto" or index_algorithm == "KMeans"): big_npy = (MiniBatchKMeans(n_clusters=10000, verbose=True, batch_size=256 * cpu_count(), compute_labels=False, init="random").fit(big_npy).cluster_centers_)
        np.save(os.path.join(exp_dir, "total_fea.npy"), big_npy)

        n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
        index_trained = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat")
        index_ivf_trained = faiss.extract_index_ivf(index_trained)
        index_ivf_trained.nprobe = 1
        index_trained.train(big_npy)
        faiss.write_index(index_trained, os.path.join(exp_dir, f"trained_IVF{n_ivf}_Flat_nprobe_{index_ivf_trained.nprobe}_{model_name}_{version}.index"))

        index_added = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat")
        index_ivf_added = faiss.extract_index_ivf(index_added)
        index_ivf_added.nprobe = 1
        index_added.train(big_npy)
        batch_size_add = 8192

        for i in range(0, big_npy.shape[0], batch_size_add):
            index_added.add(big_npy[i : i + batch_size_add])

        index_filepath_added = os.path.join(exp_dir, f"added_IVF{n_ivf}_Flat_nprobe_{index_ivf_added.nprobe}_{model_name}_{version}.index")
        faiss.write_index(index_added, index_filepath_added)
        logger.info(f"{translations['save_index']} '{index_filepath_added}'")
    except Exception as e:
        logger.error(f"{translations['create_index_error']}: {e}")
        import traceback
        logger.debug(traceback.format_exc())

if __name__ == "__main__": main()