nevreal committed on
Commit d1614fd · verified · 1 Parent(s): 780f01d

Update app.py

Files changed (1)
  1. app.py +18 -249
app.py CHANGED
@@ -1,240 +1,10 @@
-import os
-import glob
-import json
-import traceback
-import logging
 import gradio as gr
-import numpy as np
-import librosa
-import torch
-import asyncio
-import edge_tts
-import yt_dlp
-import ffmpeg
-import subprocess
-import sys
-import io
-import wave
-from datetime import datetime
-from fairseq import checkpoint_utils
-from lib.infer_pack.models import (
-    SynthesizerTrnMs256NSFsid,
-    SynthesizerTrnMs256NSFsid_nono,
-    SynthesizerTrnMs768NSFsid,
-    SynthesizerTrnMs768NSFsid_nono,
-)
-from vc_infer_pipeline import VC
-from config import Config
-config = Config()
-logging.getLogger("numba").setLevel(logging.WARNING)
-limitation = os.getenv("SYSTEM") == "spaces"
+from rvc import *
 
-audio_mode = []
-f0method_mode = []
-f0method_info = ""
-if limitation is True:
-    audio_mode = ["Upload audio", "TTS Audio"]
-    f0method_mode = ["pm", "crepe", "harvest"]
-    f0method_info = "PM is fast, rmvpe is middle, Crepe or harvest is good but it was extremely slow (Default: PM)"
-else:
-    audio_mode = ["Upload audio", "Youtube", "TTS Audio"]
-    f0method_mode = ["pm", "crepe", "harvest"]
-    f0method_info = "PM is fast, rmvpe is middle. Crepe or harvest is good but it was extremely slow (Default: PM))"
 
-if os.path.isfile("rmvpe.pt"):
-    f0method_mode.insert(2, "rmvpe")
 
-def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
-    def vc_fn(
-        vc_audio_mode,
-        vc_input,
-        vc_upload,
-        tts_text,
-        tts_voice,
-        f0_up_key,
-        f0_method,
-        index_rate,
-        filter_radius,
-        resample_sr,
-        rms_mix_rate,
-        protect,
-    ):
-        try:
-            if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
-                audio, sr = librosa.load(vc_input, sr=16000, mono=True)
-            elif vc_audio_mode == "Upload audio":
-                if vc_upload is None:
-                    return "You need to upload an audio", None
-                sampling_rate, audio = vc_upload
-                duration = audio.shape[0] / sampling_rate
-                if duration > 360 and limitation:
-                    return "Please upload an audio file that is less than 1 minute.", None
-                audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-                if len(audio.shape) > 1:
-                    audio = librosa.to_mono(audio.transpose(1, 0))
-                if sampling_rate != 16000:
-                    audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-            elif vc_audio_mode == "TTS Audio":
-                if len(tts_text) > 600 and limitation:
-                    return "Text is too long", None
-                if tts_text is None or tts_voice is None:
-                    return "You need to enter text and select a voice", None
-                asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
-                audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
-                vc_input = "tts.mp3"
-            times = [0, 0, 0]
-            f0_up_key = int(f0_up_key)
-            audio_opt = vc.pipeline(
-                hubert_model,
-                net_g,
-                0,
-                audio,
-                vc_input,
-                times,
-                f0_up_key,
-                f0_method,
-                file_index,
-                # file_big_npy,
-                index_rate,
-                if_f0,
-                filter_radius,
-                tgt_sr,
-                resample_sr,
-                rms_mix_rate,
-                version,
-                protect,
-                f0_file=None,
-            )
-            info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
-            print(f"{model_title} | {info}")
-            return info, (tgt_sr, audio_opt)
-        except:
-            info = traceback.format_exc()
-            print(info)
-            return info, (None, None)
-    return vc_fn
 
-def load_model():
-    categories = []
-    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
-        folder_info = json.load(f)
-    for category_name, category_info in folder_info.items():
-        if not category_info['enable']:
-            continue
-        category_title = category_info['title']
-        category_folder = category_info['folder_path']
-        models = []
-        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
-            models_info = json.load(f)
-        for character_name, info in models_info.items():
-            if not info['enable']:
-                continue
-            model_title = info['title']
-            model_name = info['model_path']
-            model_author = info.get("author", None)
-            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
-            model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
-            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
-            tgt_sr = cpt["config"][-1]
-            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-            if_f0 = cpt.get("f0", 1)
-            version = cpt.get("version", "v1")
-            if version == "v1":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
-                else:
-                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-                model_version = "V1"
-            elif version == "v2":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
-                else:
-                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-                model_version = "V2"
-            del net_g.enc_q
-            print(net_g.load_state_dict(cpt["weight"], strict=False))
-            net_g.eval().to(config.device)
-            if config.is_half:
-                net_g = net_g.half()
-            else:
-                net_g = net_g.float()
-            vc = VC(tgt_sr, config)
-            print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
-            models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, model_index)))
-        categories.append([category_title, category_folder, models])
-    return categories
-
-def cut_vocal_and_inst(url, audio_provider, split_model):
-    if url != "":
-        if not os.path.exists("dl_audio"):
-            os.mkdir("dl_audio")
-        if audio_provider == "Youtube":
-            ydl_opts = {
-                'format': 'bestaudio/best',
-                'postprocessors': [{
-                    'key': 'FFmpegExtractAudio',
-                    'preferredcodec': 'wav',
-                }],
-                "outtmpl": 'dl_audio/youtube_audio',
-            }
-            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                ydl.download([url])
-            audio_path = "dl_audio/youtube_audio.wav"
-        else:
-            # Spotify doesnt work.
-            # Need to find other solution soon.
-            '''
-            command = f"spotdl download {url} --output dl_audio/.wav"
-            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-            print(result.stdout.decode())
-            audio_path = "dl_audio/spotify_audio.wav"
-            '''
-        if split_model == "htdemucs":
-            command = f"demucs --two-stems=vocals {audio_path} -o output"
-            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-            print(result.stdout.decode())
-            return "output/htdemucs/youtube_audio/vocals.wav", "output/htdemucs/youtube_audio/no_vocals.wav", audio_path, "output/htdemucs/youtube_audio/vocals.wav"
-        else:
-            command = f"demucs --two-stems=vocals -n mdx_extra_q {audio_path} -o output"
-            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-            print(result.stdout.decode())
-            return "output/mdx_extra_q/youtube_audio/vocals.wav", "output/mdx_extra_q/youtube_audio/no_vocals.wav", audio_path, "output/mdx_extra_q/youtube_audio/vocals.wav"
-    else:
-        raise gr.Error("URL Required!")
-        return None, None, None, None
 
-def combine_vocal_and_inst(audio_data, audio_volume, split_model):
-    if not os.path.exists("output/result"):
-        os.mkdir("output/result")
-    vocal_path = "output/result/output.wav"
-    output_path = "output/result/combine.mp3"
-    if split_model == "htdemucs":
-        inst_path = "output/htdemucs/youtube_audio/no_vocals.wav"
-    else:
-        inst_path = "output/mdx_extra_q/youtube_audio/no_vocals.wav"
-    with wave.open(vocal_path, "w") as wave_file:
-        wave_file.setnchannels(1)
-        wave_file.setsampwidth(2)
-        wave_file.setframerate(audio_data[0])
-        wave_file.writeframes(audio_data[1].tobytes())
-    command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume={audio_volume}dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
-    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-    print(result.stdout.decode())
-    return output_path
-
-def load_hubert():
-    global hubert_model
-    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-        ["hubert_base.pt"],
-        suffix="",
-    )
-    hubert_model = models[0]
-    hubert_model = hubert_model.to(config.device)
-    if config.is_half:
-        hubert_model = hubert_model.half()
-    else:
-        hubert_model = hubert_model.float()
-    hubert_model.eval()
 
 def change_audio_mode(vc_audio_mode):
     if vc_audio_mode == "Input path":
@@ -346,9 +116,7 @@ if __name__ == '__main__':
     with gr.Blocks(theme="nevreal/blues") as app:
         gr.Markdown(
             "# <center> Hololive RVC Models\n"
-            "### <center> will update every hololive ai model that i can find or make.\n"
             "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aziib/hololive-rvc-models-v2/blob/main/hololive_rvc_models_v2.ipynb)\n\n"
-            "[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/megaaziib)\n\n"
         )
         for (folder_title, folder, models) in categories:
             with gr.TabItem(folder_title):
@@ -370,28 +138,29 @@ if __name__ == '__main__':
                         )
                         with gr.Row():
                             with gr.Column():
-                                vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
-                                # Input and Upload
-                                vc_input = gr.Textbox(label="Input audio path", visible=False)
-                                vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
-                                # Youtube
-                                vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
-                                vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
-                                vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
-                                vc_split = gr.Button("Split Audio", variant="primary", visible=False)
-                                vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
-                                vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
-                                vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
-                                # TTS
-                                tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
-                                tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
+                                with gr.Accordion("Main Options", open=False):
+                                    vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
+                                    # Input and Upload
+                                    vc_input = gr.Textbox(label="Input audio path", visible=False)
+                                    vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
+                                    # Youtube
+                                    vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
+                                    vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
+                                    vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
+                                    vc_split = gr.Button("Split Audio", variant="primary", visible=False)
+                                    vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
+                                    vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
+                                    vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
+                                    # TTS
+                                    tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
+                                    tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
                             with gr.Column():
                                 vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                                 f0method0 = gr.Radio(
                                     label="Pitch extraction algorithm",
                                     info=f0method_info,
                                     choices=f0method_mode,
-                                    value="pm",
+                                    value="rmvpe",
                                     interactive=True
                                 )
                                 index_rate1 = gr.Slider(
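
One caveat with the new `value="rmvpe"` default: in the removed setup code, "rmvpe" is only inserted into `f0method_mode` when `rmvpe.pt` exists on disk, so the hard-coded default assumes the `rvc` module always ships that checkpoint. A purely illustrative guard, not part of this commit, would derive the default from the available choices:

```python
# Hypothetical fallback, not in this commit: only default to "rmvpe"
# when it is actually among the available pitch-extraction choices.
default_f0method = "rmvpe" if "rmvpe" in f0method_mode else "pm"

f0method0 = gr.Radio(
    label="Pitch extraction algorithm",
    info=f0method_info,
    choices=f0method_mode,
    value=default_f0method,
    interactive=True,
)
```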
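
Since the trimmed app.py now relies on a star import, the `rvc` module has to expose every name the remaining UI code references. The diff does not show that module, so the following is only a sketch of the surface it would need, inferred from the identifiers still used above:

```python
# rvc.py - hypothetical export surface implied by "from rvc import *".
# Inferred from names the slimmed-down app.py still references; the
# actual module in this repo may be organized differently.
__all__ = [
    "audio_mode",              # input-mode choices for the dropdown
    "f0method_mode",           # available pitch-extraction algorithms
    "f0method_info",           # help text for the f0 radio group
    "voices",                  # edge-tts speaker choices
    "categories",              # output of load_model() for the tab loop
    "create_vc_fn",            # builds the per-model inference closure
    "load_model",              # reads weights/*/model_info.json
    "cut_vocal_and_inst",      # yt-dlp download + demucs split
    "combine_vocal_and_inst",  # ffmpeg amix of vocal and instrumental
    "load_hubert",             # loads hubert_base.pt via fairseq
]
```

Anything else the moved helpers depend on (config, limitation, the loaded hubert_model) would presumably live inside `rvc` as module state.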