wolfofbackstreet committed on
Commit 8aa5548 · verified · 1 Parent(s): 4a16501
Files changed (3)
  1. Dockerfile +7 -0
  2. melotts_training.py +799 -0
  3. requirements.txt +1 -0
Dockerfile ADDED
@@ -0,0 +1,7 @@
+ FROM python:3.10-slim
+ WORKDIR /app
+ COPY . .
+ RUN pip install --no-cache-dir -r requirements.txt
+ EXPOSE 7860
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+ CMD ["python", "app.py"]
melotts_training.py ADDED
@@ -0,0 +1,799 @@
+ # -*- coding: utf-8 -*-
+ """melotts training.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1srmto1Bf7xQl7la1-5cTZOvbTnL-KWDG
+ """
+
+ # Fetch `notebook_utils` module
+ import requests
+ from pathlib import Path
+
+ if not Path("notebook_utils.py").exists():
+
+     r = requests.get(
+         url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
+     )
+     open("notebook_utils.py", "w").write(r.text)
+
+ if not Path("cmd_helper.py").exists():
+     r = requests.get(
+         url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py",
+     )
+     open("cmd_helper.py", "w").write(r.text)
+
+ if not Path("pip_helper.py").exists():
+     r = requests.get(
+         url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py",
+     )
+     open("pip_helper.py", "w").write(r.text)
+
+ # !!! have to restart session
+
+ from pathlib import Path
+
+ from cmd_helper import clone_repo
+ from pip_helper import pip_install
+ import platform
+
+
+ repo_dir = Path("OpenVoice")
+
+ clone_repo("https://github.com/myshell-ai/OpenVoice")
+ orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
+ english_path = Path("OpenVoice/openvoice/text/english.py")
+
+ if not orig_english_path.exists():
+     orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
+     english_path = Path("OpenVoice/openvoice/text/english.py")
+
+     english_path.rename(orig_english_path)
+
+     with orig_english_path.open("r") as f:
+         data = f.read()
+         data = data.replace("unidecode", "anyascii")
+     with english_path.open("w") as out_f:
+         out_f.write(data)
+
+
+ # fix a problem with silero downloading and installing
+ with Path("OpenVoice/openvoice/se_extractor.py").open("r") as orig_file:
+     data = orig_file.read()
+     data = data.replace('method="silero"', 'method="silero:3.0"')
+ with Path("OpenVoice/openvoice/se_extractor.py").open("w") as out_f:
+     out_f.write(data)
+
+ # clone MeloTTS
+ clone_repo("https://github.com/myshell-ai/MeloTTS")
+
+ pip_install(
+     "--no-deps",
+     "librosa==0.9.1",
+     "pydub==0.25.1",
+     "tqdm",
+     "inflect==7.0.0",
+     "pypinyin==0.50.0",
+     "openvino>=2025.0",
+ )
+ # Since we do not convert Japanese models, many heavy Japanese-related pip dependencies have been removed. To try them, use a Python 3.10 environment on Ubuntu and uncomment the relevant lines.
+ pip_install(
+     "--extra-index-url",
+     "https://download.pytorch.org/whl/cpu",
+     # "mecab-python3==1.0.9",
+     "nncf",
+     "wavmark>=0.0.3",
+     "faster-whisper>=0.9.0",
+     "eng_to_ipa==0.0.2",
+     "cn2an==0.5.22",
+     "jieba==0.42.1",
+     "langid==1.1.6",
+     "ipywebrtc",
+     "anyascii==0.3.2",
+     "torch>=2.1",
+     "torchaudio",
+     "cached_path",
+     "transformers>=4.38,<5.0",
+     "num2words==0.5.12",
+     # "unidic_lite==1.0.8",
+     # "unidic==1.1.0",
+     "pykakasi==2.2.1",
+     # "fugashi==1.3.0",
+     "g2p_en==2.1.0",
+     "jamo==0.4.1",
+     "gruut[de,es,fr]==2.2.3",
+     "g2pkk>=0.1.1",
+     "dtw-python",
+     "more-itertools",
+     "tiktoken",
+     "tensorboard==2.16.2",
+     "loguru==0.7.2",
+     "nltk",
+     "gradio",
+ )
+ pip_install("--no-deps", "whisper-timestamped>=1.14.2", "openai-whisper")
+
+ if platform.system() == "Darwin":
+     pip_install("numpy<2.0")
+
+ # fix the problem of `module 'botocore.exceptions' has no attribute 'HTTPClientError'`
+ pip_install("--upgrade", "botocore")
+
+ # download nltk data
+ import nltk
+
+ nltk.download("averaged_perceptron_tagger_eng")
+
+ # install unidic
+ # !python -m unidic download
+
+ # remove Japanese-related modules in MeloTTS to fix dependency issues
+ # If you want to use Japanese, please do not modify these files
+ import re
+
+ with Path("MeloTTS/melo/text/english.py").open("r", encoding="utf-8") as orig_file:
+     data = orig_file.read()
+ japanese_import = "from .japanese import distribute_phone"
+ replacement_function = """
+ def distribute_phone(n_phone, n_word):
+     phones_per_word = [0] * n_word
+     for task in range(n_phone):
+         min_tasks = min(phones_per_word)
+         min_index = phones_per_word.index(min_tasks)
+         phones_per_word[min_index] += 1
+     return phones_per_word
+ """
+ data = data.replace(japanese_import, replacement_function)  # replace `from .japanese import distribute_phone` with the function itself
+ with Path("MeloTTS/melo/text/english.py").open("w", encoding="utf-8") as out_f:
+     out_f.write(data)
+
+ with Path("MeloTTS/melo/text/__init__.py").open("r", encoding="utf-8") as orig_file:
+     data = orig_file.read()
+ data = data.replace("from .japanese_bert import get_bert_feature as jp_bert", "")
+ data = data.replace("from .spanish_bert import get_bert_feature as sp_bert", "")
+ data = data.replace("from .french_bert import get_bert_feature as fr_bert", "")
+ data = data.replace("from .korean import get_bert_feature as kr_bert", "")
+ # Replace the lang_bert_func_map dictionary, keeping only the keys ZH, EN, and ZH_MIX_EN
+ pattern = re.compile(r"lang_bert_func_map\s*=\s*\{[^}]+\}", re.DOTALL)
+
+ replacement = """lang_bert_func_map = {
+     "ZH": zh_bert,
+     "EN": en_bert,
+     "ZH_MIX_EN": zh_mix_en_bert,
+ }"""
+ data = pattern.sub(replacement, data)
+
+ with Path("MeloTTS/melo/text/__init__.py").open("w", encoding="utf-8") as out_f:
+     out_f.write(data)
+
+ # clean the modules
+ for filename in ["japanese.py", "japanese_bert.py"]:
+     Path(f"MeloTTS/melo/text/{filename}").write_text("", encoding="utf-8")
+
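For reference, the inlined `distribute_phone` helper above simply spreads n_phone phones as evenly as possible across n_word words; a quick standalone check (illustrative only, not part of the committed script) behaves like this:

def distribute_phone(n_phone, n_word):
    # greedily give each phone to the word that currently holds the fewest
    phones_per_word = [0] * n_word
    for _ in range(n_phone):
        min_index = phones_per_word.index(min(phones_per_word))
        phones_per_word[min_index] += 1
    return phones_per_word

print(distribute_phone(7, 3))  # -> [3, 2, 2]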
+ import os
+ import torch
+ import openvino as ov
+ import ipywidgets as widgets
+ from IPython.display import Audio
+ from notebook_utils import download_file, device_widget
+
+ core = ov.Core()
+
+ from openvoice.api import ToneColorConverter, OpenVoiceBaseClass
+ import openvoice.se_extractor as se_extractor
+ from melo.api import TTS
+
+ CKPT_BASE_PATH = Path("checkpoints")
+
+ base_speakers_suffix = CKPT_BASE_PATH / "base_speakers" / "ses"
+ converter_suffix = CKPT_BASE_PATH / "converter"
+
+ melotts_chinese_suffix = CKPT_BASE_PATH / "MeloTTS-Chinese"
+ melotts_english_suffix = CKPT_BASE_PATH / "MeloTTS-English-v3"
+
+ def download_from_hf_hub(repo_id, filename, local_dir="./"):
+     from huggingface_hub import hf_hub_download
+
+     local_path = Path(local_dir)
+     hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_path)
+
+
+ # Download OpenVoice2
+ download_from_hf_hub("myshell-ai/OpenVoiceV2", "converter/checkpoint.pth", CKPT_BASE_PATH)
+ download_from_hf_hub("myshell-ai/OpenVoiceV2", "converter/config.json", CKPT_BASE_PATH)
+
+ download_from_hf_hub("myshell-ai/OpenVoiceV2", "base_speakers/ses/en-newest.pth", CKPT_BASE_PATH)
+ download_from_hf_hub("myshell-ai/OpenVoiceV2", "base_speakers/ses/zh.pth", CKPT_BASE_PATH)
+
+ # Download MeloTTS
+ download_from_hf_hub("myshell-ai/MeloTTS-Chinese", "checkpoint.pth", melotts_chinese_suffix)
+ download_from_hf_hub("myshell-ai/MeloTTS-Chinese", "config.json", melotts_chinese_suffix)
+ download_from_hf_hub("myshell-ai/MeloTTS-English-v3", "checkpoint.pth", melotts_english_suffix)
+ download_from_hf_hub("myshell-ai/MeloTTS-English-v3", "config.json", melotts_english_suffix)
+
+ class OVSynthesizerTTSWrapper(torch.nn.Module):
+     """
+     Wrapper for the SynthesizerTrn model from MeloTTS to make it compatible with Torch-style inference.
+     """
+
+     def __init__(self, model, language):
+         super().__init__()
+         self.model = model
+         self.language = language
+
+     def forward(
+         self,
+         x,
+         x_lengths,
+         sid,
+         tone,
+         language,
+         bert,
+         ja_bert,
+         noise_scale,
+         length_scale,
+         noise_scale_w,
+         sdp_ratio,
+     ):
+         """
+         Forward call to the underlying SynthesizerTrn model. The arguments
+         are passed directly to the model's inference method.
+         """
+         return self.model.infer(
+             x,
+             x_lengths,
+             sid,
+             tone,
+             language,
+             bert,
+             ja_bert,
+             sdp_ratio=sdp_ratio,
+             noise_scale=noise_scale,
+             noise_scale_w=noise_scale_w,
+             length_scale=length_scale,
+         )
+
+     def get_example_input(self):
+         """
+         Return a tuple of example inputs for tracing/export or debugging.
+         The SynthesizerTrn model has been found to be very sensitive to the
+         example_input used for model conversion, so the inputs generated here
+         follow simple rules that mimic real input data.
+         """
+
+         def gen_interleaved_random_tensor(length, value_range):
+             """Generate a tensor in the format [0, val, 0, val, ..., 0], val ∈ [low, high)."""
+             return torch.tensor([[0 if i % 2 == 0 else torch.randint(*value_range, (1,)).item() for i in range(length)]], dtype=torch.int64).to(pt_device)
+
+         def gen_interleaved_fixed_tensor(length, fixed_value):
+             """Generate a tensor in the format [0, val, 0, val, ..., 0]."""
+             interleaved = [0 if i % 2 == 0 else fixed_value for i in range(length)]
+             return torch.tensor([interleaved], dtype=torch.int64).to(pt_device)
+
+         if self.language == "EN_NEWEST":
+             seq_len = 73
+             x_tst = gen_interleaved_random_tensor(seq_len, (14, 220))
+             x_tst[:, :3] = 0
+             x_tst[:, -3:] = 0
+             x_tst_lengths = torch.tensor([seq_len], dtype=torch.int64).to(pt_device)
+             speakers = torch.tensor([0], dtype=torch.int64).to(pt_device)  # This model has only one fixed speaker id.
+             tones = gen_interleaved_random_tensor(seq_len, (5, 10))
+             lang_ids = gen_interleaved_fixed_tensor(seq_len, 2)  # lang_id for English
+             bert = torch.randn((1, 1024, seq_len), dtype=torch.float32).to(pt_device)
+             ja_bert = torch.randn(1, 768, seq_len, dtype=torch.float32).to(pt_device)
+             sdp_ratio = torch.tensor(0.2).to(pt_device)
+             noise_scale = torch.tensor(0.6).to(pt_device)
+             noise_scale_w = torch.tensor(0.8).to(pt_device)
+             length_scale = torch.tensor(1.0).to(pt_device)
+         elif self.language == "ZH":
+             seq_len = 37
+             x_tst = gen_interleaved_random_tensor(seq_len, (7, 100))
+             x_tst[:, :3] = 0
+             x_tst[:, -3:] = 0
+             x_tst_lengths = torch.tensor([37], dtype=torch.int64).to(pt_device)
+             speakers = torch.tensor([1], dtype=torch.int64).to(pt_device)  # This model has only one fixed speaker id.
+             tones = gen_interleaved_random_tensor(seq_len, (4, 9))
+             lang_ids = gen_interleaved_fixed_tensor(seq_len, 3)  # lang_id for Chinese
+             bert = torch.zeros((1, 1024, 37), dtype=torch.float32).to(pt_device)
+             ja_bert = torch.randn(1, 768, 37).float().to(pt_device)
+             sdp_ratio = torch.tensor(0.2).to(pt_device)
+             noise_scale = torch.tensor(0.6).to(pt_device)
+             noise_scale_w = torch.tensor(0.8).to(pt_device)
+             length_scale = torch.tensor(1.0).to(pt_device)
+         return (
+             x_tst,
+             x_tst_lengths,
+             speakers,
+             tones,
+             lang_ids,
+             bert,
+             ja_bert,
+             noise_scale,
+             length_scale,
+             noise_scale_w,
+             sdp_ratio,
+         )
+
+
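The interleaved example inputs mimic sequences where a 0 "blank" sits between every phone, tone, and language id; a minimal check of the helper's output pattern, reusing the same rule as above (illustrative only, not part of the committed script):

import torch

def gen_interleaved_fixed_tensor(length, fixed_value):
    # [0, val, 0, val, ..., 0] -- same rule as the helper in get_example_input
    interleaved = [0 if i % 2 == 0 else fixed_value for i in range(length)]
    return torch.tensor([interleaved], dtype=torch.int64)

print(gen_interleaved_fixed_tensor(7, 2))  # tensor([[0, 2, 0, 2, 0, 2, 0]])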
+ class OVOpenVoiceConverter(torch.nn.Module):
+     def __init__(self, voice_model: OpenVoiceBaseClass):
+         super().__init__()
+         self.voice_model = voice_model
+         for par in voice_model.model.parameters():
+             par.requires_grad = False
+
+     def get_example_input(self):
+         y = torch.randn([1, 513, 238], dtype=torch.float32)
+         y_lengths = torch.LongTensor([y.size(-1)])
+         target_se = torch.randn(*(1, 256, 1))
+         source_se = torch.randn(*(1, 256, 1))
+         tau = torch.tensor(0.3)
+         return (y, y_lengths, source_se, target_se, tau)
+
+     def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
+         """
+         Wraps the 'voice_conversion' method with forward.
+         """
+         return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)
+
+ pt_device = "cpu"
+
+ melo_tts_en_newest = TTS(
+     "EN_NEWEST",
+     pt_device,
+     use_hf=False,
+     config_path=melotts_english_suffix / "config.json",
+     ckpt_path=melotts_english_suffix / "checkpoint.pth",
+ )
+ melo_tts_zh = TTS(
+     "ZH",
+     pt_device,
+     use_hf=False,
+     config_path=melotts_chinese_suffix / "config.json",
+     ckpt_path=melotts_chinese_suffix / "checkpoint.pth",
+ )
+
+ tone_color_converter = ToneColorConverter(converter_suffix / "config.json", device=pt_device)
+ tone_color_converter.load_ckpt(converter_suffix / "checkpoint.pth")
+ print(f"ToneColorConverter version: {tone_color_converter.version}")
+
+ import nncf
+
+
+ IRS_PATH = Path("openvino_irs/")
+ EN_TTS_IR = IRS_PATH / "melo_tts_en_newest.xml"
+ ZH_TTS_IR = IRS_PATH / "melo_tts_zh.xml"
+ VOICE_CONVERTER_IR = IRS_PATH / "openvoice2_tone_conversion.xml"
+
+ paths = [EN_TTS_IR, ZH_TTS_IR, VOICE_CONVERTER_IR]
+ models = [
+     OVSynthesizerTTSWrapper(melo_tts_en_newest.model, "EN_NEWEST"),
+     OVSynthesizerTTSWrapper(melo_tts_zh.model, "ZH"),
+     OVOpenVoiceConverter(tone_color_converter),
+ ]
+
+ ov_models = []
+
+ for model, path in zip(models, paths):
+     if not path.exists():
+         ov_model = ov.convert_model(model, example_input=model.get_example_input())
+         ov_model = nncf.compress_weights(ov_model)
+         ov.save_model(ov_model, path)
+     else:
+         ov_model = core.read_model(path)
+     ov_models.append(ov_model)
+
+ ov_en_tts, ov_zh_tts, ov_voice_conversion = ov_models
+
+ core = ov.Core()
+
+ device = device_widget("CPU", exclude=["NPU"])
+ device
+
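Because the loop above only converts when the IR file is missing, a later run (or the Docker image) can skip the PyTorch checkpoints entirely and just reload the saved IRs; a minimal sketch, assuming the openvino_irs/ layout produced above:

import openvino as ov

core = ov.Core()
# Reload a previously converted, weight-compressed IR and compile it for CPU.
en_tts_ir = core.read_model("openvino_irs/melo_tts_en_newest.xml")
compiled_en_tts = core.compile_model(en_tts_ir, "CPU")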
+ REFERENCE_VOICES_PATH = f"{repo_dir}/resources/"
+ reference_speakers = [
+     *[path for path in os.listdir(REFERENCE_VOICES_PATH) if os.path.splitext(path)[-1] == ".mp3"],
+     "record_manually",
+     "load_manually",
+ ]
+
+ ref_speaker = widgets.Dropdown(
+     options=reference_speakers,
+     value=reference_speakers[0],
+     description="reference voice from which tone color will be copied",
+     disabled=False,
+ )
+
+ ref_speaker
+
+ OUTPUT_DIR = Path("outputs/")
+ OUTPUT_DIR.mkdir(exist_ok=True)
+
+ ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
+ allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"
+
+ if ref_speaker.value == "record_manually":
+     ref_speaker_path = OUTPUT_DIR / "custom_example_sample.webm"
+     from ipywebrtc import AudioRecorder, CameraStream
+
+     camera = CameraStream(constraints={"audio": True, "video": False})
+     recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
+     display(recorder)
+ elif ref_speaker.value == "load_manually":
+     upload_ref = widgets.FileUpload(
+         accept=allowed_audio_types,
+         multiple=False,
+         description="Select audio with reference voice",
+     )
+     display(upload_ref)
+
+ def save_audio(voice_source: widgets.FileUpload, out_path: str):
+     with open(out_path, "wb") as output_file:
+         assert len(voice_source.value) > 0, "Please select audio file"
+         output_file.write(voice_source.value[0]["content"])
+
+
+ if ref_speaker.value == "load_manually":
+     ref_speaker_path = f"{OUTPUT_DIR}/{upload_ref.value[0].name}"
+     save_audio(upload_ref, ref_speaker_path)
+
+ Audio(ref_speaker_path)
+
+ # Commented out IPython magic to ensure Python compatibility.
+
+ torch_hub_local = Path("torch_hub_local/")
+ # %env TORCH_HOME={str(torch_hub_local.absolute())}
+
+ # second step to fix a problem with silero downloading and installing
+ import os
+ import zipfile
+
+ url = "https://github.com/snakers4/silero-vad/zipball/v3.0"
+
+ torch_hub_dir = torch_hub_local / "hub"
+ torch.hub.set_dir(torch_hub_dir.as_posix())
+
+ zip_filename = "v3.0.zip"
+ output_path = torch_hub_dir / "v3.0"
+ if not (torch_hub_dir / zip_filename).exists():
+     download_file(url, directory=torch_hub_dir, filename=zip_filename)
+     zip_ref = zipfile.ZipFile((torch_hub_dir / zip_filename).as_posix(), "r")
+     zip_ref.extractall(path=output_path.as_posix())
+     zip_ref.close()
+
+ v3_dirs = [d for d in output_path.iterdir() if "snakers4-silero-vad" in d.as_posix()]
+ if len(v3_dirs) > 0 and not (torch_hub_dir / "snakers4_silero-vad_v3.0").exists():
+     v3_dir = str(v3_dirs[0])
+     os.rename(str(v3_dirs[0]), (torch_hub_dir / "snakers4_silero-vad_v3.0").as_posix())
+
+ en_source_newest_se = torch.load(base_speakers_suffix / "en-newest.pth")
+ zh_source_se = torch.load(base_speakers_suffix / "zh.pth")
+
+ target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
+
+ def get_patched_infer(ov_model: ov.Model, device: str) -> callable:
+     compiled_model = core.compile_model(ov_model, device)
+
+     def infer_impl(
+         x,
+         x_lengths,
+         sid,
+         tone,
+         language,
+         bert,
+         ja_bert,
+         noise_scale,
+         length_scale,
+         noise_scale_w,
+         max_len=None,
+         sdp_ratio=1.0,
+         y=None,
+         g=None,
+     ):
+         ov_output = compiled_model(
+             (
+                 x,
+                 x_lengths,
+                 sid,
+                 tone,
+                 language,
+                 bert,
+                 ja_bert,
+                 noise_scale,
+                 length_scale,
+                 noise_scale_w,
+                 sdp_ratio,
+             )
+         )
+         return (torch.tensor(ov_output[0]),)
+
+     return infer_impl
+
+
+ def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
+     compiled_model = core.compile_model(ov_model, device)
+
+     def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau):
+         ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt, tau))
+         return (torch.tensor(ov_output[0]),)
+
+     return voice_conversion_impl
+
+
+ melo_tts_en_newest.model.infer = get_patched_infer(ov_en_tts, device.value)
+ melo_tts_zh.model.infer = get_patched_infer(ov_zh_tts, device.value)
+ tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
+
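After these three assignments the usual MeloTTS/OpenVoice call paths are unchanged; only the heavy inference is rerouted through the compiled OpenVINO models. A quick smoke test reusing the objects defined above (illustrative only, not part of the committed script):

# tts_to_file keeps its usual signature; under the hood it now calls infer_impl,
# which runs the compiled OpenVINO model and wraps the result back into a torch tensor.
melo_tts_en_newest.tts_to_file("A short smoke test sentence.", 0, OUTPUT_DIR / "smoke_test.wav", speed=1.0)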
+ voice_source = widgets.Dropdown(
+     options=["use TTS", "choose_manually"],
+     value="use TTS",
+     description="Voice source",
+     disabled=False,
+ )
+
+ voice_source
+
+ if voice_source.value == "choose_manually":
+     upload_orig_voice = widgets.FileUpload(
+         accept=allowed_audio_types,
+         multiple=False,
+         description="audio whose tone will be replaced",
+     )
+     display(upload_orig_voice)
+
+ from IPython.display import Audio, display
+
+ if voice_source.value == "choose_manually":
+     orig_voice_path = f"{OUTPUT_DIR}/{upload_orig_voice.value[0].name}"
+     save_audio(upload_orig_voice, orig_voice_path)
+     source_se, _ = se_extractor.get_se(orig_voice_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
+ else:
+     en_text = """
+     I love going to school by bus
+     """
+     # source_se = en_source_newest_se
+     en_orig_voice_path = OUTPUT_DIR / "output_ov_en-newest.wav"
+     print("use output_ov_en-newest.wav")
+     speaker_id = 0  # speaker id used by the English base speaker
+     melo_tts_en_newest.tts_to_file(en_text, speaker_id, en_orig_voice_path, speed=1.0)
+     zh_text = """
+     OpenVINO 是一个全面的开发工具集,旨在快速开发和部署各类应用程序及解决方案,可用于模仿人类视觉、自动语音识别、自然语言处理、
+     推荐系统等多种任务。
+     """
+     # source_se = zh_source_se
+     zh_orig_voice_path = OUTPUT_DIR / "output_ov_zh.wav"
+     print("use output_ov_zh.wav")
+     speaker_id = 1  # speaker id used by the Chinese base speaker
+     melo_tts_zh.tts_to_file(zh_text, speaker_id, zh_orig_voice_path, speed=1.0)
+     print("Playing English Original voice")
+     display(Audio(en_orig_voice_path))
+     print("Playing Chinese Original voice")
+     display(Audio(zh_orig_voice_path))
+
+ tau_slider = widgets.FloatSlider(
+     value=0.3,
+     min=0.01,
+     max=2.0,
+     step=0.01,
+     description="tau",
+     disabled=False,
+     readout_format=".2f",
+ )
+ tau_slider
+
+ from IPython.display import Audio, display
+
+ if voice_source.value == "choose_manually":
+     resulting_voice_path = OUTPUT_DIR / "output_ov_cloned.wav"
+     tone_color_converter.convert(
+         audio_src_path=orig_voice_path,
+         src_se=source_se,
+         tgt_se=target_se,
+         output_path=resulting_voice_path,
+         tau=tau_slider.value,
+         message="@MyShell",
+     )
+     print("Playing manually chosen cloned voice:")
+     display(Audio(resulting_voice_path))
+ else:
+     en_resulting_voice_path = OUTPUT_DIR / "output_ov_en-newest_cloned.wav"
+     zh_resulting_voice_path = OUTPUT_DIR / "output_ov_zh_cloned.wav"
+
+     tone_color_converter.convert(
+         audio_src_path=en_orig_voice_path,
+         src_se=en_source_newest_se,
+         tgt_se=target_se,
+         output_path=en_resulting_voice_path,
+         tau=tau_slider.value,
+         message="@MyShell",
+     )
+     tone_color_converter.convert(
+         audio_src_path=zh_orig_voice_path,
+         src_se=zh_source_se,
+         tgt_se=target_se,
+         output_path=zh_resulting_voice_path,
+         tau=tau_slider.value,
+         message="@MyShell",
+     )
+     print("Playing English cloned voice:")
+     display(Audio(en_resulting_voice_path))
+     print("Playing Chinese cloned voice:")
+     display(Audio(zh_resulting_voice_path))
+
+ import gradio as gr
+ import langid
+
+ supported_languages = ["zh", "en"]
+ supported_styles = {
+     "zh": ["zh_default"],
+     "en": [
+         "en_latest",
+     ],
+ }
+
+
+ def predict_impl(
+     prompt,
+     style,
+     audio_file_pth,
+     agree,
+     output_dir,
+     tone_color_converter,
+     en_tts_model,
+     zh_tts_model,
+     en_source_se,
+     zh_source_se,
+ ):
+     text_hint = ""
+     if not agree:
+         text_hint += "[ERROR] Please accept the Terms & Conditions!\n"
+         gr.Warning("Please accept the Terms & Conditions!")
+         return (
+             text_hint,
+             None,
+             None,
+         )
+
+     language_predicted = langid.classify(prompt)[0].strip()
+
+     if language_predicted not in supported_languages:
+         text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our supported languages: {supported_languages}\n"
+         gr.Warning(f"The detected language {language_predicted} for your input text is not in our supported languages: {supported_languages}")
+
+         return (
+             text_hint,
+             None,
+             None,
+         )
+
+     # check the style
+     if style not in supported_styles[language_predicted]:
+         text_hint += f"[Warning] The style {style} is not supported for the detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior.\n"
+         gr.Warning(
+             f"[Warning] The style {style} is not supported for the detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior."
+         )
+
+     if len(prompt.split()) < 2:
+         text_hint += "[ERROR] Please give a longer prompt text \n"
+         gr.Warning("Please give a longer prompt text")
+         return (
+             text_hint,
+             None,
+             None,
+         )
+     if len(prompt.split()) > 50:
+         text_hint += "[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749 \n"
+         gr.Warning(
+             "Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749"
+         )
+         return (
+             text_hint,
+             None,
+             None,
+         )
+
+     speaker_wav = audio_file_pth
+
+     if language_predicted == "zh":
+         tts_model = zh_tts_model
+         if zh_tts_model is None:
+             gr.Warning("TTS model for the Chinese language was not loaded")
+             return (
+                 text_hint,
+                 None,
+                 None,
+             )
+         source_se = zh_source_se
+         speaker_id = 1
+
+     else:
+         tts_model = en_tts_model
+         if en_tts_model is None:
+             gr.Warning("TTS model for the English language was not loaded")
+             return (
+                 text_hint,
+                 None,
+                 None,
+             )
+         source_se = en_source_se
+         speaker_id = 0
+
+     # note: diffusion_conditioning is not used with hifigan (the default mode); it will be empty but still needs to be passed to model.inference
+     try:
+         target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
+     except Exception as e:
+         text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
+         gr.Warning(f"[ERROR] Get target tone color error {str(e)}")
+         return (
+             text_hint,
+             None,
+             None,
+         )
+
+     src_path = f"{output_dir}/tmp.wav"
+     tts_model.tts_to_file(prompt, speaker_id, src_path, speed=1.0)
+
+     if tone_color_converter is None or source_se is None:
+         gr.Warning("Tone Color Converter model was not loaded")
+         return (
+             text_hint,
+             None,
+             None,
+         )
+     save_path = f"{output_dir}/output.wav"
+     encode_message = "@MyShell"
+     tone_color_converter.convert(
+         audio_src_path=src_path,
+         src_se=source_se,
+         tgt_se=target_se,
+         output_path=save_path,
+         tau=0.3,
+         message=encode_message,
+     )
+
+     text_hint += "Got the response successfully \n"
+
+     return (
+         text_hint,
+         src_path,
+         save_path,
+     )
+
+ from functools import partial
+
+
+ predict = partial(
+     predict_impl,
+     output_dir=OUTPUT_DIR,
+     tone_color_converter=tone_color_converter,
+     en_tts_model=melo_tts_en_newest,
+     zh_tts_model=melo_tts_zh,
+     en_source_se=en_source_newest_se,
+     zh_source_se=zh_source_se,
+ )
+
+ import sys
+
+ if "gradio_helper" in sys.modules:
+     del sys.modules["gradio_helper"]
+
+ if not Path("gradio_helper.py").exists():
+     r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/openvoice/gradio_helper.py")
+     open("gradio_helper.py", "w").write(r.text)
+
+ from gradio_helper import make_demo
+
+ demo = make_demo(fn=predict)
+
+ # demo.queue(max_size=1).launch(share=True, debug=True, height=1000)
+
+ demo.queue(max_size=1).launch(server_name="0.0.0.0", server_port=7860)
+
+ # try:
+ #     demo.queue(max_size=1).launch(debug=True, height=1000)
+ # except Exception:
+ #     demo.queue(max_size=1).launch(share=True, debug=True, height=1000)
+ # if you are launching remotely, specify server_name and server_port
+ # demo.launch(server_name='your server name', server_port='server port in int')
+ # Read more in the docs: https://gradio.app/docs/
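The partial above binds the heavy models and paths, so the Gradio callback only receives the UI inputs (prompt, style, reference audio, consent flag). It can also be exercised directly, bypassing the UI; a sketch, where the reference-clip filename is only an example and any audio file under OpenVoice/resources/ would do:

# Direct call to the bound predictor (reference clip path is illustrative).
info, src_wav, cloned_wav = predict(
    prompt="I love going to school by bus",
    style="en_latest",
    audio_file_pth="OpenVoice/resources/example_reference.mp3",  # hypothetical filename
    agree=True,
)
print(info, src_wav, cloned_wav)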
requirements.txt ADDED
@@ -0,0 +1 @@
+ requests