rakhlin committed on
Commit f016728 · 1 Parent(s): ad8ece8

Upload folder using huggingface_hub
.ipynb_checkpoints/Coqui.ai-checkpoint.ipynb ADDED
@@ -0,0 +1,381 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "6065d339",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gradio as gr\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torch.nn.functional as F\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from TTS.api import TTS\n",
+ "from TTS.utils.manage import ModelManager"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "1e64dfd7",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running on local URL: http://127.0.0.1:7863\n",
+ "\n",
+ "To create a public link, set `share=True` in `launch()`.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": []
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > Using model: Tacotron2\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > log_func:np.log10\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:20\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > pitch_fmin:0.0\n",
+ " | > pitch_fmax:640.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:True\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > do_rms_norm:False\n",
+ " | > db_level:None\n",
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
+ " | > base:10\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Model's reduction rate `r` is set to: 2\n",
+ " > Vocoder Model: univnet\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > log_func:np.log10\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:20\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > pitch_fmin:1.0\n",
+ " | > pitch_fmax:640.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:True\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > do_rms_norm:False\n",
+ " | > db_level:None\n",
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
+ " | > base:10\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Generator Model: univnet_generator\n",
+ " > Discriminator Model: univnet_discriminator\n",
+ "Passing through TTS model tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+ "language: \n",
+ "speaker: \n",
+ "Using original voice\n",
+ " > Text splitted to sentences.\n",
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
+ " > Processing time: 3.316999912261963\n",
+ " > Real-time factor: 0.38182763983344614\n",
+ "Loading TTS model from tts_models/en/ek1/tacotron2\n",
+ " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > Using model: Tacotron2\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > log_func:np.log10\n",
+ " | > min_level_db:-10\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:0\n",
+ " | > fft_size:1024\n",
+ " | > power:1.8\n",
+ " | > preemphasis:0.99\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:0\n",
+ " | > mel_fmax:8000.0\n",
+ " | > pitch_fmin:1.0\n",
+ " | > pitch_fmax:640.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:True\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > do_rms_norm:False\n",
+ " | > db_level:None\n",
+ " | > stats_path:None\n",
+ " | > base:10\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Model's reduction rate `r` is set to: 2\n",
+ " > Vocoder Model: wavegrad\n",
+ "Passing through TTS model tts_models/en/ek1/tacotron2\n",
+ "model: tts_models/en/ek1/tacotron2\n",
+ "language: \n",
+ "speaker: \n",
+ "Using original voice\n",
+ " > Text splitted to sentences.\n",
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n"
+ ]
+ }
+ ],
+ "source": [
+ "title = \"\"\n",
+ "description = \"\"\"\"\"\"\n",
+ "article = \"\"\"\"\"\"\n",
+ "\n",
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+ "GPU = device == \"cuda\"\n",
+ "INT16MAX = np.iinfo(np.int16).max\n",
+ "\n",
+ "model_ids = ModelManager(verbose=False).list_models()\n",
+ "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
+ "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "examples_pt = 'examples'\n",
+ "allowed_extensions = ['.mp3', '.wav']\n",
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extensions}\n",
+ "verse = \"\"\"Mary had a little lamb,\n",
+ "Its fleece was white as snow.\n",
+ "Everywhere the child went,\n",
+ "The little lamb was sure to go.\"\"\"\n",
+ "\n",
+ "\n",
+ "\n",
+ "def on_model_tts_select(model_name, tts_var):\n",
+ "    if tts_var is None or tts_var.model_name != model_name:\n",
+ "        print(f'Loading TTS model from {model_name}')\n",
+ "        tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+ "    else:\n",
+ "        print(f'Passing through TTS model {tts_var.model_name}')\n",
+ "    languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
+ "    speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else ['']  # there's weird speaker formatting\n",
+ "    language = languages[0]\n",
+ "    speaker = speakers[0]\n",
+ "    return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
+ "        gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
+ "\n",
+ "\n",
+ "def on_model_vc_select(model_name, vc_var):\n",
+ "    if vc_var is None or vc_var.model_name != model_name:\n",
+ "        print(f'Loading voice conversion model from {model_name}')\n",
+ "        vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+ "    else:\n",
+ "        print(f'Passing through voice conversion model {vc_var.model_name}')\n",
+ "    return vc_var\n",
+ "\n",
+ "\n",
+ "def on_voicedropdown(x):\n",
+ "    return examples[x]\n",
+ "\n",
+ "\n",
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
+ "    if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
+ "        return (16000, np.zeros(0).astype(np.int16))\n",
+ "\n",
+ "    sample_rate = tts_model.synthesizer.output_sample_rate\n",
+ "    if tts_model.is_multi_speaker:\n",
+ "        speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker]  # there's weird speaker formatting\n",
+ "    print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
+ "\n",
+ "    language = None if language == '' else language\n",
+ "    speaker = None if speaker == '' else speaker\n",
+ "    if use_original_voice:\n",
+ "        print('Using original voice')\n",
+ "        speech = tts_model.tts(text, language=language, speaker=speaker)\n",
+ "    elif tts_model.synthesizer.tts_model.speaker_manager:\n",
+ "        print('voice cloning with the tts')\n",
+ "        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
+ "    else:\n",
+ "        print('voice cloning with the voice conversion model')\n",
+ "        speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
+ "\n",
+ "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+ "    return (sample_rate, speech)\n",
+ "\n",
+ "\n",
+ "def voice_clone(vc_model, source_wav, target_wav):\n",
+ "    if vc_model is None or source_wav is None or target_wav is None:\n",
+ "        return (16000, np.zeros(0).astype(np.int16))  # guard before dereferencing vc_model\n",
+ "    print(f'model: {vc_model.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
+ "    sample_rate = vc_model.voice_converter.output_sample_rate\n",
+ "\n",
+ "    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
+ "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+ "    return (sample_rate, speech)\n",
+ "\n",
+ "\n",
+ "with gr.Blocks() as demo:\n",
+ "    tts_model = gr.State(None)\n",
+ "    vc_model = gr.State(None)\n",
+ "    def activate(*args):\n",
+ "        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
+ "    def deactivate(*args):\n",
+ "        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
+ "\n",
+ "    gr.Markdown(description)\n",
+ "\n",
+ "    with gr.Row(equal_height=True):\n",
+ "        with gr.Column(scale=5, min_width=50):\n",
+ "            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)\n",
+ "        with gr.Column(scale=1, min_width=10):\n",
+ "            language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
+ "        with gr.Column(scale=1, min_width=10):\n",
+ "            speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
+ "        with gr.Column(scale=5, min_width=50):\n",
+ "            with gr.Row(equal_height=True):\n",
+ "# model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)\n",
+ "                model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)\n",
+ "\n",
+ "    with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
+ "        gr.Markdown(\"Upload target voice...\")\n",
+ "        with gr.Row(equal_height=True):\n",
+ "            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
+ "            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
+ "\n",
+ "    with gr.Row(equal_height=True):\n",
+ "        with gr.Column(scale=2):\n",
+ "            with gr.Row(equal_height=True):\n",
+ "                with gr.Column():\n",
+ "                    text_to_convert = gr.Textbox(verse)\n",
+ "                    orig_voice = gr.Checkbox(label='Use original voice')\n",
+ "                    voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
+ "            with gr.Row(equal_height=True):\n",
+ "                button_text = gr.Button('Text to speech', interactive=True)\n",
+ "                button_audio = gr.Button('Convert audio', interactive=True)\n",
+ "    with gr.Row(equal_height=True):\n",
+ "        speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)\n",
+ "\n",
+ "    # actions\n",
+ "    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "    model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "\n",
+ "    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+ "        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],\n",
+ "             outputs=speech).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "\n",
+ "    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+ "        then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "\n",
+ "    gr.HTML(article)\n",
+ "demo.launch(share=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
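
A note on the wiring above (repeated in Coqui.ai.ipynb and app.py below): every dropdown and button is bound as a deactivate, then work, then activate chain, so the controls stay disabled while a model loads or synthesis runs and cannot be triggered twice. A stripped-down sketch of that pattern, not part of the commit, assuming the gradio 3.x pinned in README.md:

```python
# Sketch (not in the commit): the disable/work/re-enable event chain
# the app builds around every button and dropdown.
import gradio as gr

def slow_work(text):
    return text.upper()  # stand-in for a slow TTS call

with gr.Blocks() as demo:
    inp = gr.Textbox()
    out = gr.Textbox()
    btn = gr.Button("Run")
    # Disable the button, run the work, then re-enable it.
    btn.click(lambda: gr.update(interactive=False), None, btn).\
        then(slow_work, inp, out).\
        then(lambda: gr.update(interactive=True), None, btn)

demo.launch()
```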
Coqui.ai.ipynb ADDED
@@ -0,0 +1,329 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "57fc627d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gradio as gr\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torch.nn.functional as F\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from TTS.api import TTS\n",
+ "from TTS.utils.manage import ModelManager"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "a5789dee",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running on local URL: http://127.0.0.1:7864\n",
+ "\n",
+ "To create a public link, set `share=True` in `launch()`.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": []
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > Using model: Tacotron2\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > log_func:np.log10\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:20\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > pitch_fmin:0.0\n",
+ " | > pitch_fmax:640.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:True\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > do_rms_norm:False\n",
+ " | > db_level:None\n",
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
+ " | > base:10\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Model's reduction rate `r` is set to: 2\n",
+ " > Vocoder Model: univnet\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > log_func:np.log10\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:20\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > pitch_fmin:1.0\n",
+ " | > pitch_fmax:640.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:True\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > do_rms_norm:False\n",
+ " | > db_level:None\n",
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
+ " | > base:10\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Generator Model: univnet_generator\n",
+ " > Discriminator Model: univnet_discriminator\n",
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+ "language: \n",
+ "speaker: \n",
+ "Using original voice\n",
+ " > Text splitted to sentences.\n",
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
+ " > Processing time: 24.694000244140625\n",
+ " > Real-time factor: 2.8425842872081772\n"
+ ]
+ }
+ ],
+ "source": [
+ "title = \"\"\n",
+ "description = \"\"\"\"\"\"\n",
+ "article = \"\"\"\"\"\"\n",
+ "\n",
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+ "GPU = device == \"cuda\"\n",
+ "INT16MAX = np.iinfo(np.int16).max\n",
+ "\n",
+ "model_ids = ModelManager(verbose=False).list_models()\n",
+ "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
+ "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "examples_pt = 'examples'\n",
+ "allowed_extensions = ['.mp3', '.wav']\n",
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extensions}\n",
+ "verse = \"\"\"Mary had a little lamb,\n",
+ "Its fleece was white as snow.\n",
+ "Everywhere the child went,\n",
+ "The little lamb was sure to go.\"\"\"\n",
+ "\n",
+ "\n",
+ "\n",
+ "def on_model_tts_select(model_name, tts_var):\n",
+ "    if tts_var is None or tts_var.model_name != model_name:\n",
+ "        print(f'Loading TTS model from {model_name}')\n",
+ "        tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+ "    else:\n",
+ "        print(f'Passing through TTS model {tts_var.model_name}')\n",
+ "    languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
+ "    speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else ['']  # there's weird speaker formatting\n",
+ "    language = languages[0]\n",
+ "    speaker = speakers[0]\n",
+ "    return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
+ "        gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
+ "\n",
+ "\n",
+ "def on_model_vc_select(model_name, vc_var):\n",
+ "    if vc_var is None or vc_var.model_name != model_name:\n",
+ "        print(f'Loading voice conversion model from {model_name}')\n",
+ "        vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+ "    else:\n",
+ "        print(f'Passing through voice conversion model {vc_var.model_name}')\n",
+ "    return vc_var\n",
+ "\n",
+ "\n",
+ "def on_voicedropdown(x):\n",
+ "    return examples[x]\n",
+ "\n",
+ "\n",
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
+ "    if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
+ "        return (16000, np.zeros(0).astype(np.int16))\n",
+ "\n",
+ "    sample_rate = tts_model.synthesizer.output_sample_rate\n",
+ "    if tts_model.is_multi_speaker:\n",
+ "        speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker]  # there's weird speaker formatting\n",
+ "    print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
+ "\n",
+ "    language = None if language == '' else language\n",
+ "    speaker = None if speaker == '' else speaker\n",
+ "    if use_original_voice:\n",
+ "        print('Using original voice')\n",
+ "        speech = tts_model.tts(text, language=language, speaker=speaker)\n",
+ "    elif tts_model.synthesizer.tts_model.speaker_manager:\n",
+ "        print('voice cloning with the tts')\n",
+ "        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
+ "    else:\n",
+ "        print('voice cloning with the voice conversion model')\n",
+ "        speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
+ "\n",
+ "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+ "    return (sample_rate, speech)\n",
+ "\n",
+ "\n",
+ "def voice_clone(vc_model, source_wav, target_wav):\n",
+ "    if vc_model is None or source_wav is None or target_wav is None:\n",
+ "        return (16000, np.zeros(0).astype(np.int16))  # guard before dereferencing vc_model\n",
+ "    print(f'model: {vc_model.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
+ "    sample_rate = vc_model.voice_converter.output_sample_rate\n",
+ "\n",
+ "    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
+ "    speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
+ "    return (sample_rate, speech)\n",
+ "\n",
+ "\n",
+ "with gr.Blocks() as demo:\n",
+ "    tts_model = gr.State(None)\n",
+ "    vc_model = gr.State(None)\n",
+ "    def activate(*args):\n",
+ "        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
+ "    def deactivate(*args):\n",
+ "        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
+ "\n",
+ "    gr.Markdown(description)\n",
+ "\n",
+ "    with gr.Row(equal_height=True):\n",
+ "        with gr.Column(scale=5, min_width=50):\n",
+ "            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)\n",
+ "        with gr.Column(scale=1, min_width=10):\n",
+ "            language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
+ "        with gr.Column(scale=1, min_width=10):\n",
+ "            speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
+ "        with gr.Column(scale=5, min_width=50):\n",
+ "            with gr.Row(equal_height=True):\n",
+ "# model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)\n",
+ "                model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)\n",
+ "\n",
+ "    with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
+ "        gr.Markdown(\"Upload target voice...\")\n",
+ "        with gr.Row(equal_height=True):\n",
+ "            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
+ "            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
+ "\n",
+ "    with gr.Row(equal_height=True):\n",
+ "        with gr.Column(scale=2):\n",
+ "            with gr.Row(equal_height=True):\n",
+ "                with gr.Column():\n",
+ "                    text_to_convert = gr.Textbox(verse)\n",
+ "                    orig_voice = gr.Checkbox(label='Use original voice')\n",
+ "                    voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
+ "            with gr.Row(equal_height=True):\n",
+ "                button_text = gr.Button('Text to speech', interactive=True)\n",
+ "                button_audio = gr.Button('Convert audio', interactive=True)\n",
+ "    with gr.Row(equal_height=True):\n",
+ "        speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)\n",
+ "\n",
+ "    # actions\n",
+ "    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "    model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "\n",
+ "    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
+ "        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],\n",
+ "             outputs=speech).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "\n",
+ "    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
+ "        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
+ "        then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\\\n",
+ "        then(activate, [button_text, button_audio], [button_text, button_audio])\n",
+ "\n",
+ "    gr.HTML(article)\n",
+ "demo.launch(share=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
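
For reference, the core synthesis path that the notebook's text_to_speech() wraps reduces to a few TTS.api calls. A minimal sketch, not part of the commit, assuming the Coqui TTS package from requirements.txt is installed and the model can be downloaded:

```python
# Sketch: synthesize with a Coqui model and convert to int16 PCM for gr.Audio.
import numpy as np
from TTS.api import TTS

INT16MAX = np.iinfo(np.int16).max

tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC_ph",
          progress_bar=False, gpu=False)
wav = tts.tts("Mary had a little lamb.")           # float samples in [-1, 1]
sample_rate = tts.synthesizer.output_sample_rate   # 22050 for this model
pcm = (np.array(wav) * INT16MAX).astype(np.int16)  # what the app returns as (sample_rate, pcm)
```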
README.md CHANGED
@@ -1,12 +1,6 @@
---
title: Coqui.ai
- emoji: 🏆
- colorFrom: blue
- colorTo: blue
+ app_file: app.py
sdk: gradio
sdk_version: 3.33.1
- app_file: app.py
- pinned: false
---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,160 @@
+ import gradio as gr
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from pathlib import Path
+
+ from TTS.api import TTS
+ from TTS.utils.manage import ModelManager
+
+
+ title = ""
+ description = """"""
+ article = """"""
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ GPU = device == "cuda"
+ INT16MAX = np.iinfo(np.int16).max
+
+ model_ids = ModelManager(verbose=False).list_models()
+ model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
+ model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
+ model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
+ examples_pt = 'examples'
+ allowed_extensions = ['.mp3', '.wav']
+ examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extensions}
+ verse = """Mary had a little lamb,
+ Its fleece was white as snow.
+ Everywhere the child went,
+ The little lamb was sure to go."""
+
+
+
+ def on_model_tts_select(model_name, tts_var):
+     if tts_var is None or tts_var.model_name != model_name:
+         print(f'Loading TTS model from {model_name}')
+         tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
+     else:
+         print(f'Passing through TTS model {tts_var.model_name}')
+     languages = tts_var.languages if tts_var.is_multi_lingual else ['']
+     speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else ['']  # there's weird speaker formatting
+     language = languages[0]
+     speaker = speakers[0]
+     return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\
+         gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)
+
+
+ def on_model_vc_select(model_name, vc_var):
+     if vc_var is None or vc_var.model_name != model_name:
+         print(f'Loading voice conversion model from {model_name}')
+         vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
+     else:
+         print(f'Passing through voice conversion model {vc_var.model_name}')
+     return vc_var
+
+
+ def on_voicedropdown(x):
+     return examples[x]
+
+
+ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
+     if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
+         return (16000, np.zeros(0).astype(np.int16))
+
+     sample_rate = tts_model.synthesizer.output_sample_rate
+     if tts_model.is_multi_speaker:
+         speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]  # there's weird speaker formatting
+     print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')
+
+     language = None if language == '' else language
+     speaker = None if speaker == '' else speaker
+     if use_original_voice:
+         print('Using original voice')
+         speech = tts_model.tts(text, language=language, speaker=speaker)
+     elif tts_model.synthesizer.tts_model.speaker_manager:
+         print('voice cloning with the tts')
+         speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
+     else:
+         print('voice cloning with the voice conversion model')
+         speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
+
+     speech = (np.array(speech) * INT16MAX).astype(np.int16)
+     return (sample_rate, speech)
+
+
+ def voice_clone(vc_model, source_wav, target_wav):
+     if vc_model is None or source_wav is None or target_wav is None:
+         return (16000, np.zeros(0).astype(np.int16))  # guard before dereferencing vc_model
+     print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
+     sample_rate = vc_model.voice_converter.output_sample_rate
+
+     speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+     speech = (np.array(speech) * INT16MAX).astype(np.int16)
+     return (sample_rate, speech)
+
+
+ with gr.Blocks() as demo:
+     tts_model = gr.State(None)
+     vc_model = gr.State(None)
+     def activate(*args):
+         return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
+     def deactivate(*args):
+         return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
+
+     gr.Markdown(description)
+
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=5, min_width=50):
+             model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)
+         with gr.Column(scale=1, min_width=10):
+             language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
+         with gr.Column(scale=1, min_width=10):
+             speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
+         with gr.Column(scale=5, min_width=50):
+             with gr.Row(equal_height=True):
+ # model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)
+                 model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)
+
+     with gr.Accordion("Target voice", open=False) as accordion:
+         gr.Markdown("Upload target voice...")
+         with gr.Row(equal_height=True):
+             voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
+             voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)
+
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=2):
+             with gr.Row(equal_height=True):
+                 with gr.Column():
+                     text_to_convert = gr.Textbox(verse)
+                     orig_voice = gr.Checkbox(label='Use original voice')
+                     voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
+             with gr.Row(equal_height=True):
+                 button_text = gr.Button('Text to speech', interactive=True)
+                 button_audio = gr.Button('Convert audio', interactive=True)
+     with gr.Row(equal_height=True):
+         speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)
+
+     # actions
+     model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+     model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+     voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+
+     button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
+         then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
+              outputs=speech).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+
+     button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
+         then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+
+     gr.HTML(article)
+ demo.launch(share=False)
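
The voice_clone() path above similarly reduces to a single voice_conversion() call. A minimal sketch, not part of the commit; the FreeVC model id and the example file pairing are illustrative (the app selects model_vc_ids[0] at runtime):

```python
# Sketch: convert the voice in source_wav to sound like target_wav.
import numpy as np
from TTS.api import TTS

vc = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24",
         progress_bar=False, gpu=False)
wav = vc.voice_conversion(source_wav="examples/arctic_a0023_bdl.wav",
                          target_wav="examples/arctic_a0023_slt.wav")
pcm = (np.array(wav) * np.iinfo(np.int16).max).astype(np.int16)
```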
examples/arctic_a0023_bdl.wav ADDED
Binary file (168 kB)
examples/arctic_a0023_clb.wav ADDED
Binary file (189 kB)
examples/arctic_a0023_rms.wav ADDED
Binary file (172 kB)
examples/arctic_a0023_slt.wav ADDED
Binary file (153 kB)
examples/arctic_a0366_bdl.wav ADDED
Binary file (166 kB)
examples/arctic_a0366_rms.wav ADDED
Binary file (184 kB)
examples/arctic_a0407_bdl.wav ADDED
Binary file (183 kB)
examples/arctic_a0407_clb.wav ADDED
Binary file (200 kB)
examples/arctic_a0407_rms.wav ADDED
Binary file (216 kB)
examples/arctic_a0407_slt.wav ADDED
Binary file (171 kB)
examples/arctic_b0496_clb.wav ADDED
Binary file (192 kB)
examples/arctic_b0496_slt.wav ADDED
Binary file (171 kB)
examples/henry5.mp3 ADDED
Binary file (375 kB)
examples/hmm_i_dont_know.wav ADDED
Binary file (203 kB)
examples/see_in_eyes.wav ADDED
Binary file (65.2 kB)
examples/yearn_for_time.mp3 ADDED
Binary file (56.3 kB)
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ TTS
+ numpy==1.21.6;python_version<"3.10"
+ numpy;python_version=="3.10"
+ cython==0.29.28
+ scipy>=1.4.0
+ torch>=1.7
+ torchaudio
+ soundfile
+ librosa==0.10.0.*
+ numba==0.55.1;python_version<"3.9"
+ numba==0.56.4;python_version>="3.9"
+ inflect==5.6.0
+ tqdm
+ anyascii
+ pyyaml
+ fsspec>=2021.04.0
+ aiohttp
+ packaging
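
As a quick sanity check that the pins above resolved in the running environment, something like the following works (a sketch, not part of the commit; importlib.metadata needs Python 3.8+, whereas the notebooks above report 3.7.9, where pkg_resources would be the fallback):

```python
# Sketch: print the installed versions of the pinned packages.
from importlib.metadata import version, PackageNotFoundError

for pkg in ("TTS", "numpy", "librosa", "numba", "inflect", "gradio"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")
```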