Delete app.bak.py
Browse files- app.bak.py +0 -160
app.bak.py
DELETED
@@ -1,160 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import numpy as np
|
3 |
-
import torch
|
4 |
-
import torch.nn.functional as F
|
5 |
-
from pathlib import Path
|
6 |
-
|
7 |
-
from TTS.api import TTS
|
8 |
-
from TTS.utils.manage import ModelManager
|
9 |
-
|
10 |
-
|
11 |
-
# Static page text used by the UI below (all empty strings here).
title = ""
description = ""
article = ""

# Run on CUDA when torch reports it as available, otherwise on the CPU.
GPU = torch.cuda.is_available()
device = "cuda" if GPU else "cpu"

# Largest int16 value; used to scale float waveforms to 16-bit PCM.
INT16MAX = np.iinfo(np.int16).max

# Every model id known to the model manager, then split by model family.
# Only English and multilingual (or, for vocoders, universal) ids are kept.
model_ids = ModelManager(verbose=False).list_models()
model_tts_ids = [
    model_id for model_id in model_ids
    if 'tts_models' in model_id
    and any(tag in model_id for tag in ('/multilingual/', '/en/'))
]
model_voc_ids = [
    model_id for model_id in model_ids
    if 'vocoder_models' in model_id
    and any(tag in model_id for tag in ('/universal/', '/en/'))
]
model_vc_ids = [
    model_id for model_id in model_ids
    if 'voice_conversion_models' in model_id
    and any(tag in model_id for tag in ('/multilingual/', '/en/'))
]

# Example target voices: file name -> path, for audio files found directly
# inside the examples directory.
examples_pt = 'examples'
allowed_extentions = ['.mp3', '.wav']
examples = dict(
    (candidate.name, candidate)
    for candidate in Path(examples_pt).glob('*')
    if candidate.suffix in allowed_extentions
)

# Default text offered in the text-to-speech input box.
verse = "\n".join((
    "Mary had a little lamb,",
    "Its fleece was white as snow.",
    "Everywhere the child went,",
    "The little lamb was sure to go.",
))
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
def on_model_tts_select(model_name, tts_var):
    """Make sure the TTS model for ``model_name`` is loaded and refresh the UI.

    ``tts_var`` is reused when its ``model_name`` already equals
    ``model_name``; otherwise a new ``TTS`` object is constructed.

    Returns a 3-tuple: the model, a ``gr.update`` for the language dropdown
    and a ``gr.update`` for the speaker dropdown. Each dropdown is only
    interactive when the model reports multiple languages / speakers;
    otherwise it gets the single choice ``''``.
    """
    already_loaded = tts_var is not None and tts_var.model_name == model_name
    if already_loaded:
        print(f'Passing through TTS model {tts_var.model_name}')
    else:
        print(f'Loading TTS model from {model_name}')
        tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)

    if tts_var.is_multi_lingual:
        languages = tts_var.languages
    else:
        languages = ['']

    if tts_var.is_multi_speaker:
        # Speaker names may contain newline characters; show them as '-n'.
        speakers = [name.replace('\n', '-n') for name in tts_var.speakers]
    else:
        speakers = ['']

    language_update = gr.update(
        choices=languages,
        value=languages[0],
        interactive=tts_var.is_multi_lingual,
    )
    speaker_update = gr.update(
        choices=speakers,
        value=speakers[0],
        interactive=tts_var.is_multi_speaker,
    )
    return tts_var, language_update, speaker_update
|
45 |
-
|
46 |
-
|
47 |
-
def on_model_vc_select(model_name, vc_var):
    """Return a voice conversion model for ``model_name``.

    ``vc_var`` is returned unchanged when its ``model_name`` already equals
    ``model_name``; otherwise a new ``TTS`` object is constructed for it.
    """
    if vc_var is not None and vc_var.model_name == model_name:
        print(f'Passing through voice conversion model {vc_var.model_name}')
        return vc_var

    print(f'Loading voice conversion model from {model_name}')
    return TTS(model_name=model_name, progress_bar=False, gpu=GPU)
|
54 |
-
|
55 |
-
|
56 |
-
def on_voicedropdown(x):
    """Return the entry of ``examples`` stored under the dropdown value ``x``.

    Raises ``KeyError`` when ``x`` is not a key of ``examples``.
    """
    selected_example = examples[x]
    return selected_example
|
58 |
-
|
59 |
-
|
60 |
-
def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
    """Synthesize ``text`` and return it as ``(sample_rate, int16 samples)``.

    Parameters:
        text: text to speak; ``None`` or whitespace-only yields empty audio.
        tts_model: loaded TTS object, or ``None`` (yields empty audio).
        language: language choice from the UI; ``''`` means "not applicable".
        speaker: speaker choice as displayed in the UI (newlines shown as
            ``'-n'``); ``''`` means "not applicable".
        target_wav: reference audio for voice cloning, or ``None``.
        use_original_voice: when true, speak with the model's own speaker
            instead of cloning ``target_wav``.

    Returns:
        A ``(sample_rate, numpy int16 array)`` tuple. When there is nothing
        to synthesize (no text, no model, or neither a target voice nor the
        original-voice option) the array is empty and the rate is 16000.

    Raises:
        KeyError: for a multi-speaker model when ``speaker`` is not one of
            the model's (display-formatted) speaker names.
    """
    no_text = text is None or not text.strip()
    no_voice = target_wav is None and not use_original_voice
    if no_text or tts_model is None or no_voice:
        return (16000, np.zeros(0).astype(np.int16))

    sample_rate = tts_model.synthesizer.output_sample_rate
    if tts_model.is_multi_speaker:
        # Map the display name back to the model's real speaker name
        # (the UI shows newline characters as '-n').
        speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]
    print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')

    # The UI uses '' for "no choice"; the model is given None in that case.
    language = None if language == '' else language
    speaker = None if speaker == '' else speaker
    if use_original_voice:
        print('Using original voice')
        speech = tts_model.tts(text, language=language, speaker=speaker)
    elif tts_model.synthesizer.tts_model.speaker_manager:
        print('voice cloning with the tts')
        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
    else:
        print('voice cloning with the voice conversion model')
        speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)

    # Clip before scaling so samples slightly outside [-1, 1] saturate
    # instead of overflowing the int16 range.
    speech = np.clip(np.array(speech), -1.0, 1.0)
    speech = (speech * INT16MAX).astype(np.int16)
    return (sample_rate, speech)
|
83 |
-
|
84 |
-
|
85 |
-
def voice_clone(vc_model, source_wav, target_wav):
    """Convert ``source_wav`` to the voice of ``target_wav``.

    Parameters:
        vc_model: loaded voice conversion model, or ``None``.
        source_wav: audio whose speech content is kept, or ``None``.
        target_wav: audio providing the target voice, or ``None``.

    Returns:
        A ``(sample_rate, numpy int16 array)`` tuple. The array is empty
        when any input is missing; without a model the rate falls back to
        16000, otherwise it is the model's output sample rate.
    """
    # Check the model before touching its attributes: a missing model must
    # produce empty audio rather than an AttributeError.
    if vc_model is None:
        return (16000, np.zeros(0).astype(np.int16))

    print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
    sample_rate = vc_model.voice_converter.output_sample_rate
    if source_wav is None or target_wav is None:
        return (sample_rate, np.zeros(0).astype(np.int16))

    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
    # Clip before scaling so samples slightly outside [-1, 1] saturate
    # instead of overflowing the int16 range.
    speech = np.clip(np.array(speech), -1.0, 1.0)
    speech = (speech * INT16MAX).astype(np.int16)
    return (sample_rate, speech)
|
94 |
-
|
95 |
-
|
96 |
-
# NOTE(review): the container nesting below was reconstructed from a listing
# that had lost its indentation -- confirm the layout against the running app.
with gr.Blocks() as demo:
    # State holders for the currently loaded models (None until first load).
    tts_model = gr.State(None)
    vc_model = gr.State(None)

    def activate(*args):
        """Return one interactive=True update per component passed in."""
        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)

    def deactivate(*args):
        """Return one interactive=False update per component passed in."""
        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)

    # Pre-selected models: keep the original picks (4th TTS id, 1st VC id)
    # when they exist, but do not crash when the filtered lists are shorter.
    default_tts_id = model_tts_ids[min(3, len(model_tts_ids) - 1)] if model_tts_ids else None
    default_vc_id = model_vc_ids[0] if model_vc_ids else None

    gr.Markdown(description)

    # Model selection row.
    with gr.Row(equal_height=True):
        with gr.Column(scale=5, min_width=50):
            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=default_tts_id, label='Text-to-speech model', interactive=True)
        with gr.Column(scale=1, min_width=10):
            language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
        with gr.Column(scale=1, min_width=10):
            speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
        with gr.Column(scale=5, min_width=50):
            with gr.Row(equal_height=True):
                # model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)
                model_vc_dropdown = gr.Dropdown(model_vc_ids, value=default_vc_id, label='Voice conversion model', interactive=True)

    # Target voice: either an uploaded file or one of the bundled examples.
    with gr.Accordion("Target voice", open=False) as accordion:
        gr.Markdown("Upload target voice...")
        with gr.Row(equal_height=True):
            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)

    # Inputs, action buttons and the output player.
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            with gr.Row(equal_height=True):
                with gr.Column():
                    text_to_convert = gr.Textbox(verse)
                    orig_voice = gr.Checkbox(label='Use original voice')
                voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
            with gr.Row(equal_height=True):
                button_text = gr.Button('Text to speech', interactive=True)
                button_audio = gr.Button('Convert audio', interactive=True)
        with gr.Row(equal_height=True):
            speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)

    # actions
    # Every chain disables both buttons first and re-enables them last, so
    # they cannot be clicked while a model is loading or audio is generated.
    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    # The click handlers run the model-select step again before generating,
    # so the selected model is loaded even if the dropdown never changed.
    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
             outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
        then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    gr.HTML(article)
demo.launch(share=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|