Upload 2 files
- gradio_gui_6lang_blocks.py +180 -0
- syn_hifigan.py +42 -23
gradio_gui_6lang_blocks.py
ADDED
@@ -0,0 +1,180 @@
+import gradio as gr
+import os
+import tempfile
+
+import syn_hifigan as syn
+#import syn_vgan as syn
+#import syn_k_univnet_multi as syn
+
+description_text = """
+# Multilingual TTS for Sámi languages (+ Finnish and Estonian)
+
+Welcome! This is a demonstration of a multilingual, multi-speaker Text-to-Speech (TTS) model.
+The demo is related to research on TTS for low-resource languages and the effect of augmenting
+the training data with areally close languages.
+
+Disclaimers:
+For convenience, the demo uses a pretrained HiFi-GAN vocoder, which does not work well with male voices.
+English does not work well due to the small dataset and orthographic transcriptions.
+Use the demo just for testing, not for frequent or commercial use.
+"""
+
+speakers = {
+    "aj(sma)": 2,
+    "am(sme)": 3,
+    "ms(sme)": 4,
+    "ln(sme)": 5,
+    "mu(smj)": 7,
+    "sa(smj)": 8,
+    "bi(smj)": 10,
+    "css(fin)": 11,
+    "ti(fin)": 13,
+    "ta(fin)": 14,
+    "liivika(est)": 15,
+    "indek(est)": 16,
+    "kylli(est)": 17,
+    "andreas(est)": 18,
+    "peeter(est)": 19,
+    "kersti(est)": 20,
+    "M6670(eng)": 21,
+    "M6097(eng)": 22,
+    "F92(eng)": 23,
+    "F9136(eng)": 24
+}
+
+# Speaker mean pitches in Hz (currently unused; see the commented-out
+# mean_pitch argument in speak() below).
+mean_pitch = {
+    "aj0": 130,
+    "aj1": 130,
+    "am": 120,
+    "ms": 120,
+    "ln": 120,
+    "lo": 120,
+    "mu": 120,
+    "sa": 120,
+    "kd": 120,
+    "bi": 120,
+    "ti": 130,
+    "ta": 115,
+    "liivika": 120,
+    "indek": 90,
+    "kylli": 140,
+    "andreas": 100,
+    "peeter": 80,
+    "kersti": 120
+}
+
+# Language IDs; -1 means "guess" via word-level language identification.
+languages = {
+    "guess": -1,
+    "South Sámi": 0,
+    "North Sámi": 1,
+    "Lule Sámi": 2,
+    "Finnish": 3,
+    "Estonian": 4,
+    "English": 5
+}
+
+# Default prompts per language.
+default_prompts = {
+    "guess": "Sáhtta go esso-burgera luohti, Koskenkorva dahje carpool karajoiki gádjut árgabeaivveluođi?",
+    "North Sámi": "Riektačállinreaidduid lassin Divvun-joavkkus ovdanit dál maiddái hállanteknologiijareaidduid.",
+    "South Sámi": "Buerie aerede gaajhkesh dovnesh jïh buerie båeteme dan bæjhkoehtæmman.",
+    "Lule Sámi": "Sáme hållamsyntiesaj baktu máhttá adnegoahtet sáme gielajt ådå aktijvuodajn.",
+    "Finnish": "Joka kuuseen kurkottaa, se katajaan kapsahtaa.",
+    "Estonian": "Aprilli lõpp pani aiapidajate kannatuse jälle proovile – pärast mõnepäevast sooja saabub ootamatu külmalaine.",
+    "English": "This obscure language is not supported by this model."
+}
+
+public = False
+
+tempdir = tempfile.gettempdir()
+
+tts = syn.Synthesizer()
+
+
+def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):
+    # Text frontend not implemented yet; normalize ellipses by hand.
+    text = text.replace("...", "…")
+
+    # "guess" enables word-level language identification in the synthesizer.
+    use_lid = (language == "guess")
+
+    audio = tts.speak(text, output_file=f'{tempdir}/tmp', lang=languages[language],
+                      spkr=speakers[speaker], l_weight=l_weight, s_weight=s_weight,
+                      pace=pace, clarity=postfilter, guess_lang=use_lid)  # , mean_pitch=mean_pitch[speaker])
+
+    # When running locally, also play the audio with SoX.
+    if not public:
+        try:
+            os.system("play " + tempdir + "/tmp.wav &")
+        except OSError:
+            pass
+
+    return (22050, audio)
+
+
+def update_text_prompt(language):
+    """
+    Updates the text in the textbox to the default prompt for the selected language.
+    """
+    prompt = default_prompts.get(language, "")  # the prompt, or an empty string if not found
+    return gr.Textbox(value=prompt)
+
+
+with gr.Blocks() as tts_gui:
+    gr.Markdown(description_text)
+    with gr.Row():
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(label="Text", value=default_prompts["North Sámi"])
+            language_dd = gr.Dropdown(list(languages.keys()), label="Language", value="North Sámi")
+            speaker_dd = gr.Dropdown(list(speakers.keys()), label="Voice", value="ms(sme)")
+
+            with gr.Row():
+                l_weight_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="Language Weight")
+                s_weight_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="Speaker Weight")
+
+            pace_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speech Rate")
+            postfilter_slider = gr.Slider(minimum=0., maximum=2, step=0.05, value=1.0, label="Post-processing")
+
+        with gr.Column(scale=1):
+            speak_button = gr.Button("Speak", variant="primary")
+            audio_output = gr.Audio(label="Output")
+
+    # Update the text box whenever the language selection changes.
+    language_dd.change(
+        fn=update_text_prompt,
+        inputs=[language_dd],
+        outputs=[text_input]
+    )
+
+    speak_button.click(
+        fn=speak,
+        inputs=[
+            text_input,
+            language_dd,
+            speaker_dd,
+            l_weight_slider,
+            s_weight_slider,
+            pace_slider,
+            postfilter_slider
+        ],
+        outputs=[audio_output]
+    )
+
+
+if __name__ == "__main__":
+    tts_gui.launch(share=public)
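Note on the callback contract: speak() returns a (sample_rate, waveform) tuple, which gr.Audio accepts directly as an output value. A minimal self-contained sketch of that contract, with a synthetic sine wave standing in for the synthesizer (speak_stub is hypothetical; syn_hifigan is not needed here):

import numpy as np
import gradio as gr

SR = 22050  # matches the model's sampling rate

def speak_stub(text):
    # Stand-in for tts.speak(): a one-second 440 Hz tone as float32.
    t = np.linspace(0, 1.0, SR, endpoint=False)
    audio = 0.1 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
    return (SR, audio)  # gr.Audio accepts a (sample_rate, ndarray) tuple

with gr.Blocks() as demo:
    txt = gr.Textbox(label="Text")
    out = gr.Audio(label="Output")
    gr.Button("Speak").click(fn=speak_stub, inputs=[txt], outputs=[out])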
syn_hifigan.py
CHANGED
@@ -15,17 +15,17 @@ from scipy.io.wavfile import write
 from torch.nn.utils.rnn import pad_sequence
 #import style_controller
 from common.utils import load_wav_to_torch
-
+from langid.langid import WordLid

 from common import utils, layers

 from common.text.text_processing import TextProcessing
-
+from collections import Counter

 import os
 #os.environ["CUDA_VISIBLE_DEVICES"]=""
-
-device = "cpu"
+device = "cuda:0"
+#device = "cpu"

 vocoder = "hifigan"
 SHARPEN = True
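Hard-coding device = "cuda:0" will raise at the first .to(device) call on CPU-only hosts (e.g. free Spaces hardware). A guarded alternative, offered as a suggestion rather than as part of the commit:

import torch

# Prefer the first GPU, but fall back to CPU when CUDA is unavailable.
device = "cuda:0" if torch.cuda.is_available() else "cpu"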
@@ -53,12 +53,12 @@ def parse_args(parser):
 parser.add_argument('--cudnn-benchmark', action='store_true',
                     help='Enable cudnn benchmark mode')

-#parser.add_argument('--fastpitch', type=str, default='
-
-
-
-
-
+#parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt',
+#                    help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
+#parser.add_argument('--fastpitch', type=str, default='output_uralic/FastPitch_checkpoint_200.pt',
+#                    help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
+parser.add_argument('--fastpitch', type=str, default='output_6lang/FastPitch_checkpoint_50.pt',
+                    help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
 parser.add_argument('-d', '--denoising-strength', default=0.01, type=float,
                     help='WaveGlow denoising')
 parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
@@ -83,14 +83,14 @@ def parse_args(parser):
 text_processing.add_argument('--text-cleaners', nargs='*',
                              default=['basic_cleaners'], type=str,
                              help='Type of text cleaners for input text')
-text_processing.add_argument('--symbol-set', type=str, default='all_sami', #################
+text_processing.add_argument('--symbol-set', type=str, default='uralic', #'all_sami', #################
                              help='Define symbol set for input text')

 cond = parser.add_argument_group('conditioning on additional attributes')

-cond.add_argument('--n-speakers', type=int, default=10,
+cond.add_argument('--n-speakers', type=int, default=30, #10
                   help='Number of speakers in the model.')
-cond.add_argument('--n-languages', type=int, default=3,
+cond.add_argument('--n-languages', type=int, default=6, #3
                   help='Number of languages in the model.')

 return parser
@@ -192,7 +192,7 @@ class Synthesizer:
 self.vocoder, voc_train_setup = self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model)
 self.denoiser = Denoiser(self.vocoder, device=device)  #, win_length=self.args.win_length).to(device)
 self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0)
-
+self.lid = WordLid("langid/lang_id_model_q.bin")


 def unsharp_mask(self, img, radius=1, amount=1):
@@ -200,12 +200,31 @@ class Synthesizer:
     sharpened = img + amount * (img - blurred)
     return sharpened

-def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1):
+def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1, guess_lang=True):

+    text = " " + text + " "
+
+    if guess_lang:
+        # Word-level language ID: one language label per input token.
+        lang = self.lid.get_lang_array(text)
+        main_lang = Counter(lang).most_common(1)[0][0]
+
+        lang = torch.tensor(lang).to(device)
+        # Full weight for the majority language, half weight for the rest.
+        lang_weight = torch.zeros(len(lang))
+        lang_weight[:] = l_weight
+        lang_weight[lang != main_lang] = 0.5 * l_weight
+
     text = self.tp.encode_text(text)

+    if not guess_lang:
+        lang = torch.tensor(lang).to(device)
+    else:
+        if len(text) != len(lang):
+            print("text length not equal to language list length!")
+            lang = lang[0]
+            l_weight = l_weight[0]
+
     text = torch.LongTensor([text]).to(device)

     for p in [0]:

         with torch.no_grad():
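The guess_lang path above takes a majority vote over the per-token language predictions and halves the language-embedding weight of tokens that disagree with it. The core of that scheme, isolated as a runnable sketch (the ID sequence here is made up; WordLid itself ships with the project's langid package and is not reproduced):

from collections import Counter
import torch

l_weight = 1.0
lang = [3, 3, 3, 4, 3]  # hypothetical per-token language IDs: mostly 3, one 4

main_lang = Counter(lang).most_common(1)[0][0]  # majority vote -> 3

lang_t = torch.tensor(lang)
lang_weight = torch.full((len(lang),), l_weight)
lang_weight[lang_t != main_lang] = 0.5 * l_weight  # halve minority tokens

print(main_lang)    # 3
print(lang_weight)  # tensor([1.0000, 1.0000, 1.0000, 0.5000, 1.0000])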
@@ -216,8 +235,8 @@ class Synthesizer:
 mel_np = mel.float().data.cpu().numpy()[0]
 tgt_min = -11
-tgt_max = 1.
+tgt_max = 1.5
 #print(np.min(mel_np), np.max(mel_np))
 mel_np = self.unsharp_mask(mel_np, radius=0.5, amount=0.5)
 mel_np = self.unsharp_mask(mel_np, radius=3, amount=.05)
 # mel_np = self.unsharp_mask(mel_np, radius=7, amount=0.05)
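unsharp_mask sharpens the predicted mel spectrogram the way image editors do: subtract a blurred copy and add the difference back, scaled by amount (sharpened = img + amount * (img - blurred), as in the context lines above). A minimal NumPy/SciPy rendering of the same formula; gaussian_filter is a stand-in, since the diff does not show which blur the class uses internally:

import numpy as np
from scipy.ndimage import gaussian_filter

def unsharp_mask(img, radius=1.0, amount=1.0):
    # Classic unsharp masking: emphasize detail by subtracting a blur.
    blurred = gaussian_filter(img, sigma=radius)
    return img + amount * (img - blurred)

mel = np.random.randn(80, 200)                   # stand-in 80-bin mel spectrogram
mel = unsharp_mask(mel, radius=0.5, amount=0.5)  # fine detail, as in speak()
mel = unsharp_mask(mel, radius=3, amount=0.05)   # broader spectral contours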
@@ -239,7 +258,7 @@ class Synthesizer:
         sharpened[i, :] += (i - 40) * 0.01
     mel[0] = torch.from_numpy(sharpened).float().to(device)

-
+    """
     with torch.no_grad():

         y_g_hat = self.vocoder(mel).float() ###########
@@ -252,7 +271,7 @@ class Synthesizer:
     write(output_file + ".wav", 22050, audio)

-    os.system("play -q " + output_file + ".wav")
+    #os.system("play -q " + output_file + ".wav")
     return audio

@@ -280,8 +299,8 @@ if __name__ == '__main__':
     text = input(">")
     text1 = text.split(" ")
-    syn.speak(text, output_file="/tmp/tmp.wav", spkr=
-    syn.speak(text, output_file="/tmp/tmp.wav", spkr=
+    syn.speak(text, output_file="/tmp/tmp.wav", spkr=14, lang=4)
+    syn.speak(text, output_file="/tmp/tmp.wav", spkr=14, lang=4, guess_lang=False)
     continue
     for s in range(1, 10):
         for l in range(3): ##