asuni committed on
Commit
4f54df3
·
verified ·
1 Parent(s): 7a10627

Upload 2 files

Browse files
Files changed (2) hide show
  1. gradio_gui_6lang_blocks.py +180 -0
  2. syn_hifigan.py +42 -23
gradio_gui_6lang_blocks.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+
5
+ import syn_hifigan as syn
6
+ #import syn_vgan as syn
7
+ #import syn_k_univnet_multi as syn
8
+
9
# Markdown shown at the top of the demo page (rendered with gr.Markdown).
# Only the second disclaimer sentence differs from the earlier wording: it was
# missing its verb ("English does not well").
description_text = """
# Multilingual TTS for Sámi languages (+ Finnish and Estonian)

Welcome! This is a demonstration of a multi-lingual and multi-speaker Text-to-Speech (TTS) model.
The demo is related to research on TTS for low-resource languages, and the effect of augmenting the training data with
areally close languages.


Disclaimers:
For convenience, the demo uses pretrained HiFi-GAN vocoder which doesn't work well with male voices.
English does not work well due to small dataset and orthographic transcriptions. Use the demo just for testing, not for frequent or commercial use.



"""
24
# Voice label shown in the "Voice" dropdown -> speaker id passed to the
# synthesizer as ``spkr``. Labels follow the pattern ``name(language-code)``.
speakers = {
    "aj(sma)": 2,
    "am(sme)": 3,
    "ms(sme)": 4,
    "ln(sme)": 5,
    "mu(smj)": 7,
    "sa(smj)": 8,
    "bi(smj)": 10,  # label previously lacked its closing parenthesis
    "css(fin)": 11,
    "ti(fin)": 13,
    "ta(fin)": 14,
    "liivika(est)": 15,
    "indek(est)": 16,
    "kylli(est)": 17,
    "andreas(est)": 18,
    "peeter(est)": 19,
    "kersti(est)": 20,
    "M6670(eng)": 21,
    "M6097(eng)": 22,
    "F92(eng)": 23,
    "F9136(eng)": 24,
}

# Per-speaker mean pitch (presumably in Hz -- TODO confirm). The only reference
# to this table in speak() is commented out, and its keys do not use the
# ``name(lang)`` labels of ``speakers``.
mean_pitch = {
    "aj0": 130,
    "aj1": 130,
    "am": 120,
    "ms": 120,
    "ln": 120,
    "lo": 120,
    "mu": 120,
    "sa": 120,
    "kd": 120,
    "bi": 120,
    "ti": 130,
    "ta": 115,
    "liivika": 120,
    "indek": 90,
    "kylli": 140,
    "andreas": 100,
    "peeter": 80,
    "kersti": 120,
}

# Language label shown in the "Language" dropdown -> language id passed to the
# synthesizer as ``lang``. "guess" additionally makes speak() request language
# identification, so its id (-1) is only a placeholder.
languages = {
    "guess": -1,
    "South Sámi": 0,  # South
    "North Sámi": 1,  # North
    "Lule Sámi": 2,  # Lule
    "Finnish": 3,
    "Estonian": 4,
    "English": 5,
}

# Default text placed in the textbox when a language is selected; keyed by the
# same labels as ``languages``.
default_prompts = {
    "guess": "Sáhtta go esso-burgera luohti, Koskenkorva dahje carpool karajoiki gádjut árgabeaivveluođi?",

    "North Sámi": "Riektačállinreaidduid lassin Divvun-joavkkus ovdanit dál maiddái hállanteknologiijareaidduid.",

    "South Sámi": " Buerie aerede gaajhkesh dovnesh jïh buerie båeteme dan bæjhkoehtæmman.",  # Guktie datnine?
    "Lule Sámi": "Sáme hållamsyntiesaj baktu máhttá adnegoahtet sáme gielajt ådå aktijvuodajn.",

    "Finnish": "Joka kuuseen kurkottaa, se katajaan kapsahtaa.",
    "Estonian": "Aprilli lõpp pani aiapidajate kannatuse jälle proovile – pärast mõnepäevast sooja saabub ootamatu külmalaine.",

    "English": "This obscure language is not supported by this model.",
}
92
+
93
+
94
# When False, speak() also plays each synthesized file on the local machine,
# and the app is launched with share=False.
public = False

# Directory that receives the synthesizer's scratch output file.
tempdir = tempfile.gettempdir()

# Synthesizer instance used by speak(); constructed once, at import time.
tts = syn.Synthesizer()
99
+
100
+
101
def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):  # pitch_shift, pitch_std):
    """Synthesize ``text`` and return it as a ``(sample_rate, audio)`` pair.

    Args:
        text: Text to synthesize.
        language: A key of ``languages``. ``"guess"`` asks the synthesizer to
            identify the language itself (``guess_lang=True``).
        speaker: A key of ``speakers``.
        l_weight: Language weight, forwarded as ``l_weight``.
        s_weight: Speaker weight, forwarded as ``s_weight``.
        pace: Speech rate, forwarded as ``pace``.
        postfilter: Post-processing amount, forwarded as ``clarity``.

    Returns:
        ``(22050, audio)``, where ``audio`` is whatever ``tts.speak`` returns.

    Raises:
        KeyError: If ``language`` or ``speaker`` is not a known key.
    """
    import subprocess

    # Text frontend not implemented; only map three dots to an ellipsis.
    text = text.replace("...", "…")
    use_lid = language == "guess"

    # The synthesizer is given the path without extension; playback below
    # expects the result at "<output_base>.wav".
    output_base = f"{tempdir}/tmp"
    audio = tts.speak(
        text,
        output_file=output_base,
        lang=languages[language],
        spkr=speakers[speaker],
        l_weight=l_weight,
        s_weight=s_weight,
        pace=pace,
        clarity=postfilter,
        guess_lang=use_lid,
    )  # , mean_pitch=mean_pitch[speaker])

    if not public:
        # Best-effort local playback. Popen returns immediately (the shell
        # command used "&" for the same purpose) and the argument list avoids
        # building a shell string. os.system never raised on failure, so the
        # old bare ``except`` could not catch anything; here a missing or
        # unlaunchable ``play`` binary surfaces as OSError and is skipped.
        try:
            subprocess.Popen(["play", output_base + ".wav"])
        except OSError:
            pass

    return (22050, audio)
122
+
123
+ # update the text box based on language selection
124
def update_text_prompt(language):
    """Build the textbox update for a newly selected language.

    The textbox value becomes the entry of ``default_prompts`` for
    ``language``, or an empty string when there is no such entry.
    """
    if language in default_prompts:
        new_value = default_prompts[language]
    else:
        new_value = ""
    return gr.Textbox(value=new_value)
130
+
131
+
132
+ #
133
# NOTE(review): the nesting below is reconstructed from the statement order;
# confirm the placement of the two lower sliders against the real file.
with gr.Blocks() as tts_gui:
    gr.Markdown(description_text)  # "## Multilingual TTS for Sámi languages (+ Finnish and Estonian)")

    with gr.Row():
        # Wider column: text, language/voice selection and tuning sliders.
        with gr.Column(scale=2):
            text_input = gr.Textbox(label="Text", value=default_prompts["North Sámi"])
            language_dd = gr.Dropdown(list(languages), label="Language", value="North Sámi")
            speaker_dd = gr.Dropdown(list(speakers), label="Voice", value="ms(sme)")

            with gr.Row():
                l_weight_slider = gr.Slider(
                    minimum=0.5, maximum=1.5, step=0.05, value=1, label="Language Weight"
                )
                s_weight_slider = gr.Slider(
                    minimum=0.5, maximum=1.5, step=0.05, value=1, label="Speaker Weight"
                )

            pace_slider = gr.Slider(
                minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speech Rate"
            )
            postfilter_slider = gr.Slider(
                minimum=0., maximum=2, step=0.05, value=1.0, label="Post-processing"
            )

        # Narrower column: the trigger button and the audio player.
        with gr.Column(scale=1):
            speak_button = gr.Button("Speak", variant="primary")
            audio_output = gr.Audio(label="Output")

    # Selecting a language replaces the textbox content with its default prompt.
    language_dd.change(fn=update_text_prompt, inputs=[language_dd], outputs=[text_input])

    # The input order must match the parameter order of speak().
    synthesis_inputs = [
        text_input,
        language_dd,
        speaker_dd,
        l_weight_slider,
        s_weight_slider,
        pace_slider,
        postfilter_slider,
    ]
    speak_button.click(fn=speak, inputs=synthesis_inputs, outputs=[audio_output])


if __name__ == "__main__":
    tts_gui.launch(share=public)
syn_hifigan.py CHANGED
@@ -15,17 +15,17 @@ from scipy.io.wavfile import write
15
  from torch.nn.utils.rnn import pad_sequence
16
  #import style_controller
17
  from common.utils import load_wav_to_torch
18
-
19
 
20
  from common import utils, layers
21
 
22
  from common.text.text_processing import TextProcessing
23
-
24
 
25
  import os
26
  #os.environ["CUDA_VISIBLE_DEVICES"]=""
27
- #device = "cuda:0"
28
- device = "cpu"
29
 
30
  vocoder = "hifigan"
31
  SHARPEN = True
@@ -53,12 +53,12 @@ def parse_args(parser):
53
  parser.add_argument('--cudnn-benchmark', action='store_true',
54
  help='Enable cudnn benchmark mode')
55
 
56
- #parser.add_argument('--fastpitch', type=str, default='output_smj_sander/FastPitch_checkpoint_660.pt',
57
- #help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
58
-
59
- parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt',
60
- help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
61
-
62
  parser.add_argument('-d', '--denoising-strength', default=0.01, type=float,
63
  help='WaveGlow denoising')
64
  parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
@@ -83,14 +83,14 @@ def parse_args(parser):
83
  text_processing.add_argument('--text-cleaners', nargs='*',
84
  default=['basic_cleaners'], type=str,
85
  help='Type of text cleaners for input text')
86
- text_processing.add_argument('--symbol-set', type=str, default='all_sami', #################
87
  help='Define symbol set for input text')
88
 
89
  cond = parser.add_argument_group('conditioning on additional attributes')
90
 
91
- cond.add_argument('--n-speakers', type=int, default=10,
92
  help='Number of speakers in the model.')
93
- cond.add_argument('--n-languages', type=int, default=3,
94
  help='Number of languages in the model.')
95
 
96
  return parser
@@ -192,7 +192,7 @@ class Synthesizer:
192
  self.vocoder, voc_train_setup= self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model)
193
  self.denoiser = Denoiser(self.vocoder,device=device) #, win_length=self.args.win_length).to(device)
194
  self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0)
195
-
196
 
197
 
198
  def unsharp_mask(self, img, radius=1, amount=1):
@@ -200,12 +200,31 @@ class Synthesizer:
200
  sharpened = img + amount * ( img - blurred)
201
  return sharpened
202
 
203
- def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1):
204
 
 
 
 
 
 
 
 
 
 
 
 
205
  text = self.tp.encode_text(text)
206
- #text = [9]+self.tp.encode_text(text)+[9]
 
 
 
 
 
 
 
 
207
  text = torch.LongTensor([text]).to(device)
208
- #probs = surprisals
209
  for p in [0]:
210
 
211
  with torch.no_grad():
@@ -216,8 +235,8 @@ class Synthesizer:
216
 
217
  mel_np = mel.float().data.cpu().numpy()[0]
218
  tgt_min = -11
219
- tgt_max = 1.25
220
- #print(np.min(mel_np), np.max(mel_np))
221
  mel_np = self.unsharp_mask(mel_np, radius = 0.5, amount=0.5)
222
  mel_np = self.unsharp_mask(mel_np, radius = 3, amount=.05)
223
  # mel_np = self.unsharp_mask(mel_np, radius = 7, amount=0.05)
@@ -239,7 +258,7 @@ class Synthesizer:
239
  sharpened[i, :]+=(i-40)*0.01 #0.01 ta
240
  mel[0] = torch.from_numpy(sharpened).float().to(device)
241
 
242
- """
243
  with torch.no_grad():
244
 
245
  y_g_hat = self.vocoder(mel).float() ###########
@@ -252,7 +271,7 @@ class Synthesizer:
252
 
253
  write(output_file+".wav", 22050, audio)
254
 
255
- os.system("play -q "+output_file+".wav")
256
  return audio
257
 
258
 
@@ -280,8 +299,8 @@ if __name__ == '__main__':
280
 
281
  text = input(">")
282
  text1 = text.split(" ")
283
- syn.speak(text, output_file="/tmp/tmp.wav", spkr=6, lang=1)
284
- syn.speak(text, output_file="/tmp/tmp.wav", spkr=7, lang=1)
285
  continue
286
  for s in range(1,10):
287
  for l in range(3): ##
 
15
  from torch.nn.utils.rnn import pad_sequence
16
  #import style_controller
17
  from common.utils import load_wav_to_torch
18
+ from langid.langid import WordLid
19
 
20
  from common import utils, layers
21
 
22
  from common.text.text_processing import TextProcessing
23
+ from collections import Counter
24
 
25
  import os
26
  #os.environ["CUDA_VISIBLE_DEVICES"]=""
27
+ device = "cuda:0"
28
+ #device = "cpu"
29
 
30
  vocoder = "hifigan"
31
  SHARPEN = True
 
53
  parser.add_argument('--cudnn-benchmark', action='store_true',
54
  help='Enable cudnn benchmark mode')
55
 
56
+ #parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt',
57
+ # help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
58
+ #parser.add_argument('--fastpitch', type=str, default='output_uralic/FastPitch_checkpoint_200.pt',
59
+ # help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
60
+ parser.add_argument('--fastpitch', type=str, default='output_6lang/FastPitch_checkpoint_50.pt',
61
+ help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
62
  parser.add_argument('-d', '--denoising-strength', default=0.01, type=float,
63
  help='WaveGlow denoising')
64
  parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
 
83
  text_processing.add_argument('--text-cleaners', nargs='*',
84
  default=['basic_cleaners'], type=str,
85
  help='Type of text cleaners for input text')
86
+ text_processing.add_argument('--symbol-set', type=str, default='uralic', #'all_sami', #################
87
  help='Define symbol set for input text')
88
 
89
  cond = parser.add_argument_group('conditioning on additional attributes')
90
 
91
+ cond.add_argument('--n-speakers', type=int, default=30, #10
92
  help='Number of speakers in the model.')
93
+ cond.add_argument('--n-languages', type=int, default=6, #3
94
  help='Number of languages in the model.')
95
 
96
  return parser
 
192
  self.vocoder, voc_train_setup= self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model)
193
  self.denoiser = Denoiser(self.vocoder,device=device) #, win_length=self.args.win_length).to(device)
194
  self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0)
195
+ self.lid = WordLid("langid/lang_id_model_q.bin")
196
 
197
 
198
  def unsharp_mask(self, img, radius=1, amount=1):
 
200
  sharpened = img + amount * ( img - blurred)
201
  return sharpened
202
 
203
+ def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1, guess_lang=True):
204
 
205
+ text = " "+text+" "
206
+
207
+ if guess_lang:
208
+ lang = self.lid.get_lang_array(text)
209
+ main_lang = Counter(lang).most_common(1)[0][0]
210
+
211
+ lang = torch.tensor(lang).to(device)
212
+ lang_weight = torch.zeros(len(lang))
213
+ lang_weight[:] = l_weight
214
+ lang_weight[lang!=main_lang] = 0.5*l_weight
215
+
216
  text = self.tp.encode_text(text)
217
+
218
+ if guess_lang == False:
219
+ lang = torch.tensor(lang).to(device)
220
+ else:
221
+ if len(text) != len(lang):
222
+ print("text length not equal to language list length!")
223
+ lang = lang[0]
224
+ l_weight = l_weight[0]
225
+
226
  text = torch.LongTensor([text]).to(device)
227
+
228
  for p in [0]:
229
 
230
  with torch.no_grad():
 
235
 
236
  mel_np = mel.float().data.cpu().numpy()[0]
237
  tgt_min = -11
238
+ tgt_max = 1.5
239
+ #print(np.min(mel_np) , np.max(mel_np))
240
  mel_np = self.unsharp_mask(mel_np, radius = 0.5, amount=0.5)
241
  mel_np = self.unsharp_mask(mel_np, radius = 3, amount=.05)
242
  # mel_np = self.unsharp_mask(mel_np, radius = 7, amount=0.05)
 
258
  sharpened[i, :]+=(i-40)*0.01 #0.01 ta
259
  mel[0] = torch.from_numpy(sharpened).float().to(device)
260
 
261
+ """
262
  with torch.no_grad():
263
 
264
  y_g_hat = self.vocoder(mel).float() ###########
 
271
 
272
  write(output_file+".wav", 22050, audio)
273
 
274
+ #os.system("play -q "+output_file+".wav")
275
  return audio
276
 
277
 
 
299
 
300
  text = input(">")
301
  text1 = text.split(" ")
302
+ syn.speak(text, output_file="/tmp/tmp.wav", spkr=14, lang=4)
303
+ syn.speak(text, output_file="/tmp/tmp.wav", spkr=14, lang=4, guess_lang=False)
304
  continue
305
  for s in range(1,10):
306
  for l in range(3): ##