Upload 2 files
- gradio_gui_6lang_blocks.py +180 -0
- syn_hifigan.py +42 -23
gradio_gui_6lang_blocks.py
ADDED
@@ -0,0 +1,180 @@
+import gradio as gr
+import os
+import tempfile
+
+import syn_hifigan as syn
+#import syn_vgan as syn
+#import syn_k_univnet_multi as syn
+
+description_text = """
+# Multilingual TTS for Sámi languages (+ Finnish and Estonian)
+
+Welcome! This is a demonstration of a multilingual, multi-speaker Text-to-Speech (TTS) model.
+The demo is related to research on TTS for low-resource languages and the effect of augmenting
+the training data with areally close languages.
+
+Disclaimers:
+For convenience, the demo uses a pretrained HiFi-GAN vocoder, which does not work well with male voices.
+English does not work well due to the small dataset and orthographic transcriptions.
+Use the demo just for testing, not for frequent or commercial use.
+"""
+
+speakers = {
+    "aj(sma)": 2,
+    "am(sme)": 3,
+    "ms(sme)": 4,
+    "ln(sme)": 5,
+    "mu(smj)": 7,
+    "sa(smj)": 8,
+    "bi(smj)": 10,
+    "css(fin)": 11,
+    "ti(fin)": 13,
+    "ta(fin)": 14,
+    "liivika(est)": 15,
+    "indek(est)": 16,
+    "kylli(est)": 17,
+    "andreas(est)": 18,
+    "peeter(est)": 19,
+    "kersti(est)": 20,
+    "M6670(eng)": 21,
+    "M6097(eng)": 22,
+    "F92(eng)": 23,
+    "F9136(eng)": 24
+}
+
+# Speaker mean pitches in Hz (currently unused; see the commented-out
+# mean_pitch argument in speak() below).
+mean_pitch = {
+    "aj0": 130,
+    "aj1": 130,
+    "am": 120,
+    "ms": 120,
+    "ln": 120,
+    "lo": 120,
+    "mu": 120,
+    "sa": 120,
+    "kd": 120,
+    "bi": 120,
+    "ti": 130,
+    "ta": 115,
+    "liivika": 120,
+    "indek": 90,
+    "kylli": 140,
+    "andreas": 100,
+    "peeter": 80,
+    "kersti": 120
+}
+
+# Language IDs; -1 means "guess" via word-level language identification.
+languages = {
+    "guess": -1,
+    "South Sámi": 0,
+    "North Sámi": 1,
+    "Lule Sámi": 2,
+    "Finnish": 3,
+    "Estonian": 4,
+    "English": 5
+}
+
+# Default prompts per language.
+default_prompts = {
+    "guess": "Sáhtta go esso-burgera luohti, Koskenkorva dahje carpool karajoiki gádjut árgabeaivveluođi?",
+    "North Sámi": "Riektačállinreaidduid lassin Divvun-joavkkus ovdanit dál maiddái hállanteknologiijareaidduid.",
+    "South Sámi": "Buerie aerede gaajhkesh dovnesh jïh buerie båeteme dan bæjhkoehtæmman.",
+    "Lule Sámi": "Sáme hållamsyntiesaj baktu máhttá adnegoahtet sáme gielajt ådå aktijvuodajn.",
+    "Finnish": "Joka kuuseen kurkottaa, se katajaan kapsahtaa.",
+    "Estonian": "Aprilli lõpp pani aiapidajate kannatuse jälle proovile – pärast mõnepäevast sooja saabub ootamatu külmalaine.",
+    "English": "This obscure language is not supported by this model."
+}
+
+public = False
+
+tempdir = tempfile.gettempdir()
+
+tts = syn.Synthesizer()
+
+
+def speak(text, language, speaker, l_weight, s_weight, pace, postfilter):
+    # Text frontend not implemented yet; normalize ellipses by hand.
+    text = text.replace("...", "…")
+
+    # "guess" enables word-level language identification in the synthesizer.
+    use_lid = (language == "guess")
+
+    audio = tts.speak(text, output_file=f'{tempdir}/tmp', lang=languages[language],
+                      spkr=speakers[speaker], l_weight=l_weight, s_weight=s_weight,
+                      pace=pace, clarity=postfilter, guess_lang=use_lid)  # , mean_pitch=mean_pitch[speaker])
+
+    # When running locally, also play the audio with SoX.
+    if not public:
+        try:
+            os.system("play " + tempdir + "/tmp.wav &")
+        except OSError:
+            pass
+
+    return (22050, audio)
+
+
+def update_text_prompt(language):
+    """
+    Updates the text in the textbox to the default prompt for the selected language.
+    """
+    prompt = default_prompts.get(language, "")  # the prompt, or an empty string if not found
+    return gr.Textbox(value=prompt)
+
+
+with gr.Blocks() as tts_gui:
+    gr.Markdown(description_text)
+    with gr.Row():
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(label="Text", value=default_prompts["North Sámi"])
+            language_dd = gr.Dropdown(list(languages.keys()), label="Language", value="North Sámi")
+            speaker_dd = gr.Dropdown(list(speakers.keys()), label="Voice", value="ms(sme)")
+
+            with gr.Row():
+                l_weight_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="Language Weight")
+                s_weight_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1, label="Speaker Weight")
+
+            pace_slider = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speech Rate")
+            postfilter_slider = gr.Slider(minimum=0., maximum=2, step=0.05, value=1.0, label="Post-processing")
+
+        with gr.Column(scale=1):
+            speak_button = gr.Button("Speak", variant="primary")
+            audio_output = gr.Audio(label="Output")
+
+    # Update the text box whenever the language selection changes.
+    language_dd.change(
+        fn=update_text_prompt,
+        inputs=[language_dd],
+        outputs=[text_input]
+    )
+
+    speak_button.click(
+        fn=speak,
+        inputs=[
+            text_input,
+            language_dd,
+            speaker_dd,
+            l_weight_slider,
+            s_weight_slider,
+            pace_slider,
+            postfilter_slider
+        ],
+        outputs=[audio_output]
+    )
+
+
+if __name__ == "__main__":
+    tts_gui.launch(share=public)
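Note on the callback contract: speak() returns a (sample_rate, waveform) tuple, which gr.Audio accepts directly as an output value. A minimal self-contained sketch of that contract, with a synthetic sine wave standing in for the synthesizer (speak_stub is hypothetical; syn_hifigan is not needed here):

import numpy as np
import gradio as gr

SR = 22050  # matches the model's sampling rate

def speak_stub(text):
    # Stand-in for tts.speak(): a one-second 440 Hz tone as float32.
    t = np.linspace(0, 1.0, SR, endpoint=False)
    audio = 0.1 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
    return (SR, audio)  # gr.Audio accepts a (sample_rate, ndarray) tuple

with gr.Blocks() as demo:
    txt = gr.Textbox(label="Text")
    out = gr.Audio(label="Output")
    gr.Button("Speak").click(fn=speak_stub, inputs=[txt], outputs=[out])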
syn_hifigan.py
CHANGED
@@ -15,17 +15,17 @@ from scipy.io.wavfile import write
 from torch.nn.utils.rnn import pad_sequence
 #import style_controller
 from common.utils import load_wav_to_torch
-
+from langid.langid import WordLid

 from common import utils, layers

 from common.text.text_processing import TextProcessing
-
+from collections import Counter

 import os
 #os.environ["CUDA_VISIBLE_DEVICES"]=""
-
-device = "cpu"
+device = "cuda:0"
+#device = "cpu"

 vocoder = "hifigan"
 SHARPEN = True
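Hard-coding device = "cuda:0" will raise at the first .to(device) call on CPU-only hosts (e.g. free Spaces hardware). A guarded alternative, offered as a suggestion rather than as part of the commit:

import torch

# Prefer the first GPU, but fall back to CPU when CUDA is unavailable.
device = "cuda:0" if torch.cuda.is_available() else "cpu"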
@@ -53,12 +53,12 @@ def parse_args(parser):
 parser.add_argument('--cudnn-benchmark', action='store_true',
                     help='Enable cudnn benchmark mode')

-#parser.add_argument('--fastpitch', type=str, default='
-
-
-
-
-
+#parser.add_argument('--fastpitch', type=str, default='output_multilang/FastPitch_checkpoint_200.pt',
+#                    help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
+#parser.add_argument('--fastpitch', type=str, default='output_uralic/FastPitch_checkpoint_200.pt',
+#                    help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
+parser.add_argument('--fastpitch', type=str, default='output_6lang/FastPitch_checkpoint_50.pt',
+                    help='Full path to the generator checkpoint file (skip to use ground truth mels)') #########
 parser.add_argument('-d', '--denoising-strength', default=0.01, type=float,
                     help='WaveGlow denoising')
 parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
@@ -83,14 +83,14 @@ def parse_args(parser):
 text_processing.add_argument('--text-cleaners', nargs='*',
                              default=['basic_cleaners'], type=str,
                              help='Type of text cleaners for input text')
-text_processing.add_argument('--symbol-set', type=str, default='all_sami', #################
+text_processing.add_argument('--symbol-set', type=str, default='uralic', #'all_sami', #################
                              help='Define symbol set for input text')

 cond = parser.add_argument_group('conditioning on additional attributes')

-cond.add_argument('--n-speakers', type=int, default=10,
+cond.add_argument('--n-speakers', type=int, default=30, #10
                   help='Number of speakers in the model.')
-cond.add_argument('--n-languages', type=int, default=3,
+cond.add_argument('--n-languages', type=int, default=6, #3
                   help='Number of languages in the model.')

 return parser
@@ -192,7 +192,7 @@ class Synthesizer:
 self.vocoder, voc_train_setup = self._load_pyt_or_ts_model('HiFi-GAN', self.hifigan_model)
 self.denoiser = Denoiser(self.vocoder, device=device)  #, win_length=self.args.win_length).to(device)
 self.tp = TextProcessing(self.args.symbol_set, self.args.text_cleaners, p_arpabet=0.0)
-
+self.lid = WordLid("langid/lang_id_model_q.bin")


 def unsharp_mask(self, img, radius=1, amount=1):
@@ -200,12 +200,31 @@ class Synthesizer:
     sharpened = img + amount * (img - blurred)
     return sharpened

-def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1):
+def speak(self, text, output_file="/tmp/tmp", spkr=0, lang=0, l_weight=1, s_weight=1, pace=0.95, clarity=1, guess_lang=True):

+    text = " " + text + " "
+
+    if guess_lang:
+        # Word-level language ID: one language label per input token.
+        lang = self.lid.get_lang_array(text)
+        main_lang = Counter(lang).most_common(1)[0][0]
+
+        lang = torch.tensor(lang).to(device)
+        # Full weight for the majority language, half weight for the rest.
+        lang_weight = torch.zeros(len(lang))
+        lang_weight[:] = l_weight
+        lang_weight[lang != main_lang] = 0.5 * l_weight
+
     text = self.tp.encode_text(text)

+    if not guess_lang:
+        lang = torch.tensor(lang).to(device)
+    else:
+        if len(text) != len(lang):
+            print("text length not equal to language list length!")
+            lang = lang[0]
+            l_weight = l_weight[0]
+
     text = torch.LongTensor([text]).to(device)

     for p in [0]:

         with torch.no_grad():
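The guess_lang path above takes a majority vote over the per-token language predictions and halves the language-embedding weight of tokens that disagree with it. The core of that scheme, isolated as a runnable sketch (the ID sequence here is made up; WordLid itself ships with the project's langid package and is not reproduced):

from collections import Counter
import torch

l_weight = 1.0
lang = [3, 3, 3, 4, 3]  # hypothetical per-token language IDs: mostly 3, one 4

main_lang = Counter(lang).most_common(1)[0][0]  # majority vote -> 3

lang_t = torch.tensor(lang)
lang_weight = torch.full((len(lang),), l_weight)
lang_weight[lang_t != main_lang] = 0.5 * l_weight  # halve minority tokens

print(main_lang)    # 3
print(lang_weight)  # tensor([1.0000, 1.0000, 1.0000, 0.5000, 1.0000])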
@@ -216,8 +235,8 @@ class Synthesizer:
 mel_np = mel.float().data.cpu().numpy()[0]
 tgt_min = -11
-tgt_max = 1.
+tgt_max = 1.5
 #print(np.min(mel_np), np.max(mel_np))
 mel_np = self.unsharp_mask(mel_np, radius=0.5, amount=0.5)
 mel_np = self.unsharp_mask(mel_np, radius=3, amount=.05)
 # mel_np = self.unsharp_mask(mel_np, radius=7, amount=0.05)
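unsharp_mask sharpens the predicted mel spectrogram the way image editors do: subtract a blurred copy and add the difference back, scaled by amount (sharpened = img + amount * (img - blurred), as in the context lines above). A minimal NumPy/SciPy rendering of the same formula; gaussian_filter is a stand-in, since the diff does not show which blur the class uses internally:

import numpy as np
from scipy.ndimage import gaussian_filter

def unsharp_mask(img, radius=1.0, amount=1.0):
    # Classic unsharp masking: emphasize detail by subtracting a blur.
    blurred = gaussian_filter(img, sigma=radius)
    return img + amount * (img - blurred)

mel = np.random.randn(80, 200)                   # stand-in 80-bin mel spectrogram
mel = unsharp_mask(mel, radius=0.5, amount=0.5)  # fine detail, as in speak()
mel = unsharp_mask(mel, radius=3, amount=0.05)   # broader spectral contours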
@@ -239,7 +258,7 @@ class Synthesizer:
         sharpened[i, :] += (i - 40) * 0.01
     mel[0] = torch.from_numpy(sharpened).float().to(device)

-
+    """
     with torch.no_grad():

         y_g_hat = self.vocoder(mel).float() ###########
@@ -252,7 +271,7 @@ class Synthesizer:
     write(output_file + ".wav", 22050, audio)

-    os.system("play -q " + output_file + ".wav")
+    #os.system("play -q " + output_file + ".wav")
     return audio

@@ -280,8 +299,8 @@ if __name__ == '__main__':
     text = input(">")
     text1 = text.split(" ")
-    syn.speak(text, output_file="/tmp/tmp.wav", spkr=
-    syn.speak(text, output_file="/tmp/tmp.wav", spkr=
+    syn.speak(text, output_file="/tmp/tmp.wav", spkr=14, lang=4)
+    syn.speak(text, output_file="/tmp/tmp.wav", spkr=14, lang=4, guess_lang=False)
     continue
     for s in range(1, 10):
         for l in range(3): ##