Spaces:

robinhad
/

ukrainian-tts

Running

App Files Files Community

Yurii Paniv committed on Nov 6, 2021

Commit

49fc4a4

1 Parent(s): f826887

Add support for vocoder

Browse files

Files changed (3) hide show

README.md +2 -0
app.py +26 -9
vocoder_config.json +185 -0

README.md CHANGED Viewed

@@ -13,6 +13,8 @@ Ukrainian TTS (text-to-speech) using Coqui TTS.
 Trained on [M-AILABS Ukrainian dataset](https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/).
 # Example
 https://user-images.githubusercontent.com/5759207/139459556-35aa077b-0425-421f-a8d3-4c503315008d.mp4

 Trained on [M-AILABS Ukrainian dataset](https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/).
+# Support
+If you like my work, please support it: [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)
 # Example
 https://user-images.githubusercontent.com/5759207/139459556-35aa077b-0425-421f-a8d3-4c503315008d.mp4

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import numpy as np
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 MODEL_NAMES = [
     "uk/mai/glow-tts"
@@ -14,16 +16,31 @@ MODELS = {}
 manager = ModelManager()
 for MODEL_NAME in MODEL_NAMES:
     print(f"downloading {MODEL_NAME}")
     model_path, config_path, model_item = manager.download_model(
         f"tts_models/{MODEL_NAME}")
     vocoder_name: Optional[str] = model_item["default_vocoder"]
-    vocoder_path = None
-    vocoder_config_path = None
-    if vocoder_name is not None:
-        vocoder_path, vocoder_config_path, _ = manager.download_model(
-            vocoder_name)
     synthesizer = Synthesizer(
         model_path, config_path, None, vocoder_path, vocoder_config_path,
@@ -52,14 +69,14 @@ iface = gr.Interface(
             default="Привіт, як твої справи?",
         ),
         gr.inputs.Radio(
-            label="Pick a TTS Model",
             choices=MODEL_NAMES,
         ),
     ],
     outputs=gr.outputs.Audio(label="Output"),
-    title="🐸💬 - Coqui TTS",
     theme="huggingface",
-    description="🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production",
-    article="more info at https://github.com/coqui-ai/TTS",
 )
 iface.launch()

 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
+import requests
+from os.path import exists
 MODEL_NAMES = [
     "uk/mai/glow-tts"
 manager = ModelManager()
+def download(url, file_name):
+    if not exists(file_name):
+        print(f"Downloading {file_name}")
+        r = requests.get(url, allow_redirects=True)
+        with open(file_name, 'wb') as file:
+            file.write(r.content)
+    else:
+        print(f"Found {file_name}. Skipping download...")
 for MODEL_NAME in MODEL_NAMES:
     print(f"downloading {MODEL_NAME}")
     model_path, config_path, model_item = manager.download_model(
         f"tts_models/{MODEL_NAME}")
     vocoder_name: Optional[str] = model_item["default_vocoder"]
+    release_number = "0.0.1"
+    vocoder_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder.pth.tar"
+    vocoder_config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder_config.json"
+    vocoder_path = "vocoder.pth.tar"
+    vocoder_config_path = "vocoder_config.json"
+    download(vocoder_link, vocoder_path)
+    download(vocoder_config_link, vocoder_config_path)
     synthesizer = Synthesizer(
         model_path, config_path, None, vocoder_path, vocoder_config_path,
             default="Привіт, як твої справи?",
         ),
         gr.inputs.Radio(
+            label="Виберіть TTS модель",
             choices=MODEL_NAMES,
         ),
     ],
     outputs=gr.outputs.Audio(label="Output"),
+    title="🐸💬🇺🇦 - Coqui TTS",
     theme="huggingface",
+    description="Україномовний🇺🇦 TTS за допомогою Coqui TTS",
+    article="Якщо вам подобається, підтримайте за посиланням: [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)",
 )
 iface.launch()

vocoder_config.json ADDED Viewed

	@@ -0,0 +1,185 @@

+{
+    "model": "multiband_melgan",
+    "run_name": "coqui_tts",
+    "run_description": "",
+    "epochs": 2000,
+    "batch_size": 32,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "scheduler_after_epoch": false,
+    "run_eval": true,
+    "test_delay_epochs": 5,
+    "print_eval": false,
+    "dashboard_logger": "tensorboard",
+    "print_step": 25,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "project_name": null,
+    "log_model_step": null,
+    "wandb_entity": null,
+    "save_step": 10000,
+    "checkpoint": true,
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "num_loader_workers": 12,
+    "num_eval_loader_workers": 12,
+    "use_noise_augment": true,
+    "output_path": "/home/robinhad/Projects/TTS/recipes/ljspeech/multiband_melgan",
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_shift_ms": null,
+        "frame_length_ms": null,
+        "stft_pad_mode": "reflect",
+        "sample_rate": 16000,
+        "resample": false,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_sound_norm": false,
+        "log_func": "np.log10",
+        "do_trim_silence": true,
+        "trim_db": 45,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null,
+        "spec_gain": 20,
+        "do_amp_to_db_linear": true,
+        "do_amp_to_db_mel": true,
+        "signal_norm": true,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": null
+    },
+    "eval_split_size": 10,
+    "data_path": "../Data/uk_UK/by_book/female",
+    "feature_path": null,
+    "seq_len": 8192,
+    "pad_short": 2000,
+    "conv_pad": 0,
+    "use_cache": true,
+    "wd": 0.0,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "weight_decay": 0.0
+    },
+    "use_stft_loss": true,
+    "use_subband_stft_loss": true,
+    "use_mse_gan_loss": true,
+    "use_hinge_gan_loss": false,
+    "use_feat_match_loss": false,
+    "use_l1_spec_loss": false,
+    "stft_loss_weight": 0.5,
+    "subband_stft_loss_weight": 0,
+    "mse_G_loss_weight": 2.5,
+    "hinge_G_loss_weight": 0,
+    "feat_match_loss_weight": 108,
+    "l1_spec_loss_weight": 0,
+    "stft_loss_params": {
+        "n_ffts": [
+            1024,
+            2048,
+            512
+        ],
+        "hop_lengths": [
+            120,
+            240,
+            50
+        ],
+        "win_lengths": [
+            600,
+            1200,
+            240
+        ]
+    },
+    "l1_spec_loss_params": {
+        "use_mel": true,
+        "sample_rate": 16000,
+        "n_fft": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "target_loss": "loss_0",
+    "grad_clip": [
+        5,
+        5
+    ],
+    "lr_gen": 0.0001,
+    "lr_disc": 0.0001,
+    "lr_scheduler_gen": "MultiStepLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.5,
+        "milestones": [
+            100000,
+            200000,
+            300000,
+            400000,
+            500000,
+            600000
+        ]
+    },
+    "lr_scheduler_disc": "MultiStepLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.5,
+        "milestones": [
+            100000,
+            200000,
+            300000,
+            400000,
+            500000,
+            600000
+        ]
+    },
+    "use_pqmf": true,
+    "diff_samples_for_G_and_D": false,
+    "discriminator_model": "melgan_multiscale_discriminator",
+    "discriminator_model_params": {
+        "base_channels": 16,
+        "max_channels": 512,
+        "downsample_factors": [
+            4,
+            4,
+            4
+        ]
+    },
+    "generator_model": "multiband_melgan_generator",
+    "generator_model_params": {
+        "upsample_factors": [
+            8,
+            4,
+            2
+        ],
+        "num_res_blocks": 4
+    },
+    "steps_to_start_discriminator": 200000,
+    "subband_stft_loss_params": {
+        "n_ffts": [
+            384,
+            683,
+            171
+        ],
+        "hop_lengths": [
+            30,
+            60,
+            10
+        ],
+        "win_lengths": [
+            150,
+            300,
+            60
+        ]
+    }
+}