ahnafsamin committed
Commit 4aa9e1f · Parent: 242f5b9

Update app.py

Files changed (1):
  app.py +80 -87
app.py CHANGED
@@ -1,109 +1,102 @@
  import gradio as gr
  import time
- import urllib.request
- from pathlib import Path
- import os
  import torch
- import numpy
  import scipy.io.wavfile
  from espnet2.bin.tts_inference import Text2Speech
  from espnet2.utils.types import str_or_none

- # def load_model(model_tag, vocoder_tag):
- #     from espnet_model_zoo.downloader import ModelDownloader
-
- #     kwargs = {}
-
- #     # Model
- #     d = ModelDownloader()
- #     kwargs = d.download_and_unpack(model_tag)
-
- #     # Vocoder
- #     download_dir = Path(os.path.expanduser("~/.cache/parallel_wavegan"))
- #     vocoder_dir = download_dir / vocoder_tag
- #     os.makedirs(vocoder_dir, exist_ok=True)
-
- #     kwargs["vocoder_config"] = vocoder_dir / "config.yml"
- #     if not kwargs["vocoder_config"].exists():
- #         urllib.request.urlretrieve(f"https://huggingface.co/{vocoder_tag}/resolve/main/config.yml", kwargs["vocoder_config"])
-
- #     kwargs["vocoder_file"] = vocoder_dir / "checkpoint-50000steps.pkl"
- #     if not kwargs["vocoder_file"].exists():
- #         urllib.request.urlretrieve(f"https://huggingface.co/{vocoder_tag}/resolve/main/checkpoint-50000steps.pkl", kwargs["vocoder_file"])
-
- #     return Text2Speech(
- #         **kwargs,
- #         device="cpu",
- #         threshold=0.5,
- #         minlenratio=0.0,
- #         maxlenratio=10.0,
- #         use_att_constraint=True,
- #         backward_window=1,
- #         forward_window=4,
- #     )

- # gos_text2speech = load_model('https://huggingface.co/wietsedv/tacotron2-gronings/resolve/main/tts_ljspeech_finetune_tacotron2.v5_train.loss.ave.zip', 'wietsedv/parallelwavegan-gronings')
- # nld_text2speech = load_model('https://huggingface.co/wietsedv/tacotron2-dutch/resolve/main/tts_ljspeech_finetune_tacotron2.v5_train.loss.ave.zip', 'wietsedv/parallelwavegan-dutch')

- gos_text2speech = Text2Speech.from_pretrained(
-     model_tag="https://huggingface.co/wietsedv/tacotron2-gronings/resolve/main/tts_ljspeech_finetune_tacotron2.v5_train.loss.ave.zip",
-     vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v3",
-     device="cpu",
-     threshold=0.5,
-     minlenratio=0.0,
-     maxlenratio=10.0,
-     use_att_constraint=True,
-     backward_window=1,
-     forward_window=4,
- )
- nld_text2speech = Text2Speech.from_pretrained(
-     model_tag="https://huggingface.co/wietsedv/tacotron2-dutch/resolve/main/tts_ljspeech_finetune_tacotron2.v5_train.loss.ave.zip",
-     vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v3",
-     device="cpu",
-     threshold=0.5,
-     minlenratio=0.0,
-     maxlenratio=10.0,
-     use_att_constraint=True,
-     backward_window=1,
-     forward_window=4,
  )
- # eng_text2speech = Text2Speech.from_pretrained(
- #     model_tag="kan-bayashi/ljspeech_tacotron2",
- #     vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v3",
- #     device="cpu",
- #     threshold=0.5,
- #     minlenratio=0.0,
- #     maxlenratio=10.0,
- #     use_att_constraint=True,
- #     backward_window=1,
- #     forward_window=4,
- # )

  def inference(text,lang):
      with torch.no_grad():
-         if lang == "gronings":
-             wav = gos_text2speech(text)["wav"]
-             scipy.io.wavfile.write("out.wav", gos_text2speech.fs, wav.view(-1).cpu().numpy())
-         if lang == "dutch":
-             wav = nld_text2speech(text)["wav"]
-             scipy.io.wavfile.write("out.wav", nld_text2speech.fs, wav.view(-1).cpu().numpy())
-         # if lang == "english":
-         #     wav = eng_text2speech(text)["wav"]
-         #     scipy.io.wavfile.write("out.wav", eng_text2speech.fs, wav.view(-1).cpu().numpy())
-
-     return "out.wav", "out.wav"

- title = "GroTTS"
- examples = [
-     ['Ze gingen mit klas noar waddendiek, over en deur bragel lopen.', 'gronings']
- ]

  gr.Interface(
-     inference,
-     [gr.inputs.Textbox(label="input text", lines=3), gr.inputs.Radio(choices=["gronings", "dutch"], type="value", default="gronings", label="language")],
-     [gr.outputs.Audio(type="file", label="Output"), gr.outputs.File()],
      title=title,
      examples=examples
- ).launch(enable_queue=True)
-
 
  import gradio as gr
  import time
  import torch
  import scipy.io.wavfile
  from espnet2.bin.tts_inference import Text2Speech
  from espnet2.utils.types import str_or_none

+ tagen = 'kan-bayashi/ljspeech_vits'
+ vocoder_tagen = "none"

+ text2speechen = Text2Speech.from_pretrained(
+     model_tag=str_or_none(tagen),
+     vocoder_tag=str_or_none(vocoder_tagen),
+     device="cpu",
+     # Only for Tacotron 2 & Transformer
+     threshold=0.5,
+     # Only for Tacotron 2
+     minlenratio=0.0,
+     maxlenratio=10.0,
+     use_att_constraint=False,
+     backward_window=1,
+     forward_window=3,
+     # Only for FastSpeech & FastSpeech2 & VITS
+     speed_control_alpha=1.0,
+     # Only for VITS
+     noise_scale=0.333,
+     noise_scale_dur=0.333,
+ )

+ tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
+ vocoder_tagjp = 'none'

+ text2speechjp = Text2Speech.from_pretrained(
+     model_tag=str_or_none(tagjp),
+     vocoder_tag=str_or_none(vocoder_tagjp),
+     device="cpu",
+     # Only for Tacotron 2 & Transformer
+     threshold=0.5,
+     # Only for Tacotron 2
+     minlenratio=0.0,
+     maxlenratio=10.0,
+     use_att_constraint=False,
+     backward_window=1,
+     forward_window=3,
+     # Only for FastSpeech & FastSpeech2 & VITS
+     speed_control_alpha=1.0,
+     # Only for VITS
+     noise_scale=0.333,
+     noise_scale_dur=0.333,
+ )

+ tagch = 'kan-bayashi/csmsc_full_band_vits'
+ vocoder_tagch = "none"

+ text2speechch = Text2Speech.from_pretrained(
+     model_tag=str_or_none(tagch),
+     vocoder_tag=str_or_none(vocoder_tagch),
+     device="cpu",
+     # Only for Tacotron 2 & Transformer
+     threshold=0.5,
+     # Only for Tacotron 2
+     minlenratio=0.0,
+     maxlenratio=10.0,
+     use_att_constraint=False,
+     backward_window=1,
+     forward_window=3,
+     # Only for FastSpeech & FastSpeech2 & VITS
+     speed_control_alpha=1.0,
+     # Only for VITS
+     noise_scale=0.333,
+     noise_scale_dur=0.333,
  )

  def inference(text,lang):
      with torch.no_grad():
+         if lang == "english":
+             wav = text2speechen(text)["wav"]
+             scipy.io.wavfile.write("out.wav", text2speechen.fs, wav.view(-1).cpu().numpy())
+         if lang == "chinese":
+             wav = text2speechch(text)["wav"]
+             scipy.io.wavfile.write("out.wav", text2speechch.fs, wav.view(-1).cpu().numpy())
+         if lang == "japanese":
+             wav = text2speechjp(text)["wav"]
+             scipy.io.wavfile.write("out.wav", text2speechjp.fs, wav.view(-1).cpu().numpy())
+     return "out.wav"
+ title = "ESPnet2-TTS"
+ description = "Gradio demo for ESPnet2-TTS: Extending the Edge of TTS Research. To use it, simply enter your text, or click one of the examples to load them. Read more at the links below."
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2110.07840' target='_blank'>ESPnet2-TTS: Extending the Edge of TTS Research</a> | <a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

+ examples = [['This paper describes ESPnet2-TTS, an end-to-end text-to-speech (E2E-TTS) toolkit. ESPnet2-TTS extends our earlier version, ESPnet-TTS, by adding many new features, including: on-the-fly flexible pre-processing, joint training with neural vocoders, and state-of-the-art TTS models with extensions like full-band E2E text-to-waveform modeling, which simplify the training pipeline and further enhance TTS performance. The unified design of our recipes enables users to quickly reproduce state-of-the-art E2E-TTS results', "english"], ['レシピの統一された設計により、ユーザーは最先端のE2E-TTSの結果をすばやく再現できます。また、推論用の統合Pythonインターフェースで事前にトレーニングされたモデルを多数提供し、ユーザーがベースラインサンプルを生成してデモを構築するための迅速な手段を提供します。', "japanese"], ['对英语和日语语料库的实验评估表明,我们提供的模型合成了与真实情况相当的话语,达到了最先进的水平', "chinese"]]

  gr.Interface(
+     inference,
+     [gr.inputs.Textbox(label="input text", lines=10), gr.inputs.Radio(choices=["english", "chinese", "japanese"], type="value", default="english", label="language")],
+     gr.outputs.Audio(type="file", label="Output"),
      title=title,
+     description=description,
+     article=article,
+     enable_queue=True,
      examples=examples
+ ).launch(debug=True)
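
A quick way to sanity-check the rewritten inference() is to call it directly, bypassing the Gradio UI. The snippet below is a hypothetical smoke test, not part of the commit; it assumes espnet2, espnet_model_zoo, torch, and scipy are installed and that app.py has been executed up to (but not including) the gr.Interface(...).launch(...) call, so the three Text2Speech models are already loaded.

# Hypothetical smoke test: run in a session that has defined
# text2speechen / text2speechjp / text2speechch and inference().
import scipy.io.wavfile

path = inference("This is a test of the English VITS model.", "english")
fs, audio = scipy.io.wavfile.read(path)  # inference() writes and returns "out.wav"
print(f"{path}: {fs} Hz, {len(audio) / fs:.2f} seconds")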