cymic committed on
Commit
1d854ef
·
1 Parent(s): 44a7e09

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -57
app.py CHANGED
@@ -1,92 +1,102 @@
1
  import os
2
 
3
  os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
 
 
 
 
 
 
 
 
 
 
4
 
5
  import librosa
6
- import numpy as np
 
 
 
 
 
 
7
  import torch
8
- from torch import no_grad, LongTensor
 
 
 
9
  import commons
10
  import utils
11
- import gradio as gr
12
  from models import SynthesizerTrn
13
- from text import text_to_sequence
14
- from mel_processing import spectrogram_torch
 
 
15
 
 
16
 
17
def get_text(text):
    """Turn ``text`` into the ``LongTensor`` of symbol ids fed to the model.

    Reads the global ``hps`` for the symbol table (``hps.symbols``), the
    cleaner list (``hps.data.text_cleaners``) and the ``add_blank`` flag.

    Args:
        text: Text to convert; passed to ``text_to_sequence`` unchanged.

    Returns:
        A ``LongTensor`` built from the resulting id sequence.
    """
    symbol_ids = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Pass the ids through commons.intersperse with filler item 0
        # (presumably the blank symbol id -- confirm against the symbol table).
        symbol_ids = commons.intersperse(symbol_ids, 0)
    return LongTensor(symbol_ids)
 
23
 
24
 
25
def tts_fn(text, speaker_id):
    """Synthesize speech for ``text`` with the selected speaker.

    Uses the globals ``model`` and ``hps`` and the sibling ``get_text``.

    Args:
        text: Input text; inputs longer than 150 characters are rejected.
        speaker_id: Speaker index, forwarded to ``model.infer`` as ``sid``.

    Returns:
        ``(message, audio)`` where ``audio`` is ``None`` on rejection, or
        ``(hps.data.sampling_rate, waveform)`` with ``waveform`` a NumPy
        array on success.
    """
    if len(text) > 150:
        return "Error: Text is too long", None

    token_ids = get_text(text)
    with no_grad():
        batch = token_ids.unsqueeze(0)  # add a leading batch dimension
        batch_lengths = LongTensor([token_ids.size(0)])
        speaker = LongTensor([speaker_id])
        outputs = model.infer(
            batch,
            batch_lengths,
            sid=speaker,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )
        # Element [0][0, 0] of the infer result is the waveform that is
        # returned (presumably first output, first batch item, first channel).
        waveform = outputs[0][0, 0].data.cpu().float().numpy()
    return "Success", (hps.data.sampling_rate, waveform)
36
 
37
 
38
def vc_fn(original_speaker_id, target_speaker_id, input_audio):
    """Convert an uploaded recording from one speaker's voice to another's.

    Uses the globals ``model`` and ``hps``.

    Args:
        original_speaker_id: Speaker index of the recording's voice.
        target_speaker_id: Speaker index of the voice to convert to.
        input_audio: ``(sampling_rate, samples)`` pair, or ``None`` when
            nothing was uploaded. ``samples`` is a NumPy array whose first
            axis is time (a second axis, if present, is treated as channels).

    Returns:
        ``(message, audio)`` where ``audio`` is ``None`` on rejection, or
        ``(hps.data.sampling_rate, waveform)`` on success.
    """
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 30:
        return "Error: Audio is too long", None

    # Integer PCM is scaled by the dtype's maximum value. Floating-point input
    # has no integer range (np.iinfo raises ValueError on it), so it is only
    # cast -- assumed to already be in [-1, 1].
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        audio = audio.astype(np.float32)

    if len(audio.shape) > 1:
        # librosa.to_mono expects channels first, hence the transpose.
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != hps.data.sampling_rate:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)

    y = torch.FloatTensor(audio)
    y = y.unsqueeze(0)  # add a leading batch dimension
    spec = spectrogram_torch(y, hps.data.filter_length,
                             hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                             center=False)
    spec_lengths = LongTensor([spec.size(-1)])
    sid_src = LongTensor([original_speaker_id])
    sid_tgt = LongTensor([target_speaker_id])
    with no_grad():
        audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
            0, 0].data.cpu().float().numpy()
    return "Success", (hps.data.sampling_rate, audio)
62
 
63
 
if __name__ == '__main__':
    # Load hyper-parameters and the trained weights, then switch the model to
    # evaluation mode before serving requests.
    config_path = "saved_model/config.json"
    model_path = "saved_model/model.pth"
    hps = utils.get_hparams_from_file(config_path)
    model = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    utils.load_checkpoint(model_path, model, None)
    model.eval()

    app = gr.Blocks()

    with app:
        with gr.Tabs():
            with gr.TabItem("TTS"):
                with gr.Column():
                    tts_input1 = gr.TextArea(label="Text (150 words limitation)", value="こんにちは。")
                    tts_input2 = gr.Dropdown(label="Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
                    tts_submit = gr.Button("Generate", variant="primary")
                    tts_output1 = gr.Textbox(label="Output Message")
                    tts_output2 = gr.Audio(label="Output Audio")
            # The click handler below wires vc_fn to vc_* components, so they
            # must exist; without this tab the script stops with a NameError.
            with gr.TabItem("Voice Conversion"):
                with gr.Column():
                    vc_input1 = gr.Dropdown(label="Original Speaker", choices=hps.speakers, type="index",
                                            value=hps.speakers[0])
                    vc_input2 = gr.Dropdown(label="Target Speaker", choices=hps.speakers, type="index",
                                            value=hps.speakers[0])
                    vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
                    vc_submit = gr.Button("Convert", variant="primary")
                    vc_output1 = gr.Textbox(label="Output Message")
                    vc_output2 = gr.Audio(label="Output Audio")

        tts_submit.click(tts_fn, [tts_input1, tts_input2], [tts_output1, tts_output2])
        vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])

    app.launch()
 
1
  import os
2
 
# Environment bootstrap, executed at import time before the heavy imports
# below: build the monotonic_align extension in place, then install the
# runtime dependencies. Exit codes are ignored, exactly as with individual
# os.system calls. Each `>` redirection truncates log.log, so the file only
# holds the output of the last command that wrote to it.
_BOOTSTRAP_COMMANDS = (
    'cd monotonic_align && python setup.py build_ext --inplace && cd ..',
    'pip install pypinyin Cython==0.29.21 librosa==0.8.0 matplotlib==3.3.1 numpy==1.18.5 phonemizer==2.2.1 scipy==1.5.2 Unidecode==1.1.1 >log.log',
    'sudo apt-get install espeak -y >log.log',
    'pip install gdown >log.log',
    'pip install pyopenjtalk janome > log.log',
    'pip install cloud-tpu-client > log.log',
)
for _bootstrap_command in _BOOTSTRAP_COMMANDS:
    os.system(_bootstrap_command)
9
+
10
+ import logging
11
+
12
+ numba_logger = logging.getLogger('numba')
13
+ numba_logger.setLevel(logging.WARNING)
14
 
15
  import librosa
16
+
17
+ import matplotlib.pyplot as plt
18
+ import IPython.display as ipd
19
+
20
+ import os
21
+ import json
22
+ import math
23
  import torch
24
+ from torch import nn
25
+ from torch.nn import functional as F
26
+ from torch.utils.data import DataLoader
27
+
28
  import commons
29
  import utils
30
+ from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
31
  from models import SynthesizerTrn
32
+ from text.symbols import symbols
33
+ from text.cleaners import japanese_phrase_cleaners
34
+ from text import cleaned_text_to_sequence
35
+ from pypinyin import lazy_pinyin, Style
36
 
37
+ from scipy.io.wavfile import write
38
 
39
def get_text(text, hps):
    """Turn ``text`` into the ``LongTensor`` of symbol ids fed to the model.

    Args:
        text: Text handed directly to ``cleaned_text_to_sequence``; no cleaner
            is applied here (presumably the caller must pass already-cleaned
            text -- confirm against the ``text`` package).
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read.

    Returns:
        A ``torch.LongTensor`` built from the resulting id sequence.
    """
    token_ids = cleaned_text_to_sequence(text)
    if hps.data.add_blank:
        # Pass the ids through commons.intersperse with filler item 0
        # (presumably the blank symbol id -- confirm against the symbol table).
        token_ids = commons.intersperse(token_ids, 0)
    return torch.LongTensor(token_ids)
45
+ # hps_ms = utils.get_hparams_from_file("./configs/vctk_base.json")
46
 
47
 
48
# Hyper-parameters for the model served by this app.
hps = utils.get_hparams_from_file("./configs/tokaiteio.json")

# NOTE: a multi-speaker variant (hps_ms / net_g_ms, built with
# n_speakers=hps_ms.data.n_speakers) was sketched here and is disabled;
# the generator below is constructed without n_speakers.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
# Evaluation mode; the weights themselves are loaded later by the script
# entry point.
_ = net_g.eval()
62
+
63
+
64
def tts(text):
    """Synthesize ``text`` and play it through IPython's display machinery.

    Uses the globals ``net_g`` and ``hps`` and the sibling ``get_text``.

    Args:
        text: Input text; inputs longer than 150 characters are rejected.

    Returns:
        ``("Error: Text is too long", None)`` when the text is rejected;
        otherwise ``None`` after the audio widget has been displayed.
    """
    if len(text) > 150:
        return "Error: Text is too long", None
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        # .cpu() matches tts_fn: Tensor.numpy() only works on CPU tensors, and
        # the call is a no-op when the tensor already lives on the CPU.
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8,
                            length_scale=1)[0][0, 0].data.cpu().float().numpy()
    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))
 
 
73
 
74
 
75
def tts_fn(text, speaker_id=None):
    """Synthesize speech for ``text`` and return it in the UI's output format.

    Uses the globals ``net_g`` and ``hps`` and the sibling ``get_text``.

    Args:
        text: Input text; inputs longer than 150 characters are rejected.
        speaker_id: Accepted for interface compatibility only; it is not
            forwarded to ``net_g.infer``, so its value has no effect.

    Returns:
        ``(message, audio)`` where ``audio`` is ``None`` on rejection, or
        ``(hps.data.sampling_rate, waveform)`` with ``waveform`` a NumPy
        array on success.
    """
    if len(text) > 150:
        return "Error: Text is too long", None
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension
        # torch.LongTensor, not bare LongTensor: this file imports `torch`
        # but not the LongTensor name, so the bare form raises NameError.
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
            0, 0].data.cpu().float().numpy()
    return "Success", (hps.data.sampling_rate, audio)
85
 
86
 
if __name__ == '__main__':
    # `gr` is used throughout this block, but this version of the file has no
    # module-level gradio import; import it here so the script can start.
    import gradio as gr

    # Load the trained generator weights into the already-constructed net_g.
    _ = utils.load_checkpoint("G_50000.pth", net_g, None)

    app = gr.Blocks()

    with app:
        with gr.Tabs():
            # Tabs is a container for tab items, so the column is wrapped in one.
            with gr.TabItem("TTS"):
                with gr.Column():
                    tts_input1 = gr.TextArea(label="Text (150 words limitation)", value="こんにちは。")
                    tts_submit = gr.Button("Generate", variant="primary")
                    tts_output1 = gr.Textbox(label="Output Message")
                    tts_output2 = gr.Audio(label="Output Audio")

        # There is no speaker selector in this UI, so only the text box is an
        # input. tts_fn still takes a speaker_id positionally; None is passed
        # because tts_fn does not use the value.
        tts_submit.click(lambda text: tts_fn(text, None), [tts_input1], [tts_output1, tts_output2])

    app.launch()