Flux9665 committed on
Commit
7d147e2
·
1 Parent(s): 987fd27

speed improvements and documentation

Browse files
InferenceInterfaces/ControllableInterface.py CHANGED
@@ -49,7 +49,7 @@ class ControllableInterface:
49
  else:
50
  wavs = list()
51
  pitch, energy, durations = None, None, None
52
- for i in range(10):
53
  self.wgan.set_latent(i)
54
  controllability_vector = torch.tensor([0.0,
55
  0.0,
@@ -71,12 +71,15 @@ class ControllableInterface:
71
  pitch=pitch,
72
  energy=energy,
73
  durations=durations)
 
 
 
74
  wavs.append(wav)
75
  wav = sum(wavs) / len(wavs)
76
  else:
77
  self.model.set_utterance_embedding(reference_audio)
78
 
79
- if not voice_seed and reference_audio is not None:
80
  wav, sr, pitch, energy, durations = self.model(prompt,
81
  input_is_phones=True,
82
  duration_scaling_factor=1.0,
 
49
  else:
50
  wavs = list()
51
  pitch, energy, durations = None, None, None
52
+ for i in range(3, 8):
53
  self.wgan.set_latent(i)
54
  controllability_vector = torch.tensor([0.0,
55
  0.0,
 
71
  pitch=pitch,
72
  energy=energy,
73
  durations=durations)
74
+ print(pitch.shape)
75
+ print(energy.shape)
76
+ print(durations.shape)
77
  wavs.append(wav)
78
  wav = sum(wavs) / len(wavs)
79
  else:
80
  self.model.set_utterance_embedding(reference_audio)
81
 
82
+ if not voice_seed:
83
  wav, sr, pitch, energy, durations = self.model(prompt,
84
  input_is_phones=True,
85
  duration_scaling_factor=1.0,
app.py CHANGED
@@ -37,7 +37,7 @@ class TTSWebUI:
37
  value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
38
  label="IPA Input"),
39
  gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
40
- gr.Checkbox(value=True, label="Speak in many Voices"),
41
  gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
42
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
43
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
 
37
  value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
38
  label="IPA Input"),
39
  gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
40
+ gr.Checkbox(value=False, label="Speak in many Voices"),
41
  gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
42
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
43
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),