speed improvements and documentation
- InferenceInterfaces/ControllableInterface.py +5 -2
- app.py +1 -1
InferenceInterfaces/ControllableInterface.py
CHANGED
@@ -49,7 +49,7 @@ class ControllableInterface:
         else:
             wavs = list()
             pitch, energy, durations = None, None, None
-            for i in range(
+            for i in range(3, 8):
                 self.wgan.set_latent(i)
                 controllability_vector = torch.tensor([0.0,
                                                        0.0,
@@ -71,12 +71,15 @@ class ControllableInterface:
                                  pitch=pitch,
                                  energy=energy,
                                  durations=durations)
+                print(pitch.shape)
+                print(energy.shape)
+                print(durations.shape)
                 wavs.append(wav)
             wav = sum(wavs) / len(wavs)
         else:
             self.model.set_utterance_embedding(reference_audio)

-            if not voice_seed
+            if not voice_seed:
                 wav, sr, pitch, energy, durations = self.model(prompt,
                                                                input_is_phones=True,
                                                                duration_scaling_factor=1.0,
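For readers skimming the diff: the first hunk follows a simple multi-voice averaging pattern. A fixed range of WGAN latent indices (3 through 7) is iterated, one waveform is synthesized per latent voice, and the per-voice waveforms are averaged into a single output. The sketch below is a minimal, self-contained illustration of that pattern only; synthesize_with_voice is a hypothetical stand-in for the self.wgan.set_latent(i) / self.model(...) calls in the actual code.

import torch

def synthesize_with_voice(latent_index: int, num_samples: int = 16000) -> torch.Tensor:
    # Hypothetical stand-in for self.wgan.set_latent(latent_index) followed by the
    # TTS forward pass; here it just returns a deterministic dummy waveform.
    generator = torch.Generator().manual_seed(latent_index)
    return torch.randn(num_samples, generator=generator)

wavs = list()
for i in range(3, 8):  # same fixed latent range as in the diff
    wavs.append(synthesize_with_voice(i))

# Average the per-voice waveforms into one signal, mirroring
# wav = sum(wavs) / len(wavs) in the hunk; assumes equal-length waveforms.
wav = sum(wavs) / len(wavs)
print(wav.shape)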
app.py
CHANGED
@@ -37,7 +37,7 @@ class TTSWebUI:
                                value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
                                label="IPA Input"),
                      gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
-                     gr.Checkbox(value=
+                     gr.Checkbox(value=False, label="Speak in many Voices"),
                      gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                      # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                      # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
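For context on the app.py change: in Gradio, each component in the inputs list maps positionally to one argument of the callback, so the new gr.Checkbox reaches the handler as a plain bool. A minimal sketch of that wiring, assuming a hypothetical read_aloud handler and only a subset of the Space's actual inputs:

import gradio as gr

def read_aloud(ipa_text, prosody_creativity, many_voices):
    # Hypothetical handler: the checkbox value arrives here as a bool, in the
    # same position it occupies in the inputs list below.
    return f"many_voices={many_voices}, creativity={prosody_creativity}"

demo = gr.Interface(fn=read_aloud,
                    inputs=[gr.Textbox(label="IPA Input"),
                            gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                            gr.Checkbox(value=False, label="Speak in many Voices")],
                    outputs=gr.Textbox(label="Debug Output"))

if __name__ == "__main__":
    demo.launch()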