NeutralToucan

Running

Flux9665 committed on May 17

Commit

7d147e2

1 Parent(s): 987fd27

speed improvements and documentation

Files changed (2) hide show

InferenceInterfaces/ControllableInterface.py CHANGED Viewed

@@ -49,7 +49,7 @@ class ControllableInterface:
             else:
                 wavs = list()
                 pitch, energy, durations = None, None, None
-                for i in range(10):
                     self.wgan.set_latent(i)
                     controllability_vector = torch.tensor([0.0,
                                                            0.0,
@@ -71,12 +71,15 @@ class ControllableInterface:
                                                                    pitch=pitch,
                                                                    energy=energy,
                                                                    durations=durations)
                     wavs.append(wav)
                 wav = sum(wavs) / len(wavs)
         else:
             self.model.set_utterance_embedding(reference_audio)
-        if not voice_seed and reference_audio is not None:
             wav, sr, pitch, energy, durations = self.model(prompt,
                                                            input_is_phones=True,
                                                            duration_scaling_factor=1.0,

             else:
                 wavs = list()
                 pitch, energy, durations = None, None, None
+                for i in range(3, 8):
                     self.wgan.set_latent(i)
                     controllability_vector = torch.tensor([0.0,
                                                            0.0,
                                                                    pitch=pitch,
                                                                    energy=energy,
                                                                    durations=durations)
+                    print(pitch.shape)
+                    print(energy.shape)
+                    print(durations.shape)
                     wavs.append(wav)
                 wav = sum(wavs) / len(wavs)
         else:
             self.model.set_utterance_embedding(reference_audio)
+        if not voice_seed:
             wav, sr, pitch, energy, durations = self.model(prompt,
                                                            input_is_phones=True,
                                                            duration_scaling_factor=1.0,

app.py CHANGED Viewed

@@ -37,7 +37,7 @@ class TTSWebUI:
                                                      value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
                                                      label="IPA Input"),
                                           gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
-                                          gr.Checkbox(value=True, label="Speak in many Voices"),
                                           gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),

                                                      value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
                                                      label="IPA Input"),
                                           gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
+                                          gr.Checkbox(value=False, label="Speak in many Voices"),
                                           gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),