Ahsen Khaliq committed
Commit · a32e829
1 Parent(s): 0e4103d
Update demo_cli.py
Files changed: demo_cli.py (+37 -37)
demo_cli.py
CHANGED
@@ -82,45 +82,45 @@ if __name__ == '__main__':
 
 
     ## Run a test
-    print("Testing your configuration with small inputs.")
-    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
-    # sampling rate, which may differ.
-    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
-    # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
-    # The sampling rate is the number of values (samples) recorded per second, it is set to
-    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
-    # to an audio of 1 second.
-    print(" Testing the encoder...")
-    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
+    # print("Testing your configuration with small inputs.")
+    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
+    # # sampling rate, which may differ.
+    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
+    # # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
+    # # The sampling rate is the number of values (samples) recorded per second, it is set to
+    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
+    # # to an audio of 1 second.
+    # print(" Testing the encoder...")
+    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))
 
-    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
-    # returns, but here we're going to make one ourselves just for the sake of showing that it's
-    # possible.
-    embed = np.random.rand(speaker_embedding_size)
-    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
-    # embeddings it will be).
-    embed /= np.linalg.norm(embed)
-    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
-    # illustrate that
-    embeds = [embed, np.zeros(speaker_embedding_size)]
-    texts = ["test 1", "test 2"]
-    print(" Testing the synthesizer... (loading the model will output a lot of text)")
-    mels = synthesizer.synthesize_spectrograms(texts, embeds)
+    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
+    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
+    # # possible.
+    # embed = np.random.rand(speaker_embedding_size)
+    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
+    # # embeddings it will be).
+    # embed /= np.linalg.norm(embed)
+    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
+    # # illustrate that
+    # embeds = [embed, np.zeros(speaker_embedding_size)]
+    # texts = ["test 1", "test 2"]
+    # print(" Testing the synthesizer... (loading the model will output a lot of text)")
+    # mels = synthesizer.synthesize_spectrograms(texts, embeds)
 
-    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
-    # can concatenate the mel spectrograms to a single one.
-    mel = np.concatenate(mels, axis=1)
-    # The vocoder can take a callback function to display the generation. More on that later. For
-    # now we'll simply hide it like this:
-    no_action = lambda *args: None
-    print(" Testing the vocoder...")
-    # For the sake of making this test short, we'll pass a short target length. The target length
-    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
-    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
-    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
-    # that has a detrimental effect on the quality of the audio. The default parameters are
-    # recommended in general.
-    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
+    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
+    # # can concatenate the mel spectrograms to a single one.
+    # mel = np.concatenate(mels, axis=1)
+    # # The vocoder can take a callback function to display the generation. More on that later. For
+    # # now we'll simply hide it like this:
+    # no_action = lambda *args: None
+    # print(" Testing the vocoder...")
+    # # For the sake of making this test short, we'll pass a short target length. The target length
+    # # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
+    # # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
+    # # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
+    # # that has a detrimental effect on the quality of the audio. The default parameters are
+    # # recommended in general.
+    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
 
     print("All test passed! You can now synthesize speech.\n\n")
 
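For reference, the sampling-rate arithmetic described in the comments this commit disables: audio is an array of floats in [-1, 1], and an array of <sampling_rate> samples always spans exactly one second. A standalone numpy sketch, using the 16000 Hz encoder rate quoted in those comments and no project models:

import numpy as np

sampling_rate = 16000  # samples per second, the encoder's rate per the comments above

# One second of silence: an array of <sampling_rate> zeros, values in [-1, 1]
one_second = np.zeros(sampling_rate, dtype=np.float32)

# Duration in seconds is just length divided by rate
print(len(one_second) / sampling_rate)  # 1.0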
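The dummy-embedding step in the disabled test draws a random vector and L2-normalizes it so it has unit Euclidean length, matching the property of embeddings returned by encoder.embed_utterance. A self-contained sketch of that normalization; the 256 dimensions are a placeholder standing in for the project's speaker_embedding_size:

import numpy as np

speaker_embedding_size = 256  # placeholder dimension for illustration

embed = np.random.rand(speaker_embedding_size)
embed /= np.linalg.norm(embed)  # divide by the L2 norm -> unit length

print(np.linalg.norm(embed))  # ~1.0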
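The synthesizer call is batched: a list of texts and a parallel list of embeddings yield one mel spectrogram per pair, and since the vocoder consumes a single spectrogram, the test joins them along the time axis. Sketched here with dummy arrays standing in for synthesizer.synthesize_spectrograms output; the 80 mel channels are an assumed, typical value:

import numpy as np

n_mels = 80  # mel-channel count; a typical value, assumed here

# Stand-ins for the synthesizer's output: one (n_mels, n_frames) array
# per input text, where n_frames varies with the text length
mels = [np.random.rand(n_mels, 50), np.random.rand(n_mels, 70)]

# axis=1 is the time axis, so this yields one (80, 120) spectrogram
# that the vocoder can process in a single call
mel = np.concatenate(mels, axis=1)
print(mel.shape)  # (80, 120)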
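The target/overlap numbers work out as the removed comments describe: target is the number of samples per chunk generated in parallel, so at 16000 Hz a target of 8000 is 0.5 s per chunk, while the test's target=200 (overlap=50) is a mere 12.5 ms, hence the warning about audio quality. A quick check of that arithmetic:

sampling_rate = 16000  # Hz, the rate quoted in the comments

for target in (8000, 200):  # samples generated per parallel chunk
    print(f"target={target}: {target / sampling_rate * 1000:.1f} ms per chunk")

# target=8000: 500.0 ms per chunk
# target=200: 12.5 ms per chunk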