Ahsen Khaliq committed
Commit · a32e829
1 Parent(s): 0e4103d
Update demo_cli.py
Files changed: demo_cli.py (+37 -37)
demo_cli.py
CHANGED
@@ -82,45 +82,45 @@ if __name__ == '__main__':
 
 
     ## Run a test
-    print("Testing your configuration with small inputs.")
-    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
-    # sampling rate, which may differ.
-    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
-    # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
-    # The sampling rate is the number of values (samples) recorded per second, it is set to
-    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
-    # to an audio of 1 second.
-    print(" Testing the encoder...")
-    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
+    # print("Testing your configuration with small inputs.")
+    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
+    # # sampling rate, which may differ.
+    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
+    # # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
+    # # The sampling rate is the number of values (samples) recorded per second, it is set to
+    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
+    # # to an audio of 1 second.
+    # print(" Testing the encoder...")
+    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))
 
-    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
-    # returns, but here we're going to make one ourselves just for the sake of showing that it's
-    # possible.
-    embed = np.random.rand(speaker_embedding_size)
-    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
-    # embeddings it will be).
-    embed /= np.linalg.norm(embed)
-    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
-    # illustrate that
-    embeds = [embed, np.zeros(speaker_embedding_size)]
-    texts = ["test 1", "test 2"]
-    print(" Testing the synthesizer... (loading the model will output a lot of text)")
-    mels = synthesizer.synthesize_spectrograms(texts, embeds)
+    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
+    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
+    # # possible.
+    # embed = np.random.rand(speaker_embedding_size)
+    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
+    # # embeddings it will be).
+    # embed /= np.linalg.norm(embed)
+    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
+    # # illustrate that
+    # embeds = [embed, np.zeros(speaker_embedding_size)]
+    # texts = ["test 1", "test 2"]
+    # print(" Testing the synthesizer... (loading the model will output a lot of text)")
+    # mels = synthesizer.synthesize_spectrograms(texts, embeds)
 
-    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
-    # can concatenate the mel spectrograms to a single one.
-    mel = np.concatenate(mels, axis=1)
-    # The vocoder can take a callback function to display the generation. More on that later. For
-    # now we'll simply hide it like this:
-    no_action = lambda *args: None
-    print(" Testing the vocoder...")
-    # For the sake of making this test short, we'll pass a short target length. The target length
-    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
-    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
-    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
-    # that has a detrimental effect on the quality of the audio. The default parameters are
-    # recommended in general.
-    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
+    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
+    # # can concatenate the mel spectrograms to a single one.
+    # mel = np.concatenate(mels, axis=1)
+    # # The vocoder can take a callback function to display the generation. More on that later. For
+    # # now we'll simply hide it like this:
+    # no_action = lambda *args: None
+    # print(" Testing the vocoder...")
+    # # For the sake of making this test short, we'll pass a short target length. The target length
+    # # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
+    # # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
+    # # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
+    # # that has a detrimental effect on the quality of the audio. The default parameters are
+    # # recommended in general.
+    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
 
     print("All test passed! You can now synthesize speech.\n\n")
 
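For reference, the sampling-rate arithmetic described in the comments this commit disables: audio is an array of floats in [-1, 1], and an array of <sampling_rate> samples always spans exactly one second. A standalone numpy sketch, using the 16000 Hz encoder rate quoted in those comments and no project models:

import numpy as np

sampling_rate = 16000  # samples per second, the encoder's rate per the comments above

# One second of silence: an array of <sampling_rate> zeros, values in [-1, 1]
one_second = np.zeros(sampling_rate, dtype=np.float32)

# Duration in seconds is just length divided by rate
print(len(one_second) / sampling_rate)  # 1.0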
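The dummy-embedding step in the disabled test draws a random vector and L2-normalizes it so it has unit Euclidean length, matching the property of embeddings returned by encoder.embed_utterance. A self-contained sketch of that normalization; the 256 dimensions are a placeholder standing in for the project's speaker_embedding_size:

import numpy as np

speaker_embedding_size = 256  # placeholder dimension for illustration

embed = np.random.rand(speaker_embedding_size)
embed /= np.linalg.norm(embed)  # divide by the L2 norm -> unit length

print(np.linalg.norm(embed))  # ~1.0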
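The synthesizer call is batched: a list of texts and a parallel list of embeddings yield one mel spectrogram per pair, and since the vocoder consumes a single spectrogram, the test joins them along the time axis. Sketched here with dummy arrays standing in for synthesizer.synthesize_spectrograms output; the 80 mel channels are an assumed, typical value:

import numpy as np

n_mels = 80  # mel-channel count; a typical value, assumed here

# Stand-ins for the synthesizer's output: one (n_mels, n_frames) array
# per input text, where n_frames varies with the text length
mels = [np.random.rand(n_mels, 50), np.random.rand(n_mels, 70)]

# axis=1 is the time axis, so this yields one (80, 120) spectrogram
# that the vocoder can process in a single call
mel = np.concatenate(mels, axis=1)
print(mel.shape)  # (80, 120)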
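The target/overlap numbers work out as the removed comments describe: target is the number of samples per chunk generated in parallel, so at 16000 Hz a target of 8000 is 0.5 s per chunk, while the test's target=200 (overlap=50) is a mere 12.5 ms, hence the warning about audio quality. A quick check of that arithmetic:

sampling_rate = 16000  # Hz, the rate quoted in the comments

for target in (8000, 200):  # samples generated per parallel chunk
    print(f"target={target}: {target / sampling_rate * 1000:.1f} ms per chunk")

# target=8000: 500.0 ms per chunk
# target=200: 12.5 ms per chunk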