Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -66,32 +66,32 @@ class Tango:
|
|
| 66 |
else:
|
| 67 |
return list(self.chunks(outputs, samples))
|
| 68 |
|
| 69 |
-
# Initialize
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
-
def gradio_generate(prompt):
|
| 73 |
-
|
| 74 |
-
output_wave = tango.generate(prompt)
|
| 75 |
-
|
| 76 |
-
# Save the output_wave as a temporary WAV file
|
| 77 |
output_filename = "temp_output.wav"
|
| 78 |
wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
|
| 79 |
|
| 80 |
return output_filename
|
| 81 |
|
| 82 |
-
# Add the description text box
|
| 83 |
description_text = '''
|
| 84 |
TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
|
| 85 |
'''
|
| 86 |
|
| 87 |
-
#
|
| 88 |
input_text = gr.inputs.Textbox(lines=2, label="Prompt")
|
| 89 |
output_audio = gr.outputs.Audio(label="Generated Audio", type="filepath")
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
gr_interface = gr.Interface(
|
| 93 |
fn=gradio_generate,
|
| 94 |
-
inputs=input_text,
|
| 95 |
outputs=[output_audio],
|
| 96 |
title="TANGO: Text to Audio using Instruction-Guided Diffusion",
|
| 97 |
description="Generate audio using TANGO by providing a text prompt.",
|
|
@@ -99,16 +99,17 @@ gr_interface = gr.Interface(
|
|
| 99 |
examples=[
|
| 100 |
["An audience cheering and clapping"],
|
| 101 |
["Rolling thunder with lightning strikes"],
|
|
|
|
| 102 |
["A car engine revving"],
|
| 103 |
["A dog barking"],
|
| 104 |
["A cat meowing"],
|
| 105 |
["Emergency sirens wailing"],
|
| 106 |
["Whistling with birds chirping"],
|
| 107 |
-
["A
|
| 108 |
["Motor vehicles are driving with loud engines and a person whistles"],
|
| 109 |
-
["People cheering in a stadium while
|
| 110 |
["A helicopter is in flight"],
|
| 111 |
-
["A
|
| 112 |
],
|
| 113 |
cache_examples=False,
|
| 114 |
)
|
|
|
|
| 66 |
else:
|
| 67 |
return list(self.chunks(outputs, samples))
|
| 68 |
|
| 69 |
+
# Initialize TANGO once at import time, preferring GPU inference when CUDA
# is present and falling back to CPU otherwise.
tango = Tango() if torch.cuda.is_available() else Tango(device="cpu")
|
| 74 |
|
| 75 |
+
def gradio_generate(prompt, steps):
    """Generate audio for *prompt* with the given number of denoising steps.

    The waveform returned by the TANGO model is written to a temporary
    16 kHz, 16-bit WAV file whose path is returned for Gradio to play back.
    """
    wave = tango.generate(prompt, int(steps))
    out_path = "temp_output.wav"
    # sampwidth=2 -> 16-bit PCM samples at a 16 kHz sample rate.
    wavio.write(out_path, wave, rate=16000, sampwidth=2)
    return out_path
|
| 81 |
|
|
|
|
| 82 |
# Markdown blurb shown beneath the demo title in the Gradio UI.
description_text = '''
TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
'''
|
| 85 |
|
| 86 |
+
# Gradio input and output components.
# FIX: the file mixed the removed legacy namespaces (gr.inputs.Textbox,
# gr.outputs.Audio) with the modern top-level gr.Number; on current Gradio
# releases the legacy namespaces no longer exist and raise AttributeError at
# import, taking the Space down with "Runtime error". Use the top-level
# component classes consistently (same constructor arguments are accepted).
input_text = gr.Textbox(lines=2, label="Prompt")
output_audio = gr.Audio(label="Generated Audio", type="filepath")
denoising_steps = gr.Number(value=100, label="Steps", interactive=True, precision=0)
|
| 90 |
|
| 91 |
+
# Gradio interface
|
| 92 |
gr_interface = gr.Interface(
|
| 93 |
fn=gradio_generate,
|
| 94 |
+
inputs=[input_text, denoising_steps],
|
| 95 |
outputs=[output_audio],
|
| 96 |
title="TANGO: Text to Audio using Instruction-Guided Diffusion",
|
| 97 |
description="Generate audio using TANGO by providing a text prompt.",
|
|
|
|
| 99 |
examples=[
|
| 100 |
["An audience cheering and clapping"],
|
| 101 |
["Rolling thunder with lightning strikes"],
|
| 102 |
+
["Gentle water stream, birds chirping and sudden gun shot"]
|
| 103 |
["A car engine revving"],
|
| 104 |
["A dog barking"],
|
| 105 |
["A cat meowing"],
|
| 106 |
["Emergency sirens wailing"],
|
| 107 |
["Whistling with birds chirping"],
|
| 108 |
+
["A person snoring"],
|
| 109 |
["Motor vehicles are driving with loud engines and a person whistles"],
|
| 110 |
+
["People cheering in a stadium while thunder and lightning strikes"],
|
| 111 |
["A helicopter is in flight"],
|
| 112 |
+
["A dog barking and a man talking and a racing car passes by"],
|
| 113 |
],
|
| 114 |
cache_examples=False,
|
| 115 |
)
|