import torch
import gradio as gr
from transformers import ViTImageProcessor, ViTModel
from audiodiffusion import AudioDiffusionPipeline, ImageEncoder
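
# Run on the GPU when available. Two separate RNGs are used: `generator`
# seeds the pipeline's initial noise, `step_generator` the per-step
# denoising noise.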
device = "cuda" if torch.cuda.is_available() else "cpu"
generator1 = torch.Generator(device)
generator2 = torch.Generator(device)
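
# Load the image-conditioned audio diffusion pipeline, plus a ViT
# processor/model pair wrapped as the conditioning image encoder.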
pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff').to(device)
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
image_encoder = ImageEncoder(processor, extractor)
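

# Embed the conditioning image and add a middle dimension so the encoding
# matches the shape the pipeline expects.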
def _encode_image(image):
    return torch.unsqueeze(image_encoder.encode(image), dim=1).to(device)
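

# Run the diffusion pipeline on a single conditioned sample and return the
# generated Mel spectrogram image together with its reconstructed audio.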
def _generate_spectrogram(condition, steps, eta):
    images, (sample_rate, audios) = pipe(
        batch_size=1,
        steps=steps,
        generator=generator1,
        step_generator=generator2,
        encoding=condition,
        eta=eta,
        return_dict=False,
    )
    return images[0], (sample_rate, audios[0])
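

# Gradio callback: encode the uploaded image, then generate and return the
# spectrogram and audio.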
def run_generation(image, steps, eta):
    condition = _encode_image(image)
    spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
    return spectrogram, (sr, audio)
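

# Build the UI: conditioning image and sampling controls on the left,
# generated spectrogram and audio on the right.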
with gr.Blocks(title="Image-based soundtrack generation") as demo:
    gr.Markdown('''
    # Image-based soundtrack generation
    ''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(
                type="pil",
                label="Conditioning image"
            )
            steps = gr.Slider(
                minimum=1,
                maximum=1000,
                step=1,
                value=50,
                label="Denoising steps"
            )
            eta = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                step=0.1,
                value=0.9,
                label="η"
            )
            gr.Markdown('''
            Eta (η) controls the interpolation between deterministic DDIM sampling (η = 0.0) and stochastic DDPM sampling (η = 1.0).
            ''')
            btn = gr.Button("Generate")
            clear = gr.ClearButton(image)
        with gr.Column():
            spectrogram = gr.Image(
                label="Generated Mel spectrogram"
            )
            audio = gr.Audio(
                label="Resulting audio"
            )
    btn.click(run_generation, inputs=[image, steps, eta], outputs=[spectrogram, audio])

demo.launch()