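"""Gradio demo for image-conditioned soundtrack generation.

A ViT encoder embeds the input image, and that embedding conditions an
AudioDiffusionPipeline checkpoint ('Woleek/clMusDiff') that generates a Mel
spectrogram together with the corresponding audio.
"""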
import torch
import gradio as gr
from transformers import ViTImageProcessor, ViTModel
from audiodiffusion import AudioDiffusionPipeline, ImageEncoder

# Run on GPU when available; use two independent RNGs, one for the initial
# noise sample and one for the per-step sampling noise.
device = "cuda" if torch.cuda.is_available() else "cpu"
generator1 = torch.Generator(device)
generator2 = torch.Generator(device)

# Conditional audio-diffusion checkpoint that generates Mel spectrograms.
pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff').to(device)

# ViT backbone used to embed the conditioning image.
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
image_encoder = ImageEncoder(processor, extractor)

def _encode_image(image):
    # Add a sequence dimension so the embedding has the (batch, sequence,
    # features) layout the pipeline expects as conditioning.
    return torch.unsqueeze(image_encoder.encode(image), dim=1).to(device)

def _generate_spectrogram(condition, steps, eta):
    # Run the diffusion pipeline; with return_dict=False it returns the
    # spectrogram images and a (sample_rate, audios) pair.
    images, (sample_rate, audios) = pipe(
        batch_size=1,
        steps=steps,
        generator=generator1,
        step_generator=generator2,
        encoding=condition,
        eta=eta,
        return_dict=False,
    )
    return images[0], (sample_rate, audios[0])

def run_generation(image, steps, eta):
    # Encode the image, then synthesize a spectrogram and its rendered audio.
    condition = _encode_image(image)
    spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
    return spectrogram, (sr, audio)

with gr.Blocks(title="Image-based soundtrack generation") as demo:
    gr.Markdown('''
        # Image-based soundtrack generation
    ''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(
                type="pil", 
                label="Conditioning image"
            )
            steps = gr.Slider(
                minimum=1, 
                maximum=1000, 
                step=1, 
                value=50,
                label="Denoising steps"
            )
            eta = gr.Slider(
                minimum=0.1, 
                maximum=1.0, 
                step=0.1, 
                value=0.9,
                label="η"
            )
            gr.Markdown('''
                Eta (η) interpolates between deterministic DDIM sampling (η=0.0) and fully stochastic DDPM sampling (η=1.0).
            ''')
            btn = gr.Button("Generate")
            clear = gr.ClearButton(image)
        with gr.Column():
            spectrogram = gr.Image(
                label="Generated Mel spectrogram"
            )
            audio = gr.Audio(
                label="Resulting audio"
            )
    btn.click(run_generation, inputs=[image, steps, eta], outputs=[spectrogram, audio])

demo.launch()
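
# Note: demo.launch() serves locally by default. If a single generation can be
# slow (e.g. many denoising steps), enabling Gradio's request queue first
# (demo.queue().launch()) is a common way to avoid request timeouts.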