Spaces:
Runtime error
Runtime error
File size: 2,217 Bytes
4b414b1 20017db 4b414b1 7d967cc 20017db 8e74b09 4b414b1 20017db 8e74b09 4b414b1 8e74b09 4b414b1 20017db 8e74b09 20017db 4b414b1 8e74b09 4b414b1 8e74b09 20017db 4b414b1 8e74b09 4b414b1 8e74b09 4b414b1 8e74b09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
from PIL import Image
import torch
import os
import tempfile
# ───── Load the image captioning model ─────
# The captioning model, its image preprocessor and its tokenizer are all
# loaded from the same checkpoint, so the identifier is spelled out once.
_CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

caption_model = VisionEncoderDecoderModel.from_pretrained(_CAPTION_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(_CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CAPTION_CHECKPOINT)
# ───── Load the MusicGen model ─────
# Length, in seconds, of the music clip to generate.
_CLIP_DURATION_SECONDS = 10

musicgen = MusicGen.get_pretrained("facebook/musicgen-small")
musicgen.set_generation_params(duration=_CLIP_DURATION_SECONDS)
# ───── Image → descriptive sentence ─────
def generate_caption(image):
    """Describe an image in one sentence with the ViT-GPT2 captioning model.

    Args:
        image: The picture to caption. The Gradio input in this file supplies
            a ``PIL.Image.Image``; any other input type is handed to the
            feature extractor unchanged.

    Returns:
        The decoded caption string, with special tokens stripped.
    """
    # The checkpoint's reference usage converts non-RGB images (grayscale,
    # palette, RGBA, ...) to RGB before feature extraction; do the same so
    # direct callers are not limited to RGB input.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    output_ids = caption_model.generate(pixel_values, max_length=50)
    # generate() returns one sequence per input image; only one was given.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# ───── Description → music ─────
def generate_music(prompt):
    """Generate a music clip from a text prompt and save it as a WAV file.

    Args:
        prompt: Text description used to condition MusicGen.

    Returns:
        The path (``str``) of the WAV file that was written. The file lives
        in a freshly created temporary directory that is not removed here,
        because the caller still has to read the file after this returns.
    """
    wav = musicgen.generate([prompt])  # batch of one prompt

    tmp_dir = tempfile.mkdtemp()
    # audio_write() takes a file *stem* and appends the format suffix itself.
    # Passing "musicgen_output.wav" would create "musicgen_output.wav.wav"
    # and leave the path we hand back pointing at nothing.
    stem = os.path.join(tmp_dir, "musicgen_output")

    # The generated tensor may live on a GPU; move it to the CPU before it is
    # written to disk (a no-op when it is already there).
    written_path = audio_write(
        stem,
        wav[0].cpu(),
        musicgen.sample_rate,
        strategy="loudness",
    )

    # Use the path reported by audio_write() rather than rebuilding it.
    return str(written_path)
# ───── Wire the full pipeline together ─────
def process(image):
    """Run the full pipeline: caption the image, then compose music from it.

    Args:
        image: The uploaded picture, or ``None`` when the form was submitted
            without one.

    Returns:
        A ``(caption, audio_path)`` tuple: the generated caption text and the
        path of the generated audio file.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Show a readable message in the UI instead of an opaque failure
        # inside the captioning model.
        # Message: "Please upload an image first."
        raise gr.Error("이미지를 먼저 업로드해 주세요.")

    caption = generate_caption(image)
    prompt = f"A cheerful melody inspired by: {caption}"
    audio_path = generate_music(prompt)
    return caption, audio_path
# ───── Build the Gradio interface ─────
# Gradio UI: one image in, caption text plus generated audio out.
# The Korean UI strings are given as proper UTF-8 text; a mis-decoded copy
# splits them across lines and leaves the string literals unterminated.
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        # "Picture description generated by the AI"
        gr.Text(label="AI가 생성한 그림 설명"),
        # "Generated AI music (MusicGen)"
        gr.Audio(label="생성된 AI 음악 (MusicGen)"),
    ],
    # "AI picture-to-music generator"
    title="🎨 AI 그림 음악 생성기",
    # "Upload a picture and the AI writes a description, then composes and
    # plays music based on that description."
    description=(
        "그림을 업로드하면 AI가 설명을 만들고, "
        "설명을 바탕으로 음악을 만들어 들려줍니다."
    ),
)
# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|