# Hugging Face Space: image captioning + MusicGen text-to-music demo.
# (Web-page scrape residue — blame hashes and line-number gutter — removed.)
import os
import sys
import types
import subprocess
import tempfile

# -- Environment variables ------------------------------------------------
os.environ["HF_FORCE_SAFE_SERIALIZATION"] = "1"
os.environ["XFORMERS_FORCE_DISABLE"] = "1"  # disable xformers

# -- Stub xformers modules ------------------------------------------------
# audiocraft imports xformers unconditionally; register empty placeholder
# modules so the import succeeds even though xformers is not installed.
dummy = types.ModuleType("xformers")
dummy.ops = types.ModuleType("xformers.ops")  # audiocraft also looks up the ops submodule
sys.modules["xformers"] = dummy
sys.modules["xformers.ops"] = dummy.ops

# -- Install audiocraft on demand -----------------------------------------
try:
    from audiocraft.models import MusicGen
except ModuleNotFoundError:
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "git+https://github.com/facebookresearch/audiocraft@main",
        "--use-pep517",  # build via PEP 517 so dependencies install cleanly
    ])
    from audiocraft.models import MusicGen

import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from audiocraft.data.audio import audio_write
from PIL import Image
import torch
# -- Image-captioning model (ViT encoder + GPT-2 decoder) ------------------
caption_model = VisionEncoderDecoderModel.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning",
    use_safetensors=True,      # forced safe serialization (see env var above)
    low_cpu_mem_usage=True,
)
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# -- MusicGen model --------------------------------------------------------
musicgen = MusicGen.get_pretrained("facebook/musicgen-small")
musicgen.set_generation_params(duration=10)  # length of generated audio (seconds)

# -- Image -> caption ------------------------------------------------------
def generate_caption(image: Image.Image) -> str:
    """Return a short English caption describing *image*.

    Uses the module-level captioning model/processor/tokenizer loaded above.
    """
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    # Inference only — skip autograd bookkeeping to save memory/time.
    with torch.no_grad():
        output_ids = caption_model.generate(pixel_values, max_length=50)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# -- Caption -> music ------------------------------------------------------
def generate_music(prompt: str) -> str:
    """Generate a short music clip from *prompt* and return the .wav file path."""
    wav = musicgen.generate([prompt])  # batch size = 1
    tmp_dir = tempfile.mkdtemp()
    # audiocraft's audio_write takes a stem WITHOUT extension and appends
    # ".wav" itself; passing "musicgen_output.wav" would write
    # "musicgen_output.wav.wav" and return a path that does not exist.
    stem = os.path.join(tmp_dir, "musicgen_output")
    audio_write(stem, wav[0].cpu(), musicgen.sample_rate, strategy="loudness")
    return stem + ".wav"
# -- Full pipeline ---------------------------------------------------------
def process(image: Image.Image):
    """Caption the image, then compose a music clip from the caption."""
    description = generate_caption(image)
    music_prompt = f"A cheerful melody inspired by: {description}"
    return description, generate_music(music_prompt)
# -- Gradio interface ------------------------------------------------------
# NOTE(review): the original Korean UI strings were mojibake-corrupted (and
# split mid-literal across lines, breaking the syntax); they could not be
# recovered exactly, so English equivalents are used instead.
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label="AI-generated image description"),
        gr.Audio(label="Generated AI music (MusicGen)"),
    ],
    title="AI Image-to-Music Generator",
    description="Upload a picture: the AI writes a caption, then composes music based on it.",
)

if __name__ == "__main__":
    demo.launch()