# imagetoaudio / app.py — Hugging Face Space: image -> caption -> music
# (removed scraped page chrome: author line, "Update app.py", commit hash,
#  "raw / history blame / 4.16 kB")
import os
import sys
import types
import subprocess
import tempfile
# ── Environment variables ───────────────────────────────────────
# Force safetensors serialization and tell downstream libs to skip xformers.
os.environ["HF_FORCE_SAFE_SERIALIZATION"] = "1"
os.environ["XFORMERS_FORCE_DISABLE"] = "1"  # disable xformers
# ── ✨ Insert a dummy xformers module ───────────────────────────
# audiocraft imports xformers unconditionally; registering empty stub
# modules in sys.modules lets the import succeed on CPU-only hosts.
dummy = types.ModuleType("xformers")
dummy.ops = types.ModuleType("xformers.ops")  # audiocraft also looks up the ops submodule
sys.modules["xformers"] = dummy
sys.modules["xformers.ops"] = dummy.ops
# ───────────────────────────────────────────────────────────────
# ── Dynamic audiocraft installation ─────────────────────────────
# Try the import first; if the package is missing, pip-install it from
# GitHub at runtime and retry. NOTE(review): a runtime pip install only
# works on hosts that allow network access and writable site-packages.
try:
    from audiocraft.models import MusicGen
except ModuleNotFoundError:
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "git+https://github.com/facebookresearch/audiocraft@main",
        "--use-pep517"  # install including dependencies
    ])
    from audiocraft.models import MusicGen
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from audiocraft.data.audio import audio_write
from PIL import Image
import torch
# ───── Load the image-captioning model ─────────────────────────
# ViT encoder + GPT-2 decoder pipeline; safetensors + low_cpu_mem_usage
# keep startup memory modest on small Spaces hardware.
caption_model = VisionEncoderDecoderModel.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning",
    use_safetensors=True,
    low_cpu_mem_usage=True
)
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# ───── Load the MusicGen model ─────────────────────────────────
musicgen = MusicGen.get_pretrained("facebook/musicgen-small")
musicgen.set_generation_params(duration=10)  # length of generated music (seconds)
# ───── 이미지 β†’ μ„€λͺ… λ¬Έμž₯ 생성 ν•¨μˆ˜ ────────────────────────────
def generate_caption(image: Image.Image) -> str:
    """Describe *image* in English using the ViT-GPT2 captioning model."""
    features = feature_extractor(images=image, return_tensors="pt")
    generated_ids = caption_model.generate(features["pixel_values"], max_length=50)
    caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return caption
# ───── μ„€λͺ… β†’ μŒμ•… 생성 ν•¨μˆ˜ ──────────────────────────────────
def generate_music(prompt: str) -> str:
    """Generate a short music clip from *prompt* with MusicGen.

    Returns the path of the WAV file actually written to disk.
    """
    wav = musicgen.generate([prompt])  # batch size = 1
    tmp_dir = tempfile.mkdtemp()
    # BUG FIX: audiocraft's audio_write() takes a *stem* and appends the
    # ".wav" suffix itself. The original passed "musicgen_output.wav",
    # which wrote "musicgen_output.wav.wav" while returning a path that
    # did not exist. Pass the stem and return the path audio_write reports.
    stem = os.path.join(tmp_dir, "musicgen_output")
    saved = audio_write(stem, wav[0].cpu(), musicgen.sample_rate, strategy="loudness")
    return str(saved)
# ───── 전체 νŒŒμ΄ν”„λΌμΈ ────────────────────────────────────────
def process(image: Image.Image):
    """Full pipeline: caption the image, then turn the caption into music.

    Returns (caption, path-to-generated-audio).
    """
    description = generate_caption(image)
    music_prompt = f"A cheerful melody inspired by: {description}"
    return description, generate_music(music_prompt)
# ───── Gradio interface ────────────────────────────────────────
# Single image input; two outputs: the generated caption (text) and the
# generated music clip (audio). Labels/description are user-facing Korean
# strings and are left untouched.
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label="AIκ°€ μƒμ„±ν•œ κ·Έλ¦Ό μ„€λͺ…"),
        gr.Audio(label="μƒμ„±λœ AI μŒμ•… (MusicGen)")
    ],
    title="🎨 AI κ·Έλ¦Ό-μŒμ•… 생성기",
    description="그림을 μ—…λ‘œλ“œν•˜λ©΄ AIκ°€ μ„€λͺ…을 λ§Œλ“€κ³ , μ„€λͺ…을 λ°”νƒ•μœΌλ‘œ μŒμ•…μ„ 생성해 λ“€λ €μ€λ‹ˆλ‹€."
)

if __name__ == "__main__":
    demo.launch()