# imagetoaudio / app.py
# yongyeol's picture
# Update app.py
# 343dde8 verified
import os, tempfile, soundfile as sf
import gradio as gr
from PIL import Image
from transformers import pipeline
# ────────────────────────────────────────────────
# 1. Load the pipelines (CPU: device=-1)
# ────────────────────────────────────────────────
# Hugging Face model ids; the trailing notes name the smaller checkpoints
# the author listed as lower-footprint alternatives.
CAPTION_ID = "Salesforce/blip-image-captioning-base"  # smaller size: blip-image-captioning
MUSIC_ID = "facebook/musicgen-melody"  # smaller size: musicgen-small

# Image captioning pipeline, pinned to the CPU (device=-1).
caption_pipe = pipeline(
    "image-to-text",
    model=CAPTION_ID,
    device=-1,
)

# Text-to-music pipeline, pinned to the CPU (device=-1).
music_pipe = pipeline(
    "text-to-audio",
    model=MUSIC_ID,
    device=-1,
    # transformers' generate() has no `duration` argument; clip length is
    # controlled by max_new_tokens. MusicGen decodes about 50 audio tokens
    # per second, so 500 tokens is roughly the intended 10 seconds.
    generate_kwargs={"max_new_tokens": 500},
)
# ────────────────────────────────────────────────
# 2. Utility functions
# ────────────────────────────────────────────────
def generate_caption(img: Image.Image) -> str:
    """Return the text of the first caption the captioning pipeline yields.

    Args:
        img: Image to describe.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    predictions = caption_pipe(img)
    first_prediction = predictions[0]
    return first_prediction["generated_text"]
def generate_music(prompt: str, duration_s: float = 10.0) -> str:
    """Generate a music clip for ``prompt`` and save it as a WAV file.

    Args:
        prompt: Text description handed to the text-to-audio pipeline.
        duration_s: Approximate clip length in seconds. It is converted to a
            token budget because MusicGen's length is set via
            ``max_new_tokens`` (about 50 tokens per second of audio).

    Returns:
        Path of a temporary ``.wav`` file holding the clip. The file is
        created with ``delete=False``, so it is not removed automatically.
    """
    import numpy as np  # local import: the module header does not import numpy

    # Translate the requested length into MusicGen's token budget.
    max_new_tokens = max(1, int(round(duration_s * 50)))
    result = music_pipe(
        prompt,
        forward_params={"do_sample": True, "max_new_tokens": max_new_tokens},
    )
    # A single prompt yields one dict; only batched input yields a list, so
    # indexing with [0] unconditionally would raise KeyError on the dict.
    if isinstance(result, (list, tuple)):
        result = result[0]
    sr = result["sampling_rate"]

    # soundfile expects (frames,) or (frames, channels). The pipeline output
    # may carry leading batch/channel axes (e.g. (1, 1, N)) -- drop singleton
    # axes and put channels last. NOTE(review): confirm the exact shape for
    # the installed transformers version.
    audio = np.squeeze(np.asarray(result["audio"]))
    if audio.ndim == 2 and audio.shape[0] < audio.shape[1]:
        audio = audio.T

    # Reserve a unique path and close the handle before writing, so no open
    # file descriptor is leaked and the file is not written while still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name
    sf.write(wav_path, audio, sr)
    return wav_path
# ────────────────────────────────────────────────
# 3. End-to-end pipeline
# ────────────────────────────────────────────────
def process(image):
    """Caption the uploaded image and compose a music clip from that caption.

    Args:
        image: Image supplied by the Gradio input component, or ``None`` when
            the form is submitted without an upload.

    Returns:
        A ``(caption, audio)`` tuple: the caption text and the value returned
        by ``generate_music`` for the caption-based prompt.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Fail with a readable message in the UI instead of an opaque
        # traceback from the captioning pipeline.
        # Message reads: "Please upload an image first."
        raise gr.Error("이미지를 먼저 업로드해 주세요.")
    caption = generate_caption(image)
    audio = generate_music(f"A cheerful melody inspired by: {caption}")
    return caption, audio
# ────────────────────────────────────────────────
# 4. Gradio UI
# ────────────────────────────────────────────────
# Gradio front end: one image in, caption text and generated audio out.
# The user-facing strings below were mis-encoded (UTF-8 read through another
# code page); they are restored to the intended Korean text.
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),  # the callback receives a PIL image
    outputs=[
        gr.Text(label="AI가 생성한 그림 설명"),  # "AI-generated picture description"
        gr.Audio(label="생성된 AI 음악 (MusicGen)"),  # "Generated AI music (MusicGen)"
    ],
    title="🎨 로컬 BLIP-base + MusicGen-melody",  # "Local BLIP-base + MusicGen-melody"
    # "Upload an image and BLIP-base writes a description; MusicGen-melody
    # then makes 10 seconds of music from that description."
    description=(
        "이미지를 업로드하면 BLIP-base가 설명을 생성하고, "
        "그 설명으로 MusicGen-melody가 10초 음악을 만듭니다."
    ),
).queue()

if __name__ == "__main__":
    demo.launch()