File size: 2,820 Bytes
343dde8
6748e07
78ea8dc
d2e22b1
bfbdf81
343dde8
 
 
 
 
 
d2e22b1
 
343dde8
 
 
 
 
 
 
 
 
d2e22b1
78ea8dc
343dde8
 
 
 
 
 
 
 
 
87e6f23
6748e07
343dde8
6748e07
20017db
343dde8
 
 
 
 
 
 
4b414b1
343dde8
 
 
4b414b1
 
 
343dde8
 
 
 
 
 
 
d2e22b1
2b108d4
8e74b09
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os, tempfile, soundfile as sf
import gradio as gr
from PIL import Image
from transformers import pipeline

# ────────────────────────────────────────────────
# 1. Load the pipelines (CPU: device=-1)
# ────────────────────────────────────────────────
CAPTION_ID = "Salesforce/blip-image-captioning-base"   # smaller option: blip-image-captioning
MUSIC_ID   = "facebook/musicgen-melody"                # smaller option: musicgen-small

# Target clip length. MusicGen's generate() has no `duration` argument; output
# length is set by the number of generated audio-token frames, which the model
# card gives as roughly 50 per second of audio.
MUSIC_SECONDS = 10
MUSIC_FRAMES_PER_SECOND = 50

# Image -> caption pipeline, pinned to CPU.
caption_pipe = pipeline(
    "image-to-text",
    model=CAPTION_ID,
    device=-1,
)

# Text -> audio pipeline, pinned to CPU.
music_pipe = pipeline(
    "text-to-audio",
    model=MUSIC_ID,
    device=-1,
    # Expressed in tokens (not seconds) because that is what generate() accepts.
    generate_kwargs={"max_new_tokens": MUSIC_SECONDS * MUSIC_FRAMES_PER_SECOND},
)

# ────────────────────────────────────────────────
# 2. Utility functions
# ────────────────────────────────────────────────
def generate_caption(img: Image.Image) -> str:
    """Return the caption text produced by ``caption_pipe`` for *img*.

    Only the first candidate returned by the pipeline is used.
    """
    candidates = caption_pipe(img)
    best = candidates[0]
    return best["generated_text"]

def generate_music(prompt: str, duration_s: float = 10.0) -> str:
    """Generate music for *prompt* and return the path of the written WAV file.

    Args:
        prompt: Text description handed to ``music_pipe``.
        duration_s: Approximate clip length in seconds (default 10).

    Returns:
        Path of a temporary ``.wav`` file. The file is created with
        ``delete=False``, so the caller (or the OS temp cleaner) is
        responsible for removing it.
    """
    # Local import keeps this function self-contained; numpy is already a
    # dependency of soundfile.
    import numpy as np

    # MusicGen length is controlled by generated token frames (about 50 per
    # second according to the model card), not by a duration argument.
    max_new_tokens = max(1, int(duration_s * 50))
    result = music_pipe(
        prompt,
        forward_params={"do_sample": True, "max_new_tokens": max_new_tokens},
    )
    # A single prompt yields one dict; tolerate a list of dicts as well so the
    # function works with either return shape.
    if isinstance(result, list):
        result = result[0]
    sr = result["sampling_rate"]

    # soundfile expects (frames,) or (frames, channels). Drop singleton
    # batch/channel axes and put the time axis first when channels lead.
    audio = np.squeeze(np.asarray(result["audio"]))
    if audio.ndim == 2 and audio.shape[0] < audio.shape[1]:
        audio = audio.T

    # Reserve a unique file name, then close the handle before writing so the
    # descriptor is not leaked and the path can be reopened on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name
    sf.write(wav_path, audio, sr)
    return wav_path

# ────────────────────────────────────────────────
# 3. Full pipeline
# ────────────────────────────────────────────────
def process(image: "Image.Image | None") -> "tuple[str, str | None]":
    """Caption *image*, then generate music from that caption.

    Args:
        image: The uploaded picture, or ``None`` when the form is submitted
            without an image.

    Returns:
        ``(caption, audio_path)``. When no image was supplied the result is
        ``("", None)`` so both outputs are simply left empty instead of the
        captioning model being called with ``None``.
    """
    if image is None:
        return "", None
    caption = generate_caption(image)
    audio_path = generate_music(f"A cheerful melody inspired by: {caption}")
    return caption, audio_path

# ────────────────────────────────────────────────
# 4. Gradio UI
# ────────────────────────────────────────────────
# Single-image form wired to process(): one text output for the caption and
# one audio player for the generated clip. The user-facing strings are Korean;
# they were stored as mis-decoded UTF-8 and are restored here.
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label="AI가 생성한 그림 설명"),        # "AI-generated picture description"
        gr.Audio(label="생성된 AI 음악 (MusicGen)"),   # "Generated AI music (MusicGen)"
    ],
    title="🎨 로컬 BLIP-base + MusicGen-melody",       # "Local BLIP-base + MusicGen-melody"
    # "Upload an image and BLIP-base writes a description, then
    #  MusicGen-melody turns that description into 10 seconds of music."
    description="이미지를 업로드하면 BLIP-base가 설명을 생성하고, "
                "그 설명으로 MusicGen-melody가 10초 음악을 만듭니다.",
).queue()

if __name__ == "__main__":
    demo.launch()