File size: 4,369 Bytes
6748e07
 
78ea8dc
bfbdf81
2a55caa
6748e07
2a55caa
6748e07
 
 
78ea8dc
6748e07
 
 
87e6f23
2a55caa
6748e07
2a55caa
6748e07
 
 
 
2a55caa
6748e07
 
 
 
 
 
8e74b09
2a55caa
6748e07
2a55caa
6748e07
 
 
 
07cf72c
6748e07
 
 
 
 
20017db
2a55caa
6748e07
2a55caa
6748e07
4b414b1
6748e07
 
4b414b1
2a55caa
6748e07
2a55caa
2b108d4
4b414b1
 
 
8e74b09
 
 
 
6748e07
2b108d4
 
 
 
 
4b414b1
 
2b108d4
8e74b09
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os, io, base64, tempfile, requests
import gradio as gr
from PIL import Image

# ─────────────────────────────────────────────────────────────
# 1. Environment variables & HF Inference API configuration
# ─────────────────────────────────────────────────────────────
HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face API token, set via Spaces Settings β†’ Secrets
if not HF_TOKEN:
    # Fail fast at import time: every API call below requires this bearer token.
    raise RuntimeError("HF_TOKEN λΉ„λ°€ 값이 μ„€μ •λ˜μ–΄ μžˆμ§€ μ•ŠμŠ΅λ‹ˆλ‹€. Spaces Settings β†’ Secretsμ—μ„œ 등둝해 μ£Όμ„Έμš”.")

# Shared Authorization header for both Inference API endpoints.
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
# Hosted Inference API model endpoints:
# BLIP-base for image captioning, MusicGen-small for text-to-music.
CAPTION_API = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
MUSIC_API   = "https://api-inference.huggingface.co/models/facebook/musicgen-small"

# ─────────────────────────────────────────────────────────────
# 2. 이미지 μΊ‘μ…˜ 생성 (BLIP-base via API)
# ─────────────────────────────────────────────────────────────
def generate_caption(image_pil: Image.Image) -> str:
    """Generate an English caption for *image_pil* via the BLIP-base Inference API.

    Parameters
    ----------
    image_pil : PIL.Image.Image
        The image to describe.

    Returns
    -------
    str
        The caption text produced by the model.

    Raises
    ------
    requests.HTTPError
        If the API responds with a non-2xx status.
    RuntimeError
        If the API responds 200 with a JSON error payload
        (typically while the model is still cold-loading).
    """
    buf = io.BytesIO()
    image_pil.save(buf, format="PNG")

    # Binary upload: the captioning endpoint accepts raw image bytes.
    # (getvalue() returns the whole buffer regardless of stream position,
    # so no seek(0) is needed.)
    response = requests.post(CAPTION_API, headers=HEADERS, data=buf.getvalue(), timeout=60)
    response.raise_for_status()
    result = response.json()

    # The Inference API can return HTTP 200 with {"error": ..., "estimated_time": ...}
    # while the model loads; surface that clearly instead of crashing on result[0].
    if isinstance(result, dict) and "error" in result:
        raise RuntimeError(f"Caption API error: {result['error']}")

    # Success payload shape: [{"generated_text": "..."}]
    return result[0]["generated_text"]

# ─────────────────────────────────────────────────────────────
# 3. MusicGen-small μŒμ•… 생성 (10초, via API)
# ─────────────────────────────────────────────────────────────
def generate_music(prompt: str, duration: int = 10) -> str:
    """Generate a short music clip from *prompt* with MusicGen-small via the Inference API.

    Parameters
    ----------
    prompt : str
        Text description of the desired music.
    duration : int, optional
        Clip length in seconds (default 10).

    Returns
    -------
    str
        Path to a temporary ``.wav`` file holding the generated audio.
        The file is created with ``delete=False``; the caller (Gradio)
        reads it by path.

    Raises
    ------
    requests.HTTPError
        If the API responds with a non-2xx status.
    RuntimeError
        If the API responds 200 with a JSON error payload instead of
        WAV bytes (e.g. while the model is cold-loading).
    """
    payload = {"inputs": prompt, "parameters": {"duration": duration}}
    response = requests.post(MUSIC_API, headers=HEADERS, json=payload, timeout=120)
    response.raise_for_status()

    # A 200 response can still carry a JSON error body rather than audio;
    # writing that into a .wav file would yield a silently broken player.
    content_type = response.headers.get("content-type", "")
    if content_type.startswith("application/json"):
        raise RuntimeError(f"Music API error: {response.text}")

    # Persist the WAV binary to a temp file; the context manager guarantees
    # the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(response.content)
    return tmp.name

# ─────────────────────────────────────────────────────────────
# 4. 전체 νŒŒμ΄ν”„λΌμΈ
# ─────────────────────────────────────────────────────────────
def process(image):
    """Run the full pipeline for one uploaded image.

    First asks BLIP for a caption of *image*, then feeds that caption
    into MusicGen as a music prompt.

    Returns a ``(caption, wav_path)`` tuple matching the two Gradio outputs.
    """
    description = generate_caption(image)
    music_prompt = f"A cheerful melody inspired by: {description}"
    wav_path = generate_music(music_prompt)
    return description, wav_path

# ─────────────────────────────────────────────────────────────
# 5. Gradio μΈν„°νŽ˜μ΄μŠ€
# ─────────────────────────────────────────────────────────────
# Gradio UI: one image input; two outputs (caption text + generated audio).
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label="AIκ°€ μƒμ„±ν•œ κ·Έλ¦Ό μ„€λͺ…"),
        gr.Audio(label="μƒμ„±λœ AI μŒμ•… (MusicGen)")
    ],
    title="🎨 AI κ·Έλ¦Ό-μŒμ•… 생성기 (Inference API 버전)",
    description="이미지λ₯Ό μ—…λ‘œλ“œν•˜λ©΄ BLIP-baseκ°€ μ„€λͺ…을 μƒμ„±ν•˜κ³ , "
                "ν•΄λ‹Ή μ„€λͺ…μœΌλ‘œ MusicGen-small이 10초 μŒμ•…μ„ λ§Œλ“­λ‹ˆλ‹€.",
)


if __name__ == "__main__":
    # Queueing is configured on the Blocks object, not in the Interface
    # constructor: `queue=True` is not an Interface.__init__ parameter in
    # Gradio 4.x and raises TypeError there (same migration that removed
    # `concurrency_count`).
    demo.queue()
    demo.launch()