# imagetoaudio / app.py
# yongyeol's picture
# Update app.py
# 855f2cd verified
# raw
# history blame
# 3.44 kB
import os, io, base64, tempfile, requests
import gradio as gr
from PIL import Image
# ───────────────────────────────────────────────
# 1. HF Inference API setup
# ───────────────────────────────────────────────
# Access token for the Hugging Face Inference API, read from the environment
# (on Spaces: Settings -> Secrets). The module refuses to load without it.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None or HF_TOKEN == "":
    raise RuntimeError("HF_TOKEN 비밀값이 μ—†μŠ΅λ‹ˆλ‹€. Settings β†’ Secrets에 λ“±λ‘ν•˜μ„Έμš”.")

# Authorization header sent with every Inference API request.
HEADERS = {"Authorization": "Bearer " + HF_TOKEN}

# Hosted model endpoints: image captioning and text-to-music generation.
CAPTION_API = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
MUSIC_API = "https://api-inference.huggingface.co/models/facebook/musicgen-small"
# ───────────────────────────────────────────────
# 2. Caption generation function
# ───────────────────────────────────────────────
def generate_caption(image_pil: Image.Image) -> str:
    """Return a caption for *image_pil* produced by the captioning endpoint.

    The image is PNG-encoded in memory and posted as the raw request body to
    ``CAPTION_API``. The caption is the ``generated_text`` field of the first
    element of the JSON response.

    Args:
        image_pil: The PIL image to describe.

    Returns:
        The generated caption text.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    # PNG can only store a fixed set of modes; Pillow raises OSError for
    # others (e.g. CMYK, YCbCr, F). Convert those to RGB before encoding.
    png_modes = ("1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB", "RGBA")
    if image_pil.mode not in png_modes:
        image_pil = image_pil.convert("RGB")

    with io.BytesIO() as buf:
        image_pil.save(buf, format="PNG")
        png_bytes = buf.getvalue()

    resp = requests.post(CAPTION_API, headers=HEADERS, data=png_bytes, timeout=60)
    resp.raise_for_status()
    return resp.json()[0]["generated_text"]
# ───────────────────────────────────────────────
# 3. Music generation function
# ───────────────────────────────────────────────
def generate_music(prompt: str, duration: int = 10) -> str:
    """Generate audio for *prompt* and return the path of the saved file.

    Posts a JSON payload to ``MUSIC_API`` and writes the raw response body to
    a temporary file. The file is created with ``delete=False``, so it stays
    on disk after this function returns; removing it is up to the caller.

    Args:
        prompt: Text description sent as the ``inputs`` field.
        duration: Sent as ``parameters.duration`` (presumably seconds --
            confirm against the endpoint's documentation).

    Returns:
        Filesystem path of the temporary ``.wav`` file.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    payload = {"inputs": prompt, "parameters": {"duration": duration}}
    resp = requests.post(MUSIC_API, headers=HEADERS, json=payload, timeout=120)
    resp.raise_for_status()

    # NOTE(review): the bytes are stored verbatim under a ".wav" suffix;
    # confirm the endpoint really returns WAV data.
    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(resp.content)
    return tmp.name
# ───────────────────────────────────────────────
# 4. Full pipeline
# ───────────────────────────────────────────────
def process(image):
    """Run the image -> caption -> music pipeline for the Gradio UI.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(caption, audio)`` tuple: the caption text and the path of the
        generated audio file.

    Raises:
        gr.Error: If no image was provided.
    """
    # Submitting without an upload passes None; report that to the user
    # instead of failing with an AttributeError deep in the caption step.
    if image is None:
        raise gr.Error("Please upload an image first.")

    caption = generate_caption(image)
    audio = generate_music(f"A cheerful melody inspired by: {caption}")
    return caption, audio
# ───────────────────────────────────────────────
# 5. Gradio user interface
# ───────────────────────────────────────────────
# Gradio interface: one PIL image in, caption text and generated audio out.
demo = gr.Interface(
    title="🎨 AI κ·Έλ¦Ό-μŒμ•… 생성기 (Inference API)",
    description=(
        "이미지λ₯Ό μ—…λ‘œλ“œν•˜λ©΄ BLIP-baseκ°€ μ„€λͺ…을 λ§Œλ“€κ³ , "
        "ν•΄λ‹Ή μ„€λͺ…μœΌλ‘œ MusicGen-small이 10초 μŒμ•…μ„ μƒμ„±ν•©λ‹ˆλ‹€."
    ),
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label="AIκ°€ μƒμ„±ν•œ κ·Έλ¦Ό μ„€λͺ…"),
        gr.Audio(label="μƒμ„±λœ AI μŒμ•… (MusicGen)"),
    ],
)
# Enable request queuing; as in the chained form, the value returned by
# queue() is what gets launched.
demo = demo.queue()

if __name__ == "__main__":
    demo.launch()