imagetoaudio / app.py
yongyeol's picture
Update app.py
2b108d4 verified
raw
history blame
4.37 kB
import base64
import io
import os
import tempfile

import gradio as gr
import requests
from PIL import Image

# ─────────────────────────────────────────────────────────────
# 1. Environment variables & HF Inference API configuration
# ─────────────────────────────────────────────────────────────
# The token must be registered as a secret in the Space settings;
# failing fast here gives a clearer error than a 401 on first request.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN λΉ„λ°€ 값이 μ„€μ •λ˜μ–΄ μžˆμ§€ μ•ŠμŠ΅λ‹ˆλ‹€. Spaces Settings β†’ Secretsμ—μ„œ 등둝해 μ£Όμ„Έμš”.")

HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
# Hosted Inference API endpoints: image captioning and music generation.
CAPTION_API = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
MUSIC_API = "https://api-inference.huggingface.co/models/facebook/musicgen-small"
# ─────────────────────────────────────────────────────────────
# 2. Image captioning (BLIP-base via Inference API)
# ─────────────────────────────────────────────────────────────
def generate_caption(image_pil: Image.Image) -> str:
    """Return an English caption for *image_pil* from the hosted BLIP-base model.

    The image is serialized to PNG and sent as the raw request body
    (binary-upload style expected by the image-to-text task).

    Raises:
        requests.HTTPError: if the API call fails (via raise_for_status).
        RuntimeError: if the API returns an error payload instead of
            the expected caption list (e.g. model still loading).
    """
    buf = io.BytesIO()
    image_pil.save(buf, format="PNG")
    response = requests.post(CAPTION_API, headers=HEADERS, data=buf.getvalue(), timeout=60)
    response.raise_for_status()
    result = response.json()
    # Success shape: [{"generated_text": "..."}]. A dict here means the
    # API answered with an error object rather than a caption list.
    if isinstance(result, dict):
        raise RuntimeError(result.get("error", "Unexpected response from caption API"))
    return result[0]["generated_text"]
# ─────────────────────────────────────────────────────────────
# 3. MusicGen-small music generation (default 10 s, via Inference API)
# ─────────────────────────────────────────────────────────────
def generate_music(prompt: str, duration: int = 10) -> str:
    """Generate a short music clip for *prompt* and return a WAV file path.

    Args:
        prompt: text description fed to MusicGen-small.
        duration: requested clip length in seconds (forwarded as an API
            parameter; presumably capped by the model — confirm against
            the endpoint docs).

    Returns:
        Path to a temporary ``.wav`` file holding the generated audio.

    Raises:
        requests.HTTPError: if the API call fails.
    """
    payload = {"inputs": prompt, "parameters": {"duration": duration}}
    response = requests.post(MUSIC_API, headers=HEADERS, json=payload, timeout=120)
    response.raise_for_status()
    # The API responds with raw WAV bytes; persist them (delete=False) so
    # Gradio can serve the file after this function returns. The context
    # manager guarantees the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(response.content)
        return tmp.name
# ─────────────────────────────────────────────────────────────
# 4. Full pipeline: image → caption → music
# ─────────────────────────────────────────────────────────────
def process(image):
    """Caption the uploaded image, then turn that caption into music.

    Returns a ``(caption, audio_path)`` tuple: the BLIP caption text and
    the path of the WAV file produced by MusicGen.
    """
    description = generate_caption(image)
    music_prompt = f"A cheerful melody inspired by: {description}"
    audio_path = generate_music(music_prompt)
    return description, audio_path
# ─────────────────────────────────────────────────────────────
# 5. Gradio interface
# ─────────────────────────────────────────────────────────────
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label="AIκ°€ μƒμ„±ν•œ κ·Έλ¦Ό μ„€λͺ…"),
        gr.Audio(label="μƒμ„±λœ AI μŒμ•… (MusicGen)"),
    ],
    title="🎨 AI κ·Έλ¦Ό-μŒμ•… 생성기 (Inference API 버전)",
    description="이미지λ₯Ό μ—…λ‘œλ“œν•˜λ©΄ BLIP-baseκ°€ μ„€λͺ…을 μƒμ„±ν•˜κ³ , "
                "ν•΄λ‹Ή μ„€λͺ…μœΌλ‘œ MusicGen-small이 10초 μŒμ•…μ„ λ§Œλ“­λ‹ˆλ‹€.",
)

if __name__ == "__main__":
    # Queueing is configured on the Blocks/Interface object, not via a
    # constructor keyword: gr.Interface() does not accept `queue=...`,
    # so passing it raises TypeError at startup.
    demo.queue().launch()