Spaces:

yongyeol
/

imagetoaudio

Runtime error

File size: 3,438 Bytes

6748e07
 
78ea8dc
bfbdf81
855f2cd
 
 
 
6748e07
855f2cd
78ea8dc
6748e07
 
 
87e6f23
855f2cd
 
 
6748e07
 
 
855f2cd
 
 
2a55caa
855f2cd
 
 
6748e07
 
855f2cd
 
6748e07
855f2cd
6748e07
 
20017db
855f2cd
6748e07
855f2cd
6748e07
4b414b1
6748e07
 
4b414b1
855f2cd
 
 
4b414b1
 
 
8e74b09
 
 
 
855f2cd
 
 
 
2b108d4
8e74b09

import os, io, base64, tempfile, requests
import gradio as gr
from PIL import Image

# ───────────────────────────────────────────────
# 1. HF Inference API 준비
# ───────────────────────────────────────────────
# Access token for the Hugging Face Inference API, read from the environment
# (on Spaces: Settings → Secrets).
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None or HF_TOKEN == "":
    # Stop at import time; HEADERS below cannot be built without a token.
    raise RuntimeError("HF_TOKEN 비밀값이 없습니다. Settings → Secrets에 등록하세요.")

# Bearer-token header sent with every request to the endpoints below.
HEADERS = {"Authorization": "Bearer " + HF_TOKEN}

# Hosted model endpoints: image captioning and text-to-music.
CAPTION_API = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
MUSIC_API = "https://api-inference.huggingface.co/models/facebook/musicgen-small"

# ───────────────────────────────────────────────
# 2. 캡션 생성 함수
# ───────────────────────────────────────────────
def generate_caption(image_pil: Image.Image) -> str:
    """Return a caption for *image_pil* from the hosted captioning model.

    The image is PNG-encoded in memory and POSTed as the raw request body to
    ``CAPTION_API`` using the module-level ``HEADERS``.

    Args:
        image_pil: Image to describe.

    Returns:
        The ``generated_text`` field of the first element of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        RuntimeError: If the JSON body does not have the expected
            ``[{"generated_text": ...}]`` shape.
    """
    # Pillow's PNG writer only accepts these modes; anything else (CMYK,
    # YCbCr, ...) makes ``save`` fail, so normalise such images to RGB first.
    png_modes = ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA")
    if image_pil.mode not in png_modes:
        image_pil = image_pil.convert("RGB")

    buf = io.BytesIO()
    image_pil.save(buf, format="PNG")
    resp = requests.post(CAPTION_API, headers=HEADERS, data=buf.getvalue(), timeout=60)
    resp.raise_for_status()

    result = resp.json()
    try:
        return result[0]["generated_text"]
    except (KeyError, IndexError, TypeError) as err:
        # Surface the actual payload instead of a bare indexing error.
        raise RuntimeError(f"Unexpected caption API response: {result!r}") from err

# ───────────────────────────────────────────────
# 3. 음악 생성 함수
# ───────────────────────────────────────────────
def generate_music(prompt: str, duration: int = 10) -> str:
    """Generate audio for *prompt* with the hosted music model and save it.

    Args:
        prompt: Text sent as the request's ``inputs``.
        duration: Value sent as ``parameters.duration`` (presumably seconds;
            confirm against the endpoint's documentation).

    Returns:
        Path of a temporary file containing the response body. The file is
        created with ``delete=False``, so it is not removed automatically.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        RuntimeError: If the response is empty or JSON rather than audio.
    """
    payload = {"inputs": prompt, "parameters": {"duration": duration}}
    resp = requests.post(MUSIC_API, headers=HEADERS, json=payload, timeout=120)
    resp.raise_for_status()

    # A JSON or empty body is not audio; fail loudly instead of handing the
    # caller a broken ".wav" file.
    content_type = resp.headers.get("Content-Type", "")
    if not resp.content or content_type.startswith("application/json"):
        raise RuntimeError(f"Music API did not return audio: {resp.text[:200]!r}")

    # NOTE(review): the suffix is always ".wav" regardless of the encoding the
    # endpoint actually returns — confirm the real audio format.
    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(resp.content)
    return tmp.name

# ───────────────────────────────────────────────
# 4. 전체 파이프라인
# ───────────────────────────────────────────────
def process(image):
    """Run the image -> caption -> music pipeline used by the Gradio UI.

    Args:
        image: The value of the ``gr.Image(type="pil")`` input; ``None`` when
            the user submits without providing an image.

    Returns:
        A ``(caption, audio_path)`` tuple: the generated caption and the path
        returned by ``generate_music`` for a prompt built from that caption.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Show a readable message in the UI ("Please upload an image first.")
        # instead of an AttributeError from deep inside generate_caption.
        raise gr.Error("이미지를 먼저 업로드하세요.")

    caption = generate_caption(image)
    audio = generate_music(f"A cheerful melody inspired by: {caption}")
    return caption, audio

# ───────────────────────────────────────────────
# 5. Gradio UI
# ───────────────────────────────────────────────
# Input widget: hands the uploaded picture to `process` as a PIL image.
image_input = gr.Image(type="pil")

# Output widgets, in the order `process` returns its values:
# the generated picture description, then the generated music.
caption_output = gr.Text(label="AI가 생성한 그림 설명")
audio_output = gr.Audio(label="생성된 AI 음악 (MusicGen)")

demo = gr.Interface(
    fn=process,
    inputs=image_input,
    outputs=[caption_output, audio_output],
    title="🎨 AI 그림-음악 생성기 (Inference API)",
    description=(
        "이미지를 업로드하면 BLIP-base가 설명을 만들고, "
        "해당 설명으로 MusicGen-small이 10초 음악을 생성합니다."
    ),
)
# Enable request queuing; `demo` is bound to the value queue() returns,
# exactly as when the call is chained onto the constructor.
demo = demo.queue()

if __name__ == "__main__":
    # Start the web UI only when this file is executed as a script.
    demo.launch()