Spaces:

yongyeol
/

imagetoaudio

Runtime error

App Files

xet

Community

yongyeol committed on Jul 8

Commit

343dde8

verified ·

1 Parent(s): e3eaf60

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -23

app.py CHANGED Viewed

@@ -1,42 +1,62 @@
-import os, tempfile, requests
 import gradio as gr
 from PIL import Image
 from transformers import pipeline
-# ────────────────────── 1. 캡셔닝 파이프라인 ──────────────────────
 caption_pipe = pipeline(
     "image-to-text",
-    model="Salesforce/blip-image-captioning-base",  # tiny 모델로 바꾸려면 여기만 수정
-    device=-1,          # -1 → CPU, 0 이상 → GPU ID (Spaces CPU라면 -1 유지)
 )
-# ────────────────────── 2. MusicGen(Inf-API) ─────────────────────
-HF_TOKEN = os.getenv("HF_TOKEN")
-HEADERS  = {"Authorization": f"Bearer {HF_TOKEN}"}
-MUSIC_API = "https://api-inference.huggingface.co/models/facebook/musicgen-small"
-def generate_music(prompt: str, duration=10) -> str:
-    payload = {"inputs": prompt, "parameters": {"duration": duration}}
-    r = requests.post(MUSIC_API, headers=HEADERS, json=payload, timeout=120)
-    r.raise_for_status()
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    tmp.write(r.content)
-    tmp.close()
     return tmp.name
-# ────────────────────── 3. 전체 파이프라인 ──────────────────────
-def process(image: Image.Image):
-    caption = caption_pipe(image)[0]["generated_text"]
-    music   = generate_music(f"A cheerful melody inspired by: {caption}")
-    return caption, music
-# ────────────────────── 4. Gradio UI ────────────────────────────
 demo = gr.Interface(
     fn=process,
     inputs=gr.Image(type="pil"),
-    outputs=[gr.Text(), gr.Audio()],
-    title="🎨 로컬 BLIP-base + MusicGen-API",
-    description="CPU에서 BLIP-base로 캡션을 생성하고, 해당 캡션을 MusicGen-small Inference API로 전달해 10초 음악을 만듭니다."
 ).queue()
 if __name__ == "__main__":

+import os, tempfile, soundfile as sf
 import gradio as gr
 from PIL import Image
 from transformers import pipeline
+# ────────────────────────────────────────────────
+# 1. 파이프라인 로드 (CPU: device=-1)
+# ────────────────────────────────────────────────
+CAPTION_ID = "Salesforce/blip-image-captioning-base"   # 용량↓: blip-image-captioning
+MUSIC_ID   = "facebook/musicgen-melody"                # 용량↓: musicgen-small
 caption_pipe = pipeline(
     "image-to-text",
+    model=CAPTION_ID,
+    device=-1
+)
+music_pipe = pipeline(
+    "text-to-audio",
+    model=MUSIC_ID,
+    device=-1,
+    generate_kwargs={"duration": 10}   # 초 단위
 )
+# ────────────────────────────────────────────────
+# 2. 유틸 함수
+# ────────────────────────────────────────────────
+def generate_caption(img: Image.Image) -> str:
+    return caption_pipe(img)[0]["generated_text"]
+def generate_music(prompt: str) -> str:
+    result = music_pipe(prompt, forward_params={"do_sample": True})[0]
+    audio, sr = result["audio"], result["sampling_rate"]
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    sf.write(tmp.name, audio, sr)
     return tmp.name
+# ────────────────────────────────────────────────
+# 3. 전체 파이프라인
+# ────────────────────────────────────────────────
+def process(image):
+    caption = generate_caption(image)
+    audio   = generate_music(f"A cheerful melody inspired by: {caption}")
+    return caption, audio
+# ────────────────────────────────────────────────
+# 4. Gradio UI
+# ────────────────────────────────────────────────
 demo = gr.Interface(
     fn=process,
     inputs=gr.Image(type="pil"),
+    outputs=[
+        gr.Text(label="AI가 생성한 그림 설명"),
+        gr.Audio(label="생성된 AI 음악 (MusicGen)")
+    ],
+    title="🎨 로컬 BLIP-base + MusicGen-melody",
+    description="이미지를 업로드하면 BLIP-base가 설명을 생성하고, "
+                "그 설명으로 MusicGen-melody가 10초 음악을 만듭니다."
 ).queue()
 if __name__ == "__main__":