Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,42 +4,57 @@ from PIL import Image
|
|
4 |
import torch
|
5 |
import requests
|
6 |
import os
|
|
|
7 |
|
8 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
|
10 |
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
|
11 |
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
|
12 |
|
13 |
-
#
|
14 |
-
CHAT_TTS_API = "https://api-inference.huggingface.co/models/2Noise/ChatTTS"
|
15 |
-
headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
|
16 |
-
|
17 |
def generate_caption(image):
|
18 |
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
|
19 |
-
output_ids = caption_model.generate(pixel_values, max_length=50)
|
20 |
caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
21 |
return caption
|
22 |
|
23 |
-
|
24 |
-
def
|
25 |
-
payload = {"inputs":
|
26 |
-
response = requests.post(
|
27 |
if response.status_code == 200:
|
28 |
return response.content
|
29 |
else:
|
30 |
-
raise Exception(f"
|
31 |
-
|
32 |
|
|
|
33 |
def process(image):
|
34 |
caption = generate_caption(image)
|
35 |
-
|
36 |
-
|
|
|
37 |
|
|
|
38 |
demo = gr.Interface(
|
39 |
fn=process,
|
40 |
inputs=gr.Image(type="pil"),
|
41 |
-
outputs=[
|
42 |
-
|
|
|
|
|
|
|
|
|
43 |
)
|
44 |
|
45 |
-
|
|
|
|
4 |
import torch
|
5 |
import requests
|
6 |
import os
|
7 |
+
from dotenv import load_dotenv
|
8 |
|
9 |
+
# ───── Load environment variables (read the token safely) ─────
|
10 |
+
# load_dotenv() is expected to copy entries from a local .env file (if any)
# into the process environment before the token is read below.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
# Fail fast at import time. An empty value is rejected as well as a missing
# one, because it would only produce a useless "Bearer " header later on.
if not HF_TOKEN:
    # Message (Korean): "HF_TOKEN is not present in the .env file."
    raise ValueError("HF_TOKEN이 .env 파일에 없습니다.")

# ───── Hugging Face Inference API settings ─────
# Authorization header sent with every Inference API request.
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
# Endpoint that generate_music() posts prompts to.
MUSICGEN_API = "https://api-inference.huggingface.co/models/facebook/musicgen-small"
|
18 |
+
|
19 |
+
# ───── Load the image-captioning model ─────
|
20 |
# All three captioning components are loaded from the same checkpoint name,
# so it is spelled once here and reused.
_CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

caption_model = VisionEncoderDecoderModel.from_pretrained(_CAPTION_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(_CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CAPTION_CHECKPOINT)
|
23 |
|
24 |
+
# ───── Image → descriptive sentence generation ─────
|
|
|
|
|
|
|
25 |
def generate_caption(image):
    """Return a text caption for ``image`` using the module-level caption model.

    Args:
        image: The picture to describe; it is handed unchanged to
            ``feature_extractor`` (the Gradio input below is configured with
            ``type="pil"``).

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    # Preprocess into PyTorch tensors ("pt") and keep only the pixel tensor.
    encoded = feature_extractor(images=image, return_tensors="pt")
    # Generation is capped at 50 tokens.
    generated = caption_model.generate(encoded.pixel_values, max_length=50)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
|
30 |
|
31 |
+
# ───── Description → music generation (MusicGen API call) ─────
|
32 |
+
def generate_music(prompt, timeout=120):
    """Request generated audio for ``prompt`` from the MusicGen endpoint.

    Args:
        prompt: Text sent as the ``inputs`` field of the JSON payload.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without one, ``requests`` waits indefinitely on a stalled
            connection, which would hang the calling worker.

    Returns:
        The raw response body (``response.content``) when the endpoint
        answers with HTTP 200.

    Raises:
        Exception: If the endpoint answers with any status other than 200.
            The message carries the status code and response text.
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failures or when ``timeout`` elapses.
    """
    response = requests.post(
        MUSICGEN_API,
        headers=headers,
        json={"inputs": prompt},
        timeout=timeout,
    )
    if response.status_code != 200:
        # Message prefix (Korean): "MusicGen error".
        raise Exception(f"MusicGen 오류: {response.status_code}, {response.text}")
    return response.content
|
|
|
39 |
|
40 |
+
# ───── Wire the full pipeline together ─────
|
41 |
def _audio_suffix(data):
    """Pick a file extension from the leading signature bytes of ``data``."""
    if data[:4] == b"fLaC":
        return ".flac"
    if data[:4] == b"RIFF":
        return ".wav"
    if data[:4] == b"OggS":
        return ".ogg"
    if data[:3] == b"ID3":
        return ".mp3"
    # Unknown signature: keep the ".wav" label the original code used.
    return ".wav"


def process(image):
    """Caption ``image``, then generate music from that caption.

    Args:
        image: The uploaded picture, forwarded to ``generate_caption``.

    Returns:
        A ``(caption, audio_path)`` pair matching the two Gradio outputs:
        the caption text and the path of a temporary file holding the
        audio bytes returned by ``generate_music``.
    """
    import tempfile

    caption = generate_caption(image)
    prompt = f"A cheerful melody inspired by: {caption}"
    audio = generate_music(prompt)

    # gr.Audio does not accept a (bytes, filename) tuple; a tuple is read as
    # (sample_rate, samples). Writing the bytes to a file and returning its
    # path is a supported form. delete=False keeps the file for Gradio to
    # read after this function returns, so files accumulate in the temp dir.
    with tempfile.NamedTemporaryFile(
        prefix="musicgen_output_",
        suffix=_audio_suffix(audio),
        delete=False,
    ) as handle:
        handle.write(audio)
        audio_path = handle.name

    return caption, audio_path
|
46 |
|
47 |
+
# ───── Gradio interface configuration ─────
|
48 |
# Gradio UI: one image input, with the caption text and the generated audio
# as outputs. The user-facing strings are Korean; English glosses are given
# in the comments beside them.
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        # "Picture description generated by the AI"
        gr.Text(label="AI가 생성한 그림 설명"),
        # "Generated AI music (MusicGen)"
        gr.Audio(label="생성된 AI 음악 (MusicGen)"),
    ],
    # "AI picture-to-music generator"
    title="🎨 AI 그림 음악 생성기",
    # "Upload a picture and the AI writes a description, then composes and
    # plays music based on that description."
    description="그림을 업로드하면 AI가 설명을 만들고, 설명을 바탕으로 음악을 만들어 들려줍니다.",
)

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|