# msVision_3 / app.py
# seawolf2357's picture
# Update app.py
# 3bf7aef verified
# raw
# history blame
# 2.44 kB
import gradio as gr
from transformers import pipeline
from gradio_client import Client
# Load the image recognition pipeline once at import time; it is reused by
# classify_and_generate_voice for every uploaded image.
image_model = pipeline("image-classification", model="google/vit-base-patch16-224")
def generate_music(prompt):
    """Request generated music for *prompt* from the remote AudioLDM Space.

    Args:
        prompt: Text description forwarded to the remote endpoint.

    Returns:
        Whatever ``Client.predict`` returns for the request, unmodified.
        The value is also printed to stdout.
    """
    # NOTE(review): the keyword names below are an assumption about what the
    # remote API expects (prompt, duration, guidance_scale, seed,
    # num_waveforms) -- confirm against the Space's API description.
    # No api_name is passed; the original author left "/text2audio"
    # commented out because it was unclear whether the endpoint expects it.
    request = {
        "prompt": prompt,
        "duration": 5,
        "guidance_scale": 5.5,
        "seed": 5,
        "num_waveforms": 3,
    }
    audioldm = Client("https://haoheliu-audioldm-48k-text-to-hifiaudio-generation.hf.space/")
    music = audioldm.predict(**request)

    # Echo the raw response so it shows up in the application log.
    print(music)
    return music
def generate_voice(prompt):
    """Generate voice audio for *prompt* through the Tango Space API.

    Args:
        prompt: Text prompt; the caller in this file builds it from the
            image classification result.

    Returns:
        The raw result of ``Client.predict``. No post-processing is applied
        here -- presumably the caller receives a reference to the generated
        audio (file path/URL or data); confirm against the Space's API.
    """
    steps = 100
    guidance_scale = 1
    endpoint = "/predict"  # API endpoint path

    tango = Client("https://declare-lab-tango.hf.space/")
    return tango.predict(prompt, steps, guidance_scale, api_name=endpoint)
def classify_and_generate_voice(uploaded_image):
    """Classify an uploaded image, then generate matching voice and music.

    Args:
        uploaded_image: The image handed over by the Gradio input component,
            or ``None`` when the user submits without providing an image.

    Returns:
        A 3-tuple ``(top_prediction, voice_result, music_result)``: the label
        of the first classifier prediction, followed by the values returned
        by ``generate_voice`` and ``generate_music`` for prompts built from
        that label.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Fail early with a readable message instead of passing None into the
    # classifier, which would surface as an opaque error in the UI.
    if uploaded_image is None:
        raise gr.Error("Please upload an image first.")

    # Image classification. The first entry is treated as the
    # highest-probability result.
    predictions = image_model(uploaded_image)
    top_prediction = predictions[0]['label']

    # Voice generation.
    voice_result = generate_voice("this is " + top_prediction)

    # Music generation.
    # NOTE(review): there is no space between "drums." and the label in this
    # prompt -- confirm whether that is intended before changing it.
    music_result = generate_music("The rnb beat of 85BPM drums." + top_prediction + ".")

    # The generation results are handed to the Gradio outputs unchanged.
    return top_prediction, voice_result, music_result
# Build the Gradio interface.
iface = gr.Interface(
    fn=classify_and_generate_voice,
    inputs=gr.Image(type="pil"),
    # One output component per element returned by
    # classify_and_generate_voice: the predicted label, then the voice
    # result and the music result.
    outputs=[gr.Label(), gr.Audio(), gr.Audio()],
    title="msVision_3",
    # User-facing text; the Korean part had been corrupted by a wrong
    # character-set decode and is restored here as proper UTF-8.
    description="이미지를 업로드하면, 사물을 인식하고 해당하는 음성 및 음악을 생성합니다.(recognizes object and generate Voice&Music)",
    examples=["dog.jpg", "cafe.jpg", "seoul.png"],
)

# Launch the interface.
iface.launch()