# msVision_3 / app.py
# seawolf2357's picture
# Update app.py
# 0048511 verified
# raw
# history blame
# 2.43 kB
import gradio as gr
from transformers import pipeline
from gradio_client import Client
# Load the image-classification pipeline once, at module import, so every
# request reuses the same model instance.
image_model = pipeline(
    task="image-classification",
    model="google/vit-base-patch16-224",
)
def generate_music(prompt):
    """Generate audio for *prompt* via the AudioLDM 48k Space's ``/text2audio`` API.

    Args:
        prompt: Text description of the audio to generate.

    Returns:
        Whatever ``Client.predict`` returns for the ``/text2audio`` endpoint.
        NOTE(review): the exact shape (file path, dict, ...) depends on the
        remote Space's output component -- confirm it is something the
        caller's ``gr.Audio`` output accepts.
    """
    client = Client("https://haoheliu-audioldm-48k-text-to-hifiaudio-generation.hf.space/")
    result = client.predict(
        prompt=prompt,  # 'Input your text here' textbox; previously hard-coded to "Howdy!"
        duration=5,  # 'Duration (seconds)' slider, range 5 ~ 15
        guidance_scale=0,  # 'Guidance scale' slider, range 0 ~ 6
        seed=5,  # 'Seed' number component
        num_waveforms=1,  # 'Number waveforms to generate' slider, range 1 ~ 3
        api_name="/text2audio",  # API endpoint path
    )
    # Return the result instead of only printing it, so the caller receives
    # the generated audio rather than None.
    return result
def generate_voice(prompt):
    """Generate audio for *prompt* by calling the Tango Space's ``/predict`` API.

    Args:
        prompt: Text prompt; the caller builds it from the image
            classification label.

    Returns:
        The raw value returned by ``Client.predict``; no post-processing
        (such as extracting an audio file URL or data) is done here.
    """
    tango = Client("https://declare-lab-tango.hf.space/")
    steps = 100
    guidance_scale = 1
    return tango.predict(
        prompt,
        steps,
        guidance_scale,
        api_name="/predict",  # API endpoint path
    )
def classify_and_generate_voice(uploaded_image):
    """Classify an image, then generate voice and music from the top label.

    Args:
        uploaded_image: Image passed straight to ``image_model``.

    Returns:
        A 3-tuple ``(label, voice_result, music_result)``: the label of the
        first prediction, followed by the return values of
        ``generate_voice`` and ``generate_music``.
    """
    # The first prediction is taken as the most probable class.
    best_label = image_model(uploaded_image)[0]['label']

    # Build both prompts from the label.
    voice_prompt = "this is " + best_label
    music_prompt = "The rnb beat of 85BPM drums." + best_label + "."

    # Voice first, then music -- same call order as before.
    voice_result = generate_voice(voice_prompt)
    music_result = generate_music(music_prompt)

    # Passed through unchanged to the Gradio outputs.
    return best_label, voice_result, music_result
# Build the Gradio interface: one uploaded image in; a label and two audio
# outputs (voice, music) out.
iface = gr.Interface(
    fn=classify_and_generate_voice,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Label(), gr.Audio(), gr.Audio()],
    title="msVision_3",
    # User-facing Korean text, restored from mis-decoded (mojibake) bytes.
    # It reads: "Upload an image and the app recognizes the object and
    # generates the corresponding voice and music."
    description="이미지를 업로드하면, 사물을 인식하고 해당하는 음성 및 음악을 생성합니다.(recognizes object and generate Voice&Music)",
    examples=["dog.jpg", "cafe.jpg", "seoul.png"],
)

# Start the interface.
iface.launch()