File size: 2,443 Bytes
2bd9468
 
7047a68
dedab71
9902a40
8770d52
2bd9468
ba5f8a7
0f49e19
3bf7aef
 
 
0f49e19
3bf7aef
 
 
 
 
 
 
0f49e19
3bf7aef
 
0f49e19
3b24c11
46ad89d
3377e03
a6d7b81
1edfb40
a6d7b81
 
 
 
 
 
 
 
 
87c119f
3377e03
8770d52
 
a6d7b81
3377e03
59ff24b
ebcd803
59ff24b
ebcd803
a6d7b81
7d07c61
ad7babb
a6d7b81
23708c8
3377e03
23708c8
7d07c61
872e164
5174dc4
0ad2ed2
23708c8
2bd9468
ad7babb
3382a71
a6d7b81
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
from transformers import pipeline
from gradio_client import Client 

# Load the image-classification pipeline (ViT base, patch 16, 224px input).
# NOTE(review): downloads model weights on first run — requires network access.
image_model = pipeline("image-classification", model="google/vit-base-patch16-224")

def generate_music(prompt):
    """Generate an audio clip from a text prompt via the AudioLDM HF Space.

    Args:
        prompt: Free-text description of the audio to generate.

    Returns:
        The raw prediction result from the remote Space — presumably a path
        or URL to the generated audio; verify against the Space's API.
    """
    client = Client("https://haoheliu-audioldm-48k-text-to-hifiaudio-generation.hf.space/")
    # NOTE(review): keyword names below are assumed from the Space's docs;
    # confirm with client.view_api() if the remote interface changes.
    result = client.predict(
        prompt=prompt,
        duration=5,          # seconds of audio to generate
        guidance_scale=5.5,  # classifier-free guidance strength
        seed=5,              # fixed seed for reproducible output
        num_waveforms=3,     # candidate waveforms generated per prompt
    )
    return result

def generate_voice(prompt):
    """Synthesize speech/audio for *prompt* via the declare-lab Tango HF Space.

    Args:
        prompt: Text to hand to the Tango model (e.g. an image-class label).

    Returns:
        The raw prediction result from the remote Space — presumably audio
        data or a file reference; verify against the Space's API.
    """
    tango = Client("https://declare-lab-tango.hf.space/")
    steps = 100    # diffusion steps
    guidance = 1   # guidance scale
    # Positional argument order matches the Space's /predict endpoint.
    return tango.predict(prompt, steps, guidance, api_name="/predict")

def classify_and_generate_voice(uploaded_image):
    """Classify an image, then generate a matching voice clip and music clip.

    Args:
        uploaded_image: PIL image supplied by the Gradio input component.

    Returns:
        Tuple of (top label, voice result, music result) for the three
        Gradio output components.
    """
    # Top-1 label from the ViT classifier (pipeline returns results
    # ordered by score).
    label = image_model(uploaded_image)[0]['label']
    voice_result = generate_voice("this is " + label)
    music_result = generate_music("The rnb beat of 85BPM drums." + label + ".")
    return label, voice_result, music_result
    
# Build the Gradio interface: one image in; label, voice audio, and music
# audio out (matches the triple returned by classify_and_generate_voice).
iface = gr.Interface(
    fn=classify_and_generate_voice,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Label(), gr.Audio(), gr.Audio()],
    title="msVision_3",
    description="이미지λ₯Ό μ—…λ‘œλ“œν•˜λ©΄, 사물을 μΈμ‹ν•˜κ³  ν•΄λ‹Ήν•˜λŠ” μŒμ„± 및 μŒμ•…μ„ μƒμ„±ν•©λ‹ˆλ‹€.(recognizes object and generate Voice&Music)",
    # NOTE(review): assumes these example files sit next to the script — verify.
    examples=["dog.jpg","cafe.jpg","seoul.png"]
)


# Launch the local Gradio web server (blocking call).
iface.launch()