import base64
import io
import os
import wave

import gradio as gr
import numpy as np
import openai
import replicate
import requests
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from PIL import Image

# Load environment variables from the .env file; the replicate and openai
# clients pick up their API keys from the environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = OpenAI()

# Pipeline overview:
# 1 - Send the image to a vision-language model and describe, per category:
#       - Localised speech (a specific person in the image speaking)
#       - Non-localised speech (e.g. people talking in the background)
#       - Inanimate objects (e.g. a bell, other iconic sounds)
#       - Ambient sound (e.g. wind, water ripple, trees, traffic)
#       - Spatial dimension of the image
#       - Music
#
# 2 - Generate sounds from AudioLDM
#       (localised speech can use a separate, speech-specific model)
#
# 3 - Create the soundtrack (not all sounds at once)
#
# Notes:
# - Could use different system prompts depending on what type of sound is needed.
# - Could use AudioLDM for sound effects and a different model for music.
# - AudioLDM tip: start music prompts with "background music that sounds like".

CHECKBOX_INPUTS = ["Localised Speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "Music"]

def call_openai(image_data, prompt):
    # Send the image (as a base64 data URL) and a text prompt to GPT-4o; return the text reply.
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_data,
                            },
                        },
                    ],
                }
            ],
            max_tokens=100,
        )
        return response.choices[0].message.content
    except openai.BadRequestError as e:
        print(f"OpenAI BadRequestError: {e}")
        raise gr.Error(
            "Please retry with a different moodboard file "
            "(under 20 MB and in one of these formats: png, jpeg, gif, webp)."
        )
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise gr.Error("Unknown error, please try again.")

def img_to_base64(img):
    # Encode a PIL image as a JPEG data URL so it can be sent inline to the vision model.
    buffered = io.BytesIO()
    img.save(buffered, format="JPEG")
    img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return "data:image/jpeg;base64," + img_base_64

def vision_language_model(img):
    # Placeholder: the vision-language call currently lives in call_openai().
    return

def generate_prompt_from_description(checkbox_label, img):
    print(checkbox_label)
    if checkbox_label == CHECKBOX_INPUTS[0]:
        # The resulting sentence is spoken via https://replicate.com/afiaka87/tortoise-tts in main().
        prompt = "reply with a single sentence that the person in the image might say"
    elif checkbox_label == CHECKBOX_INPUTS[1]:
        prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
    elif checkbox_label == CHECKBOX_INPUTS[2]:
        prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
    elif checkbox_label == CHECKBOX_INPUTS[3]:
        prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
    elif checkbox_label == CHECKBOX_INPUTS[4]:
        prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
    else:
        return None

    # Possible refinement (see the sketch below): pass the description through
    # https://replicate.com/meta/llama-2-70b-chat with a system prompt such as:
    # "You are a talented prompt writer. You turn paragraphs into short 5-word prompts
    #  to generate a song. These go directly into systems, so there should be no other text."
    return call_openai(img, prompt)
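
# A minimal sketch of that llama-2 refinement step (illustrative only, not wired into
# the app; assumes the meta/llama-2-70b-chat model on Replicate, whose output arrives
# as a sequence of text chunks). The helper name is hypothetical.
def refine_music_prompt(description):
    chunks = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "prompt": description,
            "system_prompt": (
                "You are a talented prompt writer. You turn paragraphs into short "
                "5-word prompts to generate a song. These go directly into systems, "
                "so there should be no other text."
            ),
        },
    )
    return "".join(chunks).strip()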

def generate_music(prompt):
    # Placeholder: music generation currently happens inline in main() via Riffusion.
    return

def combine_music_clips(audio):
    # Placeholder: mixing the generated clips into one soundtrack is not implemented yet.
    return
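
# A possible sketch for combine_music_clips (illustrative only, not called from main();
# assumes the pydub package and ffmpeg are available). The helper name is hypothetical.
def overlay_audio_clips(audio_buffers):
    from pydub import AudioSegment  # assumed dependency, not imported at module level

    if not audio_buffers:
        raise ValueError("no audio clips to combine")
    # Layer every clip on top of the first one and return the mix as WAV bytes.
    segments = [AudioSegment.from_file(buf) for buf in audio_buffers]
    mixed = segments[0]
    for segment in segments[1:]:
        mixed = mixed.overlay(segment)
    out = io.BytesIO()
    mixed.export(out, format="wav")
    out.seek(0)
    return out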


def download_audio(url):
    # Fetch a generated audio file from the URL returned by Replicate.
    response = requests.get(url)
    response.raise_for_status()
    return io.BytesIO(response.content)

def generate_silent_audio(sample_rate=22050):
    # One second of 16-bit mono silence, wrapped in a WAV container so the
    # bytes form a valid audio file rather than headerless raw PCM.
    silent_bytes = io.BytesIO()
    with wave.open(silent_bytes, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 2 bytes = 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(np.zeros(sample_rate, dtype=np.int16).tobytes())
    silent_bytes.seek(0)
    return silent_bytes

def main(image, checkboxes):
    # Gradio passes the image as a numpy array; convert it to PIL, then to a base64 data URL.
    image = Image.fromarray(image.astype('uint8'))
    base_64_image = img_to_base64(image)

    generated_content = []

    for selection in checkboxes:
        prompt = generate_prompt_from_description(selection, base_64_image)
        if not prompt:
            continue
        
        if selection == CHECKBOX_INPUTS[0]:
            # Localised speech: speak the generated sentence with Tortoise TTS.
            output = replicate.run(
                "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
                input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"}
            )
        elif selection == CHECKBOX_INPUTS[4]:
            # Music: generate a clip with Riffusion; the result dict contains the audio URL.
            output = replicate.run(
                "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
                input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
            )
            output = output['audio']
        else:
            # Other categories: generate a 5-second sound effect with AudioLDM.
            output = replicate.run(
                "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
                input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5}
            )

        audio_file = download_audio(output)
        generated_content.append({"prompt": prompt, "audio": audio_file})

    print(generated_content)

    # Pad to 5 prompt/audio pairs so every Gradio output component receives a value.
    while len(generated_content) < 5:
        generated_content.append({"prompt": "", "audio": generate_silent_audio()})

    result_prompts = [item["prompt"] for item in generated_content]
    result_audios = [item["audio"].getvalue() for item in generated_content]

    # Interleave as (prompt, audio, prompt, audio, ...) to match the output component order.
    return tuple(value for pair in zip(result_prompts, result_audios) for value in pair)

demo = gr.Interface(
    fn=main,
    inputs=[
        "image",
        gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds"),
    ],
    outputs=["text", "audio"] * 5,
)
demo.launch(share=False)