import os
import base64
import io
import wave

import numpy as np
import requests
import replicate
import gradio as gr
import openai
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
from PIL import Image
# Locate and load the .env file with API credentials
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# replicate.run reads REPLICATE_API_TOKEN from the environment on its own
REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = OpenAI(api_key=OPENAI_API_KEY)
# Pipeline plan:
# 1 - send the image to a vision-language model and describe, per category:
#     - Localised speech (a person in the image speaking)
#     - Non-localised speech, e.g. people talking in the background
#     - Inanimate objects, e.g. a bell, iconic sounds
#     - Ambient sound, e.g. wind, water ripple, trees, traffic
#     - Spatial dimension of the image
#     - Music
# 2 - generate sounds with audio-ldm
#     (localised speech can use a dedicated speech-specific model instead)
# 3 - create the soundtrack (not all sounds at once)
#     - could use different system prompts depending on what type of sound
#     - could use audio-ldm for sound effects and a different model for music
#     - audio-ldm tip: start music prompts with "background music that sounds like"
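# A minimal sketch of the audio-ldm tip above (the helper name is ours and it
# is not wired into main() yet): prefix music prompts so audio-ldm produces
# music rather than a literal sound effect.
def audioldm_music_prompt(prompt):
    return "background music that sounds like " + prompt
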
CHECKBOX_INPUTS = ["Localised Speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "music"]

def call_openai(image_data, prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_data,
                            },
                        },
                    ],
                }
            ],
            max_tokens=100,
        )
        return response.choices[0].message.content
    except openai.BadRequestError as e:
        print(f"BadRequestError: {e}")
        raise gr.Error("Please retry with a different moodboard file (below 20 MB in size and in one of the following formats: ['png', 'jpeg', 'gif', 'webp'])")
    except Exception as e:
        raise gr.Error(f"Unknown error: {e}")

def img_to_base64(img):
    # Encode a PIL image as a JPEG data URL for the OpenAI vision input
    buffered = io.BytesIO()
    img.save(buffered, format="JPEG")
    img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return "data:image/jpeg;base64," + img_base_64
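
# Illustrative usage of the two helpers above (the file name is hypothetical):
#   data_url = img_to_base64(Image.open("moodboard.jpg").convert("RGB"))
#   call_openai(data_url, "in 5 words or less, describe this image")
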
def vision_language_model(img):
    # Placeholder: image understanding currently happens inline via call_openai
    return

def generate_prompt_from_description(checkbox_label, img):
    print(checkbox_label)
    if checkbox_label == CHECKBOX_INPUTS[0]:
        # Spoken line, later synthesised with https://replicate.com/afiaka87/tortoise-tts
        prompt = "reply with a single sentence that the person in the image might say"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[1]:
        prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[2]:
        prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[3]:
        prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[4]:
        prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
        return call_openai(img, prompt)
    # Alternative: rewrite descriptions with https://replicate.com/meta/llama-2-70b-chat
    # using a system prompt like: "You are a talented prompt writer. You turn paragraphs
    # into short 5-word prompts to generate a song. These go directly into systems, so
    # there should be no other text." (see the sketch after this function)
    return
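
# A minimal sketch of the llama-2 idea above. Untested and not wired into
# main(); assumes Replicate's public meta/llama-2-70b-chat model, which
# streams back text chunks that we join into a single string.
def rewrite_as_music_prompt(description):
    output = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "system_prompt": (
                "You are a talented prompt writer. You turn paragraphs into short "
                "5-word prompts to generate a song. These go directly into systems, "
                "so there should be no other text."
            ),
            "prompt": description,
            "max_new_tokens": 20,
        },
    )
    return "".join(output).strip()
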
def generate_music(prompt):
    # Placeholder: music generation currently happens inline in main()
    return

def combine_music_clips(audio):
    # Placeholder: mixing the clips into a single soundtrack is future work
    return

def download_audio(url):
    # Fetch the generated audio file from the URL Replicate returns
    response = requests.get(url)
    response.raise_for_status()
    return io.BytesIO(response.content)

def generate_silent_audio():
    # One second of 16-bit mono silence at 22050 Hz, wrapped in a WAV
    # container so players recognise it (raw PCM bytes would not play)
    silent_bytes = io.BytesIO()
    with wave.open(silent_bytes, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(22050)
        wav_file.writeframes(np.zeros(22050, dtype=np.int16).tobytes())
    silent_bytes.seek(0)
    return silent_bytes

def main(image, checkboxes):
    image = Image.fromarray(image.astype('uint8'))
    base_64_image = img_to_base64(image)
    generated_content = []
    for selection in checkboxes:
        prompt = generate_prompt_from_description(selection, base_64_image)
        if not prompt:
            continue
        if selection == CHECKBOX_INPUTS[0]:
            # Localised speech: synthesise the spoken line with tortoise-tts
            output = replicate.run(
                "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
                input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"}
            )
        elif selection == CHECKBOX_INPUTS[4]:
            # Music: riffusion returns a dict whose 'audio' key holds the file URL
            output = replicate.run(
                "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
                input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
            )
            output = output['audio']
        else:
            # Everything else (background, inanimate, ambient): audio-ldm effects
            output = replicate.run(
                "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
                input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5}
            )
        audio_file = download_audio(output)
        generated_content.append({"prompt": prompt, "audio": audio_file})
    print(generated_content)
    # Pad to exactly 5 prompt/audio pairs so every Gradio output slot is filled
    while len(generated_content) < 5:
        generated_content.append({"prompt": "", "audio": generate_silent_audio()})
    result_prompts = [item["prompt"] for item in generated_content]
    result_audios = [item["audio"].getvalue() for item in generated_content]
    return (result_prompts[0], result_audios[0], result_prompts[1], result_audios[1],
            result_prompts[2], result_audios[2], result_prompts[3], result_audios[3],
            result_prompts[4], result_audios[4])

demo = gr.Interface(
    fn=main,
    inputs=["image", gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds")],
    outputs=["text", "audio", "text", "audio", "text", "audio", "text", "audio", "text", "audio"],
)
demo.launch(share=False)
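
# demo.launch(share=False) serves the app locally only; Gradio prints the local
# URL (typically http://127.0.0.1:7860). On Hugging Face Spaces the platform
# hosts the app regardless of the share flag.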