import os
import base64
import numpy as np
from PIL import Image, ImageChops, ImageDraw
import io
import wave
import requests
import replicate
import gradio as gr
import openai
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
# Locate the .env file
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
# replicate.run() reads REPLICATE_API_TOKEN from the environment
REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=OPENAI_API_KEY)
# 1 - send the image to a vision-language model and describe possible sounds:
#     - Localised speech
#     - Non-localised speech, e.g. people talking in the background
#     - Inanimate objects, e.g. a bell, iconic sounds
#     - Ambient sound, e.g. wind, water ripple, trees, traffic
#     - Spatial dimension of the image
#     - Music
# 2 - generate sounds from audio-ldm
#     - localised speech can use a separate speech-specific model
# 3 - create a soundtrack (not all sounds at once)
#     - could use different system prompts depending on what type of sound
#     - could use audio-ldm for sound effects and a different model for music
#     - audio-ldm: start music prompts with "background music that sounds like"
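# Sketch of the prompt-prefix tip above: audio-ldm music prompts tend to work better
# when prefixed with "background music that sounds like". This helper is illustrative
# only and is not wired into main() below (which uses riffusion for music instead).
def audioldm_music_prompt(description):
    return "background music that sounds like " + description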
CHECKBOX_INPUTS = ["Localised Speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "music"]
def call_openai(image_data, prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_data,
                            },
                        },
                    ],
                }
            ],
            max_tokens=100,
        )
        return response.choices[0].message.content
    except openai.BadRequestError as e:
        print(type(e), e)
        raise gr.Error("Please retry with a different moodboard file (below 20 MB in size and one of the following formats: ['png', 'jpeg', 'gif', 'webp'])")
    except Exception:
        raise gr.Error("Unknown error")
def img_to_base64(img):
    buffered = io.BytesIO()
    img.save(buffered, format="JPEG")
    img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return "data:image/jpeg;base64," + img_base_64
def vision_language_model(img):
    # Placeholder for step 1 (image -> sound descriptions); a sketch implementation follows,
    # and generate_prompt_from_description below plays this role per sound category.
    return
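# One possible implementation of the step-1 placeholder above (a sketch): reuse
# call_openai to get a free-form description of the sounds a scene might contain.
# The prompt text here is an assumption, not part of the original app; `img` is
# expected to be a data URL as produced by img_to_base64.
def vision_language_model_sketch(img):
    prompt = "List the sounds that might be heard in this image."
    return call_openai(img, prompt)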
def generate_prompt_from_description(checkbox_label, img):
    print(checkbox_label)
    if checkbox_label == CHECKBOX_INPUTS[0]:
        # speech is synthesised with https://replicate.com/afiaka87/tortoise-tts
        prompt = "reply with a single sentence that the person in the image might say"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[1]:
        prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[2]:
        prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[3]:
        prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[4]:
        prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
        return call_openai(img, prompt)
    # alternative: use https://replicate.com/meta/llama-2-70b-chat as a prompt writer, e.g.
    # "You are a talented prompt writer. You turn paragraphs into short 5-word prompts to
    # generate a song. These go directly into systems, so there should be no other text."
    return
def generate_music(prompt):
    # Placeholder for step 2 (prompt -> audio); a sketch implementation follows.
    return
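# Possible implementation of the placeholder above (a sketch): wrap the same riffusion
# model that main() calls for the music checkbox. The parameters mirror that call;
# treat them as an assumption rather than a tested configuration.
def generate_music_sketch(prompt):
    output = replicate.run(
        "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
        input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
    )
    return output['audio']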
def combine_music_clips(audio):
    # Placeholder for step 3 (mix the individual clips into one soundtrack);
    # a sketch implementation follows.
    return
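# Possible implementation of the placeholder above (a sketch): overlay the generated
# clips into a single soundtrack. Assumes pydub (and ffmpeg for non-WAV inputs) is
# available, which the original app does not declare; `audio_files` is assumed to be
# a non-empty list of file-like objects such as those returned by download_audio.
def combine_music_clips_sketch(audio_files):
    from pydub import AudioSegment
    segments = [AudioSegment.from_file(f) for f in audio_files]
    mix = segments[0]
    for seg in segments[1:]:
        mix = mix.overlay(seg)
    out = io.BytesIO()
    mix.export(out, format="wav")
    out.seek(0)
    return out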
def download_audio(url):
    response = requests.get(url)
    response.raise_for_status()
    return io.BytesIO(response.content)
def generate_silent_audio():
    # One second of silence at 22.05 kHz, wrapped in a WAV container so the Gradio
    # audio output can play it (raw PCM bytes alone are not a valid audio file).
    silent_audio = np.zeros((22050,), dtype=np.int16)
    silent_bytes = io.BytesIO()
    with wave.open(silent_bytes, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(22050)
        wav_file.writeframes(silent_audio.tobytes())
    silent_bytes.seek(0)
    return silent_bytes
def main(image, checkboxes):
    image = Image.fromarray(image.astype('uint8'))
    base_64_image = img_to_base64(image)
    generated_content = []
    for selection in checkboxes:
        prompt = generate_prompt_from_description(selection, base_64_image)
        if not prompt:
            continue
        if selection == CHECKBOX_INPUTS[0]:
            output = replicate.run(
                "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
                input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"}
            )
        elif selection == CHECKBOX_INPUTS[4]:
            output = replicate.run(
                "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
                input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
            )
            output = output['audio']
        else:
            output = replicate.run(
                "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
                input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5}
            )
        audio_file = download_audio(output)
        generated_content.append({"prompt": prompt, "audio": audio_file})
    print(generated_content)
    # Ensure 5 pairs of prompt and audio so every interface output has a value
    while len(generated_content) < 5:
        generated_content.append({"prompt": "", "audio": generate_silent_audio()})
    result_prompts = [item["prompt"] for item in generated_content]
    result_audios = [item["audio"].getvalue() for item in generated_content]
    return result_prompts[0], result_audios[0], result_prompts[1], result_audios[1], result_prompts[2], result_audios[2], result_prompts[3], result_audios[3], result_prompts[4], result_audios[4]
demo = gr.Interface(
    fn=main,
    inputs=["image", gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds")],
    outputs=["text", "audio", "text", "audio", "text", "audio", "text", "audio", "text", "audio"],
)
demo.launch(share=False)