import os
import base64
import io

import numpy as np
import requests
import replicate
import gradio as gr
import openai
from openai import OpenAI
from PIL import Image
from dotenv import load_dotenv, find_dotenv

# Locate and load the .env file
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# The replicate client picks REPLICATE_API_TOKEN up from the environment automatically
REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = OpenAI(api_key=OPENAI_API_KEY)

# Pipeline outline:
# 1 - send the image to a vision-language model and describe, per category:
#     - Localised speech
#     - Non-localised speech, e.g. people in the background
#     - Inanimate objects, e.g. a bell, iconic sounds
#     - Ambient sound, e.g. wind, water ripple, trees, traffic
#     - Spatial dimension of the image
#     - Music
# 2 - generate sounds with AudioLDM
#     (localised speech can use a separate speech-specific model)
# 3 - create the soundtrack (not all sounds at once)
# Notes:
# - Could use different system prompts depending on what type of sound is requested
# - Could use AudioLDM for sound effects and a different model for music
# - AudioLDM: start music prompts with "background music that sounds like"

CHECKBOX_INPUTS = ["Localised Speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "music"]


def call_openai(image_data, prompt):
    """Send a base64-encoded image and a text prompt to GPT-4o and return its reply."""
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_data,
                            },
                        },
                    ],
                }
            ],
            max_tokens=100,
        )
        return response.choices[0].message.content
    except openai.BadRequestError as e:
        print(type(e), e)
        raise gr.Error("Please retry with a different moodboard file (below 20 MB in size and one of the following formats: ['png', 'jpeg', 'gif', 'webp'])")
    except Exception as e:
        raise gr.Error("Unknown error") from e


def img_to_base64(img):
    """Encode a PIL image as a base64 data URL suitable for the OpenAI vision API."""
    buffered = io.BytesIO()
    img.save(buffered, format="JPEG")
    img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return "data:image/jpeg;base64," + img_base_64


def vision_language_model(img):
    # Placeholder: not used yet; call_openai() currently plays this role
    return


def generate_prompt_from_description(checkbox_label, img):
    """Ask GPT-4o for a sound description tailored to the selected sound category."""
    print(checkbox_label)
    if checkbox_label == CHECKBOX_INPUTS[0]:
        # Localised speech: the sentence is later voiced with https://replicate.com/afiaka87/tortoise-tts
        prompt = "reply with a single sentence that the person in the image might say"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[1]:
        prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[2]:
        prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[3]:
        prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[4]:
        prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
        return call_openai(img, prompt)
        # Idea: post-process with https://replicate.com/meta/llama-2-70b-chat using the system prompt:
        # "You are a talented prompt writer. You turn paragraphs into short 5-word prompts to generate
        #  a song. These go directly into systems, so there should be no other text."
    return None

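
# The comments at the end of generate_prompt_from_description suggest an optional refinement
# step: feed the GPT-4o music description through Llama 2 on Replicate so the final music
# prompt stays short. A minimal sketch of that idea follows; the helper name
# condense_music_prompt and the exact input fields are assumptions, and the function is not
# wired into the pipeline yet.
def condense_music_prompt(description):
    """Sketch: turn a longer music description into a short 5-word music prompt via Llama 2."""
    output = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "prompt": description,
            "system_prompt": (
                "You are a talented prompt writer. You turn paragraphs into short 5-word "
                "prompts to generate a song. These go directly into systems, so there "
                "should be no other text."
            ),
            "max_new_tokens": 20,
        },
    )
    # Replicate streams the completion as a sequence of string chunks
    return "".join(output).strip()
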

def generate_music(prompt):
    # Placeholder: music generation is currently handled directly in main() via replicate.run
    return


def combine_music_clips(audio):
    # Placeholder: mixing the generated clips into a single soundtrack is not implemented yet
    return


def download_audio(url):
    """Download an audio file from a URL and return it as an in-memory buffer."""
    response = requests.get(url)
    response.raise_for_status()
    return io.BytesIO(response.content)


def generate_silent_audio():
    """Return one second of silence (22.05 kHz, 16-bit) used to pad empty output slots."""
    silent_audio = np.zeros((22050,), dtype=np.int16)
    silent_bytes = io.BytesIO()
    silent_bytes.write(silent_audio.tobytes())
    silent_bytes.seek(0)
    return silent_bytes


def main(image, checkboxes):
    image = Image.fromarray(image.astype('uint8'))
    base_64_image = img_to_base64(image)
    generated_content = []

    for selection in checkboxes:
        prompt = generate_prompt_from_description(selection, base_64_image)
        if not prompt:
            continue

        if selection == CHECKBOX_INPUTS[0]:
            # Localised speech: voice the sentence with Tortoise TTS
            output = replicate.run(
                "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
                input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"},
            )
        elif selection == CHECKBOX_INPUTS[4]:
            # Music: generate a clip with Riffusion
            output = replicate.run(
                "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
                input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75,
                       "seed_image_id": "vibes", "num_inference_steps": 50},
            )
            output = output['audio']
        else:
            # Non-localised speech, inanimate objects, ambient sound: AudioLDM sound effects
            output = replicate.run(
                "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
                input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5},
            )

        audio_file = download_audio(output)
        generated_content.append({"prompt": prompt, "audio": audio_file})

    print(generated_content)

    # Pad to exactly 5 prompt/audio pairs so the Gradio outputs always line up
    while len(generated_content) < 5:
        generated_content.append({"prompt": "", "audio": generate_silent_audio()})

    result_prompts = [item["prompt"] for item in generated_content]
    result_audios = [item["audio"].getvalue() for item in generated_content]
    return (result_prompts[0], result_audios[0],
            result_prompts[1], result_audios[1],
            result_prompts[2], result_audios[2],
            result_prompts[3], result_audios[3],
            result_prompts[4], result_audios[4])


demo = gr.Interface(
    fn=main,
    inputs=[
        "image",
        gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds"),
    ],
    outputs=["text", "audio", "text", "audio", "text", "audio", "text", "audio", "text", "audio"],
)
demo.launch(share=False)