Leeps committed
Commit 28fa3d8 · verified · 1 Parent(s): 1c19253

Upload folder using huggingface_hub

Files changed (5)
  1. .env +2 -0
  2. .gitignore +13 -0
  3. README.md +3 -9
  4. index.py +170 -0
  5. requirements.txt +2 -0
.env ADDED
@@ -0,0 +1,2 @@
+ REPLICATE_API_TOKEN=r8_DAzyOBdCwUdt0b26ZMPWLyvyHTh55uh2Lwb3c
+ OPENAI_API_KEY=sk-proj-6lTXmIwTYmNo7uUpQwujT3BlbkFJDMVzyH5hzblFbgYLLMCP
.gitignore ADDED
@@ -0,0 +1,13 @@
+ .vercel
+ *.log
+ *.pyc
+ __pycache__
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Background Sounds Generator
- emoji: 🚀
- colorFrom: purple
- colorTo: blue
+ title: background-sounds-generator
+ app_file: index.py
  sdk: gradio
- sdk_version: 4.37.2
- app_file: app.py
- pinned: false
+ sdk_version: 4.36.1
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
index.py ADDED
@@ -0,0 +1,170 @@
+ import os
+ import base64
+ import numpy as np
+ from PIL import Image, ImageChops, ImageDraw
+
+ import io
+ import requests
+ import replicate
+ import gradio as gr
+ import openai
+ from openai import OpenAI
+
+ from dotenv import load_dotenv, find_dotenv
+
+ # Locate the .env file and load the API keys
+ dotenv_path = find_dotenv()
+ load_dotenv(dotenv_path)
+ REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
+ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+
+ client = OpenAI()
+
+ # 1 - send image to vision-language model
+ # Localised Speech
+ # Non-localised speech e.g. people in the background
+ # Inanimate objects e.g. Bell, iconic sounds
+ # Ambient sound e.g. wind, water ripple, tree, traffic
+ # Spatial dimension of the image
+ # music
+
+ # 2 - generate sounds from audioldm
+ # localized speech can be a different speech-specific model
+
+ # 3 - create soundtrack (not all sounds at once)
+
+
+ # Could use different system prompts depending on what type of sound
+ # Could use audio-ldm for sound effects and a different one for music
+
+
+ # audio ldm: start music prompt with "background music that sounds like"
+
+ CHECKBOX_INPUTS = ["Localised Speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "music"]
+
+ def call_openai(image_data, prompt):
+
+     try:
+         response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": prompt},
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": image_data,
+                             },
+                         },
+                     ],
+                 }
+             ],
+             max_tokens=100,
+         )
+         return response.choices[0].message.content
+     except openai.BadRequestError as e:
+         print(e)
+         print("e type")
+         print(type(e))
+         raise gr.Error("Please retry with a different moodboard file (below 20 MB in size and of one of the following formats: ['png', 'jpeg', 'gif', 'webp'])")
+     except Exception as e:
+         raise gr.Error("Unknown Error")
+
+ def img_to_base64(img):
+     buffered = io.BytesIO()
+     img.save(buffered, format="JPEG")
+     img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+     return "data:image/jpeg;base64," + img_base_64
+
+ def vision_language_model(img):
+     return
+
+ def generate_prompt_from_description(checkbox_label, img):
+     print(checkbox_label)
+     if checkbox_label == CHECKBOX_INPUTS[0]:
+         prompt = "reply with a single sentence that the person in the image might say"
+         return call_openai(img, prompt)
+         # use https://replicate.com/afiaka87/tortoise-tts
+
+     if checkbox_label == CHECKBOX_INPUTS[1]:
+         prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
+         return call_openai(img, prompt)
+     elif checkbox_label == CHECKBOX_INPUTS[2]:
+         prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
+         return call_openai(img, prompt)
+     elif checkbox_label == CHECKBOX_INPUTS[3]:
+         prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
+         return call_openai(img, prompt)
+     elif checkbox_label == CHECKBOX_INPUTS[4]:
+         prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
+         return call_openai(img, prompt)
+
+     # https://replicate.com/meta/llama-2-70b-chat
+     # You are a talented prompt writer. you turn paragraphs into short 5-word prompts to generate a song. These go directly into systems, so there should be no other text.
+     return
+
+ def generate_music(prompt):
+     return
+
+ def combine_music_clips(audio):
+     return
+
+
+ def download_audio(url):
+     response = requests.get(url)
+     response.raise_for_status()
+     return io.BytesIO(response.content)
+
+ def generate_silent_audio():
+     # One second of int16 silence at 22050 Hz, used to pad unused output slots
+     silent_audio = np.zeros((22050,), dtype=np.int16)
+     silent_bytes = io.BytesIO()
+     silent_bytes.write(silent_audio.tobytes())
+     silent_bytes.seek(0)
+     return silent_bytes
+
+ def main(image, checkboxes):
+     # Gradio callback: describe the image with GPT-4o, then generate one clip per selected sound type
+     image = Image.fromarray(image.astype('uint8'))
+     base_64_image = img_to_base64(image)
+
+     generated_content = []
+
+     for selection in checkboxes:
+         prompt = generate_prompt_from_description(selection, base_64_image)
+         if not prompt:
+             continue
+
+         if selection == CHECKBOX_INPUTS[0]:
+             output = replicate.run(
+                 "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
+                 input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"}
+             )
+         elif selection == CHECKBOX_INPUTS[4]:
+             output = replicate.run(
+                 "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
+                 input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
+             )
+             output = output['audio']
+         else:
+             output = replicate.run(
+                 "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
+                 input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5}
+             )
+
+         audio_file = download_audio(output)
+         generated_content.append({"prompt": prompt, "audio": audio_file})
+
+     print(generated_content)
+
+     # Ensure 5 pairs of prompt and audio
+     while len(generated_content) < 5:
+         generated_content.append({"prompt": "", "audio": generate_silent_audio()})
+
+     result_prompts = [item["prompt"] for item in generated_content]
+     result_audios = [item["audio"].getvalue() for item in generated_content]
+
+     return result_prompts[0], result_audios[0], result_prompts[1], result_audios[1], result_prompts[2], result_audios[2], result_prompts[3], result_audios[3], result_prompts[4], result_audios[4]
+
+ demo = gr.Interface(fn=main, inputs=["image", gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds")], outputs=["text", "audio", "text", "audio", "text", "audio", "text", "audio", "text", "audio"])
+ demo.launch(share=False)
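A note on the audio outputs returned by main: download_audio and generate_silent_audio yield raw bytes, while Gradio "audio" output components generally expect a filepath or a (sample_rate, numpy array) tuple rather than bytes. A minimal sketch of a helper (hypothetical, not part of this commit) that writes each clip to a temporary .wav file and returns its path instead:

import tempfile

def audio_bytes_to_tempfile(audio_bytes):
    # Hypothetical helper: persist downloaded audio bytes to a temporary .wav
    # file and return its path, which a gr.Audio output component accepts.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.write(audio_bytes)
    tmp.close()
    return tmp.name

Under that assumption, result_audios in main would be built as [audio_bytes_to_tempfile(item["audio"].getvalue()) for item in generated_content].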
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ gradio
+ replicate
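For running index.py outside the Space build, note that it also imports openai, python-dotenv, requests, numpy, and Pillow, which are not listed above. A fuller requirements file might read as follows (an assumption, not part of this commit):

gradio
replicate
openai
python-dotenv
requests
numpy
Pillow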