Leeps committed
Commit 28fa3d8 · verified · 1 Parent(s): 1c19253

Upload folder using huggingface_hub

Files changed (5)
  1. .env +2 -0
  2. .gitignore +13 -0
  3. README.md +3 -9
  4. index.py +170 -0
  5. requirements.txt +2 -0
.env ADDED
@@ -0,0 +1,2 @@
+ REPLICATE_API_TOKEN=r8_DAzyOBdCwUdt0b26ZMPWLyvyHTh55uh2Lwb3c
+ OPENAI_API_KEY=sk-proj-6lTXmIwTYmNo7uUpQwujT3BlbkFJDMVzyH5hzblFbgYLLMCP
.gitignore ADDED
@@ -0,0 +1,13 @@
+ .vercel
+ *.log
+ *.pyc
+ __pycache__
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Background Sounds Generator
- emoji: 🚀
- colorFrom: purple
- colorTo: blue
+ title: background-sounds-generator
+ app_file: index.py
  sdk: gradio
- sdk_version: 4.37.2
- app_file: app.py
- pinned: false
+ sdk_version: 4.36.1
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
index.py ADDED
@@ -0,0 +1,170 @@
+ import os
+ import base64
+ import numpy as np
+ from PIL import Image, ImageChops, ImageDraw
+
+ import io
+ import requests
+ import replicate
+ import gradio as gr
+ import openai
+ from openai import OpenAI
+
+ from dotenv import load_dotenv, find_dotenv
+
+ # Locate the .env file and load the API keys
+ dotenv_path = find_dotenv()
+ load_dotenv(dotenv_path)
+ REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
+ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+
+ client = OpenAI()
+
+ # 1 - send image to vision-language model
+ # Localised Speech
+ # Non-localised speech e.g. people in the background
+ # Inanimate objects e.g. Bell, iconic sounds
+ # Ambient sound e.g. wind, water ripple, tree, traffic
+ # Spatial dimension of the image
+ # music
+
+ # 2 - generate sounds from audioldm
+ # localized speech can be a different speech-specific model
+
+ # 3 - create soundtrack (not all sounds at once)
+
+
+ # Could use different system prompts depending on what type of sound
+ # Could use audio-ldm for sound effects and a different one for music
+
+
+ # audio ldm: start music prompt with "background music that sounds like"
+
+ CHECKBOX_INPUTS = ["Localised Speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "music"]
+
+ def call_openai(image_data, prompt):
+
+     try:
+         response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": prompt},
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": image_data,
+                             },
+                         },
+                     ],
+                 }
+             ],
+             max_tokens=100,
+         )
+         return response.choices[0].message.content
+     except openai.BadRequestError as e:
+         print(e)
+         print("e type")
+         print(type(e))
+         raise gr.Error("Please retry with a different moodboard file (below 20 MB in size and of one of the following formats: ['png', 'jpeg', 'gif', 'webp'])")
+     except Exception as e:
+         raise gr.Error("Unknown Error")
+
+ def img_to_base64(img):
+     buffered = io.BytesIO()
+     img.save(buffered, format="JPEG")
+     img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+     return "data:image/jpeg;base64," + img_base_64
+
+ def vision_language_model(img):
+     return
+
+ def generate_prompt_from_description(checkbox_label, img):
+     print(checkbox_label)
+     if checkbox_label == CHECKBOX_INPUTS[0]:
+         prompt = "reply with a single sentence that the person in the image might say"
+         return call_openai(img, prompt)
+         # use https://replicate.com/afiaka87/tortoise-tts
+
+     if checkbox_label == CHECKBOX_INPUTS[1]:
+         prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
+         return call_openai(img, prompt)
+     elif checkbox_label == CHECKBOX_INPUTS[2]:
+         prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
+         return call_openai(img, prompt)
+     elif checkbox_label == CHECKBOX_INPUTS[3]:
+         prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
+         return call_openai(img, prompt)
+     elif checkbox_label == CHECKBOX_INPUTS[4]:
+         prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
+         return call_openai(img, prompt)
+
+     # https://replicate.com/meta/llama-2-70b-chat
+     # You are a talented prompt writer. you turn paragraphs into short 5-word prompts to generate a song. These go directly into systems, so there should be no other text.
+     return
+
+ def generate_music(prompt):
+     return
+
+ def combine_music_clips(audio):
+     return
+
+
+ def download_audio(url):
+     response = requests.get(url)
+     response.raise_for_status()
+     return io.BytesIO(response.content)
+
+ def generate_silent_audio():
+     # One second of int16 silence at 22050 Hz, used to pad unused output slots
+     silent_audio = np.zeros((22050,), dtype=np.int16)
+     silent_bytes = io.BytesIO()
+     silent_bytes.write(silent_audio.tobytes())
+     silent_bytes.seek(0)
+     return silent_bytes
+
+ def main(image, checkboxes):
+     # Gradio callback: describe the image with GPT-4o, then generate one clip per selected sound type
+     image = Image.fromarray(image.astype('uint8'))
+     base_64_image = img_to_base64(image)
+
+     generated_content = []
+
+     for selection in checkboxes:
+         prompt = generate_prompt_from_description(selection, base_64_image)
+         if not prompt:
+             continue
+
+         if selection == CHECKBOX_INPUTS[0]:
+             output = replicate.run(
+                 "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
+                 input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"}
+             )
+         elif selection == CHECKBOX_INPUTS[4]:
+             output = replicate.run(
+                 "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
+                 input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
+             )
+             output = output['audio']
+         else:
+             output = replicate.run(
+                 "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
+                 input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5}
+             )
+
+         audio_file = download_audio(output)
+         generated_content.append({"prompt": prompt, "audio": audio_file})
+
+     print(generated_content)
+
+     # Ensure 5 pairs of prompt and audio
+     while len(generated_content) < 5:
+         generated_content.append({"prompt": "", "audio": generate_silent_audio()})
+
+     result_prompts = [item["prompt"] for item in generated_content]
+     result_audios = [item["audio"].getvalue() for item in generated_content]
+
+     return result_prompts[0], result_audios[0], result_prompts[1], result_audios[1], result_prompts[2], result_audios[2], result_prompts[3], result_audios[3], result_prompts[4], result_audios[4]
+
+ demo = gr.Interface(fn=main, inputs=["image", gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds")], outputs=["text", "audio", "text", "audio", "text", "audio", "text", "audio", "text", "audio"])
+ demo.launch(share=False)
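A note on the audio outputs returned by main: download_audio and generate_silent_audio yield raw bytes, while Gradio "audio" output components generally expect a filepath or a (sample_rate, numpy array) tuple rather than bytes. A minimal sketch of a helper (hypothetical, not part of this commit) that writes each clip to a temporary .wav file and returns its path instead:

import tempfile

def audio_bytes_to_tempfile(audio_bytes):
    # Hypothetical helper: persist downloaded audio bytes to a temporary .wav
    # file and return its path, which a gr.Audio output component accepts.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.write(audio_bytes)
    tmp.close()
    return tmp.name

Under that assumption, result_audios in main would be built as [audio_bytes_to_tempfile(item["audio"].getvalue()) for item in generated_content].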
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ gradio
+ replicate
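For running index.py outside the Space build, note that it also imports openai, python-dotenv, requests, numpy, and Pillow, which are not listed above. A fuller requirements file might read as follows (an assumption, not part of this commit):

gradio
replicate
openai
python-dotenv
requests
numpy
Pillow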