ak0601 committed on
Commit
e605e02
·
verified ·
1 Parent(s): ad7c71c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -9
app.py CHANGED
@@ -140,7 +140,7 @@ css = """
140
  iface = gr.Blocks(css=css)
141
 
142
  with iface:
143
- gr.HTML(
144
  """
145
  <div style="text-align: center; max-width: 700px; margin: 0 auto;">
146
  <div
@@ -149,14 +149,89 @@ with iface:
149
  "
150
  >
151
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
152
- AudioLDM: Text-to-Audio Generation with Latent Diffusion Models
153
  </h1>
154
- </div> <p style="margin-bottom: 10px; font-size: 94%">
155
- <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project
156
- page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm">[🧨
157
- Diffusers]</a>
158
- </p>
159
- </div>
160
  """
161
  )
162
- iface.queue(max_size=10).launch(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  iface = gr.Blocks(css=css)
141
 
142
  with iface:
143
+ gr.HTML(
144
  """
145
  <div style="text-align: center; max-width: 700px; margin: 0 auto;">
146
  <div
 
149
  "
150
  >
151
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
152
+ AudioLDM: Text-to-Audio Generation Diffusion Models
153
  </h1>
 
 
 
 
 
 
154
  """
155
  )
156
+ with gr.Group():
157
+ with gr.Box():
158
+ textbox = gr.Textbox(
159
+ value="A hammer is hitting a wooden surface",
160
+ max_lines=1,
161
+ label="Input text",
162
+ info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
163
+ elem_id="prompt-in",
164
+ )
165
+ negative_textbox = gr.Textbox(
166
+ value="low quality, average quality",
167
+ max_lines=1,
168
+ label="Negative prompt",
169
+ info="Enter a negative prompt describing what the audio generation should avoid. Selecting appropriate negative prompts can improve the audio quality significantly.",
170
+ elem_id="prompt-in",
171
+ )
172
+
173
+ with gr.Accordion("Click to modify detailed configurations", open=False):
174
+ seed = gr.Number(
175
+ value=45,
176
+ label="Seed",
177
+ info="Changing this value (any integer) will lead to a different generation result.",
178
+ )
179
+ duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
180
+ guidance_scale = gr.Slider(
181
+ 0,
182
+ 5,
183
+ value=3.5,
184
+ step=0.5,
185
+ label="Guidance scale",
186
+ info="Large => better quality and relevancy to text; Small => better diversity",
187
+ )
188
+ n_candidates = gr.Slider(
189
+ 1,
190
+ 3,
191
+ value=3,
192
+ step=1,
193
+ label="Number of waveforms to generate",
194
+ info="Automatic quality control. This number controls the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually leads to better quality with heavier computation.",
195
+ )
196
+
197
+ outputs = gr.Video(label="Output", elem_id="output-video")
198
+ btn = gr.Button("Submit").style(full_width=True)
199
+
200
+ with gr.Group(elem_id="share-btn-container", visible=False):
201
+ community_icon = gr.HTML(community_icon_html)
202
+ loading_icon = gr.HTML(loading_icon_html)
203
+ share_button = gr.Button("Share to community", elem_id="share-btn")
204
+
205
+ btn.click(
206
+ text2audio,
207
+ inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
208
+ outputs=[outputs],
209
+ )
210
+
211
+ share_button.click(None, [], [], _js=share_js)
212
+ gr.HTML(
213
+ gr.Examples(
214
+ [
215
+ ["A hammer is hitting a wooden surface", "low quality, average quality", 5, 2.5, 45, 3],
216
+ ["Peaceful and calming ambient music with singing bowl and other instruments.", "low quality, average quality", 5, 2.5, 45, 3],
217
+ ["A man is speaking in a small room.", "low quality, average quality", 5, 2.5, 45, 3],
218
+ ["A female is speaking followed by footstep sound", "low quality, average quality", 5, 2.5, 45, 3],
219
+ ["Wooden table tapping sound followed by water pouring sound.", "low quality, average quality", 5, 2.5, 45, 3],
220
+ ],
221
+ fn=text2audio,
222
+ inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
223
+ outputs=[outputs],
224
+ cache_examples=True,
225
+ )
226
+ gr.HTML(
227
+ """
228
+ <div class="acknowledgements"> <p>Essential Tricks for Enhancing the Quality of Your Generated
229
+ Audio</p> <p>1. Try to use more adjectives to describe your sound. For example: "A man is speaking
230
+ clearly and slowly in a large room" is better than "A man is speaking". This can make sure AudioLDM
231
+ understands what you want.</p> <p>2. Try to use different random seeds, which can affect the generation
232
+ quality significantly sometimes.</p> <p>3. It's better to use general terms like 'man' or 'woman'
233
+ instead of specific names for individuals or abstract objects that humans may not be familiar with,
234
+ such as 'mummy'.</p> <p>4. Using a negative prompt to not guide the diffusion process can improve the
235
+ audio quality significantly. Try using negative prompts like 'low quality'.</p> </div>
236
+ """
237
+ )