RootingInLoad committed
Commit 30d2f0b · unverified · 1 Parent(s): 46d391a

Batch Inference & Podcast Generation


Here's what the Batch Inference part does (a short usage sketch follows the list):

- Tries to pack as many characters as possible into one batch (200 max)
- If that's not possible, it'll try to cut wherever there's a colon character
- If that's not possible, it'll try to cut wherever there's a comma character
- If that's not possible, it'll try to cut after the most logical word (thus, therefore, etc.) --> there's a list at the top of the Gradio script, and it can be modified in Advanced Settings
- If none of the above works, it will simply go past the 200-character limit (realistically, if your text isn't gibberish, this shouldn't happen :D)
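As a quick illustration, here is a minimal usage sketch of the new `split_text_into_batches` helper added in the diff below. The sample text is made up, and the snippet assumes the helper is already in scope (e.g. run from within `gradio_app.py`), since importing the module would also load the TTS models:

```python
# Illustrative sketch only; assumes split_text_into_batches (from the diff
# below) is already defined in the current scope. The sample text is made up.
sample_text = (
    "The first sentence is short. "
    "The second sentence, however, keeps going and going, piling up clause "
    "after clause, adding commas along the way, and refusing to stop until it "
    "is well past the limit, therefore it has to be cut somewhere sensible."
)

batches = split_text_into_batches(sample_text, max_chars=200)
for i, batch in enumerate(batches, start=1):
    print(f"batch {i} ({len(batch)} chars): {batch!r}")
# Whole sentences are packed together while they fit under ~200 characters;
# an over-long sentence falls back to colon, comma, then SPLIT_WORDS cuts.
```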

The Podcast Generation feature has the following built in:
- It takes two reference audio clips and two reference texts (leave a reference text empty and it will be transcribed automatically)
- You have to give a name to each of the two speakers
- You can then paste the podcast script, with a speaker's name followed by a colon and then their text; alternate between the two speakers for as long as you want (it reuses the same batch inference as above). An example of the expected script format is shown below.
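For reference, here is a minimal sketch of the script format the Podcast tab expects and of how `generate_podcast` (in the diff below) separates it into speaker blocks. The speaker names and lines are invented, mirroring the placeholder text in the UI:

```python
import re

# Hypothetical speaker names and script, in the "Name: text" format the
# Podcast Script textbox expects.
speaker1_name, speaker2_name = "Sean", "Meghan"
script = """Sean: How did you start studying this?
Meghan: I came to my interest in technology through a long detour.
Sean: That's fascinating. Can you elaborate?"""

# Same splitting approach generate_podcast uses in the diff below.
speaker_pattern = re.compile(
    f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE
)
blocks = speaker_pattern.split(script)[1:]  # drop the leading empty element
for speaker, text in zip(blocks[0::2], blocks[1::2]):
    print(f"{speaker} -> {text.strip()}")
```

Each block is then synthesized with the matching speaker's reference audio, and a 500 ms pause is inserted after every block before everything is concatenated into one WAV file.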

All in all, the batch inference feature allows for slightly faster than real-time inference. (I might do another pull request with real-time streaming)

Immense thanks to all of those who worked on this project; it's really great. There's of course still room for improvement, but I think this is a step forward in terms of OSS TTS, so thanks!

Files changed (1)
  1. gradio_app.py +294 -79
gradio_app.py CHANGED
@@ -19,6 +19,18 @@ from model.utils import (
 from transformers import pipeline
 import librosa
 import click
+import soundfile as sf
+
+SPLIT_WORDS = [
+    "but", "however", "nevertheless", "yet", "still",
+    "therefore", "thus", "hence", "consequently",
+    "moreover", "furthermore", "additionally",
+    "meanwhile", "alternatively", "otherwise",
+    "namely", "specifically", "for example", "such as",
+    "in fact", "indeed", "notably",
+    "in contrast", "on the other hand", "conversely",
+    "in conclusion", "to summarize", "finally"
+]

 device = (
     "cuda"
@@ -87,11 +99,179 @@ E2TTS_ema_model = load_model(
     "E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000
 )

+def split_text_into_batches(text, max_chars=200, split_words=SPLIT_WORDS):
+    sentences = re.split('([。.!?!?])', text)
+    sentences = [''.join(i) for i in zip(sentences[0::2], sentences[1::2])]
+
+    batches = []
+    current_batch = ""
+
+    def split_by_words(text):
+        words = text.split()
+        current_word_part = ""
+        word_batches = []
+        for word in words:
+            if len(current_word_part) + len(word) + 1 <= max_chars:
+                current_word_part += word + ' '
+            else:
+                if current_word_part:
+                    # Try to find a suitable split word
+                    for split_word in split_words:
+                        split_index = current_word_part.rfind(' ' + split_word + ' ')
+                        if split_index != -1:
+                            word_batches.append(current_word_part[:split_index].strip())
+                            current_word_part = current_word_part[split_index:].strip() + ' '
+                            break
+                    else:
+                        # If no suitable split word found, just append the current part
+                        word_batches.append(current_word_part.strip())
+                        current_word_part = ""
+                current_word_part += word + ' '
+        if current_word_part:
+            word_batches.append(current_word_part.strip())
+        return word_batches
+
+    for sentence in sentences:
+        if len(current_batch) + len(sentence) <= max_chars:
+            current_batch += sentence
+        else:
+            # If adding this sentence would exceed the limit
+            if current_batch:
+                batches.append(current_batch)
+                current_batch = ""
+
+            # If the sentence itself is longer than max_chars, split it
+            if len(sentence) > max_chars:
+                # First, try to split by colon
+                colon_parts = sentence.split(':')
+                if len(colon_parts) > 1:
+                    for part in colon_parts:
+                        if len(part) <= max_chars:
+                            batches.append(part)
+                        else:
+                            # If colon part is still too long, split by comma
+                            comma_parts = part.split(',')
+                            if len(comma_parts) > 1:
+                                current_comma_part = ""
+                                for comma_part in comma_parts:
+                                    if len(current_comma_part) + len(comma_part) <= max_chars:
+                                        current_comma_part += comma_part + ','
+                                    else:
+                                        if current_comma_part:
+                                            batches.append(current_comma_part.rstrip(','))
+                                        current_comma_part = comma_part + ','
+                                if current_comma_part:
+                                    batches.append(current_comma_part.rstrip(','))
+                            else:
+                                # If no comma, split by words
+                                batches.extend(split_by_words(part))
+                else:
+                    # If no colon, split by comma
+                    comma_parts = sentence.split(',')
+                    if len(comma_parts) > 1:
+                        current_comma_part = ""
+                        for comma_part in comma_parts:
+                            if len(current_comma_part) + len(comma_part) <= max_chars:
+                                current_comma_part += comma_part + ','
+                            else:
+                                if current_comma_part:
+                                    batches.append(current_comma_part.rstrip(','))
+                                current_comma_part = comma_part + ','
+                        if current_comma_part:
+                            batches.append(current_comma_part.rstrip(','))
+                    else:
+                        # If no comma, split by words
+                        batches.extend(split_by_words(sentence))
+            else:
+                current_batch = sentence
+
+    if current_batch:
+        batches.append(current_batch)
+
+    return batches
+
+def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
+    if exp_name == "F5-TTS":
+        ema_model = F5TTS_ema_model
+    elif exp_name == "E2-TTS":
+        ema_model = E2TTS_ema_model
+
+    audio, sr = torchaudio.load(ref_audio)
+    if audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0, keepdim=True)
+
+    rms = torch.sqrt(torch.mean(torch.square(audio)))
+    if rms < target_rms:
+        audio = audio * target_rms / rms
+    if sr != target_sample_rate:
+        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+        audio = resampler(audio)
+    audio = audio.to(device)
+
+    generated_waves = []
+    spectrograms = []
+
+    for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
+        # Prepare the text
+        text_list = [ref_text + gen_text]
+        final_text_list = convert_char_to_pinyin(text_list)
+
+        # Calculate duration
+        ref_audio_len = audio.shape[-1] // hop_length
+        zh_pause_punc = r"。,、;:?!"
+        ref_text_len = len(ref_text) + len(re.findall(zh_pause_punc, ref_text))
+        gen_text_len = len(gen_text) + len(re.findall(zh_pause_punc, gen_text))
+        duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
+
+        # inference
+        with torch.inference_mode():
+            generated, _ = ema_model.sample(
+                cond=audio,
+                text=final_text_list,
+                duration=duration,
+                steps=nfe_step,
+                cfg_strength=cfg_strength,
+                sway_sampling_coef=sway_sampling_coef,
+            )
+
+        generated = generated[:, ref_audio_len:, :]
+        generated_mel_spec = rearrange(generated, "1 n d -> 1 d n")
+
+        vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+        generated_wave = vocos.decode(generated_mel_spec.cpu())
+        if rms < target_rms:
+            generated_wave = generated_wave * rms / target_rms
+
+        # wav -> numpy
+        generated_wave = generated_wave.squeeze().cpu().numpy()
+
+        if remove_silence:
+            non_silent_intervals = librosa.effects.split(generated_wave, top_db=30)
+            non_silent_wave = np.array([])
+            for interval in non_silent_intervals:
+                start, end = interval
+                non_silent_wave = np.concatenate(
+                    [non_silent_wave, generated_wave[start:end]]
+                )
+            generated_wave = non_silent_wave
+
+        generated_waves.append(generated_wave)
+        spectrograms.append(generated_mel_spec[0].cpu().numpy())
+
+    # Combine all generated waves
+    final_wave = np.concatenate(generated_waves)
+
+    # Create a combined spectrogram
+    combined_spectrogram = np.concatenate(spectrograms, axis=1)
+
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+        spectrogram_path = tmp_spectrogram.name
+        save_spectrogram(combined_spectrogram, spectrogram_path)
+
+    return (target_sample_rate, final_wave), spectrogram_path

 def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
     print(gen_text)
-    if len(gen_text) > 200:
-        raise gr.Error("Please keep your text under 200 chars.")
     gr.Info("Converting audio...")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
@@ -101,14 +281,10 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
             aseg = aseg[:15000]
         aseg.export(f.name, format="wav")
         ref_audio = f.name
-    if exp_name == "F5-TTS":
-        ema_model = F5TTS_ema_model
-    elif exp_name == "E2-TTS":
-        ema_model = E2TTS_ema_model

     if not ref_text.strip():
         gr.Info("No reference text provided, transcribing reference audio...")
-        ref_text = outputs = pipe(
+        ref_text = pipe(
             ref_audio,
             chunk_length_s=30,
             batch_size=128,
@@ -118,80 +294,67 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
         gr.Info("Finished transcription")
     else:
         gr.Info("Using custom reference text...")
-    audio, sr = torchaudio.load(ref_audio)
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
-
-    rms = torch.sqrt(torch.mean(torch.square(audio)))
-    if rms < target_rms:
-        audio = audio * target_rms / rms
-    if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
-    audio = audio.to(device)
-
-    # Prepare the text
-    text_list = [ref_text + gen_text]
-    final_text_list = convert_char_to_pinyin(text_list)
-
-    # Calculate duration
-    ref_audio_len = audio.shape[-1] // hop_length
-    # if fix_duration is not None:
-    #     duration = int(fix_duration * target_sample_rate / hop_length)
-    # else:
-    zh_pause_punc = r"。,、;:?!"
-    ref_text_len = len(ref_text) + len(re.findall(zh_pause_punc, ref_text))
-    gen_text_len = len(gen_text) + len(re.findall(zh_pause_punc, gen_text))
-    duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
-
-    # inference
-    gr.Info(f"Generating audio using {exp_name}")
-    with torch.inference_mode():
-        generated, _ = ema_model.sample(
-            cond=audio,
-            text=final_text_list,
-            duration=duration,
-            steps=nfe_step,
-            cfg_strength=cfg_strength,
-            sway_sampling_coef=sway_sampling_coef,
-        )
-
-    generated = generated[:, ref_audio_len:, :]
-    generated_mel_spec = rearrange(generated, "1 n d -> 1 d n")
-    gr.Info("Running vocoder")
-    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
-    generated_wave = vocos.decode(generated_mel_spec.cpu())
-    if rms < target_rms:
-        generated_wave = generated_wave * rms / target_rms
-
-    # wav -> numpy
-    generated_wave = generated_wave.squeeze().cpu().numpy()
-
-    if remove_silence:
-        gr.Info("Removing audio silences... This may take a moment")
-        non_silent_intervals = librosa.effects.split(generated_wave, top_db=30)
-        non_silent_wave = np.array([])
-        for interval in non_silent_intervals:
-            start, end = interval
-            non_silent_wave = np.concatenate(
-                [non_silent_wave, generated_wave[start:end]]
-            )
-        generated_wave = non_silent_wave
-
-    # spectogram
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
-        spectrogram_path = tmp_spectrogram.name
-        save_spectrogram(generated_mel_spec[0].cpu().numpy(), spectrogram_path)
-
-    return (target_sample_rate, generated_wave), spectrogram_path

+    # Split the input text into batches
+    gen_text_batches = split_text_into_batches(gen_text)
+
+    gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
+    return infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence)
+
+def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
+    # Split the script into speaker blocks
+    speaker_pattern = re.compile(f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE)
+    speaker_blocks = speaker_pattern.split(script)[1:]  # Skip the first empty element
+
+    generated_audio_segments = []
+
+    for i in range(0, len(speaker_blocks), 2):
+        speaker = speaker_blocks[i]
+        text = speaker_blocks[i+1].strip()
+
+        # Determine which speaker is talking
+        if speaker == speaker1_name:
+            ref_audio = ref_audio1
+            ref_text = ref_text1
+        elif speaker == speaker2_name:
+            ref_audio = ref_audio2
+            ref_text = ref_text2
+        else:
+            continue  # Skip if the speaker is neither speaker1 nor speaker2
+
+        # Generate audio for this block
+        audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence)
+
+        # Convert the generated audio to a numpy array
+        sr, audio_data = audio
+
+        # Save the audio data as a WAV file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            sf.write(temp_file.name, audio_data, sr)
+            audio_segment = AudioSegment.from_wav(temp_file.name)
+
+        generated_audio_segments.append(audio_segment)
+
+        # Add a short pause between speakers
+        pause = AudioSegment.silent(duration=500)  # 500ms pause
+        generated_audio_segments.append(pause)
+
+    # Concatenate all audio segments
+    final_podcast = sum(generated_audio_segments)
+
+    # Export the final podcast
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        podcast_path = temp_file.name
+        final_podcast.export(podcast_path, format="wav")
+
+    return podcast_path

 with gr.Blocks() as app:
     gr.Markdown(
         """
-# E2/F5 TTS
+# E2/F5 TTS with Advanced Batch Processing

-This is a local web UI for F5 TTS, based on the unofficial [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS). This app supports the following TTS models:
+This is a local web UI for F5 TTS with advanced batch processing support, based on the unofficial [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS). This app supports the following TTS models:

 * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
 * [E2-TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
@@ -205,7 +368,7 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     )

     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
-    gen_text_input = gr.Textbox(label="Text to Generate (max 200 chars.)", lines=4)
+    gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
     model_choice = gr.Radio(
         choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
     )
@@ -221,23 +384,75 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
         info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
         value=True,
     )
+    split_words_input = gr.Textbox(
+        label="Custom Split Words",
+        info="Enter custom words to split on, separated by commas. Leave blank to use default list.",
+        lines=2,
+    )

     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")

+    def infer_with_custom_split(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_split_words):
+        if custom_split_words:
+            custom_words = [word.strip() for word in custom_split_words.split(',')]
+            global SPLIT_WORDS
+            SPLIT_WORDS = custom_words
+        return infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence)
+
     generate_btn.click(
-        infer,
+        infer_with_custom_split,
         inputs=[
             ref_audio_input,
             ref_text_input,
             gen_text_input,
             model_choice,
             remove_silence,
+            split_words_input,
         ],
         outputs=[audio_output, spectrogram_output],
     )
-
-
+    with gr.Tab("Podcast Generation"):
+        speaker1_name = gr.Textbox(label="Speaker 1 Name")
+        ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
+        ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)
+
+        speaker2_name = gr.Textbox(label="Speaker 2 Name")
+        ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
+        ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)
+
+        script_input = gr.Textbox(label="Podcast Script", lines=10,
+                                  placeholder="Enter the script with speaker names at the start of each block, e.g.:\nSean: How did you start studying...\n\nMeghan: I came to my interest in technology...\nIt was a long journey...\n\nSean: That's fascinating. Can you elaborate...")
+
+        podcast_model_choice = gr.Radio(
+            choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS"
+        )
+        podcast_remove_silence = gr.Checkbox(
+            label="Remove Silences",
+            value=True,
+        )
+        generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
+        podcast_output = gr.Audio(label="Generated Podcast")
+
+        def podcast_generation(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence):
+            return generate_podcast(script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence)
+
+        generate_podcast_btn.click(
+            podcast_generation,
+            inputs=[
+                script_input,
+                speaker1_name,
+                ref_audio_input1,
+                ref_text_input1,
+                speaker2_name,
+                ref_audio_input2,
+                ref_text_input2,
+                podcast_model_choice,
+                podcast_remove_silence,
+            ],
+            outputs=podcast_output,
+        )
+
 @click.command()
 @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
 @click.option("--host", "-H", default=None, help="Host to run the app on")