Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
app.py
CHANGED
@@ -79,8 +79,10 @@ def generate_response(messages, model, tokenizer):
 
 
 @gpu_decorator
-def infer(
-
+def infer(
+    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
+):
+    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
 
     if model == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -94,7 +96,7 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
         ema_model,
         cross_fade_duration=cross_fade_duration,
         speed=speed,
-        show_info=
+        show_info=show_info,
         progress=gr.Progress(),
     )
 
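The two hunks above add a `show_info` callback to `infer` and pass it through to both `preprocess_ref_audio_text` and the synthesis call, so the caller decides where status messages surface (the default `gr.Info` shows them as Gradio toasts). A minimal sketch of the pattern with a hypothetical `synthesize` helper, not the app's own function:

```python
# Minimal sketch of the show_info pattern (hypothetical helper, not the app's code):
# whatever callable is passed decides where progress/status messages go.
def synthesize(text, show_info=print):
    show_info(f"Synthesizing {len(text)} characters...")
    return b"fake-audio-bytes"  # stand-in for real model output

synthesize("Hello there.")                             # messages go to the console
synthesize("Hello there.", show_info=lambda *_: None)  # messages suppressed, e.g. for nested calls
```

Later hunks pass `show_info=None` for the per-segment and chat calls, presumably to keep those nested calls from raising extra notifications.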
@@ -183,24 +185,24 @@ def parse_speechtypes_text(gen_text):
 
     segments = []
 
-
+    current_style = "Regular"
 
     for i in range(len(tokens)):
         if i % 2 == 0:
             # This is text
             text = tokens[i].strip()
             if text:
-                segments.append({"
+                segments.append({"style": current_style, "text": text})
         else:
-            # This is 
-
-
+            # This is style
+            style = tokens[i].strip()
+            current_style = style
 
     return segments
 
 
 with gr.Blocks() as app_multistyle:
-    # New section for 
+    # New section for multistyle generation
     gr.Markdown(
         """
     # Multiple Speech-Type Generation
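The hunk above walks tokens that alternate between text (even indices) and style tags (odd indices), carrying a `current_style` forward so untagged text inherits the previous style. A self-contained sketch of that logic, assuming the tokens come from splitting the script on `{Style}` markers (the splitting step is outside this diff):

```python
import re

def parse_speechtypes_text(gen_text):
    # Assumption: tokens alternate text/style because the script is split on {Style} markers.
    tokens = re.split(r"\{(.*?)\}", gen_text)
    segments = []
    current_style = "Regular"
    for i in range(len(tokens)):
        if i % 2 == 0:
            text = tokens[i].strip()
            if text:
                segments.append({"style": current_style, "text": text})
        else:
            current_style = tokens[i].strip()
    return segments

print(parse_speechtypes_text("{Regular} Hello. {Whisper} Don't tell anyone."))
# [{'style': 'Regular', 'text': 'Hello.'}, {'style': 'Whisper', 'text': "Don't tell anyone."}]
```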
@@ -313,29 +315,29 @@ with gr.Blocks() as app_multistyle:
     delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
 
     # Text input for the prompt
-
+    gen_text_input_multistyle = gr.Textbox(
         label="Text to Generate",
         lines=10,
         placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
     )
 
     # Model choice
-
+    model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
 
     with gr.Accordion("Advanced Settings", open=False):
-
+        remove_silence_multistyle = gr.Checkbox(
             label="Remove Silences",
             value=False,
         )
 
     # Generate button
-
+    generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
 
     # Output audio
-
+    audio_output_multistyle = gr.Audio(label="Synthesized Audio")
 
     @gpu_decorator
-    def 
+    def generate_multistyle_speech(
         regular_audio,
         regular_ref_text,
         gen_text,
@@ -362,23 +364,23 @@ with gr.Blocks() as app_multistyle:
 
         # For each segment, generate speech
         generated_audio_segments = []
-
+        current_style = "Regular"
 
         for segment in segments:
-
+            style = segment["style"]
             text = segment["text"]
 
-            if 
-
+            if style in speech_types:
+                current_style = style
             else:
-                # If 
-
+                # If style not available, default to Regular
+                current_style = "Regular"
 
-            ref_audio = speech_types[
-            ref_text = speech_types[
+            ref_audio = speech_types[current_style]["audio"]
+            ref_text = speech_types[current_style].get("ref_text", "")
 
             # Generate speech for this segment
-            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
+            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=None)
             sr, audio_data = audio
 
             generated_audio_segments.append(audio_data)
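In the loop above, each segment's style is resolved against the configured speech types, unknown styles fall back to "Regular", and a missing reference text degrades to an empty string via `.get("ref_text", "")`. A tiny sketch of that lookup with a stand-in `speech_types` dict (illustrative values only, not the app's state):

```python
# Stand-in data for illustration; in the app this comes from the UI components.
speech_types = {
    "Regular": {"audio": "regular.wav", "ref_text": "A neutral reference line."},
    "Whisper": {"audio": "whisper.wav"},  # no reference text recorded
}

def pick_reference(style):
    chosen = style if style in speech_types else "Regular"  # unknown styles fall back to Regular
    entry = speech_types[chosen]
    return entry["audio"], entry.get("ref_text", "")

print(pick_reference("Whisper"))   # ('whisper.wav', '')
print(pick_reference("Shouting"))  # ('regular.wav', 'A neutral reference line.')
```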
@@ -391,21 +393,21 @@ with gr.Blocks() as app_multistyle:
             gr.Warning("No audio generated.")
             return None
 
-
-
+    generate_multistyle_btn.click(
+        generate_multistyle_speech,
         inputs=[
             regular_audio,
             regular_ref_text,
-
+            gen_text_input_multistyle,
         ]
         + speech_type_names
         + speech_type_audios
         + speech_type_ref_texts
         + [
-
-
+            model_choice_multistyle,
+            remove_silence_multistyle,
         ],
-        outputs=
+        outputs=audio_output_multistyle,
     )
 
     # Validation function to disable Generate button if speech types are missing
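The `.click` wiring above builds one flat `inputs` list by concatenating the fixed components with the dynamically created per-speech-type lists, and the handler receives the extras as positional arguments. A minimal Gradio sketch of the same pattern (component names here are illustrative, not the app's):

```python
import gradio as gr

with gr.Blocks() as demo:
    base_text = gr.Textbox(label="Text")
    extra_names = [gr.Textbox(label=f"Speech type {i} name") for i in range(2)]
    extra_audios = [gr.Audio(label=f"Speech type {i} audio", type="filepath") for i in range(2)]
    out = gr.Textbox(label="Summary")
    btn = gr.Button("Run")

    def handler(text, *args):
        # The fixed input arrives first; the concatenated lists arrive in order after it.
        names, audios = args[:2], args[2:]
        return f"text={text!r}, names={list(names)}, audios={list(audios)}"

    btn.click(handler, inputs=[base_text] + extra_names + extra_audios, outputs=out)

# demo.launch()  # uncomment to try locally
```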
@@ -423,7 +425,7 @@ with gr.Blocks() as app_multistyle:
 
         # Parse the gen_text to get the speech types used
         segments = parse_speechtypes_text(gen_text)
-        speech_types_in_text = set(segment["
+        speech_types_in_text = set(segment["style"] for segment in segments)
 
         # Check if all speech types in text are available
         missing_speech_types = speech_types_in_text - speech_types_available
@@ -435,10 +437,10 @@ with gr.Blocks() as app_multistyle:
             # Enable the generate button
             return gr.update(interactive=True)
 
-
+    gen_text_input_multistyle.change(
         validate_speech_types,
-        inputs=[
-        outputs=
+        inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
+        outputs=generate_multistyle_btn,
     )
 
 
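The hunk above re-validates the script on every edit of the multistyle textbox and returns `gr.update(interactive=...)` with the generate button as the output, enabling or disabling it. A small self-contained sketch of that enable/disable pattern (the validation rule is made up for illustration):

```python
import gradio as gr

with gr.Blocks() as demo:
    script = gr.Textbox(label="Script")
    generate_btn = gr.Button("Generate", interactive=True)

    def validate(text):
        # Illustrative rule only: require at least one {Style} marker before enabling.
        ok = "{" in text and "}" in text
        return gr.update(interactive=ok)

    script.change(validate, inputs=script, outputs=generate_btn)

# demo.launch()  # uncomment to try locally
```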
@@ -576,6 +578,7 @@ Have a conversation with an AI using your reference voice!
             remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
+            show_info=None,
         )
         return audio_result
 