Commit cc98bbc
Parent: 2d65456

Add volume adjustment feature to text-to-speech and multi-speaker interfaces

Files changed:
- app.py (+45 -15)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -126,7 +126,7 @@ async def update_text_from_file(file):
         return "", gr.Warning("Failed to process the file")
 
 
-async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
+async def text_to_speech(text, voice, rate, pitch, volume, generate_subtitles=False, uploaded_file=None):
     """Convert text to speech, handling both direct text input and uploaded files"""
     if not text.strip() and uploaded_file is None:
         return None, None, "Please enter text or upload a file to convert."

@@ -156,6 +156,7 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
+    volume_str = f"{volume:+d}%"
 
     # Create temporary file for audio
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:

@@ -185,7 +186,9 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
             segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
 
             # Generate audio for this segment
-            communicate = edge_tts.Communicate(segment_text, voice_short_name, rate=rate_str, pitch=pitch_str)
+            communicate = edge_tts.Communicate(
+                segment_text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+            )
             await communicate.save(segment_file)
 
             audio_segments.append({

@@ -195,9 +198,6 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
                 'text': segment_text
             })
 
-        # Combine audio segments with proper timing
-        import wave
-        import audioop
         from pydub import AudioSegment
 
         # Initialize final audio

@@ -222,7 +222,9 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
                 f.write(f"{entry['text']}\n\n")
     else:
         # Use the existing approach for regular text
-        communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+        communicate = edge_tts.Communicate(
+            text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+        )
         if not generate_subtitles:
             await communicate.save(audio_path)
         if generate_subtitles:

@@ -304,8 +306,8 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
     return audio_path, subtitle_path, None
 
 
-async def tts_interface(text, voice, rate, pitch, generate_subtitles, uploaded_file=None):
-    audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, generate_subtitles, uploaded_file)
+async def tts_interface(text, voice, rate, pitch, volume, generate_subtitles, uploaded_file=None):
+    audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, volume, generate_subtitles, uploaded_file)
     if warning:
         return audio, subtitle, gr.Warning(warning)
     return audio, subtitle, None

@@ -387,12 +389,15 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
             voice_short_name = speaker_settings[speaker_idx]['voice'].split(" - ")[0]
             rate_str = f"{speaker_settings[speaker_idx]['rate']:+d}%"
             pitch_str = f"{speaker_settings[speaker_idx]['pitch']:+d}Hz"
+            volume_str = f"{speaker_settings[speaker_idx].get('volume', 0):+d}%"
 
             # Create temporary file for this segment
             segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
 
             # Generate audio for this segment with speaker-specific settings
-            communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+            communicate = edge_tts.Communicate(
+                text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+            )
 
             # For subtitle generation, we need word boundaries
             if generate_subtitles:

@@ -492,8 +497,8 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
 
     return final_audio_path, subtitle_path, None
 
-async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
-                                  speaker2_voice, speaker2_rate, speaker2_pitch):
+async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch, speaker1_volume,
+                                  speaker2_voice, speaker2_rate, speaker2_pitch, speaker2_volume):
     """Interface function for multi-speaker TTS"""
     # Create speaker settings from individual parameters
     speaker_settings = []

@@ -503,7 +508,8 @@ async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
         speaker_settings.append({
             'voice': speaker1_voice,
             'rate': speaker1_rate,
-            'pitch': speaker1_pitch
+            'pitch': speaker1_pitch,
+            'volume': speaker1_volume,
         })
 
     # Add Speaker 2 if voice is selected

@@ -511,7 +517,8 @@ async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
         speaker_settings.append({
             'voice': speaker2_voice,
             'rate': speaker2_rate,
-            'pitch': speaker2_pitch
+            'pitch': speaker2_pitch,
+            'volume': speaker2_volume,
         })
 
     if not speaker_settings:

@@ -564,6 +571,13 @@ async def create_demo():
                     label="Speech Rate Adjustment (%)",
                     step=1,
                 )
+                volume_slider = gr.Slider(
+                    minimum=-50,
+                    maximum=50,
+                    value=0,
+                    label="Volume Adjustment (%)",
+                    step=1,
+                )
                 pitch_slider = gr.Slider(
                     minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
                 )

@@ -608,7 +622,7 @@ async def create_demo():
         submit_single_btn.click(
             fn=tts_interface,
             api_name="predict",
-            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
+            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, volume_slider, subtitle_checkbox, file_input],
             outputs=single_outputs
         )
 

@@ -635,6 +649,13 @@ async def create_demo():
                     label="Speaker 1 Rate (%)",
                     step=1,
                 )
+                speaker1_volume = gr.Slider(
+                    minimum=-50,
+                    maximum=50,
+                    value=0,
+                    label="Speaker 1 Volume (%)",
+                    step=1,
+                )
                 speaker1_pitch = gr.Slider(
                     minimum=-20,
                     maximum=20,

@@ -656,6 +677,13 @@ async def create_demo():
                     label="Speaker 2 Rate (%)",
                     step=1,
                 )
+                speaker2_volume = gr.Slider(
+                    minimum=-50,
+                    maximum=50,
+                    value=0,
+                    label="Speaker 2 Volume (%)",
+                    step=1,
+                )
                 speaker2_pitch = gr.Slider(
                     minimum=-20,
                     maximum=20,

@@ -683,9 +711,11 @@ async def create_demo():
                 speaker1_voice,
                 speaker1_rate,
                 speaker1_pitch,
+                speaker1_volume,
                 speaker2_voice,
                 speaker2_rate,
-                speaker2_pitch
+                speaker2_pitch,
+                speaker2_volume,
             ],
             outputs=multi_outputs
         )
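Two small details in the app.py changes are worth noting: the :+d format spec guarantees a leading sign on the offset strings, and the multi-speaker path reads volume with .get('volume', 0), so speaker dicts assembled without a volume key fall back to no adjustment. A minimal illustration (the values are made up):

# Illustrative only: the signed formatting and the backward-compatible
# volume lookup used in multi_speaker_tts.
speaker = {'voice': 'en-US-GuyNeural - Guy', 'rate': 5, 'pitch': -2}   # no 'volume' key
rate_str = f"{speaker['rate']:+d}%"               # "+5%"
pitch_str = f"{speaker['pitch']:+d}Hz"            # "-2Hz"
volume_str = f"{speaker.get('volume', 0):+d}%"    # "+0%": missing key means no change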
requirements.txt CHANGED

@@ -1,2 +1,2 @@
-edge_tts
+edge_tts==7.1.0
 gradio
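For reference, a minimal standalone sketch of the call pattern this commit wires up, runnable outside the Gradio app. The sample text, output path, and voice name are illustrative assumptions, not taken from this repo; the rate/volume/pitch keyword arguments as signed strings match the edge_tts.Communicate calls in the diff above.

# Hedged, standalone illustration of the new volume path.
# Assumes the edge_tts package from PyPI (pinned above as edge_tts==7.1.0).
import asyncio

import edge_tts

async def demo() -> None:
    rate, pitch, volume = 0, 0, 25            # slider-style integer values
    communicate = edge_tts.Communicate(
        "Hello from the volume demo.",        # illustrative text
        "en-US-AriaNeural",                   # illustrative voice short name
        rate=f"{rate:+d}%",                   # "+0%"
        volume=f"{volume:+d}%",               # "+25%"; the app's sliders span -50..+50
        pitch=f"{pitch:+d}Hz",                # "+0Hz"
    )
    await communicate.save("demo.mp3")        # illustrative output path

asyncio.run(demo())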