walidadebayo committed
Commit cc98bbc · 1 Parent(s): 2d65456

Add volume adjustment feature to text-to-speech and multi-speaker interfaces

Files changed (2)
  1. app.py +45 -15
  2. requirements.txt +1 -1
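
The substance of the change: each interface now builds a signed-percent volume string from the slider value and forwards it to edge_tts.Communicate together with the existing rate and pitch strings. A minimal, self-contained sketch of that pattern follows; the synthesize helper and the example voice are illustrative and not code from app.py.

```python
# Minimal sketch of the pattern this commit applies: the integer slider values
# are formatted as signed strings and handed to edge_tts.Communicate, with the
# new volume argument alongside rate and pitch. The synthesize helper and the
# example voice are illustrative, not code from app.py.
import asyncio

import edge_tts


async def synthesize(text, voice, rate, pitch, volume, out_path):
    rate_str = f"{rate:+d}%"      # e.g. "+10%"
    pitch_str = f"{pitch:+d}Hz"   # e.g. "-5Hz"
    volume_str = f"{volume:+d}%"  # e.g. "+25%", the new knob in this commit
    communicate = edge_tts.Communicate(
        text, voice, rate=rate_str, volume=volume_str, pitch=pitch_str
    )
    await communicate.save(out_path)


if __name__ == "__main__":
    asyncio.run(synthesize("Hello from the volume demo.", "en-US-AriaNeural", 0, 0, 25, "demo.mp3"))
```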
app.py CHANGED
@@ -126,7 +126,7 @@ async def update_text_from_file(file):
  return "", gr.Warning("Failed to process the file")


- async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
+ async def text_to_speech(text, voice, rate, pitch, volume, generate_subtitles=False, uploaded_file=None):
  """Convert text to speech, handling both direct text input and uploaded files"""
  if not text.strip() and uploaded_file is None:
  return None, None, "Please enter text or upload a file to convert."
@@ -156,6 +156,7 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, upl
  voice_short_name = voice.split(" - ")[0]
  rate_str = f"{rate:+d}%"
  pitch_str = f"{pitch:+d}Hz"
+ volume_str = f"{volume:+d}%"

  # Create temporary file for audio
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
@@ -185,7 +186,9 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, upl
  segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")

  # Generate audio for this segment
- communicate = edge_tts.Communicate(segment_text, voice_short_name, rate=rate_str, pitch=pitch_str)
+ communicate = edge_tts.Communicate(
+ segment_text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+ )
  await communicate.save(segment_file)

  audio_segments.append({
@@ -195,9 +198,6 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, upl
  'text': segment_text
  })

- # Combine audio segments with proper timing
- import wave
- import audioop
  from pydub import AudioSegment

  # Initialize final audio
@@ -222,7 +222,9 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, upl
  f.write(f"{entry['text']}\n\n")
  else:
  # Use the existing approach for regular text
- communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+ communicate = edge_tts.Communicate(
+ text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+ )
  if not generate_subtitles:
  await communicate.save(audio_path)
  if generate_subtitles:
@@ -304,8 +306,8 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, upl
  return audio_path, subtitle_path, None


- async def tts_interface(text, voice, rate, pitch, generate_subtitles, uploaded_file=None):
- audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, generate_subtitles, uploaded_file)
+ async def tts_interface(text, voice, rate, pitch, volume, generate_subtitles, uploaded_file=None):
+ audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, volume, generate_subtitles, uploaded_file)
  if warning:
  return audio, subtitle, gr.Warning(warning)
  return audio, subtitle, None
@@ -387,12 +389,15 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
  voice_short_name = speaker_settings[speaker_idx]['voice'].split(" - ")[0]
  rate_str = f"{speaker_settings[speaker_idx]['rate']:+d}%"
  pitch_str = f"{speaker_settings[speaker_idx]['pitch']:+d}Hz"
+ volume_str = f"{speaker_settings[speaker_idx].get('volume', 0):+d}%"

  # Create temporary file for this segment
  segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")

  # Generate audio for this segment with speaker-specific settings
- communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+ communicate = edge_tts.Communicate(
+ text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+ )

  # For subtitle generation, we need word boundaries
  if generate_subtitles:
@@ -492,8 +497,8 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):

  return final_audio_path, subtitle_path, None

- async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
- speaker2_voice, speaker2_rate, speaker2_pitch):
+ async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch, speaker1_volume,
+ speaker2_voice, speaker2_rate, speaker2_pitch, speaker2_volume):
  """Interface function for multi-speaker TTS"""
  # Create speaker settings from individual parameters
  speaker_settings = []
@@ -503,7 +508,8 @@ async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, spea
  speaker_settings.append({
  'voice': speaker1_voice,
  'rate': speaker1_rate,
- 'pitch': speaker1_pitch
+ 'pitch': speaker1_pitch,
+ 'volume': speaker1_volume,
  })

  # Add Speaker 2 if voice is selected
@@ -511,7 +517,8 @@ async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, spea
  speaker_settings.append({
  'voice': speaker2_voice,
  'rate': speaker2_rate,
- 'pitch': speaker2_pitch
+ 'pitch': speaker2_pitch,
+ 'volume': speaker2_volume,
  })

  if not speaker_settings:
@@ -564,6 +571,13 @@ async def create_demo():
  label="Speech Rate Adjustment (%)",
  step=1,
  )
+ volume_slider = gr.Slider(
+ minimum=-50,
+ maximum=50,
+ value=0,
+ label="Volume Adjustment (%)",
+ step=1,
+ )
  pitch_slider = gr.Slider(
  minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
  )
@@ -608,7 +622,7 @@ async def create_demo():
  submit_single_btn.click(
  fn=tts_interface,
  api_name="predict",
- inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
+ inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, volume_slider, subtitle_checkbox, file_input],
  outputs=single_outputs
  )

@@ -635,6 +649,13 @@ async def create_demo():
  label="Speaker 1 Rate (%)",
  step=1,
  )
+ speaker1_volume = gr.Slider(
+ minimum=-50,
+ maximum=50,
+ value=0,
+ label="Speaker 1 Volume (%)",
+ step=1,
+ )
  speaker1_pitch = gr.Slider(
  minimum=-20,
  maximum=20,
@@ -656,6 +677,13 @@ async def create_demo():
  label="Speaker 2 Rate (%)",
  step=1,
  )
+ speaker2_volume = gr.Slider(
+ minimum=-50,
+ maximum=50,
+ value=0,
+ label="Speaker 2 Volume (%)",
+ step=1,
+ )
  speaker2_pitch = gr.Slider(
  minimum=-20,
  maximum=20,
@@ -683,9 +711,11 @@ async def create_demo():
  speaker1_voice,
  speaker1_rate,
  speaker1_pitch,
+ speaker1_volume,
  speaker2_voice,
  speaker2_rate,
- speaker2_pitch
+ speaker2_pitch,
+ speaker2_volume,
  ],
  outputs=multi_outputs
  )
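
As a reading aid for the multi-speaker hunks above: each speaker travels through multi_speaker_interface as a small dict, and multi_speaker_tts reads the new volume with a .get('volume', 0) fallback, so settings built without the key still render at the default loudness. The sketch below mirrors that structure; the voices and numbers in it are invented for illustration.

```python
# Sketch of the per-speaker settings structure that multi_speaker_interface now
# builds and multi_speaker_tts consumes; the .get('volume', 0) fallback mirrors
# the diff above, while the voices and numbers here are invented examples.
speaker_settings = [
    {"voice": "en-US-GuyNeural - Male", "rate": 10, "pitch": -2, "volume": 20},
    {"voice": "en-GB-SoniaNeural - Female", "rate": -5, "pitch": 3},  # no 'volume' key
]

for idx, settings in enumerate(speaker_settings):
    voice_short_name = settings["voice"].split(" - ")[0]
    rate_str = f"{settings['rate']:+d}%"
    pitch_str = f"{settings['pitch']:+d}Hz"
    # A missing 'volume' falls back to 0, which formats as "+0%" (no change in loudness).
    volume_str = f"{settings.get('volume', 0):+d}%"
    print(idx, voice_short_name, rate_str, pitch_str, volume_str)
```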
requirements.txt CHANGED
@@ -1,2 +1,2 @@
- edge_tts
+ edge_tts==7.1.0
  gradio