Commit cc98bbc
Parent: 2d65456

Add volume adjustment feature to text-to-speech and multi-speaker interfaces

Files changed:
- app.py (+45 -15)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -126,7 +126,7 @@ async def update_text_from_file(file):
         return "", gr.Warning("Failed to process the file")
 
 
-async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
+async def text_to_speech(text, voice, rate, pitch, volume, generate_subtitles=False, uploaded_file=None):
     """Convert text to speech, handling both direct text input and uploaded files"""
     if not text.strip() and uploaded_file is None:
         return None, None, "Please enter text or upload a file to convert."

@@ -156,6 +156,7 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
+    volume_str = f"{volume:+d}%"
 
     # Create temporary file for audio
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:

@@ -185,7 +186,9 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
             segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
 
             # Generate audio for this segment
-            communicate = edge_tts.Communicate(segment_text, voice_short_name, rate=rate_str, pitch=pitch_str)
+            communicate = edge_tts.Communicate(
+                segment_text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+            )
             await communicate.save(segment_file)
 
             audio_segments.append({

@@ -195,9 +198,6 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
                 'text': segment_text
             })
 
-        # Combine audio segments with proper timing
-        import wave
-        import audioop
         from pydub import AudioSegment
 
         # Initialize final audio

@@ -222,7 +222,9 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
                 f.write(f"{entry['text']}\n\n")
     else:
         # Use the existing approach for regular text
-        communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+        communicate = edge_tts.Communicate(
+            text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+        )
         if not generate_subtitles:
             await communicate.save(audio_path)
         if generate_subtitles:

@@ -304,8 +306,8 @@ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
     return audio_path, subtitle_path, None
 
 
-async def tts_interface(text, voice, rate, pitch, generate_subtitles, uploaded_file=None):
-    audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, generate_subtitles, uploaded_file)
+async def tts_interface(text, voice, rate, pitch, volume, generate_subtitles, uploaded_file=None):
+    audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, volume, generate_subtitles, uploaded_file)
     if warning:
         return audio, subtitle, gr.Warning(warning)
     return audio, subtitle, None

@@ -387,12 +389,15 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
             voice_short_name = speaker_settings[speaker_idx]['voice'].split(" - ")[0]
             rate_str = f"{speaker_settings[speaker_idx]['rate']:+d}%"
             pitch_str = f"{speaker_settings[speaker_idx]['pitch']:+d}Hz"
+            volume_str = f"{speaker_settings[speaker_idx].get('volume', 0):+d}%"
 
             # Create temporary file for this segment
             segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
 
             # Generate audio for this segment with speaker-specific settings
-            communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+            communicate = edge_tts.Communicate(
+                text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
+            )
 
             # For subtitle generation, we need word boundaries
             if generate_subtitles:

@@ -492,8 +497,8 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
 
     return final_audio_path, subtitle_path, None
 
-async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
-                                  speaker2_voice, speaker2_rate, speaker2_pitch):
+async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch, speaker1_volume,
+                                  speaker2_voice, speaker2_rate, speaker2_pitch, speaker2_volume):
     """Interface function for multi-speaker TTS"""
     # Create speaker settings from individual parameters
     speaker_settings = []

@@ -503,7 +508,8 @@ async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
         speaker_settings.append({
             'voice': speaker1_voice,
             'rate': speaker1_rate,
-            'pitch': speaker1_pitch
+            'pitch': speaker1_pitch,
+            'volume': speaker1_volume,
         })
 
     # Add Speaker 2 if voice is selected

@@ -511,7 +517,8 @@ async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
         speaker_settings.append({
             'voice': speaker2_voice,
             'rate': speaker2_rate,
-            'pitch': speaker2_pitch
+            'pitch': speaker2_pitch,
+            'volume': speaker2_volume,
         })
 
     if not speaker_settings:

@@ -564,6 +571,13 @@ async def create_demo():
                     label="Speech Rate Adjustment (%)",
                     step=1,
                 )
+                volume_slider = gr.Slider(
+                    minimum=-50,
+                    maximum=50,
+                    value=0,
+                    label="Volume Adjustment (%)",
+                    step=1,
+                )
                 pitch_slider = gr.Slider(
                     minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
                 )

@@ -608,7 +622,7 @@ async def create_demo():
         submit_single_btn.click(
             fn=tts_interface,
             api_name="predict",
-            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
+            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, volume_slider, subtitle_checkbox, file_input],
             outputs=single_outputs
         )
 

@@ -635,6 +649,13 @@ async def create_demo():
                     label="Speaker 1 Rate (%)",
                     step=1,
                 )
+                speaker1_volume = gr.Slider(
+                    minimum=-50,
+                    maximum=50,
+                    value=0,
+                    label="Speaker 1 Volume (%)",
+                    step=1,
+                )
                 speaker1_pitch = gr.Slider(
                     minimum=-20,
                     maximum=20,

@@ -656,6 +677,13 @@ async def create_demo():
                     label="Speaker 2 Rate (%)",
                     step=1,
                 )
+                speaker2_volume = gr.Slider(
+                    minimum=-50,
+                    maximum=50,
+                    value=0,
+                    label="Speaker 2 Volume (%)",
+                    step=1,
+                )
                 speaker2_pitch = gr.Slider(
                     minimum=-20,
                     maximum=20,

@@ -683,9 +711,11 @@ async def create_demo():
                 speaker1_voice,
                 speaker1_rate,
                 speaker1_pitch,
+                speaker1_volume,
                 speaker2_voice,
                 speaker2_rate,
-                speaker2_pitch
+                speaker2_pitch,
+                speaker2_volume,
             ],
             outputs=multi_outputs
         )
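Two small details in the app.py changes are worth noting: the :+d format spec guarantees a leading sign on the offset strings, and the multi-speaker path reads volume with .get('volume', 0), so speaker dicts assembled without a volume key fall back to no adjustment. A minimal illustration (the values are made up):

# Illustrative only: the signed formatting and the backward-compatible
# volume lookup used in multi_speaker_tts.
speaker = {'voice': 'en-US-GuyNeural - Guy', 'rate': 5, 'pitch': -2}   # no 'volume' key
rate_str = f"{speaker['rate']:+d}%"               # "+5%"
pitch_str = f"{speaker['pitch']:+d}Hz"            # "-2Hz"
volume_str = f"{speaker.get('volume', 0):+d}%"    # "+0%": missing key means no change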
requirements.txt CHANGED

@@ -1,2 +1,2 @@
-edge_tts
+edge_tts==7.1.0
 gradio
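For reference, a minimal standalone sketch of the call pattern this commit wires up, runnable outside the Gradio app. The sample text, output path, and voice name are illustrative assumptions, not taken from this repo; the rate/volume/pitch keyword arguments as signed strings match the edge_tts.Communicate calls in the diff above.

# Hedged, standalone illustration of the new volume path.
# Assumes the edge_tts package from PyPI (pinned above as edge_tts==7.1.0).
import asyncio

import edge_tts

async def demo() -> None:
    rate, pitch, volume = 0, 0, 25            # slider-style integer values
    communicate = edge_tts.Communicate(
        "Hello from the volume demo.",        # illustrative text
        "en-US-AriaNeural",                   # illustrative voice short name
        rate=f"{rate:+d}%",                   # "+0%"
        volume=f"{volume:+d}%",               # "+25%"; the app's sliders span -50..+50
        pitch=f"{pitch:+d}Hz",                # "+0Hz"
    )
    await communicate.save("demo.mp3")        # illustrative output path

asyncio.run(demo())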