Spaces:

tdurzynski
/

real-time-speech-translation

Running

App Files Files Community

tdurzynski committed on Feb 7

Commit

1dc3846

verified ·

1 Parent(s): 85f8d5f

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -38

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Speech Translation Demo with Restart and TTS
 This demo performs the following:
   1. Accepts up to 15 seconds of audio recording from the microphone.
@@ -7,10 +7,10 @@ This demo performs the following:
   3. Splits the transcription into segments and translates each segment
      on-the-fly using Facebook’s M2M100 model.
   4. Streams the cumulative translation output to the user.
-  5. Provides a "Restart Recording" button that resets the audio input and translation output.
-  6. Offers a "Read Translated Text" button that converts the final translation
-     into speech using gTTS.
 Note: True real-time translation (i.e. while speaking) requires a continuous streaming
 solution which is not provided by the standard browser microphone input.
 """
@@ -25,8 +25,8 @@ import uuid
 # -----------------------------------------------------------------------------
 # Global Model Loading
 # -----------------------------------------------------------------------------
-# Load the Whisper model (using the "base" model for a balance between speed and accuracy).
-whisper_model = whisper.load_model("base")  # Change model size as needed
 # Load the M2M100 model and tokenizer for translation.
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
@@ -45,7 +45,7 @@ LANGUAGES = {
 }
 # -----------------------------------------------------------------------------
-# Main Processing Function
 # -----------------------------------------------------------------------------
 def translate_audio(audio, target_language):
     """
@@ -64,7 +64,7 @@ def translate_audio(audio, target_language):
     cumulative_translation = ""
     for segment in result.get("segments", []):
         segment_text = segment.get("text", "").strip()
-        if segment_text == "":
             continue
         if source_lang == target_lang_code:
@@ -82,15 +82,6 @@ def translate_audio(audio, target_language):
         cumulative_translation += translated_segment + " "
         yield cumulative_translation.strip()
-# -----------------------------------------------------------------------------
-# Restart Function
-# -----------------------------------------------------------------------------
-def restart_recording():
-    """
-    Reset the recording section by clearing the audio input and the translation output.
-    """
-    return None, ""
 # -----------------------------------------------------------------------------
 # TTS Generation Function
 # -----------------------------------------------------------------------------
@@ -108,19 +99,28 @@ def generate_tts(text, target_language):
     return filename
 # -----------------------------------------------------------------------------
-# Gradio Interface Definition
 # -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Real-time Speech Translation Demo")
     gr.Markdown(
         "Speak into the microphone and your speech will be transcribed and translated "
         "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
-        "**Note:** Due to browser limitations, the translation starts after you stop recording. "
-        "For a truly real-time experience, a continuous streaming solution would be required."
     )
     with gr.Row():
-        # Use 'sources' (list) to specify that the microphone is an input source.
         audio_input = gr.Audio(
             sources=["microphone"],
             type="filepath",
@@ -133,35 +133,31 @@ with gr.Blocks() as demo:
             label="Select Target Language"
         )
-    # Output textbox for displaying the (streaming) translation.
-    output_text = gr.Textbox(label="Translated Text", lines=10)
     with gr.Row():
         restart_button = gr.Button("Restart Recording")
-        read_aloud_button = gr.Button("Read Translated Text")
-    # Audio output for the TTS result.
     tts_audio = gr.Audio(label="Translated Speech", type="filepath")
-    # When new audio is recorded, stream the translation.
     audio_input.change(
         fn=translate_audio,
         inputs=[audio_input, target_lang_dropdown],
-        outputs=output_text
     )
-    # When the restart button is clicked, clear both the audio input and translation output.
     restart_button.click(
         fn=restart_recording,
         inputs=[],
-        outputs=[audio_input, output_text]
-    )
-    # When the read aloud button is clicked, generate TTS from the translated text.
-    read_aloud_button.click(
-        fn=generate_tts,
-        inputs=[output_text, target_lang_dropdown],
-        outputs=tts_audio
     )
 # Launch the Gradio app (suitable for Hugging Face Spaces).

 """
+Speech Translation Demo with Automatic TTS and Restart Option
 This demo performs the following:
   1. Accepts up to 15 seconds of audio recording from the microphone.
   3. Splits the transcription into segments and translates each segment
      on-the-fly using Facebook’s M2M100 model.
   4. Streams the cumulative translation output to the user.
+  5. Automatically converts the final translated text to speech using gTTS.
+  6. Provides a "Restart Recording" button (located just below the recording section)
+     to reset the audio input, translated text, and TTS output.
 Note: True real-time translation (i.e. while speaking) requires a continuous streaming
 solution which is not provided by the standard browser microphone input.
 """
 # -----------------------------------------------------------------------------
 # Global Model Loading
 # -----------------------------------------------------------------------------
+# Load the Whisper model (using "base" for a balance between speed and accuracy).
+whisper_model = whisper.load_model("base")  # Adjust model size as needed
 # Load the M2M100 model and tokenizer for translation.
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 }
 # -----------------------------------------------------------------------------
+# Main Processing Function: Translation (streaming)
 # -----------------------------------------------------------------------------
 def translate_audio(audio, target_language):
     """
     cumulative_translation = ""
     for segment in result.get("segments", []):
         segment_text = segment.get("text", "").strip()
+        if not segment_text:
             continue
         if source_lang == target_lang_code:
         cumulative_translation += translated_segment + " "
         yield cumulative_translation.strip()
 # -----------------------------------------------------------------------------
 # TTS Generation Function
 # -----------------------------------------------------------------------------
     return filename
 # -----------------------------------------------------------------------------
+# Restart Function
+# -----------------------------------------------------------------------------
+def restart_recording():
+    """
+    Reset the recording section by clearing the audio input, the translation textbox,
+    and the TTS audio output.
+    """
+    return None, "", None
+# -----------------------------------------------------------------------------
+# Gradio Interface Definition with Updated Layout and Chained Events
 # -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Real-time Speech Translation Demo")
     gr.Markdown(
         "Speak into the microphone and your speech will be transcribed and translated "
         "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
+        "**Note:** The translation and speech synthesis occur automatically after recording."
     )
+    # Top row: Audio input and target language selection.
     with gr.Row():
         audio_input = gr.Audio(
             sources=["microphone"],
             type="filepath",
             label="Select Target Language"
         )
+    # Restart Recording button placed directly below the recording section.
     with gr.Row():
         restart_button = gr.Button("Restart Recording")
+    # Output components: Translated text and TTS audio.
+    output_text = gr.Textbox(label="Translated Text", lines=10)
     tts_audio = gr.Audio(label="Translated Speech", type="filepath")
+    # Chain the audio input change event: first stream translation text, then automatically generate TTS.
     audio_input.change(
         fn=translate_audio,
         inputs=[audio_input, target_lang_dropdown],
+        outputs=output_text,
+        stream=True
+    ).then(
+        fn=generate_tts,
+        inputs=[output_text, target_lang_dropdown],
+        outputs=tts_audio
     )
+    # Restart button clears the audio input, translation text, and TTS output.
     restart_button.click(
         fn=restart_recording,
         inputs=[],
+        outputs=[audio_input, output_text, tts_audio]
     )
 # Launch the Gradio app (suitable for Hugging Face Spaces).