Spaces:

tdurzynski
/

real-time-speech-translation

Running

App Files Community

tdurzynski committed on Feb 7

Commit

4e7d1a7

verified ·

1 Parent(s): 5351689

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -13

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Real-time Speech Translation Demo with Restart Option
 This demo performs the following:
   1. Accepts up to 15 seconds of audio recording from the microphone.
@@ -7,16 +7,20 @@ This demo performs the following:
   3. Splits the transcription into segments and translates each segment
      on-the-fly using Facebook’s M2M100 model.
   4. Streams the cumulative translation output to the user.
-  5. Provides a "Restart Recording" button that resets the audio input and
-     translation output.
-Make sure to install all dependencies from requirements.txt.
 """
 import gradio as gr
 import whisper
 import torch
 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 # -----------------------------------------------------------------------------
 # Global Model Loading
@@ -29,9 +33,8 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 # -----------------------------------------------------------------------------
-# Define Supported Languages
 # -----------------------------------------------------------------------------
-# Added Polish as one of the supported languages.
 LANGUAGES = {
     "English": "en",
     "Spanish": "es",
@@ -85,12 +88,25 @@ def translate_audio(audio, target_language):
 def restart_recording():
     """
     Reset the recording section by clearing the audio input and the translation output.
-    Returns:
-      - None for the audio input (clearing it)
-      - An empty string for the translation textbox.
     """
     return None, ""
 # -----------------------------------------------------------------------------
 # Gradio Interface Definition
 # -----------------------------------------------------------------------------
@@ -98,7 +114,9 @@ with gr.Blocks() as demo:
     gr.Markdown("# Real-time Speech Translation Demo")
     gr.Markdown(
         "Speak into the microphone and your speech will be transcribed and translated "
-        "segment-by-segment. (Recording is limited to 15 seconds.)"
     )
     with gr.Row():
@@ -118,8 +136,12 @@ with gr.Blocks() as demo:
     # Output textbox for displaying the (streaming) translation.
     output_text = gr.Textbox(label="Translated Text", lines=10)
-    # Restart button to clear the current recording and translation.
-    restart_button = gr.Button("Restart Recording")
     # When new audio is recorded, stream the translation.
     audio_input.change(
@@ -134,6 +156,13 @@ with gr.Blocks() as demo:
         inputs=[],
         outputs=[audio_input, output_text]
     )
 # Launch the Gradio app (suitable for Hugging Face Spaces).
 demo.launch()

 """
+Speech Translation Demo with Restart and TTS
 This demo performs the following:
   1. Accepts up to 15 seconds of audio recording from the microphone.
   3. Splits the transcription into segments and translates each segment
      on-the-fly using Facebook’s M2M100 model.
   4. Streams the cumulative translation output to the user.
+  5. Provides a "Restart Recording" button that resets the audio input and translation output.
+  6. Offers a "Read Translated Text" button that converts the final translation
+     into speech using gTTS.
+Note: True real-time translation (i.e., translating while the user is still speaking) requires
+a continuous streaming solution, which the standard browser microphone input does not provide.
 """
 import gradio as gr
 import whisper
 import torch
 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+from gtts import gTTS
+import uuid
 # -----------------------------------------------------------------------------
 # Global Model Loading
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 # -----------------------------------------------------------------------------
+# Define Supported Languages (including Polish)
 # -----------------------------------------------------------------------------
 LANGUAGES = {
     "English": "en",
     "Spanish": "es",
 def restart_recording():
     """
     Reset the recording section by clearing the audio input and the translation output.
 
     Returns:
         A (None, "") pair: None for the audio input (clearing it) and an
         empty string for the translation textbox.
     """
     return None, ""
+# -----------------------------------------------------------------------------
+# TTS Generation Function
+# -----------------------------------------------------------------------------
+def generate_tts(text, target_language):
+    """
+    Convert the translated text to speech using gTTS.
+
+    Args:
+        text: The translated text to be spoken.
+        target_language: A key of LANGUAGES (e.g. "English"); a name that is
+            not in LANGUAGES falls back to the "en" language code.
+
+    Returns:
+        The filename of the generated audio file (tts_<hex>.mp3), or None
+        when the text is empty or contains only whitespace.
+    """
+    lang_code = LANGUAGES.get(target_language, "en")
+    # Nothing to synthesize: return None instead of calling gTTS.
+    if not text or not text.strip():
+        return None
+    # A random UUID keeps the filenames of successive requests from colliding.
+    # NOTE(review): the name has no directory part, so the file presumably lands
+    # in the working directory and is not removed by this function - confirm.
+    filename = f"tts_{uuid.uuid4().hex}.mp3"
+    tts = gTTS(text=text, lang=lang_code)
+    tts.save(filename)
+    return filename
 # -----------------------------------------------------------------------------
 # Gradio Interface Definition
 # -----------------------------------------------------------------------------
     gr.Markdown("# Real-time Speech Translation Demo")
     gr.Markdown(
         "Speak into the microphone and your speech will be transcribed and translated "
+        "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
+        "**Note:** Due to browser limitations, the translation starts after you stop recording. "
+        "For a truly real-time experience, a continuous streaming solution would be required."
     )
     with gr.Row():
     # Output textbox for displaying the (streaming) translation.
     output_text = gr.Textbox(label="Translated Text", lines=10)
+    with gr.Row():
+        restart_button = gr.Button("Restart Recording")
+        read_aloud_button = gr.Button("Read Translated Text")
+    # Audio output for the TTS result.
+    tts_audio = gr.Audio(label="Translated Speech", type="filepath")
     # When new audio is recorded, stream the translation.
     audio_input.change(
         inputs=[],
         outputs=[audio_input, output_text]
     )
+    # When the read aloud button is clicked, generate TTS from the translated text.
+    read_aloud_button.click(
+        fn=generate_tts,
+        inputs=[output_text, target_lang_dropdown],
+        outputs=tts_audio
+    )
 # Launch the Gradio app (suitable for Hugging Face Spaces).
 demo.launch()