Spaces:

tdurzynski
/

real-time-speech-translation

Running

App Files Community

tdurzynski committed on Feb 7

Commit

5351689

verified ·

1 Parent(s): 443fc27

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -46

app.py CHANGED Viewed

@@ -1,12 +1,14 @@
 """
-Real-time Speech Translation Demo
 This demo performs the following:
-  1. Accepts a 15-second audio recording from the microphone.
   2. Uses OpenAI’s Whisper model to transcribe the speech.
-  3. Splits the transcription into segments (each roughly corresponding to a sentence).
-  4. Translates each segment on-the-fly using Facebook’s M2M100 model (via Hugging Face Transformers).
-  5. Streams the cumulative translation output to the user.
 Make sure to install all dependencies from requirements.txt.
 """
@@ -20,25 +22,23 @@ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 # Global Model Loading
 # -----------------------------------------------------------------------------
 # Load the Whisper model (using the "base" model for a balance between speed and accuracy).
-# Note: Loading models may take a few seconds on startup.
-whisper_model = whisper.load_model("base")  # You can choose a larger model if desired
 # Load the M2M100 model and tokenizer for translation.
-# The "facebook/m2m100_418M" model supports translation between many languages.
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 # -----------------------------------------------------------------------------
 # Define Supported Languages
 # -----------------------------------------------------------------------------
-# We define a mapping from display names to language codes used by M2M100.
-# (For a full list of supported languages see the M2M100 docs.)
 LANGUAGES = {
     "English": "en",
     "Spanish": "es",
     "French": "fr",
     "German": "de",
-    "Chinese": "zh"
 }
 # -----------------------------------------------------------------------------
@@ -47,63 +47,50 @@ LANGUAGES = {
 def translate_audio(audio, target_language):
     """
     Process the input audio, transcribe it using Whisper, and translate each segment
-    to the chosen target language. Yields a cumulative translation string for streaming.
-    Parameters:
-      audio (str): Path to the recorded audio file.
-      target_language (str): Display name of the target language (e.g., "English").
-    Yields:
-      str: The cumulative translated text after processing each segment.
     """
     if audio is None:
         yield "No audio provided."
         return
-    # Transcribe the audio file using Whisper.
-    # Using fp16=False to ensure compatibility on CPUs.
     result = whisper_model.transcribe(audio, fp16=False)
-    # Extract the detected source language from the transcription result.
-    # (Whisper returns a language code, for example "en" for English.)
     source_lang = result.get("language", "en")
-    # Get the target language code from our mapping; default to English if not found.
     target_lang_code = LANGUAGES.get(target_language, "en")
     cumulative_translation = ""
-    # Iterate over each segment from the transcription.
-    # Each segment is a dict with keys such as "start", "end", and "text".
     for segment in result.get("segments", []):
-        # Clean up the segment text.
         segment_text = segment.get("text", "").strip()
         if segment_text == "":
             continue
-        # If the source and target languages are the same, no translation is needed.
         if source_lang == target_lang_code:
             translated_segment = segment_text
         else:
-            # Set the tokenizer's source language for proper translation.
             tokenizer.src_lang = source_lang
-            # Tokenize the segment text.
             encoded = tokenizer(segment_text, return_tensors="pt")
-            # Generate translation tokens.
-            # The 'forced_bos_token_id' parameter forces the model to generate text in the target language.
             generated_tokens = m2m100_model.generate(
                 **encoded,
                 forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
             )
-            # Decode the tokens to obtain the translated text.
             translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-        # Append the new translation segment to the cumulative output.
         cumulative_translation += translated_segment + " "
-        # Yield the updated cumulative translation to simulate streaming output.
         yield cumulative_translation.strip()
 # -----------------------------------------------------------------------------
 # Gradio Interface Definition
 # -----------------------------------------------------------------------------
@@ -115,14 +102,13 @@ with gr.Blocks() as demo:
     )
     with gr.Row():
-        # Audio input: records from the microphone.
         audio_input = gr.Audio(
             sources=["microphone"],
             type="filepath",
             label="Record your speech (max 15 seconds)",
             elem_id="audio_input"
         )
-        # Dropdown to select the target language.
         target_lang_dropdown = gr.Dropdown(
             choices=list(LANGUAGES.keys()),
             value="English",
@@ -132,13 +118,22 @@ with gr.Blocks() as demo:
     # Output textbox for displaying the (streaming) translation.
     output_text = gr.Textbox(label="Translated Text", lines=10)
-    # Connect the audio input and dropdown to our translation function.
-    # Since translate_audio is a generator (it yields partial results), Gradio will stream the output.
     audio_input.change(
         fn=translate_audio,
         inputs=[audio_input, target_lang_dropdown],
         outputs=output_text
     )
 # Launch the Gradio app (suitable for Hugging Face Spaces).
 demo.launch()

 """
+Real-time Speech Translation Demo with Restart Option
 This demo performs the following:
+  1. Accepts up to 15 seconds of audio recording from the microphone.
   2. Uses OpenAI’s Whisper model to transcribe the speech.
+  3. Splits the transcription into segments and translates each segment
+     on-the-fly using Facebook’s M2M100 model.
+  4. Streams the cumulative translation output to the user.
+  5. Provides a "Restart Recording" button that resets the audio input and
+     translation output.
 Make sure to install all dependencies from requirements.txt.
 """
 # Global Model Loading
 # -----------------------------------------------------------------------------
+# Both models below are loaded once, at module import, and are used as
+# globals by translate_audio().
 # Load the Whisper model (using the "base" model for a balance between speed and accuracy).
+whisper_model = whisper.load_model("base")  # Change model size as needed
 # Load the M2M100 model and tokenizer for translation.
+# The tokenizer and the model are loaded from the same checkpoint name.
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 # -----------------------------------------------------------------------------
 # Define Supported Languages
 # -----------------------------------------------------------------------------
+# Added Polish as one of the supported languages.
+# Maps the display names offered in the target-language dropdown to the
+# language codes that translate_audio() passes to tokenizer.get_lang_id().
 LANGUAGES = {
     "English": "en",
     "Spanish": "es",
     "French": "fr",
     "German": "de",
+    "Chinese": "zh",
+    "Polish": "pl"
 }
 # -----------------------------------------------------------------------------
 def translate_audio(audio, target_language):
     """
     Process the input audio, transcribe it using Whisper, and translate each segment
+    to the chosen target language. Yields cumulative translation output for streaming.
+
+    Parameters:
+      audio: Value of the audio input component (configured with
+        type="filepath"); passed unchanged to whisper_model.transcribe().
+        None means there is no recording.
+      target_language (str): Display name looked up in LANGUAGES
+        (e.g., "English"); names not in the mapping fall back to "en".
+    Yields:
+      str: "No audio provided." when audio is None; otherwise the stripped
+        cumulative translation after each non-empty segment.
     """
     if audio is None:
         yield "No audio provided."
         return
+    # Transcribe the audio using Whisper (fp16=False for CPU compatibility)
     result = whisper_model.transcribe(audio, fp16=False)
+    # Source language as reported in the transcription result; "en" if absent.
+    # NOTE(review): this code is handed to the M2M100 tokenizer unchecked;
+    # confirm every language Whisper can report is accepted there.
     source_lang = result.get("language", "en")
     target_lang_code = LANGUAGES.get(target_language, "en")
     cumulative_translation = ""
     for segment in result.get("segments", []):
         segment_text = segment.get("text", "").strip()
+        # Skip segments that are empty after stripping; nothing is yielded for them.
         if segment_text == "":
             continue
+        # Same language on both sides: pass the text through untranslated.
         if source_lang == target_lang_code:
             translated_segment = segment_text
         else:
+            # Set the source language for proper translation.
             tokenizer.src_lang = source_lang
             encoded = tokenizer(segment_text, return_tensors="pt")
+            # forced_bos_token_id forces generation in the target language.
             generated_tokens = m2m100_model.generate(
                 **encoded,
                 forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
             )
+            # One input string was encoded, so take the single decoded result.
             translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+        # A trailing space separates segments; strip() drops it from the yielded text.
         cumulative_translation += translated_segment + " "
         yield cumulative_translation.strip()
+# -----------------------------------------------------------------------------
+# Restart Function
+# -----------------------------------------------------------------------------
+def restart_recording():
+    """
+    Reset the recording section by clearing the audio input and the translation output.
+    Returns:
+      - None for the audio input (clearing it)
+      - An empty string for the translation textbox.
+    The order of the returned pair matches outputs=[audio_input, output_text]
+    on the restart button's click handler.
+    """
+    # NOTE(review): audio_input.change is wired to translate_audio, so clearing
+    # the audio may re-trigger it with None and put "No audio provided." into
+    # the textbox. Confirm against Gradio's change-event behaviour.
+    return None, ""
 # -----------------------------------------------------------------------------
 # Gradio Interface Definition
 # -----------------------------------------------------------------------------
     )
     with gr.Row():
+        # Use 'sources' (list) to specify that the microphone is an input source.
         audio_input = gr.Audio(
             sources=["microphone"],
             type="filepath",
             label="Record your speech (max 15 seconds)",
             elem_id="audio_input"
         )
         target_lang_dropdown = gr.Dropdown(
             choices=list(LANGUAGES.keys()),
             value="English",
     # Output textbox for displaying the (streaming) translation.
     output_text = gr.Textbox(label="Translated Text", lines=10)
+    # Restart button to clear the current recording and translation.
+    restart_button = gr.Button("Restart Recording")
+    # When new audio is recorded, stream the translation.
     audio_input.change(
         fn=translate_audio,
         inputs=[audio_input, target_lang_dropdown],
         outputs=output_text
     )
+    # When the restart button is clicked, clear both the audio input and translation output.
+    restart_button.click(
+        fn=restart_recording,
+        inputs=[],
+        outputs=[audio_input, output_text]
+    )
 # Launch the Gradio app (suitable for Hugging Face Spaces).
 demo.launch()