tdurzynski committed on
Commit
4e7d1a7
·
verified ·
1 Parent(s): 5351689

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -13
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Real-time Speech Translation Demo with Restart Option
3
 
4
  This demo performs the following:
5
  1. Accepts up to 15 seconds of audio recording from the microphone.
@@ -7,16 +7,20 @@ This demo performs the following:
7
  3. Splits the transcription into segments and translates each segment
8
  on-the-fly using Facebook’s M2M100 model.
9
  4. Streams the cumulative translation output to the user.
10
- 5. Provides a "Restart Recording" button that resets the audio input and
11
- translation output.
12
-
13
- Make sure to install all dependencies from requirements.txt.
 
 
14
  """
15
 
16
  import gradio as gr
17
  import whisper
18
  import torch
19
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 
 
20
 
21
  # -----------------------------------------------------------------------------
22
  # Global Model Loading
@@ -29,9 +33,8 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
29
  m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
30
 
31
  # -----------------------------------------------------------------------------
32
- # Define Supported Languages
33
  # -----------------------------------------------------------------------------
34
- # Added Polish as one of the supported languages.
35
  LANGUAGES = {
36
  "English": "en",
37
  "Spanish": "es",
@@ -85,12 +88,25 @@ def translate_audio(audio, target_language):
85
  def restart_recording():
86
  """
87
  Reset the recording section by clearing the audio input and the translation output.
88
- Returns:
89
- - None for the audio input (clearing it)
90
- - An empty string for the translation textbox.
91
  """
92
  return None, ""
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  # -----------------------------------------------------------------------------
95
  # Gradio Interface Definition
96
  # -----------------------------------------------------------------------------
@@ -98,7 +114,9 @@ with gr.Blocks() as demo:
98
  gr.Markdown("# Real-time Speech Translation Demo")
99
  gr.Markdown(
100
  "Speak into the microphone and your speech will be transcribed and translated "
101
- "segment-by-segment. (Recording is limited to 15 seconds.)"
 
 
102
  )
103
 
104
  with gr.Row():
@@ -118,8 +136,12 @@ with gr.Blocks() as demo:
118
  # Output textbox for displaying the (streaming) translation.
119
  output_text = gr.Textbox(label="Translated Text", lines=10)
120
 
121
- # Restart button to clear the current recording and translation.
122
- restart_button = gr.Button("Restart Recording")
 
 
 
 
123
 
124
  # When new audio is recorded, stream the translation.
125
  audio_input.change(
@@ -134,6 +156,13 @@ with gr.Blocks() as demo:
134
  inputs=[],
135
  outputs=[audio_input, output_text]
136
  )
 
 
 
 
 
 
 
137
 
138
  # Launch the Gradio app (suitable for Hugging Face Spaces).
139
  demo.launch()
 
1
  """
2
+ Speech Translation Demo with Restart and TTS
3
 
4
  This demo performs the following:
5
  1. Accepts up to 15 seconds of audio recording from the microphone.
 
7
  3. Splits the transcription into segments and translates each segment
8
  on-the-fly using Facebook’s M2M100 model.
9
  4. Streams the cumulative translation output to the user.
10
+ 5. Provides a "Restart Recording" button that resets the audio input and translation output.
11
+ 6. Offers a "Read Translated Text" button that converts the final translation
12
+ into speech using gTTS.
13
+
14
+ Note: True real-time translation (i.e., while the user is still speaking) requires a
15
+ continuous streaming solution, which the standard browser microphone input does not provide.
16
  """
17
 
18
  import gradio as gr
19
  import whisper
20
  import torch
21
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
22
+ from gtts import gTTS
23
+ import uuid
24
 
25
  # -----------------------------------------------------------------------------
26
  # Global Model Loading
 
33
  m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
34
 
35
  # -----------------------------------------------------------------------------
36
+ # Define Supported Languages (including Polish)
37
  # -----------------------------------------------------------------------------
 
38
  LANGUAGES = {
39
  "English": "en",
40
  "Spanish": "es",
 
88
def restart_recording():
    """
    Clear the recording section so a new recording can be started.

    Returns a pair: None (clears the audio input) and an empty string
    (clears the translation output).
    """
    cleared_audio, cleared_text = None, ""
    return cleared_audio, cleared_text
93
 
94
+ # -----------------------------------------------------------------------------
95
+ # TTS Generation Function
96
+ # -----------------------------------------------------------------------------
97
def generate_tts(text, target_language):
    """
    Convert the translated text to speech using gTTS.

    Args:
        text: Translated text to read aloud. None, empty, or
            whitespace-only text produces no audio.
        target_language: Display name used as a key into LANGUAGES;
            names not found there fall back to English ("en").

    Returns:
        Path of the generated MP3 file, or None when there is nothing
        to read.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    # Bail out before any lookup or synthesis work when there is nothing to say.
    if not text or not text.strip():
        return None

    # NOTE(review): LANGUAGES is also used for the translation model; confirm
    # every code in it is accepted by gTTS as well.
    lang_code = LANGUAGES.get(target_language, "en")

    # Write into the system temp directory instead of the working directory so
    # generated clips do not pile up next to the app's source files.
    filename = os.path.join(
        tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.mp3"
    )
    tts = gTTS(text=text, lang=lang_code)
    tts.save(filename)
    return filename
109
+
110
  # -----------------------------------------------------------------------------
111
  # Gradio Interface Definition
112
  # -----------------------------------------------------------------------------
 
114
  gr.Markdown("# Real-time Speech Translation Demo")
115
  gr.Markdown(
116
  "Speak into the microphone and your speech will be transcribed and translated "
117
+ "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
118
+ "**Note:** Due to browser limitations, the translation starts after you stop recording. "
119
+ "For a truly real-time experience, a continuous streaming solution would be required."
120
  )
121
 
122
  with gr.Row():
 
136
  # Output textbox for displaying the (streaming) translation.
137
  output_text = gr.Textbox(label="Translated Text", lines=10)
138
 
139
+ with gr.Row():
140
+ restart_button = gr.Button("Restart Recording")
141
+ read_aloud_button = gr.Button("Read Translated Text")
142
+
143
+ # Audio output for the TTS result.
144
+ tts_audio = gr.Audio(label="Translated Speech", type="filepath")
145
 
146
  # When new audio is recorded, stream the translation.
147
  audio_input.change(
 
156
  inputs=[],
157
  outputs=[audio_input, output_text]
158
  )
159
+
160
+ # When the read aloud button is clicked, generate TTS from the translated text.
161
+ read_aloud_button.click(
162
+ fn=generate_tts,
163
+ inputs=[output_text, target_lang_dropdown],
164
+ outputs=tts_audio
165
+ )
166
 
167
  # Launch the Gradio app (suitable for Hugging Face Spaces).
168
  demo.launch()