tdurzynski committed on
Commit
1dc3846
·
verified ·
1 Parent(s): 85f8d5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -38
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Speech Translation Demo with Restart and TTS
3
 
4
  This demo performs the following:
5
  1. Accepts up to 15 seconds of audio recording from the microphone.
@@ -7,10 +7,10 @@ This demo performs the following:
7
  3. Splits the transcription into segments and translates each segment
8
  on-the-fly using Facebook’s M2M100 model.
9
  4. Streams the cumulative translation output to the user.
10
- 5. Provides a "Restart Recording" button that resets the audio input and translation output.
11
- 6. Offers a "Read Translated Text" button that converts the final translation
12
- into speech using gTTS.
13
-
14
  Note: True real-time translation (i.e. while speaking) requires a continuous streaming
15
  solution which is not provided by the standard browser microphone input.
16
  """
@@ -25,8 +25,8 @@ import uuid
25
  # -----------------------------------------------------------------------------
26
  # Global Model Loading
27
  # -----------------------------------------------------------------------------
28
- # Load the Whisper model (using the "base" model for a balance between speed and accuracy).
29
- whisper_model = whisper.load_model("base") # Change model size as needed
30
 
31
  # Load the M2M100 model and tokenizer for translation.
32
  tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
@@ -45,7 +45,7 @@ LANGUAGES = {
45
  }
46
 
47
  # -----------------------------------------------------------------------------
48
- # Main Processing Function
49
  # -----------------------------------------------------------------------------
50
  def translate_audio(audio, target_language):
51
  """
@@ -64,7 +64,7 @@ def translate_audio(audio, target_language):
64
  cumulative_translation = ""
65
  for segment in result.get("segments", []):
66
  segment_text = segment.get("text", "").strip()
67
- if segment_text == "":
68
  continue
69
 
70
  if source_lang == target_lang_code:
@@ -82,15 +82,6 @@ def translate_audio(audio, target_language):
82
  cumulative_translation += translated_segment + " "
83
  yield cumulative_translation.strip()
84
 
85
- # -----------------------------------------------------------------------------
86
- # Restart Function
87
- # -----------------------------------------------------------------------------
88
- def restart_recording():
89
- """
90
- Reset the recording section by clearing the audio input and the translation output.
91
- """
92
- return None, ""
93
-
94
  # -----------------------------------------------------------------------------
95
  # TTS Generation Function
96
  # -----------------------------------------------------------------------------
@@ -108,19 +99,28 @@ def generate_tts(text, target_language):
108
  return filename
109
 
110
  # -----------------------------------------------------------------------------
111
- # Gradio Interface Definition
 
 
 
 
 
 
 
 
 
 
112
  # -----------------------------------------------------------------------------
113
  with gr.Blocks() as demo:
114
  gr.Markdown("# Real-time Speech Translation Demo")
115
  gr.Markdown(
116
  "Speak into the microphone and your speech will be transcribed and translated "
117
  "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
118
- "**Note:** Due to browser limitations, the translation starts after you stop recording. "
119
- "For a truly real-time experience, a continuous streaming solution would be required."
120
  )
121
 
 
122
  with gr.Row():
123
- # Use 'sources' (list) to specify that the microphone is an input source.
124
  audio_input = gr.Audio(
125
  sources=["microphone"],
126
  type="filepath",
@@ -133,35 +133,31 @@ with gr.Blocks() as demo:
133
  label="Select Target Language"
134
  )
135
 
136
- # Output textbox for displaying the (streaming) translation.
137
- output_text = gr.Textbox(label="Translated Text", lines=10)
138
-
139
  with gr.Row():
140
  restart_button = gr.Button("Restart Recording")
141
- read_aloud_button = gr.Button("Read Translated Text")
142
 
143
- # Audio output for the TTS result.
 
144
  tts_audio = gr.Audio(label="Translated Speech", type="filepath")
145
 
146
- # When new audio is recorded, stream the translation.
147
  audio_input.change(
148
  fn=translate_audio,
149
  inputs=[audio_input, target_lang_dropdown],
150
- outputs=output_text
 
 
 
 
 
151
  )
152
 
153
- # When the restart button is clicked, clear both the audio input and translation output.
154
  restart_button.click(
155
  fn=restart_recording,
156
  inputs=[],
157
- outputs=[audio_input, output_text]
158
- )
159
-
160
- # When the read aloud button is clicked, generate TTS from the translated text.
161
- read_aloud_button.click(
162
- fn=generate_tts,
163
- inputs=[output_text, target_lang_dropdown],
164
- outputs=tts_audio
165
  )
166
 
167
  # Launch the Gradio app (suitable for Hugging Face Spaces).
 
1
  """
2
+ Speech Translation Demo with Automatic TTS and Restart Option
3
 
4
  This demo performs the following:
5
  1. Accepts up to 15 seconds of audio recording from the microphone.
 
7
  3. Splits the transcription into segments and translates each segment
8
  on-the-fly using Facebook’s M2M100 model.
9
  4. Streams the cumulative translation output to the user.
10
+ 5. Automatically converts the final translated text to speech using gTTS.
11
+ 6. Provides a "Restart Recording" button (located just below the recording section)
12
+ to reset the audio input, translated text, and TTS output.
13
+
14
  Note: True real-time translation (i.e. while speaking) requires a continuous streaming
15
  solution which is not provided by the standard browser microphone input.
16
  """
 
25
  # -----------------------------------------------------------------------------
26
  # Global Model Loading
27
  # -----------------------------------------------------------------------------
28
+ # Load the Whisper model (using "base" for a balance between speed and accuracy).
29
+ whisper_model = whisper.load_model("base") # Adjust model size as needed
30
 
31
  # Load the M2M100 model and tokenizer for translation.
32
  tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 
45
  }
46
 
47
  # -----------------------------------------------------------------------------
48
+ # Main Processing Function: Translation (streaming)
49
  # -----------------------------------------------------------------------------
50
  def translate_audio(audio, target_language):
51
  """
 
64
  cumulative_translation = ""
65
  for segment in result.get("segments", []):
66
  segment_text = segment.get("text", "").strip()
67
+ if not segment_text:
68
  continue
69
 
70
  if source_lang == target_lang_code:
 
82
  cumulative_translation += translated_segment + " "
83
  yield cumulative_translation.strip()
84
 
 
 
 
 
 
 
 
 
 
85
  # -----------------------------------------------------------------------------
86
  # TTS Generation Function
87
  # -----------------------------------------------------------------------------
 
99
  return filename
100
 
101
  # -----------------------------------------------------------------------------
102
+ # Restart Function
103
+ # -----------------------------------------------------------------------------
104
+ def restart_recording():
105
+ """
106
+ Reset the recording section by clearing the audio input, the translation textbox,
107
+ and the TTS audio output.
108
+ """
109
+ return None, "", None
110
+
111
+ # -----------------------------------------------------------------------------
112
+ # Gradio Interface Definition with Updated Layout and Chained Events
113
  # -----------------------------------------------------------------------------
114
  with gr.Blocks() as demo:
115
  gr.Markdown("# Real-time Speech Translation Demo")
116
  gr.Markdown(
117
  "Speak into the microphone and your speech will be transcribed and translated "
118
  "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
119
+ "**Note:** The translation and speech synthesis occur automatically after recording."
 
120
  )
121
 
122
+ # Top row: Audio input and target language selection.
123
  with gr.Row():
 
124
  audio_input = gr.Audio(
125
  sources=["microphone"],
126
  type="filepath",
 
133
  label="Select Target Language"
134
  )
135
 
136
+ # Restart Recording button placed directly below the recording section.
 
 
137
  with gr.Row():
138
  restart_button = gr.Button("Restart Recording")
 
139
 
140
+ # Output components: Translated text and TTS audio.
141
+ output_text = gr.Textbox(label="Translated Text", lines=10)
142
  tts_audio = gr.Audio(label="Translated Speech", type="filepath")
143
 
144
+ # Chain the audio input change event: first stream translation text, then automatically generate TTS.
145
  audio_input.change(
146
  fn=translate_audio,
147
  inputs=[audio_input, target_lang_dropdown],
148
+ outputs=output_text,
149
+ stream=True
150
+ ).then(
151
+ fn=generate_tts,
152
+ inputs=[output_text, target_lang_dropdown],
153
+ outputs=tts_audio
154
  )
155
 
156
+ # Restart button clears the audio input, translation text, and TTS output.
157
  restart_button.click(
158
  fn=restart_recording,
159
  inputs=[],
160
+ outputs=[audio_input, output_text, tts_audio]
 
 
 
 
 
 
 
161
  )
162
 
163
  # Launch the Gradio app (suitable for Hugging Face Spaces).