Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
"""
|
2 |
-
|
3 |
|
4 |
This demo performs the following:
|
5 |
1. Accepts up to 15 seconds of audio recording from the microphone.
|
@@ -7,16 +7,20 @@ This demo performs the following:
|
|
7 |
3. Splits the transcription into segments and translates each segment
|
8 |
on-the-fly using Facebook’s M2M100 model.
|
9 |
4. Streams the cumulative translation output to the user.
|
10 |
-
5. Provides a "Restart Recording" button that resets the audio input and
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
14 |
"""
|
15 |
|
16 |
import gradio as gr
|
17 |
import whisper
|
18 |
import torch
|
19 |
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
|
|
|
|
20 |
|
21 |
# -----------------------------------------------------------------------------
|
22 |
# Global Model Loading
|
@@ -29,9 +33,8 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
|
|
29 |
m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
|
30 |
|
31 |
# -----------------------------------------------------------------------------
|
32 |
-
# Define Supported Languages
|
33 |
# -----------------------------------------------------------------------------
|
34 |
-
# Added Polish as one of the supported languages.
|
35 |
LANGUAGES = {
|
36 |
"English": "en",
|
37 |
"Spanish": "es",
|
@@ -85,12 +88,25 @@ def translate_audio(audio, target_language):
|
|
85 |
def restart_recording():
    """
    Reset the recording section so a new recording can be made.

    Returns:
        A two-item tuple:
        - None for the audio input (clearing it)
        - An empty string for the translation textbox.
    """
    cleared_audio = None
    cleared_translation = ""
    return cleared_audio, cleared_translation
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
# -----------------------------------------------------------------------------
|
95 |
# Gradio Interface Definition
|
96 |
# -----------------------------------------------------------------------------
|
@@ -98,7 +114,9 @@ with gr.Blocks() as demo:
|
|
98 |
gr.Markdown("# Real-time Speech Translation Demo")
|
99 |
gr.Markdown(
|
100 |
"Speak into the microphone and your speech will be transcribed and translated "
|
101 |
-
"segment-by-segment. (Recording is limited to 15 seconds.)"
|
|
|
|
|
102 |
)
|
103 |
|
104 |
with gr.Row():
|
@@ -118,8 +136,12 @@ with gr.Blocks() as demo:
|
|
118 |
# Output textbox for displaying the (streaming) translation.
|
119 |
output_text = gr.Textbox(label="Translated Text", lines=10)
|
120 |
|
121 |
-
|
122 |
-
|
|
|
|
|
|
|
|
|
123 |
|
124 |
# When new audio is recorded, stream the translation.
|
125 |
audio_input.change(
|
@@ -134,6 +156,13 @@ with gr.Blocks() as demo:
|
|
134 |
inputs=[],
|
135 |
outputs=[audio_input, output_text]
|
136 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
# Launch the Gradio app (suitable for Hugging Face Spaces).
|
139 |
demo.launch()
|
|
|
1 |
"""
|
2 |
+
Speech Translation Demo with Restart and TTS
|
3 |
|
4 |
This demo performs the following:
|
5 |
1. Accepts up to 15 seconds of audio recording from the microphone.
|
|
|
7 |
3. Splits the transcription into segments and translates each segment
|
8 |
on-the-fly using Facebook’s M2M100 model.
|
9 |
4. Streams the cumulative translation output to the user.
|
10 |
+
5. Provides a "Restart Recording" button that resets the audio input and translation output.
|
11 |
+
6. Offers a "Read Translated Text" button that converts the final translation
|
12 |
+
into speech using gTTS.
|
13 |
+
|
14 |
+
Note: True real-time translation (i.e. while speaking) requires a continuous streaming
|
15 |
+
solution which is not provided by the standard browser microphone input.
|
16 |
"""
|
17 |
|
18 |
import gradio as gr
|
19 |
import whisper
|
20 |
import torch
|
21 |
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
22 |
+
from gtts import gTTS
|
23 |
+
import uuid
|
24 |
|
25 |
# -----------------------------------------------------------------------------
|
26 |
# Global Model Loading
|
|
|
33 |
m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
|
34 |
|
35 |
# -----------------------------------------------------------------------------
|
36 |
+
# Define Supported Languages (including Polish)
|
37 |
# -----------------------------------------------------------------------------
|
|
|
38 |
LANGUAGES = {
|
39 |
"English": "en",
|
40 |
"Spanish": "es",
|
|
|
88 |
def restart_recording():
    """
    Clear the audio input and the translation output of the recording section.

    Returns:
        A two-item tuple ``(None, "")``: ``None`` clears the audio input and
        the empty string clears the translation output.
    """
    cleared_audio = None
    cleared_translation = ""
    return cleared_audio, cleared_translation
|
93 |
|
94 |
+
# -----------------------------------------------------------------------------
|
95 |
+
# TTS Generation Function
|
96 |
+
# -----------------------------------------------------------------------------
|
97 |
+
def generate_tts(text, target_language):
    """
    Convert the translated text to speech using gTTS.

    Args:
        text: The translated text to read aloud. ``None``, empty, or
            whitespace-only text produces no audio.
        target_language: Display name of the target language. It is looked up
            in ``LANGUAGES`` to obtain the language code; unknown names fall
            back to "en".

    Returns:
        The path of the generated MP3 file, or ``None`` when there is no text
        to read aloud.
    """
    import os
    import tempfile

    # Guard first: with nothing to speak there is no reason to resolve the
    # language code or touch the TTS backend.
    if not text or not text.strip():
        return None

    lang_code = LANGUAGES.get(target_language, "en")

    # Each call creates a uniquely named file. Writing it to the system temp
    # directory keeps generated audio from piling up in the app's working
    # directory next to the source files.
    filename = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.mp3")

    tts = gTTS(text=text, lang=lang_code)
    tts.save(filename)
    return filename
|
109 |
+
|
110 |
# -----------------------------------------------------------------------------
|
111 |
# Gradio Interface Definition
|
112 |
# -----------------------------------------------------------------------------
|
|
|
114 |
gr.Markdown("# Real-time Speech Translation Demo")
|
115 |
gr.Markdown(
|
116 |
"Speak into the microphone and your speech will be transcribed and translated "
|
117 |
+
"segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
|
118 |
+
"**Note:** Due to browser limitations, the translation starts after you stop recording. "
|
119 |
+
"For a truly real-time experience, a continuous streaming solution would be required."
|
120 |
)
|
121 |
|
122 |
with gr.Row():
|
|
|
136 |
# Output textbox for displaying the (streaming) translation.
|
137 |
output_text = gr.Textbox(label="Translated Text", lines=10)
|
138 |
|
139 |
+
with gr.Row():
|
140 |
+
restart_button = gr.Button("Restart Recording")
|
141 |
+
read_aloud_button = gr.Button("Read Translated Text")
|
142 |
+
|
143 |
+
# Audio output for the TTS result.
|
144 |
+
tts_audio = gr.Audio(label="Translated Speech", type="filepath")
|
145 |
|
146 |
# When new audio is recorded, stream the translation.
|
147 |
audio_input.change(
|
|
|
156 |
inputs=[],
|
157 |
outputs=[audio_input, output_text]
|
158 |
)
|
159 |
+
|
160 |
+
# When the read aloud button is clicked, generate TTS from the translated text.
|
161 |
+
read_aloud_button.click(
|
162 |
+
fn=generate_tts,
|
163 |
+
inputs=[output_text, target_lang_dropdown],
|
164 |
+
outputs=tts_audio
|
165 |
+
)
|
166 |
|
167 |
# Launch the Gradio app (suitable for Hugging Face Spaces).
|
168 |
demo.launch()
|