tdurzynski committed
Commit 5351689 · verified · 1 Parent(s): 443fc27

Update app.py

Files changed (1):
  app.py (+41 -46)
app.py CHANGED
@@ -1,12 +1,14 @@
 """
-Real-time Speech Translation Demo
+Real-time Speech Translation Demo with Restart Option
 
 This demo performs the following:
-1. Accepts a 15-second audio recording from the microphone.
+1. Accepts up to 15 seconds of audio recording from the microphone.
 2. Uses OpenAI’s Whisper model to transcribe the speech.
-3. Splits the transcription into segments (each roughly corresponding to a sentence).
-4. Translates each segment on-the-fly using Facebook’s M2M100 model (via Hugging Face Transformers).
-5. Streams the cumulative translation output to the user.
+3. Splits the transcription into segments and translates each segment
+   on-the-fly using Facebook’s M2M100 model.
+4. Streams the cumulative translation output to the user.
+5. Provides a "Restart Recording" button that resets the audio input and
+   translation output.
 
 Make sure to install all dependencies from requirements.txt.
 """
@@ -20,25 +22,23 @@ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 # Global Model Loading
 # -----------------------------------------------------------------------------
 # Load the Whisper model (using the "base" model for a balance between speed and accuracy).
-# Note: Loading models may take a few seconds on startup.
-whisper_model = whisper.load_model("base")  # You can choose a larger model if desired
+whisper_model = whisper.load_model("base")  # Change model size as needed
 
 # Load the M2M100 model and tokenizer for translation.
-# The "facebook/m2m100_418M" model supports translation between many languages.
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 
 # -----------------------------------------------------------------------------
 # Define Supported Languages
 # -----------------------------------------------------------------------------
-# We define a mapping from display names to language codes used by M2M100.
-# (For a full list of supported languages see the M2M100 docs.)
+# Added Polish as one of the supported languages.
 LANGUAGES = {
     "English": "en",
     "Spanish": "es",
     "French": "fr",
     "German": "de",
-    "Chinese": "zh"
+    "Chinese": "zh",
+    "Polish": "pl"
 }
 
 # -----------------------------------------------------------------------------
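Note: the src_lang / forced_bos_token_id pattern these models rely on can be exercised in isolation. A minimal sketch using the same checkpoints as the commit (the example sentence and language pair are made up):

    # Standalone M2M100 translation check, no Whisper or Gradio involved.
    from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

    tokenizer.src_lang = "en"  # language of the input text
    encoded = tokenizer("Hello, world!", return_tensors="pt")
    tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.get_lang_id("pl")  # force Polish output
    )
    print(tokenizer.batch_decode(tokens, skip_special_tokens=True)[0])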
@@ -47,63 +47,50 @@ LANGUAGES = {
 def translate_audio(audio, target_language):
     """
     Process the input audio, transcribe it using Whisper, and translate each segment
-    to the chosen target language. Yields a cumulative translation string for streaming.
-
-    Parameters:
-        audio (str): Path to the recorded audio file.
-        target_language (str): Display name of the target language (e.g., "English").
-
-    Yields:
-        str: The cumulative translated text after processing each segment.
+    to the chosen target language. Yields cumulative translation output for streaming.
     """
     if audio is None:
         yield "No audio provided."
         return
 
-    # Transcribe the audio file using Whisper.
-    # Using fp16=False to ensure compatibility on CPUs.
+    # Transcribe the audio using Whisper (fp16=False for CPU compatibility)
     result = whisper_model.transcribe(audio, fp16=False)
-
-    # Extract the detected source language from the transcription result.
-    # (Whisper returns a language code, for example "en" for English.)
     source_lang = result.get("language", "en")
-
-    # Get the target language code from our mapping; default to English if not found.
     target_lang_code = LANGUAGES.get(target_language, "en")
-
+
     cumulative_translation = ""
-
-    # Iterate over each segment from the transcription.
-    # Each segment is a dict with keys such as "start", "end", and "text".
     for segment in result.get("segments", []):
-        # Clean up the segment text.
         segment_text = segment.get("text", "").strip()
         if segment_text == "":
             continue
-
-        # If the source and target languages are the same, no translation is needed.
+
         if source_lang == target_lang_code:
             translated_segment = segment_text
         else:
-            # Set the tokenizer's source language for proper translation.
+            # Set the source language for proper translation.
             tokenizer.src_lang = source_lang
-            # Tokenize the segment text.
             encoded = tokenizer(segment_text, return_tensors="pt")
-            # Generate translation tokens.
-            # The 'forced_bos_token_id' parameter forces the model to generate text in the target language.
             generated_tokens = m2m100_model.generate(
                 **encoded,
                 forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
            )
-            # Decode the tokens to obtain the translated text.
             translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-
-        # Append the new translation segment to the cumulative output.
+
         cumulative_translation += translated_segment + " "
-
-        # Yield the updated cumulative translation to simulate streaming output.
         yield cumulative_translation.strip()
 
+# -----------------------------------------------------------------------------
+# Restart Function
+# -----------------------------------------------------------------------------
+def restart_recording():
+    """
+    Reset the recording section by clearing the audio input and the translation output.
+    Returns:
+        - None for the audio input (clearing it)
+        - An empty string for the translation textbox.
+    """
+    return None, ""
+
 # -----------------------------------------------------------------------------
 # Gradio Interface Definition
 # -----------------------------------------------------------------------------
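Note: because translate_audio is a generator, it can be smoke-tested outside Gradio; each iteration yields the cumulative translation so far, which is exactly what Gradio streams. A quick sketch (sample.wav is a placeholder path):

    # Prints progressively longer translations, one per Whisper segment.
    for partial in translate_audio("sample.wav", "Spanish"):
        print(partial)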
@@ -115,14 +102,13 @@ with gr.Blocks() as demo:
     )
 
     with gr.Row():
-        # Audio input: records from the microphone.
+        # Use 'sources' (list) to specify that the microphone is an input source.
         audio_input = gr.Audio(
             sources=["microphone"],
             type="filepath",
             label="Record your speech (max 15 seconds)",
             elem_id="audio_input"
         )
-        # Dropdown to select the target language.
         target_lang_dropdown = gr.Dropdown(
             choices=list(LANGUAGES.keys()),
             value="English",
@@ -132,13 +118,22 @@
     # Output textbox for displaying the (streaming) translation.
     output_text = gr.Textbox(label="Translated Text", lines=10)
 
-    # Connect the audio input and dropdown to our translation function.
-    # Since translate_audio is a generator (it yields partial results), Gradio will stream the output.
+    # Restart button to clear the current recording and translation.
+    restart_button = gr.Button("Restart Recording")
+
+    # When new audio is recorded, stream the translation.
     audio_input.change(
         fn=translate_audio,
         inputs=[audio_input, target_lang_dropdown],
         outputs=output_text
     )
+
+    # When the restart button is clicked, clear both the audio input and translation output.
+    restart_button.click(
+        fn=restart_recording,
+        inputs=[],
+        outputs=[audio_input, output_text]
+    )
 
 # Launch the Gradio app (suitable for Hugging Face Spaces).
 demo.launch()
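Note: recent Gradio releases bundle this reset pattern. Assuming a version that ships gr.ClearButton, the restart_recording callback and its click wiring could collapse into one component (a sketch, not what this commit does):

    # Clears both components in the browser; no Python callback needed.
    restart_button = gr.ClearButton(
        components=[audio_input, output_text],
        value="Restart Recording"
    )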
 