tdurzynski committed
Commit b4d8745 · verified · 1 Parent(s): a9e31b2

Create app.py

Files changed (1): app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
"""
Real-time Speech Translation Demo

This demo performs the following steps:
1. Accepts a 15-second audio recording from the microphone.
2. Uses OpenAI's Whisper model to transcribe the speech.
3. Splits the transcription into segments (each roughly corresponding to a sentence).
4. Translates each segment on the fly using Facebook's M2M100 model (via Hugging Face Transformers).
5. Streams the cumulative translation output to the user.

Make sure to install all dependencies from requirements.txt.
"""
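
# requirements.txt itself is not part of this commit; a plausible set of
# dependencies for the imports below would be (assumed, not verified):
#
#   gradio
#   openai-whisper
#   torch
#   transformers
#   sentencepiece  # needed by the M2M100 tokenizer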

import gradio as gr
import whisper
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# -----------------------------------------------------------------------------
# Global Model Loading
# -----------------------------------------------------------------------------
# Load the Whisper model (the "base" model balances speed and accuracy).
# Note: loading models may take a few seconds on startup.
whisper_model = whisper.load_model("base")  # A larger model can be substituted if desired.

# Load the M2M100 model and tokenizer for translation.
# The "facebook/m2m100_418M" checkpoint supports translation between 100 languages.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
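
# Optional device handling (an addition, not in the original file): move the
# translation model to a GPU when one is available. torch is imported above
# but was otherwise unused.
device = "cuda" if torch.cuda.is_available() else "cpu"
m2m100_model = m2m100_model.to(device)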

# -----------------------------------------------------------------------------
# Define Supported Languages
# -----------------------------------------------------------------------------
# Map display names to the language codes used by M2M100.
# (For the full list of supported languages, see the M2M100 docs.)
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Chinese": "zh"
}
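
# The mapping can be extended with any other M2M100-supported code, e.g.
# (illustrative additions, not part of the original demo):
#   "Japanese": "ja",
#   "Russian": "ru",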

# -----------------------------------------------------------------------------
# Main Processing Function
# -----------------------------------------------------------------------------
def translate_audio(audio, target_language):
    """
    Transcribe the input audio with Whisper and translate each segment into the
    chosen target language. Yields a cumulative translation string for streaming.

    Parameters:
        audio (str): Path to the recorded audio file.
        target_language (str): Display name of the target language (e.g., "English").

    Yields:
        str: The cumulative translated text after each segment is processed.
    """
    if audio is None:
        yield "No audio provided."
        return

    # Transcribe the audio file using Whisper.
    # fp16=False ensures compatibility on CPU-only machines.
    result = whisper_model.transcribe(audio, fp16=False)

    # Extract the detected source language from the transcription result.
    # (Whisper returns an ISO 639-1 code, for example "en" for English.)
    source_lang = result.get("language", "en")

    # Get the target language code from our mapping; default to English if not found.
    target_lang_code = LANGUAGES.get(target_language, "en")

    cumulative_translation = ""

    # Iterate over each segment of the transcription.
    # Each segment is a dict with keys such as "start", "end", and "text".
    for segment in result.get("segments", []):
        # Clean up the segment text.
        segment_text = segment.get("text", "").strip()
        if not segment_text:
            continue

        # If the source and target languages match, no translation is needed.
        if source_lang == target_lang_code:
            translated_segment = segment_text
        else:
            # Set the tokenizer's source language for proper translation.
            # Caveat: Whisper's and M2M100's language codes mostly overlap, but
            # not completely; an unsupported code may raise an error here.
            tokenizer.src_lang = source_lang
            # Tokenize the segment and move it to the model's device.
            encoded = tokenizer(segment_text, return_tensors="pt").to(m2m100_model.device)
            # Generate translation tokens. forced_bos_token_id forces the model
            # to generate text in the target language.
            generated_tokens = m2m100_model.generate(
                **encoded,
                forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
            )
            # Decode the tokens to obtain the translated text.
            translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

        # Append the new segment to the cumulative output and yield the updated
        # string, which lets Gradio stream partial results.
        cumulative_translation += translated_segment + " "
        yield cumulative_translation.strip()
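
# Quick sanity check outside Gradio (illustrative; "sample.wav" is a
# hypothetical local recording, not a file shipped with this demo):
#
#   for partial in translate_audio("sample.wav", "Spanish"):
#       print(partial)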

# -----------------------------------------------------------------------------
# Gradio Interface Definition
# -----------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Real-time Speech Translation Demo")
    gr.Markdown(
        "Speak into the microphone and your speech will be transcribed and translated "
        "segment-by-segment. (Please keep recordings to about 15 seconds; the limit "
        "is advisory and is not enforced in code.)"
    )

    with gr.Row():
        # Audio input: records from the microphone.
        # Note: Gradio 4.x renamed this parameter to sources=["microphone"].
        audio_input = gr.Audio(
            source="microphone",
            type="filepath",
            label="Record your speech (max 15 seconds)",
            elem_id="audio_input"
        )
        # Dropdown to select the target language.
        target_lang_dropdown = gr.Dropdown(
            choices=list(LANGUAGES.keys()),
            value="English",
            label="Select Target Language"
        )

    # Output textbox for displaying the (streaming) translation.
    output_text = gr.Textbox(label="Translated Text", lines=10)

    # Connect the audio input and dropdown to the translation function.
    # Because translate_audio is a generator (it yields partial results),
    # Gradio streams the output.
    audio_input.change(
        fn=translate_audio,
        inputs=[audio_input, target_lang_dropdown],
        outputs=output_text
    )

# Enable the request queue (generator-based streaming requires it on older
# Gradio releases), then launch the app (suitable for Hugging Face Spaces).
demo.queue()
demo.launch()
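
# To run locally (assumed invocation, not from the repo):
#   pip install -r requirements.txt
#   python app.py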