File size: 8,010 Bytes
b4d8745
951b505
b4d8745
 
5351689
b4d8745
5351689
 
 
1dc3846
 
 
4e7d1a7
 
b4d8745
 
 
 
 
 
4e7d1a7
 
b4d8745
 
 
 
1dc3846
 
b4d8745
 
 
 
 
 
4e7d1a7
b4d8745
 
 
 
 
 
5351689
 
b4d8745
 
 
1dc3846
b4d8745
 
 
 
5351689
b4d8745
 
 
 
 
5351689
b4d8745
 
 
5351689
b4d8745
 
 
1dc3846
b4d8745
5351689
b4d8745
 
 
5351689
b4d8745
 
 
 
 
 
 
5351689
b4d8745
 
 
4e7d1a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4d8745
1dc3846
 
 
 
 
 
 
 
 
 
951b505
b4d8745
 
951b505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4d8745
 
 
951b505
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""
Speech Translation Demo with Automatic TTS, Restart Option, and About Tab

This demo performs the following:
  1. Accepts up to 15 seconds of audio recording from the microphone.
  2. Uses OpenAI’s Whisper model to transcribe the speech.
  3. Splits the transcription into segments and translates each segment
     on-the-fly using Facebook’s M2M100 model.
  4. Streams the cumulative translation output to the user.
  5. Automatically converts the final translated text to speech using gTTS.
  6. Provides a "Restart Recording" button (located just below the recording section)
     to reset the audio input, translated text, and TTS output.
Note: True real-time translation (i.e. while speaking) requires a continuous streaming
solution which is not provided by the standard browser microphone input.
"""

import gradio as gr
import whisper
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from gtts import gTTS
import uuid

# -----------------------------------------------------------------------------
# Global Model Loading
# -----------------------------------------------------------------------------
# Load the Whisper model (using "base" for a balance between speed and accuracy).
whisper_model = whisper.load_model("base")  # Adjust model size as needed

# Load the M2M100 model and tokenizer for translation.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

# -----------------------------------------------------------------------------
# Define Supported Languages (including Polish)
# -----------------------------------------------------------------------------
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Chinese": "zh",
    "Polish": "pl"
}

# -----------------------------------------------------------------------------
# Main Processing Function: Translation (streaming)
# -----------------------------------------------------------------------------
def translate_audio(audio, target_language):
    """
    Process the input audio, transcribe it using Whisper, and translate each segment
    to the chosen target language. Yields cumulative translation output for streaming.
    """
    if audio is None:
        yield "No audio provided."
        return

    # Transcribe the audio using Whisper (fp16=False for CPU compatibility)
    result = whisper_model.transcribe(audio, fp16=False)
    source_lang = result.get("language", "en")
    target_lang_code = LANGUAGES.get(target_language, "en")

    cumulative_translation = ""
    for segment in result.get("segments", []):
        segment_text = segment.get("text", "").strip()
        if not segment_text:
            continue

        if source_lang == target_lang_code:
            translated_segment = segment_text
        else:
            # Set the source language for proper translation.
            tokenizer.src_lang = source_lang
            encoded = tokenizer(segment_text, return_tensors="pt")
            generated_tokens = m2m100_model.generate(
                **encoded,
                forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
            )
            translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

        cumulative_translation += translated_segment + " "
        yield cumulative_translation.strip()

# -----------------------------------------------------------------------------
# TTS Generation Function
# -----------------------------------------------------------------------------
def generate_tts(text, target_language):
    """
    Convert the translated text to speech using gTTS.
    Returns the filename of the generated audio file.
    """
    lang_code = LANGUAGES.get(target_language, "en")
    if not text or not text.strip():
        return None
    filename = f"tts_{uuid.uuid4().hex}.mp3"
    tts = gTTS(text=text, lang=lang_code)
    tts.save(filename)
    return filename

# -----------------------------------------------------------------------------
# Restart Function
# -----------------------------------------------------------------------------
def restart_recording():
    """
    Reset the recording section by clearing the audio input, the translation textbox,
    and the TTS audio output.
    """
    return None, "", None

# -----------------------------------------------------------------------------
# Gradio Interface Definition with Tabs
# -----------------------------------------------------------------------------
with gr.Blocks() as demo:
    with gr.Tabs():
        # "Demo" Tab: Contains the interactive interface.
        with gr.TabItem("Demo"):
            gr.Markdown("# Real-time Speech Translation Demo")
            gr.Markdown(
                "Speak into the microphone and your speech will be transcribed and translated "
                "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
                "**Note:** The translation and speech synthesis occur automatically after recording."
            )
            
            # Row for audio input and target language selection.
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record your speech (max 15 seconds)",
                    elem_id="audio_input"
                )
                target_lang_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="English",
                    label="Select Target Language"
                )
            
            # Row for the Restart Recording button (placed just below the recording section).
            with gr.Row():
                restart_button = gr.Button("Restart Recording")
            
            # Output components: Translated text and TTS audio.
            output_text = gr.Textbox(label="Translated Text", lines=10)
            tts_audio = gr.Audio(label="Translated Speech", type="filepath")
            
            # Chain the events:
            # 1. When new audio is recorded, stream the translation text.
            # 2. Once translation is complete, automatically generate the TTS audio.
            audio_input.change(
                fn=translate_audio,
                inputs=[audio_input, target_lang_dropdown],
                outputs=output_text,
                stream=True
            ).then(
                fn=generate_tts,
                inputs=[output_text, target_lang_dropdown],
                outputs=tts_audio
            )
            
            # The Restart button clears the audio input, translation text, and TTS audio.
            restart_button.click(
                fn=restart_recording,
                inputs=[],
                outputs=[audio_input, output_text, tts_audio]
            )
        
        # "About" Tab: Displays the descriptive text.
        with gr.TabItem("About"):
            gr.Markdown(
                """
**Speech Translation Demo with Automatic TTS and Restart Option**

This demo performs the following:
  1. Accepts up to 15 seconds of audio recording from the microphone.
  2. Uses OpenAI’s Whisper model to transcribe the speech.
  3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
  4. Streams the cumulative translation output to the user.
  5. Automatically converts the final translated text to speech using gTTS.
  6. Provides a "Restart Recording" button (located just below the recording section) to reset the audio input, translated text, and TTS output.

**Note:** True real-time translation (i.e. while speaking) requires a continuous streaming solution which is not provided by the standard browser microphone input.
                """
            )

# Launch the Gradio app (suitable for Hugging Face Spaces).
demo.launch()