"""
Real-time Speech Translation Demo
This demo performs the following:
1. Accepts a 15-second audio recording from the microphone.
2. Uses OpenAI’s Whisper model to transcribe the speech.
3. Splits the transcription into segments (each roughly corresponding to a sentence).
4. Translates each segment on-the-fly using Facebook’s M2M100 model (via Hugging Face Transformers).
5. Streams the cumulative translation output to the user.
Make sure to install all dependencies from requirements.txt.
"""
import gradio as gr
import whisper
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# -----------------------------------------------------------------------------
# Global Model Loading
# -----------------------------------------------------------------------------
# Load the Whisper model (using the "base" model for a balance between speed and accuracy).
# Note: Loading models may take a few seconds on startup; these globals are
# shared by translate_audio() below.
whisper_model = whisper.load_model("base")  # You can choose a larger model if desired
# Load the M2M100 model and tokenizer for translation.
# The "facebook/m2m100_418M" model supports translation between many languages.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
# -----------------------------------------------------------------------------
# Define Supported Languages
# -----------------------------------------------------------------------------
# Mapping from UI display names to the language codes used by M2M100.
# (For a full list of supported languages see the M2M100 docs.)
LANGUAGES: dict[str, str] = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Chinese": "zh"
}
# -----------------------------------------------------------------------------
# Main Processing Function
# -----------------------------------------------------------------------------
def translate_audio(audio, target_language):
    """
    Transcribe the input audio with Whisper and translate each segment into the
    chosen target language, yielding a cumulative translation string so the UI
    can stream partial results.

    Parameters:
        audio (str): Path to the recorded audio file, or None if nothing was recorded.
        target_language (str): Display name of the target language (e.g., "English").

    Yields:
        str: The cumulative translated text after processing each segment.
    """
    if audio is None:
        yield "No audio provided."
        return

    # Transcribe the audio file using Whisper.
    # fp16=False keeps inference compatible with CPU-only environments.
    result = whisper_model.transcribe(audio, fp16=False)

    # Whisper reports the detected source language as an ISO code (e.g. "en").
    source_lang = result.get("language", "en")
    # Map the display name to an M2M100 language code; default to English.
    target_lang_code = LANGUAGES.get(target_language, "en")

    cumulative_translation = ""
    # Each segment is a dict with keys such as "start", "end", and "text".
    for segment in result.get("segments", []):
        segment_text = segment.get("text", "").strip()
        if not segment_text:
            continue

        if source_lang == target_lang_code:
            # Source and target match: no translation needed.
            translated_segment = segment_text
        elif source_lang not in tokenizer.lang_code_to_id:
            # Whisper may detect a language M2M100 does not support; pass the
            # original text through instead of crashing when setting src_lang.
            translated_segment = segment_text
        else:
            # Tell the tokenizer which language the input text is in.
            tokenizer.src_lang = source_lang
            encoded = tokenizer(segment_text, return_tensors="pt")
            # Inference only: disable autograd to save memory and time.
            with torch.no_grad():
                # forced_bos_token_id forces the model to generate text in
                # the target language.
                generated_tokens = m2m100_model.generate(
                    **encoded,
                    forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
                )
            # Decode the tokens to obtain the translated text.
            translated_segment = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )[0]

        # Append the new translation segment to the cumulative output and
        # yield it to simulate streaming.
        cumulative_translation += translated_segment + " "
        yield cumulative_translation.strip()
# -----------------------------------------------------------------------------
# Gradio Interface Definition
# -----------------------------------------------------------------------------
with gr.Blocks() as demo:
    # Page title and usage instructions.
    gr.Markdown("# Real-time Speech Translation Demo")
    gr.Markdown(
        "Speak into the microphone and your speech will be transcribed and translated "
        "segment-by-segment. (Recording is limited to 15 seconds.)"
    )

    with gr.Row():
        # Microphone recorder; type="filepath" hands our callback a file path.
        mic_recorder = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Record your speech (max 15 seconds)",
            elem_id="audio_input",
        )
        # Selector for the translation target language.
        language_selector = gr.Dropdown(
            choices=list(LANGUAGES.keys()),
            value="English",
            label="Select Target Language",
        )

    # Textbox that receives the (streaming) translation output.
    translation_box = gr.Textbox(label="Translated Text", lines=10)

    # translate_audio is a generator (it yields partial results), so Gradio
    # streams each yielded value into the textbox as it arrives.
    mic_recorder.change(
        fn=translate_audio,
        inputs=[mic_recorder, language_selector],
        outputs=translation_box,
    )

# Launch the Gradio app (suitable for Hugging Face Spaces).
demo.launch()