Spaces:

tdurzynski
/

real-time-speech-translation

Running

App Files Files Community

real-time-speech-translation / app.py

tdurzynski

Update app.py

4e7d1a7 verified 6 months ago

raw

history blame

6.57 kB

	"""
	Speech Translation Demo with Restart and TTS

	This demo performs the following:
	1. Accepts up to 15 seconds of audio recording from the microphone.
	2. Uses OpenAI’s Whisper model to transcribe the speech.
	3. Splits the transcription into segments and translates each segment
	on-the-fly using Facebook’s M2M100 model.
	4. Streams the cumulative translation output to the user.
	5. Provides a "Restart Recording" button that resets the audio input and translation output.
	6. Offers a "Read Translated Text" button that converts the final translation
	into speech using gTTS.

	Note: True real-time translation (i.e. while speaking) requires a continuous streaming
	solution which is not provided by the standard browser microphone input.
	"""

	import gradio as gr
	import whisper
	import torch
	from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
	from gtts import gTTS
	import uuid

	# -----------------------------------------------------------------------------
	# Global Model Loading
	# -----------------------------------------------------------------------------
	# Whisper checkpoint used for transcription. "base" is a compromise between
	# speed and accuracy; put a different size name here to change it.
	_WHISPER_SIZE = "base"
	whisper_model = whisper.load_model(_WHISPER_SIZE)

	# The M2M100 tokenizer and translation model are loaded from the same checkpoint.
	_M2M100_CHECKPOINT = "facebook/m2m100_418M"
	tokenizer = M2M100Tokenizer.from_pretrained(_M2M100_CHECKPOINT)
	m2m100_model = M2M100ForConditionalGeneration.from_pretrained(_M2M100_CHECKPOINT)

	# -----------------------------------------------------------------------------
	# Define Supported Languages (including Polish)
	# -----------------------------------------------------------------------------
	# Display name shown in the UI -> two-letter language code passed to the
	# translation and text-to-speech calls.
	LANGUAGES = dict(
	    English="en",
	    Spanish="es",
	    French="fr",
	    German="de",
	    Chinese="zh",
	    Polish="pl",
	)

	# -----------------------------------------------------------------------------
	# Main Processing Function
	# -----------------------------------------------------------------------------
	def translate_audio(audio, target_language):
	    """
	    Transcribe recorded speech with Whisper and stream its translation.

	    Args:
	        audio: Value supplied by the audio input component, or None when
	            nothing has been recorded. It is passed straight to
	            ``whisper_model.transcribe``.
	        target_language: Display name of the target language. It is looked
	            up in ``LANGUAGES``; unknown names fall back to English ("en").

	    Yields:
	        The cumulative translation so far, once per non-empty transcription
	        segment. A single explanatory message is yielded instead when no
	        audio was supplied, or when the detected source language has no
	        M2M100 language code.
	    """
	    if audio is None:
	        yield "No audio provided."
	        return

	    # fp16=False for CPU compatibility.
	    result = whisper_model.transcribe(audio, fp16=False)
	    source_lang = result.get("language", "en")
	    target_lang_code = LANGUAGES.get(target_language, "en")

	    cumulative_translation = ""
	    for segment in result.get("segments", []):
	        segment_text = segment.get("text", "").strip()
	        if not segment_text:
	            continue

	        if source_lang == target_lang_code:
	            # Speech is already in the target language: pass it through.
	            translated_segment = segment_text
	        else:
	            try:
	                # The tokenizer is a shared module-level object, so the
	                # source language is (re)set right before each use.
	                tokenizer.src_lang = source_lang
	                target_lang_id = tokenizer.get_lang_id(target_lang_code)
	            except KeyError:
	                # Whisper can report languages that the M2M100 tokenizer has
	                # no code for; report that instead of crashing the stream.
	                yield (
	                    f"Translation from the detected language '{source_lang}' "
	                    "is not supported."
	                )
	                return

	            encoded = tokenizer(segment_text, return_tensors="pt")
	            generated_tokens = m2m100_model.generate(
	                **encoded,
	                forced_bos_token_id=target_lang_id,
	            )
	            translated_segment = tokenizer.batch_decode(
	                generated_tokens, skip_special_tokens=True
	            )[0]

	        cumulative_translation += translated_segment + " "
	        yield cumulative_translation.strip()

	# -----------------------------------------------------------------------------
	# Restart Function
	# -----------------------------------------------------------------------------
	def restart_recording():
	    """
	    Produce the blank state used to reset the recording section.

	    Returns:
	        A ``(None, "")`` pair: ``None`` for the audio input and an empty
	        string for the translation output.
	    """
	    cleared_audio = None
	    cleared_translation = ""
	    return cleared_audio, cleared_translation

	# -----------------------------------------------------------------------------
	# TTS Generation Function
	# -----------------------------------------------------------------------------
	def generate_tts(text, target_language):
	    """
	    Convert the translated text to speech using gTTS.

	    Args:
	        text: Text to synthesize. ``None``, empty and whitespace-only text
	            is skipped.
	        target_language: Display name of the language. It is looked up in
	            ``LANGUAGES``; unknown names fall back to English ("en").

	    Returns:
	        Path of the generated MP3 file, or ``None`` when there is no text
	        to read.
	    """
	    import os
	    import tempfile

	    # Validate first so empty input returns before any other work is done.
	    if not text or not text.strip():
	        return None

	    lang_code = LANGUAGES.get(target_language, "en")
	    # Write into the system temp directory rather than the working directory,
	    # so generated clips do not pile up next to the application files.
	    filename = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.mp3")
	    tts = gTTS(text=text, lang=lang_code)
	    tts.save(filename)
	    return filename

	# -----------------------------------------------------------------------------
	# Gradio Interface Definition
	# -----------------------------------------------------------------------------
	with gr.Blocks() as demo:
	    gr.Markdown("# Real-time Speech Translation Demo")
	    gr.Markdown(
	        "Speak into the microphone and your speech will be transcribed and translated "
	        "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
	        "Note: Due to browser limitations, the translation starts after you stop recording. "
	        "For a truly real-time experience, a continuous streaming solution would be required."
	    )

	    with gr.Row():
	        # Microphone-only input. The recording is passed on as a file path.
	        audio_input = gr.Audio(
	            sources=["microphone"],
	            type="filepath",
	            label="Record your speech (max 15 seconds)",
	            elem_id="audio_input",
	            # Enforce the 15-second limit that the label and description
	            # advertise; without this nothing restricts the recording length.
	            max_length=15,
	        )
	        target_lang_dropdown = gr.Dropdown(
	            choices=list(LANGUAGES.keys()),
	            value="English",
	            label="Select Target Language",
	        )

	    # Output textbox for displaying the (streaming) translation.
	    output_text = gr.Textbox(label="Translated Text", lines=10)

	    with gr.Row():
	        restart_button = gr.Button("Restart Recording")
	        read_aloud_button = gr.Button("Read Translated Text")

	    # Audio output for the TTS result.
	    tts_audio = gr.Audio(label="Translated Speech", type="filepath")

	    # When the audio value changes, stream the translation into the textbox.
	    audio_input.change(
	        fn=translate_audio,
	        inputs=[audio_input, target_lang_dropdown],
	        outputs=output_text,
	    )

	    # When the restart button is clicked, clear both the audio input and
	    # the translation output.
	    restart_button.click(
	        fn=restart_recording,
	        inputs=[],
	        outputs=[audio_input, output_text],
	    )

	    # When the read aloud button is clicked, generate TTS from the
	    # translated text in the currently selected language.
	    read_aloud_button.click(
	        fn=generate_tts,
	        inputs=[output_text, target_lang_dropdown],
	        outputs=tts_audio,
	    )

	# Launch the Gradio app (suitable for Hugging Face Spaces).
	demo.launch()