"""Gradio language-learning aid.

Takes English speech or typed English text and produces three outputs:
the English text, its translation into a selected target language, and
synthesized speech for the translation.
"""

import gradio as gr
import torch
import numpy as np
import librosa
from transformers import pipeline

# --------------------------------------------------
# ASR Pipeline (for English transcription)
# --------------------------------------------------
# Sampling rate (Hz) that audio is resampled to before it is handed to `asr`.
ASR_SAMPLE_RATE = 16000

asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-large-960h-lv60-self",
)

# --------------------------------------------------
# Mapping for Target Languages and Models
# --------------------------------------------------
# NOTE(review): confirm that every checkpoint below exists on the Hugging Face
# Hub; their availability was not verified as part of this change.
translation_models = {
    "Spanish": "Helsinki-NLP/opus-mt-en-es",
    "French": "Helsinki-NLP/opus-mt-en-fr",
    "German": "Helsinki-NLP/opus-mt-en-de",
    "Chinese": "Helsinki-NLP/opus-mt-en-zh",
    "Russian": "Helsinki-NLP/opus-mt-en-ru",
    "Arabic": "Helsinki-NLP/opus-mt-en-ar",
    "Portuguese": "Helsinki-NLP/opus-mt-en-pt",
    "Japanese": "Helsinki-NLP/opus-mt-en-ja",
    "Italian": "Helsinki-NLP/opus-mt-en-it",
    "Korean": "Helsinki-NLP/opus-mt-en-ko",
}

# Language codes used to build the "translation_en_to_<code>" task name.
language_codes = {
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Chinese": "zh",
    "Russian": "ru",
    "Arabic": "ar",
    "Portuguese": "pt",
    "Japanese": "ja",
    "Italian": "it",
    "Korean": "ko",
}

# NOTE(review): these identifiers look like Coqui-TTS model names rather than
# Hub repository ids ("org/name"); confirm that transformers.pipeline can
# actually load them, or replace them with Hub text-to-speech checkpoints.
tts_models = {
    "Spanish": "tts_models/es/tacotron2-DDC",
    "French": "tts_models/fr/tacotron2",
    "German": "tts_models/de/tacotron2",
    "Chinese": "tts_models/zh/tacotron2",
    "Russian": "tts_models/ru/tacotron2",
    "Arabic": "tts_models/ar/tacotron2",
    "Portuguese": "tts_models/pt/tacotron2",
    "Japanese": "tts_models/ja/tacotron2",
    "Italian": "tts_models/it/tacotron2",
    "Korean": "tts_models/ko/tacotron2",
}

# Caches for translator and TTS pipelines, keyed by target-language name, so
# each model is constructed at most once per process.
translator_cache = {}
tts_cache = {}


def get_translator(target_language):
    """Return the English-to-`target_language` translation pipeline.

    The pipeline is built on first use and stored in `translator_cache`.

    Args:
        target_language: A key of `translation_models`, e.g. "Spanish".

    Returns:
        The (possibly cached) translation pipeline.

    Raises:
        KeyError: If `target_language` is not a key of `translation_models`.
    """
    cached = translator_cache.get(target_language)
    if cached is not None:
        return cached

    model_name = translation_models[target_language]
    # Task names follow "translation_<src>_to_<tgt>" with language codes, so
    # use the code ("es") rather than the lowercased display name ("spanish").
    task = "translation_en_to_" + language_codes[target_language]
    translator = pipeline(task, model=model_name)
    translator_cache[target_language] = translator
    return translator


def get_tts(target_language):
    """Return the text-to-speech pipeline for `target_language`.

    The pipeline is built on first use and stored in `tts_cache`.

    Args:
        target_language: A key of `tts_models`, e.g. "Spanish".

    Returns:
        The (possibly cached) text-to-speech pipeline.

    Raises:
        KeyError: If `target_language` is not a key of `tts_models`.
    """
    cached = tts_cache.get(target_language)
    if cached is not None:
        return cached

    tts = pipeline("text-to-speech", model=tts_models[target_language])
    tts_cache[target_language] = tts
    return tts


def _prepare_audio(sample_rate, audio_data):
    """Convert raw input audio to mono float32 samples at `ASR_SAMPLE_RATE`."""
    samples = np.asarray(audio_data)

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM must be scaled into [-1, 1]; a bare dtype cast would
        # leave amplitudes in the raw integer range.
        scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32, copy=False)

    # Average the channel axis whenever one is present. This also flattens
    # (n_samples, 1) input, which would otherwise stay two-dimensional.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sample_rate != ASR_SAMPLE_RATE and samples.size > 0:
        samples = librosa.resample(
            samples, orig_sr=sample_rate, target_sr=ASR_SAMPLE_RATE
        )
    return samples


def _tts_output_to_gradio(tts_result):
    """Turn a TTS pipeline result into the (rate, samples) tuple Gradio plays."""
    # The transformers text-to-speech pipeline documents the keys "audio" and
    # "sampling_rate"; the legacy "wav"/"sample_rate" names are still accepted
    # in case a different backend supplies them.
    if "audio" in tts_result:
        waveform = tts_result["audio"]
    else:
        waveform = tts_result["wav"]
    if "sampling_rate" in tts_result:
        rate = tts_result["sampling_rate"]
    else:
        rate = tts_result["sample_rate"]

    waveform = np.asarray(waveform)
    # Drop a leading batch axis of size one so the array is (n_samples,)
    # instead of (1, n_samples).
    if waveform.ndim == 2 and waveform.shape[0] == 1:
        waveform = waveform[0]
    return int(rate), waveform


# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
def predict(audio, text, target_language):
    """Transcribe (or accept) English text, translate it, and synthesize speech.

    Typed text takes precedence; the audio is only transcribed when the text
    box is empty or whitespace.

    Args:
        audio: `(sample_rate, samples)` tuple from the Gradio audio input, or
            None when no audio was supplied.
        text: English text typed by the user; may be empty or None.
        target_language: A key of `translation_models`.

    Returns:
        A tuple `(english_text, translated_text, synthesized_audio)` where
        `synthesized_audio` is `(sample_rate, samples)`. When the request
        cannot be served, the first element carries a short message and the
        other two are `""` and None.
    """
    if target_language not in translation_models:
        return f"Unsupported target language: {target_language}", "", None

    # Use text input if provided; otherwise, use ASR on audio.
    typed_text = (text or "").strip()
    if typed_text:
        english_text = typed_text
    elif audio is not None:
        sample_rate, audio_data = audio
        samples = _prepare_audio(sample_rate, audio_data)
        if samples.size == 0:
            return "No input provided.", "", None
        asr_result = asr({"array": samples, "sampling_rate": ASR_SAMPLE_RATE})
        english_text = asr_result["text"]
        if not english_text.strip():
            # Nothing was recognized; do not feed an empty string to the
            # translation and TTS models.
            return "No speech could be recognized in the audio.", "", None
    else:
        return "No input provided.", "", None

    # Translation step
    translator = get_translator(target_language)
    translation_result = translator(english_text)
    translated_text = translation_result[0]["translation_text"]

    # TTS step: synthesize speech from the translated text
    tts = get_tts(target_language)
    synthesized_audio = _tts_output_to_gradio(tts(translated_text))

    return english_text, translated_text, synthesized_audio


# --------------------------------------------------
# Gradio Interface Setup
# --------------------------------------------------
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
        gr.Textbox(
            lines=4,
            placeholder="Or enter English text here",
            label="English Text Input (optional)",
        ),
        gr.Dropdown(
            choices=list(translation_models.keys()),
            value="Spanish",
            label="Target Language",
        ),
    ],
    outputs=[
        gr.Textbox(label="English Transcription"),
        gr.Textbox(label="Translation (Target Language)"),
        gr.Audio(label="Synthesized Speech in Target Language"),
    ],
    title="Multimodal Language Learning Aid",
    description=(
        "This app helps language learners by providing three outputs:\n"
        "1. English transcription (from ASR or text input),\n"
        "2. Translation to a target language, and\n"
        "3. Synthetic speech in the target language.\n\n"
        "Choose one of the top 10 commonly used languages from the dropdown.\n"
        "You can either record/upload an English audio sample or enter English text directly."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()