File size: 5,045 Bytes
63ee3e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import gradio as gr
import torch
import numpy as np
import librosa
from transformers import pipeline
# --------------------------------------------------
# ASR Pipeline (for English transcription)
# --------------------------------------------------
# Speech recognizer shared by the whole module. It is constructed once, when
# the module is imported, and reused for every transcription request.
_ASR_MODEL_ID = "facebook/wav2vec2-large-960h-lv60-self"
asr = pipeline(task="automatic-speech-recognition", model=_ASR_MODEL_ID)
# --------------------------------------------------
# Mapping for Target Languages and Models
# --------------------------------------------------
# Display name -> language code embedded in the model identifiers below.
# Insertion order is significant: anything that iterates translation_models
# (for example to list the selectable languages) sees the names in this order.
_LANGUAGE_CODES = {
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Chinese": "zh",
    "Russian": "ru",
    "Arabic": "ar",
    "Portuguese": "pt",
    "Japanese": "ja",
    "Italian": "it",
    "Korean": "ko",
}

# Display name -> identifier of the English-to-<language> translation model.
translation_models = {
    name: f"Helsinki-NLP/opus-mt-en-{code}"
    for name, code in _LANGUAGE_CODES.items()
}

# Display name -> identifier of the speech-synthesis model for that language.
# NOTE(review): these identifiers do not look like Hugging Face Hub repo ids;
# confirm that pipeline("text-to-speech", model=...) can actually load them.
tts_models = {
    name: f"tts_models/{code}/tacotron2"
    for name, code in _LANGUAGE_CODES.items()
}
# Spanish is the one entry whose identifier carries a "-DDC" suffix.
tts_models["Spanish"] = "tts_models/es/tacotron2-DDC"

# Per-language caches so each translator / TTS pipeline is built at most once.
translator_cache = {}
tts_cache = {}
def get_translator(target_language):
    """Return the English-to-``target_language`` translation pipeline.

    The pipeline is created the first time a language is requested and is
    stored in ``translator_cache``; later calls return the stored object.

    Args:
        target_language: A key of ``translation_models`` (e.g. ``"Spanish"``).

    Raises:
        KeyError: If ``target_language`` is not a key of ``translation_models``.
    """
    try:
        return translator_cache[target_language]
    except KeyError:
        pass  # not built yet; fall through and create it

    # The task string is built from the lower-cased display name, giving
    # e.g. "translation_en_to_spanish".
    task_name = f"translation_en_to_{target_language.lower()}"
    built = pipeline(task_name, model=translation_models[target_language])
    translator_cache[target_language] = built
    return built
def get_tts(target_language):
    """Return the text-to-speech pipeline for ``target_language``.

    The pipeline is created the first time a language is requested and is
    stored in ``tts_cache``; later calls return the stored object.

    Args:
        target_language: A key of ``tts_models`` (e.g. ``"Spanish"``).

    Raises:
        KeyError: If ``target_language`` is not a key of ``tts_models``.
    """
    try:
        return tts_cache[target_language]
    except KeyError:
        pass  # not built yet; fall through and create it

    built = pipeline("text-to-speech", model=tts_models[target_language])
    tts_cache[target_language] = built
    return built
# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
_ASR_SAMPLE_RATE = 16000  # rate the waveform is resampled to before recognition


def _to_mono_float32(audio_data):
    """Return ``audio_data`` as a 1-D float32 waveform."""
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM: scale by the dtype's full range so the values land in
        # roughly [-1.0, 1.0] instead of keeping raw integer magnitudes.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32, copy=False)
    if samples.ndim > 1:
        # (samples, channels) -> (samples,). Averaging also flattens the
        # single-channel (N, 1) case, which must not stay 2-D for resampling.
        samples = samples.mean(axis=1)
    return samples


def _unpack_tts_result(tts_result):
    """Return ``(sampling_rate, waveform)`` from a text-to-speech result dict."""
    # The transformers text-to-speech pipeline returns "audio" and
    # "sampling_rate"; the "wav"/"sample_rate" names are kept only as a
    # fallback for callables that use that older spelling.
    audio_key = "audio" if "audio" in tts_result else "wav"
    rate_key = "sampling_rate" if "sampling_rate" in tts_result else "sample_rate"
    # The pipeline reports audio as (channels, length); dropping the singleton
    # channel axis yields the 1-D array expected for mono playback.
    waveform = np.squeeze(np.asarray(tts_result[audio_key]))
    return int(tts_result[rate_key]), waveform


def predict(audio, text, target_language):
    """Transcribe (or accept) English text, translate it, and synthesize speech.

    Typed text takes priority; the audio is only transcribed when the text
    box is empty or whitespace.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when no audio
            was supplied.
        text: English text typed by the user; ``None`` and blank strings are
            treated as "not provided".
        target_language: Key of ``translation_models`` / ``tts_models``.

    Returns:
        A 3-tuple ``(english_text, translated_text, synthesized_audio)`` where
        ``synthesized_audio`` is ``(sampling_rate, waveform)`` or ``None``.
        When neither usable text nor audio is given the result is
        ``("No input provided.", "", None)``.

    Raises:
        KeyError: If ``target_language`` is not a supported language.
    """
    no_input = ("No input provided.", "", None)

    typed_text = (text or "").strip()
    if typed_text:
        english_text = typed_text
    elif audio is not None:
        sample_rate, audio_data = audio
        samples = _to_mono_float32(audio_data)
        if samples.size == 0:
            # An empty recording carries no speech to transcribe.
            return no_input
        if sample_rate != _ASR_SAMPLE_RATE:
            samples = librosa.resample(
                samples, orig_sr=sample_rate, target_sr=_ASR_SAMPLE_RATE
            )
        asr_result = asr({"array": samples, "sampling_rate": _ASR_SAMPLE_RATE})
        english_text = asr_result["text"]
        if not english_text.strip():
            # Nothing was recognized; skip translation and synthesis rather
            # than feeding empty strings to the downstream models.
            return english_text, "", None
    else:
        return no_input

    # Translation step
    translator = get_translator(target_language)
    translated_text = translator(english_text)[0]["translation_text"]

    # TTS step: synthesize speech from the translated text
    tts = get_tts(target_language)
    synthesized_audio = _unpack_tts_result(tts(translated_text))

    return english_text, translated_text, synthesized_audio
# --------------------------------------------------
# Gradio Interface Setup
# --------------------------------------------------
# Input widgets, in the positional order predict() receives them:
# audio, text, target_language.
_input_widgets = [
    gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
    gr.Textbox(
        lines=4,
        placeholder="Or enter English text here",
        label="English Text Input (optional)",
    ),
    gr.Dropdown(
        choices=list(translation_models),
        value="Spanish",
        label="Target Language",
    ),
]

# Output widgets, matching the 3-tuple returned by predict().
_output_widgets = [
    gr.Textbox(label="English Transcription"),
    gr.Textbox(label="Translation (Target Language)"),
    gr.Audio(label="Synthesized Speech in Target Language"),
]

_app_description = (
    "This app helps language learners by providing three outputs:\n"
    "1. English transcription (from ASR or text input),\n"
    "2. Translation to a target language, and\n"
    "3. Synthetic speech in the target language.\n\n"
    "Choose one of the top 10 commonly used languages from the dropdown.\n"
    "You can either record/upload an English audio sample or enter English text directly."
)

iface = gr.Interface(
    fn=predict,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="Multimodal Language Learning Aid",
    description=_app_description,
    allow_flagging="never",
)

# Start the web UI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|