Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Files Community

Yilin0601 committed on Mar 25

Commit

63ee3e5

verified ·

1 Parent(s): 76b5526

Create app.py

Browse files

Files changed (1) hide show

app.py +134 -0

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import gradio as gr
+import torch
+import numpy as np
+import librosa
+from transformers import pipeline
+# --------------------------------------------------
+# ASR Pipeline (for English transcription)
+# --------------------------------------------------
+asr = pipeline(
+    "automatic-speech-recognition",
+    model="facebook/wav2vec2-large-960h-lv60-self"
+)
+# --------------------------------------------------
+# Mapping for Target Languages and Models
+# --------------------------------------------------
+# Display name of each supported target language -> model identifier that
+# get_translator() passes to pipeline(..., model=...).
+# The keys also populate the "Target Language" dropdown, in insertion order.
+translation_models = {
+    "Spanish": "Helsinki-NLP/opus-mt-en-es",
+    "French": "Helsinki-NLP/opus-mt-en-fr",
+    "German": "Helsinki-NLP/opus-mt-en-de",
+    "Chinese": "Helsinki-NLP/opus-mt-en-zh",
+    "Russian": "Helsinki-NLP/opus-mt-en-ru",
+    "Arabic": "Helsinki-NLP/opus-mt-en-ar",
+    "Portuguese": "Helsinki-NLP/opus-mt-en-pt",
+    "Japanese": "Helsinki-NLP/opus-mt-en-ja",
+    "Italian": "Helsinki-NLP/opus-mt-en-it",
+    "Korean": "Helsinki-NLP/opus-mt-en-ko"
+}
+# Display name of each supported target language -> model identifier that
+# get_tts() passes to pipeline("text-to-speech", model=...).
+# Keys mirror those of translation_models.
+# NOTE(review): these identifiers look like Coqui TTS registry names rather
+# than Hugging Face Hub repo ids -- confirm that pipeline() can resolve them.
+tts_models = {
+    "Spanish": "tts_models/es/tacotron2-DDC",
+    "French": "tts_models/fr/tacotron2",
+    "German": "tts_models/de/tacotron2",
+    "Chinese": "tts_models/zh/tacotron2",
+    "Russian": "tts_models/ru/tacotron2",
+    "Arabic": "tts_models/ar/tacotron2",
+    "Portuguese": "tts_models/pt/tacotron2",
+    "Japanese": "tts_models/ja/tacotron2",
+    "Italian": "tts_models/it/tacotron2",
+    "Korean": "tts_models/ko/tacotron2"
+}
+# Per-language caches of already-constructed pipelines, keyed by the display
+# language name (e.g. "Spanish"). get_translator() and get_tts() fill them on
+# first use so each model is only built once per process.
+translator_cache = {}
+tts_cache = {}
+def get_translator(target_language):
+    if target_language in translator_cache:
+        return translator_cache[target_language]
+    model_name = translation_models[target_language]
+    # Pipeline task naming is case sensitive; here we assume task "translation_en_to_<lang>"
+    translator = pipeline("translation_en_to_" + target_language.lower(), model=model_name)
+    translator_cache[target_language] = translator
+    return translator
+def get_tts(target_language):
+    if target_language in tts_cache:
+        return tts_cache[target_language]
+    model_name = tts_models[target_language]
+    tts = pipeline("text-to-speech", model=model_name)
+    tts_cache[target_language] = tts
+    return tts
+# --------------------------------------------------
+# Prediction Function
+# --------------------------------------------------
+def predict(audio, text, target_language):
+    # Use text input if provided; otherwise, use ASR on audio
+    if text.strip() != "":
+        english_text = text.strip()
+    elif audio is not None:
+        sample_rate, audio_data = audio
+        # Ensure the audio is floating-point for librosa
+        if audio_data.dtype not in [np.float32, np.float64]:
+            audio_data = audio_data.astype(np.float32)
+        # Convert stereo to mono if needed
+        if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
+            audio_data = np.mean(audio_data, axis=1)
+        # Resample to 16 kHz if necessary
+        if sample_rate != 16000:
+            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+        input_audio = {"array": audio_data, "sampling_rate": 16000}
+        asr_result = asr(input_audio)
+        english_text = asr_result["text"]
+    else:
+        return "No input provided.", "", None
+    # Translation step
+    translator = get_translator(target_language)
+    translation_result = translator(english_text)
+    translated_text = translation_result[0]["translation_text"]
+    # TTS step: synthesize speech from the translated text
+    tts = get_tts(target_language)
+    tts_result = tts(translated_text)
+    # The TTS pipeline returns a dict with "wav" and "sample_rate"
+    synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
+    return english_text, translated_text, synthesized_audio
+# --------------------------------------------------
+# Gradio Interface Setup
+# --------------------------------------------------
+iface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
+        gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
+        gr.Dropdown(choices=list(translation_models.keys()), value="Spanish", label="Target Language")
+    ],
+    outputs=[
+        gr.Textbox(label="English Transcription"),
+        gr.Textbox(label="Translation (Target Language)"),
+        gr.Audio(label="Synthesized Speech in Target Language")
+    ],
+    title="Multimodal Language Learning Aid",
+    description=(
+        "This app helps language learners by providing three outputs:\n"
+        "1. English transcription (from ASR or text input),\n"
+        "2. Translation to a target language, and\n"
+        "3. Synthetic speech in the target language.\n\n"
+        "Choose one of the top 10 commonly used languages from the dropdown.\n"
+        "You can either record/upload an English audio sample or enter English text directly."
+    ),
+    allow_flagging="never"
+)
+if __name__ == "__main__":
+    iface.launch()