Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Community

Yilin0601 committed on Mar 25

Commit

c098e72

verified ·

1 Parent(s): df9ae3f

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -54

app.py CHANGED Viewed

@@ -3,8 +3,6 @@ import torch
 import numpy as np
 import librosa
 from transformers import pipeline
-from transformers import VitsModel, AutoTokenizer
-import scipy  # imported if needed for processing
 # --------------------------------------------------
 # ASR Pipeline (for English transcription)
@@ -15,48 +13,24 @@ asr = pipeline(
 )
 # --------------------------------------------------
-# Mapping for Target Languages and Translation Pipelines
 # --------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
-    "French": "Helsinki-NLP/opus-mt-en-fr",
-    "German": "Helsinki-NLP/opus-mt-en-de",
     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
-    "Russian": "Helsinki-NLP/opus-mt-en-ru",
-    "Arabic": "Helsinki-NLP/opus-mt-en-ar",
-    "Portuguese": "Helsinki-NLP/opus-mt-en-pt",
-    "Japanese": "Helsinki-NLP/opus-mt-en-ja",
-    "Italian": "Helsinki-NLP/opus-mt-en-it",
-    "Korean": "Helsinki-NLP/opus-mt-en-ko"
 }
 translation_tasks = {
     "Spanish": "translation_en_to_es",
-    "French": "translation_en_to_fr",
-    "German": "translation_en_to_de",
     "Chinese": "translation_en_to_zh",
-    "Russian": "translation_en_to_ru",
-    "Arabic": "translation_en_to_ar",
-    "Portuguese": "translation_en_to_pt",
-    "Japanese": "translation_en_to_ja",
-    "Italian": "translation_en_to_it",
-    "Korean": "translation_en_to_ko"
 }
-# --------------------------------------------------
-# TTS Models (using real Facebook MMS TTS & others)
-# --------------------------------------------------
 tts_models = {
     "Spanish": "facebook/mms-tts-spa",
-    "French": "facebook/mms-tts-fra",
-    "German": "facebook/mms-tts-deu",
     "Chinese": "facebook/mms-tts-che",
-    "Russian": "facebook/mms-tts-rus",
-    "Arabic": "facebook/mms-tts-ara",
-    "Portuguese": "facebook/mms-tts-por",
-    "Japanese": "esnya/japanese_speecht5_tts",
-    "Italian": "tts_models/it/tacotron2",
-    "Korean": "facebook/mms-tts-kor"
 }
 # --------------------------------------------------
@@ -66,12 +40,8 @@ translator_cache = {}
 tts_cache = {}
 def get_translator(target_language):
-    """
-    Retrieve or create a translation pipeline for the specified language.
-    """
     if target_language in translator_cache:
         return translator_cache[target_language]
     model_name = translation_models[target_language]
     task_name = translation_tasks[target_language]
     translator = pipeline(task_name, model=model_name)
@@ -79,23 +49,15 @@ def get_translator(target_language):
     return translator
 def get_tts(target_language):
-    """
-    Retrieve or create a TTS pipeline for the specified language.
-    """
     if target_language in tts_cache:
         return tts_cache[target_language]
     model_name = tts_models.get(target_language)
     if model_name is None:
         raise ValueError(f"No TTS model available for {target_language}.")
     try:
         tts_pipeline = pipeline("text-to-speech", model=model_name)
     except Exception as e:
-        raise ValueError(
-            f"Failed to load TTS model for {target_language} with model '{model_name}'.\nError: {e}"
-        )
     tts_cache[target_language] = tts_pipeline
     return tts_pipeline
@@ -103,12 +65,7 @@ def get_tts(target_language):
 # Prediction Function
 # --------------------------------------------------
 def predict(audio, text, target_language):
-    """
-    1. Obtain English text (from text input or ASR).
-    2. Translate English -> target_language.
-    3. Synthesize speech in target_language.
-    """
-    # Step 1: Get English text from text input (if provided) or from ASR.
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
@@ -125,7 +82,7 @@ def predict(audio, text, target_language):
     else:
         return "No input provided.", "", None
-    # Step 2: Translation
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
@@ -133,11 +90,10 @@ def predict(audio, text, target_language):
     except Exception as e:
         return english_text, f"Translation error: {e}", None
-    # Step 3: TTS synthesis using Facebook MMS TTS (or alternative) pipeline.
     try:
         tts_pipeline = get_tts(target_language)
         tts_result = tts_pipeline(translated_text)
-        # Expected output: a dict with "wav" and "sample_rate"
         synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
@@ -163,9 +119,8 @@ iface = gr.Interface(
     description=(
         "This app provides three outputs:\n"
         "1. English transcription (from ASR or text input),\n"
-        "2. Translation to a target language (using Helsinki-NLP models), and\n"
         "3. Synthetic speech in the target language (using Facebook MMS TTS or equivalent).\n\n"
-        "Select one of the top 10 commonly used languages from the dropdown.\n"
         "Either record/upload an English audio sample or enter English text directly."
     ),
     allow_flagging="never"

 import numpy as np
 import librosa
 from transformers import pipeline
 # --------------------------------------------------
 # ASR Pipeline (for English transcription)
 )
 # --------------------------------------------------
+# Mapping for Target Languages (Spanish, Chinese, Japanese)
 # --------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
+    "Japanese": "Helsinki-NLP/opus-mt-en-ja"
 }
 translation_tasks = {
     "Spanish": "translation_en_to_es",
     "Chinese": "translation_en_to_zh",
+    "Japanese": "translation_en_to_ja"
 }
 tts_models = {
     "Spanish": "facebook/mms-tts-spa",
     "Chinese": "facebook/mms-tts-che",
+    "Japanese": "esnya/japanese_speecht5_tts"
 }
 # --------------------------------------------------
 tts_cache = {}
 def get_translator(target_language):
     if target_language in translator_cache:
         return translator_cache[target_language]
     model_name = translation_models[target_language]
     task_name = translation_tasks[target_language]
     translator = pipeline(task_name, model=model_name)
     return translator
 def get_tts(target_language):
     if target_language in tts_cache:
         return tts_cache[target_language]
     model_name = tts_models.get(target_language)
     if model_name is None:
         raise ValueError(f"No TTS model available for {target_language}.")
     try:
         tts_pipeline = pipeline("text-to-speech", model=model_name)
     except Exception as e:
+        raise ValueError(f"Failed to load TTS model for {target_language} with model '{model_name}'.\nError: {e}")
     tts_cache[target_language] = tts_pipeline
     return tts_pipeline
 # Prediction Function
 # --------------------------------------------------
 def predict(audio, text, target_language):
+    # Step 1: Obtain English text from text input if provided, otherwise use ASR.
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
     else:
         return "No input provided.", "", None
+    # Step 2: Translate the English text to the target language.
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
     except Exception as e:
         return english_text, f"Translation error: {e}", None
+    # Step 3: Synthesize speech using the TTS pipeline.
     try:
         tts_pipeline = get_tts(target_language)
         tts_result = tts_pipeline(translated_text)
         synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     description=(
         "This app provides three outputs:\n"
         "1. English transcription (from ASR or text input),\n"
+        "2. Translation to Spanish, Chinese, or Japanese (using Helsinki-NLP models), and\n"
         "3. Synthetic speech in the target language (using Facebook MMS TTS or equivalent).\n\n"
         "Either record/upload an English audio sample or enter English text directly."
     ),
     allow_flagging="never"