Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Community

Yilin0601 committed on Mar 26

Commit

cc8959c

verified ·

1 Parent(s): 5fb2e7c

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -69

app.py CHANGED Viewed

@@ -2,11 +2,12 @@ import gradio as gr
 import torch
 import numpy as np
 import librosa
 from transformers import pipeline, VitsModel, AutoTokenizer
-import scipy  # if needed for processing
 # ------------------------------------------------------
-# 1. ASR Pipeline (English) using Wav2Vec2
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
@@ -29,36 +30,33 @@ translation_tasks = {
 }
 # ------------------------------------------------------
-# 3. TTS Model Configurations
-#    - Spanish: facebook/mms-tts-spa
-#    - Chinese: myshell-ai/MeloTTS-Chinese
-#    - Japanese: myshell-ai/MeloTTS-Japanese
 # ------------------------------------------------------
-tts_config = {
-    "Spanish": {
-        "model_id": "facebook/mms-tts-spa",
-        "architecture": "vits"
-    },
-    "Chinese": {
-        "model_id": "myshell-ai/MeloTTS-Chinese",
-        "architecture": "vits"
-    },
-    "Japanese": {
-        "model_id": "myshell-ai/MeloTTS-Japanese",
-        "architecture": "vits"
-    }
 }
 # ------------------------------------------------------
-# 4. Caches
 # ------------------------------------------------------
 translator_cache = {}
-tts_model_cache = {}  # store (model, tokenizer, architecture)
-# ------------------------------------------------------
-# 5. Translator Helper
-# ------------------------------------------------------
 def get_translator(lang):
     if lang in translator_cache:
         return translator_cache[lang]
     model_name = translation_models[lang]
@@ -67,66 +65,91 @@ def get_translator(lang):
     translator_cache[lang] = translator
     return translator
-# ------------------------------------------------------
-# 6. TTS Loading Helper
-# ------------------------------------------------------
-def get_tts_model(lang):
     """
-    Loads (model, tokenizer, architecture) from Hugging Face once, then caches.
     """
-    if lang in tts_model_cache:
-        return tts_model_cache[lang]
-    config = tts_config.get(lang)
-    if config is None:
-        raise ValueError(f"No TTS config found for language: {lang}")
-    model_id = config["model_id"]
-    arch = config["architecture"]
     try:
-        # Attempt VITS-based loading
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
-        raise RuntimeError(f"Failed to load TTS model {model_id}: {e}")
-    tts_model_cache[lang] = (model, tokenizer, arch)
-    return tts_model_cache[lang]
 # ------------------------------------------------------
-# 7. TTS Inference Helper
 # ------------------------------------------------------
-def run_tts_inference(lang, text):
     """
-    Generates waveform using the loaded TTS model and tokenizer.
-    Returns (sample_rate, np_array).
     """
-    model, tokenizer, arch = get_tts_model(lang)
     inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         output = model(**inputs)
-    # VitsModel output is typically `.waveform`
     if not hasattr(output, "waveform"):
-        raise RuntimeError("TTS model output does not contain 'waveform' attribute.")
-    waveform_tensor = output.waveform
-    waveform = waveform_tensor.squeeze().cpu().numpy()
-    # Typically 16 kHz for these VITS models
     sample_rate = 16000
-    return (sample_rate, waveform)
 # ------------------------------------------------------
-# 8. Prediction Function
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
-    1. Obtain English text (ASR with Wav2Vec2 or text input).
-    2. Translate English -> target_language.
-    3. TTS for that language (using configured models).
     """
     # Step 1: English text
     if text.strip():
@@ -138,7 +161,7 @@ def predict(audio, text, target_language):
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
-        # Stereo -> mono if needed
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
@@ -162,15 +185,18 @@ def predict(audio, text, target_language):
     # Step 3: TTS
     try:
-        sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
-        # Return error info in place of audio
         return english_text, translated_text, f"TTS error: {e}"
-    return english_text, translated_text, (sample_rate, waveform)
 # ------------------------------------------------------
-# 9. Gradio Interface
 # ------------------------------------------------------
 iface = gr.Interface(
     fn=predict,
@@ -187,11 +213,15 @@ iface = gr.Interface(
     title="Multimodal Language Learning Aid",
     description=(
         "1. Transcribes English speech using Wav2Vec2 (or takes English text).\n"
-        "2. Translates to Spanish, Chinese, or Japanese (Helsinki-NLP models).\n"
-        "3. Provides synthetic speech with TTS models.\n"
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 import torch
 import numpy as np
 import librosa
+import soundfile as sf  # likely needed by the pipeline or local saving
 from transformers import pipeline, VitsModel, AutoTokenizer
+from datasets import load_dataset
 # ------------------------------------------------------
+# 1. ASR Pipeline (English) - Wav2Vec2
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
 }
 # ------------------------------------------------------
+# 3. TTS Configuration
+#    - Spanish: VITS-based MMS TTS
+#    - Chinese & Japanese: Microsoft SpeechT5
 # ------------------------------------------------------
+# Language-name constants used to select the TTS backend (predict() checks target_language against SPANISH_KEY)
+SPANISH_KEY = "Spanish"
+CHINESE_KEY = "Chinese"
+JAPANESE_KEY = "Japanese"
+# VITS config for Spanish only
+mms_spanish_config = {
+    "model_id": "facebook/mms-tts-spa",
+    "architecture": "vits"
 }
 # ------------------------------------------------------
+# 4. Create TTS Pipelines / Models Once (Caching)
 # ------------------------------------------------------
 translator_cache = {}
+vits_model_cache = None  # for Spanish
+speech_t5_pipeline_cache = None  # for Chinese/Japanese
+speech_t5_speaker_embedding = None
 def get_translator(lang):
+    """
+    Return a cached MarianMT translator for the specified language.
+    """
     if lang in translator_cache:
         return translator_cache[lang]
     model_name = translation_models[lang]
     translator_cache[lang] = translator
     return translator
+def load_spanish_vits():
     """
+    Load and cache the Spanish VITS model + tokenizer (facebook/mms-tts-spa).
     """
+    global vits_model_cache
+    if vits_model_cache is not None:
+        return vits_model_cache
     try:
+        model_id = mms_spanish_config["model_id"]
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
+        vits_model_cache = (model, tokenizer)
+    except Exception as e:
+        raise RuntimeError(f"Failed to load Spanish TTS model {mms_spanish_config['model_id']}: {e}")
+    return vits_model_cache
+def load_speech_t5_pipeline():
+    """
+    Load and cache the Microsoft SpeechT5 text-to-speech pipeline
+    and a default speaker embedding.
+    """
+    global speech_t5_pipeline_cache, speech_t5_speaker_embedding
+    if speech_t5_pipeline_cache is not None and speech_t5_speaker_embedding is not None:
+        return speech_t5_pipeline_cache, speech_t5_speaker_embedding
+    try:
+        # Create the pipeline
+        # "text-to-speech" is an alias of the "text-to-audio" pipeline task; it needs a recent Transformers release (believed v4.32+ - TODO confirm)
+        t5_pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts")
     except Exception as e:
+        raise RuntimeError(f"Failed to load Microsoft SpeechT5 pipeline: {e}")
+    # Load a default speaker embedding
+    try:
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        # Just pick an arbitrary index for speaker embedding
+        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+    except Exception as e:
+        raise RuntimeError(f"Failed to load default speaker embedding: {e}")
+    speech_t5_pipeline_cache = t5_pipe
+    speech_t5_speaker_embedding = speaker_embedding
+    return t5_pipe, speaker_embedding
 # ------------------------------------------------------
+# 5. TTS Inference Helpers
 # ------------------------------------------------------
+def run_vits_inference(text):
     """
+    For Spanish TTS using MMS (facebook/mms-tts-spa).
     """
+    model, tokenizer = load_spanish_vits()
     inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         output = model(**inputs)
     if not hasattr(output, "waveform"):
+        raise RuntimeError("VITS output does not contain 'waveform'.")
+    waveform = output.waveform.squeeze().cpu().numpy()
     sample_rate = 16000
+    return sample_rate, waveform
+def run_speecht5_inference(text):
+    """
+    For Chinese & Japanese TTS using Microsoft SpeechT5 pipeline.
+    """
+    t5_pipe, speaker_embedding = load_speech_t5_pipeline()
+    # The pipeline returns a dict with 'audio' (numpy) and 'sampling_rate'
+    result = t5_pipe(
+        text,
+        forward_params={"speaker_embeddings": speaker_embedding}
+    )
+    waveform = result["audio"]
+    sample_rate = result["sampling_rate"]
+    return sample_rate, waveform
 # ------------------------------------------------------
+# 6. Main Prediction Function
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
+    1. Get English text (ASR if audio provided, else text).
+    2. Translate to target_language.
+    3. TTS with the chosen approach (VITS for Spanish, SpeechT5 for Chinese/Japanese).
     """
     # Step 1: English text
     if text.strip():
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
+        # Stereo -> mono
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
     # Step 3: TTS
     try:
+        if target_language == SPANISH_KEY:
+            sr, waveform = run_vits_inference(translated_text)
+        else:
+            # Chinese or Japanese -> SpeechT5
+            sr, waveform = run_speecht5_inference(translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
+    return english_text, translated_text, (sr, waveform)
 # ------------------------------------------------------
+# 7. Gradio Interface
 # ------------------------------------------------------
 iface = gr.Interface(
     fn=predict,
     title="Multimodal Language Learning Aid",
     description=(
         "1. Transcribes English speech using Wav2Vec2 (or takes English text).\n"
+        "2. Translates to Spanish, Chinese, or Japanese (via Helsinki-NLP models).\n"
+        "3. Synthesizes speech:\n"
+        "   - Spanish -> facebook/mms-tts-spa (VITS)\n"
+        "   - Chinese & Japanese -> microsoft/speecht5_tts (SpeechT5)\n\n"
+        "Note: SpeechT5 is not officially trained for Japanese, so results may vary.\n"
+        "You can also try inputting short, clear audio for best ASR results."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)