Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Community

Yilin0601 committed on Mar 27

Commit

e947b77

verified ·

1 Parent(s): 799659c

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -42

app.py CHANGED Viewed

@@ -9,11 +9,11 @@ import os
 from transformers import pipeline, VitsModel, AutoTokenizer
 from datasets import load_dataset
-# For Coqui TTS (XTTS-v2)
 try:
-    from TTS.api import TTS as CoquiTTS
 except ImportError:
-    raise ImportError("Please install Coqui TTS via `pip install TTS`.")
 # ------------------------------------------------------
 # 1. ASR Pipeline (English) using Wav2Vec2
@@ -51,7 +51,7 @@ translation_tasks = {
 # ------------------------------------------------------
 # 3. TTS Configuration
 #    - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
-#    - Coqui XTTS-v2 for: Chinese and Japanese
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
@@ -60,14 +60,8 @@ tts_config = {
     "Turkish": {"model_id": "facebook/mms-tts-tur", "architecture": "vits", "type": "mms"},
     "Portuguese": {"model_id": "facebook/mms-tts-por", "architecture": "vits", "type": "mms"},
     "Korean": {"model_id": "facebook/mms-tts-kor", "architecture": "vits", "type": "mms"},
-    "Chinese": {"type": "coqui"},
-    "Japanese": {"type": "coqui"}
-}
-# For Coqui, we map our languages to language codes expected by the model.
-coqui_lang_map = {
-    "Chinese": "zh",
-    "Japanese": "ja"
 }
 # ------------------------------------------------------
@@ -75,7 +69,7 @@ coqui_lang_map = {
 # ------------------------------------------------------
 translator_cache = {}
 mms_tts_cache = {}     # For MMS (VITS-based) TTS models
-coqui_tts_cache = None  # Single instance for Coqui XTTS-v2
 # ------------------------------------------------------
 # 5. Translator Helper
@@ -116,31 +110,31 @@ def run_mms_tts(text, lang):
     return sample_rate, waveform
 # ------------------------------------------------------
-# 7. Coqui TTS Helper for Chinese and Japanese
 # ------------------------------------------------------
-def load_coqui_tts():
-    global coqui_tts_cache
-    if coqui_tts_cache is not None:
-        return coqui_tts_cache
-    try:
-        # Set gpu=True if a GPU is available.
-        coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
-    except Exception as e:
-        raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
-    return coqui_tts_cache
-def run_coqui_tts(text, lang):
-    coqui_tts = load_coqui_tts()
-    lang_code = coqui_lang_map[lang]  # "zh" for Chinese or "ja" for Japanese
-    # Write the output to a temporary file and then read it back.
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         tmp_name = tmp.name
     try:
-        coqui_tts.tts_to_file(
-            text=text,
-            file_path=tmp_name,
-            language=lang_code  # using default voice; for cloning, add speaker_wav parameter
-        )
         data, sr = sf.read(tmp_name)
     finally:
         if os.path.exists(tmp_name):
@@ -153,8 +147,8 @@ def run_coqui_tts(text, lang):
 def predict(audio, text, target_language):
     """
     1. Obtain English text (via ASR if audio provided, else text).
-    2. Translate English text to target_language.
-    3. Generate TTS audio using either MMS TTS (VITS) or Coqui XTTS-v2.
     """
     # Step 1: Get English text.
     if text.strip():
@@ -186,8 +180,8 @@ def predict(audio, text, target_language):
         tts_type = tts_config[target_language]["type"]
         if tts_type == "mms":
             sr, waveform = run_mms_tts(translated_text, target_language)
-        elif tts_type == "coqui":
-            sr, waveform = run_coqui_tts(translated_text, target_language)
         else:
             raise RuntimeError("Unknown TTS type for target language.")
     except Exception as e:
@@ -218,12 +212,14 @@ iface = gr.Interface(
     description=(
         "This app performs the following steps:\n"
         "1. Transcribes English speech using Wav2Vec2 (or accepts text input).\n"
-        "2. Translates the English text to the target language using Helsinki-NLP models.\n"
-        "3. Provides Synthetic speech:\n"
-        "For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 from transformers import pipeline, VitsModel, AutoTokenizer
 from datasets import load_dataset
+# For MeloTTS (Chinese and Japanese)
 try:
+    from melo.api import TTS as MeloTTS
 except ImportError:
+    raise ImportError("Please install the MeloTTS package (e.g., pip install myshell-ai/MeloTTS-Chinese)")
 # ------------------------------------------------------
 # 1. ASR Pipeline (English) using Wav2Vec2
 # ------------------------------------------------------
 # 3. TTS Configuration
 #    - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
+#    - MeloTTS for: Chinese and Japanese
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
     "Turkish": {"model_id": "facebook/mms-tts-tur", "architecture": "vits", "type": "mms"},
     "Portuguese": {"model_id": "facebook/mms-tts-por", "architecture": "vits", "type": "mms"},
     "Korean": {"model_id": "facebook/mms-tts-kor", "architecture": "vits", "type": "mms"},
+    "Chinese": {"type": "melo"},
+    "Japanese": {"type": "melo"}
 }
 # ------------------------------------------------------
 # ------------------------------------------------------
 translator_cache = {}
 mms_tts_cache = {}     # For MMS (VITS-based) TTS models
+melo_tts_cache = {}    # For MeloTTS models (Chinese/Japanese)
 # ------------------------------------------------------
 # 5. Translator Helper
     return sample_rate, waveform
 # ------------------------------------------------------
+# 7. MeloTTS Helper for Chinese and Japanese
 # ------------------------------------------------------
+def run_melo_tts(text, lang):
+    """
+    Uses the myshell-ai MeloTTS model.
+    For Chinese, use language parameter 'ZH'; for Japanese, use 'JP'.
+    """
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    lang_param = 'ZH' if lang == "Chinese" else 'JP'
+    if lang not in melo_tts_cache:
+        try:
+            model = MeloTTS(language=lang_param, device=device)
+            melo_tts_cache[lang] = model
+        except Exception as e:
+            raise RuntimeError(f"Failed to load MeloTTS model for {lang}: {e}")
+    else:
+        model = melo_tts_cache[lang]
+    speaker_ids = model.hps.data.spk2id
+    # Assume the speaker key is the same as lang_param
+    speaker_key = lang_param
+    speed = 1.0
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         tmp_name = tmp.name
     try:
+        model.tts_to_file(text, speaker_ids[speaker_key], tmp_name, speed=speed)
         data, sr = sf.read(tmp_name)
     finally:
         if os.path.exists(tmp_name):
 def predict(audio, text, target_language):
     """
     1. Obtain English text (via ASR if audio provided, else text).
+    2. Translate the English text to target_language.
+    3. Generate TTS audio using either MMS TTS (VITS) or MeloTTS.
     """
     # Step 1: Get English text.
     if text.strip():
         tts_type = tts_config[target_language]["type"]
         if tts_type == "mms":
             sr, waveform = run_mms_tts(translated_text, target_language)
+        elif tts_type == "melo":
+            sr, waveform = run_melo_tts(translated_text, target_language)
         else:
             raise RuntimeError("Unknown TTS type for target language.")
     except Exception as e:
     description=(
         "This app performs the following steps:\n"
         "1. Transcribes English speech using Wav2Vec2 (or accepts text input).\n"
+        "2. Translates the English text to the target language using Helsinki-NLP MarianMT models.\n"
+        "3. Synthesizes speech:\n"
+        "   - For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean: uses Facebook MMS TTS (VITS-based).\n"
+        "   - For Chinese and Japanese: uses myshell-ai MeloTTS models.\n"
+        "\nSelect your target language from the dropdown."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)