Yilin0601 committed (verified)
Commit be4098e · 1 Parent(s): 1ce7fad

Update app.py

Files changed (1)
  1. app.py +26 -39
app.py CHANGED
@@ -3,7 +3,7 @@ import torch
 import numpy as np
 import librosa
 from transformers import pipeline
-from melo.api import TTS
+import scipy  # imported if needed for processing
 
 # --------------------------------------------------
 # ASR Pipeline (for English transcription)
@@ -14,7 +14,7 @@ asr = pipeline(
 )
 
 # --------------------------------------------------
-# Mapping for Target Languages and Models
+# Mapping for Target Languages and Translation Pipelines
 # --------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
@@ -29,8 +29,6 @@ translation_models = {
     "Korean": "Helsinki-NLP/opus-mt-en-ko"
 }
 
-# Each language often requires a specific pipeline task name
-# (e.g., "translation_en_to_zh" rather than "translation_en_to_chinese")
 translation_tasks = {
     "Spanish": "translation_en_to_es",
     "French": "translation_en_to_fr",
@@ -44,18 +42,20 @@ translation_tasks = {
     "Korean": "translation_en_to_ko"
 }
 
-# TTS models (some may not exist or may be unofficial)
+# --------------------------------------------------
+# TTS Models (using real Facebook MMS TTS & others)
+# --------------------------------------------------
 tts_models = {
-    "Spanish": "myshell-ai/MeloTTS-Spanish",
-    "French": "myshell-ai/MeloTTS-French",
-    "German": "tts_models/de/tacotron2",
-    "Chinese": "myshell-ai/MeloTTS-English-v2",  # Verify if this actually exists on Hugging Face
-    "Russian": "tts_models/ru/tacotron2",  # Same note
-    "Arabic": "tts_models/ar/tacotron2",  # Same note
-    "Portuguese": "tts_models/pt/tacotron2",  # Same note
-    "Japanese": "myshell-ai/MeloTTS-Japanese",  # Same note
-    "Italian": "tts_models/it/tacotron2",  # Same note
-    "Korean": "myshell-ai/MeloTTS-Korean"  # Same note
+    "Spanish": "facebook/mms-tts-spa",
+    "French": "facebook/mms-tts-fra",
+    "German": "facebook/mms-tts-deu",
+    "Chinese": "facebook/mms-tts-che",
+    "Russian": "facebook/mms-tts-rus",
+    "Arabic": "facebook/mms-tts-ara",
+    "Portuguese": "facebook/mms-tts-por",
+    "Japanese": "esnya/japanese_speecht5_tts",
+    "Italian": "tts_models/it/tacotron2",
+    "Korean": "facebook/mms-tts-kor"
 }
 
 # --------------------------------------------------
@@ -73,31 +73,28 @@ def get_translator(target_language):
 
     model_name = translation_models[target_language]
     task_name = translation_tasks[target_language]
-
     translator = pipeline(task_name, model=model_name)
     translator_cache[target_language] = translator
     return translator
 
 def get_tts(target_language):
     """
-    Retrieve or create a TTS pipeline for the specified language, if available.
+    Retrieve or create a TTS pipeline for the specified language.
     """
     if target_language in tts_cache:
         return tts_cache[target_language]
 
     model_name = tts_models.get(target_language)
     if model_name is None:
-        # If no TTS model is mapped, raise an error or handle gracefully
         raise ValueError(f"No TTS model available for {target_language}.")
-
+
     try:
         tts_pipeline = pipeline("text-to-speech", model=model_name)
     except Exception as e:
         raise ValueError(
-            f"Failed to load TTS model for {target_language}. "
-            f"Make sure '{model_name}' exists on Hugging Face.\nError: {e}"
+            f"Failed to load TTS model for {target_language} with model '{model_name}'.\nError: {e}"
         )
-
+
     tts_cache[target_language] = tts_pipeline
     return tts_pipeline
 
@@ -110,47 +107,38 @@ def predict(audio, text, target_language):
     2. Translate English -> target_language.
     3. Synthesize speech in target_language.
     """
-    # 1. English text from text input (if provided), else from audio via ASR
+    # Step 1: Get English text from text input (if provided) or from ASR.
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
-
-        # Ensure the audio is float32 for librosa
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
-
-        # Convert stereo to mono if needed
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
-
-        # Resample to 16 kHz if necessary
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-
         input_audio = {"array": audio_data, "sampling_rate": 16000}
         asr_result = asr(input_audio)
         english_text = asr_result["text"]
     else:
         return "No input provided.", "", None
 
-    # 2. Translation step
+    # Step 2: Translation
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
         translated_text = translation_result[0]["translation_text"]
     except Exception as e:
-        # If there's an error in translation, return partial results
         return english_text, f"Translation error: {e}", None
 
-    # 3. TTS step: synthesize speech from the translated text
+    # Step 3: TTS synthesis using Facebook MMS TTS (or alternative) pipeline.
    try:
        tts_pipeline = get_tts(target_language)
        tts_result = tts_pipeline(translated_text)
-        # The TTS pipeline returns a dict with "wav" and "sample_rate"
+        # Expected output: a dict with "wav" and "sample_rate"
        synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
    except Exception as e:
-        # If TTS fails, return partial results
        return english_text, translated_text, f"TTS error: {e}"
 
     return english_text, translated_text, synthesized_audio
@@ -172,13 +160,12 @@ iface = gr.Interface(
     ],
     title="Multimodal Language Learning Aid",
     description=(
-        "This app helps language learners by providing three outputs:\n"
+        "This app provides three outputs:\n"
         "1. English transcription (from ASR or text input),\n"
         "2. Translation to a target language (using Helsinki-NLP models), and\n"
-        "3. Synthetic speech in the target language.\n\n"
+        "3. Synthetic speech in the target language (using Facebook MMS TTS or equivalent).\n\n"
         "Select one of the top 10 commonly used languages from the dropdown.\n"
-        "Either record/upload an English audio sample or enter English text directly.\n\n"
-        "Note: Some TTS models may not exist or be unstable for certain languages."
+        "Either record/upload an English audio sample or enter English text directly."
     ),
     allow_flagging="never"
 )
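Since the new mapping points mainly at facebook/mms-tts-* checkpoints, it is worth verifying what the transformers "text-to-speech" pipeline actually returns for them: in recent releases the output is typically a dict keyed by "audio" and "sampling_rate", not the "wav"/"sample_rate" keys that predict() reads. Below is a minimal sketch, not part of this commit, of adapting that output to the (sample_rate, waveform) tuple a Gradio audio output expects; confirm the exact keys against the installed transformers version.

```python
# Minimal sketch (not part of this commit): adapt MMS TTS pipeline output for gr.Audio.
# Assumption: the "text-to-speech" pipeline returns {"audio": ndarray, "sampling_rate": int}.
import numpy as np
from transformers import pipeline

tts = pipeline("text-to-speech", model="facebook/mms-tts-spa")
result = tts("Hola, ¿cómo estás?")

waveform = np.asarray(result["audio"]).squeeze()  # VITS/MMS output may be shaped (1, n)
sampling_rate = result["sampling_rate"]

# Gradio's numpy-style audio output expects a (sample_rate, waveform) tuple.
synthesized_audio = (sampling_rate, waveform)
```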
 
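For reference, the audio branch of predict() can be exercised on its own. The following is a hypothetical helper (prepare_for_asr does not exist in the commit) that mirrors the same preprocessing, assuming Gradio delivers audio as a (sample_rate, numpy array) tuple and the ASR pipeline accepts an {"array", "sampling_rate"} dict:

```python
# Hypothetical helper mirroring the preprocessing in predict(); illustration only.
import numpy as np
import librosa

def prepare_for_asr(audio, target_sr=16000):
    """Convert a Gradio (sample_rate, data) tuple into the dict passed to the ASR pipeline."""
    sample_rate, audio_data = audio
    if audio_data.dtype not in (np.float32, np.float64):
        # Whisper-style models generally expect waveforms in [-1, 1]; for int16 PCM input,
        # an additional rescale such as `audio_data / 32768.0` may be needed (assumption).
        audio_data = audio_data.astype(np.float32)
    if audio_data.ndim > 1 and audio_data.shape[1] > 1:
        audio_data = np.mean(audio_data, axis=1)  # stereo -> mono
    if sample_rate != target_sr:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=target_sr)
    return {"array": audio_data, "sampling_rate": target_sr}
```

With such a helper, the ASR step reduces to english_text = asr(prepare_for_asr(audio))["text"].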