Yilin0601 committed on
Commit 63ee3e5 (verified)
Parent(s): 76b5526

Create app.py

Files changed (1): app.py (+134, -0)
app.py ADDED
@@ -0,0 +1,134 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ import librosa
+ from transformers import pipeline
+
+ # --------------------------------------------------
+ # ASR Pipeline (for English transcription)
+ # --------------------------------------------------
+ asr = pipeline(
+     "automatic-speech-recognition",
+     model="facebook/wav2vec2-large-960h-lv60-self"
+ )
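+
+ # Usage sketch (illustrative, not part of the original commit): the ASR
+ # pipeline accepts a dict carrying the waveform and its sampling rate and
+ # returns a dict with a "text" key, e.g.
+ #   asr({"array": np.zeros(16000, dtype=np.float32), "sampling_rate": 16000})
+ #   # -> {"text": "..."}  (shown only as a shape/contract check)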
+
+ # --------------------------------------------------
+ # Mapping for Target Languages and Models
+ # --------------------------------------------------
+ # Marian MT checkpoints; verify each ID on the Hugging Face Hub, since some
+ # pairs (e.g. en-pt, en-ja, en-ko) may not be published under these exact names.
+ translation_models = {
+     "Spanish": "Helsinki-NLP/opus-mt-en-es",
+     "French": "Helsinki-NLP/opus-mt-en-fr",
+     "German": "Helsinki-NLP/opus-mt-en-de",
+     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
+     "Russian": "Helsinki-NLP/opus-mt-en-ru",
+     "Arabic": "Helsinki-NLP/opus-mt-en-ar",
+     "Portuguese": "Helsinki-NLP/opus-mt-en-pt",
+     "Japanese": "Helsinki-NLP/opus-mt-en-ja",
+     "Italian": "Helsinki-NLP/opus-mt-en-it",
+     "Korean": "Helsinki-NLP/opus-mt-en-ko"
+ }
+
+ # NOTE: these are Coqui-TTS style model names, not Hugging Face Hub IDs, so
+ # transformers' "text-to-speech" pipeline cannot load them as written. Each
+ # entry should be swapped for a Hub-hosted TTS checkpoint for that language
+ # (e.g. the facebook/mms-tts-* family, where the language is covered).
+ tts_models = {
+     "Spanish": "tts_models/es/tacotron2-DDC",
+     "French": "tts_models/fr/tacotron2",
+     "German": "tts_models/de/tacotron2",
+     "Chinese": "tts_models/zh/tacotron2",
+     "Russian": "tts_models/ru/tacotron2",
+     "Arabic": "tts_models/ar/tacotron2",
+     "Portuguese": "tts_models/pt/tacotron2",
+     "Japanese": "tts_models/ja/tacotron2",
+     "Italian": "tts_models/it/tacotron2",
+     "Korean": "tts_models/ko/tacotron2"
+ }
+
+ # Caches for translator and TTS pipelines
+ translator_cache = {}
+ tts_cache = {}
+
+ def get_translator(target_language):
+     if target_language in translator_cache:
+         return translator_cache[target_language]
+     model_name = translation_models[target_language]
+     # Use the generic "translation" task; the Marian checkpoint itself fixes
+     # the language pair, so no "translation_en_to_<lang>" task name is needed.
+     translator = pipeline("translation", model=model_name)
+     translator_cache[target_language] = translator
+     return translator
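+
+ # Illustrative smoke test (an assumption, not part of the original app); it is
+ # only defined here and never called at import time.
+ def _demo_translation(sample_text="Good morning, how are you?"):
+     translator = get_translator("Spanish")
+     return translator(sample_text)[0]["translation_text"]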
+
+ def get_tts(target_language):
+     if target_language in tts_cache:
+         return tts_cache[target_language]
+     model_name = tts_models[target_language]
+     # "text-to-speech" is a valid transformers pipeline task, but it needs a
+     # Hub-hosted checkpoint; see the note on tts_models above.
+     tts = pipeline("text-to-speech", model=model_name)
+     tts_cache[target_language] = tts
+     return tts
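+
+ # Illustrative sketch: with a Hub checkpoint such as "facebook/mms-tts-spa"
+ # (assumed to be available), the text-to-speech pipeline returns a dict of
+ # the form {"audio": np.ndarray, "sampling_rate": int}, e.g.
+ #   mms = pipeline("text-to-speech", model="facebook/mms-tts-spa")
+ #   out = mms("Hola, ¿cómo estás?")
+ #   wav, sr = np.squeeze(out["audio"]), out["sampling_rate"]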
+
+ # --------------------------------------------------
+ # Prediction Function
+ # --------------------------------------------------
+ def predict(audio, text, target_language):
+     # Use text input if provided; otherwise, run ASR on the audio
+     if text.strip() != "":
+         english_text = text.strip()
+     elif audio is not None:
+         sample_rate, audio_data = audio
+
+         # Ensure the audio is floating-point for librosa, scaling integer PCM
+         # (Gradio's default) down to [-1, 1]
+         if np.issubdtype(audio_data.dtype, np.integer):
+             audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
+         elif audio_data.dtype != np.float32:
+             audio_data = audio_data.astype(np.float32)
+
+         # Convert stereo to mono if needed
+         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
+             audio_data = np.mean(audio_data, axis=1)
+
+         # Resample to the 16 kHz rate wav2vec2 expects
+         if sample_rate != 16000:
+             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+
+         input_audio = {"array": audio_data, "sampling_rate": 16000}
+         asr_result = asr(input_audio)
+         english_text = asr_result["text"]
+     else:
+         return "No input provided.", "", None
+
+     # Translation step
+     translator = get_translator(target_language)
+     translation_result = translator(english_text)
+     translated_text = translation_result[0]["translation_text"]
+
+     # TTS step: synthesize speech from the translated text
+     tts = get_tts(target_language)
+     tts_result = tts(translated_text)
+     # transformers' text-to-speech pipeline returns {"audio": ..., "sampling_rate": ...}
+     # (not "wav"/"sample_rate"); squeeze in case the model emits a
+     # (1, num_samples) array, since gr.Audio expects (rate, 1-D array)
+     synthesized_audio = (tts_result["sampling_rate"], np.squeeze(tts_result["audio"]))
+
+     return english_text, translated_text, synthesized_audio
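+
+ # Illustrative call (an assumption, shown as a contract check): with
+ # gr.Audio(type="numpy"), `audio` arrives as a (sample_rate, np.ndarray)
+ # tuple; text-only input looks like
+ #   predict(None, "Hello world", "Spanish")
+ #   # -> ("Hello world", "<Spanish translation>", (sr, wav_array))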
105
+
106
+ # --------------------------------------------------
107
+ # Gradio Interface Setup
108
+ # --------------------------------------------------
109
+ iface = gr.Interface(
110
+ fn=predict,
111
+ inputs=[
112
+ gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
113
+ gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
114
+ gr.Dropdown(choices=list(translation_models.keys()), value="Spanish", label="Target Language")
115
+ ],
116
+ outputs=[
117
+ gr.Textbox(label="English Transcription"),
118
+ gr.Textbox(label="Translation (Target Language)"),
119
+ gr.Audio(label="Synthesized Speech in Target Language")
120
+ ],
121
+ title="Multimodal Language Learning Aid",
122
+ description=(
123
+ "This app helps language learners by providing three outputs:\n"
124
+ "1. English transcription (from ASR or text input),\n"
125
+ "2. Translation to a target language, and\n"
126
+ "3. Synthetic speech in the target language.\n\n"
127
+ "Choose one of the top 10 commonly used languages from the dropdown.\n"
128
+ "You can either record/upload an English audio sample or enter English text directly."
129
+ ),
130
+ allow_flagging="never"
131
+ )
+
+ if __name__ == "__main__":
+     iface.launch()