import gradio as gr
import librosa

from asr import transcribe
from tts import synthesize, TTS_EXAMPLES
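
# Supported languages per task, read from data/<task>/all_langs.tsv.
# Each row is assumed to hold an ISO 639-3 code followed by the language
# name, e.g. "shn\tShan" (tab- or space-separated).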
ALL_LANGUAGES = {}
for task in ["asr", "tts", "lid"]:
    ALL_LANGUAGES.setdefault(task, {})
    with open(f"data/{task}/all_langs.tsv") as f:
        for line in f:
            # Strip the trailing newline and split on the first whitespace run,
            # so the "shn: Shan"-style labels built below match the dropdown defaults.
            iso, name = line.strip().split(maxsplit=1)
            ALL_LANGUAGES[task][iso] = name
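

# Language identification: load the clip at the expected 16 kHz mono input
# rate and return {"<iso>: <name>": score} pairs for the gr.Label output.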
def identify(microphone, file_upload):
    LID_SAMPLING_RATE = 16_000

    # warn_output is assembled for the dual-input case but is not currently
    # surfaced, since the gr.Label output only renders the score dict.
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    # Prefer the microphone recording when both inputs are present.
    audio_fp = microphone if microphone is not None else file_upload
    inputs = librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)[0]

    # Placeholder scores standing in for the LID model's predictions on `inputs`.
    raw_output = {"eng": 0.9, "hin": 0.04, "heb": 0.03, "ara": 0.02, "fra": 0.01}
    return {f"{k}: {ALL_LANGUAGES['lid'][k]}": v for k, v in raw_output.items()}
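

# Top-level container; the tabbed interface is attached inside `with demo:` below.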
demo = gr.Blocks()
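
# Speech-to-text tab: microphone or uploaded audio in, transcription out.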
mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(
            [f"{k}: {v}" for k, v in ALL_LANGUAGES["asr"].items()],
            label="Language",
            value="shn: Shan",
        ),
    ],
    outputs="text",
    title="Speech-to-text",
    description="Transcribe audio!",
    allow_flagging="never",
)
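
# Text-to-speech tab: text, target language, and speaking speed in; generated
# audio plus the OOV-filtered text out.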
mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(
            [f"{k}: {v}" for k, v in ALL_LANGUAGES["tts"].items()],
            label="Language",
            value="shn: Shan",
        ),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Text(label="Filtered text after removing OOVs"),
    ],
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description="Generate audio!",
    allow_flagging="never",
)
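
# Language-identification tab: audio in, top-10 language scores out.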
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=10),
    title="Language Identification",
    description="Identify the language of audio!",
    allow_flagging="never",
)
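
# Assemble the three interfaces as tabs inside the Blocks container and serve.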
with demo:
    gr.TabbedInterface(
        [mms_transcribe, mms_synthesize, mms_identify],
        ["Speech-to-text", "Text-to-speech", "Language Identification"],
    )

demo.launch()