# NOTE(review): the lines below were Hugging Face Spaces page residue
# ("Spaces: / Sleeping") captured when this file was scraped; kept here
# as a comment so the module remains valid Python.
import tempfile

import gradio as gr
import torch
import torchaudio
# Human-readable language names mapped to the target-language codes
# (ISO 639-3 style) that the on-device translation models expect.
languages = {
    "English": "eng",
    "Hindi": "hin",
    "Portuguese": "por",
    "Russian": "rus",
    "Spanish": "spa",
}
def speech_to_text(audio_data, tgt_lang):
    """Transcribe speech from an audio file into text in the target language.

    Args:
        audio_data: Path to an audio file readable by ``torchaudio.load``
            (Gradio passes the uploaded/recorded file path here).
        tgt_lang: Human-readable language name; must be a key of ``languages``.

    Returns:
        The text emitted by the on-device S2T TorchScript model.

    Raises:
        KeyError: If ``tgt_lang`` is not one of the supported languages.
    """
    # Load the TorchScript model once and cache it on the function object:
    # the original reloaded it from disk on every request, which is slow
    # and allocates a fresh model per call.
    if not hasattr(speech_to_text, "_model"):
        speech_to_text._model = torch.jit.load("unity_on_device_s2t.ptl")
    audio_input, _ = torchaudio.load(audio_data)
    with torch.no_grad():  # inference only — no autograd bookkeeping needed
        text = speech_to_text._model(audio_input, tgt_lang=languages[tgt_lang])
    return text
def speech_to_speech_translation(audio_data, tgt_lang):
    """Translate speech from an audio file into speech in the target language.

    Args:
        audio_data: Path to an audio file readable by ``torchaudio.load``.
        tgt_lang: Human-readable language name; must be a key of ``languages``.

    Returns:
        A ``(text, output_file)`` tuple: the translated text and the path of
        a 16 kHz WAV file containing the synthesized translated speech.

    Raises:
        KeyError: If ``tgt_lang`` is not one of the supported languages.
    """
    # Cache the TorchScript model on the function object so repeated Gradio
    # requests don't pay the disk-load cost every call.
    if not hasattr(speech_to_speech_translation, "_model"):
        speech_to_speech_translation._model = torch.jit.load("unity_on_device.ptl")
    audio_input, _ = torchaudio.load(audio_data)
    with torch.no_grad():  # inference only
        text, units, waveform = speech_to_speech_translation._model(
            audio_input, tgt_lang=languages[tgt_lang]
        )
    # Use a unique temp file instead of a hard-coded "/tmp/result.wav":
    # portable (Windows has no /tmp) and safe when two requests run
    # concurrently — previously every request clobbered the same file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    # Model emits a 1-D waveform; torchaudio.save expects (channels, frames).
    torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)
    return text, output_file
# Gradio interface for the speech-to-text tool: one audio input plus a
# language selector, producing plain text.
iface_s2t = gr.Interface(
    speech_to_text,
    inputs=[
        gr.Audio(label="Upload or Record Audio for Speech to Text"),
        gr.Dropdown(list(languages.keys()), label="Select Target Language"),
    ],
    outputs="text",
    title="Speech to Text",
)
# Gradio interface for the speech-to-speech tool: same inputs as the S2T
# interface, but it returns both the translated text and an audio file.
iface_s2st = gr.Interface(
    speech_to_speech_translation,
    inputs=[
        gr.Audio(label="Upload or Record Audio for Speech to Speech Translation"),
        gr.Dropdown(list(languages.keys()), label="Select Target Language"),
    ],
    outputs=["text", "audio"],
    title="Speech to Speech Translation",
)
# Combine the two interfaces into one tabbed app.
#
# BUG FIX: gr.Accordion is a layout container for use inside gr.Blocks — it
# does not accept interfaces as positional arguments, has no `labels=`
# parameter, and has no `.launch()` method, so the original code raised at
# startup. gr.TabbedInterface is the supported way to present multiple
# Interfaces side by side.
demo = gr.TabbedInterface(
    [iface_s2t, iface_s2st],
    tab_names=["Speech to Text", "Speech to Speech Translation"],
)

# Launch the application.
demo.launch()