"""Gradio demo for on-device speech-to-text and speech-to-speech translation.

Loads TorchScript ("unity_on_device*") models from the working directory and
exposes them behind two tabs: plain transcription/translation to text, and
full speech-to-speech translation.
"""

import gradio as gr
import torch
import torchaudio

# Display name -> model language code for the supported target languages.
languages = {
    "English": "eng",
    "Hindi": "hin",
    "Portuguese": "por",
    "Russian": "rus",
    "Spanish": "spa",
}


def speech_to_text(audio_data, tgt_lang):
    """Transcribe/translate an audio file into text in the target language.

    Args:
        audio_data: Path to an audio file (Gradio supplies this because the
            Audio component uses type="filepath").
        tgt_lang: Display name of the target language (a key of ``languages``).

    Returns:
        The text produced by the s2t model.
    """
    audio_input, _ = torchaudio.load(audio_data)
    s2t_model = torch.jit.load("unity_on_device_s2t.ptl")
    with torch.no_grad():
        text = s2t_model(audio_input, tgt_lang=languages[tgt_lang])
    return text


def speech_to_speech_translation(audio_data, tgt_lang):
    """Translate an audio file into speech in the target language.

    Args:
        audio_data: Path to an audio file (see ``speech_to_text``).
        tgt_lang: Display name of the target language (a key of ``languages``).

    Returns:
        A ``(text, wav_path)`` tuple: the translated text and the path of the
        synthesized waveform written to disk.
    """
    audio_input, _ = torchaudio.load(audio_data)
    s2st_model = torch.jit.load("unity_on_device.ptl")
    with torch.no_grad():
        text, units, waveform = s2st_model(audio_input, tgt_lang=languages[tgt_lang])
    output_file = "/tmp/result.wav"
    # The model returns a 1-D waveform; torchaudio.save expects (channels, time).
    # NOTE(review): assumes the model emits 16 kHz audio — confirm against the
    # model card before changing sample_rate.
    torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)
    return text, output_file


# Gradio interfaces.
# FIX: gr.inputs.* was removed in Gradio 3.x — use the top-level components.
# type="filepath" makes Gradio hand the callbacks a file path, which is what
# torchaudio.load expects (the default would pass a (sample_rate, data) tuple).
iface_s2t = gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(type="filepath", label="Upload or Record Audio for Speech to Text"),
        gr.Dropdown(list(languages.keys()), label="Select Target Language"),
    ],
    outputs="text",
    title="Speech to Text",
)

iface_s2st = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Upload or Record Audio for Speech to Speech Translation",
        ),
        gr.Dropdown(list(languages.keys()), label="Select Target Language"),
    ],
    outputs=["text", "audio"],
    title="Speech to Speech Translation",
)

# FIX: gr.Accordion is a layout element inside a Blocks context — it does not
# accept Interface objects and cannot be launched. TabbedInterface is the
# supported way to combine multiple interfaces into one app.
demo = gr.TabbedInterface(
    [iface_s2t, iface_s2st],
    tab_names=["Speech to Text", "Speech to Speech Translation"],
)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()