Tonic's picture
Update app.py
8fa6df8
raw
history blame
1.32 kB
import functools
import tempfile

import gradio as gr
import torch
import torchaudio
@functools.lru_cache(maxsize=1)
def _load_s2t_model():
    """Load the speech-to-text TorchScript model once and reuse it."""
    return torch.jit.load("unity_on_device_s2t.ptl")


def speech_to_text(audio_file, tgt_lang=None):
    """Transcribe or translate an uploaded audio file to text.

    Args:
        audio_file: Either a filesystem path (``str``) or a file-like
            object exposing the path as ``.name``; which one Gradio
            passes depends on how the ``gr.Audio`` input is configured.
        tgt_lang: Target language code forwarded to the model. When
            omitted, a module-level ``TGT_LANG`` is used if one exists,
            otherwise ``"eng"``.

    Returns:
        Whatever the model returns for the text output.

    Raises:
        ValueError: If no audio file was provided.
    """
    if audio_file is None:
        raise ValueError("No audio file was provided.")

    audio_path = audio_file if isinstance(audio_file, str) else audio_file.name

    if tgt_lang is None:
        # TGT_LANG is not defined in this file; fall back instead of raising
        # NameError. NOTE(review): assumes "eng" is a language code the model
        # accepts - confirm against the model card.
        tgt_lang = globals().get("TGT_LANG", "eng")

    audio_input, _ = torchaudio.load(audio_path)
    # The model is cached so it is not re-read from disk on every request.
    s2t_model = _load_s2t_model()
    with torch.no_grad():
        text = s2t_model(audio_input, tgt_lang=tgt_lang)
    return text
@functools.lru_cache(maxsize=1)
def _load_s2st_model():
    """Load the speech-to-speech TorchScript model once and reuse it."""
    return torch.jit.load("unity_on_device.ptl")


def speech_to_speech_translation(audio_file, tgt_lang=None):
    """Translate an uploaded audio file into text and synthesized speech.

    Args:
        audio_file: Either a filesystem path (``str``) or a file-like
            object exposing the path as ``.name``; which one Gradio
            passes depends on how the ``gr.Audio`` input is configured.
        tgt_lang: Target language code forwarded to the model. When
            omitted, a module-level ``TGT_LANG`` is used if one exists,
            otherwise ``"eng"``.

    Returns:
        A ``(text, output_file)`` tuple, where ``output_file`` is the path
        of a WAV file holding the generated waveform.

    Raises:
        ValueError: If no audio file was provided.
    """
    if audio_file is None:
        raise ValueError("No audio file was provided.")

    audio_path = audio_file if isinstance(audio_file, str) else audio_file.name

    if tgt_lang is None:
        # TGT_LANG is not defined in this file; fall back instead of raising
        # NameError. NOTE(review): assumes "eng" is a language code the model
        # accepts - confirm against the model card.
        tgt_lang = globals().get("TGT_LANG", "eng")

    audio_input, _ = torchaudio.load(audio_path)
    # The model is cached so it is not re-read from disk on every request.
    s2st_model = _load_s2st_model()
    with torch.no_grad():
        text, _units, waveform = s2st_model(audio_input, tgt_lang=tgt_lang)

    # A unique file per call: a fixed path would let concurrent requests
    # overwrite each other's result. The file is left on disk (delete=False)
    # because the caller still has to read it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)
    return text, output_file
# Gradio front-ends: one Interface per handler function.
# NOTE(review): newer Gradio releases replace type="file" with
# type="filepath" (the handler then receives a str path) - confirm against
# the pinned Gradio version before changing it.
iface_s2t = gr.Interface(
    title="Speech to Text",
    fn=speech_to_text,
    inputs=gr.Audio(label="Upload Audio for Speech to Text", type="file"),
    outputs="text",
)

# Second front-end returns both the translated text and the audio file.
iface_s2st = gr.Interface(
    title="Speech to Speech Translation",
    fn=speech_to_speech_translation,
    inputs=gr.Audio(
        label="Upload Audio for Speech to Speech Translation",
        type="file",
    ),
    outputs=["text", "audio"],
)
# Combine both interfaces into a single app with one tab per interface.
# gr.Accordion is only a collapsible layout container: it does not take a
# list of Interfaces or a `labels=` keyword and is not a launchable app.
# gr.TabbedInterface is the component that groups Interfaces.
demo = gr.TabbedInterface(
    [iface_s2t, iface_s2st],
    tab_names=["Speech to Text", "Speech to Speech Translation"],
)

# Previous module-level name, kept so existing references keep working.
accordion = demo

# Launch the application
demo.launch()