import os
import tempfile

import numpy as np
import streamlit as st
import torch
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq


# Checkpoints offered in the model picker below.
available_models = ["Yehor/whisper-small-ukrainian"]

# Page header and a short usage hint.
st.title("Voice Recognition App using SpeechSeq2Seq")
st.write("Upload an audio file and choose a model to transcribe it to text.")

# Let the user choose which checkpoint to run.
model_choice = st.selectbox(
    label="Choose a SpeechSeq2Seq model",
    options=available_models,
)


@st.cache_resource
def load_model_and_processor(model_name):
    """Load the speech seq2seq model and its processor for ``model_name``.

    The function is wrapped in ``st.cache_resource`` so that repeated calls
    with the same name can reuse the already-loaded objects.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    speech_processor = AutoProcessor.from_pretrained(model_name)
    speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
    return speech_model, speech_processor


# Obtain the model only through the cached loader. Calling ``from_pretrained``
# directly at module level would reload the weights each time the script is
# executed and would ignore the checkpoint selected in ``model_choice``.
st.write(f"Loading {model_choice} model...")
model, processor = load_model_and_processor(model_choice)
st.write(f"{model_choice} model loaded successfully.")


# Sampling rate (Hz) the audio is resampled to and that is reported to the
# processor.
TARGET_SAMPLE_RATE = 16000


def _decode_to_waveform(raw_bytes, suffix=""):
    """Decode encoded audio bytes into a mono float32 waveform.

    Args:
        raw_bytes: Contents of the uploaded audio file.
        suffix: File extension (including the dot) given to the temporary
            file so the decoder receives a format hint.

    Returns:
        A 1-D ``numpy.float32`` array sampled at ``TARGET_SAMPLE_RATE`` with
        values scaled into the range [-1.0, 1.0).
    """
    # Decode from a real file path. ``delete=False`` lets the file be reopened
    # by name after it is closed; it is always removed in the ``finally``.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(raw_bytes)
        tmp_path = tmp.name
    try:
        segment = AudioSegment.from_file(tmp_path)
    finally:
        os.remove(tmp_path)

    # Convert in memory (no intermediate WAV file). Forcing 16-bit samples
    # makes the integer full-scale value known, so the division below maps the
    # PCM data onto a float waveform.
    segment = (
        segment.set_frame_rate(TARGET_SAMPLE_RATE)
        .set_channels(1)
        .set_sample_width(2)
    )
    pcm = np.array(segment.get_array_of_samples(), dtype=np.float32)
    return pcm / 32768.0


uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])

if uploaded_file is not None:
    # ``getvalue`` returns the whole upload regardless of the current read
    # position, so the same object can still be handed to ``st.audio``.
    extension = os.path.splitext(uploaded_file.name)[1]
    waveform = _decode_to_waveform(uploaded_file.getvalue(), extension)

    # Use the upload's own MIME type for playback instead of assuming WAV.
    st.audio(uploaded_file, format=uploaded_file.type or "audio/wav")

    st.write("Transcribing audio...")

    if waveform.size == 0:
        st.error("The uploaded file contains no audio samples.")
    else:
        # Whisper processors expose the model input as ``input_features``.
        # NOTE(review): Whisper works on fixed-length windows; longer
        # recordings are presumably truncated by the processor - confirm and
        # add chunking if full-length transcription is required.
        input_features = processor(
            waveform,
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features

        with torch.no_grad():
            predicted_ids = model.generate(input_features)

        # Drop control tokens so only the recognised text is shown.
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]

        st.write("Transcription:")
        st.write(transcription)