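"""Gradio Space: live Swahili speech-to-text using a fine-tuned Wav2Vec2 CTC model."""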
import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load your model from Hugging Face
model_name = "Futuresony/Future-sw_ASR-24-02-2025"
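# Wav2Vec2Processor bundles the feature extractor and CTC tokenizer;
# Wav2Vec2ForCTC maps audio frames to per-character logits.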
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
# Function to process live audio stream
def transcribe_live(microphone_audio):
    # Gradio may pass None before any audio has been recorded
    if microphone_audio is None:
        return ""

    speech_array, sample_rate = torchaudio.load(microphone_audio)

    # Downmix multi-channel audio to mono and resample to the 16 kHz rate the model expects
    speech_array = speech_array.mean(dim=0, keepdim=True)
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    speech_array = resampler(speech_array).squeeze().numpy()

    # Extract input features and run a forward pass (no gradients needed for inference)
    input_values = processor(speech_array, sampling_rate=16000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Greedy CTC decoding: pick the most likely token at each time step, then collapse repeats
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
# Create Gradio interface with live microphone input
interface = gr.Interface(
    fn=transcribe_live,
    # `source=` matches the Gradio 3.x API; Gradio 4+ expects sources=["microphone"]
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    live=True,  # Enables real-time updates
    title="Live Swahili ASR Transcription",
    description="Speak into your microphone, and the model will transcribe in real time.",
)
# Launch the app
if __name__ == "__main__":
    interface.launch()