# Futuresony's picture
# Rename app.py(good) to app.py
# be4915e verified
import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Hugging Face Hub repository id of the checkpoint used by this app.
model_name = "Futuresony/Future-sw_ASR-24-02-2025"

# Load the processor first and the CTC model second, once, at import time,
# so every transcription request reuses the same objects.
processor, model = (
    Wav2Vec2Processor.from_pretrained(model_name),
    Wav2Vec2ForCTC.from_pretrained(model_name),
)
# Function to transcribe audio
def transcribe(audio_file):
    """Transcribe an audio recording to text with the module-level model.

    Args:
        audio_file: Path to an audio file readable by ``torchaudio.load``.
            ``None`` or an empty path (for example, the form was submitted
            without a recording) is accepted.

    Returns:
        The decoded transcription string, or an empty string when there is
        no audio to transcribe.
    """
    target_rate = 16000  # sampling rate declared to the processor below

    # Nothing was provided: return an empty result instead of handing a
    # missing path to the audio loader.
    if not audio_file:
        return ""

    waveform, sample_rate = torchaudio.load(audio_file)

    # A file with no samples has nothing to decode.
    if waveform.numel() == 0:
        return ""

    # Mix multi-channel audio down to a single channel. Without this, a 2-D
    # array reaches the processor and only the first decoded entry is
    # returned, so the remaining channels are silently ignored.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the file is not already at the target rate.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)

    # reshape(-1) always yields a 1-D array; squeeze() could collapse a
    # one-sample clip to a 0-d array.
    speech_array = waveform.reshape(-1).numpy()

    # Process and transcribe
    input_values = processor(
        speech_array, sampling_rate=target_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the text; the batch holds exactly one utterance.
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
# Build the Gradio UI: one audio input (configured with type="filepath")
# wired to `transcribe`, with a plain-text output.
interface = gr.Interface(
    transcribe,
    gr.Audio(type="filepath"),
    "text",
    title="Swahili ASR Transcription",
    description="Upload a Swahili audio file, and the model will transcribe the speech.",
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()