"""Gradio demo that transcribes Swahili speech with a Wav2Vec2 CTC model.

Importing this module downloads/loads the checkpoint named by ``model_name``
from the Hugging Face Hub and builds the Gradio ``interface``; the web app is
only launched when the file is executed as a script.
"""

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load your model from Hugging Face
model_name = "Futuresony/Future-sw_ASR-24-02-2025"

# Sampling rate (Hz) the audio is converted to before it reaches the processor.
TARGET_SAMPLE_RATE = 16000

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def transcribe(audio_file):
    """Transcribe one audio file to text.

    Args:
        audio_file: Path to an audio file readable by ``torchaudio.load``.
            Gradio passes ``None`` when the user submits without audio.

    Returns:
        The decoded transcription string for the recording.

    Raises:
        gr.Error: If no file was provided or the file contains no samples,
            so the UI shows a readable message instead of a stack trace.
    """
    if not audio_file:
        raise gr.Error("Please upload or record an audio file first.")

    # torchaudio.load yields a (channels, frames) tensor plus the file's rate.
    speech_array, sample_rate = torchaudio.load(audio_file)

    if speech_array.numel() == 0:
        raise gr.Error("The audio file contains no samples.")

    # Downmix multi-channel audio to mono. Without this, a stereo file stays
    # 2-D and the processor interprets it as a batch of separate utterances,
    # so only the first channel would end up being transcribed.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)

    # Resample to 16kHz (skipped when the file is already at the target rate).
    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        speech_array = resampler(speech_array)

    # Flatten to a 1-D waveform; reshape(-1) stays 1-D even for a
    # single-sample clip, where a bare squeeze() would give a 0-d array.
    waveform = speech_array.reshape(-1).numpy()

    # Process and transcribe
    input_values = processor(
        waveform, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).input_values

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: pick the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the text
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription


# Create Gradio interface
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Swahili ASR Transcription",
    description="Upload a Swahili audio file, and the model will transcribe the speech.",
)

# Launch the app
if __name__ == "__main__":
    interface.launch()