"""Gradio demo app for ASRwanda, a Kinyarwanda speech recognition model."""

import os

import gradio as gr
from transformers import pipeline
import numpy as np
from huggingface_hub import login
import librosa  # noqa: F401  (kept for the optional resampling path noted in transcribe)
import spaces

HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    # Only authenticate with the Hugging Face Hub when a token is configured.
    login(token=HF_TOKEN)

MODEL_ID = "badrex/w2v-bert-2.0-kinyarwanda-asr"

# File suffixes that are offered as clickable examples in the UI.
AUDIO_EXTENSIONS = (".wav", ".mp3", ".ogg")

# The title spans several lines, so it has to be a triple-quoted literal:
# a plain "..." string cannot contain raw newlines.
TITLE = """
ASRwanda 🎙️
Speech Recognition for Kinyarwanda
"""

DESCRIPTION = """

Developed with ❤ by Badr al-Absi ☕


Muraho 👋🏼

This is a demo for ASRwanda, a Transformer-based automatic speech recognition (ASR) system for Kinyarwanda language. The underlying ASR model was trained on 500 hours of transcribed speech provided by Digital Umuganda as part of the Kinyarwanda ASR hackathon on Kaggle.

Simply upload an audio file 📤 or record yourself speaking 🎙️⏺️ to try out the model!

"""

# Built once at import time and shared by every call to transcribe().
transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)


@spaces.GPU
def transcribe(audio):
    """Transcribe one recording with the ASR pipeline.

    Args:
        audio: ``(sample_rate, samples)`` pair as produced by ``gr.Audio()``.
            ``samples`` is a NumPy array, either 1-D (mono) or 2-D with the
            channels on axis 1. ``None`` when the form was submitted without
            any upload or recording.

    Returns:
        The text recognised by the pipeline.

    Raises:
        gr.Error: If no audio, or a zero-length recording, was submitted.
    """
    if audio is None:
        raise gr.Error("Please upload or record some audio first.")

    sr, y = audio
    y = np.asarray(y)
    if y.size == 0:
        # np.max() below would raise ValueError on a zero-length array.
        raise gr.Error("The submitted audio is empty.")

    # Convert to mono if stereo.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # NOTE(review): no explicit resampling is done here; the original sampling
    # rate is handed to the pipeline together with the samples. Confirm the
    # pipeline resamples to the model's rate, or resample up front with
    # librosa.resample(y, orig_sr=sr, target_sr=16000).

    # astype() returns a copy, so the in-place scaling below never touches
    # the array owned by the caller.
    y = y.astype(np.float32)

    # Peak-normalise. A silent (all-zero) clip is left untouched, because
    # dividing by a zero peak would turn every sample into NaN.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]


# Collect example audio files shipped next to the app, if any.
examples = []
examples_dir = "examples"
if os.path.isdir(examples_dir):
    # Sorted so the examples appear in the same order on every start-up;
    # os.listdir() makes no ordering guarantee.
    examples = [
        [os.path.join(examples_dir, filename)]
        for filename in sorted(os.listdir(examples_dir))
        if filename.endswith(AUDIO_EXTENSIONS)
    ]
    print(f"Found {len(examples)} example files")
else:
    print("Examples directory not found")

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(),
    outputs="text",
    title=TITLE,
    description=DESCRIPTION,
    # An empty list is passed as None so no examples section is requested.
    examples=examples or None,
    cache_examples=False,
    flagging_mode=None,
)

if __name__ == "__main__":
    demo.launch()