import gradio as gr
from transformers import pipeline
import numpy as np
import os
from huggingface_hub import login
import librosa
import spaces

# authenticate with the Hugging Face Hub if a token is provided
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Kinyarwanda ASR checkpoint used by the demo
MODEL_ID = "badrex/w2v-bert-2.0-kinyarwanda-asr"
transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)


@spaces.GPU
def transcribe(audio):
    sr, y = audio

    # convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    # resample to 16 kHz if needed
    # if sr != 16000:
    #     y = librosa.resample(y, orig_sr=sr, target_sr=16000)

    # normalize to [-1, 1]; skip the division if the clip is silent
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]


# collect bundled audio clips to show as examples in the UI
examples = []
examples_dir = "examples"
if os.path.exists(examples_dir):
    for filename in os.listdir(examples_dir):
        if filename.endswith((".wav", ".mp3", ".ogg")):
            examples.append([os.path.join(examples_dir, filename)])
    print(f"Found {len(examples)} example files")
else:
    print("Examples directory not found")

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(),
    outputs="text",
    title="
Developed with ❤️ by Badr al-Absi

Muraho 👋🏼

This is a demo for ASRwanda, a Transformer-based automatic speech recognition (ASR) system for the Kinyarwanda language.

The underlying ASR model was trained on 500 hours of transcribed speech provided by Digital Umuganda as part of the Kinyarwanda ASR hackathon on Kaggle.

Simply upload an audio file 🎤 or record yourself speaking 🎙️☺️ to try out the model!
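
You can also run the same checkpoint outside this demo. A minimal sketch using the `transformers` pipeline (the file name `my_clip.wav` is a placeholder for your own recording):

```python
from transformers import pipeline

# load the same model this demo wraps
asr = pipeline("automatic-speech-recognition", model="badrex/w2v-bert-2.0-kinyarwanda-asr")

# transcribe a local audio file (placeholder path)
print(asr("my_clip.wav")["text"])
```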