Spaces:

badrex
/

ASRwanda

Running on Zero

File size: 2,639 Bytes

fda6e40
 
 
 
 
e2dd32d
fda6e40
 
 
 
 
 
4c6f1dd
fda6e40
 
e2dd32d
fda6e40
 
 
e2dd32d
fda6e40
 
 
e2dd32d
 
c6bd35b
 
e2dd32d
5b7288e
 
e2dd32d
c6bd35b
fda6e40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04ac83f
fda6e40
 
 
 
 
 
 
 
b3653e3
 
fda6e40

import gradio as gr
from transformers import pipeline
import numpy as np
import os
from huggingface_hub import login
import librosa
import spaces

# Checkpoint on the Hugging Face Hub that backs this demo.
MODEL_ID = "badrex/w2v-bert-2.0-kinyarwanda-asr"

# Log in to the Hub only when a token is configured in the environment;
# without one the model is fetched anonymously.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Built once at import time so every request reuses the same pipeline object.
transcriber = pipeline(task="automatic-speech-recognition", model=MODEL_ID)


@spaces.GPU
def transcribe(audio):
    """Transcribe one Gradio audio input to text.

    Args:
        audio: Value delivered by the ``gr.Audio`` input: a
            ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy
            array, either one-dimensional (mono) or two-dimensional with
            the channels along axis 1. ``None`` when the form is submitted
            without any audio.

    Returns:
        The text produced by the ASR pipeline, or an empty string when
        there is no audio to transcribe.
    """
    # Submitting without a recording/upload hands us None; unpacking it
    # would raise a TypeError instead of giving a clean (empty) result.
    if audio is None:
        return ""

    sr, y = audio

    # np.max() raises ValueError on an empty array, and there is nothing
    # to recognise in a zero-length clip anyway.
    if y.size == 0:
        return ""

    # Down-mix multi-channel audio by averaging across the channel axis.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # astype() returns a copy, so the in-place scaling below never mutates
    # the array owned by the caller.
    y = y.astype(np.float32)

    # Peak-normalise to [-1, 1]. A fully silent clip has a peak of 0;
    # dividing by it would turn the whole signal into NaN, so leave it as-is.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # No manual resampling here: the original sampling rate is passed along
    # with the samples (per the transformers docs, the ASR pipeline resamples
    # dict inputs to the model's rate itself).
    return transcriber({"sampling_rate": sr, "raw": y})["text"]

def find_example_files(directory, extensions=(".wav", ".mp3", ".ogg")):
    """Return the audio files in *directory* as Gradio example rows.

    Args:
        directory: Path of the directory to scan (not recursive).
        extensions: Tuple of lower-case file suffixes to accept; matching
            is case-insensitive so ``clip.WAV`` is picked up as well.

    Returns:
        A list of single-element lists, one per matching file, each holding
        the file path joined onto *directory*. Rows are sorted by file name
        because ``os.listdir`` returns entries in arbitrary order, which
        would otherwise shuffle the examples between deployments.
    """
    return [
        [os.path.join(directory, filename)]
        for filename in sorted(os.listdir(directory))
        if filename.lower().endswith(extensions)
    ]


examples_dir = "examples"
# isdir() rather than exists(): a regular file that happens to be named
# "examples" must not be passed to os.listdir(), which would raise.
if os.path.isdir(examples_dir):
    examples = find_example_files(examples_dir)
    print(f"Found {len(examples)} example files")
else:
    examples = []
    print("Examples directory not found")

    
# Static HTML for the page header; kept out of the constructor call so the
# Interface wiring below stays readable.
_TITLE_HTML = "<div>ASRwanda 🎙️ <br>Speech Recognition for Kinyarwanda</div>"
_DESCRIPTION_HTML = """
        <div class="centered-content">
            <div>
                <p>
                Developed with ❤ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a> ☕
                </p>
                <br>
                <p style="font-size: 15px; line-height: 1.8;">
                 Muraho 👋🏼
                <br>
                <br>
                 This is a demo for ASRwanda, a Transformer-based automatic speech recognition (ASR) system for Kinyarwanda language.
                 The underlying ASR model was trained on 500 hours of transcribed speech provided by 
                 <a href="https://digitalumuganda.com/" style="color: #2563eb;">Digital Umuganda</a> as part of the Kinyarwanda
                 <a href="https://www.kaggle.com/competitions/kinyarwanda-automatic-speech-recognition-track-a" style="color: #2563eb;"> ASR hackathon</a> on Kaggle.
                <br>                   
                <p style="font-size: 15px; line-height: 1.8;">
                Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
                </p>
            </div>
        </div>
        """

# Single-function UI: one audio input wired to `transcribe`, plain-text output.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(),
    outputs="text",
    title=_TITLE_HTML,
    description=_DESCRIPTION_HTML,
    # An empty example list is passed as None rather than [].
    examples=examples or None,
    cache_examples=False,
    # NOTE(review): None leaves flagging at Gradio's default; if the intent
    # was to hide the Flag button, "never" is the documented value — confirm.
    flagging_mode=None,
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()