File size: 2,639 Bytes
fda6e40
 
 
 
 
e2dd32d
fda6e40
 
 
 
 
 
4c6f1dd
fda6e40
 
e2dd32d
fda6e40
 
 
e2dd32d
fda6e40
 
 
e2dd32d
 
c6bd35b
 
e2dd32d
5b7288e
 
e2dd32d
c6bd35b
fda6e40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04ac83f
fda6e40
 
 
 
 
 
 
 
b3653e3
 
fda6e40
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import gradio as gr
from transformers import pipeline
import numpy as np
import os
from huggingface_hub import login
import librosa
import spaces

# Log in to the Hugging Face Hub only when a token is supplied through the
# environment; without one the login step is skipped entirely.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Checkpoint served by this demo. The pipeline is built once at import time
# and reused by every call to the transcription handler.
MODEL_ID = "badrex/w2v-bert-2.0-kinyarwanda-asr"
transcriber = pipeline(task="automatic-speech-recognition", model=MODEL_ID)


@spaces.GPU
def transcribe(audio):
    """Transcribe one audio clip with the module-level ASR pipeline.

    Args:
        audio: A ``(sampling_rate, samples)`` pair, or ``None`` when the
            request carries no audio. ``samples`` is treated as a NumPy
            array; when it has more than one dimension, axis 1 is averaged
            away to obtain a mono signal.

    Returns:
        The ``"text"`` entry of the pipeline result, or an empty string when
        there is nothing to transcribe (no audio, or a zero-length clip).
    """
    # Submitting the form without recording/uploading yields no audio at all;
    # unpacking it would raise TypeError.
    if audio is None:
        return ""

    sr, y = audio
    y = np.asarray(y)

    # A zero-length clip has no peak to normalise by (np.max would raise).
    if y.size == 0:
        return ""

    # convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    # No manual resampling here: the clip's own sampling rate is handed to
    # the pipeline together with the samples.
    # NOTE(review): the transformers ASR pipeline documents that it resamples
    # dict inputs itself -- confirm this holds for the deployed version.
    y = y.astype(np.float32)

    # Peak-normalise to [-1, 1]. A completely silent clip has a peak of 0;
    # dividing by it would turn every sample into NaN, so it is left as-is.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]

# Collect the bundled sample clips offered alongside the interface.
# Each entry is a one-element list holding the path of a single audio file.
examples = []
examples_dir = "examples"
# isdir rather than exists: a stray regular file named "examples" is then
# reported as a missing directory instead of crashing os.listdir.
if os.path.isdir(examples_dir):
    # os.listdir yields entries in arbitrary order; sorting keeps the example
    # list identical from one start-up to the next.
    examples = [
        [os.path.join(examples_dir, filename)]
        for filename in sorted(os.listdir(examples_dir))
        if filename.endswith((".wav", ".mp3", ".ogg"))
    ]
    print(f"Found {len(examples)} example files")
else:
    print("Examples directory not found")

    
# HTML snippet used as the page heading.
_PAGE_TITLE = "<div>ASRwanda 🎙️ <br>Speech Recognition for Kinyarwanda</div>"

# HTML snippet used as the introductory blurb under the heading.
_PAGE_DESCRIPTION = """
        <div class="centered-content">
            <div>
                <p>
                Developed with ❤ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a> ☕
                </p>
                <br>
                <p style="font-size: 15px; line-height: 1.8;">
                 Muraho 👋🏼
                <br>
                <br>
                 This is a demo for ASRwanda, a Transformer-based automatic speech recognition (ASR) system for Kinyarwanda language.
                 The underlying ASR model was trained on 500 hours of transcribed speech provided by 
                 <a href="https://digitalumuganda.com/" style="color: #2563eb;">Digital Umuganda</a> as part of the Kinyarwanda
                 <a href="https://www.kaggle.com/competitions/kinyarwanda-automatic-speech-recognition-track-a" style="color: #2563eb;"> ASR hackathon</a> on Kaggle.
                <br>                   
                <p style="font-size: 15px; line-height: 1.8;">
                Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
                </p>
            </div>
        </div>
        """

# Single-input, single-output interface: an audio component feeding
# `transcribe`, whose return value is shown as text.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(),
    outputs="text",
    title=_PAGE_TITLE,
    description=_PAGE_DESCRIPTION,
    # An empty example list is passed as None instead.
    examples=examples or None,
    cache_examples=False,
    # NOTE(review): None presumably defers to Gradio's default flagging
    # behaviour; if the intent was to hide the Flag button, check whether
    # "never" is the value wanted here -- confirm against the Gradio docs.
    flagging_mode=None,
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()