File size: 4,262 Bytes
c0e8dca
 
 
 
 
0c33dd3
c0e8dca
 
 
5928b3c
0c33dd3
 
 
 
71aa7fe
 
 
 
 
 
 
 
 
 
 
 
c9f09ce
7314930
54f3c62
1ea09d9
7f6cedb
61375c6
7f6cedb
414f828
64dc005
 
87a6add
0663960
6ded37f
87a6add
6ded37f
c0e8dca
 
 
0c33dd3
c0e8dca
 
 
 
0c33dd3
 
c0e8dca
 
 
 
 
 
 
0c33dd3
 
c0e8dca
0c33dd3
c0e8dca
 
0c33dd3
 
c0e8dca
0c33dd3
c0e8dca
 
 
 
 
 
 
 
 
 
 
0c33dd3
 
c0e8dca
 
 
 
 
 
 
 
 
 
 
 
1f8f25c
c0e8dca
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# app.py

import gradio as gr
from transformers import pipeline
import numpy as np
import librosa  # pip install librosa

# --- EDIT THIS: map display names to your HF Hub model IDs ---
# NOTE: keys must be unique.  A duplicated key is silently collapsed by the
# dict literal (the LAST value wins) — an earlier revision listed "Luganda"
# twice, leaving a dead entry; only the effective mapping is kept below.
language_models = {
    "Akan (Asante Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
    "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
    "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
    "Luganda":                  "FarmerlineML/luganda_fkd",
    "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
    "FANTE":                    "misterkissi/w2v2-lg-xls-r-300m-fante",
    "BEMBA":                    "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
    "BAMBARA":                  "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
    "DAGAARE":                  "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
    "KINYARWANDA":              "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
    "FULA":                     "DarliAI/kissi-wav2vec2-fula-fleurs-full",
    "OROMO":                    "DarliAI/kissi-w2v-bert-2.0-oromo",
    "RUNYANKORE":               "misterkissi/w2v2-lg-xls-r-300m-runyankore",
    "GA":                       "misterkissi/w2v2-lg-xls-r-300m-ga",
    "VAI":                      "misterkissi/whisper-small-vai",
    "KASEM":                    "misterkissi/w2v2-lg-xls-r-300m-kasem",
    "LINGALA":                  "misterkissi/w2v2-lg-xls-r-300m-lingala",
    "FONGBE":                   "misterkissi/whisper-small-fongbe",
    "AMHARIC":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
    "XHOSA":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
    "TSONGA":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
    # "WOLOF":                    "misterkissi/w2v2-lg-xls-r-1b-wolof",
    # "HAITIAN CREOLE":           "misterkissi/whisper-small-haitian-creole",
    # "KABYLE":                   "misterkissi/w2v2-lg-xls-r-1b-kabyle",
    "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
    "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
    "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
    "Pidgin":                   "FarmerlineML/pidgin_nigerian",
    "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
    "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
    # add more as needed
}

# Lazily construct one CPU ASR pipeline per language, on first use.
#
# The previous version eagerly instantiated every model at import time, which
# downloads and holds ~27 checkpoints in RAM and makes startup fail outright
# if a single Hub repo is unavailable.  A dict subclass with __missing__
# keeps the `asr_pipelines[language]` lookup interface identical while
# deferring each load until that language is actually requested.
class _LazyASRPipelines(dict):
    """dict that builds and caches an ASR pipeline on first lookup of a language."""

    def __missing__(self, lang: str):
        # For a truly unknown language the KeyError from language_models
        # propagates, matching the old eager dict's KeyError behavior.
        pipe = pipeline(
            task="automatic-speech-recognition",
            model=language_models[lang],
            device=-1,            # force CPU usage
            chunk_length_s=30,
        )
        self[lang] = pipe         # cache so each model is loaded only once
        return pipe


asr_pipelines = _LazyASRPipelines()


def transcribe(audio_path: str, language: str) -> str:
    """
    Transcribe an audio clip with the ASR model selected for *language*.

    Loads the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
    downmixed to mono at its native sample rate, then feeds the raw samples
    to the chosen Hugging Face ASR pipeline.

    Returns the transcription text, or a "⚠️ ..." message for user-level
    problems (no clip, unknown language, unreadable or empty audio) instead
    of raising, so the Gradio UI always displays something helpful.
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip."

    # Guard against a stale/unknown dropdown value instead of a KeyError.
    if language not in language_models:
        return f"⚠️ Unknown language selection: {language}"

    try:
        # librosa.load returns a 1D np.ndarray (mono) and the sample rate
        speech, sr = librosa.load(audio_path, sr=None, mono=True)
    except Exception as exc:  # corrupt file, unsupported codec, ...
        return f"⚠️ Could not read the audio file: {exc}"

    if speech.size == 0:
        return "⚠️ The audio clip appears to be empty."

    # The HF ASR pipeline accepts raw samples plus their sampling rate.
    result = asr_pipelines[language]({
        "sampling_rate": sr,
        "raw": speech
    })
    return result.get("text", "")


# Build the Gradio UI.  User-facing strings previously contained mojibake
# (UTF-8 emoji/ellipsis bytes mis-decoded as Latin-1, e.g. "🌐", "…");
# they are restored to the intended characters here.
with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
    gr.Markdown(
        """
        ## 🎙️ Multilingual Speech-to-Text
        Upload an audio file (MP3, WAV, FLAC, M4A, OGG, …) or record via your microphone.
        Then choose the language/model and hit **Transcribe**.
        """
    )

    # Language/model selector, defaulting to the first configured language.
    with gr.Row():
        lang = gr.Dropdown(
            choices=list(language_models.keys()),
            value=list(language_models.keys())[0],
            label="Select Language / Model"
        )

    # Audio input: file upload or microphone capture, delivered as a file path.
    with gr.Row():
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or Record Audio"
        )

    btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcription")

    # Wire the button: (audio file path, language name) -> transcription text.
    btn.click(fn=transcribe, inputs=[audio, lang], outputs=output)

if __name__ == "__main__":
    demo.launch()