# app.py
"""Multilingual speech-to-text demo.

Presents a Gradio UI where the user picks a language, uploads or records
audio, and gets a transcription from the matching Hugging Face ASR model.
Models are loaded lazily (and cached) on first use instead of all at import
time, so startup is fast and RAM holds only the models actually used.
"""

from functools import lru_cache

import gradio as gr
import numpy as np
import librosa  # pip install librosa
from transformers import pipeline

# --- EDIT THIS: map display names to your HF Hub model IDs ---
# NOTE(review): an earlier duplicate "Luganda" entry
# ("FarmerlineML/w2v-bert-2.0_luganda") was silently shadowed by the
# "FarmerlineML/luganda_fkd" entry below; the shadowed line was removed.
language_models = {
    "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
    "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
    "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
    "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
    "FANTE": "misterkissi/w2v2-lg-xls-r-300m-fante",
    "BEMBA": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
    "BAMBARA": "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
    "DAGAARE": "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
    "KINYARWANDA": "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
    "FULA": "DarliAI/kissi-wav2vec2-fula-fleurs-full",
    "OROMO": "DarliAI/kissi-w2v-bert-2.0-oromo",
    "RUNYANKORE": "misterkissi/w2v2-lg-xls-r-300m-runyankore",
    "GA": "misterkissi/w2v2-lg-xls-r-300m-ga",
    "VAI": "misterkissi/whisper-small-vai",
    "KASEM": "misterkissi/w2v2-lg-xls-r-300m-kasem",
    "LINGALA": "misterkissi/w2v2-lg-xls-r-300m-lingala",
    "FONGBE": "misterkissi/whisper-small-fongbe",
    "AMHARIC": "misterkissi/w2v2-lg-xls-r-1b-amharic",
    "XHOSA": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
    "TSONGA": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
    # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
    # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
    # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
    "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
    "Luganda": "FarmerlineML/luganda_fkd",
    "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
    "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
    "Pidgin": "FarmerlineML/pidgin_nigerian",
    "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
    "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
    # add more as needed
}


@lru_cache(maxsize=None)  # key space is bounded by language_models, so unbounded is safe
def _get_pipeline(model_id: str):
    """Build — on first use — and cache a CPU ASR pipeline for *model_id*.

    Lazy loading replaces the original eager pre-load of every model at
    import time, which was slow and memory-hungry and made a single bad
    model ID crash the whole app on startup.
    """
    return pipeline(
        task="automatic-speech-recognition",
        model=model_id,
        device=-1,  # force CPU usage
        chunk_length_s=30,
    )


def transcribe(audio_path: str, language: str) -> str:
    """
    Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
    convert to mono, then run it through the chosen ASR pipeline.

    Parameters
    ----------
    audio_path : str
        Filesystem path produced by the Gradio Audio component (may be
        empty/None when nothing was uploaded or recorded).
    language : str
        A display name; must be a key of ``language_models`` (the dropdown
        constrains the choices).

    Returns
    -------
    str
        The transcription text, or a warning when no audio was provided.
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip."

    # librosa.load returns a 1D np.ndarray (mono) and the sample rate;
    # sr=None keeps the file's native rate — the pipeline resamples as needed.
    speech, sr = librosa.load(audio_path, sr=None, mono=True)

    asr = _get_pipeline(language_models[language])
    result = asr({"sampling_rate": sr, "raw": speech})
    return result.get("text", "")


with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
    gr.Markdown(
        """
        ## 🎙️ Multilingual Speech-to-Text
        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
        Then choose the language/model and hit **Transcribe**.
        """
    )
    with gr.Row():
        lang = gr.Dropdown(
            choices=list(language_models.keys()),
            value=list(language_models.keys())[0],
            label="Select Language / Model"
        )
    with gr.Row():
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or Record Audio"
        )
    btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcription")
    btn.click(fn=transcribe, inputs=[audio, lang], outputs=output)

if __name__ == "__main__":
    demo.launch()