Spaces:

DarliAI
/

DarliAI_ASR

Sleeping

App Files Community

FarmerlineML committed on Jul 11

Commit

0c33dd3

verified ·

1 Parent(s): 7f8e389

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -20

app.py CHANGED Viewed

@@ -2,28 +2,28 @@
 import gradio as gr
 from transformers import pipeline
-import soundfile as sf
 import numpy as np
 # --- EDIT THIS: map display names to your HF Hub model IDs ---
 language_models = {
-    "Akan (Asanti Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
-    "Ewe":  "FarmerlineML/w2v-bert-2.0_ewe_2",
-    "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
-    "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",
-    "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
-    "Fante Kissi":"misterkissi/w2v2-lg-xls-r-300m-fante",
-    "Runyankore Kissi":"misterkissi/w2v2-lg-xls-r-300m-runyankore",
     # add more as needed
 }
-# Pre-load pipelines for each language
 asr_pipelines = {
     lang: pipeline(
         task="automatic-speech-recognition",
         model=model_id,
-        # device=0,           # uncomment if you have GPU
-        chunk_length_s=30      # adjust if your audio can be longer
     )
     for lang, model_id in language_models.items()
 }
@@ -31,18 +31,16 @@ asr_pipelines = {
 def transcribe(audio_path: str, language: str) -> str:
     """
-    Load the audio file, convert to mono if needed,
-    and run it through the selected ASR pipeline.
     """
-    if audio_path is None:
         return "⚠️ Please upload or record an audio clip."
-    # Read the file
-    speech, sr = sf.read(audio_path)
-    # Stereo → mono
-    if speech.ndim > 1:
-        speech = np.mean(speech, axis=1)
     result = asr_pipelines[language]({
         "sampling_rate": sr,
         "raw": speech
@@ -54,7 +52,8 @@ with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
     gr.Markdown(
         """
         ## 🎙️ Multilingual Speech-to-Text
-        Upload an audio file or record via your microphone, then choose the language/model and hit **Transcribe**.
         """
     )

 import gradio as gr
 from transformers import pipeline
 import numpy as np
+import librosa  # pip install librosa
 # --- EDIT THIS: map display names to your HF Hub model IDs ---
 language_models = {
+    "Akan (Asanti Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
+    "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
+    "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
+    "Luganda":                  "FarmerlineML/w2v-bert-2.0_luganda",
+    "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
+    "Fante Kissi":              "misterkissi/w2v2-lg-xls-r-300m-fante",
+    "Runyankore Kissi":         "misterkissi/w2v2-lg-xls-r-300m-runyankore",
     # add more as needed
 }
+# Pre-load pipelines for each language on CPU (device=-1)
 asr_pipelines = {
     lang: pipeline(
         task="automatic-speech-recognition",
         model=model_id,
+        device=-1,            # force CPU usage
+        chunk_length_s=30
     )
     for lang, model_id in language_models.items()
 }
 def transcribe(audio_path: str, language: str) -> str:
     """
+    Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
+    convert to mono, then run it through the chosen ASR pipeline.
     """
+    if not audio_path:
         return "⚠️ Please upload or record an audio clip."
+    # librosa.load returns a 1D np.ndarray (mono) and the sample rate
+    speech, sr = librosa.load(audio_path, sr=None, mono=True)
+    # Call the Hugging Face ASR pipeline
     result = asr_pipelines[language]({
         "sampling_rate": sr,
         "raw": speech
     gr.Markdown(
         """
         ## 🎙️ Multilingual Speech-to-Text
+        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
+        Then choose the language/model and hit **Transcribe**.
         """
     )