saeedzou committed on
Commit
9de87bc
·
1 Parent(s): fe1d6ad

add HF_TOKEN

Browse files
Files changed (2) hide show
  1. app.py +45 -19
  2. requirements.txt +1 -2
app.py CHANGED
@@ -1,27 +1,53 @@
1
  import gradio as gr
2
  import nemo.collections.asr as nemo_asr
 
 
 
3
 
4
- # Load your private model (assuming you already have access credentials or it is publicly available)
5
- asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30")
 
 
6
 
7
- # Define a function that takes an audio file, transcribes it, and returns the text
8
- def transcribe_audio(audio_file):
9
- # Convert the audio file to the correct format (16k mono)
10
- audio_path = audio_file.name # Temporary location of the uploaded audio file
11
- # Assuming the model expects 16k mono audio, no need for conversion if file is correct format.
12
- output = asr_model.transcribe([audio_path])
13
- return output[0].text # Return transcribed text
14
 
15
- # Create the Gradio interface
16
- iface = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  fn=transcribe_audio,
18
- inputs=gr.inputs.Audio(source="upload", type="file", label="Upload an Audio File"),
19
- outputs="text",
20
- live=True,
21
- title="Speech-to-Text with Private ASR Model",
22
- description="Upload a 16kHz mono audio file for transcription."
23
  )
24
 
25
- # Launch the Gradio app
26
- if __name__ == "__main__":
27
- iface.launch()
 
1
# Third-party imports, grouped per PEP 8 (all are pinned in requirements.txt).
import os

import gradio as gr
import nemo.collections.asr as nemo_asr
from huggingface_hub import login
from pydub import AudioSegment

# Fetch the token from an environment variable (set as a Space secret);
# failing fast here gives a clearer error than a 401 from the Hub later.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")

# Authenticate with Hugging Face so the private model below can be fetched.
login(HF_TOKEN)

# Load the private NeMo ASR model once at startup; chained `from e` keeps the
# original traceback (download failure vs. auth failure) visible in the logs.
try:
    asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
        model_name="faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30"
    )
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}") from e
22
+
23
# Function to convert audio to 16kHz mono WAV
def convert_to_wav(audio_path, output_path=None):
    """Convert an audio file to 16 kHz mono WAV and return the written path.

    Args:
        audio_path: Path to the input audio file (any format ffmpeg/pydub reads).
        output_path: Optional explicit destination. When None (default), a
            unique temporary file is created — the previous fixed "temp.wav"
            name was a race condition when two Gradio requests ran at once.

    Returns:
        The path of the converted WAV file (caller is responsible for deleting it).
    """
    import tempfile  # local import keeps the module's top-level imports unchanged

    if output_path is None:
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # pydub's export reopens the path itself
    audio = AudioSegment.from_file(audio_path)
    # NOTE(review): 16 kHz mono matches what the FastConformer model presumably
    # expects — confirm against the model card.
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export(output_path, format="wav")
    return output_path
29
+
30
# Transcription function
def transcribe_audio(audio):
    """Transcribe an uploaded audio file with the loaded NeMo ASR model.

    Args:
        audio: Filepath supplied by the ``gr.Audio(type="filepath")`` input,
            or None when the user submitted without uploading a file.

    Returns:
        The transcribed text, or a user-facing prompt when no file was given.
    """
    if audio is None:
        return "Please upload an audio file."

    wav_path = convert_to_wav(audio)
    try:
        output = asr_model.transcribe([wav_path])
    finally:
        # Delete the temp WAV even when transcription raises, so failed
        # requests do not leak files (the original only cleaned up on success).
        if os.path.exists(wav_path):
            os.remove(wav_path)

    # NeMo's transcribe returns a list of Hypothesis objects; take the text of
    # the single file we passed in.
    return output[0].text
42
+
43
# Create Gradio interface wiring the transcription function to the UI.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
    outputs=gr.Textbox(label="Transcription"),
    title="ASR Transcription with NeMo",
    description="Upload an audio file to transcribe it using a private NeMo ASR model."
)

# Launch the app. Guarded so importing this module (e.g. from tests or another
# runner) does not start a server; `python app.py` — how Spaces executes the
# file — still launches it, matching the previous revision's behavior.
if __name__ == "__main__":
    interface.launch()
 
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  gradio
2
  pydub
3
- librosa
4
- git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]
5
  huggingface_hub
 
1
  gradio
2
  pydub
3
+ git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
 
4
  huggingface_hub