saeedzou committed on
Commit
9de87bc
·
1 Parent(s): fe1d6ad

add HF_TOKEN

Browse files
Files changed (2) hide show
  1. app.py +45 -19
  2. requirements.txt +1 -2
app.py CHANGED
@@ -1,27 +1,53 @@
1
  import gradio as gr
2
  import nemo.collections.asr as nemo_asr
 
 
 
3
 
4
- # Load your private model (assuming you already have access credentials or it is publicly available)
5
- asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30")
 
 
6
 
7
- # Define a function that takes an audio file, transcribes it, and returns the text
8
- def transcribe_audio(audio_file):
9
- # Convert the audio file to the correct format (16k mono)
10
- audio_path = audio_file.name # Temporary location of the uploaded audio file
11
- # Assuming the model expects 16k mono audio, no need for conversion if file is correct format.
12
- output = asr_model.transcribe([audio_path])
13
- return output[0].text # Return transcribed text
14
 
15
- # Create the Gradio interface
16
- iface = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  fn=transcribe_audio,
18
- inputs=gr.inputs.Audio(source="upload", type="file", label="Upload an Audio File"),
19
- outputs="text",
20
- live=True,
21
- title="Speech-to-Text with Private ASR Model",
22
- description="Upload a 16kHz mono audio file for transcription."
23
  )
24
 
25
- # Launch the Gradio app
26
- if __name__ == "__main__":
27
- iface.launch()
 
1
# Third-party imports, grouped per PEP 8 (all are pinned in requirements.txt).
import os

import gradio as gr
import nemo.collections.asr as nemo_asr
from huggingface_hub import login
from pydub import AudioSegment

# Fetch the token from an environment variable (set as a Space secret);
# failing fast here gives a clearer error than a 401 from the Hub later.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")

# Authenticate with Hugging Face so the private model below can be fetched.
login(HF_TOKEN)

# Load the private NeMo ASR model once at startup; chained `from e` keeps the
# original traceback (download failure vs. auth failure) visible in the logs.
try:
    asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
        model_name="faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30"
    )
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}") from e
22
+
23
# Function to convert audio to 16kHz mono WAV
def convert_to_wav(audio_path, output_path=None):
    """Convert an audio file to 16 kHz mono WAV and return the written path.

    Args:
        audio_path: Path to the input audio file (any format ffmpeg/pydub reads).
        output_path: Optional explicit destination. When None (default), a
            unique temporary file is created — the previous fixed "temp.wav"
            name was a race condition when two Gradio requests ran at once.

    Returns:
        The path of the converted WAV file (caller is responsible for deleting it).
    """
    import tempfile  # local import keeps the module's top-level imports unchanged

    if output_path is None:
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # pydub's export reopens the path itself
    audio = AudioSegment.from_file(audio_path)
    # NOTE(review): 16 kHz mono matches what the FastConformer model presumably
    # expects — confirm against the model card.
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export(output_path, format="wav")
    return output_path
29
+
30
# Transcription function
def transcribe_audio(audio):
    """Transcribe an uploaded audio file with the loaded NeMo ASR model.

    Args:
        audio: Filepath supplied by the ``gr.Audio(type="filepath")`` input,
            or None when the user submitted without uploading a file.

    Returns:
        The transcribed text, or a user-facing prompt when no file was given.
    """
    if audio is None:
        return "Please upload an audio file."

    wav_path = convert_to_wav(audio)
    try:
        output = asr_model.transcribe([wav_path])
    finally:
        # Delete the temp WAV even when transcription raises, so failed
        # requests do not leak files (the original only cleaned up on success).
        if os.path.exists(wav_path):
            os.remove(wav_path)

    # NeMo's transcribe returns a list of Hypothesis objects; take the text of
    # the single file we passed in.
    return output[0].text
42
+
43
# Create Gradio interface wiring the transcription function to the UI.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
    outputs=gr.Textbox(label="Transcription"),
    title="ASR Transcription with NeMo",
    description="Upload an audio file to transcribe it using a private NeMo ASR model."
)

# Launch the app. Guarded so importing this module (e.g. from tests or another
# runner) does not start a server; `python app.py` — how Spaces executes the
# file — still launches it, matching the previous revision's behavior.
if __name__ == "__main__":
    interface.launch()
 
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  gradio
2
  pydub
3
- librosa
4
- git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]
5
  huggingface_hub
 
1
  gradio
2
  pydub
3
+ git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
 
4
  huggingface_hub