camparchimedes committed on
Commit
f0c35fe
·
verified ·
1 Parent(s): cd7ab60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -13
app.py CHANGED
@@ -16,7 +16,9 @@
16
  import spaces
17
  import gradio as gr
18
  from PIL import Image
19
- from pydub import AudioSegment
 
 
20
  import os
21
  import re
22
  import time
@@ -32,7 +34,7 @@ from gpuinfo import GPUInfo
32
  #import csv
33
  import numpy as np
34
  import torch
35
- import torchaudio
36
  import torchaudio.transforms as transforms
37
 
38
  from transformers import pipeline, AutoModel
@@ -67,22 +69,19 @@ def convert_to_wav(filepath):
67
  pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, generate_kwargs={'task': 'transcribe', 'language': 'no'})
68
 
69
  @spaces.GPU()
70
- def transcribe_audio(audio_file, batch_size=16, sample_rate =16000):
71
- if isinstance(audio_file, tuple):
72
- audio_file = audio_file[0] # assumes first element contains the file path;
73
 
74
- waveform, sample_rate = torchaudio.load(audio_file) # to avoid TypeError here
75
-
76
- if waveform.ndim > 1:
77
- waveform = waveform[0, :]
78
-
79
- waveform = waveform.numpy()
80
 
81
  start_time = time.time()
82
 
83
  # --pipe it
84
  with torch.no_grad():
85
- outputs = pipe(waveform, sampling_rate=sample_rate, batch_size=batch_size, return_timestamps=False)
86
 
87
  end_time = time.time()
88
 
@@ -110,7 +109,6 @@ def transcribe_audio(audio_file, batch_size=16, sample_rate =16000):
110
  return text.strip(), system_info
111
 
112
 
113
-
114
  # ------------summary section------------
115
 
116
 
 
16
  import spaces
17
  import gradio as gr
18
  from PIL import Image
19
+ #from pydub import AudioSegment
20
+ from scipy.io import wavfile
21
+
22
  import os
23
  import re
24
  import time
 
34
  #import csv
35
  import numpy as np
36
  import torch
37
+ #import torchaudio
38
  import torchaudio.transforms as transforms
39
 
40
  from transformers import pipeline, AutoModel
 
69
  pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, generate_kwargs={'task': 'transcribe', 'language': 'no'})
70
 
71
  @spaces.GPU()
72
+ def transcribe_audio(audio_file, batch_size=16): # sample_rate=16000
73
+ sample_rate, samples = wavfile.read(audio_file)
74
+ waveform, sample_rate = torchaudio.load(audio_file) # avoids TypeError here?
75
 
76
+ # --convert to mono
77
+ if len(samples.shape) > 1:
78
+ samples = samples[:, 0]
 
 
 
79
 
80
  start_time = time.time()
81
 
82
  # --pipe it
83
  with torch.no_grad():
84
+ outputs = pipe(samples, sampling_rate=sample_rate, batch_size=batch_size, return_timestamps=False)
85
 
86
  end_time = time.time()
87
 
 
109
  return text.strip(), system_info
110
 
111
 
 
112
  # ------------summary section------------
113
 
114