ales committed on
Commit
51f7123
·
1 Parent(s): a2498ab

change audio loading

Browse files
Files changed (1) hide show
  1. app.py +10 -24
app.py CHANGED
@@ -1,13 +1,9 @@
1
  from pprint import pformat
2
 
3
- import numpy as np
4
-
5
- import torch
6
- import torchaudio
7
- from torchaudio.transforms import Resample
8
-
9
  from huggingface_hub import hf_hub_download
10
 
 
 
11
  import gradio as gr
12
 
13
  from pipeline import PreTrainedPipeline
@@ -18,19 +14,12 @@ LM_HUB_FP = 'language_model/cv8be_5gram.bin'
18
 
19
 
20
  def main(audio_fp: str):
21
- audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
22
- init_audio_shape = audio.shape
23
-
24
- # convert stereo to mono
25
- converted_to_mono = False
26
- if audio.shape[0] > 1:
27
- audio = torch.mean(audio, dim=0, keepdim=True)
28
- converted_to_mono = True
29
-
30
- # resample audio to 16kHz
31
- resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
32
- audio_resampled = resampler(audio)
33
- inputs = audio_resampled.numpy().flatten() # cast to numpy as expected by the pipeline
34
 
35
  # download Language Model from HF Hub
36
  lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
@@ -46,12 +35,9 @@ def main(audio_fp: str):
46
  tech_data = pipeline_res
47
  del tech_data['text']
48
  tech_data['sampling_rate_orig'] = sampling_rate
49
- tech_data['init_audio_shape'] = init_audio_shape
50
- tech_data['converted_to_mono'] = converted_to_mono
51
- tech_data['resampled_audio_shape'] = audio_resampled.shape
52
  tech_data['inputs_shape'] = inputs.shape
53
- tech_data['inputs_max'] = np.max(inputs).item()
54
- tech_data['inputs_min'] = np.min(inputs).item()
55
 
56
  tech_data_str = pformat(tech_data)
57
 
 
1
  from pprint import pformat
2
 
 
 
 
 
 
 
3
  from huggingface_hub import hf_hub_download
4
 
5
+ import datasets as hfd
6
+
7
  import gradio as gr
8
 
9
  from pipeline import PreTrainedPipeline
 
14
 
15
 
16
  def main(audio_fp: str):
17
+ # read and preprocess audio with huggingface.datasets
18
+ ds = hfd.Dataset.from_dict({'path': [audio_fp]})
19
+ ds = ds.cast_column('path', hfd.Audio(sampling_rate=16_000, mono=True))
20
+ ds = ds.rename_column('path', 'audio')
21
+ inputs = ds[0]['audio']['array']
22
+ sampling_rate = ds[0]['audio']['sampling_rate']
 
 
 
 
 
 
 
23
 
24
  # download Language Model from HF Hub
25
  lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
 
35
  tech_data = pipeline_res
36
  del tech_data['text']
37
  tech_data['sampling_rate_orig'] = sampling_rate
 
 
 
38
  tech_data['inputs_shape'] = inputs.shape
39
+ tech_data['inputs_max'] = inputs.max().item()
40
+ tech_data['inputs_min'] = inputs.min().item()
41
 
42
  tech_data_str = pformat(tech_data)
43