Spaces:

ales
/

wav2vec2-cv-be-lm

Runtime error

ales committed on Apr 14, 2022

Commit

51f7123

1 Parent(s): a2498ab

change audio loading

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,13 +1,9 @@
 from pprint import pformat
-import numpy as np
-import torch
-import torchaudio
-from torchaudio.transforms import Resample
 from huggingface_hub import hf_hub_download
 import gradio as gr
 from pipeline import PreTrainedPipeline
@@ -18,19 +14,12 @@ LM_HUB_FP = 'language_model/cv8be_5gram.bin'
 def main(audio_fp: str):
-    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
-    init_audio_shape = audio.shape
-    # convert stereo to mono
-    converted_to_mono = False
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
-        converted_to_mono = True
-    # resample audio to 16kHz
-    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
-    audio_resampled = resampler(audio)
-    inputs = audio_resampled.numpy().flatten()  # cast to numpy as expected by the pipeline
     # download Language Model from HF Hub
     lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
@@ -46,12 +35,9 @@ def main(audio_fp: str):
     tech_data = pipeline_res
     del tech_data['text']
     tech_data['sampling_rate_orig'] = sampling_rate
-    tech_data['init_audio_shape'] = init_audio_shape
-    tech_data['converted_to_mono'] = converted_to_mono
-    tech_data['resampled_audio_shape'] = audio_resampled.shape
     tech_data['inputs_shape'] = inputs.shape
-    tech_data['inputs_max'] = np.max(inputs).item()
-    tech_data['inputs_min'] = np.min(inputs).item()
     tech_data_str = pformat(tech_data)

 from pprint import pformat
 from huggingface_hub import hf_hub_download
+import datasets as hfd
 import gradio as gr
 from pipeline import PreTrainedPipeline
 def main(audio_fp: str):
+    # read and preprocess audio with huggingface.datasets
+    ds = hfd.Dataset.from_dict({'path': [audio_fp]})
+    ds = ds.cast_column('path', hfd.Audio(sampling_rate=16_000, mono=True))
+    ds = ds.rename_column('path', 'audio')
+    inputs = ds[0]['audio']['array']
+    sampling_rate = ds[0]['audio']['sampling_rate']
     # download Language Model from HF Hub
     lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
     tech_data = pipeline_res
     del tech_data['text']
     tech_data['sampling_rate_orig'] = sampling_rate
     tech_data['inputs_shape'] = inputs.shape
+    tech_data['inputs_max'] = inputs.max().item()
+    tech_data['inputs_min'] = inputs.min().item()
     tech_data_str = pformat(tech_data)