osanseviero committed
Commit e796aae · 1 Parent(s): 3523cae
Files changed (1)
  1. model.py +43 -8
model.py CHANGED
@@ -1,9 +1,7 @@
 import numpy as np
-
 from transformers import AutomaticSpeechRecognitionPipeline, AutoTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
 from typing import Dict
-
-from datasets import load_dataset
+from pathlib import Path
 
 
 class PreTrainedModel():
@@ -11,12 +9,12 @@ class PreTrainedModel():
         """
         Loads model and tokenizer from local directory
         """
-        model = Wav2Vec2ForCTC.from_pretrained(".")
-        tokenizer = AutoTokenizer.from_pretrained(".")
-        extractor = Wav2Vec2FeatureExtractor.from_pretrained(".")
+        current_file_path = Path(__file__)
+        model = Wav2Vec2ForCTC.from_pretrained(current_file_path.parent)
+        tokenizer = AutoTokenizer.from_pretrained(current_file_path.parent)
+        extractor = Wav2Vec2FeatureExtractor.from_pretrained(current_file_path.parent)
 
         self.model = AutomaticSpeechRecognitionPipeline(model=model, feature_extractor=extractor, tokenizer=tokenizer)
-
     def __call__(self, inputs)-> Dict[str, str]:
         """
         Args:
@@ -31,11 +29,48 @@ class PreTrainedModel():
 
 """
 # Just an example using this.
+import subprocess
+from datasets import load_dataset
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
+    """
+    Librosa does that under the hood but forces the use of an actual
+    file leading to hitting disk, which is almost always very bad.
+    """
+    ar = f"{sampling_rate}"
+    ac = "1"
+    format_for_conversion = "f32le"
+    ffmpeg_command = [
+        "ffmpeg",
+        "-i",
+        "pipe:0",
+        "-ac",
+        ac,
+        "-ar",
+        ar,
+        "-f",
+        format_for_conversion,
+        "-hide_banner",
+        "-loglevel",
+        "quiet",
+        "pipe:1",
+    ]
+
+    ffmpeg_process = subprocess.Popen(
+        ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
+    )
+    output_stream = ffmpeg_process.communicate(bpayload)
+    out_bytes = output_stream[0]
+
+    audio = np.frombuffer(out_bytes, np.float32).copy()
+    if audio.shape[0] == 0:
+        raise ValueError("Malformed soundfile")
+    return audio
 
 model = PreTrainedModel()
 ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
 filename = ds[0]["file"]
 with open(filename, "rb") as f:
-    data = f.read()
+    data = ffmpeg_read(f.read(), 16000)
 print(model(data))
 """