osanseviero committed
Commit e796aae · 1 Parent(s): 3523cae
Files changed (1)
  1. model.py +43 -8
model.py CHANGED
@@ -1,9 +1,7 @@
 import numpy as np
-
 from transformers import AutomaticSpeechRecognitionPipeline, AutoTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
 from typing import Dict
-
-from datasets import load_dataset
+from pathlib import Path
 
 
 class PreTrainedModel():
@@ -11,12 +9,12 @@ class PreTrainedModel():
         """
         Loads model and tokenizer from local directory
         """
-        model = Wav2Vec2ForCTC.from_pretrained(".")
-        tokenizer = AutoTokenizer.from_pretrained(".")
-        extractor = Wav2Vec2FeatureExtractor.from_pretrained(".")
+        current_file_path = Path(__file__)
+        model = Wav2Vec2ForCTC.from_pretrained(current_file_path.parent)
+        tokenizer = AutoTokenizer.from_pretrained(current_file_path.parent)
+        extractor = Wav2Vec2FeatureExtractor.from_pretrained(current_file_path.parent)
 
         self.model = AutomaticSpeechRecognitionPipeline(model=model, feature_extractor=extractor, tokenizer=tokenizer)
-
     def __call__(self, inputs)-> Dict[str, str]:
         """
         Args:
@@ -31,11 +29,48 @@ class PreTrainedModel():
 
 """
 # Just an example using this.
+import subprocess
+from datasets import load_dataset
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
+    """
+    Librosa does that under the hood but forces the use of an actual
+    file leading to hitting disk, which is almost always very bad.
+    """
+    ar = f"{sampling_rate}"
+    ac = "1"
+    format_for_conversion = "f32le"
+    ffmpeg_command = [
+        "ffmpeg",
+        "-i",
+        "pipe:0",
+        "-ac",
+        ac,
+        "-ar",
+        ar,
+        "-f",
+        format_for_conversion,
+        "-hide_banner",
+        "-loglevel",
+        "quiet",
+        "pipe:1",
+    ]
+
+    ffmpeg_process = subprocess.Popen(
+        ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
+    )
+    output_stream = ffmpeg_process.communicate(bpayload)
+    out_bytes = output_stream[0]
+
+    audio = np.frombuffer(out_bytes, np.float32).copy()
+    if audio.shape[0] == 0:
+        raise ValueError("Malformed soundfile")
+    return audio
 
 model = PreTrainedModel()
 ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
 filename = ds[0]["file"]
 with open(filename, "rb") as f:
-    data = f.read()
+    data = ffmpeg_read(f.read(), 16000)
 print(model(data))
 """