Commit · e796aae1
Parent(s): 3523cae
Update
model.py CHANGED

@@ -1,9 +1,7 @@
 import numpy as np
-
 from transformers import AutomaticSpeechRecognitionPipeline, AutoTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
 from typing import Dict
-
-from datasets import load_dataset
+from pathlib import Path
 
 
 class PreTrainedModel():
@@ -11,12 +9,12 @@ class PreTrainedModel():
         """
         Loads model and tokenizer from local directory
         """
-
-
-
+        current_file_path = Path(__file__)
+        model = Wav2Vec2ForCTC.from_pretrained(current_file_path.parent)
+        tokenizer = AutoTokenizer.from_pretrained(current_file_path.parent)
+        extractor = Wav2Vec2FeatureExtractor.from_pretrained(current_file_path.parent)
 
         self.model = AutomaticSpeechRecognitionPipeline(model=model, feature_extractor=extractor, tokenizer=tokenizer)
-
     def __call__(self, inputs)-> Dict[str, str]:
         """
         Args:
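The new __init__ resolves the model, tokenizer, and feature extractor from the directory that contains model.py itself. As a hedged illustration (not part of this commit), the sibling files those from_pretrained(current_file_path.parent) calls expect could be produced by exporting a public checkpoint into that directory; facebook/wav2vec2-base-960h below is only an example checkpoint, and the target path is assumed:

from pathlib import Path
from transformers import AutoTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC

checkpoint = "facebook/wav2vec2-base-960h"  # illustrative checkpoint, not named by the commit
target_dir = Path(".")                      # hypothetical: the repository directory holding model.py

# Writing all three artifacts side by side gives from_pretrained(current_file_path.parent)
# the config, weights, vocabulary, and preprocessor files it looks for.
Wav2Vec2ForCTC.from_pretrained(checkpoint).save_pretrained(target_dir)
AutoTokenizer.from_pretrained(checkpoint).save_pretrained(target_dir)
Wav2Vec2FeatureExtractor.from_pretrained(checkpoint).save_pretrained(target_dir)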
@@ -31,11 +29,48 @@
 
 """
 # Just an example using this.
+import subprocess
+from datasets import load_dataset
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
+    """
+    Librosa does that under the hood but forces the use of an actual
+    file leading to hitting disk, which is almost always very bad.
+    """
+    ar = f"{sampling_rate}"
+    ac = "1"
+    format_for_conversion = "f32le"
+    ffmpeg_command = [
+        "ffmpeg",
+        "-i",
+        "pipe:0",
+        "-ac",
+        ac,
+        "-ar",
+        ar,
+        "-f",
+        format_for_conversion,
+        "-hide_banner",
+        "-loglevel",
+        "quiet",
+        "pipe:1",
+    ]
+
+    ffmpeg_process = subprocess.Popen(
+        ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
+    )
+    output_stream = ffmpeg_process.communicate(bpayload)
+    out_bytes = output_stream[0]
+
+    audio = np.frombuffer(out_bytes, np.float32).copy()
+    if audio.shape[0] == 0:
+        raise ValueError("Malformed soundfile")
+    return audio
 
 model = PreTrainedModel()
 ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
 filename = ds[0]["file"]
 with open(filename, "rb") as f:
-    data = f.read()
+    data = ffmpeg_read(f.read(), 16000)
 print(model(data))
 """
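The ffmpeg_command list added in this hunk assembles, in effect, ffmpeg -i pipe:0 -ac 1 -ar <sampling_rate> -f f32le -hide_banner -loglevel quiet pipe:1: the raw container bytes are fed to ffmpeg on stdin and mono 32-bit float PCM at the requested rate comes back on stdout, so no temporary file ever touches disk. A minimal standalone sketch of the same pipe-based decode, assuming the ffmpeg binary is on PATH and that sample.flac is a hypothetical local audio file:

import subprocess

import numpy as np

sampling_rate = 16000
with open("sample.flac", "rb") as f:  # hypothetical input file
    payload = f.read()

# Same flags as the commit's ffmpeg_command: read from stdin (pipe:0), downmix
# to one channel, resample, and emit raw float32 samples on stdout (pipe:1).
completed = subprocess.run(
    ["ffmpeg", "-i", "pipe:0", "-ac", "1", "-ar", str(sampling_rate),
     "-f", "f32le", "-hide_banner", "-loglevel", "quiet", "pipe:1"],
    input=payload,
    stdout=subprocess.PIPE,
    check=True,
)
waveform = np.frombuffer(completed.stdout, dtype=np.float32)
print(waveform.shape, waveform.dtype)  # a 1-D float32 array, ready for the pipeline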