File size: 1,368 Bytes
e4f06e4 efb0bee e4f06e4 efb0bee e4f06e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import numpy as np
from transformers import AutomaticSpeechRecognitionPipeline, AutoTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
from typing import Dict
from pathlib import Path
class PreTrainedModel:
    """Speech-to-text wrapper built from the checkpoint files stored beside this module."""

    def __init__(self):
        """
        Build the speech-recognition pipeline from the local directory.

        The Wav2Vec2 CTC model, the tokenizer and the feature extractor are
        all loaded from the directory that contains this file, then bundled
        into one `AutomaticSpeechRecognitionPipeline` kept on `self.model`.
        """
        # Every artifact is expected to live in the same folder as this file.
        checkpoint_dir = Path(__file__).parent

        acoustic_model = Wav2Vec2ForCTC.from_pretrained(checkpoint_dir)
        text_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint_dir)

        asr_pipeline = AutomaticSpeechRecognitionPipeline(
            model=acoustic_model,
            feature_extractor=feature_extractor,
            tokenizer=text_tokenizer,
        )
        # NOTE: despite its name, `self.model` holds the whole pipeline,
        # not the bare Wav2Vec2 network.
        self.model = asr_pipeline

    def __call__(self, inputs) -> Dict[str, str]:
        """
        Transcribe the given audio.

        Args:
            inputs (:obj:`np.array`):
                The raw waveform of the received audio. By default at 16KHz.
        Return:
            A :obj:`dict` shaped like {"text": "XXX"}, holding the text
            detected in the input audio. The value is whatever the wrapped
            pipeline returns; it is passed through untouched.
        """
        transcription = self.model(inputs)
        return transcription
"""
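# Example usage. It is kept inside a string literal so it never runs on import.
# `load_dataset` is not imported in this file (presumably it comes from the `datasets` package).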
model = PreTrainedModel()
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
filename = ds[0]["file"]
with open(filename, "rb") as f:
data = f.read()
print(model(data))
""" |