|
import numpy as np |
|
|
|
from transformers import AutomaticSpeechRecognitionPipeline, AutoTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC |
|
from typing import Dict |
|
from pathlib import Path |
|
|
|
class PreTrainedModel:
    """Speech-recognition entry point backed by a local Wav2Vec2 CTC checkpoint.

    The model weights, tokenizer and feature extractor are all read from the
    directory that contains this file, then combined into a single
    ``AutomaticSpeechRecognitionPipeline``. Instances are callable.
    """

    def __init__(self):
        """
        Loads model, tokenizer and feature extractor from local directory
        """
        # Every artifact is loaded from the directory holding this module,
        # so resolve that directory once and reuse it.
        model_dir = Path(__file__).parent

        model = Wav2Vec2ForCTC.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_dir)

        # NOTE: despite its name, ``self.model`` holds the complete ASR
        # pipeline (feature extraction + model + decoding), not the bare
        # network. The attribute name is kept for backward compatibility.
        self.model = AutomaticSpeechRecognitionPipeline(
            model=model,
            feature_extractor=extractor,
            tokenizer=tokenizer,
        )

    def __call__(self, inputs) -> Dict[str, str]:
        """
        Transcribe audio by forwarding ``inputs`` unchanged to the pipeline.

        Args:
            inputs (:obj:`np.array` or :obj:`bytes`):
                The raw waveform of audio received, by default at 16KHz, or
                the raw bytes of an audio file (as in the usage example at
                the bottom of this file). Which other input forms are
                accepted is decided by the underlying pipeline, not here.
        Return:
            A :obj:`dict`. The returned object should look like
            {"text": "XXX"}, containing the text detected in the input audio.
        """
        return self.model(inputs)
|
|
|
|
|
""" |
|
# Just an example using this.
# It needs the `datasets` package to fetch a sample audio file.

from datasets import load_dataset

model = PreTrainedModel()
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
filename = ds[0]["file"]
with open(filename, "rb") as f:
    data = f.read()
print(model(data))
|
""" |