File size: 1,368 Bytes
e4f06e4
 
 
 
efb0bee
e4f06e4
 
 
 
 
 
efb0bee
 
 
 
e4f06e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import numpy as np

from transformers import AutomaticSpeechRecognitionPipeline, AutoTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
from typing import Dict
from pathlib import Path

class PreTrainedModel:
    """Speech-to-text wrapper built from the Wav2Vec2 CTC files stored beside this module."""

    def __init__(self):
        """
        Builds the speech recognition pipeline from the directory that
        contains this file (model weights, tokenizer and feature extractor)
        """
        model_dir = Path(__file__).parent

        # All three components are read from the same local directory.
        ctc_model = Wav2Vec2ForCTC.from_pretrained(model_dir)
        ctc_tokenizer = AutoTokenizer.from_pretrained(model_dir)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_dir)

        # Kept under the name `model` although it holds the whole pipeline.
        self.model = AutomaticSpeechRecognitionPipeline(
            model=ctc_model,
            tokenizer=ctc_tokenizer,
            feature_extractor=feature_extractor,
        )

    def __call__(self, inputs) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.array`):
                Raw audio waveform to transcribe. By default sampled at 16KHz.
        Return:
            A :obj:`dict` shaped like {"text": "XXX"}, where the value is the
            text detected in the input audio.
        """
        transcription = self.model(inputs)
        return transcription


"""
# Example usage (needs `from datasets import load_dataset`); the raw bytes of an audio file are passed to the model.

model = PreTrainedModel()
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
filename = ds[0]["file"]
with open(filename, "rb") as f:
    data = f.read()
    print(model(data))
"""