
# Merge branch 'main' of https://huggingface.co/osanseviero/asr-with-transformers-wav2vec2 into main
# 8074454
from pathlib import Path
from typing import Dict

import numpy as np
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    AutoTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
)
class PreTrainedModel:
    """
    Callable speech-recognition wrapper built from the checkpoint files that
    live in the same directory as this source file.
    """

    def __init__(self):
        """
        Loads model, tokenizer and feature extractor from the local directory
        containing this file and combines them into a single ASR pipeline.
        """
        model_dir = Path(__file__).parent
        model = Wav2Vec2ForCTC.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_dir)
        # NOTE: despite its name, `self.model` holds the pipeline assembled
        # from the three components above, not the bare Wav2Vec2ForCTC model.
        # The attribute name is kept so existing callers continue to work.
        self.model = AutomaticSpeechRecognitionPipeline(
            model=model, feature_extractor=extractor, tokenizer=tokenizer
        )

    def __call__(self, inputs) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.array`):
                The raw waveform of audio received. By default at 16KHz.
        Return:
            A :obj:`dict`: The returned object should look like {"text": "XXX"},
            containing the detected text from the input audio.
        """
        # Pure delegation: whatever the pipeline returns is passed through.
        return self.model(inputs)
""" | |
# Just an example using this.
import subprocess
from datasets import load_dataset

def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]
    ffmpeg_process = subprocess.Popen(
        ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
    )
    output_stream = ffmpeg_process.communicate(bpayload)
    out_bytes = output_stream[0]
    audio = np.frombuffer(out_bytes, np.float32).copy()
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio

model = PreTrainedModel()
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
filename = ds[0]["file"]
with open(filename, "rb") as f:
    data = ffmpeg_read(f.read(), 16000)
print(model(data))
"""