import torch from transformers import pipeline from datasets import load_dataset device = "cuda:0" if torch.cuda.is_available() else "cpu" pipe = pipeline( "automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30, device=device, ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = ds[0]["audio"] prediction = pipe(sample.copy(), batch_size=8)["text"] " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." # we can also return timestamps for the predictions prediction = pipe(sample.copy(), batch_size=8, return_timestamps=True)["chunks"] [{'text': ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.', 'timestamp': (0.0, 5.44)}]