import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the fine-tuned Whisper checkpoint and its processor.
repo_id = "Sven33/maze-whisper-3000"
processor = WhisperProcessor.from_pretrained(repo_id)
model = WhisperForConditionalGeneration.from_pretrained(repo_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load the audio clip and resample to 16 kHz, the rate Whisper expects.
speech_array, sampling_rate = torchaudio.load("../../data/test_audio/673_clip.wav")
if sampling_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
    speech_array = resampler(speech_array)
input_audio = speech_array[0].numpy()

# Convert the waveform to log-mel input features and generate token IDs.
inputs = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_features.to(device)
predicted_ids = model.generate(inputs)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

print("Transcription:")
print(transcription)
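
# Alternative (a minimal sketch, not part of the original example): the same
# checkpoint can also be loaded through the high-level transformers pipeline,
# which handles feature extraction, resampling, and decoding internally.
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="Sven33/maze-whisper-3000",
    device=0 if torch.cuda.is_available() else -1,  # GPU index, or -1 for CPU
)
result = asr("../../data/test_audio/673_clip.wav")  # same test clip as above
print(result["text"])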