|
import torch |
|
import torchaudio |
|
from transformers import WhisperProcessor, WhisperForConditionalGeneration |
|
|
|
# Whisper feature extraction expects 16 kHz mono audio.
TARGET_SAMPLE_RATE = 16_000

# Fine-tuned Whisper checkpoint on the Hugging Face Hub.
repo_id = "Sven33/maze-whisper-3000"

# Processor bundles the feature extractor and tokenizer; model holds the weights.
processor = WhisperProcessor.from_pretrained(repo_id)
model = WhisperForConditionalGeneration.from_pretrained(repo_id)

# Run on GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # inference only — be explicit even though from_pretrained defaults to eval

# torchaudio returns a (channels, samples) float tensor plus the file's sample rate.
speech_array, sampling_rate = torchaudio.load("../../data/test_audio/673_clip.wav")

# Resample to 16 kHz if the file was recorded at a different rate.
if sampling_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sampling_rate, new_freq=TARGET_SAMPLE_RATE
    )
    speech_array = resampler(speech_array)

# Downmix to mono by averaging channels. (Fix: the original took
# speech_array[0], silently discarding every channel but the first.)
input_audio = speech_array.mean(dim=0).numpy()

inputs = processor(
    input_audio, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
).input_features.to(device)

# inference_mode avoids building an autograd graph during decoding.
with torch.inference_mode():
    predicted_ids = model.generate(inputs)

# Decode token ids to text; batch size is 1, so take the first entry.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

print("Transcription:")
print(transcription)
|
|