# SATEv1.5 / transcription /test_hf.py
# Shuwei Hou
# initial_for_hf
# 5806e12
# raw
# history blame contribute delete
# 891 Bytes
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# Smoke test: load a Whisper checkpoint and transcribe a single audio clip.

# Checkpoint identifier passed to both `from_pretrained` calls below.
repo_id = "Sven33/maze-whisper-3000"

# Sample rate (Hz) declared to the processor; audio recorded at any other rate
# is resampled to this value before feature extraction.
TARGET_SAMPLE_RATE = 16000

# The processor (feature extraction + token decoding) and the model weights
# are loaded from the same checkpoint so that they stay in sync.
processor = WhisperProcessor.from_pretrained(repo_id)
model = WhisperForConditionalGeneration.from_pretrained(repo_id)

# Prefer the GPU when one is available, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# NOTE: the path is relative, so it is resolved against the current working
# directory rather than against this file's location.
speech_array, sampling_rate = torchaudio.load("../../data/test_audio/673_clip.wav")

# The processor is told below that the audio is TARGET_SAMPLE_RATE, so the
# waveform must actually be at that rate.
if sampling_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sampling_rate, new_freq=TARGET_SAMPLE_RATE
    )
    speech_array = resampler(speech_array)

# torchaudio.load returns a channels-first tensor by default, so index 0 picks
# the first channel; any further channels are ignored.
input_audio = speech_array[0].numpy()

# Turn the raw waveform into model input features and move them to the same
# device as the model.
inputs = processor(
    input_audio, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
).input_features.to(device)

predicted_ids = model.generate(inputs)

# batch_decode returns one string per batch item; there is exactly one here.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

print("Transcription:")
print(transcription)