File size: 1,540 Bytes
de07127
 
e6f4bc9
79f46c8
256b607
2f1b912
f547035
de07127
8a965da
 
de07127
8a965da
79f46c8
2996449
decc59e
664eb76
5625d5f
90d7b9b
0b5b7f4
8a965da
 
3b57b43
3826e01
973bb39
62ac43e
 
14427e6
dca4d0e
 
8849149
3836e33
8849149
3925f62
75e29dc
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
import subprocess
import sys

# Packages installed at start-up, in the order they are requested.
# NOTE: "scipy==1.8.1" was previously spelled "spicy==1.8.1", which is a
# different PyPI project; the pinned version matches a SciPy release.
_REQUIREMENTS = (
    "transformers==4.27.0",
    "torch",
    "openai",
    "accelerate",
    "evaluate",
    "datasets",
    "scipy==1.8.1",
    "soundfile",
    "jiwer",
    "datasets[audio]",
    "numba==0.51.2",
)


def _pip_install(requirement):
    """Install *requirement* with the pip of the running interpreter.

    The command is passed as an argument list (no shell), so extras such as
    ``datasets[audio]`` are not subject to shell globbing, and
    ``sys.executable -m pip`` guarantees the package lands in the environment
    this script is running in.

    Returns:
        The pip exit status. A non-zero status is reported on stderr but not
        raised, so one failing pin does not abort the whole script.
    """
    status = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    ).returncode
    if status != 0:
        print(
            f"warning: 'pip install {requirement}' exited with status {status}",
            file=sys.stderr,
        )
    return status


for _requirement in _REQUIREMENTS:
    _pip_install(_requirement)

# Third-party imports must come after the installs above.
import torch  # noqa: E402
from datasets import Audio, disable_caching, load_dataset  # noqa: E402
from evaluate import evaluator  # noqa: E402
from transformers import (  # noqa: E402
    AutoFeatureExtractor,
    WhisperFeatureExtractor,
    WhisperModel,
    WhisperTokenizer,
    pipeline,
)

# Turn off the datasets cache. The deprecated `set_caching_enabled(False)`
# alias is no longer imported: `disable_caching()` is its replacement, and an
# unpinned `datasets` install may not export the old name (TODO confirm
# against the installed release).
disable_caching()

# Access token for the Hugging Face Hub, read from the environment.
# A missing variable raises KeyError here, before anything is downloaded.
huggingface_token = os.environ["huggingface_token"]

# Load the Whisper model and its matching feature extractor from the Hub.
model = WhisperModel.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
feature_extractor = AutoFeatureExtractor.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)

# Load the test split and have the audio column decoded at 16 kHz.
ds = load_dataset("mskov/miso_test", split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

# Convert the first sample's waveform into model input features. Passing the
# sampling rate lets the extractor check it against the rate it expects
# instead of assuming it silently.
_sample_audio = ds[0]["audio"]
inputs = feature_extractor(
    _sample_audio["array"],
    sampling_rate=_sample_audio["sampling_rate"],
    return_tensors="pt",
)
print(inputs)
input_features = inputs.input_features

# Two decoder positions, both set to the configured decoder start token.
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id

# Inference only: no autograd graph is needed to inspect the output shape.
with torch.no_grad():
    last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
print(list(last_hidden_state.shape))