import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from huggingface_hub import HfApi
from torch.utils.data import Dataset, DataLoader
st.set_page_config(
    page_title="Koya Recommendation System",
    initial_sidebar_state="auto",
)
st.markdown(
    """
# Koya Recommender System
### 👋 Welcome to the Koya recommendation system. This system recommends an LLM for you based on the parameters you set.
You can try it below."""
)
@st.cache
def get_model_infos(multilingual="multilingual"):
    api = HfApi()
    # Build the filter list, skipping the language tag when it is not given.
    filters = ["fill-mask"] if multilingual is None else ["fill-mask", multilingual]
    model_infos = api.list_models(filter=filters, cardData=True)
    data = [["id", "task", "lang", "sha"]]
    for model in model_infos:
        try:
            data.append([model.modelId, model.pipeline_tag, model.cardData["language"], model.sha])
        except (KeyError, TypeError, AttributeError):
            # Some models have no card data or no language tag.
            data.append([model.modelId, model.pipeline_tag, None, model.sha])
    df = pd.DataFrame.from_records(data[1:], columns=data[0])
    return df
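# Illustrative sketch of the returned frame (rows are made up for the example;
# actual contents depend on what is on the Hub at query time):
#
#     >>> get_model_infos(multilingual=None).head(2)
#                        id       task  lang     sha
#     0   bert-base-uncased  fill-mask  en       <commit sha>
#     1    xlm-roberta-base  fill-mask  multi    <commit sha>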
class MLMDataset(Dataset):
    def __init__(self, sentence, tokenizer, MLM_MASK_TOKEN, MLM_UNK_TOKEN):
        self.sentence = sentence
        self.tokenizer = tokenizer
        self.tensor_input = self.tokenizer(sentence, return_tensors="pt")["input_ids"]
        self.num_samples = self.tensor_input.size()[-1] - 2

        self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
        # Choose the positions to mask at random, excluding the BOS and EOS
        # tokens since they are not part of the sentence itself.
        self.random_ids = np.random.choice(
            [i for i in range(1, self.tensor_input.size(1) - 1)], self.num_samples, replace=False
        )
        self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T

        # Added by Chris Emezue on 29.01.2023
        # unk_mask zeroes out the score of the UNK token, so that p(w|...)
        # is 0 if w is UNK and left unchanged otherwise.
        unk_mask = torch.ones(
            self.batch_input.size()[0], self.batch_input.size()[1], self.tokenizer.vocab_size
        )
        batch_input_for_unk = self.batch_input.unsqueeze(-1).expand(unk_mask.size())
        self.unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)

        self.mask = torch.zeros(self.batch_input.size())
        src = torch.ones(self.batch_input.size(0)).unsqueeze(0).T
        self.mask.scatter_(1, self.random_ids, src)
        self.masked_input = self.batch_input.masked_fill(self.mask == 1, MLM_MASK_TOKEN)
        # Unmasked positions are set to -100 so the loss ignores them.
        # Note: -100 only affects the loss computation; the logits are
        # unchanged, so this matters only if the model's loss were used.
        self.labels = self.batch_input.masked_fill(self.masked_input != MLM_MASK_TOKEN, -100)

        assert self.masked_input.shape[0] == self.labels.shape[0] == self.mask.shape[0] == self.unk_mask.shape[0]

    def __len__(self):
        return self.masked_input.shape[0]

    def __getitem__(self, idx):
        return self.masked_input[idx], self.mask[idx], self.labels[idx], self.unk_mask[idx]
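# Sketch of what one dataset row looks like (token ids are made up for the
# example; MLM_MASK_TOKEN is assumed to be 103, as in BERT):
#
#     sentence tokens:  [CLS]   the    cat    sat   [SEP]
#     tensor_input:     [101,   1996,  4937,  2938,  102]
#     one masked row:   [101,   1996,  103,   2938,  102]   <- position 2 masked
#     its labels:       [-100,  -100,  4937,  -100, -100]   <- only the masked
#                                                              position is scored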
def get_sense_score_batched(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, BATCH_SIZE):
    mlm_dataset = MLMDataset(sentence, tokenizer, MLM_MASK_TOKEN, MLM_UNK_TOKEN)
    dataloader = DataLoader(mlm_dataset, batch_size=BATCH_SIZE)

    score = 1
    for batch in dataloader:
        masked_input, mask, labels, unk_mask = batch
        with torch.no_grad():  # inference only; no gradients needed
            output = model(masked_input, labels=labels)
        logits_ = output["logits"]
        logits = logits_ * unk_mask  # penalize UNK tokens by zeroing their scores
        indices = torch.nonzero(mask)
        logits_of_interest = logits[indices[:, 0], indices[:, 1], :]
        labels_of_interest = labels[indices[:, 0], indices[:, 1]]
        log_probs = logits_of_interest.gather(1, labels_of_interest.view(-1, 1))
        # exp(x + y) = exp(x) * exp(y), so accumulating the per-batch
        # exponentials by multiplication is equivalent to exponentiating
        # the sum over the full dataset.
        batch_score = (log_probs.sum() / (-1 * mlm_dataset.num_samples)).exp().item()
        score *= batch_score
    return score
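# Minimal usage sketch (model name and output value are illustrative only;
# the score varies with the model and the randomly chosen mask positions):
#
#     >>> tok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
#     >>> mdl = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased")
#     >>> get_sense_score_batched("The cat sat on the mat.", tok, mdl,
#     ...                         tok.mask_token_id, tok.unk_token_id, BATCH_SIZE=1)
#     2.37  # lower is better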
def get_sense_score(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, num_samples):
    """
    IDEA
    -----------------
    PP = perplexity(P), where the perplexity(P) function computes:
        (p_1 * p_2 * p_3 * ... * p_N) ^ (-1/N) for p_i in P

    In practice the computation is done in log space to avoid underflow:
        e^(-(log(p_1) + log(p_2) + ... + log(p_N)) / N)

    Note: every time you run this function the results change slightly
    (though the ordering should stay roughly the same), because the
    tokens to mask are chosen randomly.
    """
    tensor_input = tokenizer(sentence, return_tensors="pt")["input_ids"]
    batch_input = tensor_input.repeat(num_samples, 1)
    # Choose the positions to mask at random, excluding the BOS and EOS
    # tokens since they are not part of the sentence itself.
    random_ids = np.random.choice(
        [i for i in range(1, tensor_input.size(1) - 1)], num_samples, replace=False
    )
    random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T

    # Added by Chris Emezue on 29.01.2023
    # unk_mask zeroes out the score of the UNK token, so that p(w|...)
    # is 0 if w is UNK and left unchanged otherwise.
    unk_mask = torch.ones(batch_input.size()[0], batch_input.size()[1], tokenizer.vocab_size)
    batch_input_for_unk = batch_input.unsqueeze(-1).expand(unk_mask.size())
    unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)

    mask = torch.zeros(batch_input.size())
    src = torch.ones(batch_input.size(0)).unsqueeze(0).T
    mask.scatter_(1, random_ids, src)
    masked_input = batch_input.masked_fill(mask == 1, MLM_MASK_TOKEN)
    # Unmasked positions are set to -100 so the loss ignores them; this
    # only affects the loss computation, not the logits used below.
    labels = batch_input.masked_fill(masked_input != MLM_MASK_TOKEN, -100)

    with torch.no_grad():  # inference only; no gradients needed
        output = model(masked_input, labels=labels)
    logits_ = output["logits"]
    logits = logits_ * unk_mask  # penalize UNK tokens by zeroing their scores
    indices = torch.nonzero(mask)
    logits_of_interest = logits[indices[:, 0], indices[:, 1], :]
    labels_of_interest = labels[indices[:, 0], indices[:, 1]]
    log_probs = logits_of_interest.gather(1, labels_of_interest.view(-1, 1))
    score = (log_probs.sum() / (-1 * num_samples)).exp().item()
    return score
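# Worked example of the perplexity formula in the docstring above, with
# made-up probabilities:
#
#     P = [0.5, 0.25]  ->  N = 2
#     direct:    (0.5 * 0.25) ^ (-1/2) = 0.125 ^ (-0.5)          ~ 2.828
#     log space: e^(-(ln(0.5) + ln(0.25)) / 2)
#              = e^(-(-0.693 - 1.386) / 2) = e^(1.040)           ~ 2.828
#
# Both routes agree; the log-space form simply avoids multiplying many
# small probabilities together and underflowing to zero.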
def sort_dictionary(d):
    # Sort by value in ascending order, so the lowest (best) score comes first.
    # (Parameter renamed from `dict` to avoid shadowing the builtin.)
    keys = list(d.keys())
    values = list(d.values())
    sorted_value_index = np.argsort(values)
    sorted_dict = {keys[i]: values[i] for i in sorted_value_index}
    return sorted_dict
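# Example (model names and values are arbitrary):
#
#     >>> sort_dictionary({"model-a": 3.2, "model-b": 1.7})
#     {'model-b': 1.7, 'model-a': 3.2}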
def set_seed():
    np.random.seed(2023)
    torch.manual_seed(2023)
sentence = st.text_input("Please input a sample sentence in the target language")

models = get_model_infos(multilingual=None)
selected_models = st.multiselect(
    "Select the models you would like to compare", models["id"]
)
run = st.button("Get Scores")

if run:
    progress_text = "Computing recommendation scores"
    st.write(progress_text)
    my_bar = st.progress(0)

    set_seed()  # fix the random mask positions so the comparison is reproducible
    scores = {}
    for index, model_id in enumerate(selected_models):
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForMaskedLM.from_pretrained(model_id)
        if model_id.startswith("castorini"):
            tokenizer.model_max_length = 512
        MLM_MASK_TOKEN = tokenizer.mask_token_id  # e.g. (103, '[MASK]') for BERT
        MLM_UNK_TOKEN = tokenizer.unk_token_id

        BATCH_SIZE = 1
        score = get_sense_score_batched(
            sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, BATCH_SIZE
        )
        scores[model_id] = score
        # st.progress expects a fraction in [0.0, 1.0] (or an int 0-100).
        my_bar.progress((index + 1) / len(selected_models))
    scores = sort_dictionary(scores)
    st.write("Our recommendation is:", scores)