Spaces:
Runtime error
Runtime error
File size: 6,528 Bytes
b8be36c d276596 b8be36c 96059b8 b8be36c e02da08 b8be36c d276596 b8be36c d276596 b8be36c d276596 b8be36c d276596 5e5c4f6 d276596 5e5c4f6 d276596 b8be36c d276596 b8be36c d276596 b8be36c d276596 b8be36c d276596 b8be36c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import streamlit as st
from evaluate import evaluator
import evaluate
import datasets
from huggingface_hub import HfApi, ModelFilter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
import matplotlib.pyplot as plt
# --- Page header and dataset selection -------------------------------------
st.title("Metric Compare")
st.markdown("### Choose the dataset you want to use for the comparison:")

api = HfApi()

# The list of Hub dataset ids is deliberately NOT called `datasets`: that name
# belongs to the imported `datasets` module, which must stay usable for
# loading the data further down the script.
dataset_ids = [
    d.id
    for d in api.list_datasets(
        filter="task_categories:text-classification",
        sort="downloads",
        direction=-1,
        limit=20,
    )
]
dset = st.selectbox('Choose a dataset from the Hub', options=dataset_ids)

# `dset` is a repo id (a plain string), so it has no `.keys()`; the available
# split names have to be looked up on the Hub instead.
try:
    split_names = datasets.get_dataset_split_names(dset)
except Exception:
    # Best effort: the lookup can fail (e.g. the dataset needs a config name
    # or is not reachable), so offer the conventional split names instead.
    st.warning("Could not list the splits of this dataset, showing the usual split names instead")
    split_names = ["validation", "test", "train"]
dset_split = st.selectbox('Choose a dataset split for evaluation', options=split_names)
# --- Model selection ---------------------------------------------------------
st.markdown("### Now select up to 5 models to compare their performance:")

# Only offer models that declare the chosen dataset as their training data.
filt = ModelFilter(trained_dataset=dset)
all_models = [
    m.modelId
    for m in api.list_models(filter=filt, sort="downloads", direction=-1, limit=20)
    # Filter on the loop variable `m`; tags may be missing, hence the `or []`.
    if 't5' not in (m.tags or [])
]
models = st.multiselect(
    'Choose the models that have been trained/finetuned on this dataset',
    options=all_models)

button = st.button("Print Models", disabled=False)
if button:
    # The limit applies to the models the user picked in the multiselect.
    if len(models) < 6:
        st.write(models)
    else:
        st.warning("Please select at most 5 models")
# --- Metric selection --------------------------------------------------------
st.markdown("### What two metrics do you want to compare?")

metrics = st.multiselect(
    'Choose the metrics for the comparison',
    options=['f1', 'accuracy', 'precision', 'recall'],
)

button2 = st.button("Print Metrics", disabled=False)
if button2:
    # Echo the selection back, or complain when more than two were picked.
    if len(metrics) > 2:
        st.warning("Please select at most 2 metrics")
    else:
        st.write(metrics)
st.markdown("### Now wait for the dataset and models to load (this can take some time if they are big!")

### Loading data
# Imported locally under a distinct name so the loader cannot be shadowed by a
# script-level variable that happens to be called `datasets`.
from datasets import load_dataset as _load_dataset

try:
    # First choice: the split the user picked in the UI.
    data = _load_dataset(dset, split=dset_split)
    st.text("Loaded the " + str(dset_split) + " split of dataset " + str(dset))
except Exception:
    try:
        # Fallback: the "test" split.
        data = _load_dataset(dset, split="test")
        st.text("Loaded the test split of dataset "+ str(dset))
    except Exception:
        # Nothing could be loaded; the rest of the script needs `data`, so
        # report the problem and end this script run here.
        st.text("Sorry, I can't load this dataset... try another one!")
        st.stop()
### Loading models
# Each selected model is exposed as the globals `tokenizer_<i>` / `model_<i>`,
# where <i> is the model's position in `models`.
for i, model_id in enumerate(models):
    try:
        # Load both parts before publishing either, so a failure never leaves
        # a tokenizer global without its matching model.
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)
        loaded_model = AutoModelForSequenceClassification.from_pretrained(model_id)
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # other BaseException-derived signals are not swallowed.
        st.text("Sorry, I can't load model "+ str(model_id))
    else:
        globals()[f"tokenizer_{i}"] = loaded_tokenizer
        globals()[f"model_{i}"] = loaded_model
        st.text("Loaded model "+ str(model_id))
### Defining metrics
# Publish every loaded metric as a global named after the metric itself
# (e.g. `f1`, `accuracy`). The key must be the metric name: a constant key
# would make each metric overwrite the previous one.
for metric_name in metrics:
    try:
        globals()[metric_name] = evaluate.load(metric_name)
    except Exception:
        st.text("Sorry, I can't load metric "+ str(metric_name) +"... Try another one!")

### Defining Evaluator
# NOTE: this name shadows the builtin `eval`; it is kept unchanged because
# other parts of the script may refer to it.
eval = evaluator("text-classification")
### Defining pipelines
st.markdown("### Help us pick the right labels for your models")
st.text("The labels for your dataset are: "+ str(data.features['label'].names))

# ---------------------------------------------------------------------------
# Disabled prototype of the evaluation + plotting step, kept for reference.
#
# It used to be wrapped in a bare triple-quoted string. Streamlit's "magic"
# writes bare string expressions into the app, so the prototype is kept as
# comments instead, which are never rendered or executed.
#
# NOTE(review): the prototype uses names that are not defined in the visible
# part of this script (model_list, AutoConfig, model1..model5,
# tokenizer1..tokenizer5) and hard-codes label mappings and plot annotations
# for five specific models; it needs rework before it can be re-enabled.
# ---------------------------------------------------------------------------
# for i in range (len(model_list)):
# st.text("The labels for your dataset are: "+ str(data.features['label'].names))
# print(model_list[i])
# print(AutoConfig.from_pretrained(models[0]).id2label)
# for i in range (len(models)):
# try:
# globals()[f"pipe1_{i}"] = AutoTokenizer.from_pretrained(models[i])
# globals()[f"model_{i}"] = AutoModelForSequenceClassification.from_pretrained(models[i])
# st.text("Loaded model "+ str(models[i]))
# except:
# st.text("Sorry, I can't load model "+ str(models[i]))
# pipe1 = pipeline("text-classification", model=model1, tokenizer= tokenizer1, device=0)
# res_accuracy1 = eval.compute(model_or_pipeline=pipe1, data=data, metric=accuracy,
# label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
# res_f11 = eval.compute(model_or_pipeline=pipe1, data=data, metric=f1,
# label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
# print({**res_accuracy1, **res_f11})
# pipe2 = pipeline("text-classification", model=model2, tokenizer= tokenizer2, device=0)
# res_accuracy2 = eval.compute(model_or_pipeline=pipe2, data=data, metric=accuracy,
# label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
# res_f12 = eval.compute(model_or_pipeline=pipe2, data=data, metric=f1,
# label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
# print({**res_accuracy2, **res_f12})
# pipe3 = pipeline("text-classification", model=model3, tokenizer= tokenizer3, device=0)
# res_accuracy3 = eval.compute(model_or_pipeline=pipe3, data=data, metric=accuracy,
# label_mapping={"neg": 0, "pos": 1},)
# res_f13 = eval.compute(model_or_pipeline=pipe3, data=data, metric=f1,
# label_mapping={"neg": 0, "pos": 1},)
# print({**res_accuracy3, **res_f13})
# pipe4 = pipeline("text-classification", model=model4, tokenizer= tokenizer4, device=0)
# res_accuracy4 = eval.compute(model_or_pipeline=pipe4, data=data, metric=accuracy,
# label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
# res_f14 = eval.compute(model_or_pipeline=pipe4, data=data, metric=f1,
# label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
# print({**res_accuracy4, **res_f14})
# pipe5 = pipeline("text-classification", model=model5, tokenizer= tokenizer5, device=0)
# res_accuracy5 = eval.compute(model_or_pipeline=pipe5, data=data, metric=accuracy,
# label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
# res_f15 = eval.compute(model_or_pipeline=pipe5, data=data, metric=f1,
# label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
# print({**res_accuracy5, **res_f15})
# plt.plot(res_accuracy1['accuracy'], res_f11['f1'], marker='o', markersize=6, color="red")
# plt.annotate('distilbert', xy=(res_accuracy1['accuracy']+0.001, res_f11['f1']))
# plt.plot(res_accuracy2['accuracy'], res_f12['f1'], marker='o', markersize=6, color="blue")
# plt.annotate('distilbert-base-uncased-finetuned', xy=(res_accuracy2['accuracy']+0.001, res_f12['f1']))
# plt.plot(res_accuracy3['accuracy'], res_f13['f1'], marker='o', markersize=6, color="green")
# plt.annotate('roberta-base', xy=(res_accuracy3['accuracy']-0.009, res_f13['f1']))
# plt.plot(res_accuracy4['accuracy'], res_f14['f1'], marker='o', markersize=6, color="purple")
# plt.annotate('funnel-transformer-small', xy=(res_accuracy4['accuracy']-0.015, res_f14['f1']))
# plt.plot(res_accuracy5['accuracy'], res_f15['f1'], marker='o', markersize=6, color="black")
# plt.annotate('SENATOR', xy=(res_accuracy5['accuracy']+0.001, res_f15['f1']))
# plt.xlabel('Accuracy')
# plt.ylabel('F1 Score')
# #plt.xlim([0.9, 1.0])
# #plt.ylim([0.9, 1.0])
# plt.title('Comparing the Models')