Commit d8eab79 · Sasha
Parent: 4474a2c

adding some fixes (paw still isn't working though)
app.py CHANGED

@@ -9,7 +9,7 @@ import numpy as np
 import statistics
 
 st.set_page_config(
-    page_title="Evaluation Buddy",
+    page_title="HuggingFace Evaluation Buddy",
     page_icon="./robot.png",
     layout="wide",
 )
@@ -25,7 +25,7 @@ top_datasets= ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es', \
 
 tasks= ['text classification', 'question answering', 'automatic speech recognition', 'natural language inference', \
         'machine translation', 'sentiment analysis', 'text simplification', 'named entity recognition', \
-        'reading comprehension']
+        'reading comprehension', 'paraphrase identification', 'natural language understanding']
 metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
 
 with st.sidebar.expander("Datasets", expanded=True):
@@ -50,22 +50,26 @@ st.markdown("## Here is some information about your dataset:")
 st.markdown("### Description")
 
 st.markdown(dataset_builder.info.description)
+
+if len(dataset_builder.info.description) == 1:
+    st.markdown("This dataset does not have a description. :no_mouth:")
 st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/"+dataset_name+")")
 
 
 st.markdown("### Dataset-Specific Metrics")
 if dataset_name in metrics:
-    st.markdown("Great news! Your dataset has a dedicated metric for it! You can use it like this:")
+    st.markdown("Great news! Your dataset has a dedicated metric for it! You can use it like this: :point_down:")
     code = ''' from datasets import load_metric
     metric = load_metric('''+dataset_name+''', '''+dataset_config+''')'''
     st.code(code, language='python')
     dedicated_metric = True
 else:
-    st.markdown("Your dataset doesn't have a dedicated metric, but that's ok!")
+    st.markdown("Your dataset doesn't have a dedicated metric, but that's ok! :wink:")
    dedicated_metric = False
 
 st.markdown("### Task-Specific Metrics")
 
+task = None
 try:
     task = dataset_builder.info.task_templates[0].task
 except:
@@ -73,14 +77,20 @@ except:
         if t in str(dataset_builder.info.description).lower():
             task = t
         else:
-
+            continue
 
 if task is not None:
     st.markdown("The task associated to it your dataset is: " + task.replace('-',' '))
     if task == 'automatic-speech-recognition':
         st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
         st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
+        wer_code = '''from datasets import load_metric
+metric = load_metric("wer")'''
+        st.code(wer_code, language='python')
         st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
+        cer_code = '''from datasets import load_metric
+metric = load_metric("cer")'''
+        st.code(cer_code, language='python')
     else:
         st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")
 
@@ -129,10 +139,11 @@ try:
     st.code(accuracy_code, language='python')
     st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
 except:
-
-
-
-
-
-
-
+    if task != 'automatic-speech-recognition':
+        st.markdown("### Unsupervised Metrics")
+        st.markdown("Since dataset doesn't have any labels, so the metrics that you can use for evaluation are:")
+        st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
+        perplexity_code = '''from datasets import load_metric
+metric = load_metric("perplexity")'''
+        st.code(perplexity_code, language='python')
+        st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
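
For context, the snippet below is a minimal standalone sketch (not part of the commit) of the two pieces this patch touches: the task-detection fallback and loading a dataset-specific metric. It assumes the same era of the `datasets` library that app.py relies on, where `load_dataset_builder` and `load_metric` are still available, and uses "glue"/"cola" as placeholder dataset and config names.

from datasets import load_dataset_builder, load_metric

# Placeholder dataset and config, for illustration only.
dataset_name, dataset_config = "glue", "cola"

tasks = ['text classification', 'question answering', 'automatic speech recognition',
         'natural language inference', 'machine translation', 'sentiment analysis',
         'text simplification', 'named entity recognition', 'reading comprehension',
         'paraphrase identification', 'natural language understanding']

dataset_builder = load_dataset_builder(dataset_name, dataset_config)

# The patch initialises `task` before the try/except, so the fallback below
# cannot raise a NameError when no task template is defined.
task = None
try:
    task = dataset_builder.info.task_templates[0].task
except (TypeError, IndexError):  # app.py uses a bare `except:` here
    # Fallback: scan the dataset description for a known task name;
    # the `continue` added by this commit skips non-matching entries.
    for t in tasks:
        if t in str(dataset_builder.info.description).lower():
            task = t
        else:
            continue

# Dataset-specific metric, mirroring the snippet the app generates.
metric = load_metric(dataset_name, dataset_config)
print("detected task:", task)

In the app itself, the detected `task` value then decides which st.code snippets are rendered: WER/CER for speech recognition, or perplexity when the dataset has no labels.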