davanstrien HF Staff committed on
Commit
bb6bec5
·
1 Parent(s): bbfd0e9

Add token classification dataset and model summary

Browse files
Files changed (1) hide show
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import list_datasets
2
+ from toolz import concat
3
+ from toolz import frequencies
4
+ import gradio as gr
5
+ import pandas as pd
6
+
7
+ from huggingface_hub import list_models
8
+
9
+ from huggingface_hub import ModelFilter
10
+ from tqdm.auto import tqdm
11
+
12
+ """## Grab datasets with `token-classification` task"""
13
+
14
# Every Hub dataset whose metadata carries the `token-classification`
# task category.
datasets = list(list_datasets(filter="task_categories:token-classification"))

# Keep only the datasets that declare at least one `language:<code>` tag.
# A missing `tags` value is defensively treated as "no tags" so a single
# sparsely-described dataset cannot abort the whole scan.
with_language = [
    dataset
    for dataset in datasets
    if any(tag.split(":")[0] == "language" for tag in (dataset.tags or ()))
]
25
+
26
+
27
def get_languages(dataset):
    """Return the language codes declared in a dataset's tags.

    Only tags of the exact form ``language:<code>`` are considered: the part
    before the colon must be ``language`` and the tag must split into exactly
    two colon-separated parts. Anything else (other tag families, a bare
    ``language`` tag, or tags with extra colons) is ignored.

    Args:
        dataset: Any object exposing a ``tags`` attribute holding an iterable
            of tag strings. A ``tags`` value of ``None`` is treated as empty.

    Returns:
        A list of language codes in the order their tags appear.
    """
    languages = []
    for tag in dataset.tags or ():
        parts = tag.split(":")
        if len(parts) == 2 and parts[0] == "language":
            languages.append(parts[1])
    return languages
32
+
33
+
34
# Tally how many language-tagged datasets mention each language code.
freqs = frequencies(concat(map(get_languages, with_language)))
# Rebuild the mapping so iteration goes from the most- to the least-covered
# language; the sort is stable, so ties keep their first-seen order.
freqs = dict(sorted(freqs.items(), key=lambda pair: pair[1], reverse=True))
36
+
37
+
38
# Collect one record per language that has token-classification datasets
# but no token-classification model on the Hub.
no_model = []
for lang, dataset_count in tqdm(freqs.items()):
    models_for_lang_with_task_token_classification = list(
        list_models(filter=ModelFilter(language=lang, task="token-classification"))
    )
    if models_for_lang_with_task_token_classification:
        # The language is already served for this task; nothing to report.
        continue

    # Only fetch the all-task listing for languages that are actually
    # reported, instead of issuing that request for every language.
    models_for_lang_any_task = list(list_models(filter=ModelFilter(language=lang)))
    no_model.append(
        {
            "language": lang,
            "datasets": dataset_count,
            "token_classification_models": len(
                models_for_lang_with_task_token_classification
            ),
            "all_models": len(models_for_lang_any_task),
        }
    )
56
+
57
+
58
# Pin the column set explicitly: a frame built from an empty `no_model` list
# would otherwise have no columns and the sort below would raise KeyError.
df = pd.DataFrame(
    no_model,
    columns=["language", "datasets", "token_classification_models", "all_models"],
)

# Best candidates first: most datasets, then fewest token-classification
# models, then fewest models overall.
df = df.sort_values(
    by=["datasets", "token_classification_models", "all_models"],
    ascending=[False, True, True],
)
64
+
65
+
66
def report_summary(frame=None, limit=20):
    """Build a Markdown summary with Hub links for the top candidate languages.

    For each of the first ``limit`` rows, the summary contains a heading, a
    link to the token-classification datasets for that language, a link to
    the token-classification models for that language, and a ``<br>``
    separator.

    Args:
        frame: DataFrame with a ``language`` column. Defaults to the
            module-level ``df`` when omitted.
        limit: Maximum number of rows (taken from the top of ``frame``) to
            include. Defaults to 20.

    Returns:
        The Markdown text as a single string; an empty string when ``frame``
        has no ``language`` column (e.g. a completely empty frame).
    """
    if frame is None:
        frame = df
    if "language" not in frame.columns:
        return ""

    # NOTE(review): the models link reuses the datasets-style
    # `task_categories=` query string; confirm the Hub models page honours it.
    sections = [
        f"# Summary for language: {language}\n"
        f"## Datasets for token classification task for {language} \n"
        f"https://huggingface.co/datasets?task_categories=task_categories:token-classification&language=language:{language} \n"
        f"## Token classification models for {language}\n"
        f"https://huggingface.co/models?task_categories=task_categories:token-classification&language=language:{language} \n"
        "<br>\n"
        for language in frame["language"].head(limit)
    ]
    return "".join(sections)
77
+
78
+
79
# Gradio UI: the full candidate table followed by a Markdown write-up of the
# top 20 languages.
with gr.Blocks() as demo:
    gr.DataFrame(df)
    gr.Markdown("# Top 20 candidates")
    gr.Markdown(
        "Candidates generated by sorting by most token classification datasets, then least token classification models, then least models overall"
    )
    gr.Markdown(report_summary())


demo.launch()