import gradio as gr
import pandas as pd
from huggingface_hub import ModelFilter, list_datasets, list_models
from toolz import concat, frequencies
from tqdm.auto import tqdm

"""## Grab datasets with `token-classification` task"""

# Fetch every dataset on the Hub tagged for the token-classification task.
datasets = list(list_datasets(filter="task_categories:token-classification"))

# Peek at the first result to see what metadata is available.
datasets[0]
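# Each result is a `DatasetInfo` whose tags look roughly like
# ["task_categories:token-classification", "language:en", ...] (illustrative values).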

# Keep only datasets that carry at least one "language:<code>" tag.
with_language = [
    dataset
    for dataset in datasets
    if "language" in [t.split(":")[0] for t in dataset.tags]
]

# Peek at the first language-tagged dataset.
with_language[0]


def get_languages(dataset):
    """Return the language codes from a dataset's "language:<code>" tags."""
    tags = list(dataset.tags)
    languages = [t for t in tags if t.split(":")[0] == "language"]
    # Drop malformed tags that don't split cleanly into "language:<code>".
    languages = [language for language in languages if len(language.split(":")) == 2]
    return [t.split(":")[1] for t in languages]
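
# For example, tags like ["language:en", "language:fr", "license:mit"]
# would yield ["en", "fr"] (illustrative tags, not from a specific dataset).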


# Count token-classification datasets per language, sorted most-common first.
freqs = frequencies(concat(get_languages(dataset) for dataset in with_language))
freqs = dict(sorted(freqs.items(), key=lambda x: x[1], reverse=True))
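# freqs now maps language code -> dataset count, e.g. {"en": 342, "zh": 57, ...}
# (illustrative numbers; real counts change as the Hub grows).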

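"""## Find languages with datasets but no token classification models"""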

# Collect languages that have token-classification datasets on the Hub but no
# models tagged for both the language and the token-classification task.
no_model = []
for lang in tqdm(freqs.keys()):
    # Models tagged with this language *and* the token-classification task.
    models_for_lang_with_task_token_classification = list(
        list_models(filter=ModelFilter(language=lang, task="token-classification"))
    )
    # Models tagged with this language, regardless of task.
    models_for_lang_any_task = list(list_models(filter=ModelFilter(language=lang)))
    if not models_for_lang_with_task_token_classification:
        data = {
            "language": lang,
            "datasets": freqs[lang],
            "token_classification_models": len(
                models_for_lang_with_task_token_classification
            ),
            "all_models": len(models_for_lang_any_task),
        }
        no_model.append(data)

# Number of candidate languages found.
len(no_model)


df = pd.DataFrame(no_model)

# Rank candidates: most datasets first, then fewest token classification
# models, then fewest models overall.
df = df.sort_values(
    by=["datasets", "token_classification_models", "all_models"],
    ascending=[False, True, True],
)

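"""## Build the report and Gradio app"""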

def report_summary():
    """Build a markdown summary linking to Hub search pages for the top 20 candidates."""
    summary = ""
    for row in df.head(20).itertuples():
        language = row.language
        summary += f"# Summary for language: {language}\n"
        summary += f"- [Datasets for the token classification task in {language}](https://huggingface.co/datasets?task_categories=task_categories:token-classification&language=language:{language})\n"
        summary += f"- [Token classification models for {language}](https://huggingface.co/models?pipeline_tag=token-classification&language={language})\n"
        summary += f"- [All models for {language}](https://huggingface.co/models?language={language}&sort=trending)\n"
        summary += "<br>\n"
    return summary


# Gradio app: the full candidate table, then the markdown report.
with gr.Blocks() as demo:
    gr.DataFrame(df)
    gr.Markdown("# Top 20 candidates")
    gr.Markdown(
        "Candidates generated by sorting by most token classification datasets, "
        "then fewest token classification models, then fewest models overall."
    )
    gr.Markdown(report_summary())


demo.launch()