# Hugging Face Space: surface languages that have token-classification
# datasets on the Hub but few or no token-classification models.
from collections import Counter
from itertools import chain

import gradio as gr
import pandas as pd
from huggingface_hub import ModelFilter
from huggingface_hub import list_datasets
from huggingface_hub import list_models
from toolz import concat
from toolz import frequencies
from tqdm.auto import tqdm
"""## Grab datasets with `token-classification` task""" | |
datasets = list(list_datasets(filter="task_categories:token-classification")) | |
datasets[0] | |
with_language = [ | |
dataset | |
for dataset in datasets | |
if "language" in [t.split(":")[0] for t in dataset.tags] | |
] | |
with_language[0] | |
def get_languages(dataset):
    """Extract language codes from a Hub dataset's tag list.

    Only well-formed ``language:<code>`` tags (exactly one colon)
    contribute; a bare ``language`` tag or one with extra colons is
    ignored.  Order of appearance is preserved.
    """
    codes = []
    for tag in dataset.tags:
        parts = tag.split(":")
        if len(parts) == 2 and parts[0] == "language":
            codes.append(parts[1])
    return codes
# Count token-classification datasets per language code, most common first.
# stdlib Counter replaces the toolz frequencies + manual descending sort:
# Counter.most_common() sorts by count descending with a stable sort, so tie
# order (first-seen) matches the previous implementation exactly.
freqs = dict(
    Counter(
        chain.from_iterable(get_languages(dataset) for dataset in with_language)
    ).most_common()
)
# For each candidate language (most datasets first), check how many
# token-classification models already exist on the Hub.  Languages with
# *no* such model are collected as candidates for new contributions.
no_model = []
for lang in tqdm(freqs.keys()):
    # The deprecated ModelFilter helper (removed in newer huggingface_hub
    # releases) is replaced by the equivalent list_models keyword arguments.
    token_classification_models = list(
        list_models(language=lang, task="token-classification")
    )
    if not token_classification_models:
        # Only fetch the broader counts for actual candidates — this saves
        # two Hub calls per language that already has a model.
        all_models_for_lang = list(list_models(language=lang))
        all_datasets_for_lang = list(list_datasets(filter=f"language:{lang}"))
        no_model.append(
            {
                "language": lang,
                "datasets_for_token_classification": freqs[lang],
                "datasets": len(all_datasets_for_lang),
                # Always 0 here by construction, kept for the report schema.
                "token_classification_models": len(token_classification_models),
                "all_models": len(all_models_for_lang),
            }
        )
# Rank candidates: most token-classification datasets first, then most
# datasets overall, then fewest token-classification models, then fewest
# models overall.
candidate_ranking = [
    "datasets_for_token_classification",
    "datasets",
    "token_classification_models",
    "all_models",
]
df = pd.DataFrame(no_model).sort_values(
    by=candidate_ranking,
    ascending=[False, False, True, True],
)
def report_summary(frame=None):
    """Build a markdown report for the top 30 candidate languages.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Candidate table with columns ``language``,
        ``datasets_for_token_classification``, ``datasets``,
        ``token_classification_models`` and ``all_models``.
        Defaults to the module-level ``df``.

    Returns
    -------
    str
        Concatenated markdown sections, one per language.
    """
    data = df if frame is None else frame
    # Collect parts and join once instead of quadratic string +=.
    parts = []
    for row in data.head(30).itertuples():
        lang = row.language
        parts.append(f"# Summary for language: {lang}\n")
        parts.append(
            f"This language has {row.datasets_for_token_classification} token "
            f"classification datasets, {row.datasets} datasets overall, "
            f"{row.token_classification_models} token classification models, "
            f"and {row.all_models} models overall.\n"
        )
        parts.append(
            f"- [Datasets for token classification task for {lang}]"
            f"(https://huggingface.co/datasets?task_categories=task_categories:token-classification&language=language:{lang})\n"
        )
        # Fixed link: model listings filter via ``pipeline_tag``; the previous
        # ``task_categories`` query parameter is a dataset-only filter and
        # produced an unfiltered model listing.
        parts.append(
            f"- [Token classification models for {lang}]"
            f"(https://huggingface.co/models?pipeline_tag=token-classification&language={lang})\n"
        )
        parts.append(
            f"- [All models for {lang}](https://huggingface.co/models?language={lang}&sort=trending)\n"
        )
        parts.append("<br>\n")
    return "".join(parts)
# Gradio UI: the full candidate table, then a readable report for the top 30.
with gr.Blocks() as demo:
    gr.DataFrame(df)
    gr.Markdown("# Top 30 candidates")
    # Typo fixed in the user-facing text: "Candiates" -> "Candidates".
    gr.Markdown(
        "Candidates generated by sorting by most token classification "
        "datasets, then least token classification models, then least "
        "models overall"
    )
    gr.Markdown(report_summary())
demo.launch()