# davanstrien's picture
# davanstrien HF Staff
# Update link to all models for each language
# c175fdc
# raw
# history blame
# 2.79 kB
from collections import Counter
from itertools import chain

import gradio as gr
import pandas as pd
from huggingface_hub import ModelFilter
from huggingface_hub import list_datasets
from huggingface_hub import list_models
from toolz import concat
from toolz import frequencies
from tqdm.auto import tqdm
"""## Grab datasets with `token-classification` task"""
datasets = list(list_datasets(filter="task_categories:token-classification"))
datasets[0]
with_language = [
dataset
for dataset in datasets
if "language" in [t.split(":")[0] for t in dataset.tags]
]
with_language[0]
def get_languages(dataset):
    """Return the language codes found in a dataset's `language:<code>` tags.

    Tags that are not exactly two colon-separated parts (e.g. extra colons
    or a bare `language` tag) are ignored.
    """
    codes = []
    for tag in dataset.tags:
        parts = tag.split(":")
        if len(parts) == 2 and parts[0] == "language":
            codes.append(parts[1])
    return codes
# Count how many token-classification datasets exist per language code,
# using the stdlib (collections.Counter + itertools.chain) instead of toolz.
freqs = Counter(chain.from_iterable(get_languages(dataset) for dataset in with_language))
# Re-materialise as a plain dict ordered by count, most common first.
freqs = dict(sorted(freqs.items(), key=lambda x: x[1], reverse=True))
# Collect languages that have token-classification datasets on the Hub but
# no token-classification models.
no_model = []
for lang in tqdm(freqs.keys()):
    models_for_lang_with_task_token_classification = list(
        list_models(filter=ModelFilter(language=lang, task="token-classification"))
    )
    if not models_for_lang_with_task_token_classification:
        # Only fetch the unfiltered model list when it is actually needed,
        # saving one Hub API call for every language that already has
        # token-classification models.
        models_for_lang_any_task = list(list_models(filter=ModelFilter(language=lang)))
        data = {
            "language": lang,
            "datasets": freqs[lang],
            # Always 0 here by construction (we only reach this branch when
            # the filtered list is empty); kept as len() for clarity.
            "token_classification_models": len(
                models_for_lang_with_task_token_classification
            ),
            "all_models": len(models_for_lang_any_task),
        }
        no_model.append(data)
# Rank candidate languages: most datasets first, then fewest task-specific
# models, then fewest models overall. (The stray `len(no_model)` expression
# was no-op notebook display residue; removed.)
df = pd.DataFrame(no_model)
df = df.sort_values(
    by=["datasets", "token_classification_models", "all_models"],
    ascending=[False, True, True],
)
def report_summary(frame=None):
    """Build a Markdown report for the top 20 candidate languages.

    For each language, link to its token-classification datasets, its
    token-classification models, and all of its models on the Hub.

    Args:
        frame: optional DataFrame to report on; defaults to the
            module-level ``df`` (backward compatible with the original
            zero-argument call).

    Returns:
        The report as a single Markdown string.
    """
    if frame is None:
        frame = df
    summary = ""
    for row in frame.head(20).itertuples():
        # Named-tuple attribute access is clearer and safer than the
        # positional row[1] used previously.
        language = row.language
        summary += f"# Summary for language: {language}\n"
        summary += f"- [Datasets for token classification task for {language}](https://huggingface.co/datasets?task_categories=task_categories:token-classification&language=language:{language})\n"
        # Fixed link: the models page filters with `pipeline_tag` and
        # `language` query parameters, not the datasets-style
        # `task_categories=...&language=language:` ones used before.
        summary += f"- Token classification models for [{language}](https://huggingface.co/models?pipeline_tag=token-classification&language={language})\n"
        summary += f"- All models for [{language}](https://huggingface.co/models?language={language}&sort=trending)\n"
        summary += "<br>\n"
    return summary
# Gradio UI: show the full candidate table, then a Markdown report of the
# top 20 languages most in need of token-classification models.
with gr.Blocks() as demo:
    # Full sortable table of every candidate language.
    gr.DataFrame(df)
    gr.Markdown("# Top 20 candidates")
    gr.Markdown(
        "Candiates generated by sorting by most token classification datasets, then least token classification models, then least models overall"
    )
    # Rendered Markdown report built once at startup.
    gr.Markdown(report_summary())
demo.launch()