"""Find languages that have token-classification datasets on the Hugging Face
Hub but no token-classification models, and present the strongest candidate
languages (most datasets, fewest models) in a Gradio app."""

import gradio as gr
import pandas as pd
from huggingface_hub import ModelFilter, list_datasets, list_models
from toolz import concat, frequencies
from tqdm.auto import tqdm

# -- Grab datasets with the `token-classification` task ----------------------
datasets = list(list_datasets(filter="task_categories:token-classification"))

# Keep only datasets that declare at least one `language:<code>` tag.
with_language = [
    dataset
    for dataset in datasets
    if "language" in [t.split(":")[0] for t in dataset.tags]
]


def get_languages(dataset):
    """Return the language codes declared in *dataset*'s ``language:<code>`` tags.

    Malformed tags (a bare ``language`` with no ``:<code>`` part) are skipped.
    """
    tags = list(dataset.tags)
    languages = [t for t in tags if t.split(":")[0] == "language"]
    # Guard against malformed tags that lack the `:<code>` suffix.
    languages = [language for language in languages if len(language.split(":")) == 2]
    return [t.split(":")[1] for t in languages]


# language code -> number of token-classification datasets, most frequent first.
freqs = frequencies(concat(get_languages(dataset) for dataset in with_language))
freqs = dict(sorted(freqs.items(), key=lambda x: x[1], reverse=True))

# Collect stats for every language with datasets but *no* token-classification
# model. The extra any-task queries are only issued once we know the language
# qualifies, saving two Hub round-trips per already-covered language.
no_model = []
for lang in tqdm(freqs.keys()):
    token_classification_models = list(
        list_models(filter=ModelFilter(language=lang, task="token-classification"))
    )
    if token_classification_models:
        continue  # language already has a model for the task
    models_any_task = list(list_models(filter=ModelFilter(language=lang)))
    datasets_any_task = list(list_datasets(filter=f"language:{lang}"))
    no_model.append(
        {
            "language": lang,
            "datasets_for_token_classification": freqs[lang],
            "datasets": len(datasets_any_task),
            "token_classification_models": len(token_classification_models),
            "all_models": len(models_any_task),
        }
    )

df = pd.DataFrame(no_model)
# Most token-classification datasets first, then most datasets overall, then
# fewest existing models — i.e. highest-impact gaps sort to the top.
df = df.sort_values(
    by=[
        "datasets_for_token_classification",
        "datasets",
        "token_classification_models",
        "all_models",
    ],
    ascending=[False, False, True, True],
)


def report_summary():
    """Return a Markdown report for the top 30 candidate languages in ``df``."""
    summary = ""
    for row in df.head(30).itertuples(index=False):
        language = row.language
        summary += f"# Summary for language: {language}\n"
        summary += (
            f"This language has {row.datasets_for_token_classification} token "
            f"classification datasets, {row.datasets} datasets overall, "
            f"{row.token_classification_models} token classification models, "
            f"and {row.all_models} models overall.\n"
        )
        summary += (
            f"- [Datasets for token classification task for {language}]"
            f"(https://huggingface.co/datasets?task_categories=task_categories:"
            f"token-classification&language=language:{language})\n"
        )
        # NOTE(review): the models hub filters by `pipeline_tag`, not
        # `task_categories`; the previous datasets-style query string did not
        # actually filter the models listing.
        summary += (
            f"- [Token classification models for {language}]"
            f"(https://huggingface.co/models?pipeline_tag=token-classification"
            f"&language={language})\n"
        )
        summary += (
            f"- [All models for {language}]"
            f"(https://huggingface.co/models?language={language}&sort=trending)\n"
        )
        summary += "\n"
    return summary


with gr.Blocks() as demo:
    gr.DataFrame(df)
    gr.Markdown("# Top 30 candidates")
    gr.Markdown(
        "Candidates generated by sorting by most token classification datasets, "
        "then least token classification models, then least models overall"
    )
    gr.Markdown(report_summary())

demo.launch()