{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": { "id": "D1Sg0CCPc5-3" }, "outputs": [], "source": [ "from huggingface_hub import list_datasets\n", "from toolz import concat\n", "from toolz import groupby\n", "from toolz import valmap\n", "from toolz import frequencies" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grab datasets with `token-classification` task" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "sJzsdvsEdDvQ" }, "outputs": [], "source": [ "datasets = list(list_datasets(filter=\"task_categories:token-classification\"))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IUdw59ftdLc5", "outputId": "21714002-6571-459b-e266-4c4838865d1f" }, "outputs": [ { "data": { "text/plain": [ "DatasetInfo(id='acronym_identification', author=None, sha='c3c245a18bbd57b1682b099e14460eebf154cbdf', last_modified=datetime.datetime(2023, 1, 25, 14, 18, 28, tzinfo=datetime.timezone.utc), private=False, gated=False, disabled=False, downloads=1823, likes=17, paperswithcode_id='acronym-identification', tags=['task_categories:token-classification', 'annotations_creators:expert-generated', 'language_creators:found', 'multilinguality:monolingual', 'size_categories:10K