Spaces:
Runtime error
Runtime error
| from pathlib import Path | |
| from typing import Optional, Union | |
| import distilabel | |
| import distilabel.distiset | |
| from distilabel.utils.card.dataset_card import ( | |
| DistilabelDatasetCard, | |
| size_categories_parser, | |
| ) | |
| from huggingface_hub import DatasetCardData, HfApi | |
class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
    """`Distiset` subclass whose dataset card carries an additional tag.

    Overrides the card-generation hooks so that the generated dataset card is
    tagged with `distilabel-dataset-generator` alongside the `synthetic`,
    `distilabel` and `rlaif` tags.
    """

    def _generate_card(
        self,
        repo_id: str,
        token: str,
        include_script: bool = False,
        filename_py: Optional[str] = None,
    ) -> None:
        """Generates a dataset card and pushes it to the Hugging Face Hub, and
        if the `pipeline.yaml` path is available in the `Distiset`, uploads that
        to the same repository.

        Args:
            repo_id: The ID of the repository to push to, from the `push_to_hub` method.
            token: The token to authenticate with the Hugging Face Hub, from the
                `push_to_hub` method.
            include_script: Whether to upload the script to the hugging face repository.
            filename_py: The name of the script. If `include_script` is True, the script
                will be uploaded to the repository using this name, otherwise it won't
                be used.
        """
        card = self._get_card(
            repo_id=repo_id,
            token=token,
            include_script=include_script,
            filename_py=filename_py,
        )
        card.push_to_hub(
            repo_id,
            repo_type="dataset",
            token=token,
        )

        if self.pipeline_path:
            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
            HfApi().upload_file(
                path_or_fileobj=self.pipeline_path,
                path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )

    def _get_card(
        self,
        repo_id: str,
        token: Optional[str] = None,
        include_script: bool = False,
        filename_py: Optional[str] = None,
    ) -> DistilabelDatasetCard:
        """Generates the dataset card for the `Distiset`.

        Note:
            If `repo_id` and `token` are provided, it will extract the metadata from
            the README.md file on the hub.

        Args:
            repo_id: Name of the repository to push to, or the path for the distiset if
                saved to disk.
            token: The token to authenticate with the Hugging Face Hub.
                We assume that if it's provided, the dataset will be in the Hugging
                Face Hub, so the README metadata will be extracted from there.
            include_script: Whether to upload the script to the hugging face repository.
            filename_py: The name of the script. If `include_script` is True, the script
                will be uploaded to the repository using this name, otherwise it won't
                be used.

        Returns:
            The dataset card for the `Distiset`.
        """
        # One example row per subset; a dict-like subset holds named splits, in
        # which case the sample is taken from its "train" split.
        sample_records = {
            name: dataset["train"][0] if isinstance(dataset, dict) else dataset[0]
            for name, dataset in self.items()
        }

        readme_metadata = {}
        if repo_id and token:
            readme_metadata = self._extract_readme_metadata(repo_id, token)

        # `len()` of a dict of splits is the number of splits, not the number of
        # rows, so rows are summed across splits for those subsets.
        row_counts = (
            sum(len(split) for split in dataset.values())
            if isinstance(dataset, dict)
            else len(dataset)
            for dataset in self.values()
        )
        # `default=0` keeps an empty `Distiset` from raising on `max()`.
        largest_subset = max(row_counts, default=0)

        # Keys set here take precedence over the ones read from the README.
        metadata = {
            **readme_metadata,
            "size_categories": size_categories_parser(largest_subset),
            "tags": [
                "synthetic",
                "distilabel",
                "rlaif",
                "distilabel-dataset-generator",
            ],
        }

        return DistilabelDatasetCard.from_template(
            card_data=DatasetCardData(**metadata),
            repo_id=repo_id,
            sample_records=sample_records,
            include_script=include_script,
            filename_py=filename_py,
            references=self.citations,
        )
# Monkey-patch: rebind `Distiset` on the `distilabel.distiset` module so that any
# later lookup of `distilabel.distiset.Distiset` resolves to the subclass above.
# NOTE: names already bound elsewhere via `from distilabel.distiset import Distiset`
# before this line runs keep pointing at the original class.
distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag