Upload loaders.py with huggingface_hub
loaders.py CHANGED (+31 -16)
@@ -22,19 +22,18 @@ LoadFromKaggle: loads datasets from the kaggle.com community site
 LoadFromIBMCloud: loads a dataset from the IBM cloud.
 ------------------------
 """
-import importlib
 import itertools
 import os
 import tempfile
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Dict, Mapping, Optional, Sequence, Union
+from typing import Dict, List, Mapping, Optional, Sequence, Union
 
 import pandas as pd
 from datasets import load_dataset as hf_load_dataset
 from tqdm import tqdm
 
-from .dataclass import InternalField
+from .dataclass import InternalField, OptionalField
 from .logging_utils import get_logger
 from .operator import SourceOperator
 from .settings_utils import get_settings
@@ -43,20 +42,13 @@ from .stream import MultiStream, Stream
 logger = get_logger()
 settings = get_settings()
 
-try:
-    import ibm_boto3
-
-    ibm_boto3_available = True
-except ImportError:
-    ibm_boto3_available = False
-
 
 class Loader(SourceOperator):
     # The loader_limit an optional parameter used to control the maximum number of instances to load from the the source.
     # It is usually provided to the loader via the recipe (see standard.py)
     # The loader can use this value to limit the amount of data downloaded from the source
     # to reduce loading time. However, this may not always be possible, so the
-    # loader may
+    # loader may ignore this. In any case, the recipe, will limit the number of instances in the returned
    # stream, after load is complete.
     loader_limit: int = None
     streaming: bool = False
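Note: read together, the restored comment says a loader should treat loader_limit as a best-effort download cap, while the recipe enforces the hard cap on the returned stream. A minimal sketch of honoring it where the source allows (the helper below is hypothetical, not part of this file):

import itertools

# Hypothetical helper: cap what is pulled from a source when loader_limit is set.
# A loader that cannot truncate up front may ignore the limit; the recipe still
# trims the resulting stream, so ignoring it is correct, just slower.
def capped(instances, loader_limit):
    if loader_limit is None:
        return instances
    return itertools.islice(instances, loader_limit)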
@@ -92,7 +84,24 @@ class LoadHF(Loader):
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
     streaming: bool = True
+    filtering_lambda: Optional[str] = None
     _cache: dict = InternalField(default=None)
+    requirements_list: List[str] = OptionalField(default_factory=list)
+
+    def verify(self):
+        for requirement in self.requirements_list:
+            if requirement not in self._requirements_list:
+                self._requirements_list.append(requirement)
+        super().verify()
+
+    def filtered_load(self, dataset):
+        logger.info(f"\nLoading filtered by: {self.filtering_lambda};")
+        return MultiStream(
+            {
+                name: dataset[name].filter(eval(self.filtering_lambda))
+                for name in dataset
+            }
+        )
 
     def stream_dataset(self):
         if self._cache is None:
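Note: the new filtering_lambda is stored as source text, eval()'d inside filtered_load, and the resulting callable is handed to each split's filter. A minimal usage sketch; the dataset path, name, and condition are illustrative, and path/name are LoadHF fields assumed from outside this hunk:

# Illustrative only: keep instances whose "label" field equals 0 in every split.
# The string must evaluate to a one-argument callable, since filtered_load
# passes eval(self.filtering_lambda) straight to Dataset.filter.
loader = LoadHF(
    path="glue",
    name="cola",
    filtering_lambda="lambda instance: instance['label'] == 0",
)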
@@ -114,6 +123,9 @@ class LoadHF(Loader):
                         f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                     ) from e
 
+            if self.filtering_lambda is not None:
+                dataset = self.filtered_load(dataset)
+
             if self.split is not None:
                 dataset = {self.split: dataset}
 
@@ -143,6 +155,10 @@ class LoadHF(Loader):
                     raise ValueError(
                         f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                     ) from e
+
+        if self.filtering_lambda is not None:
+            dataset = self.filtered_load(dataset)
+
         if self.split is None:
             for split in dataset.keys():
                 dataset[split] = dataset[split].to_iterable_dataset()
@@ -241,13 +257,10 @@ class MissingKaggleCredentialsError(ValueError):
 # TODO write how to obtain kaggle credentials
 class LoadFromKaggle(Loader):
     url: str
+    _requirements_list: List[str] = ["opendatasets"]
 
     def verify(self):
         super().verify()
-        if importlib.util.find_spec("opendatasets") is None:
-            raise ImportError(
-                "Please install opendatasets in order to use the LoadFromKaggle loader (using `pip install opendatasets`) "
-            )
         if not os.path.isfile("kaggle.json"):
             raise MissingKaggleCredentialsError(
                 "Please obtain kaggle credentials https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/ and save them to local ./kaggle.json file"
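Note: the inline importlib check is replaced by a declarative _requirements_list; combined with the requirements_list/verify addition to LoadHF above, missing packages are meant to be reported by shared verification machinery rather than by each loader. Where exactly that check runs is outside this diff; a sketch of the kind of check such a list enables (the helper name and message wording are assumptions):

import importlib.util

# Assumed-shape helper: verify every declared requirement is importable before
# the loader runs, mirroring what the removed inline opendatasets check did.
def check_requirements(requirements):
    for package in requirements:
        if importlib.util.find_spec(package) is None:
            raise ImportError(
                f"Please install {package} to use this loader (e.g. `pip install {package}`)."
            )

check_requirements(["opendatasets"])  # what LoadFromKaggle now declares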
@@ -283,6 +296,7 @@ class LoadFromIBMCloud(Loader):
     # 3. Mapping: split -> file_names, e.g. {"test" : ["test1.json", "test2.json"], "train": ["train.json"]}
     data_files: Union[Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     caching: bool = True
+    _requirements_list: List[str] = ["ibm_boto3"]
 
     def _download_from_cos(self, cos, bucket_name, item_name, local_file):
         logger.info(f"Downloading {item_name} from {bucket_name} COS")
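Note: the annotation allows data_files in three shapes; only the third appears as a comment in this hunk, the first two are inferred from the Union type. Illustrative values (file names are made up):

# Sequence[str]
as_sequence = ["part1.json", "part2.json"]
# Mapping[str, str]: split -> single file
as_split_to_file = {"test": "test.json"}
# Mapping[str, Sequence[str]]: split -> list of files, as in the comment above
as_split_to_files = {
    "test": ["test1.json", "test2.json"],
    "train": ["train.json"],
}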
@@ -337,7 +351,6 @@
 
     def verify(self):
         super().verify()
-        assert ibm_boto3_available, "Please install ibm_boto3 in order to use the LoadFromIBMCloud loader (using `pip install ibm-cos-sdk`) "
         assert (
             self.endpoint_url is not None
         ), f"Please set the {self.endpoint_url_env} environmental variable"
@@ -351,6 +364,8 @@
             raise NotImplementedError("LoadFromKaggle cannot load with streaming.")
 
     def process(self):
+        import ibm_boto3
+
         cos = ibm_boto3.resource(
             "s3",
             aws_access_key_id=self.aws_access_key_id,
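Note: with the module-level try/except and the ibm_boto3_available flag removed, the dependency is imported only inside process(), so importing loaders.py (or using other loaders) no longer requires ibm-cos-sdk. The same deferred-import pattern in isolation; the function name and argument set below are illustrative, not from this commit:

# Illustrative sketch of the deferred-import pattern used above: the optional
# dependency is resolved only when the code path that needs it actually runs.
def connect_to_cos(endpoint_url, access_key_id, secret_access_key):
    import ibm_boto3  # ImportError surfaces only if COS loading is actually used

    return ibm_boto3.resource(
        "s3",
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        endpoint_url=endpoint_url,
    )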