Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Dec 3, 2023

Commit

04d2454

1 Parent(s): eee0bf8

Upload splitters.py with huggingface_hub

Browse files

Files changed (1) hide show

splitters.py +72 -87

splitters.py CHANGED Viewed

@@ -1,19 +1,10 @@
 import itertools
 from abc import abstractmethod
-from dataclasses import field
-from typing import Dict, List, Optional
 from .artifact import Artifact
-from .generator_utils import ReusableGenerator
-from .operator import InstanceOperatorWithGlobalAccess, MultiStreamOperator
-from .stream import MultiStream
-class Splitter(MultiStreamOperator):
-    pass
-from .random_utils import random
 from .split_utils import (
     parse_random_mix_string,
     parse_slices_string,
@@ -21,6 +12,11 @@ from .split_utils import (
     rename_split,
     slice_streams,
 )
 class RenameSplits(Splitter):
@@ -41,8 +37,8 @@ class SplitRandomMix(Splitter):
 class SeparateSplit(Splitter):
-    """
-    Separates a split (e.g. train) into several splits (e.g. train1, train2)
     sizes must indicate the size of every split except the last. If no size is give for the last split,
      it includes all the examples not allocated to any split.
     """
@@ -59,9 +55,15 @@ class SeparateSplit(Splitter):
         return super().verify()
     def process(self, multi_stream: MultiStream) -> MultiStream:
-        mapping = {key: {key: [(None, None)]} for key in multi_stream.keys() if key != self.from_split}
         so_far = 0
-        for name, size in itertools.zip_longest(self.to_split_names, self.to_split_sizes):
             mapping[name] = {self.from_split: [(so_far, size)]}
             if size:
                 so_far += size
@@ -87,19 +89,25 @@ class Sampler(Artifact):
     def set_size(self, size):
         if isinstance(size, str):
-            assert size.isdigit(), f"sample_size must be a natural number, got {self.sample_size}"
             size = int(size)
         self.sample_size = size
     @abstractmethod
-    def sample(self, instances_pool: List[Dict[str, object]]) -> List[Dict[str, object]]:
         pass
 class RandomSampler(Sampler):
-    def sample(self, instances_pool: List[Dict[str, object]]) -> List[Dict[str, object]]:
         instances_pool = list(instances_pool)
-        return random.sample(instances_pool, self.sample_size)
 class DiverseLabelsSampler(Sampler):
@@ -110,14 +118,29 @@ class DiverseLabelsSampler(Sampler):
         self.labels = None
     def examplar_repr(self, examplar):
-        assert (
-            "inputs" in examplar and self.choices in examplar["inputs"]
-        ), f"DiverseLabelsSampler assumes each examplar has {self.choices} field in it input"
         examplar_outputs = next(iter(examplar["outputs"].values()))
-        return str([choice for choice in examplar["inputs"][self.choices] if choice in examplar_outputs])
     def divide_by_repr(self, examplars_pool):
-        labels = dict()
         for examplar in examplars_pool:
             label_repr = self.examplar_repr(examplar)
             if label_repr not in labels:
@@ -125,11 +148,13 @@ class DiverseLabelsSampler(Sampler):
             labels[label_repr].append(examplar)
         return labels
-    def sample(self, instances_pool: List[Dict[str, object]]) -> List[Dict[str, object]]:
         if self.labels is None:
             self.labels = self.divide_by_repr(instances_pool)
         all_labels = list(self.labels.keys())
-        random.shuffle(all_labels)
         from collections import Counter
         total_allocated = 0
@@ -146,22 +171,21 @@ class DiverseLabelsSampler(Sampler):
         result = []
         for label, allocation in allocations.items():
-            sample = random.sample(self.labels[label], allocation)
             result.extend(sample)
-        random.shuffle(result)
         return result
-class SpreadSplit(InstanceOperatorWithGlobalAccess):
     source_stream: str = None
     target_field: str = None
     sampler: Sampler = None
     def prepare(self):
-        self.accessible_streams = [self.source_stream]
-        self.cache_accessible_streams = True
         self.local_cache = None
     def verify(self):
         assert self.source_stream is not None, "Source stream must be specified"
@@ -169,58 +193,19 @@ class SpreadSplit(InstanceOperatorWithGlobalAccess):
         assert self.sampler is not None, "Sampler must be specified"
         return super().verify()
-    def process(self, instance: Dict[str, object], multi_stream: MultiStream) -> Dict[str, object]:
-        if self.local_cache is None:
-            self.local_cache = list(multi_stream[self.source_stream])
-        source_stream = self.local_cache
-        sampled_instances = self.sampler.sample(source_stream)
-        instance[self.target_field] = sampled_instances
-        return instance
-if __name__ == "__main__":
-    # some tests
-    import random
-    random.seed(0)
-    splitter = SplitRandomMix(
-        mix={
-            "train": "train[90%]+validation[50%]",
-            "validation": "train[10%]+validation[50%]",
-            "test": "test",
-        }
-    )
-    def generator(name, size):
-        for i in range(size):
-            yield {"text": f"{name}_{i}"}
-    stream = MultiStream.from_generators(
-        {
-            "train": ReusableGenerator(generator, gen_kwargs={"name": "train", "size": 10}),
-            "validation": ReusableGenerator(generator, gen_kwargs={"name": "validation", "size": 10}),
-            "test": ReusableGenerator(generator, gen_kwargs={"name": "test", "size": 10}),
-        }
-    )
-    ds = splitter(stream)
-    for key, value in ds.items():
-        print(key)
-        for item in value:
-            print(item)
-    splitter = SliceSplit(
-        slices={
-            "train": "train[:2]+train[2:4]",
-            "validation": "train[4:6]",
-            "test": "train[6:]+test",
-        }
-    )
-    ds = splitter(stream)
-    for key, value in ds.items():
-        print(key)
-        for item in value:
-            print(item)

 import itertools
 from abc import abstractmethod
+from typing import Dict, List
 from .artifact import Artifact
+from .operator import InstanceOperatorWithMultiStreamAccess, MultiStreamOperator
+from .random_utils import get_random
 from .split_utils import (
     parse_random_mix_string,
     parse_slices_string,
     rename_split,
     slice_streams,
 )
+from .stream import MultiStream
+class Splitter(MultiStreamOperator):
+    pass
 class RenameSplits(Splitter):
 class SeparateSplit(Splitter):
+    """Separates a split (e.g. train) into several splits (e.g. train1, train2).
     sizes must indicate the size of every split except the last. If no size is give for the last split,
      it includes all the examples not allocated to any split.
     """
         return super().verify()
     def process(self, multi_stream: MultiStream) -> MultiStream:
+        mapping = {
+            key: {key: [(None, None)]}
+            for key in multi_stream.keys()
+            if key != self.from_split
+        }
         so_far = 0
+        for name, size in itertools.zip_longest(
+            self.to_split_names, self.to_split_sizes
+        ):
             mapping[name] = {self.from_split: [(so_far, size)]}
             if size:
                 so_far += size
     def set_size(self, size):
         if isinstance(size, str):
+            assert (
+                size.isdigit()
+            ), f"sample_size must be a natural number, got {self.sample_size}"
             size = int(size)
         self.sample_size = size
     @abstractmethod
+    def sample(
+        self, instances_pool: List[Dict[str, object]]
+    ) -> List[Dict[str, object]]:
         pass
 class RandomSampler(Sampler):
+    def sample(
+        self, instances_pool: List[Dict[str, object]]
+    ) -> List[Dict[str, object]]:
         instances_pool = list(instances_pool)
+        return get_random().sample(instances_pool, self.sample_size)
 class DiverseLabelsSampler(Sampler):
         self.labels = None
     def examplar_repr(self, examplar):
+        if "inputs" not in examplar:
+            raise ValueError(f"'inputs' field is missing from '{examplar}'.")
+        inputs = examplar["inputs"]
+        if self.choices not in inputs:
+            raise ValueError(f"{self.choices} field is missing from '{inputs}'.")
+        choices = inputs[self.choices]
+        if not isinstance(choices, list):
+            raise ValueError(
+                f"Unexpected input choices value '{choices}'. Expected a list."
+            )
+        if "outputs" not in examplar:
+            raise ValueError(f"'outputs' field is missing from '{examplar}'.")
         examplar_outputs = next(iter(examplar["outputs"].values()))
+        if not isinstance(examplar_outputs, list):
+            raise ValueError(
+                f"Unexpected examplar_outputs value '{examplar_outputs}'. Expected a list."
+            )
+        return str([choice for choice in choices if choice in examplar_outputs])
     def divide_by_repr(self, examplars_pool):
+        labels = {}
         for examplar in examplars_pool:
             label_repr = self.examplar_repr(examplar)
             if label_repr not in labels:
             labels[label_repr].append(examplar)
         return labels
+    def sample(
+        self, instances_pool: List[Dict[str, object]]
+    ) -> List[Dict[str, object]]:
         if self.labels is None:
             self.labels = self.divide_by_repr(instances_pool)
         all_labels = list(self.labels.keys())
+        get_random().shuffle(all_labels)
         from collections import Counter
         total_allocated = 0
         result = []
         for label, allocation in allocations.items():
+            sample = get_random().sample(self.labels[label], allocation)
             result.extend(sample)
+        get_random().shuffle(result)
         return result
+class SpreadSplit(InstanceOperatorWithMultiStreamAccess):
     source_stream: str = None
     target_field: str = None
     sampler: Sampler = None
     def prepare(self):
         self.local_cache = None
+        self.sampler.prepare()
     def verify(self):
         assert self.source_stream is not None, "Source stream must be specified"
         assert self.sampler is not None, "Sampler must be specified"
         return super().verify()
+    def process(
+        self, instance: Dict[str, object], multi_stream: MultiStream
+    ) -> Dict[str, object]:
+        try:
+            if self.local_cache is None:
+                self.local_cache = list(multi_stream[self.source_stream])
+            source_stream = self.local_cache
+            sampled_instances = self.sampler.sample(source_stream)
+            instance[self.target_field] = sampled_instances
+            return instance
+        except Exception as e:
+            raise Exception(
+                f"Unable to fetch instances from '{self.source_stream}' to '{self.target_field}'"
+            ) from e