import random
from typing import Iterable, List, Mapping, Optional, Sequence, Tuple, Union

import dspy
from dspy.datasets import Dataset
from datasets import load_dataset


def _to_examples(
    rows: Iterable,
    fields: Optional[Sequence[str]],
    input_keys: Union[str, Sequence[str]],
) -> List[dspy.Example]:
    """Build one ``dspy.Example`` per row and mark ``input_keys`` as its inputs.

    Args:
        rows: Iterable of mapping-like rows (indexable by field name).
        fields: Field names to copy from each row. When empty or ``None``,
            every key of the row is copied.
        input_keys: Field names to register as inputs on each example.

    Returns:
        A list with one ``dspy.Example`` per row, in iteration order.
    """
    # A bare string is a single key; unpacking it below would otherwise
    # register each character as a separate key.
    if isinstance(input_keys, str):
        input_keys = (input_keys,)

    # ``with_inputs`` is variadic, so the keys must be unpacked. Passing the
    # tuple itself would register the tuple as the one and only "key".
    if fields:
        return [
            dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys)
            for row in rows
        ]
    return [dspy.Example(dict(row)).with_inputs(*input_keys) for row in rows]


class DataLoader(Dataset):
    """Helpers that turn external datasets into lists of ``dspy.Example``."""

    def __init__(self,):
        """Create a loader; no state is initialised."""
        # NOTE(review): the parent ``Dataset.__init__`` is intentionally not
        # called here (kept as in the original) -- confirm this is desired.
        pass

    def from_huggingface(
        self,
        dataset_name: str,
        *args,
        input_keys: Tuple[str, ...] = (),
        fields: Optional[Tuple[str, ...]] = None,
        **kwargs
    ) -> Union[Mapping[str, List[dspy.Example]], List[dspy.Example]]:
        """Load a dataset through ``datasets.load_dataset`` and convert its rows.

        Args:
            dataset_name: Forwarded to ``load_dataset`` as its first argument.
            *args: Extra positional arguments forwarded to ``load_dataset``.
            input_keys: Tuple of field names to mark as inputs on each example.
            fields: Tuple of field names to keep; all row keys when omitted.
            **kwargs: Extra keyword arguments forwarded to ``load_dataset``.

        Returns:
            A dict mapping split name to a list of examples when the loaded
            object exposes ``keys()`` (or when a list of splits was requested);
            otherwise a flat list of examples.

        Raises:
            ValueError: If ``fields`` or ``input_keys`` is not a tuple, or if a
                list of splits is returned but the split names were not passed
                via the ``split`` keyword.
        """
        if fields and not isinstance(fields, tuple):
            raise ValueError("Invalid fields provided. Please provide a tuple of fields.")

        if not isinstance(input_keys, tuple):
            raise ValueError("Invalid input keys provided. Please provide a tuple of input keys.")

        dataset = load_dataset(dataset_name, *args, **kwargs)

        if isinstance(dataset, list):
            # A list result is paired with the requested split names so the
            # caller gets a name -> examples mapping.
            split_names = kwargs.get("split")
            if not isinstance(split_names, list):
                raise ValueError(
                    "load_dataset returned a list of splits; pass the split names "
                    "as a list via the `split` keyword argument so they can be labelled."
                )
            dataset = {split_name: dataset[idx] for idx, split_name in enumerate(split_names)}

        # Decide the shape up front instead of catching AttributeError around
        # the whole conversion, which would hide unrelated errors raised while
        # building the examples.
        if hasattr(dataset, "keys"):
            return {
                split_name: _to_examples(dataset[split_name], fields, input_keys)
                for split_name in dataset.keys()
            }
        return _to_examples(dataset, fields, input_keys)

    def from_csv(
        self,
        file_path: str,
        fields: Optional[List[str]] = None,
        input_keys: Tuple[str, ...] = (),
    ) -> List[dspy.Example]:
        """Load a CSV file and convert each row into a ``dspy.Example``.

        Args:
            file_path: Passed to ``load_dataset("csv", data_files=...)``.
            fields: Column names to keep; defaults to every dataset feature.
            input_keys: Field names to mark as inputs on each example.

        Returns:
            A list of examples built from the ``"train"`` split of the load.
        """
        dataset = load_dataset("csv", data_files=file_path)["train"]

        if not fields:
            fields = list(dataset.features)

        return _to_examples(dataset, fields, input_keys)

    def sample(
        self,
        dataset: List[dspy.Example],
        n: int,
        *args,
        **kwargs
    ) -> List[dspy.Example]:
        """Return ``n`` items drawn from ``dataset`` via ``random.sample``.

        Args:
            dataset: List to draw from.
            n: Number of items to draw.
            *args: Extra positional arguments forwarded to ``random.sample``.
            **kwargs: Extra keyword arguments forwarded to ``random.sample``.

        Raises:
            ValueError: If ``dataset`` is not a list.
        """
        if not isinstance(dataset, list):
            raise ValueError(
                f"Invalid dataset provided of type {type(dataset)}. Please provide a list of examples."
            )

        return random.sample(dataset, n, *args, **kwargs)

    def train_test_split(
        self,
        dataset: List[dspy.Example],
        train_size: Union[int, float] = 0.75,
        test_size: Union[int, float] = None,
        random_state: int = None
    ) -> Mapping[str, List[dspy.Example]]:
        """Shuffle a copy of ``dataset`` and slice it into train and test parts.

        Args:
            dataset: Items to split; the input itself is not modified.
            train_size: A float strictly between 0 and 1 (fraction of the
                data, truncated to an int count) or an int (absolute count).
            test_size: Same convention as ``train_size``. When ``None``, the
                test part is everything after the train part.
            random_state: When not ``None``, seeds the module-level ``random``
                generator before shuffling.

        Returns:
            ``{'train': [...], 'test': [...]}``.

        Raises:
            ValueError: If a size is neither a float in (0, 1) nor an int, or
                if an explicit ``test_size`` plus ``train_size`` exceeds the
                number of items.
        """
        if random_state is not None:
            # NOTE: this reseeds the process-wide generator, so later calls to
            # the ``random`` module are affected as well.
            random.seed(random_state)

        # ``list(...)`` makes the shallow copy without requiring a ``.copy()``
        # method on the input.
        dataset_shuffled = list(dataset)
        random.shuffle(dataset_shuffled)
        total = len(dataset_shuffled)

        if isinstance(train_size, float) and 0 < train_size < 1:
            train_end = int(total * train_size)
        elif isinstance(train_size, int):
            train_end = train_size
        else:
            raise ValueError("Invalid train_size. Please provide a float between 0 and 1 or an int.")

        if test_size is not None:
            if isinstance(test_size, float) and 0 < test_size < 1:
                test_end = int(total * test_size)
            elif isinstance(test_size, int):
                test_end = test_size
            else:
                raise ValueError("Invalid test_size. Please provide a float between 0 and 1 or an int.")
            if train_end + test_end > total:
                raise ValueError("train_size + test_size cannot exceed the total number of samples.")
        else:
            test_end = total - train_end

        train_dataset = dataset_shuffled[:train_end]
        test_dataset = dataset_shuffled[train_end:train_end + test_end]

        return {'train': train_dataset, 'test': test_dataset}