File size: 3,525 Bytes
6ae852e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
953417b
6ae852e
953417b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from typing import Mapping, Iterable

from torch.utils.data import BatchSampler, RandomSampler, SequentialSampler


class SafeBatchSampler(BatchSampler):
    """
    A safe `batch_sampler` that skips samples with `None` values, supports shuffling, and keeps a fixed batch size.

    Each index drawn from the underlying sampler is used to fetch ``data_source[idx]``. The index is kept only
    if the sample is valid: the sample itself is not `None` and, for mappings and other non-string iterables,
    none of its values/elements is `None`. Because invalid indices are skipped *before* batching, every yielded
    batch (except possibly the last one) contains exactly `batch_size` indices.

    Note:
        Validation reads ``data_source[idx]`` inside the sampler, so each sample is accessed here in addition
        to any later access by the consumer of the indices.

    Args:
        data_source (Dataset): The dataset to sample from. Must support ``data_source[idx]``.
        batch_size (int): The size of each batch.
        drop_last (bool): Whether to drop the last batch if its size is smaller than `batch_size`.
            Defaults to `False`.
        shuffle (bool, optional): Whether to shuffle the data before sampling. Ignored when `sampler` is given.
            Defaults to `True`.
        sampler (Sampler or Iterable, optional): A custom index sampler. When `None`, a `RandomSampler`
            (``shuffle=True``) or `SequentialSampler` (``shuffle=False``) over `data_source` is used.

    Raises:
        ValueError: If `batch_size` is not a positive integer or `drop_last` is not a boolean.

    Example:
        >>> dataloader = DataLoader(dataset, batch_sampler=SafeBatchSampler(dataset, batch_size, drop_last, shuffle))
    """
    def __init__(self, data_source, batch_size: int, drop_last: bool = False, shuffle: bool = True, sampler=None):
        # `bool` is a subclass of `int`, so it has to be rejected explicitly.
        if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
                batch_size <= 0:
            raise ValueError(f"batch_size should be a positive integer value, but got batch_size={batch_size}")
        if not isinstance(drop_last, bool):
            raise ValueError(f"drop_last should be a boolean value, but got drop_last={drop_last}")

        # Compare against `None` rather than relying on truthiness: a user-supplied sampler that happens
        # to be empty (or otherwise falsy) must not be silently replaced by a default one.
        if sampler is None:
            if shuffle:
                sampler = RandomSampler(data_source)  # type: ignore[arg-type]
            else:
                sampler = SequentialSampler(data_source)  # type: ignore[arg-type]

        super().__init__(sampler, batch_size, drop_last)
        self.data_source = data_source

    @staticmethod
    def _is_valid(sample) -> bool:
        """Return `True` if `sample` is not `None` and contains no `None` value/element."""
        if isinstance(sample, Mapping):
            values = sample.values()
        elif isinstance(sample, Iterable) and not isinstance(sample, (str, bytes)):
            values = sample
        else:
            # Scalars, strings and bytes are atomic: only the object itself can be `None`.
            return sample is not None
        return all(v is not None for v in values)

    def __iter__(self):
        """Yield lists of indices whose samples are valid, `batch_size` indices at a time."""
        batch = []
        for idx in self.sampler:
            if not self._is_valid(self.data_source[idx]):
                continue
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []  # start a fresh list so already-yielded batches are never mutated

        # Emit the trailing partial batch unless the caller asked to drop it.
        if batch and not self.drop_last:
            yield batch

    def __len__(self) -> int:
        """
        Return an upper bound on the number of batches.

        The exact count is only known after inspecting every sample, because invalid samples are skipped.
        The value returned is the batch count of a plain `BatchSampler` over the same sampler, which is what
        this sampler yields when no sample is skipped.
        """
        return super().__len__()