File size: 12,145 Bytes
33d4721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


# Column names the preprocessors create themselves; __post_init__ rejects any
# input frame that already contains one of these.
RESERVED_COLUMNS = ["autotrain_id", "autotrain_label"]


@dataclass
class TabularBinaryClassificationPreprocessor:
    """
    A preprocessor class for tabular binary classification tasks.

    Splits the training data (stratified on the label) when no validation
    set is supplied, copies the id/label columns into the reserved
    ``autotrain_id`` / ``autotrain_label`` columns, and either saves the
    resulting dataset locally or pushes both splits to the Hugging Face Hub
    as a private dataset.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (str): The name of the label column in the training data.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The name of the project.
        token (str): The authentication token for the Hugging Face Hub.
        id_column (Optional[str]): The name of the ID column in the training data. Default is None.
        valid_data (Optional[pd.DataFrame]): The validation data. Default is None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2.
        seed (Optional[int]): The random seed for splitting the data. Default is 42.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False.

    Methods:
        __post_init__(): Validates the presence of required columns in the training and validation data.
        split(): Splits the training data into training and validation sets if validation data is not provided.
        prepare_columns(train_df, valid_df): Prepares the columns by adding 'autotrain_id' and 'autotrain_label', and drops the original ID and label columns.
        prepare(): Prepares the dataset by splitting, processing columns, and saving or pushing the dataset to the Hugging Face Hub.
    """

    train_data: pd.DataFrame
    label_column: str
    username: str
    project_name: str
    token: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate required columns and reject reserved column names.

        Raises:
            ValueError: If the id/label column is missing from either frame,
                or if a frame already contains a reserved column name.
        """
        # Run identical checks on both frames instead of duplicating the code.
        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        for name, df in frames:
            if self.id_column is not None and self.id_column not in df.columns:
                raise ValueError(f"{self.id_column} not in {name} data")
            if self.label_column not in df.columns:
                raise ValueError(f"{self.label_column} not in {name} data")
            for column in RESERVED_COLUMNS:
                if column in df.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        A user-supplied validation set takes precedence; otherwise the
        training data is split with stratification on the label column so
        both splits keep the class balance.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Add ``autotrain_id``/``autotrain_label`` and drop the source columns.

        When no id column is configured, a 0..n-1 positional id is generated.
        Returns the two transformed frames.
        """
        drop_cols = [self.id_column, self.label_column] if self.id_column else [self.label_column]
        for df in (train_df, valid_df):
            df.loc[:, "autotrain_id"] = df[self.id_column] if self.id_column else list(range(len(df)))
            df.loc[:, "autotrain_label"] = df[self.label_column]
        return train_df.drop(columns=drop_cols), valid_df.drop(columns=drop_cols)

    def prepare(self):
        """Split, transform, and persist the dataset.

        Returns:
            str: The local dataset path when ``local`` is True, otherwise the
            Hub repository id the splits were pushed to.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)

        if self.local:
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            return f"{self.project_name}/autotrain-data"

        # Both splits go to the same private Hub repository.
        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_ds.push_to_hub(repo_id, split="train", private=True, token=self.token)
        valid_ds.push_to_hub(repo_id, split="validation", private=True, token=self.token)
        return repo_id


class TabularMultiClassClassificationPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for tabular multi-class classification.

    Uses exactly the binary-classification pipeline (single label column,
    stratified split) — no behavior differences.
    """

    pass


class TabularSingleColumnRegressionPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for single-target tabular regression.

    Identical to the binary-classification preprocessor except that the
    train/validation split is not stratified.
    """

    def split(self):
        """Return ``(train_df, valid_df)``, splitting only when needed."""
        # A user-supplied validation set takes precedence over splitting.
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        split_frames = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        train_df, valid_df = (frame.reset_index(drop=True) for frame in split_frames)
        return train_df, valid_df


@dataclass
class TabularMultiLabelClassificationPreprocessor:
    """
    TabularMultiLabelClassificationPreprocessor is a class for preprocessing tabular data for multi-label classification tasks.

    Each label column is copied into its own reserved ``autotrain_label_{i}``
    column; the id column (or a generated positional id) becomes
    ``autotrain_id``. The result is saved locally or pushed to the Hugging
    Face Hub as a private dataset.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (List[str]): List of columns to be used as labels.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The project name for the Hugging Face Hub.
        id_column (Optional[str]): The column to be used as an identifier. Defaults to None.
        valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2.
        seed (Optional[int]): The random seed for splitting the data. Defaults to 42.
        token (Optional[str]): The token for authentication with the Hugging Face Hub. Defaults to None.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Defaults to False.

    Methods:
        __post_init__(): Validates the presence of id_column and label_column in train_data and valid_data, and checks for reserved column names.
        split(): Splits the train_data into training and validation sets if valid_data is not provided.
        prepare_columns(train_df, valid_df): Prepares the columns by adding autotrain_id and autotrain_label columns, and drops the original id_column and label_column.
        prepare(): Prepares the dataset by splitting the data, preparing the columns, and converting to Hugging Face Dataset format. Saves the dataset locally or pushes to the Hugging Face Hub.
    """

    train_data: pd.DataFrame
    label_column: List[str]
    username: str
    project_name: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    token: Optional[str] = None
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate required columns and reject reserved column names.

        Raises:
            ValueError: If the id column or any label column is missing from
                either frame, or a frame contains a reserved column name.
        """
        # Run identical checks on both frames instead of duplicating the code.
        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        for name, df in frames:
            if self.id_column is not None and self.id_column not in df.columns:
                raise ValueError(f"{self.id_column} not in {name} data")
            for label in self.label_column:
                if label not in df.columns:
                    raise ValueError(f"{label} not in {name} data")
            for column in RESERVED_COLUMNS:
                if column in df.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        A user-supplied validation set takes precedence; otherwise the
        training data is split, stratified on the label columns.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Add ``autotrain_id`` and one ``autotrain_label_{i}`` per label column.

        The original id/label columns are dropped; when no id column is
        configured, a 0..n-1 positional id is generated. Returns the two
        transformed frames.
        """
        drop_cols = [self.id_column] + self.label_column if self.id_column else self.label_column
        for df in (train_df, valid_df):
            df.loc[:, "autotrain_id"] = df[self.id_column] if self.id_column else list(range(len(df)))
            for idx, label in enumerate(self.label_column):
                df.loc[:, f"autotrain_label_{idx}"] = df[label]
        return train_df.drop(columns=drop_cols), valid_df.drop(columns=drop_cols)

    def prepare(self):
        """Split, transform, and persist the dataset.

        Returns:
            str: The local dataset path when ``local`` is True, otherwise the
            Hub repository id the splits were pushed to.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)

        if self.local:
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            return f"{self.project_name}/autotrain-data"

        # Both splits go to the same private Hub repository.
        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_ds.push_to_hub(repo_id, split="train", private=True, token=self.token)
        valid_ds.push_to_hub(repo_id, split="validation", private=True, token=self.token)
        return repo_id


class TabularMultiColumnRegressionPreprocessor(TabularMultiLabelClassificationPreprocessor):
    """Preprocessor for multi-target tabular regression.

    Reuses the multi-label column handling (one ``autotrain_label_{i}`` per
    target column) but overrides ``split`` to drop stratification: the
    inherited split stratifies on the label columns, which fails for
    continuous regression targets. This mirrors how
    ``TabularSingleColumnRegressionPreprocessor`` overrides the
    classification split.
    """

    def split(self):
        """Return ``(train_df, valid_df)`` using a plain (non-stratified) split."""
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df