Spaces:
Sleeping
Sleeping
File size: 12,145 Bytes
33d4721 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
# Column names the preprocessors inject into the output dataset; user-supplied
# data must not already contain them (enforced in each __post_init__).
RESERVED_COLUMNS = ["autotrain_id", "autotrain_label"]
@dataclass
class TabularBinaryClassificationPreprocessor:
    """
    A preprocessor class for tabular binary classification tasks.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (str): The name of the label column in the training data.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The name of the project.
        token (str): The authentication token for the Hugging Face Hub.
        id_column (Optional[str]): The name of the ID column in the training data. Default is None.
        valid_data (Optional[pd.DataFrame]): The validation data. Default is None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2.
        seed (Optional[int]): The random seed for splitting the data. Default is 42.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False.

    Methods:
        __post_init__(): Validates the presence of required columns in the training and validation data.
        split(): Splits the training data into training and validation sets if validation data is not provided.
        prepare_columns(train_df, valid_df): Prepares the columns by adding 'autotrain_id' and 'autotrain_label', and drops the original ID and label columns.
        prepare(): Prepares the dataset by splitting, processing columns, and saving or pushing the dataset to the Hugging Face Hub.
    """

    train_data: pd.DataFrame
    label_column: str
    username: str
    project_name: str
    token: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate that configured columns exist and reserved names are free.

        Raises:
            ValueError: If id_column/label_column are missing from train_data
                or valid_data, or if a reserved column name is already present.
        """
        # id_column (when configured) and label_column must exist in train_data.
        if self.id_column is not None and self.id_column not in self.train_data.columns:
            raise ValueError(f"{self.id_column} not in train data")
        if self.label_column not in self.train_data.columns:
            raise ValueError(f"{self.label_column} not in train data")
        # Same checks for user-supplied validation data.
        if self.valid_data is not None:
            if self.id_column is not None and self.id_column not in self.valid_data.columns:
                raise ValueError(f"{self.id_column} not in valid data")
            if self.label_column not in self.valid_data.columns:
                raise ValueError(f"{self.label_column} not in valid data")
        # Reserved names would be silently clobbered by prepare_columns(); reject them.
        for column in RESERVED_COLUMNS:
            if column in self.train_data.columns:
                raise ValueError(f"{column} is a reserved column name")
            if self.valid_data is not None and column in self.valid_data.columns:
                raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return (train_df, valid_df).

        Uses the user-supplied valid_data when present; otherwise performs a
        label-stratified random split of train_data.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],  # preserve label balance in both splits
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Add 'autotrain_id'/'autotrain_label' columns and drop the originals.

        Operates on copies so the caller's DataFrames are never mutated —
        previously a user-supplied valid_data (returned as-is by split())
        was modified in place via `.loc` assignment.

        Returns:
            tuple[pd.DataFrame, pd.DataFrame]: The transformed (train, valid) frames.
        """
        train_df = train_df.copy()
        valid_df = valid_df.copy()
        # Fall back to a positional id when no id_column was configured.
        train_df["autotrain_id"] = train_df[self.id_column] if self.id_column else list(range(len(train_df)))
        train_df["autotrain_label"] = train_df[self.label_column]
        valid_df["autotrain_id"] = valid_df[self.id_column] if self.id_column else list(range(len(valid_df)))
        valid_df["autotrain_label"] = valid_df[self.label_column]
        # Drop the original id/label columns now that they are duplicated under reserved names.
        drop_cols = [self.id_column, self.label_column] if self.id_column else [self.label_column]
        return train_df.drop(columns=drop_cols), valid_df.drop(columns=drop_cols)

    def prepare(self):
        """Split, rename columns, then save locally or push to the Hugging Face Hub.

        Returns:
            str: The local dataset path when self.local, else the Hub repo id.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)
        if self.local:
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            return f"{self.project_name}/autotrain-data"
        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_ds.push_to_hub(repo_id, split="train", private=True, token=self.token)
        valid_ds.push_to_hub(repo_id, split="validation", private=True, token=self.token)
        return repo_id
class TabularMultiClassClassificationPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for multi-class classification tasks.

    The pipeline is identical to the binary case: the same single label
    column is copied to 'autotrain_label' and the split is stratified on it,
    which works for any number of classes.
    """

    pass
class TabularSingleColumnRegressionPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for single-column regression tasks.

    Reuses the binary-classification column handling but overrides split()
    to use a plain random split: stratifying on a continuous target is not
    meaningful.
    """

    def split(self):
        """Return (train_df, valid_df); random, non-stratified split when no valid_data is given."""
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        parts = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        train_df, valid_df = (frame.reset_index(drop=True) for frame in parts)
        return train_df, valid_df
@dataclass
class TabularMultiLabelClassificationPreprocessor:
    """
    TabularMultiLabelClassificationPreprocessor is a class for preprocessing tabular data for multi-label classification tasks.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (List[str]): List of columns to be used as labels.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The project name for the Hugging Face Hub.
        id_column (Optional[str]): The column to be used as an identifier. Defaults to None.
        valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2.
        seed (Optional[int]): The random seed for splitting the data. Defaults to 42.
        token (Optional[str]): The token for authentication with the Hugging Face Hub. Defaults to None.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Defaults to False.

    Methods:
        __post_init__(): Validates the presence of id_column and label_column in train_data and valid_data, and checks for reserved column names.
        split(): Splits the train_data into training and validation sets if valid_data is not provided.
        prepare_columns(train_df, valid_df): Prepares the columns by adding autotrain_id and autotrain_label columns, and drops the original id_column and label_column.
        prepare(): Prepares the dataset by splitting the data, preparing the columns, and converting to Hugging Face Dataset format. Saves the dataset locally or pushes to the Hugging Face Hub.
    """

    train_data: pd.DataFrame
    label_column: List[str]
    username: str
    project_name: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    token: Optional[str] = None
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate that configured columns exist and reserved names are free.

        Raises:
            ValueError: If id_column or any label column is missing from
                train_data/valid_data, or if a reserved column name is present.
        """
        # id_column (when configured) and every label column must exist in train_data.
        if self.id_column is not None and self.id_column not in self.train_data.columns:
            raise ValueError(f"{self.id_column} not in train data")
        for label in self.label_column:
            if label not in self.train_data.columns:
                raise ValueError(f"{label} not in train data")
        # Same checks for user-supplied validation data.
        if self.valid_data is not None:
            if self.id_column is not None and self.id_column not in self.valid_data.columns:
                raise ValueError(f"{self.id_column} not in valid data")
            for label in self.label_column:
                if label not in self.valid_data.columns:
                    raise ValueError(f"{label} not in valid data")
        # Reserved names would be silently clobbered by prepare_columns(); reject them.
        for column in RESERVED_COLUMNS:
            if column in self.train_data.columns:
                raise ValueError(f"{column} is a reserved column name")
            if self.valid_data is not None and column in self.valid_data.columns:
                raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return (train_df, valid_df).

        Uses the user-supplied valid_data when present; otherwise performs a
        random split of train_data stratified on the label columns.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],  # multilabel indicator stratification
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Add 'autotrain_id' and one 'autotrain_label_{i}' per label column, dropping the originals.

        Operates on copies so the caller's DataFrames are never mutated —
        previously a user-supplied valid_data (returned as-is by split())
        was modified in place via `.loc` assignment.

        Returns:
            tuple[pd.DataFrame, pd.DataFrame]: The transformed (train, valid) frames.
        """
        train_df = train_df.copy()
        valid_df = valid_df.copy()
        # Fall back to a positional id when no id_column was configured.
        train_df["autotrain_id"] = train_df[self.id_column] if self.id_column else list(range(len(train_df)))
        valid_df["autotrain_id"] = valid_df[self.id_column] if self.id_column else list(range(len(valid_df)))
        # One reserved label column per original label, indexed by position.
        for idx, label in enumerate(self.label_column):
            train_df[f"autotrain_label_{idx}"] = train_df[label]
            valid_df[f"autotrain_label_{idx}"] = valid_df[label]
        # Drop the original id/label columns now that they are duplicated under reserved names.
        drop_cols = [self.id_column] + self.label_column if self.id_column else self.label_column
        return train_df.drop(columns=drop_cols), valid_df.drop(columns=drop_cols)

    def prepare(self):
        """Split, rename columns, then save locally or push to the Hugging Face Hub.

        Returns:
            str: The local dataset path when self.local, else the Hub repo id.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)
        if self.local:
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            return f"{self.project_name}/autotrain-data"
        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_ds.push_to_hub(repo_id, split="train", private=True, token=self.token)
        valid_ds.push_to_hub(repo_id, split="validation", private=True, token=self.token)
        return repo_id
class TabularMultiColumnRegressionPreprocessor(TabularMultiLabelClassificationPreprocessor):
    """Preprocessor for multi-column regression tasks.

    Reuses the multi-label column handling but overrides split(): the
    inherited split() stratifies on the label columns, which is invalid for
    continuous regression targets (nearly every row forms its own "class",
    causing sklearn to fail). A plain random split is used instead, mirroring
    TabularSingleColumnRegressionPreprocessor.
    """

    def split(self):
        """Return (train_df, valid_df); random, non-stratified split when no valid_data is given."""
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)
|