Spaces:
Sleeping
Sleeping
from dataclasses import dataclass | |
from typing import List, Optional | |
import pandas as pd | |
from datasets import Dataset, DatasetDict | |
from sklearn.model_selection import train_test_split | |
# Column names the preprocessors in this module write into their output
# ("autotrain_id", "autotrain_label"). Input data is rejected if it already
# contains any of them.
RESERVED_COLUMNS = ["autotrain_id", "autotrain_label"]
@dataclass
class TabularBinaryClassificationPreprocessor:
    """
    A preprocessor class for tabular binary classification tasks.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (str): The name of the label column in the training data.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The name of the project.
        token (str): The authentication token for the Hugging Face Hub.
        id_column (Optional[str]): The name of the ID column in the training data. Default is None.
        valid_data (Optional[pd.DataFrame]): The validation data. Default is None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2.
        seed (Optional[int]): The random seed for splitting the data. Default is 42.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False.

    Methods:
        __post_init__(): Validates the presence of required columns in the training and validation data.
        split(): Splits the training data into training and validation sets if validation data is not provided.
        prepare_columns(train_df, valid_df): Returns new frames with 'autotrain_id' and 'autotrain_label' added
            and the original ID and label columns removed.
        prepare(): Prepares the dataset by splitting, processing columns, and saving or pushing the dataset
            to the Hugging Face Hub.
    """

    train_data: pd.DataFrame
    label_column: str
    username: str
    project_name: str
    token: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate the configured columns against the supplied data.

        Raises:
            ValueError: If ``id_column`` (when set) or ``label_column`` is
                missing from the training or validation data, or if either
                frame already contains a name from ``RESERVED_COLUMNS``.
        """
        # Validation data is optional; only check the frames that exist.
        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        # check if id_column and label_column are in train_data / valid_data
        for split_name, frame in frames:
            if self.id_column is not None and self.id_column not in frame.columns:
                raise ValueError(f"{self.id_column} not in {split_name} data")
            if self.label_column not in frame.columns:
                raise ValueError(f"{self.label_column} not in {split_name} data")

        # make sure no reserved columns are in train_data or valid_data
        for column in RESERVED_COLUMNS:
            for _, frame in frames:
                if column in frame.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return the ``(train, validation)`` frames.

        When ``valid_data`` was supplied, both frames are returned as given.
        Otherwise ``train_data`` is split with ``train_test_split`` using
        ``test_size`` and ``seed``, stratified on the label column, and both
        parts get a fresh 0..n-1 index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Build output frames carrying the standard AutoTrain column names.

        Each returned frame has ``autotrain_id`` (taken from ``id_column``, or
        a 0..n-1 sequence when no ID column is configured) and
        ``autotrain_label`` (taken from ``label_column``) appended, with the
        original ID and label columns removed.

        The input frames are not modified: they may be the caller's own
        ``train_data`` / ``valid_data`` objects, as returned by ``split()``.

        Args:
            train_df (pd.DataFrame): Training frame.
            valid_df (pd.DataFrame): Validation frame.

        Returns:
            tuple: ``(train_df, valid_df)`` as new DataFrames.
        """
        # Use the same "is not None" test as __post_init__ so both agree on
        # whether an ID column is configured.
        has_id = self.id_column is not None
        drop_cols = [self.id_column, self.label_column] if has_id else [self.label_column]

        def convert(frame):
            # drop() returns a new frame, so the assignments below never
            # touch the frame that was passed in.
            prepared = frame.drop(columns=drop_cols)
            prepared["autotrain_id"] = frame[self.id_column] if has_id else list(range(len(frame)))
            prepared["autotrain_label"] = frame[self.label_column]
            return prepared

        return convert(train_df), convert(valid_df)

    def prepare(self):
        """Split, rename columns, and store the dataset.

        With ``local`` set, the train/validation splits are saved to disk
        under ``"{project_name}/autotrain-data"``; otherwise each split is
        pushed as a private dataset to
        ``"{username}/autotrain-data-{project_name}"`` using ``token``.

        Returns:
            str: The local path or the Hub repository id that was written.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)

        if self.local:
            local_path = f"{self.project_name}/autotrain-data"
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(local_path)
            return local_path

        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        for split_name, split_ds in (("train", train_ds), ("validation", valid_ds)):
            split_ds.push_to_hub(
                repo_id,
                split=split_name,
                private=True,
                token=self.token,
            )
        return repo_id
class TabularMultiClassClassificationPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for tabular multi-class classification tasks.

    Adds nothing of its own: every field and method is inherited unchanged
    from ``TabularBinaryClassificationPreprocessor``.
    """
class TabularSingleColumnRegressionPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for tabular regression on a single target column.

    Identical to ``TabularBinaryClassificationPreprocessor`` except that the
    automatic train/validation split is not stratified.
    """

    def split(self):
        """Return the ``(train, validation)`` frames.

        If no validation data was supplied, ``train_data`` is split with
        ``train_test_split`` using ``test_size`` and ``seed`` (no
        stratification) and both parts get a fresh 0..n-1 index. Otherwise
        the supplied frames are returned as they are.
        """
        if self.valid_data is None:
            parts = train_test_split(
                self.train_data,
                test_size=self.test_size,
                random_state=self.seed,
            )
            train_part, valid_part = (part.reset_index(drop=True) for part in parts)
            return train_part, valid_part

        return self.train_data, self.valid_data
@dataclass
class TabularMultiLabelClassificationPreprocessor:
    """
    TabularMultiLabelClassificationPreprocessor is a class for preprocessing tabular data for multi-label
    classification tasks.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (List[str]): List of columns to be used as labels.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The project name for the Hugging Face Hub.
        id_column (Optional[str]): The column to be used as an identifier. Defaults to None.
        valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2.
        seed (Optional[int]): The random seed for splitting the data. Defaults to 42.
        token (Optional[str]): The token for authentication with the Hugging Face Hub. Defaults to None.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Defaults to False.

    Methods:
        __post_init__(): Validates the presence of id_column and label_column in train_data and valid_data,
            and checks for reserved column names.
        split(): Splits the train_data into training and validation sets if valid_data is not provided.
        prepare_columns(train_df, valid_df): Returns new frames with autotrain_id and autotrain_label_<i>
            columns added and the original id_column and label columns removed.
        prepare(): Prepares the dataset by splitting the data, preparing the columns, and converting to
            Hugging Face Dataset format. Saves the dataset locally or pushes to the Hugging Face Hub.
    """

    train_data: pd.DataFrame
    label_column: List[str]
    username: str
    project_name: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    token: Optional[str] = None
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate the configured columns against the supplied data.

        Raises:
            ValueError: If ``id_column`` (when set) or any entry of
                ``label_column`` is missing from the training or validation
                data, or if either frame already contains a reserved name: one
                from ``RESERVED_COLUMNS`` or one of the ``autotrain_label_<i>``
                names that ``prepare_columns`` generates.
        """
        # Validation data is optional; only check the frames that exist.
        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        # check if id_column and label_column are in train_data / valid_data
        for split_name, frame in frames:
            if self.id_column is not None and self.id_column not in frame.columns:
                raise ValueError(f"{self.id_column} not in {split_name} data")
            for label in self.label_column:
                if label not in frame.columns:
                    raise ValueError(f"{label} not in {split_name} data")

        # make sure no reserved columns are in train_data or valid_data.
        # prepare_columns writes autotrain_label_<i> columns, so an input
        # column with one of those names would be silently overwritten (or,
        # if it is itself a label column, dropped). Reject those names too.
        generated = [f"autotrain_label_{index}" for index in range(len(self.label_column))]
        for column in list(RESERVED_COLUMNS) + generated:
            for _, frame in frames:
                if column in frame.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return the ``(train, validation)`` frames.

        When ``valid_data` was supplied, both frames are returned as given.
        Otherwise ``train_data`` is split with ``train_test_split`` using
        ``test_size`` and ``seed``, stratified on the label columns, and both
        parts get a fresh 0..n-1 index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Build output frames carrying the standard AutoTrain column names.

        Each returned frame has ``autotrain_id`` (taken from ``id_column``, or
        a 0..n-1 sequence when no ID column is configured) followed by one
        ``autotrain_label_<i>`` column per entry of ``label_column``, where
        ``<i>`` is the entry's position in that list. The original ID and
        label columns are removed.

        The input frames are not modified: they may be the caller's own
        ``train_data`` / ``valid_data`` objects, as returned by ``split()``.

        Args:
            train_df (pd.DataFrame): Training frame.
            valid_df (pd.DataFrame): Validation frame.

        Returns:
            tuple: ``(train_df, valid_df)`` as new DataFrames.
        """
        # Use the same "is not None" test as __post_init__ so both agree on
        # whether an ID column is configured.
        has_id = self.id_column is not None
        drop_cols = list(self.label_column)
        if has_id:
            drop_cols = [self.id_column] + drop_cols

        def convert(frame):
            # drop() returns a new frame, so the assignments below never
            # touch the frame that was passed in.
            prepared = frame.drop(columns=drop_cols)
            prepared["autotrain_id"] = frame[self.id_column] if has_id else list(range(len(frame)))
            for index, label in enumerate(self.label_column):
                prepared[f"autotrain_label_{index}"] = frame[label]
            return prepared

        return convert(train_df), convert(valid_df)

    def prepare(self):
        """Split, rename columns, and store the dataset.

        With ``local`` set, the train/validation splits are saved to disk
        under ``"{project_name}/autotrain-data"``; otherwise each split is
        pushed as a private dataset to
        ``"{username}/autotrain-data-{project_name}"`` using ``token``.

        Returns:
            str: The local path or the Hub repository id that was written.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)

        if self.local:
            local_path = f"{self.project_name}/autotrain-data"
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(local_path)
            return local_path

        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        for split_name, split_ds in (("train", train_ds), ("validation", valid_ds)):
            split_ds.push_to_hub(
                repo_id,
                split=split_name,
                private=True,
                token=self.token,
            )
        return repo_id
class TabularMultiColumnRegressionPreprocessor(TabularMultiLabelClassificationPreprocessor):
    """Preprocessor for tabular regression on several target columns.

    Column validation, column renaming and storage are inherited from
    ``TabularMultiLabelClassificationPreprocessor``. Only ``split`` is
    overridden, so that the automatic train/validation split is not
    stratified: the inherited split stratifies on the target columns, which
    suits class labels but not regression targets. This mirrors
    ``TabularSingleColumnRegressionPreprocessor``, which drops stratification
    in the same way.
    """

    def split(self):
        """Return the ``(train, validation)`` frames.

        When ``valid_data`` was supplied, both frames are returned as given.
        Otherwise ``train_data`` is split with ``train_test_split`` using
        ``test_size`` and ``seed`` (no stratification) and both parts get a
        fresh 0..n-1 index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)