# hardiktiwari's picture
# Upload 244 files
# 33d4721 verified
from dataclasses import dataclass
from typing import List, Optional
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
RESERVED_COLUMNS = ["autotrain_id", "autotrain_label"]


@dataclass
class TabularBinaryClassificationPreprocessor:
    """
    A preprocessor class for tabular binary classification tasks.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (str): The name of the label column in the training data.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The name of the project.
        token (str): The authentication token for the Hugging Face Hub.
        id_column (Optional[str]): The name of the ID column in the training data. Default is None.
        valid_data (Optional[pd.DataFrame]): The validation data. Default is None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2.
        seed (Optional[int]): The random seed for splitting the data. Default is 42.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False.
    """

    train_data: pd.DataFrame
    label_column: str
    username: str
    project_name: str
    token: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate that required columns exist and reserved names are free.

        Raises:
            ValueError: If id_column/label_column are missing from the
                train or validation data, or if a reserved column name is
                already present in either frame.
        """
        if self.id_column is not None and self.id_column not in self.train_data.columns:
            raise ValueError(f"{self.id_column} not in train data")
        if self.label_column not in self.train_data.columns:
            raise ValueError(f"{self.label_column} not in train data")
        if self.valid_data is not None:
            if self.id_column is not None and self.id_column not in self.valid_data.columns:
                raise ValueError(f"{self.id_column} not in valid data")
            if self.label_column not in self.valid_data.columns:
                raise ValueError(f"{self.label_column} not in valid data")
        for column in RESERVED_COLUMNS:
            if column in self.train_data.columns:
                raise ValueError(f"{column} is a reserved column name")
            if self.valid_data is not None and column in self.valid_data.columns:
                raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return (train_df, valid_df), splitting train_data if needed.

        The generated split is stratified on the label column to keep class
        balance. Both returned frames carry a fresh RangeIndex so that
        Dataset.from_pandas does not pick up a stray index column.
        """
        if self.valid_data is not None:
            # reset_index(drop=True) returns copies, so later in-place
            # column edits cannot mutate the caller-supplied DataFrames.
            return (
                self.train_data.reset_index(drop=True),
                self.valid_data.reset_index(drop=True),
            )
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Add autotrain_id/autotrain_label columns and drop the originals.

        Works on copies so the input frames are never mutated (the previous
        implementation updated them in place via ``.loc``, which also risked
        pandas SettingWithCopyWarning on sliced frames).

        Args:
            train_df (pd.DataFrame): Training frame containing the label
                (and optionally id) columns.
            valid_df (pd.DataFrame): Validation frame with the same columns.

        Returns:
            tuple: (train_df, valid_df) transformed copies.
        """
        train_df = train_df.copy()
        valid_df = valid_df.copy()
        for df in (train_df, valid_df):
            # Fall back to positional ids when no id column was supplied.
            df["autotrain_id"] = df[self.id_column] if self.id_column else list(range(len(df)))
            df["autotrain_label"] = df[self.label_column]
        drop_cols = [self.id_column, self.label_column] if self.id_column else [self.label_column]
        return train_df.drop(columns=drop_cols), valid_df.drop(columns=drop_cols)

    def prepare(self):
        """Split, transform, and persist the dataset.

        Saves a DatasetDict to disk when ``local`` is True, otherwise pushes
        both splits to a private dataset repo on the Hugging Face Hub.

        Returns:
            str: Local path or Hub repo id of the prepared dataset.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)
        if self.local:
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            return f"{self.project_name}/autotrain-data"
        train_ds.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="train",
            private=True,
            token=self.token,
        )
        valid_ds.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="validation",
            private=True,
            token=self.token,
        )
        return f"{self.username}/autotrain-data-{self.project_name}"
class TabularMultiClassClassificationPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for multi-class tabular classification.

    Reuses TabularBinaryClassificationPreprocessor unchanged: the stratified
    split and single autotrain_label column already handle any number of
    classes, so no overrides are needed.
    """

    pass
class TabularSingleColumnRegressionPreprocessor(TabularBinaryClassificationPreprocessor):
    """Preprocessor for single-target tabular regression.

    Identical to the binary-classification preprocessor except that the
    train/validation split is unstratified: the target is continuous, so
    class-based stratification does not apply.
    """

    def split(self):
        """Return (train_df, valid_df), splitting train_data only when no
        validation frame was supplied."""
        if self.valid_data is None:
            train_df, valid_df = train_test_split(
                self.train_data,
                test_size=self.test_size,
                random_state=self.seed,
            )
            return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)
        return self.train_data, self.valid_data
@dataclass
class TabularMultiLabelClassificationPreprocessor:
    """
    Preprocessor for tabular multi-label classification tasks.

    Attributes:
        train_data (pd.DataFrame): The training data.
        label_column (List[str]): List of columns to be used as labels.
        username (str): The username for the Hugging Face Hub.
        project_name (str): The project name for the Hugging Face Hub.
        id_column (Optional[str]): The column to be used as an identifier. Defaults to None.
        valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None.
        test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2.
        seed (Optional[int]): The random seed for splitting the data. Defaults to 42.
        token (Optional[str]): The token for authentication with the Hugging Face Hub. Defaults to None.
        local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Defaults to False.
    """

    train_data: pd.DataFrame
    label_column: List[str]
    username: str
    project_name: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    token: Optional[str] = None
    local: Optional[bool] = False

    def __post_init__(self):
        """Validate that required columns exist and reserved names are free.

        Raises:
            ValueError: If id_column or any label column is missing from the
                train or validation data, or if a reserved column name is
                already present in either frame.
        """
        if self.id_column is not None and self.id_column not in self.train_data.columns:
            raise ValueError(f"{self.id_column} not in train data")
        for label in self.label_column:
            if label not in self.train_data.columns:
                raise ValueError(f"{label} not in train data")
        if self.valid_data is not None:
            if self.id_column is not None and self.id_column not in self.valid_data.columns:
                raise ValueError(f"{self.id_column} not in valid data")
            for label in self.label_column:
                if label not in self.valid_data.columns:
                    raise ValueError(f"{label} not in valid data")
        for column in RESERVED_COLUMNS:
            if column in self.train_data.columns:
                raise ValueError(f"{column} is a reserved column name")
            if self.valid_data is not None and column in self.valid_data.columns:
                raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return (train_df, valid_df), splitting train_data if needed.

        The generated split is stratified on the label columns (sklearn
        supports multilabel indicator targets for stratification). Both
        returned frames carry a fresh RangeIndex so Dataset.from_pandas
        does not pick up a stray index column.
        """
        if self.valid_data is not None:
            # reset_index(drop=True) returns copies, so later in-place
            # column edits cannot mutate the caller-supplied DataFrames.
            return (
                self.train_data.reset_index(drop=True),
                self.valid_data.reset_index(drop=True),
            )
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Add autotrain_id plus one autotrain_label_<i> per label column.

        Works on copies so the input frames are never mutated (the previous
        implementation updated them in place via ``.loc``), then drops the
        original id/label columns.

        Args:
            train_df (pd.DataFrame): Training frame containing the label
                (and optionally id) columns.
            valid_df (pd.DataFrame): Validation frame with the same columns.

        Returns:
            tuple: (train_df, valid_df) transformed copies.
        """
        train_df = train_df.copy()
        valid_df = valid_df.copy()
        for df in (train_df, valid_df):
            # Fall back to positional ids when no id column was supplied.
            df["autotrain_id"] = df[self.id_column] if self.id_column else list(range(len(df)))
            for idx, label_col in enumerate(self.label_column):
                df[f"autotrain_label_{idx}"] = df[label_col]
        drop_cols = [self.id_column] + self.label_column if self.id_column else self.label_column
        return train_df.drop(columns=drop_cols), valid_df.drop(columns=drop_cols)

    def prepare(self):
        """Split, transform, and persist the dataset.

        Saves a DatasetDict to disk when ``local`` is True, otherwise pushes
        both splits to a private dataset repo on the Hugging Face Hub.

        Returns:
            str: Local path or Hub repo id of the prepared dataset.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)
        if self.local:
            dataset = DatasetDict(
                {
                    "train": train_ds,
                    "validation": valid_ds,
                }
            )
            dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            return f"{self.project_name}/autotrain-data"
        train_ds.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="train",
            private=True,
            token=self.token,
        )
        valid_ds.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="validation",
            private=True,
            token=self.token,
        )
        return f"{self.username}/autotrain-data-{self.project_name}"
class TabularMultiColumnRegressionPreprocessor(TabularMultiLabelClassificationPreprocessor):
    """Preprocessor for multi-column (multi-output) tabular regression.

    Overrides split() because the inherited implementation stratifies on the
    label columns; sklearn stratification requires discrete classes and
    raises ValueError for continuous regression targets (typically "The
    least populated class in y has only 1 member").
    """

    def split(self):
        """Return (train_df, valid_df) using an unstratified split,
        mirroring TabularSingleColumnRegressionPreprocessor."""
        if self.valid_data is not None:
            return self.train_data, self.valid_data
        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)