import os
import shutil
import uuid
from dataclasses import dataclass
from typing import Optional

import pandas as pd
from datasets import ClassLabel, Features, Image, Sequence, Value, load_dataset
from sklearn.model_selection import train_test_split


ALLOWED_EXTENSIONS = ("jpeg", "png", "jpg", "JPG", "JPEG", "PNG")
@dataclass
class ImageClassificationPreprocessor:
    """
    Preprocesses image data for classification tasks.

    Attributes
    ----------
    train_data : str
        Path to the training data directory.
    username : str
        Username for the Hugging Face Hub.
    project_name : str
        Name of the project.
    token : str
        Authentication token for the Hugging Face Hub.
    valid_data : Optional[str], optional
        Path to the validation data directory, by default None.
    test_size : Optional[float], optional
        Proportion of the dataset to include in the validation split, by default 0.2.
    seed : Optional[int], optional
        Random seed for reproducibility, by default 42.
    local : Optional[bool], optional
        Whether to save the dataset locally instead of pushing it to the Hugging Face Hub, by default False.

    Methods
    -------
    __post_init__():
        Validates the structure and contents of the training and validation data directories.
    split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        Splits the dataframe into training and validation sets, stratified by class.
    prepare() -> str:
        Prepares the dataset for training and either saves it locally or pushes it to the Hugging Face Hub.
    """

    train_data: str
    username: str
    project_name: str
    token: str
    valid_data: Optional[str] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    local: Optional[bool] = False
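    # Expected train_data layout, sketched from the checks in __post_init__ below
    # (folder and file names are illustrative): at least 2 class subfolders, each
    # holding at least 5 image files and nothing else.
    #
    #   train_data/
    #       class_a/
    #           image_1.jpg ... image_5.jpg
    #       class_b/
    #           image_1.png ... image_5.png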
    def __post_init__(self):
        # Check that the train data path exists
        if not os.path.exists(self.train_data):
            raise ValueError(f"{self.train_data} does not exist.")

        # Check that the train data path contains at least 2 subfolders (one per class)
        subfolders = [f.path for f in os.scandir(self.train_data) if f.is_dir()]
        if len(subfolders) < 2:
            raise ValueError(f"{self.train_data} should contain at least 2 subfolders.")

        # Check that each subfolder contains at least 5 image files in jpeg, png or jpg format only
        for subfolder in subfolders:
            image_files = [f for f in os.listdir(subfolder) if f.endswith(ALLOWED_EXTENSIONS)]
            if len(image_files) < 5:
                raise ValueError(f"{subfolder} should contain at least 5 jpeg, png or jpg files.")
            # Check that the subfolder contains no files other than image files
            if len(image_files) != len(os.listdir(subfolder)):
                raise ValueError(f"{subfolder} should not contain any other files except image files.")
            # Check that there are no nested subfolders
            subfolders_in_subfolder = [f.path for f in os.scandir(subfolder) if f.is_dir()]
            if len(subfolders_in_subfolder) > 0:
                raise ValueError(f"{subfolder} should not contain any subfolders.")

        if self.valid_data:
            # Check that the valid data path exists
            if not os.path.exists(self.valid_data):
                raise ValueError(f"{self.valid_data} does not exist.")

            # Check that the valid data path contains at least 2 subfolders
            subfolders = [f.path for f in os.scandir(self.valid_data) if f.is_dir()]

            # Make sure that the subfolders in train and valid data are the same
            train_subfolders = set(os.path.basename(f.path) for f in os.scandir(self.train_data) if f.is_dir())
            valid_subfolders = set(os.path.basename(f.path) for f in os.scandir(self.valid_data) if f.is_dir())
            if train_subfolders != valid_subfolders:
                raise ValueError(f"{self.valid_data} should have the same subfolders as {self.train_data}.")

            if len(subfolders) < 2:
                raise ValueError(f"{self.valid_data} should contain at least 2 subfolders.")

            # Check that each subfolder contains at least 5 image files in jpeg, png or jpg format only
            for subfolder in subfolders:
                image_files = [f for f in os.listdir(subfolder) if f.endswith(ALLOWED_EXTENSIONS)]
                if len(image_files) < 5:
                    raise ValueError(f"{subfolder} should contain at least 5 jpeg, png or jpg files.")
                # Check that the subfolder contains no files other than image files
                if len(image_files) != len(os.listdir(subfolder)):
                    raise ValueError(f"{subfolder} should not contain any other files except image files.")
                # Check that there are no nested subfolders
                subfolders_in_subfolder = [f.path for f in os.scandir(subfolder) if f.is_dir()]
                if len(subfolders_in_subfolder) > 0:
                    raise ValueError(f"{subfolder} should not contain any subfolders.")
    def split(self, df):
        train_df, valid_df = train_test_split(
            df,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=df["subfolder"],
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df
    def prepare(self):
        random_uuid = uuid.uuid4()
        cache_dir = os.environ.get("HF_HOME")
        if not cache_dir:
            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid))

        if self.valid_data:
            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))

            dataset = load_dataset("imagefolder", data_dir=data_dir)
            dataset = dataset.rename_columns({"image": "autotrain_image", "label": "autotrain_label"})

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )
        else:
            subfolders = [f.path for f in os.scandir(self.train_data) if f.is_dir()]

            # Collect image filenames and their class (subfolder) names
            image_filenames = []
            subfolder_names = []
            for subfolder in subfolders:
                for filename in os.listdir(subfolder):
                    # Use ALLOWED_EXTENSIONS here so uppercase extensions accepted
                    # by __post_init__ are not silently dropped
                    if filename.endswith(ALLOWED_EXTENSIONS):
                        image_filenames.append(filename)
                        subfolder_names.append(os.path.basename(subfolder))

            df = pd.DataFrame({"image_filename": image_filenames, "subfolder": subfolder_names})
            train_df, valid_df = self.split(df)

            # Copy images into the train and validation folders
            for row in train_df.itertuples():
                os.makedirs(os.path.join(data_dir, "train", row.subfolder), exist_ok=True)
                shutil.copy(
                    os.path.join(self.train_data, row.subfolder, row.image_filename),
                    os.path.join(data_dir, "train", row.subfolder, row.image_filename),
                )
            for row in valid_df.itertuples():
                os.makedirs(os.path.join(data_dir, "validation", row.subfolder), exist_ok=True)
                shutil.copy(
                    os.path.join(self.train_data, row.subfolder, row.image_filename),
                    os.path.join(data_dir, "validation", row.subfolder, row.image_filename),
                )

            dataset = load_dataset("imagefolder", data_dir=data_dir)
            dataset = dataset.rename_columns({"image": "autotrain_image", "label": "autotrain_label"})

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )

        if self.local:
            return f"{self.project_name}/autotrain-data"
        return f"{self.username}/autotrain-data-{self.project_name}"
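
# Usage sketch for ImageClassificationPreprocessor (illustrative only; the path,
# username, project name, and token below are placeholders):
#
#   preprocessor = ImageClassificationPreprocessor(
#       train_data="images/train",
#       username="my-username",
#       project_name="my-project",
#       token="hf_xxx",
#       local=True,
#   )
#   dataset_path = preprocessor.prepare()  # -> "my-project/autotrain-data"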
@dataclass
class ObjectDetectionPreprocessor:
    """
    Preprocesses data for object detection tasks.

    Attributes
    ----------
    train_data : str
        Path to the training data directory.
    username : str
        Username for the Hugging Face Hub.
    project_name : str
        Name of the project.
    token : str
        Authentication token for the Hugging Face Hub.
    valid_data : Optional[str], optional
        Path to the validation data directory, by default None.
    test_size : Optional[float], optional
        Proportion of the dataset to include in the validation split, by default 0.2.
    seed : Optional[int], optional
        Random seed for reproducibility, by default 42.
    local : Optional[bool], optional
        Whether to save the dataset locally instead of pushing it to the Hugging Face Hub, by default False.

    Methods
    -------
    _process_metadata(data_path):
        Processes the metadata.jsonl file and extracts the required columns and categories.
    __post_init__():
        Validates the existence and contents of the training and validation data directories.
    split(df):
        Splits the dataframe into training and validation sets.
    prepare():
        Prepares the dataset for training by processing metadata, splitting the data, and saving or pushing the dataset.
    """

    train_data: str
    username: str
    project_name: str
    token: str
    valid_data: Optional[str] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    local: Optional[bool] = False
    @staticmethod
    def _process_metadata(data_path):
        metadata = pd.read_json(os.path.join(data_path, "metadata.jsonl"), lines=True)

        # Make sure that the metadata.jsonl file contains the required columns: file_name, objects
        if "file_name" not in metadata.columns or "objects" not in metadata.columns:
            raise ValueError(f"{data_path}/metadata.jsonl should contain 'file_name' and 'objects' columns.")

        # Keep only the file_name and objects columns
        metadata = metadata[["file_name", "objects"]]

        # Each value in the objects column must provide 'bbox' and 'category' keys;
        # collect every category so a ClassLabel feature can be built later
        categories = []
        for _, row in metadata.iterrows():
            obj = row["objects"]
            if "bbox" not in obj or "category" not in obj:
                raise ValueError(f"{data_path}/metadata.jsonl should contain 'bbox' and 'category' keys in 'objects'.")
            # Keep only the bbox and category keys
            obj = {k: obj[k] for k in ["bbox", "category"]}
            categories.extend(obj["category"])

        categories = set(categories)
        return metadata, categories
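
    # Illustrative metadata.jsonl line accepted by _process_metadata (values are
    # made up; each bbox is a list of 4 floats per the Features definition in
    # prepare, with one category per bbox):
    #
    #   {"file_name": "0001.jpg",
    #    "objects": {"bbox": [[10.0, 20.0, 50.0, 80.0]], "category": ["cat"]}}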
    def __post_init__(self):
        # Check that the train data path exists
        if not os.path.exists(self.train_data):
            raise ValueError(f"{self.train_data} does not exist.")

        # Check that train_data contains at least 5 image files in jpeg, png or jpg format only
        train_image_files = [f for f in os.listdir(self.train_data) if f.endswith(ALLOWED_EXTENSIONS)]
        if len(train_image_files) < 5:
            raise ValueError(f"{self.train_data} should contain at least 5 jpeg, png or jpg files.")

        # Check that train_data contains a metadata.jsonl file
        if "metadata.jsonl" not in os.listdir(self.train_data):
            raise ValueError(f"{self.train_data} should contain a metadata.jsonl file.")

        if self.valid_data:
            # Check that the valid data path exists
            if not os.path.exists(self.valid_data):
                raise ValueError(f"{self.valid_data} does not exist.")

            # Check that valid_data contains at least 5 image files in jpeg, png or jpg format only
            valid_image_files = [f for f in os.listdir(self.valid_data) if f.endswith(ALLOWED_EXTENSIONS)]
            if len(valid_image_files) < 5:
                raise ValueError(f"{self.valid_data} should contain at least 5 jpeg, png or jpg files.")

            # Check that valid_data contains a metadata.jsonl file
            if "metadata.jsonl" not in os.listdir(self.valid_data):
                raise ValueError(f"{self.valid_data} should contain a metadata.jsonl file.")
    def split(self, df):
        train_df, valid_df = train_test_split(
            df,
            test_size=self.test_size,
            random_state=self.seed,
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df
    def prepare(self):
        random_uuid = uuid.uuid4()
        cache_dir = os.environ.get("HF_HOME")
        if not cache_dir:
            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid))

        if self.valid_data:
            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))

            train_metadata, train_categories = self._process_metadata(os.path.join(data_dir, "train"))
            valid_metadata, valid_categories = self._process_metadata(os.path.join(data_dir, "validation"))

            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_metadata.to_json(
                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
            )

            all_categories = train_categories.union(valid_categories)
            features = Features(
                {
                    "image": Image(),
                    "objects": Sequence(
                        {
                            "bbox": Sequence(Value("float32"), length=4),
                            "category": ClassLabel(names=list(all_categories)),
                        }
                    ),
                }
            )

            dataset = load_dataset("imagefolder", data_dir=data_dir, features=features)
            dataset = dataset.rename_columns(
                {
                    "image": "autotrain_image",
                    "objects": "autotrain_objects",
                }
            )

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )
        else:
            metadata = pd.read_json(os.path.join(self.train_data, "metadata.jsonl"), lines=True)
            train_df, valid_df = self.split(metadata)

            # Create the train and validation folders
            os.makedirs(os.path.join(data_dir, "train"), exist_ok=True)
            os.makedirs(os.path.join(data_dir, "validation"), exist_ok=True)

            # Copy images to the train and validation folders
            for _, row in train_df.iterrows():
                shutil.copy(
                    os.path.join(self.train_data, row["file_name"]),
                    os.path.join(data_dir, "train", row["file_name"]),
                )
            for _, row in valid_df.iterrows():
                shutil.copy(
                    os.path.join(self.train_data, row["file_name"]),
                    os.path.join(data_dir, "validation", row["file_name"]),
                )

            # Save a metadata.jsonl file to each of the train and validation folders
            train_df.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_df.to_json(os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True)

            train_metadata, train_categories = self._process_metadata(os.path.join(data_dir, "train"))
            valid_metadata, valid_categories = self._process_metadata(os.path.join(data_dir, "validation"))

            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_metadata.to_json(
                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
            )

            all_categories = train_categories.union(valid_categories)
            features = Features(
                {
                    "image": Image(),
                    "objects": Sequence(
                        {
                            "bbox": Sequence(Value("float32"), length=4),
                            "category": ClassLabel(names=list(all_categories)),
                        }
                    ),
                }
            )

            dataset = load_dataset("imagefolder", data_dir=data_dir, features=features)
            dataset = dataset.rename_columns(
                {
                    "image": "autotrain_image",
                    "objects": "autotrain_objects",
                }
            )

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )

        if self.local:
            return f"{self.project_name}/autotrain-data"
        return f"{self.username}/autotrain-data-{self.project_name}"
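
# Usage sketch for ObjectDetectionPreprocessor (illustrative; assumes train_data
# holds the images plus a metadata.jsonl file as validated in __post_init__):
#
#   preprocessor = ObjectDetectionPreprocessor(
#       train_data="detection/train",
#       username="my-username",
#       project_name="my-project",
#       token="hf_xxx",
#   )
#   repo_id = preprocessor.prepare()  # -> "my-username/autotrain-data-my-project"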
@dataclass
class ImageRegressionPreprocessor:
    """
    Preprocesses image data for regression tasks.

    Expects an image folder containing a metadata.jsonl file that maps each
    file_name to a numeric target. The attributes mirror those of
    ImageClassificationPreprocessor.
    """

    train_data: str
    username: str
    project_name: str
    token: str
    valid_data: Optional[str] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    local: Optional[bool] = False
    @staticmethod
    def _process_metadata(data_path):
        metadata = pd.read_json(os.path.join(data_path, "metadata.jsonl"), lines=True)
        # Make sure that the metadata.jsonl file contains the required columns: file_name, target
        if "file_name" not in metadata.columns or "target" not in metadata.columns:
            raise ValueError(f"{data_path}/metadata.jsonl should contain 'file_name' and 'target' columns.")
        # Keep only the file_name and target columns
        metadata = metadata[["file_name", "target"]]
        return metadata
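
    # Illustrative metadata.jsonl line accepted by _process_metadata
    # (file name and target value are made up):
    #
    #   {"file_name": "0001.jpg", "target": 3.5}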
    def __post_init__(self):
        # Check that the train data path exists
        if not os.path.exists(self.train_data):
            raise ValueError(f"{self.train_data} does not exist.")

        # Check that train_data contains at least 5 image files in jpeg, png or jpg format only
        train_image_files = [f for f in os.listdir(self.train_data) if f.endswith(ALLOWED_EXTENSIONS)]
        if len(train_image_files) < 5:
            raise ValueError(f"{self.train_data} should contain at least 5 jpeg, png or jpg files.")

        # Check that train_data contains a metadata.jsonl file
        if "metadata.jsonl" not in os.listdir(self.train_data):
            raise ValueError(f"{self.train_data} should contain a metadata.jsonl file.")

        if self.valid_data:
            # Check that the valid data path exists
            if not os.path.exists(self.valid_data):
                raise ValueError(f"{self.valid_data} does not exist.")

            # Check that valid_data contains at least 5 image files in jpeg, png or jpg format only
            valid_image_files = [f for f in os.listdir(self.valid_data) if f.endswith(ALLOWED_EXTENSIONS)]
            if len(valid_image_files) < 5:
                raise ValueError(f"{self.valid_data} should contain at least 5 jpeg, png or jpg files.")

            # Check that valid_data contains a metadata.jsonl file
            if "metadata.jsonl" not in os.listdir(self.valid_data):
                raise ValueError(f"{self.valid_data} should contain a metadata.jsonl file.")
    def split(self, df):
        train_df, valid_df = train_test_split(
            df,
            test_size=self.test_size,
            random_state=self.seed,
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df
    def prepare(self):
        random_uuid = uuid.uuid4()
        cache_dir = os.environ.get("HF_HOME")
        if not cache_dir:
            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid))

        if self.valid_data:
            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))

            train_metadata = self._process_metadata(os.path.join(data_dir, "train"))
            valid_metadata = self._process_metadata(os.path.join(data_dir, "validation"))

            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_metadata.to_json(
                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
            )

            dataset = load_dataset("imagefolder", data_dir=data_dir)
            dataset = dataset.rename_columns(
                {
                    "image": "autotrain_image",
                    "target": "autotrain_label",
                }
            )

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )
        else:
            metadata = pd.read_json(os.path.join(self.train_data, "metadata.jsonl"), lines=True)
            train_df, valid_df = self.split(metadata)

            # Create the train and validation folders
            os.makedirs(os.path.join(data_dir, "train"), exist_ok=True)
            os.makedirs(os.path.join(data_dir, "validation"), exist_ok=True)

            # Copy images to the train and validation folders
            for _, row in train_df.iterrows():
                shutil.copy(
                    os.path.join(self.train_data, row["file_name"]),
                    os.path.join(data_dir, "train", row["file_name"]),
                )
            for _, row in valid_df.iterrows():
                shutil.copy(
                    os.path.join(self.train_data, row["file_name"]),
                    os.path.join(data_dir, "validation", row["file_name"]),
                )

            # Save a metadata.jsonl file to each of the train and validation folders
            train_df.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_df.to_json(os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True)

            train_metadata = self._process_metadata(os.path.join(data_dir, "train"))
            valid_metadata = self._process_metadata(os.path.join(data_dir, "validation"))

            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_metadata.to_json(
                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
            )

            dataset = load_dataset("imagefolder", data_dir=data_dir)
            dataset = dataset.rename_columns(
                {
                    "image": "autotrain_image",
                    "target": "autotrain_label",
                }
            )

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )

        if self.local:
            return f"{self.project_name}/autotrain-data"
        return f"{self.username}/autotrain-data-{self.project_name}"
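
# Usage sketch for ImageRegressionPreprocessor (illustrative; metadata.jsonl in
# train_data must map each file_name to a numeric target):
#
#   preprocessor = ImageRegressionPreprocessor(
#       train_data="regression/train",
#       username="my-username",
#       project_name="my-project",
#       token="hf_xxx",
#       local=True,
#   )
#   dataset_path = preprocessor.prepare()  # -> "my-project/autotrain-data"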