from numbers import Number
from typing import Literal, Union, Sequence

import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted
from torch.utils.data import Dataset

from deepscreen.data.utils import label_transform, FlexibleIterable

class BaseEntityDataset(Dataset):
    """Base dataset backed by a CSV table whose columns are selected by prefix:
    `X*` entities, `Y*` target labels, `ID*` entity IDs and `U*` label units."""

    def __init__(
            self,
            dataset_path: str,
            use_col_prefixes=('X', 'Y', 'ID', 'U')
    ):
        # Read the data table header row first to filter columns and create column dtype dict
        df = pd.read_csv(
            dataset_path,
            header=0, nrows=0,
            usecols=lambda col: col.startswith(use_col_prefixes)
        )
        # Read the whole data table
        df = pd.read_csv(
            dataset_path,
            header=0,
            usecols=df.columns,
            dtype={col: 'float32' if col.startswith('Y') else 'string' for col in df.columns}
        )

        self.df = df
        self.label_cols = [col for col in df.columns if col.startswith('Y')]
        self.label_unit_cols = [col for col in df.columns if col.startswith('U')]
        self.entity_id_cols = [col for col in df.columns if col.startswith('ID')]
        self.entity_cols = [col for col in df.columns if col.startswith('X')]

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        raise NotImplementedError
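
# Illustrative sketch of the expected CSV layout (the values below are made up, not part of any
# shipped dataset); columns are picked purely by prefix, `Y*` columns are read as float32 and all
# other columns as string:
#
#   X1,ID1,Y1,U1
#   CCO,CHEMBL545,7.3,nM
#   c1ccccc1,CHEMBL277500,5.1,nM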

# TODO test transform
class SingleEntitySingleTargetDataset(BaseEntityDataset):
    """Dataset with exactly one entity column (`X`) and at most one target label column (`Y`)."""

    def __init__(
            self,
            dataset_path: str,
            task: Literal['regression', 'binary', 'multiclass'],
            n_classes: int,
            featurizer: callable,
            transformer: TransformerMixin = None,
            thresholds: Union[Number, Sequence[Number]] = None,
            discard_intermediate: bool = None,
            forward_fill: bool = True
    ):
        super().__init__(dataset_path)

        assert len(self.entity_cols) == 1, 'The dataset must contain exactly one entity column (starting with `X`).'
        if self.label_cols:
            assert len(self.label_cols) == 1, 'The dataset contains more than one label column (starting with `Y`).'
        # Remove trailing `1`s in column names for flexibility and keep the stored column lists in sync
        self.df.columns = self.df.columns.str.rstrip('1')
        self.label_cols = [col.rstrip('1') for col in self.label_cols]
        self.label_unit_cols = [col.rstrip('1') for col in self.label_unit_cols]
        self.entity_id_cols = [col.rstrip('1') for col in self.entity_id_cols]
        self.entity_cols = [col.rstrip('1') for col in self.entity_cols]
        # Forward-fill non-label columns
        nonlabel_cols = self.label_unit_cols + self.entity_id_cols + self.entity_cols
        if forward_fill:
            self.df[nonlabel_cols] = self.df[nonlabel_cols].ffill(axis=0)
        # Process target labels for training/testing if they exist
        if self.label_cols:
            # Transform target labels
            self.df[self.label_cols] = self.df[self.label_cols].apply(
                label_transform,
                units=self.df.get('U', None),
                thresholds=thresholds,
                discard_intermediate=discard_intermediate
            ).astype('float32')
            # Filter out rows with a NaN in `Y` (missing values); use inplace to save memory
            self.df.dropna(subset=self.label_cols, inplace=True)
            # Validate target labels
            # TODO: check sklearn.utils.multiclass.check_classification_targets
            match task:
                case 'regression':
                    assert all(self.df['Y'].apply(lambda x: isinstance(x, Number))), \
                        f"`Y` for task `regression` must be numeric; got {set(self.df['Y'].apply(type))}."
                case 'binary':
                    assert all(self.df['Y'].isin([0, 1])), \
                        f"`Y` for task `binary` (classification) must be 0 or 1, but `Y` got {pd.unique(self.df['Y'])}." \
                        "\nYou may set `thresholds` to discretize continuous labels."
                case 'multiclass':
                    assert n_classes >= 3, 'n_classes for task `multiclass` (classification) must be at least 3.'
                    assert all(self.df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
                        f"`Y` for task `multiclass` (classification) must be non-negative integers, " \
                        f"but `Y` got {pd.unique(self.df['Y'])}." \
                        "\nYou may set `thresholds` to discretize continuous labels."
                    target_n_unique = self.df['Y'].nunique()
                    assert target_n_unique == n_classes, \
                        f"You have set n_classes for task `multiclass` (classification) to {n_classes}, " \
                        f"but `Y` has {target_n_unique} unique labels."
        # Featurize and transform entities dataset-wise if a transformer is given;
        # otherwise featurize sample-wise in `__getitem__`
        if transformer:
            self.df['X'] = self.df['X'].apply(featurizer)
            try:
                check_is_fitted(transformer)
                self.df['X'] = list(transformer.transform(self.df['X']))
            except NotFittedError:
                self.df['X'] = list(transformer.fit_transform(self.df['X']))
            # Skip sample-wise feature extraction because it has already been done dataset-wise
            self.featurizer = lambda x: x
        else:
            self.featurizer = featurizer

        self.n_classes = n_classes
        self.df['ID'] = self.df.get('ID', self.df['X'])

    def __getitem__(self, idx):
        # Use positional indexing since `dropna` may leave gaps in the index
        sample = self.df.iloc[idx]
        return {
            'X': self.featurizer(sample['X']),
            'ID': sample['ID'],
            'Y': sample.get('Y')
        }
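
# A minimal usage sketch (illustrative only; `smiles_to_fingerprint` and the file path below are
# placeholders, not part of this module):
#
#     dataset = SingleEntitySingleTargetDataset(
#         dataset_path='data/example.csv',      # CSV with `X`/`ID`/`Y`/`U` columns
#         task='binary',
#         n_classes=2,
#         featurizer=smiles_to_fingerprint,     # any callable mapping an entity string to features
#         thresholds=7.0,                       # passed to `label_transform` to discretize continuous labels
#     )
#     sample = dataset[0]                       # {'X': <features>, 'ID': <identifier>, 'Y': 0.0 or 1.0}
#     # Samples can then be batched with torch.utils.data.DataLoader as usual.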

# TODO WIP
class MultiEntityMultiTargetDataset(BaseEntityDataset):
    """Dataset with multiple entity columns (`X*`) and multiple target label columns (`Y*`). Work in progress."""

    def __init__(
            self,
            dataset_path: str,
            task: FlexibleIterable[Literal['regression', 'binary', 'multiclass']],
            n_class: FlexibleIterable[int],
            featurizers: FlexibleIterable[callable],
            thresholds: FlexibleIterable[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: FlexibleIterable[bool] = None,
    ):
        super().__init__(dataset_path)

        label_col_prefix = ('Y',)
        nonlabel_col_prefixes = ('X', 'ID', 'U')
        allowed_col_prefixes = label_col_prefix + nonlabel_col_prefixes
        # Read the headers first to filter columns and create column dtype dict
        df = pd.read_csv(
            dataset_path,
            header=0, nrows=0,
            usecols=lambda col: col.startswith(allowed_col_prefixes)
        )
        # Read the whole table
        df = pd.read_csv(
            dataset_path,
            header=0,
            usecols=df.columns,
            dtype={col: 'float32' if col.startswith('Y') else 'string' for col in df.columns}
        )

        label_cols = [col for col in df.columns if col.startswith(label_col_prefix)]
        nonlabel_cols = [col for col in df.columns if col.startswith(nonlabel_col_prefixes)]
        self.entity_cols = [col for col in nonlabel_cols if col.startswith('X')]

        # Forward-fill all non-label columns
        df[nonlabel_cols] = df[nonlabel_cols].ffill(axis=0)
        # Process target labels for training/testing
        if label_cols:
            # Transform target labels
            df[label_cols] = df[label_cols].apply(
                label_transform,
                units=df.get('U', None),
                thresholds=thresholds,
                discard_intermediate=discard_intermediate
            ).astype('float32')
            # Filter out rows with a NaN in `Y` (missing values)
            df.dropna(subset=label_cols, inplace=True)
            # Validate target labels
            # TODO: check sklearn.utils.multiclass.check_classification_targets
            # WIP
            match task:
                case 'regression':
                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
                        f"`Y` for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
                case 'binary':
                    assert all(df['Y'].isin([0, 1])), \
                        f"`Y` for task `binary` must be 0 or 1, but `Y` got {pd.unique(df['Y'])}." \
                        "\nYou may set `thresholds` to discretize continuous labels."
                case 'multiclass':
                    assert len(label_cols) == len(n_class), \
                        (f'Data table has {len(label_cols)} label columns (`Y*`) but you have specified '
                         f'n_class of length {len(n_class)} for task `multiclass`.')
                    # Iterate over label columns as Series (zipping over the DataFrame itself would yield column names)
                    for (_, label), n in zip(df[label_cols].items(), n_class):
                        assert n >= 3, 'n_class for task `multiclass` must be at least 3.'
                        assert all(label.apply(lambda x: x.is_integer() and x >= 0)), \
                            f"`Y` for task `multiclass` must be non-negative integers, " \
                            f"but `Y` got {pd.unique(label)}." \
                            "\nYou may set `thresholds` to discretize continuous labels."
                        target_n_unique = label.nunique()
                        assert target_n_unique == n, \
                            f"You have set n_class for task `multiclass` to {n}, " \
                            f"but `Y` has {target_n_unique} unique labels."
        self.df = df
        self.featurizers = featurizers
        self.n_class = n_class

    def __len__(self):
        return len(self.df.index)

    # WIP
    def __getitem__(self, idx):
        # Use positional indexing since `dropna` may leave gaps in the index
        sample = self.df.iloc[idx]
        return {
            'X': [featurizer(x) for featurizer, x in zip(self.featurizers, sample[self.entity_cols])],
            'ID': sample.get('ID', sample['X']),
            'Y': sample.get('Y')
        }
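
# Illustrative sketch of a multi-entity/multi-target table this WIP class is aimed at (the values
# below are made up, not part of any shipped dataset): one featurizer per `X*` column and per-label
# `task`/`n_class`/`thresholds` settings supplied through the `FlexibleIterable` arguments.
#
#   X1,X2,ID1,ID2,Y1,Y2,U1,U2
#   CCO,MKTAYIAK,CHEMBL545,P00533,6.2,1.0,nM,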