Delete deepscreen
This view is limited to 50 files because it contains too many changes.
- deepscreen/__init__.py +0 -101
- deepscreen/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/__pycache__/__init__.cpython-39.pyc +0 -0
- deepscreen/__pycache__/predict.cpython-311.pyc +0 -0
- deepscreen/__pycache__/test.cpython-311.pyc +0 -0
- deepscreen/__pycache__/train.cpython-311.pyc +0 -0
- deepscreen/__pycache__/train.cpython-39.pyc +0 -0
- deepscreen/data/__init__.py +0 -0
- deepscreen/data/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/__pycache__/__init__.cpython-39.pyc +0 -0
- deepscreen/data/__pycache__/dti.cpython-311.pyc +0 -0
- deepscreen/data/__pycache__/dti_datamodule.cpython-311.pyc +0 -0
- deepscreen/data/dti.py +0 -422
- deepscreen/data/dti.py.bak +0 -369
- deepscreen/data/dti_datamodule.py +0 -314
- deepscreen/data/entity_datamodule.py +0 -167
- deepscreen/data/featurizers/__init__.py +0 -0
- deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/fcs.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/graph.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/categorical.py +0 -86
- deepscreen/data/featurizers/chem.py +0 -48
- deepscreen/data/featurizers/fcs.py +0 -67
- deepscreen/data/featurizers/fingerprint/__init__.py +0 -45
- deepscreen/data/featurizers/fingerprint/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/atompairs.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/avalonfp.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/estatefp.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/maccskeys.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/map4.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/mhfp6.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/morganfp.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/pharmErGfp.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/pharmPointfp.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/pubchemfp.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/rdkitfp.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/__pycache__/torsions.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/fingerprint/atompairs.py +0 -18
- deepscreen/data/featurizers/fingerprint/avalonfp.py +0 -16
- deepscreen/data/featurizers/fingerprint/estatefp.py +0 -12
- deepscreen/data/featurizers/fingerprint/maccskeys.py +0 -25
- deepscreen/data/featurizers/fingerprint/maccskeys.xlsx +0 -0
- deepscreen/data/featurizers/fingerprint/map4.py +0 -130
- deepscreen/data/featurizers/fingerprint/mhfp6.py +0 -18
- deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef +0 -53
- deepscreen/data/featurizers/fingerprint/morganfp.py +0 -18
- deepscreen/data/featurizers/fingerprint/pharmErGfp.py +0 -60
- deepscreen/data/featurizers/fingerprint/pharmPointfp.py +0 -59
deepscreen/__init__.py
DELETED
@@ -1,101 +0,0 @@
-"""
-DeepScreen package initialization, registering custom objects and monkey patching for some libraries.
-"""
-import sys
-from builtins import eval
-
-import lightning.fabric.strategies.launchers.subprocess_script as subprocess_script
-import torch
-from omegaconf import OmegaConf
-
-from deepscreen.utils import get_logger
-
-log = get_logger(__name__)
-
-# Allow basic Python operations in hydra interpolation; examples:
-# `in_channels: ${eval:${model.drug_encoder.out_channels}+${model.protein_encoder.out_channels}}`
-# `subdir: ${eval:${hydra.job.override_dirname}.replace('/', '.')}`
-OmegaConf.register_new_resolver("eval", eval)
-
-
-def sanitize_path(path_str: str):
-    """
-    Sanitize a string for path creation by replacing unsafe characters and cutting length to 255 (OS limitation).
-    """
-    return path_str.replace("/", ".").replace("\\", ".").replace(":", "-")[:255]
-
-
-OmegaConf.register_new_resolver("sanitize_path", sanitize_path)
-
-
-def _hydra_subprocess_cmd(local_rank: int):
-    """
-    Monkey patching for lightning.fabric.strategies.launchers.subprocess_script._hydra_subprocess_cmd
-    Temporarily fixes the problem of unnecessarily creating log folders for DDP subprocesses in Hydra multirun/sweep.
-    """
-    import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
-    from hydra.core.hydra_config import HydraConfig
-    from hydra.utils import get_original_cwd, to_absolute_path
-
-    # when user is using hydra find the absolute path
-    if __main__.__spec__ is None:  # pragma: no-cover
-        command = [sys.executable, to_absolute_path(sys.argv[0])]
-    else:
-        command = [sys.executable, "-m", __main__.__spec__.name]
-
-    command += sys.argv[1:]
-
-    cwd = get_original_cwd()
-    rundir = f'"{HydraConfig.get().runtime.output_dir}"'
-    # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml
-    command += [f"hydra.job.name=train_ddp_process_{local_rank}",
-                "hydra.output_subdir=null,"
-                f"hydra.runtime.output_dir={rundir}"]
-    return command, cwd
-
-
-subprocess_script._hydra_subprocess_cmd = _hydra_subprocess_cmd
-
-# from torch import Tensor
-# from lightning.fabric.utilities.distributed import _distributed_available
-# from lightning.pytorch.utilities.rank_zero import WarningCache
-# from lightning.pytorch.utilities.warnings import PossibleUserWarning
-# from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection
-
-# warning_cache = WarningCache()
-#
-# @staticmethod
-# def _get_cache(result_metric, on_step: bool):
-#     cache = None
-#     if on_step and result_metric.meta.on_step:
-#         cache = result_metric._forward_cache
-#     elif not on_step and result_metric.meta.on_epoch:
-#         if result_metric._computed is None:
-#             should = result_metric.meta.sync.should
-#             if not should and _distributed_available() and result_metric.is_tensor:
-#                 warning_cache.warn(
-#                     f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)`"
-#                     " when logging on epoch level in distributed setting to accumulate the metric across"
-#                     " devices.",
-#                     category=PossibleUserWarning,
-#                 )
-#             result_metric.compute()
-#             result_metric.meta.sync.should = should
-#
-#         cache = result_metric._computed
-#
-#     if cache is not None:
-#         if isinstance(cache, Tensor):
-#             if not result_metric.meta.enable_graph:
-#                 return cache.detach()
-#
-#     return cache
-#
-#
-# _ResultCollection._get_cache = _get_cache
-
-if torch.cuda.is_available():
-    if torch.cuda.get_device_capability() >= (8, 0):
-        torch.set_float32_matmul_precision("high")
-        log.info("Your GPU supports tensor cores, "
-                 "we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`")
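Aside for readers of this deletion: the `eval` resolver registered in deepscreen/__init__.py above is what enabled arithmetic inside Hydra/OmegaConf interpolations. A minimal, self-contained sketch of that behavior; the `drug_out`/`protein_out` keys are invented for illustration, only `OmegaConf.register_new_resolver("eval", eval)` and the `${eval:...}` pattern come from the deleted file:

from omegaconf import OmegaConf

# Same registration the deleted __init__.py performed at import time.
OmegaConf.register_new_resolver("eval", eval)

# Hypothetical config: `in_channels` is computed from two encoder widths,
# mirroring the `${eval:...}` example in the file's own comments.
cfg = OmegaConf.create({
    "drug_out": 128,
    "protein_out": 256,
    "in_channels": "${eval:${drug_out}+${protein_out}}",
})
print(cfg.in_channels)  # -> 384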
deepscreen/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (3.28 kB)

deepscreen/__pycache__/__init__.cpython-39.pyc
DELETED
Binary file (247 Bytes)

deepscreen/__pycache__/predict.cpython-311.pyc
DELETED
Binary file (3.37 kB)

deepscreen/__pycache__/test.cpython-311.pyc
DELETED
Binary file (4.54 kB)

deepscreen/__pycache__/train.cpython-311.pyc
DELETED
Binary file (7.14 kB)

deepscreen/__pycache__/train.cpython-39.pyc
DELETED
Binary file (2.68 kB)

deepscreen/data/__init__.py
DELETED
File without changes

deepscreen/data/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (179 Bytes)

deepscreen/data/__pycache__/__init__.cpython-39.pyc
DELETED
Binary file (161 Bytes)

deepscreen/data/__pycache__/dti.cpython-311.pyc
DELETED
Binary file (23 kB)

deepscreen/data/__pycache__/dti_datamodule.cpython-311.pyc
DELETED
Binary file (13 kB)
deepscreen/data/dti.py
DELETED
@@ -1,422 +0,0 @@
-import re
-from functools import partial
-from numbers import Number
-from pathlib import Path
-from typing import Any, Dict, Optional, Sequence, Union, Literal
-
-from lightning import LightningDataModule
-import pandas as pd
-import swifter
-from sklearn.preprocessing import LabelEncoder
-from torch.utils.data import Dataset, DataLoader
-
-from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
-from deepscreen.utils import get_logger
-
-log = get_logger(__name__)
-
-SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
-FASTA_PAT = r"[^A-Z*\-]"
-
-
-def validate_seq_str(seq, regex):
-    if seq:
-        err_charset = set(re.findall(regex, seq))
-        if not err_charset:
-            return None
-        else:
-            return ', '.join(err_charset)
-    else:
-        return 'Empty string'
-
-
-# TODO: save a list of corrupted records
-
-def rdkit_canonicalize(smiles):
-    from rdkit import Chem
-    try:
-        mol = Chem.MolFromSmiles(smiles)
-        cano_smiles = Chem.MolToSmiles(mol)
-        return cano_smiles
-    except Exception as e:
-        log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(e)}. Returning original SMILES: {smiles}')
-        return smiles
-
-
-class DTIDataset(Dataset):
-    def __init__(
-            self,
-            task: Literal['regression', 'binary', 'multiclass'],
-            num_classes: Optional[int],
-            data_path: str | Path,
-            drug_featurizer: callable,
-            protein_featurizer: callable,
-            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
-            discard_intermediate: Optional[bool] = False,
-            query: Optional[str] = 'X2'
-    ):
-        df = pd.read_csv(
-            data_path,
-            engine='python',
-            header=0,
-            usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
-            dtype={
-                'X1': 'str',
-                'ID1': 'str',
-                'X2': 'str',
-                'ID2': 'str',
-                'Y': 'float32',
-                'U': 'str',
-            },
-        )
-        # Read the whole data table
-
-        # if 'ID1' in df:
-        #     self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
-        # if 'ID2' in df:
-        #     self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
-        #     self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
-        #     self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
-
-        # # train and eval mode data processing (fully labelled)
-        # if 'Y' in df.columns and df['Y'].notnull().all():
-        log.info(f"Processing data file: {data_path}")
-
-        # Forward-fill all non-label columns
-        df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
-
-        # TODO potentially allow running through the whole data validation process
-        # error = False
-
-        if 'Y' in df:
-            log.info(f"Validating labels (`Y`)...")
-            # TODO: check sklearn.utils.multiclass.check_classification_targets
-            match task:
-                case 'regression':
-                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
-                        f"""`Y` must be numeric for `regression` task,
-                        but it has {set(df['Y'].swifter.apply(type))}."""
-
-                case 'binary':
-                    if all(df['Y'].isin([0, 1])):
-                        assert not thresholds, \
-                            f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
-                            but still got `thresholds` ({thresholds}).
-                            Double check your choices of `task` and `thresholds`, and records in the `Y` column."""
-                    else:
-                        assert thresholds, \
-                            f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
-                            but it has {pd.unique(df['Y'])}.
-                            You may set `thresholds` to discretize continuous labels."""  # TODO print err idx instead
-
-                case 'multiclass':
-                    assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
-
-                    if all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)):
-                        assert not thresholds, \
-                            f"""`Y` is already non-negative integers for
-                            `multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
-                            Double check your choice of `task`, `thresholds` and records in the `Y` column."""
-                    else:
-                        assert thresholds, \
-                            f"""`Y` must be non-negative integers for
-                            `multiclass` (classification) 'task', but it has {pd.unique(df['Y'])}.
-                            You must set `thresholds` to discretize continuous labels."""  # TODO print err idx instead
-
-            if 'U' in df.columns:
-                units = df['U']
-            else:
-                units = None
-                log.warning("Units ('U') not in the data table. "
-                            "Assuming all labels to be discrete or in p-scale (-log10[M]).")
-
-            # Transform labels
-            df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
-                                      discard_intermediate=discard_intermediate)
-
-            # Filter out rows with a NaN in Y (missing values)
-            df.dropna(subset=['Y'], inplace=True)
-
-            match task:
-                case 'regression':
-                    df['Y'] = df['Y'].astype('float32')
-                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
-                        f"""`Y` must be numeric for `regression` task,
-                        but after transformation it still has {set(df['Y'].swifter.apply(type))}.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-                    # TODO print err idx instead
-                case 'binary':
-                    df['Y'] = df['Y'].astype('int')
-                    assert all(df['Y'].isin([0, 1])), \
-                        f"""`Y` must be 0 or 1 for `task=binary`,
-                        but after transformation it still has {pd.unique(df['Y'])}.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-                    # TODO print err idx instead
-                case 'multiclass':
-                    df['Y'] = df['Y'].astype('int')
-                    assert all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)), \
-                        f"""Y must be non-negative integers for `task=multiclass`
-                        but after transformation it still has {pd.unique(df['Y'])}.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-                    # TODO print err idx instead
-                    target_n_unique = df['Y'].nunique()
-                    assert target_n_unique == num_classes, \
-                        f"""You have set `num_classes` for `task=multiclass` to {num_classes},
-                        but after transformation Y still has {target_n_unique} unique labels.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-
-        log.info("Validating SMILES (`X1`)...")
-        df['X1_ERR'] = df['X1'].swifter.progress_bar(
-            desc="Validating SMILES...").apply(validate_seq_str, regex=SMILES_PAT)
-        if not df['X1_ERR'].isna().all():
-            raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
-        df['X1^'] = df['X1'].apply(rdkit_canonicalize)  # swifter
-
-        log.info("Validating FASTA (`X2`)...")
-        df['X2'] = df['X2'].str.upper()
-        df['X2_ERR'] = df['X2'].swifter.progress_bar(
-            desc="Validating FASTA...").apply(validate_seq_str, regex=FASTA_PAT)
-        if not df['X2_ERR'].isna().all():
-            raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
-
-        # FASTA/SMILES indices as query for retrieval metrics like enrichment factor and hit rate
-        if query:
-            df['ID^'] = LabelEncoder().fit_transform(df[query])
-
-        self.df = df
-        self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
-        self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
-
-    def __len__(self):
-        return len(self.df.index)
-
-    def __getitem__(self, i):
-        sample = self.df.loc[i]
-        return {
-            'N': i,
-            'X1': sample['X1'],
-            'X1^': self.drug_featurizer(sample['X1^']),
-            'ID1': sample.get('ID1'),
-            'X2': sample['X2'],
-            'X2^': self.protein_featurizer(sample['X2']),
-            'ID2': sample.get('ID2'),
-            'Y': sample.get('Y'),
-            'ID^': sample.get('ID^'),
-        }
-
-
-class DTIDataModule(LightningDataModule):
-    """
-    DTI DataModule
-
-    A DataModule implements 5 key methods:
-
-        def prepare_data(self):
-            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
-            # download data, pre-process, split, save to disk, etc.
-        def setup(self, stage):
-            # things to do on every process in DDP
-            # load data, set variables, etc.
-        def train_dataloader(self):
-            # return train dataloader
-        def val_dataloader(self):
-            # return validation dataloader
-        def test_dataloader(self):
-            # return test dataloader
-        def teardown(self):
-            # called on every process in DDP
-            # clean up after fit or test
-
-    This allows you to share a full dataset without explaining how to download,
-    split, transform and process the data.
-
-    Read the docs:
-        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
-    """
-
-    def __init__(
-            self,
-            task: Literal['regression', 'binary', 'multiclass'],
-            num_classes: Optional[int],
-            batch_size: int,
-            # train: bool,
-            drug_featurizer: callable,
-            protein_featurizer: callable,
-            collator: callable = collate_fn,
-            data_dir: str = "data/",
-            data_file: Optional[str] = None,
-            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
-            split: Optional[callable] = None,
-            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
-            discard_intermediate: Optional[bool] = False,
-            num_workers: int = 0,
-            pin_memory: bool = False,
-    ):
-        super().__init__()
-
-        self.train_data: Optional[Dataset] = None
-        self.val_data: Optional[Dataset] = None
-        self.test_data: Optional[Dataset] = None
-        self.predict_data: Optional[Dataset] = None
-        self.split = split
-        self.collator = collator
-        self.dataset = partial(
-            DTIDataset,
-            task=task,
-            num_classes=num_classes,
-            drug_featurizer=drug_featurizer,
-            protein_featurizer=protein_featurizer,
-            thresholds=thresholds,
-            discard_intermediate=discard_intermediate
-        )
-
-        # this line allows to access init params with 'self.hparams' ensures init params will be stored in ckpt
-        self.save_hyperparameters(logger=False)  # ignore=['split']
-
-    def prepare_data(self):
-        """
-        Download data if needed.
-        Do not use it to assign state (e.g., self.x = x).
-        """
-
-    def setup(self, stage: Optional[str] = None, encoding: str = None):
-        """
-        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
-        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
-        careful not to execute data splitting twice.
-        """
-        # load and split datasets only if not loaded in initialization
-        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
-            if self.hparams.train_val_test_split:
-                if len(self.hparams.train_val_test_split) != 3:
-                    raise ValueError('Length of `train_val_test_split` must be 3. '
-                                     'Set the second element to None for training without validation. '
-                                     'Set the third element to None for training without testing.')
-
-                self.train_data = self.hparams.train_val_test_split[0]
-                self.val_data = self.hparams.train_val_test_split[1]
-                self.test_data = self.hparams.train_val_test_split[2]
-
-                if all([self.hparams.data_file, self.split]):
-                    if all(isinstance(split, Number) or split is None
-                           for split in self.hparams.train_val_test_split):
-                        split_data = self.split(
-                            dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
-                            lengths=[split for split in self.hparams.train_val_test_split if split is not None]
-                        )
-                        for dataset in ['train_data', 'val_data', 'test_data']:
-                            if getattr(self, dataset) is not None:
-                                setattr(self, dataset, split_data.pop(0))
-
-                    else:
-                        raise ValueError('`train_val_test_split` must be a sequence numbers or None'
-                                         '(float for percentages and int for sample numbers) '
-                                         'if both `data_file` and `split` have been specified.')
-
-                elif (all(isinstance(split, str) or split is None
-                          for split in self.hparams.train_val_test_split)
-                      and not any([self.hparams.data_file, self.split])):
-                    for dataset in ['train_data', 'val_data', 'test_data']:
-                        if getattr(self, dataset) is not None:
-                            data_path = Path(getattr(self, dataset))
-                            if not data_path.is_absolute():
-                                data_path = Path(self.hparams.data_dir, data_path)
-                            setattr(self, dataset, self.dataset(data_path=data_path))
-
-                else:
-                    raise ValueError('For training, you must specify either all of `data_file`, `split`, '
-                                     'and `train_val_test_split` as a sequence of numbers or '
-                                     'solely `train_val_test_split` as a sequence of data file paths.')
-
-            elif self.hparams.data_file and not any([self.split, self.hparams.train_val_test_split]):
-                data_path = Path(self.hparams.data_file)
-                if not data_path.is_absolute():
-                    data_path = Path(self.hparams.data_dir, data_path)
-                self.test_data = self.predict_data = self.dataset(data_path=data_path)
-
-            else:
-                raise ValueError("For training, you must specify `train_val_test_split`. "
-                                 "For testing/predicting, you must specify only `data_file` without "
-                                 "`train_val_test_split` or `split`.")
-
-    def train_dataloader(self):
-        return DataLoader(
-            dataset=self.train_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.train_data,
-                batch_size=self.hparams.batch_size,
-                # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
-                # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
-                drop_last=True,
-                shuffle=True,
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=True,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def val_dataloader(self):
-        return DataLoader(
-            dataset=self.val_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.val_data,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def test_dataloader(self):
-        return DataLoader(
-            dataset=self.test_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.test_data,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def predict_dataloader(self):
-        return DataLoader(
-            dataset=self.predict_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.predict_data,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def teardown(self, stage: Optional[str] = None):
-        """Clean up after fit or test."""
-        pass
-
-    def state_dict(self):
-        """Extra things to save to checkpoint."""
-        return {}
-
-    def load_state_dict(self, state_dict: Dict[str, Any]):
-        """Things to do when loading checkpoint."""
-        pass
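For context on the loader just removed: DTIDataset expects a CSV with columns X1 (SMILES), ID1, X2 (FASTA), ID2, Y (label), and optionally U (unit); without a `U` column it logs a warning and assumes labels are discrete or on the p-scale (-log10[M]). A small sketch of such an input, with invented molecules, sequence, and values, read with the same `usecols`/`dtype` arguments as the deleted code:

import io

import pandas as pd

# Hypothetical two-row input in the schema dti.py reads; no 'U' column, so
# the 7.2/5.1 labels would be treated as p-scale values by the deleted code.
csv = io.StringIO(
    "X1,ID1,X2,ID2,Y\n"
    "CCO,ethanol,MKTAYIAKQRQISFVK,prot1,7.2\n"
    "c1ccccc1,benzene,MKTAYIAKQRQISFVK,prot1,5.1\n"
)
df = pd.read_csv(
    csv,
    header=0,
    usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
    dtype={'X1': 'str', 'ID1': 'str', 'X2': 'str', 'ID2': 'str',
           'Y': 'float32', 'U': 'str'},
)
print(df.dtypes)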
deepscreen/data/dti.py.bak
DELETED
@@ -1,369 +0,0 @@
-from functools import partial
-from numbers import Number
-from pathlib import Path
-from typing import Any, Dict, Optional, Sequence, Union, Literal
-
-from lightning import LightningDataModule
-import pandas as pd
-from sklearn.preprocessing import LabelEncoder
-from torch.utils.data import Dataset, DataLoader
-
-from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
-from deepscreen.utils import get_logger
-
-log = get_logger(__name__)
-
-
-# TODO: save a list of corrupted records
-
-
-class DTIDataset(Dataset):
-    def __init__(
-            self,
-            task: Literal['regression', 'binary', 'multiclass'],
-            n_class: Optional[int],
-            data_path: str | Path,
-            drug_featurizer: callable,
-            protein_featurizer: callable,
-            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
-            discard_intermediate: Optional[bool] = False,
-    ):
-        df = pd.read_csv(
-            data_path,
-            engine='python',
-            header=0,
-            usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
-            dtype={
-                'X1': 'str',
-                'ID1': 'str',
-                'X2': 'str',
-                'ID2': 'str',
-                'Y': 'float32',
-                'U': 'str',
-            },
-        )
-        # Read the whole data table
-
-        # if 'ID1' in df:
-        #     self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
-        # if 'ID2' in df:
-        #     self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
-        #     self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
-        #     self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
-
-        # # train and eval mode data processing (fully labelled)
-        # if 'Y' in df.columns and df['Y'].notnull().all():
-        log.info(f"Processing data file: {data_path}")
-
-        # Forward-fill all non-label columns
-        df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
-
-        if 'Y' in df:
-            log.info(f"Performing pre-transformation target validation.")
-            # TODO: check sklearn.utils.multiclass.check_classification_targets
-            match task:
-                case 'regression':
-                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
-                        f"""`Y` must be numeric for `regression` task,
-                        but it has {set(df['Y'].apply(type))}."""
-
-                case 'binary':
-                    if all(df['Y'].isin([0, 1])):
-                        assert not thresholds, \
-                            f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
-                            but still got `thresholds` {thresholds}.
-                            Double check your choices of `task` and `thresholds` and records in the `Y` column."""
-                    else:
-                        assert thresholds, \
-                            f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
-                            but it has {pd.unique(df['Y'])}.
-                            You must set `thresholds` to discretize continuous labels."""
-
-                case 'multiclass':
-                    assert n_class >= 3, f'`n_class` for `multiclass` (classification) `task` must be at least 3.'
-
-                    if all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)):
-                        assert not thresholds, \
-                            f"""`Y` is already non-negative integers for
-                            `multiclass` (classification) `task`, but still got `thresholds` {thresholds}.
-                            Double check your choice of `task`, `thresholds` and records in the `Y` column."""
-                    else:
-                        assert thresholds, \
-                            f"""`Y` must be non-negative integers for
-                            `multiclass` (classification) 'task', but it has {pd.unique(df['Y'])}.
-                            You must set `thresholds` to discretize continuous labels."""
-
-            if 'U' in df.columns:
-                units = df['U']
-            else:
-                units = None
-                log.warning("Units ('U') not in the data table. "
-                            "Assuming all labels to be discrete or in p-scale (-log10[M]).")
-
-            # Transform labels
-            df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
-                                      discard_intermediate=discard_intermediate)
-
-            # Filter out rows with a NaN in Y (missing values)
-            df.dropna(subset=['Y'], inplace=True)
-
-            log.info(f"Performing post-transformation target validation.")
-            match task:
-                case 'regression':
-                    df['Y'] = df['Y'].astype('float32')
-                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
-                        f"""`Y` must be numeric for `regression` task,
-                        but after transformation it still has {set(df['Y'].apply(type))}.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-
-                case 'binary':
-                    df['Y'] = df['Y'].astype('int')
-                    assert all(df['Y'].isin([0, 1])), \
-                        f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
-                        but after transformation it still has {pd.unique(df['Y'])}.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-
-                case 'multiclass':
-                    df['Y'] = df['Y'].astype('int')
-                    assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
-                        f"""Y must be non-negative integers for task `multiclass` (classification)
-                        but after transformation it still has {pd.unique(df['Y'])}.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-
-                    target_n_unique = df['Y'].nunique()
-                    assert target_n_unique == n_class, \
-                        f"""You have set `n_class` for `multiclass` (classification) `task` to {n_class},
-                        but after transformation Y still has {target_n_unique} unique labels.
-                        Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-
-        # Indexed protein/FASTA for retrieval metrics
-        df['IDX'] = LabelEncoder().fit_transform(df['X2'])
-
-        self.df = df
-        self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
-        self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
-
-    def __len__(self):
-        return len(self.df.index)
-
-    def __getitem__(self, i):
-        sample = self.df.loc[i]
-        return {
-            'N': i,
-            'X1': self.drug_featurizer(sample['X1']),
-            'ID1': sample.get('ID1', sample['X1']),
-            'X2': self.protein_featurizer(sample['X2']),
-            'ID2': sample.get('ID2', sample['X2']),
-            'Y': sample.get('Y'),
-            'IDX': sample['IDX'],
-        }
-
-
-class DTIDataModule(LightningDataModule):
-    """
-    DTI DataModule
-
-    A DataModule implements 5 key methods:
-
-        def prepare_data(self):
-            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
-            # download data, pre-process, split, save to disk, etc.
-        def setup(self, stage):
-            # things to do on every process in DDP
-            # load data, set variables, etc.
-        def train_dataloader(self):
-            # return train dataloader
-        def val_dataloader(self):
-            # return validation dataloader
-        def test_dataloader(self):
-            # return test dataloader
-        def teardown(self):
-            # called on every process in DDP
-            # clean up after fit or test
-
-    This allows you to share a full dataset without explaining how to download,
-    split, transform and process the data.
-
-    Read the docs:
-        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
-    """
-
-    def __init__(
-            self,
-            task: Literal['regression', 'binary', 'multiclass'],
-            n_class: Optional[int],
-            batch_size: int,
-            # train: bool,
-            drug_featurizer: callable,
-            protein_featurizer: callable,
-            collator: callable = collate_fn,
-            data_dir: str = "data/",
-            data_file: Optional[str] = None,
-            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
-            split: Optional[callable] = None,
-            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
-            discard_intermediate: Optional[bool] = False,
-            num_workers: int = 0,
-            pin_memory: bool = False,
-    ):
-        super().__init__()
-
-        self.train_data: Optional[Dataset] = None
-        self.val_data: Optional[Dataset] = None
-        self.test_data: Optional[Dataset] = None
-        self.predict_data: Optional[Dataset] = None
-        self.split = split
-        self.collator = collator
-        self.dataset = partial(
-            DTIDataset,
-            task=task,
-            n_class=n_class,
-            drug_featurizer=drug_featurizer,
-            protein_featurizer=protein_featurizer,
-            thresholds=thresholds,
-            discard_intermediate=discard_intermediate
-        )
-
-        if train_val_test_split:
-            # TODO test behavior for trainer.test and predict when this is passed
-            if len(train_val_test_split) not in [2, 3]:
-                raise ValueError('Length of `train_val_test_split` must be 2 (for training without testing) or 3.')
-            if all([data_file, split]):
-                if all(isinstance(split, Number) for split in train_val_test_split):
-                    pass
-                else:
-                    raise ValueError('`train_val_test_split` must be a sequence numbers '
-                                     '(float for percentages and int for sample numbers) '
-                                     'if both `data_file` and `split` have been specified.')
-            elif all(isinstance(split, str) for split in train_val_test_split) and not any([data_file, split]):
-                split_paths = []
-                for split in train_val_test_split:
-                    split = Path(split)
-                    if not split.is_absolute():
-                        split = Path(data_dir, split)
-                    split_paths.append(split)
-
-                self.train_data = self.dataset(data_path=split_paths[0])
-                self.val_data = self.dataset(data_path=split_paths[1])
-                if len(train_val_test_split) == 3:
-                    self.test_data = self.dataset(data_path=split_paths[2])
-            else:
-                raise ValueError('For training, you must specify either `data_file`, `split`, '
-                                 'and `train_val_test_split` as a sequence of numbers or '
-                                 'solely `train_val_test_split` as a sequence of data file paths.')
-
-        elif data_file and not any([split, train_val_test_split]):
-            data_file = Path(data_file)
-            if not data_file.is_absolute():
-                data_file = Path(data_dir, data_file)
-            self.test_data = self.predict_data = self.dataset(data_path=data_file)
-        else:
-            raise ValueError("For training, you must specify `train_val_test_split`. "
-                             "For testing/predicting, you must specify only `data_file` without "
-                             "`train_val_test_split` or `split`.")
-
-        # this line allows to access init params with 'self.hparams' attribute
-        # also ensures init params will be stored in ckpt
-        self.save_hyperparameters(logger=False)  # ignore=['split']
-
-    def prepare_data(self):
-        """
-        Download data if needed.
-        Do not use it to assign state (e.g., self.x = x).
-        """
-
-    def setup(self, stage: Optional[str] = None, encoding: str = None):
-        """
-        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
-        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
-        careful not to execute data splitting twice.
-        """
-        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
-        # load and split datasets only if not loaded in initialization
-        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
-            self.train_data, self.val_data, self.test_data = self.split(
-                dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
-                lengths=self.hparams.train_val_test_split
-            )
-
-    def train_dataloader(self):
-        return DataLoader(
-            dataset=self.train_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.train_data,
-                batch_size=self.hparams.batch_size,
-                # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
-                # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
-                drop_last=True,
-                shuffle=True,
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=True,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def val_dataloader(self):
-        return DataLoader(
-            dataset=self.val_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.val_data,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def test_dataloader(self):
-        return DataLoader(
-            dataset=self.test_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.test_data,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def predict_dataloader(self):
-        return DataLoader(
-            dataset=self.predict_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.predict_data,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=self.collator,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def teardown(self, stage: Optional[str] = None):
-        """Clean up after fit or test."""
-        pass
-
-    def state_dict(self):
-        """Extra things to save to checkpoint."""
-        return {}
-
-    def load_state_dict(self, state_dict: Dict[str, Any]):
-        """Things to do when loading checkpoint."""
-        pass
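The `thresholds` argument in both versions of the dataset exists to discretize continuous affinity labels so they pass the post-transformation checks. The real `deepscreen.data.utils.label_transform` is not part of this diff, so the sketch below only illustrates the idea under an assumed single-cutoff semantics, with an invented cutoff value:

import pandas as pd

# Illustrative stand-in for the `thresholds` behavior referenced above, NOT
# the actual label_transform: continuous p-scale labels are binarized at a
# cutoff so they satisfy the `binary` task's 0/1 validation.
y = pd.Series([5.0, 6.9, 7.0, 8.3], dtype='float32')
threshold = 7.0                       # hypothetical activity cutoff
y_binary = (y >= threshold).astype('int')
print(y_binary.tolist())              # [0, 0, 1, 1]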
deepscreen/data/dti_datamodule.py
DELETED
@@ -1,314 +0,0 @@
|
|
1 |
-
# from itertools import product
|
2 |
-
from collections import namedtuple
|
3 |
-
from numbers import Number
|
4 |
-
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
5 |
-
|
6 |
-
# import numpy as np
|
7 |
-
import pandas as pd
|
8 |
-
from lightning import LightningDataModule
|
9 |
-
from torch.utils.data import Dataset, DataLoader, random_split
|
10 |
-
|
11 |
-
from deepscreen.data.utils.label import label_transform
|
12 |
-
from deepscreen.data.utils.collator import collate_fn
|
13 |
-
from deepscreen.data.utils.sampler import SafeBatchSampler
|
14 |
-
|
15 |
-
|
16 |
-
class DTIDataset(Dataset):
|
17 |
-
def __init__(
|
18 |
-
self,
|
19 |
-
task: Literal['regression', 'binary', 'multiclass'],
|
20 |
-
n_classes: Optional[int],
|
21 |
-
data_dir: str,
|
22 |
-
dataset_name: str,
|
23 |
-
drug_featurizer: callable,
|
24 |
-
protein_featurizer: callable,
|
25 |
-
thresholds: Optional[Union[Number, Sequence[Number]]] = None,
|
26 |
-
discard_intermediate: Optional[bool] = False,
|
27 |
-
):
|
28 |
-
df = pd.read_csv(
|
29 |
-
f'{data_dir}{dataset_name}.csv',
|
30 |
-
header=0, sep=',',
|
31 |
-
usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
|
32 |
-
dtype={'X1': 'str', 'ID1': 'str',
|
33 |
-
'X2': 'str', 'ID2': 'str',
|
34 |
-
'Y': 'float32', 'U': 'str'}
|
35 |
-
)
|
36 |
-
# if 'ID1' in df:
|
37 |
-
# self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
|
38 |
-
# if 'ID2' in df:
|
39 |
-
# self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
|
40 |
-
# self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
|
41 |
-
# self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
|
42 |
-
|
43 |
-
# # train and eval mode data processing (fully labelled)
|
44 |
-
# if 'Y' in df.columns and df['Y'].notnull().all():
|
45 |
-
|
46 |
-
# Forward-fill all non-label columns
|
47 |
-
df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
|
48 |
-
|
49 |
-
if 'Y' in df:
|
50 |
-
# Transform labels
|
51 |
-
df['Y'] = df['Y'].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
|
52 |
-
discard_intermediate=discard_intermediate).astype('float32')
|
53 |
-
|
54 |
-
# Filter out rows with a NaN in Y (missing values)
|
55 |
-
df.dropna(subset=['Y'], inplace=True)
|
56 |
-
|
57 |
-
# Validate target labels for training/testing
|
58 |
-
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
59 |
-
match task:
|
60 |
-
case 'regression':
|
61 |
-
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
62 |
-
f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
|
63 |
-
case 'binary':
|
64 |
-
assert all(df['Y'].isin([0, 1])), \
|
65 |
-
f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
|
66 |
-
"\nYou may set `thresholds` to discretize continuous labels."
|
67 |
-
case 'multiclass':
|
68 |
-
assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
|
69 |
-
assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
|
70 |
-
f"Y for task `multiclass` (classification) must be non-negative integers, " \
|
71 |
-
f"but Y got {pd.unique(df['Y'])}." \
|
72 |
-
"\nYou may set `thresholds` to discretize continuous labels."
|
73 |
-
target_n_unique = df['Y'].nunique()
|
74 |
-
assert target_n_unique == n_classes, \
|
75 |
-
f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
|
76 |
-
f"but Y has {target_n_unique} unique labels."
|
77 |
-
|
78 |
-
# # Predict mode data processing
|
79 |
-
# else:
|
80 |
-
# df = pd.DataFrame(product(df['X1'].dropna(), df['X2'].dropna()), columns=['X1', 'X2'])
|
81 |
-
# if hasattr(self, "x1_to_id1"):
|
82 |
-
# df['ID1'] = df['X1'].map(self.x1_to_id1)
|
83 |
-
# if hasattr(self, "x1_to_id2"):
|
84 |
-
# df['ID2'] = df['X2'].map(self.x2_to_id2)
|
85 |
-
|
86 |
-
# self.smiles = df['X1']
|
87 |
-
# self.fasta = df['X2']
|
88 |
-
# self.smiles_ids = df.get('ID1', df['X1'])
|
89 |
-
# self.fasta_ids = df.get('ID2', df['X2'])
|
90 |
-
# self.labels = df.get('Y', None)
|
91 |
-
|
92 |
-
self.df = df
|
93 |
-
self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
|
94 |
-
self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
|
95 |
-
self.n_classes = df['Y'].nunique()
|
96 |
-
# self.train = train
|
97 |
-
|
98 |
-
self.Data = namedtuple('Data', ['FT1', 'ID1', 'FT2', 'ID2', 'Y'])
|
99 |
-
|
100 |
-
def __len__(self):
|
101 |
-
return len(self.df.index)
|
102 |
-
|
103 |
-
def __getitem__(self, idx):
|
104 |
-
sample = self.df.loc[idx]
|
105 |
-
return self.Data(
|
106 |
-
FT1=self.drug_featurizer(sample['X1']),
|
107 |
-
ID1=sample.get('ID1', sample['X1']),
|
108 |
-
FT2=self.protein_featurizer(sample['X2']),
|
109 |
-
ID2=sample.get('ID2', sample['X2']),
|
110 |
-
Y=sample.get('Y')
|
111 |
-
)
|
112 |
-
# {
|
113 |
-
# 'FT1': self.drug_featurizer(sample['X1']),
|
114 |
-
# 'ID1': sample.get('ID1', sample['X1']),
|
115 |
-
# 'FT2': self.protein_featurizer(sample['X2']),
|
116 |
-
# 'ID2': sample.get('ID2', sample['X2']),
|
117 |
-
# 'Y': sample.get('Y')
|
118 |
-
# }
|
119 |
-
# if self.train:
|
120 |
-
# sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx]), self.labels[idx]
|
121 |
-
# sample = {
|
122 |
-
# 'FT1': self.drug_featurizer(self.smiles[idx]),
|
123 |
-
# 'FT2': self.protein_featurizer(self.fasta[idx]),
|
124 |
-
# 'ID2': self.smiles_ids[idx],
|
125 |
-
# }
|
126 |
-
# else:
|
127 |
-
# # sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx])
|
128 |
-
# sample = {
|
129 |
-
# 'FT1': self.drug_featurizer(self.smiles[idx]),
|
130 |
-
# 'FT2': self.protein_featurizer(self.fasta[idx]),
|
131 |
-
# }
|
132 |
-
#
|
133 |
-
# if all([True if n is not None else False for n in sample.values()]):
|
134 |
-
# return sample # | {
|
135 |
-
# # 'ID1': self.smiles_ids[idx],
|
136 |
-
# # 'X1': self.drug_featurizer(self.smiles[idx]),
|
137 |
-
# # 'ID2': self.fasta_ids[idx],
|
138 |
-
# # 'X2': self.protein_featurizer(self.fasta[idx]),
|
139 |
-
# # }
|
140 |
-
# else:
|
141 |
-
# return self.__getitem__(np.random.randint(0, self.size))
|
142 |
-
|
143 |
-
|
144 |
-
class DTIdatamodule(LightningDataModule):
|
145 |
-
"""
|
146 |
-
DTI DataModule
|
147 |
-
|
148 |
-
A DataModule implements 5 key methods:
|
149 |
-
|
150 |
-
def prepare_data(self):
|
151 |
-
# things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
|
152 |
-
# download data, pre-process, split, save to disk, etc.
|
153 |
-
def setup(self, stage):
|
154 |
-
# things to do on every process in DDP
|
155 |
-
# load data, set variables, etc.
|
156 |
-
def train_dataloader(self):
|
157 |
-
# return train dataloader
|
158 |
-
def val_dataloader(self):
|
159 |
-
# return validation dataloader
|
160 |
-
def test_dataloader(self):
|
161 |
-
# return test dataloader
|
162 |
-
def teardown(self):
|
163 |
-
# called on every process in DDP
|
164 |
-
# clean up after fit or test
|
165 |
-
|
166 |
-
This allows you to share a full dataset without explaining how to download,
|
167 |
-
split, transform and process the data.
|
168 |
-
|
169 |
-
Read the docs:
|
170 |
-
https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
|
171 |
-
"""
|
172 |
-
|
173 |
-
def __init__(
|
174 |
-
self,
|
175 |
-
task: Literal['regression', 'binary', 'multiclass'],
|
176 |
-
n_classes: Optional[int],
|
177 |
-
train: bool,
|
178 |
-
drug_featurizer: callable,
|
179 |
-
-            protein_featurizer: callable,
-            batch_size: int,
-            train_val_test_split: Optional[Sequence[Number]],
-            num_workers: int = 0,
-            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
-            pin_memory: bool = False,
-            data_dir: str = "data/",
-            dataset_name: Optional[str] = None,
-            split: Optional[callable] = random_split,
-    ):
-        super().__init__()
-
-        # this line allows access to init params via the `self.hparams` attribute
-        # and also ensures init params will be stored in the ckpt
-        self.save_hyperparameters(logger=False)
-
-        # data processing
-        self.data_split = split
-
-        self.data_train: Optional[Dataset] = None
-        self.data_val: Optional[Dataset] = None
-        self.data_test: Optional[Dataset] = None
-        self.data_predict: Optional[Dataset] = None
-
-    def prepare_data(self):
-        """
-        Download data if needed.
-        Do not use it to assign state (e.g., self.x = x).
-        """
-
-    def setup(self, stage: Optional[str] = None, encoding: str = None):
-        """
-        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
-        This method is called by Lightning with both `trainer.fit()` and `trainer.test()`, so be
-        careful not to execute data splitting twice.
-        """
-        # TODO: test SafeBatchSampler (which skips samples with any None without introducing a variable batch size)
-        # load and split datasets only if they were not loaded during initialization
-        if not any([self.data_train, self.data_val, self.data_test, self.data_predict]):
-            dataset = DTIDataset(
-                task=self.hparams.task,
-                n_classes=self.hparams.n_classes,
-                data_dir=self.hparams.data_dir,
-                drug_featurizer=self.hparams.drug_featurizer,
-                protein_featurizer=self.hparams.protein_featurizer,
-                dataset_name=self.hparams.dataset_name,
-                thresholds=self.hparams.thresholds,
-            )
-
-            if self.hparams.train:
-                self.data_train, self.data_val, self.data_test = self.data_split(
-                    dataset=dataset,
-                    lengths=self.hparams.train_val_test_split
-                )
-            else:
-                self.data_test = self.data_predict = dataset
-
-    def train_dataloader(self):
-        return DataLoader(
-            dataset=self.data_train,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.data_train,
-                batch_size=self.hparams.batch_size,
-                drop_last=True,
-                shuffle=True,
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=True,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def val_dataloader(self):
-        return DataLoader(
-            dataset=self.data_val,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.data_val,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False,
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def test_dataloader(self):
-        return DataLoader(
-            dataset=self.data_test,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.data_test,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False,
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def predict_dataloader(self):
-        return DataLoader(
-            dataset=self.data_predict,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.data_predict,
-                batch_size=self.hparams.batch_size,
-                drop_last=False,
-                shuffle=False,
-            ),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def teardown(self, stage: Optional[str] = None):
-        """Clean up after fit or test."""
-        pass
-
-    def state_dict(self):
-        """Extra things to save to checkpoint."""
-        return {}
-
-    def load_state_dict(self, state_dict: Dict[str, Any]):
-        """Things to do when loading checkpoint."""
-        pass
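
Every dataloader above routes through SafeBatchSampler, which the TODO describes as skipping samples with any None field without introducing a variable batch size. Below is a minimal sketch of that idea; it is an assumption about the deleted deepscreen.data.utils implementation, not its actual code:

import random
from torch.utils.data import Sampler

class SafeBatchSamplerSketch(Sampler):
    """Yields fixed-size batches of indices, skipping samples that are None."""
    def __init__(self, data_source, batch_size, shuffle=False, drop_last=False):
        self.data_source = data_source
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last

    def __iter__(self):
        indices = list(range(len(self.data_source)))
        if self.shuffle:
            random.shuffle(indices)
        batch = []
        for idx in indices:
            # Probe the sample; skip it if featurization failed upstream.
            if self.data_source[idx] is None:
                continue
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []
        if batch and not self.drop_last:
            yield batch

    def __len__(self):
        # Upper bound; the true count depends on how many samples are valid.
        return len(self.data_source) // self.batch_size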
deepscreen/data/entity_datamodule.py
DELETED
@@ -1,167 +0,0 @@
-from numbers import Number
-from pathlib import Path
-from typing import Any, Dict, Optional, Sequence, Type, Union
-
-from lightning import LightningDataModule
-from sklearn.base import TransformerMixin
-from torch.utils.data import Dataset, DataLoader
-
-from deepscreen.data.utils import collate_fn, SafeBatchSampler
-from deepscreen.data.utils.dataset import BaseEntityDataset
-
-
-class EntityDataModule(LightningDataModule):
-    """
-    def prepare_data(self):
-        # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
-        # download data, pre-process, split, save to disk, etc.
-    def setup(self, stage):
-        # things to do on every process in DDP
-        # load data, set variables, etc.
-    def train_dataloader(self):
-        # return train dataloader
-    def val_dataloader(self):
-        # return validation dataloader
-    def test_dataloader(self):
-        # return test dataloader
-    def teardown(self):
-        # called on every process in DDP
-        # clean up after fit or test
-    """
-    def __init__(
-            self,
-            dataset: type[BaseEntityDataset],
-            transformer: type[TransformerMixin],
-            train: bool,
-            batch_size: int,
-            data_dir: str = "data/",
-            data_file: Optional[str] = None,
-            train_val_test_split: Optional[Union[Sequence[Number], Sequence[str]]] = None,
-            split: Optional[callable] = None,
-            num_workers: int = 0,
-            pin_memory: bool = False,
-    ):
-        super().__init__()
-
-        # data processing
-        self.split = split
-        self.train_data: Optional[Dataset] = None
-        self.val_data: Optional[Dataset] = None
-        self.test_data: Optional[Dataset] = None
-        self.predict_data: Optional[Dataset] = None
-
-        if train:
-            if all([data_file, split]):
-                if all(isinstance(length, Number) for length in train_val_test_split):
-                    pass
-                else:
-                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
-                                     '(float for percentages and int for sample numbers) if '
-                                     '`data_file` and `split` have been specified.')
-            elif all(isinstance(name, str) for name in train_val_test_split) and not any([data_file, split]):
-                self.train_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
-                self.val_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
-                self.test_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
-            else:
-                raise ValueError('For training (train=True), you must specify either '
                                 '`data_file` and `split` with a `train_val_test_split` of 3 numbers, or '
-                                 'solely a `train_val_test_split` of 3 data file names.')
-        else:
-            if data_file and not any([split, train_val_test_split]):
-                self.test_data = self.predict_data = dataset(dataset_path=str(Path(data_dir) / data_file))
-            else:
-                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
-                                 "`train_val_test_split` or `split`")
-
-        # this line allows access to init params via the `self.hparams` attribute
-        # and also ensures init params will be stored in the ckpt
-        self.save_hyperparameters(logger=False)
-
-    def prepare_data(self):
-        """
-        Download data if needed.
-        Do not use it to assign state (e.g., self.x = x).
-        """
-
-    def setup(self, stage: Optional[str] = None, encoding: str = None):
-        """
-        Load data. Set variables: `self.train_data`, `self.val_data`, `self.test_data`.
-        This method is called by Lightning with both `trainer.fit()` and `trainer.test()`, so be
-        careful not to execute data splitting twice.
-        """
-        # TODO: test SafeBatchSampler (which skips samples with any None without introducing a variable batch size)
-        # TODO: find a way to apply transformer.fit_transform only to train and transformer.transform only to val, test
-        # load and split datasets only if they were not loaded during initialization
-        if not any([self.train_data, self.test_data, self.val_data, self.predict_data]):
-            self.train_data, self.val_data, self.test_data = self.split(
-                dataset=self.hparams.dataset(
-                    dataset_path=str(Path(self.hparams.data_dir) / self.hparams.data_file)),
-                lengths=self.hparams.train_val_test_split
-            )
-
-    def train_dataloader(self):
-        return DataLoader(
-            dataset=self.train_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.train_data,
-                batch_size=self.hparams.batch_size,
-                shuffle=True),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=True,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def val_dataloader(self):
-        return DataLoader(
-            dataset=self.val_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.val_data,
-                batch_size=self.hparams.batch_size,
-                shuffle=False),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def test_dataloader(self):
-        return DataLoader(
-            dataset=self.test_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.test_data,
-                batch_size=self.hparams.batch_size,
-                shuffle=False),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def predict_dataloader(self):
-        return DataLoader(
-            dataset=self.predict_data,
-            batch_sampler=SafeBatchSampler(
-                data_source=self.predict_data,
-                batch_size=self.hparams.batch_size,
-                shuffle=False),
-            # batch_size=self.hparams.batch_size,
-            # shuffle=False,
-            num_workers=self.hparams.num_workers,
-            pin_memory=self.hparams.pin_memory,
-            collate_fn=collate_fn,
-            persistent_workers=True if self.hparams.num_workers > 0 else False
-        )
-
-    def teardown(self, stage: Optional[str] = None):
-        """Clean up after fit or test."""
-        pass
-
-    def state_dict(self):
-        """Extra things to save to checkpoint."""
-        return {}
-
-    def load_state_dict(self, state_dict: Dict[str, Any]):
-        """Things to do when loading checkpoint."""
-        pass
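
For reference, a sketch of the two constructor configurations the validation logic above accepts for training; the dataset class, transformer, and file names are placeholders for illustration, since the real ones lived elsewhere in this repo:

from torch.utils.data import random_split

# Mode 1: a single data file plus a split callable and three numeric lengths.
dm_random = EntityDataModule(
    dataset=MyEntityDataset, transformer=MyTransformer, train=True,
    batch_size=32, data_file="all.csv",
    train_val_test_split=(0.8, 0.1, 0.1), split=random_split,
)

# Mode 2: three pre-split data files, with no `data_file` or `split`.
dm_fixed = EntityDataModule(
    dataset=MyEntityDataset, transformer=MyTransformer, train=True,
    batch_size=32,
    train_val_test_split=("train.csv", "val.csv", "test.csv"),
)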
deepscreen/data/featurizers/__init__.py
DELETED
File without changes

deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (191 Bytes)

deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc
DELETED
Binary file (5.6 kB)

deepscreen/data/featurizers/__pycache__/fcs.cpython-311.pyc
DELETED
Binary file (4.17 kB)

deepscreen/data/featurizers/__pycache__/graph.cpython-311.pyc
DELETED
Binary file (7.21 kB)

deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc
DELETED
Binary file (14.7 kB)
deepscreen/data/featurizers/categorical.py
DELETED
@@ -1,86 +0,0 @@
-import numpy as np
-
-# Sets of KNOWN characters in SMILES and FASTA sequences
-# Use an ordered sequence instead of a set to preserve character order
-SMILES_VOCAB = ('#', '%', ')', '(', '+', '-', '.', '1', '0', '3', '2', '5', '4',
-                '7', '6', '9', '8', '=', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I',
-                'H', 'K', 'M', 'L', 'O', 'N', 'P', 'S', 'R', 'U', 'T', 'W', 'V',
-                'Y', '[', 'Z', ']', '_', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i',
-                'h', 'm', 'l', 'o', 'n', 's', 'r', 'u', 't', 'y')
-FASTA_VOCAB = ('A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'O',
-               'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z')
-
-# Check uniqueness, create character-index dicts, and add '?' for unknown characters as index 0
-assert len(SMILES_VOCAB) == len(set(SMILES_VOCAB)), 'SMILES_VOCAB has duplicate characters.'
-SMILES_CHARSET_IDX = {character: index + 1 for index, character in enumerate(SMILES_VOCAB)} | {'?': 0}
-
-assert len(FASTA_VOCAB) == len(set(FASTA_VOCAB)), 'FASTA_VOCAB has duplicate characters.'
-FASTA_CHARSET_IDX = {character: index + 1 for index, character in enumerate(FASTA_VOCAB)} | {'?': 0}
-
-
-def sequence_to_onehot(sequence: str, charset, max_sequence_length: int):
-    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
-    charset_idx = {character: index + 1 for index, character in enumerate(charset)} | {'?': 0}
-
-    onehot = np.zeros((max_sequence_length, len(charset_idx)), dtype=int)
-    for index, character in enumerate(sequence[:max_sequence_length]):
-        onehot[index, charset_idx.get(character, 0)] = 1
-
-    return onehot.transpose()
-
-
-def sequence_to_label(sequence: str, charset, max_sequence_length: int):
-    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
-    charset_idx = {character: index + 1 for index, character in enumerate(charset)} | {'?': 0}
-
-    label = np.zeros(max_sequence_length, dtype=int)
-    for index, character in enumerate(sequence[:max_sequence_length]):
-        label[index] = charset_idx.get(character, 0)
-
-    return label
-
-
-def smiles_to_onehot(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
-    return sequence_to_onehot(smiles, smiles_charset, max_sequence_length)
-
-
-def smiles_to_label(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
-    return sequence_to_label(smiles, smiles_charset, max_sequence_length)
-
-
-def fasta_to_onehot(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
-    return sequence_to_onehot(fasta, fasta_charset, max_sequence_length)
-
-
-def fasta_to_label(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
-    return sequence_to_label(fasta, fasta_charset, max_sequence_length)
-
-
-def one_of_k_encoding(x, allowable_set):
-    if x not in allowable_set:
-        raise ValueError("input {0} not in allowable set {1}".format(x, allowable_set))
-    return list(map(lambda s: x == s, allowable_set))
-
-
-def one_of_k_encoding_unk(x, allowable_set):
-    """Maps inputs not in the allowable set to the last element."""
-    if x not in allowable_set:
-        x = allowable_set[-1]
-    return list(map(lambda s: x == s, allowable_set))
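
A quick illustration of the two encoders above (only numpy is required):

label = sequence_to_label("CCO", SMILES_VOCAB, max_sequence_length=5)
# -> array([20, 20, 31, 0, 0]): indices of 'C', 'C', 'O', then zero padding

onehot = sequence_to_onehot("CCO", SMILES_VOCAB, max_sequence_length=5)
# -> shape (len(SMILES_VOCAB) + 1, 5): one column per sequence position,
#    with a single 1 per occupied column (the transpose of position-major)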
deepscreen/data/featurizers/chem.py
DELETED
@@ -1,48 +0,0 @@
-"""
-Mainly adapted from MolMap:
-https://github.com/shenwanxiang/bidd-molmap/tree/master/molmap/feature/fingerprint
-"""
-import numpy as np
-from rdkit import Chem, DataStructs
-from rdkit.Chem import AllChem
-from rdkit.Chem.Fingerprints import FingerprintMols
-from rdkit.Chem.rdReducedGraphs import GetErGFingerprint
-
-from deepscreen import get_logger
-
-log = get_logger(__name__)
-
-
-def smiles_to_erg(smiles):
-    try:
-        mol = Chem.MolFromSmiles(smiles)
-        features = np.array(GetErGFingerprint(mol), dtype=bool)
-        return features
-    except Exception as e:
-        log.warning(f"Failed to convert SMILES ({smiles}) to ErGFP due to {str(e)}")
-        return None
-
-
-def smiles_to_morgan(smiles, radius=2, n_bits=1024):
-    try:
-        mol = Chem.MolFromSmiles(smiles)
-        features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
-        features = np.zeros((1,))
-        DataStructs.ConvertToNumpyArray(features_vec, features)
-        return features
-    except Exception as e:
-        log.warning(f"Failed to convert SMILES ({smiles}) to MorganFP due to {str(e)}")
-        return None
-
-
-def smiles_to_daylight(smiles):
-    num_finger = 2048
-    try:
-        mol = Chem.MolFromSmiles(smiles)
-        bv = FingerprintMols.FingerprintMol(mol)
-        temp = tuple(bv.GetOnBits())
-        features = np.zeros((num_finger,))
-        features[np.array(temp)] = 1
-    except Exception as e:
-        log.warning(f"RDKit could not fingerprint this SMILES ({smiles}) due to {str(e)}; returning all-zero features")
-        features = np.zeros((num_finger,))
-    return features.astype(int)
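
Usage sketch for the featurizers above (assumes RDKit is installed):

fp = smiles_to_morgan("CCO", radius=2, n_bits=1024)
# fp is a length-1024 numpy array of 0.0/1.0 bit values, or None if the
# SMILES could not be parsed or fingerprinted.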
deepscreen/data/featurizers/fcs.py
DELETED
@@ -1,67 +0,0 @@
-from importlib import resources
-
-import numpy as np
-import pandas as pd
-from subword_nmt.apply_bpe import BPE
-import codecs
-
-vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/protein_codes_uniprot.txt')
-bpe_codes_protein = codecs.open(vocab_path)
-protein_bpe = BPE(bpe_codes_protein, merges=-1, separator='')
-
-sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_uniprot.csv')
-sub_csv = pd.read_csv(sub_csv_path)
-idx2word_protein = sub_csv['index'].values
-words2idx_protein = dict(zip(idx2word_protein, range(0, len(idx2word_protein))))
-
-vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/drug_codes_chembl.txt')
-bpe_codes_drug = codecs.open(vocab_path)
-drug_bpe = BPE(bpe_codes_drug, merges=-1, separator='')
-
-sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_chembl.csv')
-sub_csv = pd.read_csv(sub_csv_path)
-idx2word_drug = sub_csv['index'].values
-words2idx_drug = dict(zip(idx2word_drug, range(0, len(idx2word_drug))))
-
-
-def protein_to_embedding(x, max_sequence_length):
-    max_p = max_sequence_length
-    tokens = protein_bpe.process_line(x).split()  # split into BPE subwords
-    try:
-        indices = np.asarray([words2idx_protein[token] for token in tokens])  # map subwords to indices
-    except KeyError:
-        indices = np.array([0])
-
-    length = len(indices)
-
-    if length < max_p:
-        i = np.pad(indices, (0, max_p - length), 'constant', constant_values=0)
-        input_mask = ([1] * length) + ([0] * (max_p - length))
-    else:
-        i = indices[:max_p]
-        input_mask = [1] * max_p
-
-    return i, np.asarray(input_mask)
-
-
-def drug_to_embedding(x, max_sequence_length):
-    max_d = max_sequence_length
-    tokens = drug_bpe.process_line(x).split()  # split into BPE subwords
-    try:
-        indices = np.asarray([words2idx_drug[token] for token in tokens])  # map subwords to indices
-    except KeyError:
-        indices = np.array([0])
-
-    length = len(indices)
-
-    if length < max_d:
-        i = np.pad(indices, (0, max_d - length), 'constant', constant_values=0)
-        input_mask = ([1] * length) + ([0] * (max_d - length))
-    else:
-        i = indices[:max_d]
-        input_mask = [1] * max_d
-
-    return i, np.asarray(input_mask)
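
Usage sketch for the ESPF encoders above (the vocab files under resources/vocabs/ESPF must exist on disk for the module to import):

tokens, mask = drug_to_embedding("CC(=O)Oc1ccccc1C(=O)O", max_sequence_length=50)
# tokens: int array of subword indices, zero-padded to length 50
# mask:   1 for real subwords, 0 for padding positions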
deepscreen/data/featurizers/fingerprint/__init__.py
DELETED
@@ -1,45 +0,0 @@
-from typing import Literal
-
-from .atompairs import GetAtomPairFPs
-from .avalonfp import GetAvalonFPs
-from .rdkitfp import GetRDkitFPs
-from .morganfp import GetMorganFPs
-from .estatefp import GetEstateFPs
-from .maccskeys import GetMACCSFPs
-from .pharmErGfp import GetPharmacoErGFPs
-from .pharmPointfp import GetPharmacoPFPs
-from .pubchemfp import GetPubChemFPs
-from .torsions import GetTorsionFPs
-from .mhfp6 import GetMHFP6
-# from .map4 import GetMAP4
-from rdkit import Chem
-
-from deepscreen import get_logger
-
-log = get_logger(__name__)
-
-FP_MAP = {
-    'MorganFP': GetMorganFPs,
-    'RDkitFP': GetRDkitFPs,
-    'AtomPairFP': GetAtomPairFPs,
-    'TorsionFP': GetTorsionFPs,
-    'AvalonFP': GetAvalonFPs,
-    'EstateFP': GetEstateFPs,
-    'MACCSFP': GetMACCSFPs,
-    'PharmacoErGFP': GetPharmacoErGFPs,
-    'PharmacoPFP': GetPharmacoPFPs,
-    'PubChemFP': GetPubChemFPs,
-    'MHFP6': GetMHFP6,
-    # 'MAP4': GetMAP4,
-}
-
-
-def smiles_to_fingerprint(smiles, fingerprint: Literal[tuple(FP_MAP.keys())], **kwargs):
-    func = FP_MAP[fingerprint]
-    try:
-        mol = Chem.MolFromSmiles(smiles)
-        arr = func(mol, **kwargs)
-        return arr
-    except Exception as e:
-        log.warning(f"Failed to convert SMILES ({smiles}) to {fingerprint} due to {str(e)}")
-        return None
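
Dispatch example for smiles_to_fingerprint above; any keyword arguments are forwarded to the chosen fingerprint function:

arr = smiles_to_fingerprint("c1ccccc1O", "MorganFP", nBits=1024, radius=2)
# -> boolean numpy array of length 1024, or None if featurization failed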
deepscreen/data/featurizers/fingerprint/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (2.18 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/atompairs.cpython-311.pyc
DELETED
Binary file (1.03 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/avalonfp.cpython-311.pyc
DELETED
Binary file (928 Bytes)

deepscreen/data/featurizers/fingerprint/__pycache__/estatefp.cpython-311.pyc
DELETED
Binary file (685 Bytes)

deepscreen/data/featurizers/fingerprint/__pycache__/maccskeys.cpython-311.pyc
DELETED
Binary file (1.3 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/map4.cpython-311.pyc
DELETED
Binary file (7.61 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/mhfp6.cpython-311.pyc
DELETED
Binary file (1.07 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/morganfp.cpython-311.pyc
DELETED
Binary file (962 Bytes)

deepscreen/data/featurizers/fingerprint/__pycache__/pharmErGfp.cpython-311.pyc
DELETED
Binary file (2.4 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/pharmPointfp.cpython-311.pyc
DELETED
Binary file (3.23 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/pubchemfp.cpython-311.pyc
DELETED
Binary file (77.7 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/rdkitfp.cpython-311.pyc
DELETED
Binary file (1.65 kB)

deepscreen/data/featurizers/fingerprint/__pycache__/torsions.cpython-311.pyc
DELETED
Binary file (1.04 kB)
deepscreen/data/featurizers/fingerprint/atompairs.py
DELETED
@@ -1,18 +0,0 @@
-from rdkit.Chem.AtomPairs import Pairs
-from rdkit.Chem import DataStructs
-import numpy as np
-
-_type = 'topological-based'
-
-
-def GetAtomPairFPs(mol, nBits=2048, binary=True):
-    '''
-    atom-pair fingerprints
-    '''
-    fp = Pairs.GetHashedAtomPairFingerprint(mol, nBits=nBits)
-    if binary:
-        arr = np.zeros((0,), dtype=np.bool_)
-    else:
-        arr = np.zeros((0,), dtype=np.int8)
-    DataStructs.ConvertToNumpyArray(fp, arr)
-    return arr
deepscreen/data/featurizers/fingerprint/avalonfp.py
DELETED
@@ -1,16 +0,0 @@
-from rdkit.Chem import DataStructs
-from rdkit.Avalon.pyAvalonTools import GetAvalonFP as GAFP
-import numpy as np
-
-_type = 'topological-based'
-
-
-def GetAvalonFPs(mol, nBits=2048):
-    '''
-    Avalon fingerprints: https://pubs.acs.org/doi/pdf/10.1021/ci050413p
-    '''
-    fp = GAFP(mol, nBits=nBits)
-    arr = np.zeros((0,), dtype=np.bool_)
-    DataStructs.ConvertToNumpyArray(fp, arr)
-    return arr
deepscreen/data/featurizers/fingerprint/estatefp.py
DELETED
@@ -1,12 +0,0 @@
-from rdkit.Chem.EState import Fingerprinter
-import numpy as np
-
-_type = 'Estate-based'
-
-
-def GetEstateFPs(mol):
-    '''
-    79-bit E-state fingerprints
-    '''
-    x = Fingerprinter.FingerprintMol(mol)[0]
-    return x.astype(np.bool_)
deepscreen/data/featurizers/fingerprint/maccskeys.py
DELETED
@@ -1,25 +0,0 @@
-from rdkit.Chem import AllChem
-from rdkit.Chem import DataStructs
-import numpy as np
-import pandas as pd
-import os
-
-_type = 'SMARTS-based'
-
-file_path = os.path.dirname(__file__)
-
-
-def GetMACCSFPs(mol):
-    '''
-    166 bits
-    '''
-    fp = AllChem.GetMACCSKeysFingerprint(mol)
-
-    arr = np.zeros((0,), dtype=np.bool_)
-    DataStructs.ConvertToNumpyArray(fp, arr)
-    return arr
-
-
-def GetMACCSFPInfos():
-    return pd.read_excel(os.path.join(file_path, 'maccskeys.xlsx'))
deepscreen/data/featurizers/fingerprint/maccskeys.xlsx
DELETED
Binary file (14 kB)
deepscreen/data/featurizers/fingerprint/map4.py
DELETED
@@ -1,130 +0,0 @@
-"""
-MinHashed Atom-pair Fingerprint, MAP
-original paper: Capecchi, Alice, Daniel Probst, and Jean-Louis Reymond. "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome." Journal of Cheminformatics 12.1 (2020): 1-15.
-original code: https://github.com/reymond-group/map4, with thanks for the original work.
-
-A small bug is fixed: https://github.com/reymond-group/map4/issues/6
-"""
-
-_type = 'topological-based'
-
-import itertools
-from collections import defaultdict
-
-import tmap as tm
-from mhfp.encoder import MHFPEncoder
-from rdkit import Chem
-from rdkit.Chem import rdmolops
-from rdkit.Chem.rdmolops import GetDistanceMatrix
-
-
-def to_smiles(mol):
-    return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False)
-
-
-class MAP4Calculator:
-    def __init__(self, dimensions=2048, radius=2, is_counted=False, is_folded=False, fold_dimensions=2048):
-        """
-        MAP4 calculator class
-        """
-        self.dimensions = dimensions
-        self.radius = radius
-        self.is_counted = is_counted
-        self.is_folded = is_folded
-        self.fold_dimensions = fold_dimensions
-
-        if self.is_folded:
-            self.encoder = MHFPEncoder(dimensions)
-        else:
-            self.encoder = tm.Minhash(dimensions)
-
-    def calculate(self, mol):
-        """Calculates the atom pair minhashed fingerprint
-        Arguments:
-            mol -- rdkit mol object
-        Returns:
-            tmap VectorUint -- minhashed fingerprint
-        """
-        atom_env_pairs = self._calculate(mol)
-        if self.is_folded:
-            return self._fold(atom_env_pairs)
-        return self.encoder.from_string_array(atom_env_pairs)
-
-    def calculate_many(self, mols):
-        """Calculates the atom pair minhashed fingerprints of a list of molecules
-        Arguments:
-            mols -- list of rdkit mol objects
-        Returns:
-            list of tmap VectorUint -- minhashed fingerprints
-        """
-        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
-        if self.is_folded:
-            return [self._fold(pairs) for pairs in atom_env_pairs_list]
-        return self.encoder.batch_from_string_array(atom_env_pairs_list)
-
-    def _calculate(self, mol):
-        return self._all_pairs(mol, self._get_atom_envs(mol))
-
-    def _fold(self, pairs):
-        fp_hash = self.encoder.hash(set(pairs))
-        return self.encoder.fold(fp_hash, self.fold_dimensions)
-
-    def _get_atom_envs(self, mol):
-        atoms_env = {}
-        for atom in mol.GetAtoms():
-            idx = atom.GetIdx()
-            for radius in range(1, self.radius + 1):
-                if idx not in atoms_env:
-                    atoms_env[idx] = []
-                atoms_env[idx].append(MAP4Calculator._find_env(mol, idx, radius))
-        return atoms_env
-
-    @classmethod
-    def _find_env(cls, mol, idx, radius):
-        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
-        atom_map = {}
-
-        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
-        if idx in atom_map:
-            smiles = Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)
-            return smiles
-        return ''
-
-    def _all_pairs(self, mol, atoms_env):
-        atom_pairs = []
-        distance_matrix = GetDistanceMatrix(mol)
-        num_atoms = mol.GetNumAtoms()
-        shingle_dict = defaultdict(int)
-        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
-            dist = str(int(distance_matrix[idx1][idx2]))
-
-            for i in range(self.radius):
-                env_a = atoms_env[idx1][i]
-                env_b = atoms_env[idx2][i]
-
-                ordered = sorted([env_a, env_b])
-
-                shingle = '{}|{}|{}'.format(ordered[0], dist, ordered[1])
-
-                if self.is_counted:
-                    shingle_dict[shingle] += 1
-                    shingle += '|' + str(shingle_dict[shingle])
-
-                atom_pairs.append(shingle.encode('utf-8'))
-        return list(set(atom_pairs))
-
-
-def GetMAP4(mol, nBits=2048, radius=2, fold_dimensions=None):
-    """
-    MAP4: radius=2
-    """
-    if fold_dimensions is None:
-        fold_dimensions = nBits
-
-    calc = MAP4Calculator(dimensions=nBits, radius=radius, is_counted=False, is_folded=True,
-                          fold_dimensions=fold_dimensions)
-
-    arr = calc.calculate(mol)
-
-    return arr.astype(bool)
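
Usage sketch for GetMAP4 above (requires the tmap and mhfp packages in addition to RDKit):

from rdkit import Chem

mol = Chem.MolFromSmiles("CCO")
fp = GetMAP4(mol, nBits=1024)  # folded, binary MAP4 fingerprint of length 1024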
deepscreen/data/featurizers/fingerprint/mhfp6.py
DELETED
@@ -1,18 +0,0 @@
-"""
-Probst, Daniel, and Jean-Louis Reymond. "A probabilistic molecular fingerprint for big data settings." Journal of Cheminformatics 10.1 (2018): 66.
-
-original code: https://github.com/reymond-group/mhfp
-"""
-
-from mhfp.encoder import MHFPEncoder
-
-
-def GetMHFP6(mol, nBits=2048, radius=3):
-    """
-    MHFP6: radius=3
-    """
-    encoder = MHFPEncoder(n_permutations=nBits)
-    hash_values = encoder.encode_mol(mol, radius=radius, rings=True, kekulize=True, min_radius=1)
-    arr = encoder.fold(hash_values, nBits)
-    return arr.astype(bool)
deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef
DELETED
@@ -1,53 +0,0 @@
-AtomType NDonor [N&!H0&v3,N&!H0&+1&v4,n&H1&+0]
-AtomType ChalcDonor [O,S;H1;+0]
-DefineFeature SingleAtomDonor [{NDonor},{ChalcDonor},!$([D1]-[C;D3]=[O,S,N])]
-  Family Donor
-  Weights 1
-EndFeature
-
-AtomType NAcceptor [$([N&v3;H1,H2]-[!$(*=[O,N,P,S])])]
-Atomtype NAcceptor [$([N;v3;H0])]
-AtomType NAcceptor [$([n;+0])]
-AtomType ChalcAcceptor [$([O,S;H1;v2]-[!$(*=[O,N,P,S])])]
-AtomType ChalcAcceptor [O,S;H0;v2]
-Atomtype ChalcAcceptor [O,S;-]
-Atomtype ChalcAcceptor [o,s;+0]
-AtomType HalogenAcceptor [F]
-DefineFeature SingleAtomAcceptor [{NAcceptor},{ChalcAcceptor},{HalogenAcceptor}]
-  Family Acceptor
-  Weights 1
-EndFeature
-
-# this one is delightfully easy:
-DefineFeature AcidicGroup [C,S](=[O,S,P])-[O;H1,H0&-1]
-  Family NegIonizable
-  Weights 1.0,1.0,1.0
-EndFeature
-
-AtomType CarbonOrArom_NonCarbonyl [$([C,a]);!$([C,a](=O))]
-AtomType BasicNH2 [$([N;H2&+0][{CarbonOrArom_NonCarbonyl}])]
-AtomType BasicNH1 [$([N;H1&+0]([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
-AtomType BasicNH0 [$([N;H0&+0]([{CarbonOrArom_NonCarbonyl}])([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
-AtomType BasicNakedN [N,n;X2;+0]
-DefineFeature BasicGroup [{BasicNH2},{BasicNH1},{BasicNH0},{BasicNakedN}]
-  Family PosIonizable
-  Weights 1.0
-EndFeature
-
-# aromatic rings of various sizes:
-DefineFeature Arom5 a1aaaa1
-  Family Aromatic
-  Weights 1.0,1.0,1.0,1.0,1.0
-EndFeature
-DefineFeature Arom6 a1aaaaa1
-  Family Aromatic
-  Weights 1.0,1.0,1.0,1.0,1.0,1.0
-EndFeature
-DefineFeature Arom7 a1aaaaaa1
-  Family Aromatic
-  Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0
-EndFeature
-DefineFeature Arom8 a1aaaaaaa1
-  Family Aromatic
-  Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-EndFeature
deepscreen/data/featurizers/fingerprint/morganfp.py
DELETED
@@ -1,18 +0,0 @@
-from rdkit.Chem import AllChem
-from rdkit.Chem import DataStructs
-import numpy as np
-
-
-def GetMorganFPs(mol, nBits=2048, radius=2, return_bitInfo=False):
-    """
-    ECFP4: radius=2
-    """
-    bitInfo = {}
-    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius,
-                                               bitInfo=bitInfo, nBits=nBits)
-    arr = np.zeros((0,), dtype=np.bool_)
-    DataStructs.ConvertToNumpyArray(fp, arr)
-
-    if return_bitInfo:
-        return arr, bitInfo
-    return arr
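
With return_bitInfo=True, GetMorganFPs above also reports which atom environments set each bit, which is useful for interpreting models built on the fingerprint:

from rdkit import Chem

mol = Chem.MolFromSmiles("c1ccccc1O")
arr, bit_info = GetMorganFPs(mol, nBits=2048, radius=2, return_bitInfo=True)
# bit_info maps bit index -> tuples of (atom_idx, radius) that set that bit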
deepscreen/data/featurizers/fingerprint/pharmErGfp.py
DELETED
@@ -1,60 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Aug 17 16:54:12 2019
-
-@author: [email protected]
-
-@calculate ErG fps, more info: https://pubs.acs.org/doi/full/10.1021/ci050457y#
-"""
-
-_type = 'Pharmacophore-based'
-
-import numpy as np
-from rdkit.Chem import AllChem
-
-## get info from: https://github.com/rdkit/rdkit/blob/d41752d558bf7200ab67b98cdd9e37f1bdd378de/Code/GraphMol/ReducedGraphs/ReducedGraphs.cpp
-Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]
-
-Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
-            "[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]
-
-Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
-            "[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
-            "[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]
-
-Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]
-
-Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]
-
-Aromatic = ["a"]
-
-PROPERTY_KEY = ["Donor", "Acceptor", "Positive", "Negative", "Hydrophobic", "Aromatic"]
-
-
-def GetPharmacoErGFPs(mol, fuzzIncrement=0.3, maxPath=21, binary=True, return_bitInfo=False):
-    '''
-    https://pubs.acs.org/doi/full/10.1021/ci050457y#
-    returns maxPath*21 bits
-
-    size(v) = (n(n + 1)/2) * (maxDist - minDist + 1)
-    '''
-    minPath = 1
-
-    arr = AllChem.GetErGFingerprint(mol, fuzzIncrement=fuzzIncrement, maxPath=maxPath, minPath=minPath)
-    arr = arr.astype(np.float32)
-
-    if binary:
-        arr = arr.astype(np.bool_)
-
-    if return_bitInfo:
-        bitInfo = []
-        for i in range(len(PROPERTY_KEY)):
-            for j in range(i, len(PROPERTY_KEY)):
-                for path in range(minPath, maxPath + 1):
-                    triplet = (PROPERTY_KEY[i], PROPERTY_KEY[j], path)
-                    bitInfo.append(triplet)
-        return arr, bitInfo
-
-    return arr
deepscreen/data/featurizers/fingerprint/pharmPointfp.py
DELETED
@@ -1,59 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Aug 17 16:54:12 2019
-
-@author: [email protected]
-
-Combining a set of chemical features with the 2D (topological) distances between them gives a 2D pharmacophore. When the distances are binned, unique integer ids can be assigned to each of these pharmacophores and they can be stored in a fingerprint. Details of the encoding are in: https://www.rdkit.org/docs/RDKit_Book.html#ph4-figure
-"""
-
-_type = 'Pharmacophore-based'
-
-from rdkit.Chem.Pharm2D.SigFactory import SigFactory
-from rdkit.Chem.Pharm2D import Generate
-from rdkit.Chem import DataStructs
-from rdkit.Chem import ChemicalFeatures
-
-import numpy as np
-import os
-
-fdef = os.path.join(os.path.dirname(__file__), 'mnimalfatures.fdef')
-featFactory = ChemicalFeatures.BuildFeatureFactory(fdef)
-
-
-def GetPharmacoPFPs(mol,
-                    bins=[(i, i + 1) for i in range(20)],
-                    minPointCount=2,
-                    maxPointCount=2,
-                    return_bitInfo=False):
-    '''
-    Note: a maxPointCount of 3 is slow.
-
-    Use bins = [(i, i + 1) for i in range(20)] and
-    maxPointCount=2 for large-scale computation.
-    '''
-    MysigFactory = SigFactory(featFactory,
-                              trianglePruneBins=False,
-                              minPointCount=minPointCount,
-                              maxPointCount=maxPointCount)
-    MysigFactory.SetBins(bins)
-    MysigFactory.Init()
-
-    res = Generate.Gen2DFingerprint(mol, MysigFactory)
-    arr = np.array(list(res)).astype(np.bool_)
-    if return_bitInfo:
-        description = []
-        for i in range(len(res)):
-            description.append(MysigFactory.GetBitDescription(i))
-        return arr, description
-
-    return arr
-
-
-if __name__ == '__main__':
-    from rdkit import Chem
-
-    mol = Chem.MolFromSmiles('CC#CC(=O)NC1=NC=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl')
-    a = GetPharmacoPFPs(mol, bins=[(i, i + 1) for i in range(20)], minPointCount=2, maxPointCount=2)