DeepSEQreen_NAR_fb

Sleeping

App Files Community

libokj committed on Apr 1, 2024

Commit

3cadfde

1 Parent(s): 6534610

Update deepscreen/data/dti.py

Browse files

Files changed (1) hide show

deepscreen/data/dti.py +21 -17

deepscreen/data/dti.py CHANGED Viewed

@@ -6,7 +6,9 @@ from typing import Any, Dict, Optional, Sequence, Union, Literal
 from lightning import LightningDataModule
 import pandas as pd
-import swifter
 from sklearn.preprocessing import LabelEncoder
 from torch.utils.data import Dataset, DataLoader
@@ -14,6 +16,7 @@ from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
 from deepscreen.utils import get_logger
 log = get_logger(__name__)
 SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
 FASTA_PAT = r"[^A-Z*\-]"
@@ -33,14 +36,12 @@ def validate_seq_str(seq, regex):
 # TODO: save a list of corrupted records
 def rdkit_canonicalize(smiles):
-    from rdkit import Chem
     try:
         mol = Chem.MolFromSmiles(smiles)
-        cano_smiles = Chem.MolToSmiles(mol)
-        return cano_smiles
     except Exception as e:
         log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(e)}. Returning original SMILES: {smiles}')
-        return smiles
 class DTIDataset(Dataset):
@@ -85,6 +86,12 @@ class DTIDataset(Dataset):
         # Forward-fill all non-label columns
         df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
         # TODO potentially allow running through the whole data validation process
         # error = False
@@ -93,9 +100,9 @@ class DTIDataset(Dataset):
             # TODO: check sklearn.utils.multiclass.check_classification_targets
             match task:
                 case 'regression':
-                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
-                        but it has {set(df['Y'].swifter.apply(type))}."""
                 case 'binary':
                     if all(df['Y'].isin([0, 1])):
@@ -112,7 +119,7 @@ class DTIDataset(Dataset):
                 case 'multiclass':
                     assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
-                    if all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)):
                         assert not thresholds, \
                             f"""`Y` is already non-negative integers for
                             `multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
@@ -140,9 +147,9 @@ class DTIDataset(Dataset):
             match task:
                 case 'regression':
                     df['Y'] = df['Y'].astype('float32')
-                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
-                        but after transformation it still has {set(df['Y'].swifter.apply(type))}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
                     # TODO print err idx instead
                 case 'binary':
@@ -154,7 +161,7 @@ class DTIDataset(Dataset):
                     # TODO print err idx instead
                 case 'multiclass':
                     df['Y'] = df['Y'].astype('int')
-                    assert all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)), \
                         f"""Y must be non-negative integers for `task=multiclass`
                         but after transformation it still has {pd.unique(df['Y'])}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
@@ -166,16 +173,14 @@ class DTIDataset(Dataset):
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
         log.info("Validating SMILES (`X1`)...")
-        df['X1_ERR'] = df['X1'].swifter.progress_bar(
-            desc="Validating SMILES...").apply(validate_seq_str, regex=SMILES_PAT)
         if not df['X1_ERR'].isna().all():
             raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
-        df['X1^'] = df['X1'].swifter.apply(rdkit_canonicalize)  # swifter
         log.info("Validating FASTA (`X2`)...")
         df['X2'] = df['X2'].str.upper()
-        df['X2_ERR'] = df['X2'].swifter.progress_bar(
-            desc="Validating FASTA...").apply(validate_seq_str, regex=FASTA_PAT)
         if not df['X2_ERR'].isna().all():
             raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
@@ -425,4 +430,3 @@ class DTIDataModule(LightningDataModule):
     def load_state_dict(self, state_dict: Dict[str, Any]):
         """Things to do when loading checkpoint."""
         pass

 from lightning import LightningDataModule
 import pandas as pd
+from pandarallel import pandarallel
+from rdkit import Chem
+#import swifter
 from sklearn.preprocessing import LabelEncoder
 from torch.utils.data import Dataset, DataLoader
 from deepscreen.utils import get_logger
 log = get_logger(__name__)
+pandarallel.initialize(progress_bar=True)
 SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
 FASTA_PAT = r"[^A-Z*\-]"
 # TODO: save a list of corrupted records
 def rdkit_canonicalize(smiles):
     try:
         mol = Chem.MolFromSmiles(smiles)
+        smiles = Chem.MolToSmiles(mol)
     except Exception as e:
         log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(e)}. Returning original SMILES: {smiles}')
+    return smiles
 class DTIDataset(Dataset):
         # Forward-fill all non-label columns
         df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
+        # Fill NAs in string cols with an empty string to prevent wrong type inference by pytorch collator
+        for col in df.columns:
+            if df[col].dtype == 'object':
+                df[col] = df[col].fillna('')
         # TODO potentially allow running through the whole data validation process
         # error = False
             # TODO: check sklearn.utils.multiclass.check_classification_targets
             match task:
                 case 'regression':
+                    assert all(df['Y'].parallel_apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
+                        but it has {set(df['Y'].parallel_apply(type))}."""
                 case 'binary':
                     if all(df['Y'].isin([0, 1])):
                 case 'multiclass':
                     assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
+                    if all(df['Y'].parallel_apply(lambda x: x.is_integer() and x >= 0)):
                         assert not thresholds, \
                             f"""`Y` is already non-negative integers for
                             `multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
             match task:
                 case 'regression':
                     df['Y'] = df['Y'].astype('float32')
+                    assert all(df['Y'].parallel_apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
+                        but after transformation it still has {set(df['Y'].parallel_apply(type))}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
                     # TODO print err idx instead
                 case 'binary':
                     # TODO print err idx instead
                 case 'multiclass':
                     df['Y'] = df['Y'].astype('int')
+                    assert all(df['Y'].parallel_apply(lambda x: x.is_integer() and x >= 0)), \
                         f"""Y must be non-negative integers for `task=multiclass`
                         but after transformation it still has {pd.unique(df['Y'])}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
         log.info("Validating SMILES (`X1`)...")
+        df['X1_ERR'] = df['X1'].parallel_apply(validate_seq_str, regex=SMILES_PAT)
         if not df['X1_ERR'].isna().all():
             raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+        df['X1^'] = df['X1'].parallel_apply(rdkit_canonicalize)
         log.info("Validating FASTA (`X2`)...")
         df['X2'] = df['X2'].str.upper()
+        df['X2_ERR'] = df['X2'].parallel_apply(validate_seq_str, regex=FASTA_PAT)
         if not df['X2_ERR'].isna().all():
             raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
     def load_state_dict(self, state_dict: Dict[str, Any]):
         """Things to do when loading checkpoint."""
         pass