Spaces:
Sleeping
Sleeping
Upload 110 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- deepscreen/__init__.py +101 -0
- deepscreen/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/__pycache__/predict.cpython-311.pyc +0 -0
- deepscreen/data/__init__.py +0 -0
- deepscreen/data/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/__pycache__/dti.cpython-311.pyc +0 -0
- deepscreen/data/dti.py +422 -0
- deepscreen/data/dti.py.bak +369 -0
- deepscreen/data/dti_datamodule.py +314 -0
- deepscreen/data/entity_datamodule.py +167 -0
- deepscreen/data/featurizers/__init__.py +0 -0
- deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc +0 -0
- deepscreen/data/featurizers/categorical.py +86 -0
- deepscreen/data/featurizers/chem.py +48 -0
- deepscreen/data/featurizers/fcs.py +67 -0
- deepscreen/data/featurizers/fingerprint/__init__.py +45 -0
- deepscreen/data/featurizers/fingerprint/atompairs.py +18 -0
- deepscreen/data/featurizers/fingerprint/avalonfp.py +16 -0
- deepscreen/data/featurizers/fingerprint/estatefp.py +12 -0
- deepscreen/data/featurizers/fingerprint/maccskeys.py +25 -0
- deepscreen/data/featurizers/fingerprint/maccskeys.xlsx +0 -0
- deepscreen/data/featurizers/fingerprint/map4.py +130 -0
- deepscreen/data/featurizers/fingerprint/mhfp6.py +18 -0
- deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef +53 -0
- deepscreen/data/featurizers/fingerprint/morganfp.py +18 -0
- deepscreen/data/featurizers/fingerprint/pharmErGfp.py +60 -0
- deepscreen/data/featurizers/fingerprint/pharmPointfp.py +59 -0
- deepscreen/data/featurizers/fingerprint/pubchemfp.py +1731 -0
- deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx +0 -0
- deepscreen/data/featurizers/fingerprint/rdkitfp.py +42 -0
- deepscreen/data/featurizers/fingerprint/smarts_maccskey.py +178 -0
- deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py +21 -0
- deepscreen/data/featurizers/fingerprint/smarts_pubchem.py +734 -0
- deepscreen/data/featurizers/fingerprint/torsions.py +18 -0
- deepscreen/data/featurizers/graph.py +133 -0
- deepscreen/data/featurizers/monn.py +106 -0
- deepscreen/data/featurizers/token.py +299 -0
- deepscreen/data/single_entity.py +195 -0
- deepscreen/data/utils/__init__.py +8 -0
- deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/collator.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/label.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc +0 -0
- deepscreen/data/utils/__pycache__/split.cpython-311.pyc +0 -0
- deepscreen/data/utils/collator.py +168 -0
- deepscreen/data/utils/dataset.py +216 -0
- deepscreen/data/utils/label.py +93 -0
- deepscreen/data/utils/sampler.py +90 -0
deepscreen/__init__.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DeepScreen package initialization, registering custom objects and monkey patching for some libraries.
|
| 3 |
+
"""
|
| 4 |
+
import sys
|
| 5 |
+
from builtins import eval
|
| 6 |
+
|
| 7 |
+
import lightning.fabric.strategies.launchers.subprocess_script as subprocess_script
|
| 8 |
+
import torch
|
| 9 |
+
from omegaconf import OmegaConf
|
| 10 |
+
|
| 11 |
+
from deepscreen.utils import get_logger
|
| 12 |
+
|
| 13 |
+
log = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
# Allow basic Python operations in hydra interpolation; examples:
|
| 16 |
+
# `in_channels: ${eval:${model.drug_encoder.out_channels}+${model.protein_encoder.out_channels}}`
|
| 17 |
+
# `subdir: ${eval:${hydra.job.override_dirname}.replace('/', '.')}`
|
| 18 |
+
# NOTE(review): this exposes Python's builtin `eval` to config interpolation, so every
# `${eval:...}` expression in a config file or command-line override is executed as Python code.
# Only load configs/overrides from trusted sources.
OmegaConf.register_new_resolver("eval", eval)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def sanitize_path(path_str: str):
    """
    Turn an arbitrary string into something usable as a path component.

    Forward slashes and backslashes are replaced with ".", colons with "-", and the result is
    truncated to its first 255 characters (OS limitation).
    """
    substitutions = str.maketrans({"/": ".", "\\": ".", ":": "-"})
    cleaned = path_str.translate(substitutions)
    return cleaned[:255]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
OmegaConf.register_new_resolver("sanitize_path", sanitize_path)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _hydra_subprocess_cmd(local_rank: int):
    """
    Monkey patching for lightning.fabric.strategies.launchers.subprocess_script._hydra_subprocess_cmd
    Temporarily fixes the problem of unnecessarily creating log folders for DDP subprocesses in Hydra multirun/sweep.

    Args:
        local_rank: rank of the subprocess being launched; used to build its Hydra job name.

    Returns:
        A `(command, cwd)` tuple: the argument list used to re-launch the current program with
        extra Hydra overrides appended, and the original working directory reported by Hydra.
    """
    import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
    from hydra.core.hydra_config import HydraConfig
    from hydra.utils import get_original_cwd, to_absolute_path

    # when user is using hydra find the absolute path
    if __main__.__spec__ is None:  # pragma: no-cover
        command = [sys.executable, to_absolute_path(sys.argv[0])]
    else:
        command = [sys.executable, "-m", __main__.__spec__.name]

    command += sys.argv[1:]

    cwd = get_original_cwd()
    rundir = f'"{HydraConfig.get().runtime.output_dir}"'
    # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml.
    # Each override must be its own list element: previously the comma lived *inside* the
    # "hydra.output_subdir=null," literal, so implicit string concatenation merged it with the
    # following f-string into one malformed argument.
    command += [
        f"hydra.job.name=train_ddp_process_{local_rank}",
        "hydra.output_subdir=null",
        f"hydra.runtime.output_dir={rundir}",
    ]
    return command, cwd
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
subprocess_script._hydra_subprocess_cmd = _hydra_subprocess_cmd
|
| 58 |
+
|
| 59 |
+
# from torch import Tensor
|
| 60 |
+
# from lightning.fabric.utilities.distributed import _distributed_available
|
| 61 |
+
# from lightning.pytorch.utilities.rank_zero import WarningCache
|
| 62 |
+
# from lightning.pytorch.utilities.warnings import PossibleUserWarning
|
| 63 |
+
# from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection
|
| 64 |
+
|
| 65 |
+
# warning_cache = WarningCache()
|
| 66 |
+
#
|
| 67 |
+
# @staticmethod
|
| 68 |
+
# def _get_cache(result_metric, on_step: bool):
|
| 69 |
+
# cache = None
|
| 70 |
+
# if on_step and result_metric.meta.on_step:
|
| 71 |
+
# cache = result_metric._forward_cache
|
| 72 |
+
# elif not on_step and result_metric.meta.on_epoch:
|
| 73 |
+
# if result_metric._computed is None:
|
| 74 |
+
# should = result_metric.meta.sync.should
|
| 75 |
+
# if not should and _distributed_available() and result_metric.is_tensor:
|
| 76 |
+
# warning_cache.warn(
|
| 77 |
+
# f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)`"
|
| 78 |
+
# " when logging on epoch level in distributed setting to accumulate the metric across"
|
| 79 |
+
# " devices.",
|
| 80 |
+
# category=PossibleUserWarning,
|
| 81 |
+
# )
|
| 82 |
+
# result_metric.compute()
|
| 83 |
+
# result_metric.meta.sync.should = should
|
| 84 |
+
#
|
| 85 |
+
# cache = result_metric._computed
|
| 86 |
+
#
|
| 87 |
+
# if cache is not None:
|
| 88 |
+
# if isinstance(cache, Tensor):
|
| 89 |
+
# if not result_metric.meta.enable_graph:
|
| 90 |
+
# return cache.detach()
|
| 91 |
+
#
|
| 92 |
+
# return cache
|
| 93 |
+
#
|
| 94 |
+
#
|
| 95 |
+
# _ResultCollection._get_cache = _get_cache
|
| 96 |
+
|
| 97 |
+
# Switch float32 matmul precision to "high" when CUDA is available and the current device
# reports a compute capability of at least (8, 0).
if torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0):
    torch.set_float32_matmul_precision("high")
    log.info(
        "Your GPU supports tensor cores, "
        "we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`"
    )
|
deepscreen/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (3.28 kB). View file
|
|
|
deepscreen/__pycache__/predict.cpython-311.pyc
ADDED
|
Binary file (3.38 kB). View file
|
|
|
deepscreen/data/__init__.py
ADDED
|
File without changes
|
deepscreen/data/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (179 Bytes). View file
|
|
|
deepscreen/data/__pycache__/dti.cpython-311.pyc
ADDED
|
Binary file (23 kB). View file
|
|
|
deepscreen/data/dti.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from functools import partial
|
| 3 |
+
from numbers import Number
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
| 6 |
+
|
| 7 |
+
from lightning import LightningDataModule
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import swifter
|
| 10 |
+
from sklearn.preprocessing import LabelEncoder
|
| 11 |
+
from torch.utils.data import Dataset, DataLoader
|
| 12 |
+
|
| 13 |
+
from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
|
| 14 |
+
from deepscreen.utils import get_logger
|
| 15 |
+
|
| 16 |
+
log = get_logger(__name__)
|
| 17 |
+
|
| 18 |
+
SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
|
| 19 |
+
FASTA_PAT = r"[^A-Z*\-]"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def validate_seq_str(seq, regex):
    """
    Check a sequence string against a pattern of disallowed characters.

    Args:
        seq: the sequence to validate (the callers in this module pass SMILES and FASTA strings).
        regex: a regular expression that matches every character that is NOT allowed.

    Returns:
        `None` when `seq` is a non-empty string without any disallowed character. Otherwise a
        short description of the problem: `'Empty string'` for falsy input, a type note for
        non-string input, or the distinct offending matches joined by `', '` in sorted order.
    """
    if not seq:
        return 'Empty string'
    if not isinstance(seq, str):
        # E.g. a float NaN standing in for a missing cell: `re.findall` would raise TypeError
        # on it, so report it as an invalid record instead of crashing the validation pass.
        return f'Not a string ({type(seq).__name__})'

    err_charset = set(re.findall(regex, seq))
    if not err_charset:
        return None
    # Sets are unordered; sort so the reported message is deterministic across runs.
    return ', '.join(sorted(err_charset))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# TODO: save a list of corrupted records
|
| 34 |
+
|
| 35 |
+
def rdkit_canonicalize(smiles):
    """
    Return the RDKit-canonical form of a SMILES string, falling back to the input on failure.

    The string is parsed with `Chem.MolFromSmiles` and written back with `Chem.MolToSmiles`.
    Any exception raised along the way is logged as a warning and the original `smiles` is
    returned unchanged, so this function never raises for a bad record.
    """
    from rdkit import Chem

    try:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    except Exception as e:
        log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(e)}. Returning original SMILES: {smiles}')
    return smiles
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class DTIDataset(Dataset):
|
| 47 |
+
def __init__(
|
| 48 |
+
self,
|
| 49 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
| 50 |
+
num_classes: Optional[int],
|
| 51 |
+
data_path: str | Path,
|
| 52 |
+
drug_featurizer: callable,
|
| 53 |
+
protein_featurizer: callable,
|
| 54 |
+
thresholds: Optional[Union[Number, Sequence[Number]]] = None,
|
| 55 |
+
discard_intermediate: Optional[bool] = False,
|
| 56 |
+
query: Optional[str] = 'X2'
|
| 57 |
+
):
|
| 58 |
+
df = pd.read_csv(
|
| 59 |
+
data_path,
|
| 60 |
+
engine='python',
|
| 61 |
+
header=0,
|
| 62 |
+
usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
|
| 63 |
+
dtype={
|
| 64 |
+
'X1': 'str',
|
| 65 |
+
'ID1': 'str',
|
| 66 |
+
'X2': 'str',
|
| 67 |
+
'ID2': 'str',
|
| 68 |
+
'Y': 'float32',
|
| 69 |
+
'U': 'str',
|
| 70 |
+
},
|
| 71 |
+
)
|
| 72 |
+
# Read the whole data table
|
| 73 |
+
|
| 74 |
+
# if 'ID1' in df:
|
| 75 |
+
# self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
|
| 76 |
+
# if 'ID2' in df:
|
| 77 |
+
# self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
|
| 78 |
+
# self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
|
| 79 |
+
# self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
|
| 80 |
+
|
| 81 |
+
# # train and eval mode data processing (fully labelled)
|
| 82 |
+
# if 'Y' in df.columns and df['Y'].notnull().all():
|
| 83 |
+
log.info(f"Processing data file: {data_path}")
|
| 84 |
+
|
| 85 |
+
# Forward-fill all non-label columns
|
| 86 |
+
df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
|
| 87 |
+
|
| 88 |
+
# TODO potentially allow running through the whole data validation process
|
| 89 |
+
# error = False
|
| 90 |
+
|
| 91 |
+
if 'Y' in df:
|
| 92 |
+
log.info(f"Validating labels (`Y`)...")
|
| 93 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
| 94 |
+
match task:
|
| 95 |
+
case 'regression':
|
| 96 |
+
assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
|
| 97 |
+
f"""`Y` must be numeric for `regression` task,
|
| 98 |
+
but it has {set(df['Y'].swifter.apply(type))}."""
|
| 99 |
+
|
| 100 |
+
case 'binary':
|
| 101 |
+
if all(df['Y'].isin([0, 1])):
|
| 102 |
+
assert not thresholds, \
|
| 103 |
+
f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
|
| 104 |
+
but still got `thresholds` ({thresholds}).
|
| 105 |
+
Double check your choices of `task` and `thresholds`, and records in the `Y` column."""
|
| 106 |
+
else:
|
| 107 |
+
assert thresholds, \
|
| 108 |
+
f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
|
| 109 |
+
but it has {pd.unique(df['Y'])}.
|
| 110 |
+
You may set `thresholds` to discretize continuous labels.""" # TODO print err idx instead
|
| 111 |
+
|
| 112 |
+
case 'multiclass':
|
| 113 |
+
assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
|
| 114 |
+
|
| 115 |
+
if all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)):
|
| 116 |
+
assert not thresholds, \
|
| 117 |
+
f"""`Y` is already non-negative integers for
|
| 118 |
+
`multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
|
| 119 |
+
Double check your choice of `task`, `thresholds` and records in the `Y` column."""
|
| 120 |
+
else:
|
| 121 |
+
assert thresholds, \
|
| 122 |
+
f"""`Y` must be non-negative integers for
|
| 123 |
+
`multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
|
| 124 |
+
You must set `thresholds` to discretize continuous labels.""" # TODO print err idx instead
|
| 125 |
+
|
| 126 |
+
if 'U' in df.columns:
|
| 127 |
+
units = df['U']
|
| 128 |
+
else:
|
| 129 |
+
units = None
|
| 130 |
+
log.warning("Units ('U') not in the data table. "
|
| 131 |
+
"Assuming all labels to be discrete or in p-scale (-log10[M]).")
|
| 132 |
+
|
| 133 |
+
# Transform labels
|
| 134 |
+
df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
|
| 135 |
+
discard_intermediate=discard_intermediate)
|
| 136 |
+
|
| 137 |
+
# Filter out rows with a NaN in Y (missing values)
|
| 138 |
+
df.dropna(subset=['Y'], inplace=True)
|
| 139 |
+
|
| 140 |
+
match task:
|
| 141 |
+
case 'regression':
|
| 142 |
+
df['Y'] = df['Y'].astype('float32')
|
| 143 |
+
assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
|
| 144 |
+
f"""`Y` must be numeric for `regression` task,
|
| 145 |
+
but after transformation it still has {set(df['Y'].swifter.apply(type))}.
|
| 146 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 147 |
+
# TODO print err idx instead
|
| 148 |
+
case 'binary':
|
| 149 |
+
df['Y'] = df['Y'].astype('int')
|
| 150 |
+
assert all(df['Y'].isin([0, 1])), \
|
| 151 |
+
f"""`Y` must be 0 or 1 for `task=binary`, "
|
| 152 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
| 153 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 154 |
+
# TODO print err idx instead
|
| 155 |
+
case 'multiclass':
|
| 156 |
+
df['Y'] = df['Y'].astype('int')
|
| 157 |
+
assert all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)), \
|
| 158 |
+
f"""Y must be non-negative integers for `task=multiclass`
|
| 159 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
| 160 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 161 |
+
# TODO print err idx instead
|
| 162 |
+
target_n_unique = df['Y'].nunique()
|
| 163 |
+
assert target_n_unique == num_classes, \
|
| 164 |
+
f"""You have set `num_classes` for `task=multiclass` to {num_classes},
|
| 165 |
+
but after transformation Y still has {target_n_unique} unique labels.
|
| 166 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 167 |
+
|
| 168 |
+
log.info("Validating SMILES (`X1`)...")
|
| 169 |
+
df['X1_ERR'] = df['X1'].swifter.progress_bar(
|
| 170 |
+
desc="Validating SMILES...").apply(validate_seq_str, regex=SMILES_PAT)
|
| 171 |
+
if not df['X1_ERR'].isna().all():
|
| 172 |
+
raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
|
| 173 |
+
df['X1^'] = df['X1'].apply(rdkit_canonicalize) # swifter
|
| 174 |
+
|
| 175 |
+
log.info("Validating FASTA (`X2`)...")
|
| 176 |
+
df['X2'] = df['X2'].str.upper()
|
| 177 |
+
df['X2_ERR'] = df['X2'].swifter.progress_bar(
|
| 178 |
+
desc="Validating FASTA...").apply(validate_seq_str, regex=FASTA_PAT)
|
| 179 |
+
if not df['X2_ERR'].isna().all():
|
| 180 |
+
raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
|
| 181 |
+
|
| 182 |
+
# FASTA/SMILES indices as query for retrieval metrics like enrichment factor and hit rate
|
| 183 |
+
if query:
|
| 184 |
+
df['ID^'] = LabelEncoder().fit_transform(df[query])
|
| 185 |
+
|
| 186 |
+
self.df = df
|
| 187 |
+
self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
|
| 188 |
+
self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
|
| 189 |
+
|
| 190 |
+
def __len__(self):
|
| 191 |
+
return len(self.df.index)
|
| 192 |
+
|
| 193 |
+
def __getitem__(self, i):
|
| 194 |
+
sample = self.df.loc[i]
|
| 195 |
+
return {
|
| 196 |
+
'N': i,
|
| 197 |
+
'X1': sample['X1'],
|
| 198 |
+
'X1^': self.drug_featurizer(sample['X1^']),
|
| 199 |
+
'ID1': sample.get('ID1'),
|
| 200 |
+
'X2': sample['X2'],
|
| 201 |
+
'X2^': self.protein_featurizer(sample['X2']),
|
| 202 |
+
'ID2': sample.get('ID2'),
|
| 203 |
+
'Y': sample.get('Y'),
|
| 204 |
+
'ID^': sample.get('ID^'),
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
class DTIDataModule(LightningDataModule):
    """
    DTI DataModule

    A DataModule implements 5 key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            num_classes: Optional[int],
            batch_size: int,
            # train: bool,
            drug_featurizer: callable,
            protein_featurizer: callable,
            collator: callable = collate_fn,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
            split: Optional[callable] = None,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: Optional[bool] = False,
            num_workers: int = 0,
            pin_memory: bool = False,
    ):
        """
        Store the configuration; no data is loaded until `setup` is called.

        `task`, `num_classes`, the two featurizers, `thresholds` and `discard_intermediate` are
        bound into a `DTIDataset` factory (`self.dataset`) that only needs a `data_path`.
        See `setup` for how `data_dir`, `data_file`, `train_val_test_split` and `split` combine.
        """
        super().__init__()

        self.train_data: Optional[Dataset] = None
        self.val_data: Optional[Dataset] = None
        self.test_data: Optional[Dataset] = None
        self.predict_data: Optional[Dataset] = None
        self.split = split
        self.collator = collator
        self.dataset = partial(
            DTIDataset,
            task=task,
            num_classes=num_classes,
            drug_featurizer=drug_featurizer,
            protein_featurizer=protein_featurizer,
            thresholds=thresholds,
            discard_intermediate=discard_intermediate
        )

        # This call makes the init params accessible through `self.hparams` and ensures they are
        # stored in the checkpoint.
        self.save_hyperparameters(logger=False)  # ignore=['split']

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.train_data`, `self.val_data`, `self.test_data`
        (and `self.predict_data`).
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.

        Supported configurations:

        * `train_val_test_split` is three numbers/None together with `data_file` and `split`:
          the single data file is loaded and divided by calling `split`.
        * `train_val_test_split` is three file paths/None, without `data_file` and `split`:
          each given file is loaded as the corresponding dataset (relative paths are resolved
          against `data_dir`).
        * only `data_file`: it is loaded as both the test and the predict dataset.

        `stage` and `encoding` are accepted but not used here.

        Raises:
            ValueError: for any other combination of the options above.
        """
        # load and split datasets only if not loaded in initialization
        if any([self.train_data, self.test_data, self.val_data, self.predict_data]):
            return

        splits = self.hparams.train_val_test_split
        subsets = ['train_data', 'val_data', 'test_data']

        if splits:
            if len(splits) != 3:
                raise ValueError('Length of `train_val_test_split` must be 3. '
                                 'Set the second element to None for training without validation. '
                                 'Set the third element to None for training without testing.')

            # Temporarily hold the raw split spec on the attributes; non-None entries are replaced
            # by real datasets below, None entries stay None.
            self.train_data = splits[0]
            self.val_data = splits[1]
            self.test_data = splits[2]

            if all([self.hparams.data_file, self.split]):
                if all(isinstance(split, Number) or split is None for split in splits):
                    split_data = self.split(
                        dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
                        lengths=[split for split in splits if split is not None]
                    )
                    # Hand out the produced subsets, in order, to the slots that were requested.
                    for dataset in subsets:
                        if getattr(self, dataset) is not None:
                            setattr(self, dataset, split_data.pop(0))

                else:
                    raise ValueError('`train_val_test_split` must be a sequence of numbers or None '
                                     '(float for percentages and int for sample numbers) '
                                     'if both `data_file` and `split` have been specified.')

            elif (all(isinstance(split, str) or split is None for split in splits)
                  and not any([self.hparams.data_file, self.split])):
                for dataset in subsets:
                    if getattr(self, dataset) is not None:
                        data_path = Path(getattr(self, dataset))
                        if not data_path.is_absolute():
                            data_path = Path(self.hparams.data_dir, data_path)
                        setattr(self, dataset, self.dataset(data_path=data_path))

            else:
                raise ValueError('For training, you must specify either all of `data_file`, `split`, '
                                 'and `train_val_test_split` as a sequence of numbers or '
                                 'solely `train_val_test_split` as a sequence of data file paths.')

        elif self.hparams.data_file and not any([self.split, splits]):
            data_path = Path(self.hparams.data_file)
            if not data_path.is_absolute():
                data_path = Path(self.hparams.data_dir, data_path)
            self.test_data = self.predict_data = self.dataset(data_path=data_path)

        else:
            raise ValueError("For training, you must specify `train_val_test_split`. "
                             "For testing/predicting, you must specify only `data_file` without "
                             "`train_val_test_split` or `split`.")

    def _make_dataloader(self, data, *, drop_last: bool, shuffle: bool):
        """Build a DataLoader over `data` with a `SafeBatchSampler` and the shared loader settings."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                drop_last=drop_last,
                shuffle=shuffle,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=self.hparams.num_workers > 0,
        )

    def train_dataloader(self):
        """Return the training dataloader (shuffled, last incomplete batch dropped)."""
        # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
        # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
        return self._make_dataloader(self.train_data, drop_last=True, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (in order, all samples kept)."""
        return self._make_dataloader(self.val_data, drop_last=False, shuffle=False)

    def test_dataloader(self):
        """Return the test dataloader (in order, all samples kept)."""
        return self._make_dataloader(self.test_data, drop_last=False, shuffle=False)

    def predict_dataloader(self):
        """Return the prediction dataloader (in order, all samples kept)."""
        return self._make_dataloader(self.predict_data, drop_last=False, shuffle=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/dti.py.bak
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import partial
|
| 2 |
+
from numbers import Number
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
| 5 |
+
|
| 6 |
+
from lightning import LightningDataModule
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from sklearn.preprocessing import LabelEncoder
|
| 9 |
+
from torch.utils.data import Dataset, DataLoader
|
| 10 |
+
|
| 11 |
+
from deepscreen.data.utils import label_transform, collate_fn, SafeBatchSampler
|
| 12 |
+
from deepscreen.utils import get_logger
|
| 13 |
+
|
| 14 |
+
log = get_logger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# TODO: save a list of corrupted records
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class DTIDataset(Dataset):
|
| 21 |
+
def __init__(
|
| 22 |
+
self,
|
| 23 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
| 24 |
+
n_class: Optional[int],
|
| 25 |
+
data_path: str | Path,
|
| 26 |
+
drug_featurizer: callable,
|
| 27 |
+
protein_featurizer: callable,
|
| 28 |
+
thresholds: Optional[Union[Number, Sequence[Number]]] = None,
|
| 29 |
+
discard_intermediate: Optional[bool] = False,
|
| 30 |
+
):
|
| 31 |
+
df = pd.read_csv(
|
| 32 |
+
data_path,
|
| 33 |
+
engine='python',
|
| 34 |
+
header=0,
|
| 35 |
+
usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
|
| 36 |
+
dtype={
|
| 37 |
+
'X1': 'str',
|
| 38 |
+
'ID1': 'str',
|
| 39 |
+
'X2': 'str',
|
| 40 |
+
'ID2': 'str',
|
| 41 |
+
'Y': 'float32',
|
| 42 |
+
'U': 'str',
|
| 43 |
+
},
|
| 44 |
+
)
|
| 45 |
+
# Read the whole data table
|
| 46 |
+
|
| 47 |
+
# if 'ID1' in df:
|
| 48 |
+
# self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
|
| 49 |
+
# if 'ID2' in df:
|
| 50 |
+
# self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
|
| 51 |
+
# self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
|
| 52 |
+
# self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
|
| 53 |
+
|
| 54 |
+
# # train and eval mode data processing (fully labelled)
|
| 55 |
+
# if 'Y' in df.columns and df['Y'].notnull().all():
|
| 56 |
+
log.info(f"Processing data file: {data_path}")
|
| 57 |
+
|
| 58 |
+
# Forward-fill all non-label columns
|
| 59 |
+
df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
|
| 60 |
+
|
| 61 |
+
if 'Y' in df:
|
| 62 |
+
log.info(f"Performing pre-transformation target validation.")
|
| 63 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
| 64 |
+
match task:
|
| 65 |
+
case 'regression':
|
| 66 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
| 67 |
+
f"""`Y` must be numeric for `regression` task,
|
| 68 |
+
but it has {set(df['Y'].apply(type))}."""
|
| 69 |
+
|
| 70 |
+
case 'binary':
|
| 71 |
+
if all(df['Y'].isin([0, 1])):
|
| 72 |
+
assert not thresholds, \
|
| 73 |
+
f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
|
| 74 |
+
but still got `thresholds` {thresholds}.
|
| 75 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` column."""
|
| 76 |
+
else:
|
| 77 |
+
assert thresholds, \
|
| 78 |
+
f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
|
| 79 |
+
but it has {pd.unique(df['Y'])}.
|
| 80 |
+
You must set `thresholds` to discretize continuous labels."""
|
| 81 |
+
|
| 82 |
+
case 'multiclass':
|
| 83 |
+
assert n_class >= 3, f'`n_class` for `multiclass` (classification) `task` must be at least 3.'
|
| 84 |
+
|
| 85 |
+
if all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)):
|
| 86 |
+
assert not thresholds, \
|
| 87 |
+
f"""`Y` is already non-negative integers for
|
| 88 |
+
`multiclass` (classification) `task`, but still got `thresholds` {thresholds}.
|
| 89 |
+
Double check your choice of `task`, `thresholds` and records in the `Y` column."""
|
| 90 |
+
else:
|
| 91 |
+
assert thresholds, \
|
| 92 |
+
f"""`Y` must be non-negative integers for
|
| 93 |
+
`multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
|
| 94 |
+
You must set `thresholds` to discretize continuous labels."""
|
| 95 |
+
|
| 96 |
+
if 'U' in df.columns:
|
| 97 |
+
units = df['U']
|
| 98 |
+
else:
|
| 99 |
+
units = None
|
| 100 |
+
log.warning("Units ('U') not in the data table. "
|
| 101 |
+
"Assuming all labels to be discrete or in p-scale (-log10[M]).")
|
| 102 |
+
|
| 103 |
+
# Transform labels
|
| 104 |
+
df['Y'] = label_transform(labels=df['Y'], units=units, thresholds=thresholds,
|
| 105 |
+
discard_intermediate=discard_intermediate)
|
| 106 |
+
|
| 107 |
+
# Filter out rows with a NaN in Y (missing values)
|
| 108 |
+
df.dropna(subset=['Y'], inplace=True)
|
| 109 |
+
|
| 110 |
+
log.info(f"Performing post-transformation target validation.")
|
| 111 |
+
match task:
|
| 112 |
+
case 'regression':
|
| 113 |
+
df['Y'] = df['Y'].astype('float32')
|
| 114 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
| 115 |
+
f"""`Y` must be numeric for `regression` task,
|
| 116 |
+
but after transformation it still has {set(df['Y'].apply(type))}.
|
| 117 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 118 |
+
|
| 119 |
+
case 'binary':
|
| 120 |
+
df['Y'] = df['Y'].astype('int')
|
| 121 |
+
assert all(df['Y'].isin([0, 1])), \
|
| 122 |
+
f"""`Y` must be 0 or 1 for `binary` (classification) `task`, "
|
| 123 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
| 124 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 125 |
+
|
| 126 |
+
case 'multiclass':
|
| 127 |
+
df['Y'] = df['Y'].astype('int')
|
| 128 |
+
assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
|
| 129 |
+
f"""Y must be non-negative integers for task `multiclass` (classification)
|
| 130 |
+
but after transformation it still has {pd.unique(df['Y'])}.
|
| 131 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 132 |
+
|
| 133 |
+
target_n_unique = df['Y'].nunique()
|
| 134 |
+
assert target_n_unique == n_class, \
|
| 135 |
+
f"""You have set `n_class` for `multiclass` (classification) `task` to {n_class},
|
| 136 |
+
but after transformation Y still has {target_n_unique} unique labels.
|
| 137 |
+
Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
|
| 138 |
+
|
| 139 |
+
# Indexed protein/FASTA for retrieval metrics
|
| 140 |
+
df['IDX'] = LabelEncoder().fit_transform(df['X2'])
|
| 141 |
+
|
| 142 |
+
self.df = df
|
| 143 |
+
self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
|
| 144 |
+
self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
|
| 145 |
+
|
| 146 |
+
def __len__(self):
|
| 147 |
+
return len(self.df.index)
|
| 148 |
+
|
| 149 |
+
def __getitem__(self, i):
|
| 150 |
+
sample = self.df.loc[i]
|
| 151 |
+
return {
|
| 152 |
+
'N': i,
|
| 153 |
+
'X1': self.drug_featurizer(sample['X1']),
|
| 154 |
+
'ID1': sample.get('ID1', sample['X1']),
|
| 155 |
+
'X2': self.protein_featurizer(sample['X2']),
|
| 156 |
+
'ID2': sample.get('ID2', sample['X2']),
|
| 157 |
+
'Y': sample.get('Y'),
|
| 158 |
+
'IDX': sample['IDX'],
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
class DTIDataModule(LightningDataModule):
    """
    DTI DataModule

    A DataModule implements these key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            n_class: Optional[int],
            batch_size: int,
            # train: bool,
            drug_featurizer: callable,
            protein_featurizer: callable,
            collator: callable = collate_fn,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number | str]]] = None,
            split: Optional[callable] = None,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            discard_intermediate: Optional[bool] = False,
            num_workers: int = 0,
            pin_memory: bool = False,
    ):
        """Configure the data module and eagerly load any pre-split data files.

        Three argument combinations are accepted:

        * `data_file` + `split` + numeric `train_val_test_split`: the table is
          loaded and split later, in `setup`.
        * `train_val_test_split` as 2 or 3 file paths (no `data_file`/`split`):
          the train/val(/test) datasets are loaded here.
        * only `data_file`: it is loaded here as both test and predict data.

        Relative paths are resolved against `data_dir`.

        Raises:
            ValueError: For any other combination of these arguments.
        """
        super().__init__()

        self.train_data: Optional[Dataset] = None
        self.val_data: Optional[Dataset] = None
        self.test_data: Optional[Dataset] = None
        self.predict_data: Optional[Dataset] = None
        self.split = split
        self.collator = collator
        self.dataset = partial(
            DTIDataset,
            task=task,
            n_class=n_class,
            drug_featurizer=drug_featurizer,
            protein_featurizer=protein_featurizer,
            thresholds=thresholds,
            discard_intermediate=discard_intermediate
        )

        if train_val_test_split:
            # TODO test behavior for trainer.test and predict when this is passed
            if len(train_val_test_split) not in [2, 3]:
                raise ValueError('Length of `train_val_test_split` must be 2 (for training without testing) or 3.')
            if all([data_file, split]):
                if not all(isinstance(length, Number) for length in train_val_test_split):
                    raise ValueError('`train_val_test_split` must be a sequence numbers '
                                     '(float for percentages and int for sample numbers) '
                                     'if both `data_file` and `split` have been specified.')
            elif all(isinstance(path, str) for path in train_val_test_split) and not any([data_file, split]):
                split_paths = [self._in_data_dir(data_dir, path) for path in train_val_test_split]

                self.train_data = self.dataset(data_path=split_paths[0])
                self.val_data = self.dataset(data_path=split_paths[1])
                if len(train_val_test_split) == 3:
                    self.test_data = self.dataset(data_path=split_paths[2])
            else:
                raise ValueError('For training, you must specify either `data_file`, `split`, '
                                 'and `train_val_test_split` as a sequence of numbers or '
                                 'solely `train_val_test_split` as a sequence of data file paths.')

        elif data_file and not any([split, train_val_test_split]):
            self.test_data = self.predict_data = self.dataset(data_path=self._in_data_dir(data_dir, data_file))
        else:
            raise ValueError("For training, you must specify `train_val_test_split`. "
                             "For testing/predicting, you must specify only `data_file` without "
                             "`train_val_test_split` or `split`.")

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        # NOTE(review): save_hyperparameters() appears to read the init arguments
        # from this frame's local variables, so the parameters (`split`,
        # `data_file`, ...) are deliberately not rebound above - confirm against
        # the Lightning documentation.
        self.save_hyperparameters(logger=False)  # ignore=['split']

    @staticmethod
    def _in_data_dir(data_dir, file):
        """Return `file` as a `Path`, placed under `data_dir` unless it is already absolute."""
        file = Path(file)
        return file if file.is_absolute() else Path(data_dir, file)

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.train_data`, `self.val_data`, `self.test_data`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.

        `stage` and `encoding` are accepted but not used.
        """
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        # load and split datasets only if not loaded in initialization
        loaded = (self.train_data, self.val_data, self.test_data, self.predict_data)
        # Compare against None: the truthiness of a dataset depends on its
        # length, so an empty dataset would otherwise count as "not loaded".
        if all(data is None for data in loaded):
            subsets = self.split(
                dataset=self.dataset(data_path=Path(self.hparams.data_dir, self.hparams.data_file)),
                lengths=self.hparams.train_val_test_split
            )
            # `train_val_test_split` may hold only two entries (training without
            # testing), in which case no test subset is produced.
            self.train_data, self.val_data, *remaining = subsets
            self.test_data = remaining[0] if remaining else None

    def _dataloader(self, dataset, training=False):
        """Build a `DataLoader` over `dataset` that draws batches from a `SafeBatchSampler`.

        With `training=True` the sampler shuffles and drops the last batch;
        otherwise it does neither.
        """
        return DataLoader(
            dataset=dataset,
            batch_sampler=SafeBatchSampler(
                data_source=dataset,
                batch_size=self.hparams.batch_size,
                # Dropping the last batch prevents problems caused by variable batch sizes in training, e.g.,
                # batch_size=1 in BatchNorm, and shuffling ensures the model be trained on all samples over epochs.
                drop_last=training,
                shuffle=training,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=self.collator,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader (last batch dropped)."""
        return self._dataloader(self.train_data, training=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling, no batch dropped)."""
        return self._dataloader(self.val_data)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling, no batch dropped)."""
        return self._dataloader(self.test_data)

    def predict_dataloader(self):
        """Return the prediction dataloader (no shuffling, no batch dropped)."""
        return self._dataloader(self.predict_data)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/dti_datamodule.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from itertools import product
|
| 2 |
+
from collections import namedtuple
|
| 3 |
+
from numbers import Number
|
| 4 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
| 5 |
+
|
| 6 |
+
# import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from lightning import LightningDataModule
|
| 9 |
+
from torch.utils.data import Dataset, DataLoader, random_split
|
| 10 |
+
|
| 11 |
+
from deepscreen.data.utils.label import label_transform
|
| 12 |
+
from deepscreen.data.utils.collator import collate_fn
|
| 13 |
+
from deepscreen.data.utils.sampler import SafeBatchSampler
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class DTIDataset(Dataset):
|
| 17 |
+
def __init__(
|
| 18 |
+
self,
|
| 19 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
| 20 |
+
n_classes: Optional[int],
|
| 21 |
+
data_dir: str,
|
| 22 |
+
dataset_name: str,
|
| 23 |
+
drug_featurizer: callable,
|
| 24 |
+
protein_featurizer: callable,
|
| 25 |
+
thresholds: Optional[Union[Number, Sequence[Number]]] = None,
|
| 26 |
+
discard_intermediate: Optional[bool] = False,
|
| 27 |
+
):
|
| 28 |
+
df = pd.read_csv(
|
| 29 |
+
f'{data_dir}{dataset_name}.csv',
|
| 30 |
+
header=0, sep=',',
|
| 31 |
+
usecols=lambda x: x in ['X1', 'ID1', 'X2', 'ID2', 'Y', 'U'],
|
| 32 |
+
dtype={'X1': 'str', 'ID1': 'str',
|
| 33 |
+
'X2': 'str', 'ID2': 'str',
|
| 34 |
+
'Y': 'float32', 'U': 'str'}
|
| 35 |
+
)
|
| 36 |
+
# if 'ID1' in df:
|
| 37 |
+
# self.x1_to_id1 = dict(zip(df['X1'], df['ID1']))
|
| 38 |
+
# if 'ID2' in df:
|
| 39 |
+
# self.x2_to_id2 = dict(zip(df['X2'], df['ID2']))
|
| 40 |
+
# self.id2_to_indexes = dict(zip(df['ID2'], range(len(df['ID2']))))
|
| 41 |
+
# self.x2_to_indexes = dict(zip(df['X2'], range(len(df['X2']))))
|
| 42 |
+
|
| 43 |
+
# # train and eval mode data processing (fully labelled)
|
| 44 |
+
# if 'Y' in df.columns and df['Y'].notnull().all():
|
| 45 |
+
|
| 46 |
+
# Forward-fill all non-label columns
|
| 47 |
+
df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
|
| 48 |
+
|
| 49 |
+
if 'Y' in df:
|
| 50 |
+
# Transform labels
|
| 51 |
+
df['Y'] = df['Y'].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
|
| 52 |
+
discard_intermediate=discard_intermediate).astype('float32')
|
| 53 |
+
|
| 54 |
+
# Filter out rows with a NaN in Y (missing values)
|
| 55 |
+
df.dropna(subset=['Y'], inplace=True)
|
| 56 |
+
|
| 57 |
+
# Validate target labels for training/testing
|
| 58 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
| 59 |
+
match task:
|
| 60 |
+
case 'regression':
|
| 61 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
| 62 |
+
f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
|
| 63 |
+
case 'binary':
|
| 64 |
+
assert all(df['Y'].isin([0, 1])), \
|
| 65 |
+
f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
|
| 66 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
| 67 |
+
case 'multiclass':
|
| 68 |
+
assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
|
| 69 |
+
assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
|
| 70 |
+
f"Y for task `multiclass` (classification) must be non-negative integers, " \
|
| 71 |
+
f"but Y got {pd.unique(df['Y'])}." \
|
| 72 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
| 73 |
+
target_n_unique = df['Y'].nunique()
|
| 74 |
+
assert target_n_unique == n_classes, \
|
| 75 |
+
f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
|
| 76 |
+
f"but Y has {target_n_unique} unique labels."
|
| 77 |
+
|
| 78 |
+
# # Predict mode data processing
|
| 79 |
+
# else:
|
| 80 |
+
# df = pd.DataFrame(product(df['X1'].dropna(), df['X2'].dropna()), columns=['X1', 'X2'])
|
| 81 |
+
# if hasattr(self, "x1_to_id1"):
|
| 82 |
+
# df['ID1'] = df['X1'].map(self.x1_to_id1)
|
| 83 |
+
# if hasattr(self, "x1_to_id2"):
|
| 84 |
+
# df['ID2'] = df['X2'].map(self.x2_to_id2)
|
| 85 |
+
|
| 86 |
+
# self.smiles = df['X1']
|
| 87 |
+
# self.fasta = df['X2']
|
| 88 |
+
# self.smiles_ids = df.get('ID1', df['X1'])
|
| 89 |
+
# self.fasta_ids = df.get('ID2', df['X2'])
|
| 90 |
+
# self.labels = df.get('Y', None)
|
| 91 |
+
|
| 92 |
+
self.df = df
|
| 93 |
+
self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
|
| 94 |
+
self.protein_featurizer = protein_featurizer if protein_featurizer is not None else (lambda x: x)
|
| 95 |
+
self.n_classes = df['Y'].nunique()
|
| 96 |
+
# self.train = train
|
| 97 |
+
|
| 98 |
+
self.Data = namedtuple('Data', ['FT1', 'ID1', 'FT2', 'ID2', 'Y'])
|
| 99 |
+
|
| 100 |
+
def __len__(self):
|
| 101 |
+
return len(self.df.index)
|
| 102 |
+
|
| 103 |
+
def __getitem__(self, idx):
|
| 104 |
+
sample = self.df.loc[idx]
|
| 105 |
+
return self.Data(
|
| 106 |
+
FT1=self.drug_featurizer(sample['X1']),
|
| 107 |
+
ID1=sample.get('ID1', sample['X1']),
|
| 108 |
+
FT2=self.protein_featurizer(sample['X2']),
|
| 109 |
+
ID2=sample.get('ID2', sample['X2']),
|
| 110 |
+
Y=sample.get('Y')
|
| 111 |
+
)
|
| 112 |
+
# {
|
| 113 |
+
# 'FT1': self.drug_featurizer(sample['X1']),
|
| 114 |
+
# 'ID1': sample.get('ID1', sample['X1']),
|
| 115 |
+
# 'FT2': self.protein_featurizer(sample['X2']),
|
| 116 |
+
# 'ID2': sample.get('ID2', sample['X2']),
|
| 117 |
+
# 'Y': sample.get('Y')
|
| 118 |
+
# }
|
| 119 |
+
# if self.train:
|
| 120 |
+
# sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx]), self.labels[idx]
|
| 121 |
+
# sample = {
|
| 122 |
+
# 'FT1': self.drug_featurizer(self.smiles[idx]),
|
| 123 |
+
# 'FT2': self.protein_featurizer(self.fasta[idx]),
|
| 124 |
+
# 'ID2': self.smiles_ids[idx],
|
| 125 |
+
# }
|
| 126 |
+
# else:
|
| 127 |
+
# # sample = self.drug_featurizer(self.smiles[idx]), self.protein_featurizer(self.fasta[idx])
|
| 128 |
+
# sample = {
|
| 129 |
+
# 'FT1': self.drug_featurizer(self.smiles[idx]),
|
| 130 |
+
# 'FT2': self.protein_featurizer(self.fasta[idx]),
|
| 131 |
+
# }
|
| 132 |
+
#
|
| 133 |
+
# if all([True if n is not None else False for n in sample.values()]):
|
| 134 |
+
# return sample # | {
|
| 135 |
+
# # 'ID1': self.smiles_ids[idx],
|
| 136 |
+
# # 'X1': self.drug_featurizer(self.smiles[idx]),
|
| 137 |
+
# # 'ID2': self.fasta_ids[idx],
|
| 138 |
+
# # 'X2': self.protein_featurizer(self.fasta[idx]),
|
| 139 |
+
# # }
|
| 140 |
+
# else:
|
| 141 |
+
# return self.__getitem__(np.random.randint(0, self.size))
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class DTIdatamodule(LightningDataModule):
    """
    DTI DataModule

    A DataModule implements these key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
            self,
            task: Literal['regression', 'binary', 'multiclass'],
            n_classes: Optional[int],
            train: bool,
            drug_featurizer: callable,
            protein_featurizer: callable,
            batch_size: int,
            train_val_test_split: Optional[Sequence[Number]],
            num_workers: int = 0,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            pin_memory: bool = False,
            data_dir: str = "data/",
            dataset_name: Optional[str] = None,
            split: Optional[callable] = random_split,
    ):
        """Store the configuration; the data itself is loaded in `setup`."""
        super().__init__()

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        # data processing
        self.data_split = split

        self.data_train: Optional[Dataset] = None
        self.data_val: Optional[Dataset] = None
        self.data_test: Optional[Dataset] = None
        self.data_predict: Optional[Dataset] = None

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.

        When `train` is false, the whole table becomes both the test and the
        predict dataset. `stage` and `encoding` are accepted but not used.
        """
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        # load and split datasets only if not loaded in initialization
        loaded = (self.data_train, self.data_val, self.data_test, self.data_predict)
        # Compare against None: the truthiness of a dataset depends on its
        # length, so an empty dataset would otherwise count as "not loaded".
        if all(data is None for data in loaded):
            dataset = DTIDataset(
                task=self.hparams.task,
                n_classes=self.hparams.n_classes,
                data_dir=self.hparams.data_dir,
                drug_featurizer=self.hparams.drug_featurizer,
                protein_featurizer=self.hparams.protein_featurizer,
                dataset_name=self.hparams.dataset_name,
                thresholds=self.hparams.thresholds,
            )

            if self.hparams.train:
                self.data_train, self.data_val, self.data_test = self.data_split(
                    dataset=dataset,
                    lengths=self.hparams.train_val_test_split
                )
            else:
                self.data_test = self.data_predict = dataset

    def _dataloader(self, dataset, training=False):
        """Build a `DataLoader` over `dataset` that draws batches from a `SafeBatchSampler`.

        With `training=True` the sampler shuffles and drops the last batch;
        otherwise it does neither.
        """
        return DataLoader(
            dataset=dataset,
            batch_sampler=SafeBatchSampler(
                data_source=dataset,
                batch_size=self.hparams.batch_size,
                drop_last=training,
                shuffle=training,
            ),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader (last batch dropped)."""
        return self._dataloader(self.data_train, training=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling, no batch dropped)."""
        return self._dataloader(self.data_val)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling, no batch dropped)."""
        return self._dataloader(self.data_test)

    def predict_dataloader(self):
        """Return the prediction dataloader (no shuffling, no batch dropped)."""
        return self._dataloader(self.data_predict)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/entity_datamodule.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from numbers import Number
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Any, Dict, Optional, Sequence, Type
|
| 4 |
+
|
| 5 |
+
from lightning import LightningDataModule
|
| 6 |
+
from sklearn.base import TransformerMixin
|
| 7 |
+
from torch.utils.data import Dataset, DataLoader
|
| 8 |
+
|
| 9 |
+
from deepscreen.data.utils import collate_fn, SafeBatchSampler
|
| 10 |
+
from deepscreen.data.utils.dataset import BaseEntityDataset
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class EntityDataModule(LightningDataModule):
    """
    Lightning data module serving one entity dataset through train,
    validation, test and predict dataloaders.

    The constructor accepts exactly three configurations:

    * ``train=True`` with both ``data_file`` and ``split``:
      ``train_val_test_split`` must contain only numbers; the actual split is
      deferred to :meth:`setup`.
    * ``train=True`` with neither ``data_file`` nor ``split``:
      ``train_val_test_split`` must contain file names (indices 0, 1, 2 are
      used) located under ``data_dir``; they are loaded immediately as the
      train, validation and test datasets.
    * ``train=False``: only ``data_file`` may be given; it is loaded once and
      used as both the test and the predict dataset.

    Any other combination raises ``ValueError``.
    """
    def __init__(
            self,
            dataset: type[BaseEntityDataset],
            transformer: type[TransformerMixin],
            train: bool,
            batch_size: int,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            # `Optional[...]` accepts a single type, so the two alternatives
            # are combined into one union before being made optional.
            train_val_test_split: Optional[Sequence[Number] | Sequence[str]] = None,
            split: Optional[callable] = None,
            num_workers: int = 0,
            pin_memory: bool = False,
    ):
        super().__init__()

        # data processing
        self.split = split

        # Always define every dataset slot so `setup` and the dataloader
        # methods can inspect them regardless of the configuration chosen.
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.predict_data = None

        if train:
            if all([data_file, split]):
                # A missing split specification is reported as a ValueError
                # instead of failing while iterating over `None`.
                if train_val_test_split is None or not all(
                        isinstance(item, Number) for item in train_val_test_split):
                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
                                     '(float for percentages and int for sample numbers) if '
                                     '`data_file` and `split` have been specified.')
            elif (train_val_test_split is not None
                  and all(isinstance(item, str) for item in train_val_test_split)
                  and not any([data_file, split])):
                self.train_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
                self.val_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
                self.test_data = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
            else:
                raise ValueError('For training (train=True), you must specify either '
                                 '`data_file` and `split` with `train_val_test_split` of 3 numbers or '
                                 'solely `train_val_test_split` of 3 data file names.')
        else:
            if data_file and not any([split, train_val_test_split]):
                self.test_data = self.predict_data = dataset(dataset_path=str(Path(data_dir) / data_file))
            else:
                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`")

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load and split the dataset unless datasets were already loaded in `__init__`.

        Sets `self.train_data`, `self.val_data` and `self.test_data` from the
        result of `self.split`. `stage` and `encoding` are currently unused.
        """
        # TODO test SafeBatchSampler (which skips samples with any None without introducing variable batch size)
        # TODO: find a way to apply transformer.fit_transform only to train and transformer.transform only to val, test
        # load and split datasets only if not loaded in initialization
        loaded = (self.train_data, self.val_data, self.test_data, self.predict_data)
        if all(data is None for data in loaded):
            # NOTE(review): the dataset is built with `dataset_path`, the same
            # keyword `__init__` uses; confirm against BaseEntityDataset.
            full_dataset = self.hparams.dataset(
                dataset_path=str(Path(self.hparams.data_dir) / self.hparams.data_file)
            )
            self.train_data, self.val_data, self.test_data = self.split(
                dataset=full_dataset,
                lengths=self.hparams.train_val_test_split
            )

    def _build_dataloader(self, data, shuffle: bool):
        """Create a DataLoader over `data` batched by a SafeBatchSampler."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                shuffle=shuffle),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            # persistent workers are only meaningful when workers exist
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        """Return the shuffled DataLoader over `self.train_data`."""
        return self._build_dataloader(self.train_data, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled DataLoader over `self.val_data`."""
        return self._build_dataloader(self.val_data, shuffle=False)

    def test_dataloader(self):
        """Return the unshuffled DataLoader over `self.test_data`."""
        return self._build_dataloader(self.test_data, shuffle=False)

    def predict_dataloader(self):
        """Return the unshuffled DataLoader over `self.predict_data`."""
        return self._build_dataloader(self.predict_data, shuffle=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/featurizers/__init__.py
ADDED
|
File without changes
|
deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (191 Bytes). View file
|
|
|
deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc
ADDED
|
Binary file (5.6 kB). View file
|
|
|
deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc
ADDED
|
Binary file (14.9 kB). View file
|
|
|
deepscreen/data/featurizers/categorical.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
# Collections of KNOWN characters in SMILES and FASTA sequences.
# Tuples are used instead of sets to preserve character order, because the
# position of each character determines its index in the lookup dicts below.
SMILES_VOCAB = ('#', '%', ')', '(', '+', '-', '.', '1', '0', '3', '2', '5', '4',
                '7', '6', '9', '8', '=', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I',
                'H', 'K', 'M', 'L', 'O', 'N', 'P', 'S', 'R', 'U', 'T', 'W', 'V',
                'Y', '[', 'Z', ']', '_', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i',
                'h', 'm', 'l', 'o', 'n', 's', 'r', 'u', 't', 'y')
FASTA_VOCAB = ('A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'O',
               'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z')

# Check uniqueness, create character-index dicts, and add '?' for unknown characters as index 0.
# Known characters are numbered from 1 in vocabulary order.
assert len(SMILES_VOCAB) == len(set(SMILES_VOCAB)), 'SMILES_CHARSET has duplicate characters.'
SMILES_CHARSET_IDX = {character: index+1 for index, character in enumerate(SMILES_VOCAB)} | {'?': 0}

assert len(FASTA_VOCAB) == len(set(FASTA_VOCAB)), 'FASTA_CHARSET has duplicate characters.'
FASTA_CHARSET_IDX = {character: index+1 for index, character in enumerate(FASTA_VOCAB)} | {'?': 0}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def sequence_to_onehot(sequence: str, charset, max_sequence_length: int):
    """
    One-hot encode `sequence` against `charset`.

    Known characters get indices 1..len(charset) in charset order; index 0 is
    reserved for '?' and for any character not in `charset`. The sequence is
    truncated to `max_sequence_length`, and shorter sequences leave the
    remaining positions all-zero.

    Returns an integer array with one row per index and one column per position.
    """
    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
    lookup = {symbol: position for position, symbol in enumerate(charset, start=1)} | {'?': 0}

    encoded = np.zeros((max_sequence_length, len(lookup)), dtype=int)
    hot_columns = np.array(
        [lookup.get(symbol, 0) for symbol in sequence[:max_sequence_length]],
        dtype=np.intp,
    )
    # Set one cell per position: row = position in sequence, column = character index.
    encoded[np.arange(hot_columns.size), hot_columns] = 1

    return encoded.T
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def sequence_to_label(sequence: str, charset, max_sequence_length: int):
    """
    Encode `sequence` as a fixed-length vector of integer character indices.

    Known characters map to 1..len(charset) in charset order; '?' and any
    character not in `charset` map to 0. The sequence is truncated to
    `max_sequence_length`, and shorter sequences are right-padded with 0.
    """
    assert len(charset) == len(set(charset)), '`charset` contains duplicate characters.'
    lookup = {symbol: position for position, symbol in enumerate(charset, start=1)} | {'?': 0}

    codes = [lookup.get(symbol, 0) for symbol in sequence[:max_sequence_length]]
    encoded = np.zeros(max_sequence_length, dtype=int)
    encoded[:len(codes)] = codes

    return encoded
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def smiles_to_onehot(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
    """One-hot encode a SMILES string by delegating to `sequence_to_onehot`."""
    return sequence_to_onehot(
        sequence=smiles,
        charset=smiles_charset,
        max_sequence_length=max_sequence_length,
    )
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def smiles_to_label(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):
    """Label-encode a SMILES string by delegating to `sequence_to_label`."""
    return sequence_to_label(
        sequence=smiles,
        charset=smiles_charset,
        max_sequence_length=max_sequence_length,
    )
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def fasta_to_onehot(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
    """One-hot encode a FASTA sequence by delegating to `sequence_to_onehot`."""
    return sequence_to_onehot(
        sequence=fasta,
        charset=fasta_charset,
        max_sequence_length=max_sequence_length,
    )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def fasta_to_label(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):
    """Label-encode a FASTA sequence by delegating to `sequence_to_label`."""
    return sequence_to_label(
        sequence=fasta,
        charset=fasta_charset,
        max_sequence_length=max_sequence_length,
    )
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def one_of_k_encoding(x, allowable_set):
    """
    Return a list with one equality result per element of `allowable_set`.

    Raises an Exception when `x` is not a member of `allowable_set`.
    """
    if x in allowable_set:
        return [x == member for member in allowable_set]
    raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def one_of_k_encoding_unk(x, allowable_set):
    """
    Return a list with one equality result per element of `allowable_set`.

    Inputs that are not in `allowable_set` are treated as its last element.
    """
    target = x if x in allowable_set else allowable_set[-1]
    return [target == member for member in allowable_set]
|
deepscreen/data/featurizers/chem.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Mainly adapted from MolMap:
|
| 3 |
+
https://github.com/shenwanxiang/bidd-molmap/tree/master/molmap/feature/fingerprint
|
| 4 |
+
"""
|
| 5 |
+
import numpy as np
|
| 6 |
+
from rdkit import Chem, DataStructs
|
| 7 |
+
from rdkit.Chem import AllChem
|
| 8 |
+
from rdkit.Chem.Fingerprints import FingerprintMols
|
| 9 |
+
from rdkit.Chem.rdReducedGraphs import GetErGFingerprint
|
| 10 |
+
|
| 11 |
+
from deepscreen import get_logger
|
| 12 |
+
|
| 13 |
+
log = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def smiles_to_erg(smiles):
    """
    Convert a SMILES string to an ErG fingerprint cast to a boolean array.

    Returns None (after logging a warning) if parsing or fingerprinting raises.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        return np.array(GetErGFingerprint(molecule), dtype=bool)
    except Exception as err:
        log.warning(f"Failed to convert SMILES ({smiles}) to ErGFP due to {str(err)}")
        return None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def smiles_to_morgan(smiles, radius=2, n_bits=1024):
    """
    Convert a SMILES string to a Morgan bit-vector fingerprint as a NumPy array.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan fingerprint radius passed to RDKit.
        n_bits: Number of bits requested from RDKit.

    Returns:
        The fingerprint array, or None (after logging a warning) if parsing
        or fingerprinting raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        # Placeholder array that ConvertToNumpyArray fills with the bit values.
        features = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(features_vec, features)
        # Return the computed features; falling off the end would yield None
        # even for valid molecules.
        return features
    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to MorganFP due to {str(e)}")
        return None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def smiles_to_daylight(smiles):
    """
    Convert a SMILES string to a 2048-element 0/1 integer array built from
    the on-bits of RDKit's `FingerprintMols.FingerprintMol` fingerprint.

    If parsing or fingerprinting raises, a warning is logged and an all-zero
    array of the same length is returned instead.
    """
    num_finger = 2048
    try:
        mol = Chem.MolFromSmiles(smiles)
        bv = FingerprintMols.FingerprintMol(mol)
        features = np.zeros((num_finger,))
        # A plain list is a valid integer index even when it is empty, so a
        # molecule with no on-bits is not mistaken for a conversion failure.
        features[list(bv.GetOnBits())] = 1
    except Exception:
        log.warning(f'RDKit could not find this SMILES: {smiles} convert to all 0 features')
        features = np.zeros((num_finger,))
    return features.astype(int)
|
deepscreen/data/featurizers/fcs.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from importlib import resources
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from subword_nmt.apply_bpe import BPE
|
| 6 |
+
import codecs
|
| 7 |
+
|
| 8 |
+
# --- Protein vocabulary -----------------------------------------------------
# The BPE merge codes are read inside a `with` block so the file handle is
# closed once the BPE object has been constructed.
# NOTE(review): assumes BPE reads all codes in its constructor - confirm
# against the installed subword_nmt version.
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/protein_codes_uniprot.txt')
with codecs.open(vocab_path) as bpe_codes_protein:
    protein_bpe = BPE(bpe_codes_protein, merges=-1, separator='')

# Map every sub-word listed in the CSV's 'index' column to its row position.
sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_uniprot.csv')
sub_csv = pd.read_csv(sub_csv_path)
idx2word_protein = sub_csv['index'].values
words2idx_protein = dict(zip(idx2word_protein, range(0, len(idx2word_protein))))

# --- Drug vocabulary --------------------------------------------------------
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/drug_codes_chembl.txt')
with codecs.open(vocab_path) as bpe_codes_drug:
    drug_bpe = BPE(bpe_codes_drug, merges=-1, separator='')

sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_chembl.csv')
sub_csv = pd.read_csv(sub_csv_path)
idx2word_drug = sub_csv['index'].values
words2idx_drug = dict(zip(idx2word_drug, range(0, len(idx2word_drug))))
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def protein_to_embedding(x, max_sequence_length):
    """
    Tokenize a protein sequence with the protein BPE model and return
    fixed-length sub-word indices with a matching mask.

    Args:
        x: Sequence string passed to `protein_bpe.process_line`.
        max_sequence_length: Length of both returned arrays.

    Returns:
        A tuple `(indices, input_mask)`. `indices` is truncated or right-padded
        with 0 to `max_sequence_length`; `input_mask` holds 1 for real tokens
        and 0 for padding. If any sub-word is missing from the vocabulary, the
        whole sequence is encoded as the single index 0.
    """
    max_p = max_sequence_length
    tokens = protein_bpe.process_line(x).split()  # split into sub-words
    try:
        # An explicit integer dtype keeps the result integral even when the
        # token list is empty.
        indices = np.asarray([words2idx_protein[token] for token in tokens], dtype=int)
    except KeyError:
        # Unknown sub-word: fall back to a single 0 index.
        indices = np.array([0])

    length = len(indices)

    if length < max_p:
        i = np.pad(indices, (0, max_p - length), 'constant', constant_values=0)
        input_mask = ([1] * length) + ([0] * (max_p - length))
    else:
        i = indices[:max_p]
        input_mask = [1] * max_p

    return i, np.asarray(input_mask)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def drug_to_embedding(x, max_sequence_length):
    """
    Tokenize a drug string with the drug BPE model and return fixed-length
    sub-word indices with a matching mask.

    Args:
        x: String passed to `drug_bpe.process_line`.
        max_sequence_length: Length of both returned arrays.

    Returns:
        A tuple `(indices, input_mask)`. `indices` is truncated or right-padded
        with 0 to `max_sequence_length`; `input_mask` holds 1 for real tokens
        and 0 for padding. If any sub-word is missing from the vocabulary, the
        whole input is encoded as the single index 0.
    """
    max_d = max_sequence_length
    tokens = drug_bpe.process_line(x).split()  # split into sub-words
    try:
        # An explicit integer dtype keeps the result integral even when the
        # token list is empty.
        indices = np.asarray([words2idx_drug[token] for token in tokens], dtype=int)
    except KeyError:
        # Unknown sub-word: fall back to a single 0 index.
        indices = np.array([0])

    length = len(indices)

    if length < max_d:
        i = np.pad(indices, (0, max_d - length), 'constant', constant_values=0)
        input_mask = ([1] * length) + ([0] * (max_d - length))
    else:
        i = indices[:max_d]
        input_mask = [1] * max_d

    return i, np.asarray(input_mask)
|
deepscreen/data/featurizers/fingerprint/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
|
| 3 |
+
from .atompairs import GetAtomPairFPs
|
| 4 |
+
from .avalonfp import GetAvalonFPs
|
| 5 |
+
from .rdkitfp import GetRDkitFPs
|
| 6 |
+
from .morganfp import GetMorganFPs
|
| 7 |
+
from .estatefp import GetEstateFPs
|
| 8 |
+
from .maccskeys import GetMACCSFPs
|
| 9 |
+
from .pharmErGfp import GetPharmacoErGFPs
|
| 10 |
+
from .pharmPointfp import GetPharmacoPFPs
|
| 11 |
+
from .pubchemfp import GetPubChemFPs
|
| 12 |
+
from .torsions import GetTorsionFPs
|
| 13 |
+
from .mhfp6 import GetMHFP6
|
| 14 |
+
# from .map4 import GetMAP4
|
| 15 |
+
from rdkit import Chem
|
| 16 |
+
|
| 17 |
+
from deepscreen import get_logger
|
| 18 |
+
|
| 19 |
+
log = get_logger(__name__)
|
| 20 |
+
|
| 21 |
+
# Registry mapping a fingerprint name to the function that computes it.
# `smiles_to_fingerprint` looks names up here; MAP4 is disabled together with
# its import above.
FP_MAP = {
    'MorganFP': GetMorganFPs,
    'RDkitFP': GetRDkitFPs,
    'AtomPairFP': GetAtomPairFPs,
    'TorsionFP': GetTorsionFPs,
    'AvalonFP': GetAvalonFPs,
    'EstateFP': GetEstateFPs,
    'MACCSFP': GetMACCSFPs,
    'PharmacoErGFP': GetPharmacoErGFPs,
    'PharmacoPFP': GetPharmacoPFPs,
    'PubChemFP': GetPubChemFPs,
    'MHFP6': GetMHFP6,
    # 'MAP4': GetMAP4,
}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def smiles_to_fingerprint(smiles, fingerprint: Literal[tuple(FP_MAP.keys())], **kwargs):
    """
    Compute the fingerprint named `fingerprint` for a SMILES string.

    Extra keyword arguments are forwarded to the fingerprint function from
    `FP_MAP`. An unknown name raises KeyError; any error while parsing or
    fingerprinting is logged as a warning and None is returned.
    """
    featurize = FP_MAP[fingerprint]
    try:
        molecule = Chem.MolFromSmiles(smiles)
        return featurize(molecule, **kwargs)
    except Exception as err:
        log.warning(f"Failed to convert SMILES ({smiles}) to {fingerprint} due to {str(err)}")
        return None
|
deepscreen/data/featurizers/fingerprint/atompairs.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rdkit.Chem.AtomPairs import Pairs
|
| 2 |
+
from rdkit.Chem import DataStructs
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
_type = 'topological-based'
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def GetAtomPairFPs(mol, nBits=2048, binary=True):
    """
    Hashed atom-pair fingerprint of `mol` as a NumPy array.

    The array dtype is bool when `binary` is truthy and int8 otherwise.
    """
    element_type = np.bool_ if binary else np.int8
    fingerprint = Pairs.GetHashedAtomPairFingerprint(mol, nBits=nBits)
    # Placeholder array that ConvertToNumpyArray fills with the fingerprint.
    values = np.zeros((0,), dtype=element_type)
    DataStructs.ConvertToNumpyArray(fingerprint, values)
    return values
|
deepscreen/data/featurizers/fingerprint/avalonfp.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rdkit.Chem import DataStructs
|
| 2 |
+
from rdkit.Avalon.pyAvalonTools import GetAvalonFP as GAFP
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
_type = 'topological-based'
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def GetAvalonFPs(mol, nBits=2048):
    """
    Avalon fingerprint of `mol` as a boolean NumPy array.

    Reference: https://pubs.acs.org/doi/pdf/10.1021/ci050413p
    """
    fingerprint = GAFP(mol, nBits=nBits)
    # Placeholder array that ConvertToNumpyArray fills with the fingerprint.
    values = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(fingerprint, values)
    return values
|
deepscreen/data/featurizers/fingerprint/estatefp.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rdkit.Chem.EState import Fingerprinter
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
_type = 'Estate-based'
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def GetEstateFPs(mol):
    """
    EState fingerprint of `mol` cast to booleans (79 bits per the original note).

    Uses the first element returned by `Fingerprinter.FingerprintMol`.
    """
    estate_result = Fingerprinter.FingerprintMol(mol)
    return estate_result[0].astype(np.bool_)
|
deepscreen/data/featurizers/fingerprint/maccskeys.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rdkit.Chem import AllChem
|
| 2 |
+
from rdkit.Chem import DataStructs
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
_type = 'SMARTS-based'
|
| 8 |
+
|
| 9 |
+
file_path = os.path.dirname(__file__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def GetMACCSFPs(mol):
    """MACCS keys fingerprint of `mol` as a boolean NumPy array (166 bits per the original note)."""
    fingerprint = AllChem.GetMACCSKeysFingerprint(mol)
    # Placeholder array that ConvertToNumpyArray fills with the fingerprint.
    values = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(fingerprint, values)
    return values
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def GetMACCSFPInfos():
    """Load `maccskeys.xlsx` from this module's directory as a DataFrame."""
    spreadsheet_path = os.path.join(file_path, 'maccskeys.xlsx')
    return pd.read_excel(spreadsheet_path)
|
deepscreen/data/featurizers/fingerprint/maccskeys.xlsx
ADDED
|
Binary file (14 kB). View file
|
|
|
deepscreen/data/featurizers/fingerprint/map4.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MinHashed Atom-pair Fingerprint, MAP
|
| 3 |
+
Original paper: Capecchi, Alice, Daniel Probst, and Jean-Louis Reymond. "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome." Journal of Cheminformatics 12.1 (2020): 1-15. Original code: https://github.com/reymond-group/map4 (thanks to the authors for their original work).
|
| 4 |
+
|
| 5 |
+
A small bug is fixed: https://github.com/reymond-group/map4/issues/6
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
_type = 'topological-based'
|
| 9 |
+
|
| 10 |
+
import itertools
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
|
| 13 |
+
import tmap as tm
|
| 14 |
+
from mhfp.encoder import MHFPEncoder
|
| 15 |
+
from rdkit import Chem
|
| 16 |
+
from rdkit.Chem import rdmolops
|
| 17 |
+
from rdkit.Chem.rdmolops import GetDistanceMatrix
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def to_smiles(mol):
    """Return the canonical, non-isomeric SMILES string of `mol`."""
    smiles_options = {'canonical': True, 'isomericSmiles': False}
    return Chem.MolToSmiles(mol, **smiles_options)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class MAP4Calculator:
    """MinHashed atom-pair (MAP4) fingerprint calculator."""

    def __init__(self, dimensions=2048, radius=2, is_counted=False, is_folded=False, fold_dimensions=2048):
        """
        Store the settings and pick the encoder.

        A folded fingerprint uses `MHFPEncoder`; otherwise `tm.Minhash` is used.
        """
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded
        self.fold_dimensions = fold_dimensions
        self.encoder = MHFPEncoder(dimensions) if self.is_folded else tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object
        Returns:
            the folded fingerprint when `is_folded` is set, otherwise the
            encoder's minhash of the atom-pair shingles
        """
        shingles = self._calculate(mol)
        if not self.is_folded:
            return self.encoder.from_string_array(shingles)
        return self._fold(shingles)

    def calculate_many(self, mols):
        """Calculates the atom pair minhashed fingerprint for several molecules

        Arguments:
            mols -- list of mols
        Returns:
            a list of folded fingerprints when `is_folded` is set, otherwise
            the encoder's batch minhash result
        """
        shingles_per_mol = []
        for mol in mols:
            shingles_per_mol.append(self._calculate(mol))
        if not self.is_folded:
            return self.encoder.batch_from_string_array(shingles_per_mol)
        return [self._fold(shingles) for shingles in shingles_per_mol]

    def _calculate(self, mol):
        """Build the de-duplicated atom-pair shingles of `mol`."""
        environments = self._get_atom_envs(mol)
        return self._all_pairs(mol, environments)

    def _fold(self, pairs):
        """Hash the distinct shingles and fold the hash to `fold_dimensions`."""
        hashed = self.encoder.hash(set(pairs))
        return self.encoder.fold(hashed, self.fold_dimensions)

    def _get_atom_envs(self, mol):
        """Map each atom index to its environment SMILES for radii 1..self.radius."""
        environments = {}
        radii = range(1, self.radius + 1)
        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            for current_radius in radii:
                # The entry is created lazily, so no key exists when there are no radii.
                environments.setdefault(atom_idx, []).append(
                    MAP4Calculator._find_env(mol, atom_idx, current_radius))
        return environments

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment around atom `idx`, or '' if the atom is not in it."""
        bond_path = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        index_map = {}

        fragment = Chem.PathToSubmol(mol, bond_path, atomMap=index_map)
        if idx not in index_map:
            return ''
        return Chem.MolToSmiles(fragment, rootedAtAtom=index_map[idx], canonical=True, isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """
        Build UTF-8 encoded shingles ``env|distance|env`` for every atom pair
        and every radius, and return them with duplicates removed.
        """
        shingles = []
        distances = GetDistanceMatrix(mol)
        atom_count = mol.GetNumAtoms()
        occurrences = defaultdict(int)
        for first, second in itertools.combinations(range(atom_count), 2):
            distance_text = str(int(distances[first][second]))

            for level in range(self.radius):
                # The two environments are sorted so a pair yields the same
                # shingle regardless of atom order.
                low, high = sorted([atoms_env[first][level], atoms_env[second][level]])

                shingle = '{}|{}|{}'.format(low, distance_text, high)

                if self.is_counted:
                    # Append the running count so repeated shingles stay distinct.
                    occurrences[shingle] += 1
                    shingle += '|' + str(occurrences[shingle])

                shingles.append(shingle.encode('utf-8'))
        return list(set(shingles))
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def GetMAP4(mol, nBits=2048, radius=2, fold_dimensions=None):
    """
    Folded MAP4 fingerprint of `mol` cast to booleans.

    Args:
        mol: RDKit molecule.
        nBits: Dimensions passed to `MAP4Calculator`.
        radius: Maximum environment radius (2 for MAP4).
        fold_dimensions: Size the fingerprint is folded to; defaults to `nBits`.
    """
    # Identity comparison: `== None` can be overridden by the argument's type.
    if fold_dimensions is None:
        fold_dimensions = nBits

    calc = MAP4Calculator(dimensions=nBits, radius=radius, is_counted=False, is_folded=True,
                          fold_dimensions=fold_dimensions)

    arr = calc.calculate(mol)

    return arr.astype(bool)
|
deepscreen/data/featurizers/fingerprint/mhfp6.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Probst, Daniel, and Jean-Louis Reymond. "A probabilistic molecular fingerprint for big data settings." Journal of Cheminformatics 10.1 (2018): 66.
|
| 3 |
+
|
| 4 |
+
Original code: https://github.com/reymond-group/mhfp
|
| 5 |
+
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from mhfp.encoder import MHFPEncoder
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def GetMHFP6(mol, nBits=2048, radius=3):
    """
    Folded MHFP fingerprint of `mol` cast to booleans (MHFP6 uses radius=3).

    `nBits` is used both as the encoder's permutation count and as the fold length.
    """
    mhfp_encoder = MHFPEncoder(n_permutations=nBits)
    minhash = mhfp_encoder.encode_mol(mol, radius=radius, rings=True, kekulize=True, min_radius=1)
    folded = mhfp_encoder.fold(minhash, nBits)
    return folded.astype(bool)
|
deepscreen/data/featurizers/fingerprint/mnimalfatures.fdef
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
AtomType NDonor [N&!H0&v3,N&!H0&+1&v4,n&H1&+0]
|
| 2 |
+
AtomType ChalcDonor [O,S;H1;+0]
|
| 3 |
+
DefineFeature SingleAtomDonor [{NDonor},{ChalcDonor},!$([D1]-[C;D3]=[O,S,N])]
|
| 4 |
+
Family Donor
|
| 5 |
+
Weights 1
|
| 6 |
+
EndFeature
|
| 7 |
+
|
| 8 |
+
AtomType NAcceptor [$([N&v3;H1,H2]-[!$(*=[O,N,P,S])])]
|
| 9 |
+
Atomtype NAcceptor [$([N;v3;H0])]
|
| 10 |
+
AtomType NAcceptor [$([n;+0])]
|
| 11 |
+
AtomType ChalcAcceptor [$([O,S;H1;v2]-[!$(*=[O,N,P,S])])]
|
| 12 |
+
AtomType ChalcAcceptor [O,S;H0;v2]
|
| 13 |
+
Atomtype ChalcAcceptor [O,S;-]
|
| 14 |
+
Atomtype ChalcAcceptor [o,s;+0]
|
| 15 |
+
AtomType HalogenAcceptor [F]
|
| 16 |
+
DefineFeature SingleAtomAcceptor [{NAcceptor},{ChalcAcceptor},{HalogenAcceptor}]
|
| 17 |
+
Family Acceptor
|
| 18 |
+
Weights 1
|
| 19 |
+
EndFeature
|
| 20 |
+
|
| 21 |
+
# this one is delightfully easy:
|
| 22 |
+
DefineFeature AcidicGroup [C,S](=[O,S,P])-[O;H1,H0&-1]
|
| 23 |
+
Family NegIonizable
|
| 24 |
+
Weights 1.0,1.0,1.0
|
| 25 |
+
EndFeature
|
| 26 |
+
|
| 27 |
+
AtomType CarbonOrArom_NonCarbonyl [$([C,a]);!$([C,a](=O))]
|
| 28 |
+
AtomType BasicNH2 [$([N;H2&+0][{CarbonOrArom_NonCarbonyl}])]
|
| 29 |
+
AtomType BasicNH1 [$([N;H1&+0]([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
|
| 30 |
+
AtomType BasicNH0 [$([N;H0&+0]([{CarbonOrArom_NonCarbonyl}])([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
|
| 31 |
+
AtomType BasicNakedN [N,n;X2;+0]
|
| 32 |
+
DefineFeature BasicGroup [{BasicNH2},{BasicNH1},{BasicNH0},{BasicNakedN}]
|
| 33 |
+
Family PosIonizable
|
| 34 |
+
Weights 1.0
|
| 35 |
+
EndFeature
|
| 36 |
+
|
| 37 |
+
# aromatic rings of various sizes:
|
| 38 |
+
DefineFeature Arom5 a1aaaa1
|
| 39 |
+
Family Aromatic
|
| 40 |
+
Weights 1.0,1.0,1.0,1.0,1.0
|
| 41 |
+
EndFeature
|
| 42 |
+
DefineFeature Arom6 a1aaaaa1
|
| 43 |
+
Family Aromatic
|
| 44 |
+
Weights 1.0,1.0,1.0,1.0,1.0,1.0
|
| 45 |
+
EndFeature
|
| 46 |
+
DefineFeature Arom7 a1aaaaaa1
|
| 47 |
+
Family Aromatic
|
| 48 |
+
Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0
|
| 49 |
+
EndFeature
|
| 50 |
+
DefineFeature Arom8 a1aaaaaaa1
|
| 51 |
+
Family Aromatic
|
| 52 |
+
Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
|
| 53 |
+
EndFeature
|
deepscreen/data/featurizers/fingerprint/morganfp.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rdkit.Chem import AllChem
|
| 2 |
+
from rdkit.Chem import DataStructs
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def GetMorganFPs(mol, nBits=2048, radius=2, return_bitInfo=False):
    """
    ECFP4: radius=2

    Morgan bit-vector fingerprint of ``mol`` as a numpy boolean array.

    With ``return_bitInfo=True`` the result is ``(array, bitInfo)``, where
    ``bitInfo`` is the dict passed to RDKit while the fingerprint is
    generated; otherwise only the array is returned.
    """
    bit_details = {}
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=radius, bitInfo=bit_details, nBits=nBits)

    # The zero-length array is only a typed target for ConvertToNumpyArray
    # (RDKit is expected to resize it to the fingerprint length -- confirm).
    bits = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(bit_vect, bits)

    return (bits, bit_details) if return_bitInfo else bits
|
deepscreen/data/featurizers/fingerprint/pharmErGfp.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Created on Sat Aug 17 16:54:12 2019
|
| 5 |
+
|
| 6 |
+
@author: [email protected]
|
| 7 |
+
|
| 8 |
+
@calculate ErG fps, more info: https://pubs.acs.org/doi/full/10.1021/ci050457y#
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
# Fingerprint family label for this module. Not referenced anywhere else in
# this file (NOTE(review): presumably read by whatever collects the
# fingerprint modules -- confirm).
_type = 'Pharmacophore-based'
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
from rdkit.Chem import AllChem
|
| 15 |
+
|
| 16 |
+
## get info from : https://github.com/rdkit/rdkit/blob/d41752d558bf7200ab67b98cdd9e37f1bdd378de/Code/GraphMol/ReducedGraphs/ReducedGraphs.cpp
# SMARTS patterns per pharmacophore property. Within this module the six
# lists below are not referenced again; only PROPERTY_KEY is used (to label
# bits in GetPharmacoErGFPs).
Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]

Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
            "[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]

Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
            "[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
            "[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]

Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]

Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]

Aromatic = ["a"]

# Property names in the order used to enumerate (property, property, path)
# triplets for the bit descriptions.
PROPERTY_KEY = ["Donor", "Acceptor", "Positive", "Negative", "Hydrophobic", "Aromatic"]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def GetPharmacoErGFPs(mol, fuzzIncrement=0.3, maxPath=21, binary=True, return_bitInfo=False):
    '''
    https://pubs.acs.org/doi/full/10.1021/ci050457y#
    return maxPath*21 bits

    size(v) = (n(n + 1)/2) * (maxDist - minDist + 1)

    The ErG fingerprint is computed with minPath fixed to 1 and cast to
    float32; with ``binary=True`` it is then cast to bool.

    With ``return_bitInfo=True`` the result is ``(arr, bitInfo)`` where
    ``bitInfo`` lists one ``(property_a, property_b, path)`` triplet for
    every unordered pair of PROPERTY_KEY entries and every path length from
    minPath to maxPath inclusive.
    '''
    minPath = 1

    fp = AllChem.GetErGFingerprint(
        mol, fuzzIncrement=fuzzIncrement, maxPath=maxPath, minPath=minPath
    ).astype(np.float32)
    if binary:
        fp = fp.astype(np.bool_)

    if not return_bitInfo:
        return fp

    # Same enumeration order as nested loops: first property, then second
    # property (starting at the first), then path length.
    n_props = len(PROPERTY_KEY)
    bitInfo = [
        (PROPERTY_KEY[a], PROPERTY_KEY[b], dist)
        for a in range(n_props)
        for b in range(a, n_props)
        for dist in range(minPath, maxPath + 1)
    ]
    return fp, bitInfo
|
deepscreen/data/featurizers/fingerprint/pharmPointfp.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Created on Sat Aug 17 16:54:12 2019
|
| 5 |
+
|
| 6 |
+
@author: [email protected]
|
| 7 |
+
|
| 8 |
+
Combining a set of chemical features with the 2D (topological) distances between them gives a 2D pharmacophore. When the distances are binned, unique integer ids can be assigned to each of these pharmacophores and they can be stored in a fingerprint. Details of the encoding are in: https://www.rdkit.org/docs/RDKit_Book.html#ph4-figure
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
# Fingerprint family label for this module. Not referenced anywhere else in
# this file (NOTE(review): presumably read by whatever collects the
# fingerprint modules -- confirm).
_type = 'Pharmacophore-based'
|
| 12 |
+
|
| 13 |
+
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
|
| 14 |
+
from rdkit.Chem.Pharm2D import Generate
|
| 15 |
+
from rdkit.Chem import DataStructs
|
| 16 |
+
from rdkit.Chem import ChemicalFeatures
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
import os
|
| 20 |
+
|
| 21 |
+
# Path of the feature-definition file that sits next to this module
# (the file name really is spelled 'mnimalfatures.fdef' on disk).
fdef = os.path.join(os.path.dirname(__file__), 'mnimalfatures.fdef')
# Built once at import time and reused by every GetPharmacoPFPs call.
featFactory = ChemicalFeatures.BuildFeatureFactory(fdef)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def GetPharmacoPFPs(mol,
                    bins=None,
                    minPointCount=2,
                    maxPointCount=2,
                    return_bitInfo=False):
    '''
    2D pharmacophore fingerprint of ``mol`` as a boolean numpy array.

    Note: maxPointCount=3 is slow; use
    bins = [(i, i + 1) for i in range(20)] and
    maxPointCount=2 for large-scale computation.

    bins: list of (low, high) distance bins given to SigFactory.SetBins;
          None (the default) means [(i, i + 1) for i in range(20)].
    minPointCount, maxPointCount: forwarded to SigFactory.
    return_bitInfo: when True, return (arr, description) where description
          holds SigFactory.GetBitDescription(i) for every bit index i;
          otherwise return arr only.
    '''
    # Create the default bins per call rather than sharing one mutable list
    # object (a default evaluated once at definition time) across all calls.
    if bins is None:
        bins = [(i, i + 1) for i in range(20)]

    MysigFactory = SigFactory(featFactory,
                              trianglePruneBins=False,
                              minPointCount=minPointCount,
                              maxPointCount=maxPointCount)
    MysigFactory.SetBins(bins)
    MysigFactory.Init()

    res = Generate.Gen2DFingerprint(mol, MysigFactory)
    arr = np.array(list(res)).astype(np.bool_)
    if return_bitInfo:
        description = [MysigFactory.GetBitDescription(i) for i in range(len(res))]
        return arr, description

    return arr
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Manual smoke run: fingerprint one example SMILES with the default settings
# spelled out explicitly. The result is assigned but not printed or checked.
if __name__ == '__main__':
    from rdkit import Chem

    mol = Chem.MolFromSmiles('CC#CC(=O)NC1=NC=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl')
    a = GetPharmacoPFPs(mol, bins=[(i, i + 1) for i in range(20)], minPointCount=2, maxPointCount=2)
|
deepscreen/data/featurizers/fingerprint/pubchemfp.py
ADDED
|
@@ -0,0 +1,1731 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Created on Sun Aug 25 20:29:36 2019
|
| 5 |
+
|
| 6 |
+
@author: charleshen
|
| 7 |
+
|
| 8 |
+
@Note: The code is copied from PyBioMed, with a minor repair
|
| 9 |
+
|
| 10 |
+
https://www.ncbi.nlm.nih.gov/pubmed/29556758
|
| 11 |
+
|
| 12 |
+
these are SMARTS patterns corresponding to the PubChem fingerprints
|
| 13 |
+
https://astro.temple.edu/~tua87106/list_fingerprints.pdf
|
| 14 |
+
ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
|
| 15 |
+
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
# Fingerprint family label for this module (NOTE(review): where it is read
# is not visible from here -- confirm).
_type = 'SMARTS-based'
|
| 19 |
+
|
| 20 |
+
import numpy as np
|
| 21 |
+
from rdkit import Chem
|
| 22 |
+
from rdkit import DataStructs
|
| 23 |
+
import os
|
| 24 |
+
import pandas as pd
|
| 25 |
+
|
| 26 |
+
smartsPatts = {
|
| 27 |
+
1: ('[H]', 3), # 1-115
|
| 28 |
+
2: ('[H]', 7),
|
| 29 |
+
3: ('[H]', 15),
|
| 30 |
+
4: ('[H]', 31),
|
| 31 |
+
5: ('[Li]', 0),
|
| 32 |
+
6: ('[Li]', 1),
|
| 33 |
+
7: ('[B]', 0),
|
| 34 |
+
8: ('[B]', 1),
|
| 35 |
+
9: ('[B]', 3),
|
| 36 |
+
10: ('[C]', 1),
|
| 37 |
+
11: ('[C]', 3),
|
| 38 |
+
12: ('[C]', 7),
|
| 39 |
+
13: ('[C]', 15),
|
| 40 |
+
14: ('[C]', 31),
|
| 41 |
+
15: ('[N]', 0),
|
| 42 |
+
16: ('[N]', 1),
|
| 43 |
+
17: ('[N]', 3),
|
| 44 |
+
18: ('[N]', 7),
|
| 45 |
+
19: ('[O]', 0),
|
| 46 |
+
20: ('[O]', 1),
|
| 47 |
+
21: ('[O]', 3),
|
| 48 |
+
22: ('[O]', 7),
|
| 49 |
+
23: ('[O]', 15),
|
| 50 |
+
24: ('[F]', 0),
|
| 51 |
+
25: ('[F]', 1),
|
| 52 |
+
26: ('[F]', 3),
|
| 53 |
+
27: ('[Na]', 0),
|
| 54 |
+
28: ('[Na]', 1),
|
| 55 |
+
29: ('[Si]', 0),
|
| 56 |
+
30: ('[Si]', 1),
|
| 57 |
+
31: ('[P]', 0),
|
| 58 |
+
32: ('[P]', 1),
|
| 59 |
+
33: ('[P]', 3),
|
| 60 |
+
34: ('[S]', 0),
|
| 61 |
+
35: ('[S]', 1),
|
| 62 |
+
36: ('[S]', 3),
|
| 63 |
+
37: ('[S]', 7),
|
| 64 |
+
38: ('[Cl]', 0),
|
| 65 |
+
39: ('[Cl]', 1),
|
| 66 |
+
40: ('[Cl]', 3),
|
| 67 |
+
41: ('[Cl]', 7),
|
| 68 |
+
42: ('[K]', 0),
|
| 69 |
+
43: ('[K]', 1),
|
| 70 |
+
44: ('[Br]', 0),
|
| 71 |
+
45: ('[Br]', 1),
|
| 72 |
+
46: ('[Br]', 3),
|
| 73 |
+
47: ('[I]', 0),
|
| 74 |
+
48: ('[I]', 1),
|
| 75 |
+
49: ('[I]', 3),
|
| 76 |
+
50: ('[Be]', 0),
|
| 77 |
+
51: ('[Mg]', 0),
|
| 78 |
+
52: ('[Al]', 0),
|
| 79 |
+
53: ('[Ca]', 0),
|
| 80 |
+
54: ('[Sc]', 0),
|
| 81 |
+
55: ('[Ti]', 0),
|
| 82 |
+
56: ('[V]', 0),
|
| 83 |
+
57: ('[Cr]', 0),
|
| 84 |
+
58: ('[Mn]', 0),
|
| 85 |
+
59: ('[Fe]', 0),
|
| 86 |
+
60: ('[CO]', 0),
|
| 87 |
+
61: ('[Ni]', 0),
|
| 88 |
+
62: ('[Cu]', 0),
|
| 89 |
+
63: ('[Zn]', 0),
|
| 90 |
+
64: ('[Ga]', 0),
|
| 91 |
+
65: ('[Ge]', 0),
|
| 92 |
+
66: ('[As]', 0),
|
| 93 |
+
67: ('[Se]', 0),
|
| 94 |
+
68: ('[Kr]', 0),
|
| 95 |
+
69: ('[Rb]', 0),
|
| 96 |
+
70: ('[Sr]', 0),
|
| 97 |
+
71: ('[Y]', 0),
|
| 98 |
+
72: ('[Zr]', 0),
|
| 99 |
+
73: ('[Nb]', 0),
|
| 100 |
+
74: ('[Mo]', 0),
|
| 101 |
+
75: ('[Ru]', 0),
|
| 102 |
+
76: ('[Rh]', 0),
|
| 103 |
+
77: ('[Pd]', 0),
|
| 104 |
+
78: ('[Ag]', 0),
|
| 105 |
+
79: ('[Cd]', 0),
|
| 106 |
+
80: ('[In]', 0),
|
| 107 |
+
81: ('[Sn]', 0),
|
| 108 |
+
82: ('[Sb]', 0),
|
| 109 |
+
83: ('[Te]', 0),
|
| 110 |
+
84: ('[Xe]', 0),
|
| 111 |
+
85: ('[Cs]', 0),
|
| 112 |
+
86: ('[Ba]', 0),
|
| 113 |
+
87: ('[Lu]', 0),
|
| 114 |
+
88: ('[Hf]', 0),
|
| 115 |
+
89: ('[Ta]', 0),
|
| 116 |
+
90: ('[W]', 0),
|
| 117 |
+
91: ('[Re]', 0),
|
| 118 |
+
92: ('[Os]', 0),
|
| 119 |
+
93: ('[Ir]', 0),
|
| 120 |
+
94: ('[Pt]', 0),
|
| 121 |
+
95: ('[Au]', 0),
|
| 122 |
+
96: ('[Hg]', 0),
|
| 123 |
+
97: ('[Tl]', 0),
|
| 124 |
+
98: ('[Pb]', 0),
|
| 125 |
+
99: ('[Bi]', 0),
|
| 126 |
+
100: ('[La]', 0),
|
| 127 |
+
101: ('[Ce]', 0),
|
| 128 |
+
102: ('[Pr]', 0),
|
| 129 |
+
103: ('[Nd]', 0),
|
| 130 |
+
104: ('[Pm]', 0),
|
| 131 |
+
105: ('[Sm]', 0),
|
| 132 |
+
106: ('[Eu]', 0),
|
| 133 |
+
107: ('[Gd]', 0),
|
| 134 |
+
108: ('[Tb]', 0),
|
| 135 |
+
109: ('[Dy]', 0),
|
| 136 |
+
110: ('[Ho]', 0),
|
| 137 |
+
111: ('[Er]', 0),
|
| 138 |
+
112: ('[Tm]', 0),
|
| 139 |
+
113: ('[Yb]', 0),
|
| 140 |
+
114: ('[Tc]', 0),
|
| 141 |
+
115: ('[U]', 0),
|
| 142 |
+
116: ('[Li&!H0]', 0), # 264-881
|
| 143 |
+
117: ('[Li]~[Li]', 0),
|
| 144 |
+
118: ('[Li]~[#5]', 0),
|
| 145 |
+
119: ('[Li]~[#6]', 0),
|
| 146 |
+
120: ('[Li]~[#8]', 0),
|
| 147 |
+
121: ('[Li]~[F]', 0),
|
| 148 |
+
122: ('[Li]~[#15]', 0),
|
| 149 |
+
123: ('[Li]~[#16]', 0),
|
| 150 |
+
124: ('[Li]~[Cl]', 0),
|
| 151 |
+
125: ('[#5&!H0]', 0),
|
| 152 |
+
126: ('[#5]~[#5]', 0),
|
| 153 |
+
127: ('[#5]~[#6]', 0),
|
| 154 |
+
128: ('[#5]~[#7]', 0),
|
| 155 |
+
129: ('[#5]~[#8]', 0),
|
| 156 |
+
130: ('[#5]~[F]', 0),
|
| 157 |
+
131: ('[#5]~[#14]', 0),
|
| 158 |
+
132: ('[#5]~[#15]', 0),
|
| 159 |
+
133: ('[#5]~[#16]', 0),
|
| 160 |
+
134: ('[#5]~[Cl]', 0),
|
| 161 |
+
135: ('[#5]~[Br]', 0),
|
| 162 |
+
136: ('[#6&!H0]', 0),
|
| 163 |
+
137: ('[#6]~[#6]', 0),
|
| 164 |
+
138: ('[#6]~[#7]', 0),
|
| 165 |
+
139: ('[#6]~[#8]', 0),
|
| 166 |
+
140: ('[#6]~[F]', 0),
|
| 167 |
+
141: ('[#6]~[Na]', 0),
|
| 168 |
+
142: ('[#6]~[Mg]', 0),
|
| 169 |
+
143: ('[#6]~[Al]', 0),
|
| 170 |
+
144: ('[#6]~[#14]', 0),
|
| 171 |
+
145: ('[#6]~[#15]', 0),
|
| 172 |
+
146: ('[#6]~[#16]', 0),
|
| 173 |
+
147: ('[#6]~[Cl]', 0),
|
| 174 |
+
148: ('[#6]~[#33]', 0),
|
| 175 |
+
149: ('[#6]~[#34]', 0),
|
| 176 |
+
150: ('[#6]~[Br]', 0),
|
| 177 |
+
151: ('[#6]~[I]', 0),
|
| 178 |
+
152: ('[#7&!H0]', 0),
|
| 179 |
+
153: ('[#7]~[#7]', 0),
|
| 180 |
+
154: ('[#7]~[#8]', 0),
|
| 181 |
+
155: ('[#7]~[F]', 0),
|
| 182 |
+
156: ('[#7]~[#14]', 0),
|
| 183 |
+
157: ('[#7]~[#15]', 0),
|
| 184 |
+
158: ('[#7]~[#16]', 0),
|
| 185 |
+
159: ('[#7]~[Cl]', 0),
|
| 186 |
+
160: ('[#7]~[Br]', 0),
|
| 187 |
+
161: ('[#8&!H0]', 0),
|
| 188 |
+
162: ('[#8]~[#8]', 0),
|
| 189 |
+
163: ('[#8]~[Mg]', 0),
|
| 190 |
+
164: ('[#8]~[Na]', 0),
|
| 191 |
+
165: ('[#8]~[Al]', 0),
|
| 192 |
+
166: ('[#8]~[#14]', 0),
|
| 193 |
+
167: ('[#8]~[#15]', 0),
|
| 194 |
+
168: ('[#8]~[K]', 0),
|
| 195 |
+
169: ('[F]~[#15]', 0),
|
| 196 |
+
170: ('[F]~[#16]', 0),
|
| 197 |
+
171: ('[Al&!H0]', 0),
|
| 198 |
+
172: ('[Al]~[Cl]', 0),
|
| 199 |
+
173: ('[#14&!H0]', 0),
|
| 200 |
+
174: ('[#14]~[#14]', 0),
|
| 201 |
+
175: ('[#14]~[Cl]', 0),
|
| 202 |
+
176: ('[#15&!H0]', 0),
|
| 203 |
+
177: ('[#15]~[#15]', 0),
|
| 204 |
+
178: ('[#33&!H0]', 0),
|
| 205 |
+
179: ('[#33]~[#33]', 0),
|
| 206 |
+
180: ('[#6](~Br)(~[#6])', 0),
|
| 207 |
+
181: ('[#6](~Br)(~[#6])(~[#6])', 0),
|
| 208 |
+
182: ('[#6&!H0]~[Br]', 0),
|
| 209 |
+
183: ('[#6](~[Br])(:[c])', 0),
|
| 210 |
+
184: ('[#6](~[Br])(:[n])', 0),
|
| 211 |
+
185: ('[#6](~[#6])(~[#6])', 0),
|
| 212 |
+
186: ('[#6](~[#6])(~[#6])(~[#6])', 0),
|
| 213 |
+
187: ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
|
| 214 |
+
188: ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
|
| 215 |
+
189: ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
|
| 216 |
+
190: ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
|
| 217 |
+
191: ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
|
| 218 |
+
192: ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
|
| 219 |
+
193: ('[#6](~[#6])(~[#6])(~[#7])', 0),
|
| 220 |
+
194: ('[#6](~[#6])(~[#6])(~[#8])', 0),
|
| 221 |
+
195: ('[#6](~[#6])(~[Cl])', 0),
|
| 222 |
+
196: ('[#6&!H0](~[#6])(~[Cl])', 0),
|
| 223 |
+
197: ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
|
| 224 |
+
198: ('[#6&!H0](~[#6])(~[#7])', 0),
|
| 225 |
+
199: ('[#6&!H0](~[#6])(~[#8])', 0),
|
| 226 |
+
200: ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
|
| 227 |
+
201: ('[#6&!H0](~[#6])(~[#15])', 0),
|
| 228 |
+
202: ('[#6&!H0](~[#6])(~[#16])', 0),
|
| 229 |
+
203: ('[#6](~[#6])(~[I])', 0),
|
| 230 |
+
204: ('[#6](~[#6])(~[#7])', 0),
|
| 231 |
+
205: ('[#6](~[#6])(~[#8])', 0),
|
| 232 |
+
206: ('[#6](~[#6])(~[#16])', 0),
|
| 233 |
+
207: ('[#6](~[#6])(~[#14])', 0),
|
| 234 |
+
208: ('[#6](~[#6])(:c)', 0),
|
| 235 |
+
209: ('[#6](~[#6])(:c)(:c)', 0),
|
| 236 |
+
210: ('[#6](~[#6])(:c)(:n)', 0),
|
| 237 |
+
211: ('[#6](~[#6])(:n)', 0),
|
| 238 |
+
212: ('[#6](~[#6])(:n)(:n)', 0),
|
| 239 |
+
213: ('[#6](~[Cl])(~[Cl])', 0),
|
| 240 |
+
214: ('[#6&!H0](~[Cl])', 0),
|
| 241 |
+
215: ('[#6](~[Cl])(:c)', 0),
|
| 242 |
+
216: ('[#6](~[F])(~[F])', 0),
|
| 243 |
+
217: ('[#6](~[F])(:c)', 0),
|
| 244 |
+
218: ('[#6&!H0](~[#7])', 0),
|
| 245 |
+
219: ('[#6&!H0](~[#8])', 0),
|
| 246 |
+
220: ('[#6&!H0](~[#8])(~[#8])', 0),
|
| 247 |
+
221: ('[#6&!H0](~[#16])', 0),
|
| 248 |
+
222: ('[#6&!H0](~[#14])', 0),
|
| 249 |
+
223: ('[#6&!H0]:c', 0),
|
| 250 |
+
224: ('[#6&!H0](:c)(:c)', 0),
|
| 251 |
+
225: ('[#6&!H0](:c)(:n)', 0),
|
| 252 |
+
226: ('[#6&!H0](:n)', 0),
|
| 253 |
+
227: ('[#6H3]', 0),
|
| 254 |
+
228: ('[#6](~[#7])(~[#7])', 0),
|
| 255 |
+
229: ('[#6](~[#7])(:c)', 0),
|
| 256 |
+
230: ('[#6](~[#7])(:c)(:c)', 0),
|
| 257 |
+
231: ('[#6](~[#7])(:c)(:n)', 0),
|
| 258 |
+
232: ('[#6](~[#7])(:n)', 0),
|
| 259 |
+
233: ('[#6](~[#8])(~[#8])', 0),
|
| 260 |
+
234: ('[#6](~[#8])(:c)', 0),
|
| 261 |
+
235: ('[#6](~[#8])(:c)(:c)', 0),
|
| 262 |
+
236: ('[#6](~[#16])(:c)', 0),
|
| 263 |
+
237: ('[#6](:c)(:c)', 0),
|
| 264 |
+
238: ('[#6](:c)(:c)(:c)', 0),
|
| 265 |
+
239: ('[#6](:c)(:c)(:n)', 0),
|
| 266 |
+
240: ('[#6](:c)(:n)', 0),
|
| 267 |
+
241: ('[#6](:c)(:n)(:n)', 0),
|
| 268 |
+
242: ('[#6](:n)(:n)', 0),
|
| 269 |
+
243: ('[#7](~[#6])(~[#6])', 0),
|
| 270 |
+
244: ('[#7](~[#6])(~[#6])(~[#6])', 0),
|
| 271 |
+
245: ('[#7&!H0](~[#6])(~[#6])', 0),
|
| 272 |
+
246: ('[#7&!H0](~[#6])', 0),
|
| 273 |
+
247: ('[#7&!H0](~[#6])(~[#7])', 0),
|
| 274 |
+
248: ('[#7](~[#6])(~[#8])', 0),
|
| 275 |
+
249: ('[#7](~[#6])(:c)', 0),
|
| 276 |
+
250: ('[#7](~[#6])(:c)(:c)', 0),
|
| 277 |
+
251: ('[#7&!H0](~[#7])', 0),
|
| 278 |
+
252: ('[#7&!H0](:c)', 0),
|
| 279 |
+
253: ('[#7&!H0](:c)(:c)', 0),
|
| 280 |
+
254: ('[#7](~[#8])(~[#8])', 0),
|
| 281 |
+
255: ('[#7](~[#8])(:o)', 0),
|
| 282 |
+
256: ('[#7](:c)(:c)', 0),
|
| 283 |
+
257: ('[#7](:c)(:c)(:c)', 0),
|
| 284 |
+
258: ('[#8](~[#6])(~[#6])', 0),
|
| 285 |
+
259: ('[#8&!H0](~[#6])', 0),
|
| 286 |
+
260: ('[#8](~[#6])(~[#15])', 0),
|
| 287 |
+
261: ('[#8&!H0](~[#16])', 0),
|
| 288 |
+
262: ('[#8](:c)(:c)', 0),
|
| 289 |
+
263: ('[#15](~[#6])(~[#6])', 0),
|
| 290 |
+
264: ('[#15](~[#8])(~[#8])', 0),
|
| 291 |
+
265: ('[#16](~[#6])(~[#6])', 0),
|
| 292 |
+
266: ('[#16&!H0](~[#6])', 0),
|
| 293 |
+
267: ('[#16](~[#6])(~[#8])', 0),
|
| 294 |
+
268: ('[#14](~[#6])(~[#6])', 0),
|
| 295 |
+
269: ('[#6]=,:[#6]', 0),
|
| 296 |
+
270: ('[#6]#[#6]', 0),
|
| 297 |
+
271: ('[#6]=,:[#7]', 0),
|
| 298 |
+
272: ('[#6]#[#7]', 0),
|
| 299 |
+
273: ('[#6]=,:[#8]', 0),
|
| 300 |
+
274: ('[#6]=,:[#16]', 0),
|
| 301 |
+
275: ('[#7]=,:[#7]', 0),
|
| 302 |
+
276: ('[#7]=,:[#8]', 0),
|
| 303 |
+
277: ('[#7]=,:[#15]', 0),
|
| 304 |
+
278: ('[#15]=,:[#8]', 0),
|
| 305 |
+
279: ('[#15]=,:[#15]', 0),
|
| 306 |
+
280: ('[#6](#[#6])(-,:[#6])', 0),
|
| 307 |
+
281: ('[#6&!H0](#[#6])', 0),
|
| 308 |
+
282: ('[#6](#[#7])(-,:[#6])', 0),
|
| 309 |
+
283: ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
|
| 310 |
+
284: ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
|
| 311 |
+
285: ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
|
| 312 |
+
286: ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
|
| 313 |
+
287: ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
|
| 314 |
+
288: ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
|
| 315 |
+
289: ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
|
| 316 |
+
290: ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
|
| 317 |
+
291: ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
|
| 318 |
+
292: ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
|
| 319 |
+
293: ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
|
| 320 |
+
294: ('[#6](-,:[#6])(=,:[#6])', 0),
|
| 321 |
+
295: ('[#6](-,:[#6])(=,:[#7])', 0),
|
| 322 |
+
296: ('[#6](-,:[#6])(=,:[#8])', 0),
|
| 323 |
+
297: ('[#6]([Cl])(=,:[#8])', 0),
|
| 324 |
+
298: ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
|
| 325 |
+
299: ('[#6&!H0](=,:[#6])', 0),
|
| 326 |
+
300: ('[#6&!H0](=,:[#7])', 0),
|
| 327 |
+
301: ('[#6&!H0](=,:[#8])', 0),
|
| 328 |
+
302: ('[#6](-,:[#7])(=,:[#6])', 0),
|
| 329 |
+
303: ('[#6](-,:[#7])(=,:[#7])', 0),
|
| 330 |
+
304: ('[#6](-,:[#7])(=,:[#8])', 0),
|
| 331 |
+
305: ('[#6](-,:[#8])(=,:[#8])', 0),
|
| 332 |
+
306: ('[#7](-,:[#6])(=,:[#6])', 0),
|
| 333 |
+
307: ('[#7](-,:[#6])(=,:[#8])', 0),
|
| 334 |
+
308: ('[#7](-,:[#8])(=,:[#8])', 0),
|
| 335 |
+
309: ('[#15](-,:[#8])(=,:[#8])', 0),
|
| 336 |
+
310: ('[#16](-,:[#6])(=,:[#8])', 0),
|
| 337 |
+
311: ('[#16](-,:[#8])(=,:[#8])', 0),
|
| 338 |
+
312: ('[#16](=,:[#8])(=,:[#8])', 0),
|
| 339 |
+
313: ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
|
| 340 |
+
314: ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
|
| 341 |
+
315: ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 342 |
+
316: ('[#7]:[#6]-,:[#16&!H0]', 0),
|
| 343 |
+
317: ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 344 |
+
318: ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
|
| 345 |
+
319: ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
|
| 346 |
+
320: ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
| 347 |
+
321: ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
|
| 348 |
+
322: ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
|
| 349 |
+
323: ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
|
| 350 |
+
324: ('[#16]:[#6]:[#6]:[#6]', 0),
|
| 351 |
+
325: ('[#6]:[#7]:[#6]-,:[#6]', 0),
|
| 352 |
+
326: ('[#16]-,:[#6]:[#7]:[#6]', 0),
|
| 353 |
+
327: ('[#16]:[#6]:[#6]:[#7]', 0),
|
| 354 |
+
328: ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
|
| 355 |
+
329: ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
|
| 356 |
+
330: ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
|
| 357 |
+
331: ('[#16]-,:[#6]=,:[#7&!H0]', 0),
|
| 358 |
+
332: ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
|
| 359 |
+
333: ('[#6]:[#16]:[#6]-,:[#6]', 0),
|
| 360 |
+
334: ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
|
| 361 |
+
335: ('[#6]:[#7]-,:[#6]:[#6]', 0),
|
| 362 |
+
336: ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
|
| 363 |
+
337: ('[#7]-,:[#6]:[#7]:[#6]', 0),
|
| 364 |
+
338: ('[#7]:[#6]:[#6]:[#7]', 0),
|
| 365 |
+
339: ('[#7]-,:[#6]:[#7]:[#7]', 0),
|
| 366 |
+
340: ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
|
| 367 |
+
341: ('[#7]-,:[#6]=,:[#7&!H0]', 0),
|
| 368 |
+
342: ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
|
| 369 |
+
343: ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 370 |
+
344: ('[#6]-,:[#7]:[#6&!H0]', 0),
|
| 371 |
+
345: ('[#7]-,:[#6]:[#8]:[#6]', 0),
|
| 372 |
+
346: ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
|
| 373 |
+
347: ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
|
| 374 |
+
348: ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
|
| 375 |
+
349: ('[#7]:[#7]-,:[#6&!H0]', 0),
|
| 376 |
+
350: ('[#8]-,:[#6]:[#6]:[#7]', 0),
|
| 377 |
+
351: ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 378 |
+
352: ('[#7]-,:[#6]:[#6]:[#7]', 0),
|
| 379 |
+
353: ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
|
| 380 |
+
354: ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
|
| 381 |
+
355: ('[#7]-,:[#6]=,:[#6&!H0]', 0),
|
| 382 |
+
356: ('[Cl]-,:[#6]:[#6&!H0]', 0),
|
| 383 |
+
357: ('[#7]:[#6]:[#7]-,:[#6]', 0),
|
| 384 |
+
358: ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
|
| 385 |
+
359: ('[#6]-,:[#6]:[#7]:[#6]', 0),
|
| 386 |
+
360: ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
|
| 387 |
+
361: ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
|
| 388 |
+
362: ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
|
| 389 |
+
363: ('[#7&!H0]-,:[#7&!H0]', 0),
|
| 390 |
+
364: ('[#16]=,:[#6]-,:[#7&!H0]', 0),
|
| 391 |
+
365: ('[#6]-,:[#33]-[#8&!H0]', 0),
|
| 392 |
+
366: ('[#16]:[#6]:[#6&!H0]', 0),
|
| 393 |
+
367: ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 394 |
+
368: ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 395 |
+
369: ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
|
| 396 |
+
370: ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
|
| 397 |
+
371: ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
|
| 398 |
+
372: ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
|
| 399 |
+
373: ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
|
| 400 |
+
374: ('[#6]:[#7]-,:[#6&!H0]', 0),
|
| 401 |
+
375: ('[#6]-,:[#7]-,:[#7&!H0]', 0),
|
| 402 |
+
376: ('[#7]:[#6]:[#6]-,:[#6]', 0),
|
| 403 |
+
377: ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 404 |
+
378: ('[#33]-,:[#6]:[#6&!H0]', 0),
|
| 405 |
+
379: ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
|
| 406 |
+
380: ('[#6]:[#6]:[#7&!H0]', 0),
|
| 407 |
+
381: ('[#7&!H0]-,:[#6&!H0]', 0),
|
| 408 |
+
382: ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
|
| 409 |
+
383: ('[#7]:[#6]-,:[#6]:[#6]', 0),
|
| 410 |
+
384: ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
|
| 411 |
+
385: ('[#16]-,:[#6]:[#6&!H0]', 0),
|
| 412 |
+
386: ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
|
| 413 |
+
387: ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
|
| 414 |
+
388: ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 415 |
+
389: ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
|
| 416 |
+
390: ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
|
| 417 |
+
391: ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 418 |
+
392: ('[#7]=,:[#6]-,:[#6&!H0]', 0),
|
| 419 |
+
393: ('[#6]-,:[#7]-,:[#6&!H0]', 0),
|
| 420 |
+
394: ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
| 421 |
+
395: ('[#8]-,:[#6]:[#6&!H0]', 0),
|
| 422 |
+
396: ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
|
| 423 |
+
397: ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
|
| 424 |
+
398: ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
| 425 |
+
399: ('[#7]-,:[#6]:[#6&!H0]', 0),
|
| 426 |
+
400: ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
| 427 |
+
401: ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
|
| 428 |
+
402: ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
|
| 429 |
+
403: ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 430 |
+
404: ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 431 |
+
405: ('[#6]:[#6]-,:[#6]:[#6]', 0),
|
| 432 |
+
406: ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 433 |
+
407: ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 434 |
+
408: ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 435 |
+
409: ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 436 |
+
410: ('[#7]:[#6]-,:[#8&!H0]', 0),
|
| 437 |
+
411: ('[#8]=,:[#7]-,:c:c', 0),
|
| 438 |
+
412: ('[#8]-,:[#6]-,:[#7&!H0]', 0),
|
| 439 |
+
413: ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 440 |
+
414: ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 441 |
+
415: ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 442 |
+
416: ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 443 |
+
417: ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 444 |
+
418: ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 445 |
+
419: ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 446 |
+
420: ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 447 |
+
421: ('N#[#6]-,:[#6]-,:[#6]', 0),
|
| 448 |
+
422: ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 449 |
+
423: ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 450 |
+
424: ('[#6&!H0]-,:[#8&!H0]', 0),
|
| 451 |
+
425: ('n:c:n:c', 0),
|
| 452 |
+
426: ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 453 |
+
427: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 454 |
+
428: ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
| 455 |
+
429: ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
|
| 456 |
+
430: ('c:c-,:[#7]-,:c:c', 0),
|
| 457 |
+
431: ('[#6]-,:[#6]:[#6]-,:c:c', 0),
|
| 458 |
+
432: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 459 |
+
433: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 460 |
+
434: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 461 |
+
435: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 462 |
+
436: ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 463 |
+
437: ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 464 |
+
438: ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
|
| 465 |
+
439: ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 466 |
+
440: ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
| 467 |
+
441: ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 468 |
+
442: ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 469 |
+
443: ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
| 470 |
+
444: ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 471 |
+
445: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 472 |
+
446: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 473 |
+
447: ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 474 |
+
448: ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 475 |
+
449: ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 476 |
+
450: ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
|
| 477 |
+
451: ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
| 478 |
+
452: ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
|
| 479 |
+
453: ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
| 480 |
+
454: ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
| 481 |
+
455: ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 482 |
+
456: ('[#6]-,:c:c:[#6]-,:[#6]', 0),
|
| 483 |
+
457: ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
|
| 484 |
+
458: ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 485 |
+
459: ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 486 |
+
460: ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 487 |
+
461: ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 488 |
+
462: ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 489 |
+
463: ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 490 |
+
464: ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 491 |
+
465: ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 492 |
+
466: ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 493 |
+
467: ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 494 |
+
468: ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 495 |
+
469: ('c:c:n:n:c', 0),
|
| 496 |
+
470: ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
| 497 |
+
471: ('c:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 498 |
+
472: ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 499 |
+
473: ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 500 |
+
474: ('[#7]-,:[#6]:c:c:n', 0),
|
| 501 |
+
475: ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
|
| 502 |
+
476: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 503 |
+
477: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
|
| 504 |
+
478: ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
| 505 |
+
479: ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
| 506 |
+
480: ('[#8]=,:[#33]-,:[#6]:c:c', 0),
|
| 507 |
+
481: ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
|
| 508 |
+
482: ('[#16]-,:[#6]:c:c-,:[#7]', 0),
|
| 509 |
+
483: ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 510 |
+
484: ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
| 511 |
+
485: ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
|
| 512 |
+
486: ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 513 |
+
487: ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 514 |
+
488: ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
|
| 515 |
+
489: ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 516 |
+
490: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 517 |
+
491: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 518 |
+
492: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 519 |
+
493: ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 520 |
+
494: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 521 |
+
495: ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 522 |
+
496: ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
|
| 523 |
+
497: ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
| 524 |
+
498: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 525 |
+
499: ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
|
| 526 |
+
500: ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
|
| 527 |
+
501: ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
| 528 |
+
502: ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
|
| 529 |
+
503: ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
|
| 530 |
+
504: ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
| 531 |
+
505: ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
| 532 |
+
506: ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
|
| 533 |
+
507: ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 534 |
+
508: ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 535 |
+
509: ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 536 |
+
510: ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
| 537 |
+
511: ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
|
| 538 |
+
512: ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 539 |
+
513: ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 540 |
+
514: ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 541 |
+
515: ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 542 |
+
516: ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
| 543 |
+
517: ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 544 |
+
518: ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 545 |
+
519: ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 546 |
+
520: ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
| 547 |
+
521: ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 548 |
+
522: ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
|
| 549 |
+
523: ('[Br]-,:[#6]:c:c-,:[#6]', 0),
|
| 550 |
+
524: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 551 |
+
525: ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
|
| 552 |
+
526: ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
|
| 553 |
+
527: ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
|
| 554 |
+
528: ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 555 |
+
529: ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 556 |
+
530: ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
|
| 557 |
+
531: ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 558 |
+
532: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 559 |
+
533: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 560 |
+
534: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 561 |
+
535: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 562 |
+
536: ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 563 |
+
537: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 564 |
+
538: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 565 |
+
539: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 566 |
+
540: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 567 |
+
541: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 568 |
+
542: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 569 |
+
543: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 570 |
+
544: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 571 |
+
545: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 572 |
+
546: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 573 |
+
547: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 574 |
+
548: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 575 |
+
549: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 576 |
+
550: ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
| 577 |
+
551: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 578 |
+
552: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
| 579 |
+
553: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 580 |
+
554: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
| 581 |
+
555: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 582 |
+
556: ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
| 583 |
+
557: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 584 |
+
558: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
| 585 |
+
559: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
|
| 586 |
+
560: ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
| 587 |
+
561: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
| 588 |
+
562: ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 589 |
+
563: ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
| 590 |
+
564: ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
|
| 591 |
+
565: ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
|
| 592 |
+
566: ('[#6]c1ccc([#6])cc1', 0),
|
| 593 |
+
567: ('[#6]c1ccc([#8])cc1', 0),
|
| 594 |
+
568: ('[#6]c1ccc([#16])cc1', 0),
|
| 595 |
+
569: ('[#6]c1ccc([#7])cc1', 0),
|
| 596 |
+
570: ('[#6]c1ccc(Cl)cc1', 0),
|
| 597 |
+
571: ('[#6]c1ccc(Br)cc1', 0),
|
| 598 |
+
572: ('[#8]c1ccc([#8])cc1', 0),
|
| 599 |
+
573: ('[#8]c1ccc([#16])cc1', 0),
|
| 600 |
+
574: ('[#8]c1ccc([#7])cc1', 0),
|
| 601 |
+
575: ('[#8]c1ccc(Cl)cc1', 0),
|
| 602 |
+
576: ('[#8]c1ccc(Br)cc1', 0),
|
| 603 |
+
577: ('[#16]c1ccc([#16])cc1', 0),
|
| 604 |
+
578: ('[#16]c1ccc([#7])cc1', 0),
|
| 605 |
+
579: ('[#16]c1ccc(Cl)cc1', 0),
|
| 606 |
+
580: ('[#16]c1ccc(Br)cc1', 0),
|
| 607 |
+
581: ('[#7]c1ccc([#7])cc1', 0),
|
| 608 |
+
582: ('[#7]c1ccc(Cl)cc1', 0),
|
| 609 |
+
583: ('[#7]c1ccc(Br)cc1', 0),
|
| 610 |
+
584: ('Clc1ccc(Cl)cc1', 0),
|
| 611 |
+
585: ('Clc1ccc(Br)cc1', 0),
|
| 612 |
+
586: ('Brc1ccc(Br)cc1', 0),
|
| 613 |
+
587: ('[#6]c1cc([#6])ccc1', 0),
|
| 614 |
+
588: ('[#6]c1cc([#8])ccc1', 0),
|
| 615 |
+
589: ('[#6]c1cc([#16])ccc1', 0),
|
| 616 |
+
590: ('[#6]c1cc([#7])ccc1', 0),
|
| 617 |
+
591: ('[#6]c1cc(Cl)ccc1', 0),
|
| 618 |
+
592: ('[#6]c1cc(Br)ccc1', 0),
|
| 619 |
+
593: ('[#8]c1cc([#8])ccc1', 0),
|
| 620 |
+
594: ('[#8]c1cc([#16])ccc1', 0),
|
| 621 |
+
595: ('[#8]c1cc([#7])ccc1', 0),
|
| 622 |
+
596: ('[#8]c1cc(Cl)ccc1', 0),
|
| 623 |
+
597: ('[#8]c1cc(Br)ccc1', 0),
|
| 624 |
+
598: ('[#16]c1cc([#16])ccc1', 0),
|
| 625 |
+
599: ('[#16]c1cc([#7])ccc1', 0),
|
| 626 |
+
600: ('[#16]c1cc(Cl)ccc1', 0),
|
| 627 |
+
601: ('[#16]c1cc(Br)ccc1', 0),
|
| 628 |
+
602: ('[#7]c1cc([#7])ccc1', 0),
|
| 629 |
+
603: ('[#7]c1cc(Cl)ccc1', 0),
|
| 630 |
+
604: ('[#7]c1cc(Br)ccc1', 0),
|
| 631 |
+
605: ('Clc1cc(Cl)ccc1', 0),
|
| 632 |
+
606: ('Clc1cc(Br)ccc1', 0),
|
| 633 |
+
607: ('Brc1cc(Br)ccc1', 0),
|
| 634 |
+
608: ('[#6]c1c([#6])cccc1', 0),
|
| 635 |
+
609: ('[#6]c1c([#8])cccc1', 0),
|
| 636 |
+
610: ('[#6]c1c([#16])cccc1', 0),
|
| 637 |
+
611: ('[#6]c1c([#7])cccc1', 0),
|
| 638 |
+
612: ('[#6]c1c(Cl)cccc1', 0),
|
| 639 |
+
613: ('[#6]c1c(Br)cccc1', 0),
|
| 640 |
+
614: ('[#8]c1c([#8])cccc1', 0),
|
| 641 |
+
615: ('[#8]c1c([#16])cccc1', 0),
|
| 642 |
+
616: ('[#8]c1c([#7])cccc1', 0),
|
| 643 |
+
617: ('[#8]c1c(Cl)cccc1', 0),
|
| 644 |
+
618: ('[#8]c1c(Br)cccc1', 0),
|
| 645 |
+
619: ('[#16]c1c([#16])cccc1', 0),
|
| 646 |
+
620: ('[#16]c1c([#7])cccc1', 0),
|
| 647 |
+
621: ('[#16]c1c(Cl)cccc1', 0),
|
| 648 |
+
622: ('[#16]c1c(Br)cccc1', 0),
|
| 649 |
+
623: ('[#7]c1c([#7])cccc1', 0),
|
| 650 |
+
624: ('[#7]c1c(Cl)cccc1', 0),
|
| 651 |
+
625: ('[#7]c1c(Br)cccc1', 0),
|
| 652 |
+
626: ('Clc1c(Cl)cccc1', 0),
|
| 653 |
+
627: ('Clc1c(Br)cccc1', 0),
|
| 654 |
+
628: ('Brc1c(Br)cccc1', 0),
|
| 655 |
+
629: ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
|
| 656 |
+
630: ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
| 657 |
+
631: ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
| 658 |
+
632: ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 659 |
+
633: ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 660 |
+
634: ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 661 |
+
635: ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
| 662 |
+
636: ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
| 663 |
+
637: ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 664 |
+
638: ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 665 |
+
639: ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 666 |
+
640: ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
| 667 |
+
641: ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 668 |
+
642: ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 669 |
+
643: ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 670 |
+
644: ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 671 |
+
645: ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 672 |
+
646: ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 673 |
+
647: ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 674 |
+
648: ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 675 |
+
649: ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 676 |
+
650: ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
|
| 677 |
+
651: ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
| 678 |
+
652: ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
| 679 |
+
653: ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 680 |
+
654: ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 681 |
+
655: ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 682 |
+
656: ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
| 683 |
+
657: ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
| 684 |
+
658: ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 685 |
+
659: ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 686 |
+
660: ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 687 |
+
661: ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
| 688 |
+
662: ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 689 |
+
663: ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 690 |
+
664: ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 691 |
+
665: ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 692 |
+
666: ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 693 |
+
667: ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 694 |
+
668: ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 695 |
+
669: ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 696 |
+
670: ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 697 |
+
671: ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
|
| 698 |
+
672: ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
| 699 |
+
673: ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
| 700 |
+
674: ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 701 |
+
675: ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 702 |
+
676: ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 703 |
+
677: ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
| 704 |
+
678: ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
| 705 |
+
679: ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 706 |
+
680: ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 707 |
+
681: ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 708 |
+
682: ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
| 709 |
+
683: ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 710 |
+
684: ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 711 |
+
685: ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 712 |
+
686: ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 713 |
+
687: ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 714 |
+
688: ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 715 |
+
689: ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 716 |
+
690: ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 717 |
+
691: ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 718 |
+
692: ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
|
| 719 |
+
693: ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
| 720 |
+
694: ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
| 721 |
+
695: ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 722 |
+
696: ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 723 |
+
697: ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 724 |
+
698: ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
| 725 |
+
699: ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
| 726 |
+
700: ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 727 |
+
701: ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 728 |
+
702: ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 729 |
+
703: ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
| 730 |
+
704: ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 731 |
+
705: ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 732 |
+
706: ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 733 |
+
707: ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 734 |
+
708: ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 735 |
+
709: ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 736 |
+
710: ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 737 |
+
711: ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 738 |
+
712: ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 739 |
+
713: ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
|
| 740 |
+
714: ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
| 741 |
+
715: ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
| 742 |
+
716: ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 743 |
+
717: ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 744 |
+
718: ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 745 |
+
719: ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
| 746 |
+
720: ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
| 747 |
+
721: ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 748 |
+
722: ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 749 |
+
723: ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 750 |
+
724: ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
| 751 |
+
725: ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 752 |
+
726: ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 753 |
+
727: ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 754 |
+
728: ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 755 |
+
729: ('[#7][#6]1[#6](Cl)[#6][#6]1', 0),
|
| 756 |
+
730: ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 757 |
+
731: ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 758 |
+
732: ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 759 |
+
733: ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}
|
| 760 |
+
|
| 761 |
+
# Module-level cache of (compiled SMARTS query, count) pairs. It stays None
# until calcPubChemFingerPart1 builds it on first use via InitKeys.
PubchemKeys = None
|
| 762 |
+
|
| 763 |
+
|
| 764 |
+
def InitKeys(keyList, keyDict):
    """ *Internal Use Only*

    Compile the SMARTS strings of ``keyDict`` into query molecules and store
    them in ``keyList`` in place; intended to run once.

    **Arguments**

      - keyList: pre-sized list; the entry for key ``k`` is written at
        index ``k - 1`` as a ``(query, count)`` tuple

      - keyDict: mapping of 1-based key number -> ``(SMARTS string, count)``

    Entries whose pattern is ``'?'`` are skipped, and patterns that fail to
    parse are reported on stdout; in both cases the matching ``keyList`` slot
    keeps whatever value it already had.
    """
    assert len(keyList) == len(keyDict), 'length mismatch'
    for key, (patt, count) in keyDict.items():
        if patt == '?':
            # Placeholder pattern: nothing to compile for this key.
            continue
        query = Chem.MolFromSmarts(patt)
        if query:
            keyList[key - 1] = query, count
        else:
            print('SMARTS parser error for key #%d: %s' % (key, patt))
|
| 777 |
+
|
| 778 |
+
|
| 779 |
+
def calcPubChemFingerPart1(mol, **kwargs):
    """ Calculate PubChem Fingerprints (1-115; 263-881)

    **Arguments**

      - mol: the molecule to be fingerprinted

      - ctor (keyword, optional): callable used to build the result vector;
        it is called with the vector length. Defaults to
        ``DataStructs.SparseBitVect``. Other keyword arguments are ignored.

    **Returns**

      the vector built by ``ctor`` with ``len(PubchemKeys) + 1`` positions.
      The key stored at list index ``i`` is reported at position ``i + 1``;
      position 0 is never written here.

    For a key whose count is 0 the position receives the result of
    ``mol.HasSubstructMatch``; otherwise it is set to 1 only when the number
    of substructure matches is strictly greater than the count.

    NOTE(review): the earlier docstring carried doctest examples that called
    ``PubChemFingerPart1`` (a name this function does not have) and whose
    expected bit tuples could not be confirmed; they were left out pending
    verification.
    """
    global PubchemKeys
    if PubchemKeys is None:
        # First call: allocate the cache, then compile every SMARTS pattern.
        PubchemKeys = [(None, 0)] * len(smartsPatts.keys())
        InitKeys(PubchemKeys, smartsPatts)
    make_vect = kwargs.get('ctor', DataStructs.SparseBitVect)
    fingerprint = make_vect(len(PubchemKeys) + 1)
    for position, (query, threshold) in enumerate(PubchemKeys, start=1):
        if query is None:
            # Key without a compiled pattern ('?' or a parse failure).
            continue
        if threshold == 0:
            fingerprint[position] = mol.HasSubstructMatch(query)
        elif len(mol.GetSubstructMatches(query)) > threshold:
            fingerprint[position] = 1
    return fingerprint
|
| 809 |
+
|
| 810 |
+
|
| 811 |
+
def func_1(mol, bits):
    """ *Internal Use Only*

    Ring-count bits over every ring reported by
    ``mol.GetRingInfo().AtomRings()`` (the original docstring cites this as
    part of PubChem fingerprint bits 116-263).

    **Arguments**

      - mol: molecule exposing ``GetRingInfo().AtomRings()``

      - bits: index-assignable sequence, modified in place

    **Returns**

      ``(ringSize, bits)``: the list of all ring sizes in the order reported,
      and the same ``bits`` object that was passed in.

    Rings of size 3 to 10 are tallied per size. For each size, the bit for
    "at least m rings" is set for every m up to the tally, capped at the
    number of thresholds available for that size.
    """
    # ring size -> (index of the ">= 1 ring" bit, number of thresholds).
    # Successive thresholds of one ring size sit 7 positions apart.
    layout = {
        3: (0, 2),
        4: (14, 2),
        5: (28, 5),
        6: (63, 5),
        7: (98, 2),
        8: (112, 2),
        9: (126, 1),
        10: (133, 1),
    }
    ring_sizes = [len(ring) for ring in mol.GetRingInfo().AtomRings()]

    tally = dict.fromkeys(layout, 0)
    for size in ring_sizes:
        if size in tally:
            tally[size] += 1

    for size, (first_bit, levels) in layout.items():
        for level in range(min(tally[size], levels)):
            bits[first_bit + 7 * level] = 1

    return ring_sizes, bits
|
| 905 |
+
|
| 906 |
+
|
| 907 |
+
def func_2(mol, bits):
    """ *Internal Use Only*

    saturated or aromatic carbon-only ring

    **Arguments**

      - mol: molecule exposing ``GetRingInfo().BondRings()`` and
        ``GetBondWithIdx``

      - bits: index-assignable sequence, modified in place

    **Returns**

      ``(ringSize, bits)``: the sizes of the rings that qualified (in the
      order they were seen), and the same ``bits`` object that was passed in.

    A ring qualifies when every ring bond is of type ``SINGLE``, or when
    every ring bond is ``AROMATIC`` and both end atoms of every ring bond are
    carbon. The two tests are applied independently to each ring.

    NOTE(review): the saturated test does not look at the atoms, so saturated
    rings containing heteroatoms are counted as well -- confirm whether that
    is intended before changing it, since it alters the fingerprint.
    """
    # ring size -> (index of the ">= 1 ring" bit, number of thresholds).
    # Successive thresholds of one ring size sit 7 positions apart.
    layout = {
        3: (1, 2),
        4: (15, 2),
        5: (29, 5),
        6: (64, 5),
        7: (99, 2),
        8: (113, 2),
        9: (127, 1),
        10: (134, 1),
    }
    tally = dict.fromkeys(layout, 0)
    ring_sizes = []

    def record(size):
        # Remember the ring and bump the tally when the size is tracked.
        ring_sizes.append(size)
        if size in tally:
            tally[size] += 1

    for ring in mol.GetRingInfo().BondRings():
        size = len(ring)

        # Saturated: no ring bond is anything other than SINGLE.
        saturated = all(
            mol.GetBondWithIdx(idx).GetBondType().name == 'SINGLE'
            for idx in ring
        )
        if saturated:
            record(size)

        # Aromatic carbon-only: all bonds AROMATIC and all bond ends carbon.
        aromatic = all(
            mol.GetBondWithIdx(idx).GetBondType().name == 'AROMATIC'
            for idx in ring
        )
        carbon_only = all(
            mol.GetBondWithIdx(idx).GetBeginAtom().GetAtomicNum() == 6
            and mol.GetBondWithIdx(idx).GetEndAtom().GetAtomicNum() == 6
            for idx in ring
        )
        if aromatic and carbon_only:
            record(size)

    for size, (first_bit, levels) in layout.items():
        for level in range(min(tally[size], levels)):
            bits[first_bit + 7 * level] = 1

    return ring_sizes, bits
|
| 1025 |
+
|
| 1026 |
+
|
| 1027 |
+
def func_3(mol, bits):
    """ *Internal Use Only*
    saturated or aromatic nitrogen-containing

    Walks every ring returned by ``mol.GetRingInfo().BondRings()`` and
    counts, per ring size (3 to 10 bonds), the rings that are either fully
    saturated (every ring bond is SINGLE) or fully AROMATIC with a nitrogen
    (atomic number 7) at either end of at least one ring bond. The counts
    are written into ``bits`` as cumulative "count >= k" flags.

    NOTE(review): the saturated branch does not require a nitrogen atom,
    although the section title suggests it might; kept as-is so the
    fingerprint stays unchanged -- confirm against the PubChem specification.

    :param mol: molecule exposing the ring/bond accessors used below
        (presumably an RDKit Mol -- confirm against callers).
    :param bits: mutable flag sequence, updated in place (indices 2..135).
    :return: ``(ringSize, bits)`` -- sizes of the qualifying rings in ring
        order, and the same ``bits`` object that was passed in.
    """
    # (ring size, first bit index, number of "count >= k" bits). Successive
    # bits belonging to one ring size are 7 positions apart.
    layout = ((3, 2, 2), (4, 16, 2), (5, 30, 5), (6, 65, 5),
              (7, 100, 2), (8, 114, 2), (9, 128, 1), (10, 135, 1))
    ringSize = []
    counts = dict.fromkeys(range(3, 11), 0)
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        saturated = all(kind == 'SINGLE' for kind in kinds)
        aromatic = all(kind == 'AROMATIC' for kind in kinds)
        has_nitrogen = any(
            bond.GetBeginAtom().GetAtomicNum() == 7
            or bond.GetEndAtom().GetAtomicNum() == 7
            for bond in bonds
        )
        # A non-empty ring cannot be all-SINGLE and all-AROMATIC at the same
        # time, so each ring is recorded at most once.
        if saturated or (aromatic and has_nitrogen):
            size = len(ring)
            ringSize.append(size)
            if size in counts:
                counts[size] += 1
    for size, first_bit, max_count in layout:
        for step in range(min(counts[size], max_count)):
            bits[first_bit + 7 * step] = 1
    return ringSize, bits
|
| 1145 |
+
|
| 1146 |
+
|
| 1147 |
+
def func_4(mol, bits):
    """ *Internal Use Only*
    saturated or aromatic heteroatom-containing

    Walks every ring returned by ``mol.GetRingInfo().BondRings()`` and
    counts, per ring size (3 to 10 bonds), the rings that are either fully
    saturated (every ring bond is SINGLE) or fully AROMATIC with a
    heteroatom (atomic number other than 1 or 6) at either end of at least
    one ring bond. The counts are written into ``bits`` as cumulative
    "count >= k" flags.

    NOTE(review): the saturated branch does not require a heteroatom,
    although the section title suggests it might; kept as-is so the
    fingerprint stays unchanged -- confirm against the PubChem specification.

    :param mol: molecule exposing the ring/bond accessors used below
        (presumably an RDKit Mol -- confirm against callers).
    :param bits: mutable flag sequence, updated in place (indices 3..136).
    :return: ``(ringSize, bits)`` -- sizes of the qualifying rings in ring
        order, and the same ``bits`` object that was passed in.
    """
    # (ring size, first bit index, number of "count >= k" bits). Successive
    # bits belonging to one ring size are 7 positions apart.
    layout = ((3, 3, 2), (4, 17, 2), (5, 31, 5), (6, 66, 5),
              (7, 101, 2), (8, 115, 2), (9, 129, 1), (10, 136, 1))
    ringSize = []
    counts = dict.fromkeys(range(3, 11), 0)
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        saturated = all(kind == 'SINGLE' for kind in kinds)
        aromatic = all(kind == 'AROMATIC' for kind in kinds)
        has_heteroatom = any(
            bond.GetBeginAtom().GetAtomicNum() not in [1, 6]
            or bond.GetEndAtom().GetAtomicNum() not in [1, 6]
            for bond in bonds
        )
        # A non-empty ring cannot be all-SINGLE and all-AROMATIC at the same
        # time, so each ring is recorded at most once.
        if saturated or (aromatic and has_heteroatom):
            size = len(ring)
            ringSize.append(size)
            if size in counts:
                counts[size] += 1
    for size, first_bit, max_count in layout:
        for step in range(min(counts[size], max_count)):
            bits[first_bit + 7 * step] = 1
    return ringSize, bits
|
| 1265 |
+
|
| 1266 |
+
|
| 1267 |
+
def func_5(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic carbon-only

    Walks every ring returned by ``mol.GetRingInfo().BondRings()`` and
    counts, per ring size (3 to 10 bonds), the rings that have at least one
    non-SINGLE bond, no AROMATIC bond, and only carbon (atomic number 6) at
    both ends of every ring bond. The counts are written into ``bits`` as
    cumulative "count >= k" flags.

    :param mol: molecule exposing the ring/bond accessors used below
        (presumably an RDKit Mol -- confirm against callers).
    :param bits: mutable flag sequence, updated in place (indices 4..137).
    :return: ``(ringSize, bits)`` -- sizes of the qualifying rings in ring
        order, and the same ``bits`` object that was passed in.
    """
    # (ring size, first bit index, number of "count >= k" bits). Successive
    # bits belonging to one ring size are 7 positions apart.
    layout = ((3, 4, 2), (4, 18, 2), (5, 32, 5), (6, 67, 5),
              (7, 102, 2), (8, 116, 2), (9, 130, 1), (10, 137, 1))
    ringSize = []
    counts = dict.fromkeys(range(3, 11), 0)
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        unsaturated = any(kind != 'SINGLE' for kind in kinds)
        nonaromatic = all(kind != 'AROMATIC' for kind in kinds)
        all_carbon = all(
            bond.GetBeginAtom().GetAtomicNum() == 6
            and bond.GetEndAtom().GetAtomicNum() == 6
            for bond in bonds
        )
        if unsaturated and nonaromatic and all_carbon:
            size = len(ring)
            ringSize.append(size)
            if size in counts:
                counts[size] += 1
    for size, first_bit, max_count in layout:
        for step in range(min(counts[size], max_count)):
            bits[first_bit + 7 * step] = 1
    return ringSize, bits
|
| 1381 |
+
|
| 1382 |
+
|
| 1383 |
+
def func_6(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic nitrogen-containing

    Walks every ring returned by ``mol.GetRingInfo().BondRings()`` and
    counts, per ring size (3 to 10 bonds), the rings that have at least one
    non-SINGLE bond, no AROMATIC bond, and a nitrogen (atomic number 7) at
    either end of at least one ring bond. The counts are written into
    ``bits`` as cumulative "count >= k" flags.

    :param mol: molecule exposing the ring/bond accessors used below
        (presumably an RDKit Mol -- confirm against callers).
    :param bits: mutable flag sequence, updated in place (indices 5..138).
    :return: ``(ringSize, bits)`` -- sizes of the qualifying rings in ring
        order, and the same ``bits`` object that was passed in.
    """
    # (ring size, first bit index, number of "count >= k" bits). Successive
    # bits belonging to one ring size are 7 positions apart.
    layout = ((3, 5, 2), (4, 19, 2), (5, 33, 5), (6, 68, 5),
              (7, 103, 2), (8, 117, 2), (9, 131, 1), (10, 138, 1))
    ringSize = []
    counts = dict.fromkeys(range(3, 11), 0)
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        unsaturated = any(kind != 'SINGLE' for kind in kinds)
        nonaromatic = all(kind != 'AROMATIC' for kind in kinds)
        has_nitrogen = any(
            bond.GetBeginAtom().GetAtomicNum() == 7
            or bond.GetEndAtom().GetAtomicNum() == 7
            for bond in bonds
        )
        if unsaturated and nonaromatic and has_nitrogen:
            size = len(ring)
            ringSize.append(size)
            if size in counts:
                counts[size] += 1
    for size, first_bit, max_count in layout:
        for step in range(min(counts[size], max_count)):
            bits[first_bit + 7 * step] = 1
    return ringSize, bits
|
| 1497 |
+
|
| 1498 |
+
|
| 1499 |
+
def func_7(mol, bits):
    """ *Internal Use Only*
    unsaturated non-aromatic heteroatom-containing

    Walks every ring returned by ``mol.GetRingInfo().BondRings()`` and
    counts, per ring size (3 to 10 bonds), the rings that have at least one
    non-SINGLE bond, no AROMATIC bond, and a heteroatom (atomic number other
    than 1 or 6) at either end of at least one ring bond. The counts are
    written into ``bits`` as cumulative "count >= k" flags.

    :param mol: molecule exposing the ring/bond accessors used below
        (presumably an RDKit Mol -- confirm against callers).
    :param bits: mutable flag sequence, updated in place (indices 6..139).
    :return: ``(ringSize, bits)`` -- sizes of the qualifying rings in ring
        order, and the same ``bits`` object that was passed in.
    """
    # (ring size, first bit index, number of "count >= k" bits). Successive
    # bits belonging to one ring size are 7 positions apart.
    layout = ((3, 6, 2), (4, 20, 2), (5, 34, 5), (6, 69, 5),
              (7, 104, 2), (8, 118, 2), (9, 132, 1), (10, 139, 1))
    ringSize = []
    counts = dict.fromkeys(range(3, 11), 0)
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        kinds = [bond.GetBondType().name for bond in bonds]
        unsaturated = any(kind != 'SINGLE' for kind in kinds)
        nonaromatic = all(kind != 'AROMATIC' for kind in kinds)
        has_heteroatom = any(
            bond.GetBeginAtom().GetAtomicNum() not in [1, 6]
            or bond.GetEndAtom().GetAtomicNum() not in [1, 6]
            for bond in bonds
        )
        if unsaturated and nonaromatic and has_heteroatom:
            size = len(ring)
            ringSize.append(size)
            if size in counts:
                counts[size] += 1
    for size, first_bit, max_count in layout:
        for step in range(min(counts[size], max_count)):
            bits[first_bit + 7 * step] = 1
    return ringSize, bits
|
| 1613 |
+
|
| 1614 |
+
|
| 1615 |
+
def func_8(mol, bits):
    """ *Internal Use Only*
    aromatic rings or hetero-aromatic rings

    Counts, over the rings returned by ``mol.GetRingInfo().BondRings()``:
    the rings whose bonds are all AROMATIC, and the rings with a heteroatom
    (atomic number other than 1 or 6) at either end of at least one ring
    bond. Even bits 140..146 are set as cumulative "aromatic count >= k"
    flags (k = 1..4); odd bits 141..147 are set from the pair of counts.

    NOTE(review): the heteroatom count is taken over all rings, not only the
    aromatic ones, and below four the odd bits are set only when both counts
    are equal. Reproduced unchanged so existing fingerprints stay stable;
    confirm against the PubChem specification before altering.

    :param mol: molecule exposing the ring/bond accessors used below
        (presumably an RDKit Mol -- confirm against callers).
    :param bits: mutable flag sequence, updated in place (indices 140..147).
    :return: the same ``bits`` object that was passed in.
    """
    aromatic_rings = 0
    hetero_rings = 0
    for ring in mol.GetRingInfo().BondRings():
        bonds = [mol.GetBondWithIdx(idx) for idx in ring]
        if all(bond.GetBondType().name == 'AROMATIC' for bond in bonds):
            aromatic_rings += 1
        if any(bond.GetBeginAtom().GetAtomicNum() not in [1, 6]
               or bond.GetEndAtom().GetAtomicNum() not in [1, 6]
               for bond in bonds):
            hetero_rings += 1

    # Aromatic ring count -> bits 140, 142, 144, 146.
    for step in range(min(aromatic_rings, 4)):
        bits[140 + 2 * step] = 1

    # Hetero-aromatic flags -> bits 141, 143, 145, 147.
    if aromatic_rings >= 4 and hetero_rings >= 4:
        hetero_flags = 4
    elif aromatic_rings == hetero_rings and 1 <= aromatic_rings <= 3:
        hetero_flags = aromatic_rings
    else:
        hetero_flags = 0
    for step in range(hetero_flags):
        bits[141 + 2 * step] = 1
    return bits
|
| 1671 |
+
|
| 1672 |
+
|
| 1673 |
+
def calcPubChemFingerPart2(mol):  # 116-263
    """ *Internal Use Only*
    Calculate PubChem Fingerprints (116-263)

    Starts from a 148-element list of zeros and passes it through
    ``func_1`` .. ``func_8`` in order; each of them sets its own flags.

    :param mol: molecule object forwarded unchanged to every ``func_N``.
    :return: list of 148 integer flags (0 or 1).
    """
    bits = [0] * 148
    # func_1 .. func_7 return a pair; only its second element (the flag
    # list) is carried forward.
    for ring_counter in (func_1, func_2, func_3, func_4, func_5, func_6, func_7):
        bits = ring_counter(mol, bits)[1]
    # func_8 returns the flag list directly.
    bits = func_8(mol, bits)

    return bits
|
| 1688 |
+
|
| 1689 |
+
|
| 1690 |
+
def GetPubChemFPs(mol):
    """*Internal Use Only*
    Calculate PubChem Fingerprints

    Adds explicit hydrogens to ``mol`` and assembles an 881-element vector
    from two sources:

    * positions 0..114   <- characters 1..115 of the part-1 bit string
      (character 0 is skipped);
    * positions 115..262 <- the 148 flags from ``calcPubChemFingerPart2``;
    * positions 263..880 <- characters 116..733 of the part-1 bit string.

    :param mol: molecule accepted by ``Chem.AddHs``.
    :return: ``numpy`` boolean array of length 881.
    """
    mol = Chem.AddHs(mol)
    AllBits = [0] * 881

    part1 = list(calcPubChemFingerPart1(mol).ToBitString())
    for position, flag in enumerate(part1[1:116]):
        if flag == '1':
            AllBits[position] = 1
    # The tail of part 1 is placed after the 148 positions kept for part 2.
    for position, flag in enumerate(part1[116:734], start=115 + 148):
        if flag == '1':
            AllBits[position] = 1

    part2 = calcPubChemFingerPart2(mol)
    for position, flag in enumerate(part2, start=115):
        if flag == 1:
            AllBits[position] = 1

    AllBits = np.array(AllBits, dtype=np.bool_)

    return AllBits
|
| 1710 |
+
|
| 1711 |
+
|
| 1712 |
+
# ------------------------------------
|
| 1713 |
+
|
| 1714 |
+
|
| 1715 |
+
# Directory containing this module; used to locate data files shipped with it.
file_path = os.path.dirname(__file__)


def GetPubChemFPInfos():
    """Read ``pubchemfp.xlsx`` from this module's directory.

    :return: whatever ``pandas.read_excel`` returns for that workbook
        (presumably a per-bit description table -- confirm against the file).
    """
    workbook = os.path.join(file_path, 'pubchemfp.xlsx')
    return pd.read_excel(workbook)
|
| 1720 |
+
|
| 1721 |
+
|
| 1722 |
+
if __name__ == '__main__':
    # Demo: compute and print the fingerprint of one example molecule.
    rule = '-' * 10
    print(rule + 'START' + rule)
    SMILES = 'C1=NC2NC3=CNCC3=CC2CC1'
    mol = Chem.MolFromSmiles(SMILES)
    mol2 = Chem.AddHs(mol)
    result = GetPubChemFPs(mol2)
    print('Molecule: %s' % SMILES)
    print('-' * 25)
    print('Results: %s' % result)
    print(rule + 'END' + rule)
|
deepscreen/data/featurizers/fingerprint/pubchemfp.xlsx
ADDED
|
Binary file (41.2 kB). View file
|
|
|
deepscreen/data/featurizers/fingerprint/rdkitfp.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
topological fingerprint
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
from rdkit.Chem.rdmolops import RDKFingerprint
|
| 8 |
+
from rdkit.Chem import DataStructs
|
| 9 |
+
|
| 10 |
+
_type = 'topological-based'


def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
    """
    #################################################################
    Calculate the RDKit (Daylight-like) topological fingerprint.

    Usage:

        arr = GetRDkitFPs(mol)
        arr, bitInfo = GetRDkitFPs(mol, return_bitInfo=True)

    Input: mol is a molecule object; nBits is the fingerprint length
    passed to RDKFingerprint as fpSize (default 2048); return_bitInfo
    selects whether the bit information dictionary is returned as well.

    Output: a numpy boolean array filled from the fingerprint. When
    return_bitInfo is true, a tuple (arr, bitInfo) is returned, where
    bitInfo is the dictionary populated by RDKFingerprint.
    #################################################################
    """

    bitInfo = {}
    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
    # ConvertToNumpyArray resizes the destination array to the fingerprint
    # length, so an empty array is enough here.
    arr = np.zeros((0,), dtype=np.bool_)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        # Return the populated dictionary, not the boolean flag.
        return arr, bitInfo
    return arr
|
deepscreen/data/featurizers/fingerprint/smarts_maccskey.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MACCS key definitions, keyed 'MACCSFP<n>' for n = 0..166.
# Each value is a (SMARTS pattern, integer) pair. The integer is presumably
# the match-count threshold used in RDKit's MACCS key table (bit set when
# the number of matches exceeds it) -- confirm against the consuming code.
# Entries whose pattern is '?' (1, 44, 125, 166) carry no usable SMARTS and
# presumably need special handling by the consumer -- TODO confirm.
smartsPatts = {
    'MACCSFP0': (None, 0),
    # ignore, Bit 0 is a placeholder and should be ignored: https://github.com/rdkit/rdkit/issues/1726
    'MACCSFP1': ('?', 0),
    'MACCSFP2': ('[#104]', 0),
    'MACCSFP3': ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),
    'MACCSFP4': ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
    'MACCSFP5': ('[Sc,Ti,Y,Zr,Hf]', 0),
    'MACCSFP6': ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
    'MACCSFP7': ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
    'MACCSFP8': ('[!#6;!#1]1~*~*~*~1', 0),
    'MACCSFP9': ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
    'MACCSFP10': ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
    'MACCSFP11': ('*1~*~*~*~1', 0),
    'MACCSFP12': ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
    'MACCSFP13': ('[#8]~[#7](~[#6])~[#6]', 0),
    'MACCSFP14': ('[#16]-[#16]', 0),
    'MACCSFP15': ('[#8]~[#6](~[#8])~[#8]', 0),
    'MACCSFP16': ('[!#6;!#1]1~*~*~1', 0),
    'MACCSFP17': ('[#6]#[#6]', 0),
    'MACCSFP18': ('[#5,#13,#31,#49,#81]', 0),
    'MACCSFP19': ('*1~*~*~*~*~*~*~1', 0),
    'MACCSFP20': ('[#14]', 0),
    'MACCSFP21': ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
    'MACCSFP22': ('*1~*~*~1', 0),
    'MACCSFP23': ('[#7]~[#6](~[#8])~[#8]', 0),
    'MACCSFP24': ('[#7]-[#8]', 0),
    'MACCSFP25': ('[#7]~[#6](~[#7])~[#7]', 0),
    'MACCSFP26': ('[#6]=;@[#6](@*)@*', 0),
    'MACCSFP27': ('[I]', 0),
    'MACCSFP28': ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
    'MACCSFP29': ('[#15]', 0),
    'MACCSFP30': ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
    'MACCSFP31': ('[!#6;!#1]~[F,Cl,Br,I]', 0),
    'MACCSFP32': ('[#6]~[#16]~[#7]', 0),
    'MACCSFP33': ('[#7]~[#16]', 0),
    'MACCSFP34': ('[CH2]=*', 0),
    'MACCSFP35': ('[Li,Na,K,Rb,Cs,Fr]', 0),
    'MACCSFP36': ('[#16R]', 0),
    'MACCSFP37': ('[#7]~[#6](~[#8])~[#7]', 0),
    'MACCSFP38': ('[#7]~[#6](~[#6])~[#7]', 0),
    'MACCSFP39': ('[#8]~[#16](~[#8])~[#8]', 0),
    'MACCSFP40': ('[#16]-[#8]', 0),
    'MACCSFP41': ('[#6]#[#7]', 0),
    'MACCSFP42': ('F', 0),
    'MACCSFP43': ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),
    'MACCSFP44': ('?', 0),
    'MACCSFP45': ('[#6]=[#6]~[#7]', 0),
    'MACCSFP46': ('Br', 0),
    'MACCSFP47': ('[#16]~*~[#7]', 0),
    'MACCSFP48': ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
    'MACCSFP49': ('[!+0]', 0),
    'MACCSFP50': ('[#6]=[#6](~[#6])~[#6]', 0),
    'MACCSFP51': ('[#6]~[#16]~[#8]', 0),
    'MACCSFP52': ('[#7]~[#7]', 0),
    'MACCSFP53': ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
    'MACCSFP54': ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
    'MACCSFP55': ('[#8]~[#16]~[#8]', 0),
    'MACCSFP56': ('[#8]~[#7](~[#8])~[#6]', 0),
    'MACCSFP57': ('[#8R]', 0),
    'MACCSFP58': ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
    'MACCSFP59': ('[#16]!:*:*', 0),
    'MACCSFP60': ('[#16]=[#8]', 0),
    'MACCSFP61': ('*~[#16](~*)~*', 0),
    'MACCSFP62': ('*@*!@*@*', 0),
    'MACCSFP63': ('[#7]=[#8]', 0),
    'MACCSFP64': ('*@*!@[#16]', 0),
    'MACCSFP65': ('c:n', 0),
    'MACCSFP66': ('[#6]~[#6](~[#6])(~[#6])~*', 0),
    'MACCSFP67': ('[!#6;!#1]~[#16]', 0),
    'MACCSFP68': ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
    'MACCSFP69': ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
    'MACCSFP70': ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
    'MACCSFP71': ('[#7]~[#8]', 0),
    'MACCSFP72': ('[#8]~*~*~[#8]', 0),
    'MACCSFP73': ('[#16]=*', 0),
    'MACCSFP74': ('[CH3]~*~[CH3]', 0),
    'MACCSFP75': ('*!@[#7]@*', 0),
    'MACCSFP76': ('[#6]=[#6](~*)~*', 0),
    'MACCSFP77': ('[#7]~*~[#7]', 0),
    'MACCSFP78': ('[#6]=[#7]', 0),
    'MACCSFP79': ('[#7]~*~*~[#7]', 0),
    'MACCSFP80': ('[#7]~*~*~*~[#7]', 0),
    'MACCSFP81': ('[#16]~*(~*)~*', 0),
    'MACCSFP82': ('*~[CH2]~[!#6;!#1;!H0]', 0),
    'MACCSFP83': ('[!#6;!#1]1~*~*~*~*~1', 0),
    'MACCSFP84': ('[NH2]', 0),
    'MACCSFP85': ('[#6]~[#7](~[#6])~[#6]', 0),
    'MACCSFP86': ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
    'MACCSFP87': ('[F,Cl,Br,I]!@*@*', 0),
    'MACCSFP88': ('[#16]', 0),
    'MACCSFP89': ('[#8]~*~*~*~[#8]', 0),
    'MACCSFP90': (
        '[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',
        0),
    'MACCSFP91': (
        '[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',
        0),
    'MACCSFP92': ('[#8]~[#6](~[#7])~[#6]', 0),
    'MACCSFP93': ('[!#6;!#1]~[CH3]', 0),
    'MACCSFP94': ('[!#6;!#1]~[#7]', 0),
    'MACCSFP95': ('[#7]~*~*~[#8]', 0),
    'MACCSFP96': ('*1~*~*~*~*~1', 0),
    'MACCSFP97': ('[#7]~*~*~*~[#8]', 0),
    'MACCSFP98': ('[!#6;!#1]1~*~*~*~*~*~1', 0),
    'MACCSFP99': ('[#6]=[#6]', 0),
    'MACCSFP100': ('*~[CH2]~[#7]', 0),
    'MACCSFP101': (
        '[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',
        0),
    'MACCSFP102': ('[!#6;!#1]~[#8]', 0),
    'MACCSFP103': ('Cl', 0),
    'MACCSFP104': ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
    'MACCSFP105': ('*@*(@*)@*', 0),
    'MACCSFP106': ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
    'MACCSFP107': ('[F,Cl,Br,I]~*(~*)~*', 0),
    'MACCSFP108': ('[CH3]~*~*~*~[CH2]~*', 0),
    'MACCSFP109': ('*~[CH2]~[#8]', 0),
    'MACCSFP110': ('[#7]~[#6]~[#8]', 0),
    'MACCSFP111': ('[#7]~*~[CH2]~*', 0),
    'MACCSFP112': ('*~*(~*)(~*)~*', 0),
    'MACCSFP113': ('[#8]!:*:*', 0),
    'MACCSFP114': ('[CH3]~[CH2]~*', 0),
    'MACCSFP115': ('[CH3]~*~[CH2]~*', 0),
    'MACCSFP116': ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
    'MACCSFP117': ('[#7]~*~[#8]', 0),
    'MACCSFP118': ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
    'MACCSFP119': ('[#7]=*', 0),
    'MACCSFP120': ('[!#6;R]', 1),
    'MACCSFP121': ('[#7;R]', 0),
    'MACCSFP122': ('*~[#7](~*)~*', 0),
    'MACCSFP123': ('[#8]~[#6]~[#8]', 0),
    'MACCSFP124': ('[!#6;!#1]~[!#6;!#1]', 0),
    'MACCSFP125': ('?', 0),
    'MACCSFP126': ('*!@[#8]!@*', 0),
    'MACCSFP127': ('*@*!@[#8]', 1),
    'MACCSFP128': (
        '[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',
        0),
    'MACCSFP129': ('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',
                   0),
    'MACCSFP130': ('[!#6;!#1]~[!#6;!#1]', 1),
    'MACCSFP131': ('[!#6;!#1;!H0]', 1),
    'MACCSFP132': ('[#8]~*~[CH2]~*', 0),
    'MACCSFP133': ('*@*!@[#7]', 0),
    'MACCSFP134': ('[F,Cl,Br,I]', 0),
    'MACCSFP135': ('[#7]!:*:*', 0),
    'MACCSFP136': ('[#8]=*', 1),
    'MACCSFP137': ('[!C;!c;R]', 0),
    'MACCSFP138': ('[!#6;!#1]~[CH2]~*', 1),
    'MACCSFP139': ('[O;!H0]', 0),
    'MACCSFP140': ('[#8]', 3),
    'MACCSFP141': ('[CH3]', 2),
    'MACCSFP142': ('[#7]', 1),
    'MACCSFP143': ('*@*!@[#8]', 0),
    'MACCSFP144': ('*!:*:*!:*', 0),
    'MACCSFP145': ('*1~*~*~*~*~*~1', 1),
    'MACCSFP146': ('[#8]', 2),
    'MACCSFP147': ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
    'MACCSFP148': ('*~[!#6;!#1](~*)~*', 0),
    'MACCSFP149': ('[C;H3,H4]', 1),
    'MACCSFP150': ('*!@*@*!@*', 0),
    'MACCSFP151': ('[#7;!H0]', 0),
    'MACCSFP152': ('[#8]~[#6](~[#6])~[#6]', 0),
    'MACCSFP153': ('[!#6;!#1]~[CH2]~*', 0),
    'MACCSFP154': ('[#6]=[#8]', 0),
    'MACCSFP155': ('*!@[CH2]!@*', 0),
    'MACCSFP156': ('[#7]~*(~*)~*', 0),
    'MACCSFP157': ('[#6]-[#8]', 0),
    'MACCSFP158': ('[#6]-[#7]', 0),
    'MACCSFP159': ('[#8]', 1),
    'MACCSFP160': ('[C;H3,H4]', 0),
    'MACCSFP161': ('[#7]', 0),
    'MACCSFP162': ('a', 0),
    'MACCSFP163': ('*1~*~*~*~*~*~1', 0),
    'MACCSFP164': ('[#8]', 0),
    'MACCSFP165': ('[R]', 0),
    'MACCSFP166': ('?', 0)}
|
deepscreen/data/featurizers/fingerprint/smarts_pharmacophore.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SMARTS pattern lists for six pharmacophore feature classes, plus a dict
# (pharmacophore_smarts, at the bottom) that exposes them by feature name.
# NOTE(review): presumably consumed by the pharmacophore fingerprint
# featurizers in this package -- confirm against the importing modules.
#
# Donor: N carrying at least one H (valence 3, or valence 4 with a +1 charge),
# neutral O/S with exactly one H, and neutral aromatic n with one H.
Donor = ["[N;!H0;v3,v4&+1]", "[O,S;H1;+0]", "[n&H1&+0]"]
|
| 2 |
+
|
| 3 |
+
# Acceptor: O/S-H not attached to an atom that is double-bonded to O/N/P/S,
# H-free divalent O, anionic monovalent O/S, trivalent N with the same
# exclusion, neutral H-free aromatic n, and neutral aromatic o that is neither
# bonded to n nor one aromatic carbon away from n.
Acceptor = ["[O,S;H1;v2;!$(*-*=[O,N,P,S])]", "[O;H0;v2]", "[O,S;v1;-]",
|
| 4 |
+
"[N;v3;!$(N-*=[O,N,P,S])]", "[n&H0&+0]", "[o;+0;!$([o]:n);!$([o]:c:n)]"]
|
| 5 |
+
|
| 6 |
+
# Positive: any positively charged nitrogen, plus neutral N with two, one or
# zero hydrogens whose listed neighbours are not carbons double-bonded to O.
Positive = ["[#7;+]", "[N;H2&+0][$([C,a]);!$([C,a](=O))]",
|
| 7 |
+
"[N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]",
|
| 8 |
+
"[N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))]"]
|
| 9 |
+
|
| 10 |
+
# Negative: C or S double-bonded to O/S/P and single-bonded to an oxygen that
# either carries one H or has a -1 charge.
Negative = ["[C,S](=[O,S,P])-[O;H1,-1]"]
|
| 11 |
+
|
| 12 |
+
# Hydrophobic: a degree-3/4 carbon bearing two methyl groups, and a
# two-connected sulfur between two aliphatic carbons.
Hydrophobic = ["[C;D3,D4](-[CH3])-[CH3]", "[S;D2](-C)-C"]
|
| 13 |
+
|
| 14 |
+
# Aromatic: any aromatic atom.
Aromatic = ["a"]
|
| 15 |
+
|
| 16 |
+
# Lookup table: feature-class name -> its list of SMARTS strings defined above.
pharmacophore_smarts = {"Donor": Donor,
|
| 17 |
+
"Acceptor": Acceptor,
|
| 18 |
+
"Positive": Positive,
|
| 19 |
+
"Negative": Negative,
|
| 20 |
+
"Hydrophobic": Hydrophobic,
|
| 21 |
+
"Aromatic": Aromatic}
|
deepscreen/data/featurizers/fingerprint/smarts_pubchem.py
ADDED
|
@@ -0,0 +1,734 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
smartsPatts = {
|
| 2 |
+
'PubChemFP0': ('[H]', 3),
|
| 3 |
+
'PubChemFP1': ('[H]', 7),
|
| 4 |
+
'PubChemFP2': ('[H]', 15),
|
| 5 |
+
'PubChemFP3': ('[H]', 31),
|
| 6 |
+
'PubChemFP4': ('[Li]', 0),
|
| 7 |
+
'PubChemFP5': ('[Li]', 1),
|
| 8 |
+
'PubChemFP6': ('[B]', 0),
|
| 9 |
+
'PubChemFP7': ('[B]', 1),
|
| 10 |
+
'PubChemFP8': ('[B]', 3),
|
| 11 |
+
'PubChemFP9': ('[C]', 1),
|
| 12 |
+
'PubChemFP10': ('[C]', 3),
|
| 13 |
+
'PubChemFP11': ('[C]', 7),
|
| 14 |
+
'PubChemFP12': ('[C]', 15),
|
| 15 |
+
'PubChemFP13': ('[C]', 31),
|
| 16 |
+
'PubChemFP14': ('[N]', 0),
|
| 17 |
+
'PubChemFP15': ('[N]', 1),
|
| 18 |
+
'PubChemFP16': ('[N]', 3),
|
| 19 |
+
'PubChemFP17': ('[N]', 7),
|
| 20 |
+
'PubChemFP18': ('[O]', 0),
|
| 21 |
+
'PubChemFP19': ('[O]', 1),
|
| 22 |
+
'PubChemFP20': ('[O]', 3),
|
| 23 |
+
'PubChemFP21': ('[O]', 7),
|
| 24 |
+
'PubChemFP22': ('[O]', 15),
|
| 25 |
+
'PubChemFP23': ('[F]', 0),
|
| 26 |
+
'PubChemFP24': ('[F]', 1),
|
| 27 |
+
'PubChemFP25': ('[F]', 3),
|
| 28 |
+
'PubChemFP26': ('[Na]', 0),
|
| 29 |
+
'PubChemFP27': ('[Na]', 1),
|
| 30 |
+
'PubChemFP28': ('[Si]', 0),
|
| 31 |
+
'PubChemFP29': ('[Si]', 1),
|
| 32 |
+
'PubChemFP30': ('[P]', 0),
|
| 33 |
+
'PubChemFP31': ('[P]', 1),
|
| 34 |
+
'PubChemFP32': ('[P]', 3),
|
| 35 |
+
'PubChemFP33': ('[S]', 0),
|
| 36 |
+
'PubChemFP34': ('[S]', 1),
|
| 37 |
+
'PubChemFP35': ('[S]', 3),
|
| 38 |
+
'PubChemFP36': ('[S]', 7),
|
| 39 |
+
'PubChemFP37': ('[Cl]', 0),
|
| 40 |
+
'PubChemFP38': ('[Cl]', 1),
|
| 41 |
+
'PubChemFP39': ('[Cl]', 3),
|
| 42 |
+
'PubChemFP40': ('[Cl]', 7),
|
| 43 |
+
'PubChemFP41': ('[K]', 0),
|
| 44 |
+
'PubChemFP42': ('[K]', 1),
|
| 45 |
+
'PubChemFP43': ('[Br]', 0),
|
| 46 |
+
'PubChemFP44': ('[Br]', 1),
|
| 47 |
+
'PubChemFP45': ('[Br]', 3),
|
| 48 |
+
'PubChemFP46': ('[I]', 0),
|
| 49 |
+
'PubChemFP47': ('[I]', 1),
|
| 50 |
+
'PubChemFP48': ('[I]', 3),
|
| 51 |
+
'PubChemFP49': ('[Be]', 0),
|
| 52 |
+
'PubChemFP50': ('[Mg]', 0),
|
| 53 |
+
'PubChemFP51': ('[Al]', 0),
|
| 54 |
+
'PubChemFP52': ('[Ca]', 0),
|
| 55 |
+
'PubChemFP53': ('[Sc]', 0),
|
| 56 |
+
'PubChemFP54': ('[Ti]', 0),
|
| 57 |
+
'PubChemFP55': ('[V]', 0),
|
| 58 |
+
'PubChemFP56': ('[Cr]', 0),
|
| 59 |
+
'PubChemFP57': ('[Mn]', 0),
|
| 60 |
+
'PubChemFP58': ('[Fe]', 0),
|
| 61 |
+
'PubChemFP59': ('[CO]', 0),
|
| 62 |
+
'PubChemFP60': ('[Ni]', 0),
|
| 63 |
+
'PubChemFP61': ('[Cu]', 0),
|
| 64 |
+
'PubChemFP62': ('[Zn]', 0),
|
| 65 |
+
'PubChemFP63': ('[Ga]', 0),
|
| 66 |
+
'PubChemFP64': ('[Ge]', 0),
|
| 67 |
+
'PubChemFP65': ('[As]', 0),
|
| 68 |
+
'PubChemFP66': ('[Se]', 0),
|
| 69 |
+
'PubChemFP67': ('[Kr]', 0),
|
| 70 |
+
'PubChemFP68': ('[Rb]', 0),
|
| 71 |
+
'PubChemFP69': ('[Sr]', 0),
|
| 72 |
+
'PubChemFP70': ('[Y]', 0),
|
| 73 |
+
'PubChemFP71': ('[Zr]', 0),
|
| 74 |
+
'PubChemFP72': ('[Nb]', 0),
|
| 75 |
+
'PubChemFP73': ('[Mo]', 0),
|
| 76 |
+
'PubChemFP74': ('[Ru]', 0),
|
| 77 |
+
'PubChemFP75': ('[Rh]', 0),
|
| 78 |
+
'PubChemFP76': ('[Pd]', 0),
|
| 79 |
+
'PubChemFP77': ('[Ag]', 0),
|
| 80 |
+
'PubChemFP78': ('[Cd]', 0),
|
| 81 |
+
'PubChemFP79': ('[In]', 0),
|
| 82 |
+
'PubChemFP80': ('[Sn]', 0),
|
| 83 |
+
'PubChemFP81': ('[Sb]', 0),
|
| 84 |
+
'PubChemFP82': ('[Te]', 0),
|
| 85 |
+
'PubChemFP83': ('[Xe]', 0),
|
| 86 |
+
'PubChemFP84': ('[Cs]', 0),
|
| 87 |
+
'PubChemFP85': ('[Ba]', 0),
|
| 88 |
+
'PubChemFP86': ('[Lu]', 0),
|
| 89 |
+
'PubChemFP87': ('[Hf]', 0),
|
| 90 |
+
'PubChemFP88': ('[Ta]', 0),
|
| 91 |
+
'PubChemFP89': ('[W]', 0),
|
| 92 |
+
'PubChemFP90': ('[Re]', 0),
|
| 93 |
+
'PubChemFP91': ('[Os]', 0),
|
| 94 |
+
'PubChemFP92': ('[Ir]', 0),
|
| 95 |
+
'PubChemFP93': ('[Pt]', 0),
|
| 96 |
+
'PubChemFP94': ('[Au]', 0),
|
| 97 |
+
'PubChemFP95': ('[Hg]', 0),
|
| 98 |
+
'PubChemFP96': ('[Tl]', 0),
|
| 99 |
+
'PubChemFP97': ('[Pb]', 0),
|
| 100 |
+
'PubChemFP98': ('[Bi]', 0),
|
| 101 |
+
'PubChemFP99': ('[La]', 0),
|
| 102 |
+
'PubChemFP100': ('[Ce]', 0),
|
| 103 |
+
'PubChemFP101': ('[Pr]', 0),
|
| 104 |
+
'PubChemFP102': ('[Nd]', 0),
|
| 105 |
+
'PubChemFP103': ('[Pm]', 0),
|
| 106 |
+
'PubChemFP104': ('[Sm]', 0),
|
| 107 |
+
'PubChemFP105': ('[Eu]', 0),
|
| 108 |
+
'PubChemFP106': ('[Gd]', 0),
|
| 109 |
+
'PubChemFP107': ('[Tb]', 0),
|
| 110 |
+
'PubChemFP108': ('[Dy]', 0),
|
| 111 |
+
'PubChemFP109': ('[Ho]', 0),
|
| 112 |
+
'PubChemFP110': ('[Er]', 0),
|
| 113 |
+
'PubChemFP111': ('[Tm]', 0),
|
| 114 |
+
'PubChemFP112': ('[Yb]', 0),
|
| 115 |
+
'PubChemFP113': ('[Tc]', 0),
|
| 116 |
+
'PubChemFP114': ('[U]', 0),
|
| 117 |
+
'PubChemFP263': ('[Li&!H0]', 0),
|
| 118 |
+
'PubChemFP264': ('[Li]~[Li]', 0),
|
| 119 |
+
'PubChemFP265': ('[Li]~[#5]', 0),
|
| 120 |
+
'PubChemFP266': ('[Li]~[#6]', 0),
|
| 121 |
+
'PubChemFP267': ('[Li]~[#8]', 0),
|
| 122 |
+
'PubChemFP268': ('[Li]~[F]', 0),
|
| 123 |
+
'PubChemFP269': ('[Li]~[#15]', 0),
|
| 124 |
+
'PubChemFP270': ('[Li]~[#16]', 0),
|
| 125 |
+
'PubChemFP271': ('[Li]~[Cl]', 0),
|
| 126 |
+
'PubChemFP272': ('[#5&!H0]', 0),
|
| 127 |
+
'PubChemFP273': ('[#5]~[#5]', 0),
|
| 128 |
+
'PubChemFP274': ('[#5]~[#6]', 0),
|
| 129 |
+
'PubChemFP275': ('[#5]~[#7]', 0),
|
| 130 |
+
'PubChemFP276': ('[#5]~[#8]', 0),
|
| 131 |
+
'PubChemFP277': ('[#5]~[F]', 0),
|
| 132 |
+
'PubChemFP278': ('[#5]~[#14]', 0),
|
| 133 |
+
'PubChemFP279': ('[#5]~[#15]', 0),
|
| 134 |
+
'PubChemFP280': ('[#5]~[#16]', 0),
|
| 135 |
+
'PubChemFP281': ('[#5]~[Cl]', 0),
|
| 136 |
+
'PubChemFP282': ('[#5]~[Br]', 0),
|
| 137 |
+
'PubChemFP283': ('[#6&!H0]', 0),
|
| 138 |
+
'PubChemFP284': ('[#6]~[#6]', 0),
|
| 139 |
+
'PubChemFP285': ('[#6]~[#7]', 0),
|
| 140 |
+
'PubChemFP286': ('[#6]~[#8]', 0),
|
| 141 |
+
'PubChemFP287': ('[#6]~[F]', 0),
|
| 142 |
+
'PubChemFP288': ('[#6]~[Na]', 0),
|
| 143 |
+
'PubChemFP289': ('[#6]~[Mg]', 0),
|
| 144 |
+
'PubChemFP290': ('[#6]~[Al]', 0),
|
| 145 |
+
'PubChemFP291': ('[#6]~[#14]', 0),
|
| 146 |
+
'PubChemFP292': ('[#6]~[#15]', 0),
|
| 147 |
+
'PubChemFP293': ('[#6]~[#16]', 0),
|
| 148 |
+
'PubChemFP294': ('[#6]~[Cl]', 0),
|
| 149 |
+
'PubChemFP295': ('[#6]~[#33]', 0),
|
| 150 |
+
'PubChemFP296': ('[#6]~[#34]', 0),
|
| 151 |
+
'PubChemFP297': ('[#6]~[Br]', 0),
|
| 152 |
+
'PubChemFP298': ('[#6]~[I]', 0),
|
| 153 |
+
'PubChemFP299': ('[#7&!H0]', 0),
|
| 154 |
+
'PubChemFP300': ('[#7]~[#7]', 0),
|
| 155 |
+
'PubChemFP301': ('[#7]~[#8]', 0),
|
| 156 |
+
'PubChemFP302': ('[#7]~[F]', 0),
|
| 157 |
+
'PubChemFP303': ('[#7]~[#14]', 0),
|
| 158 |
+
'PubChemFP304': ('[#7]~[#15]', 0),
|
| 159 |
+
'PubChemFP305': ('[#7]~[#16]', 0),
|
| 160 |
+
'PubChemFP306': ('[#7]~[Cl]', 0),
|
| 161 |
+
'PubChemFP307': ('[#7]~[Br]', 0),
|
| 162 |
+
'PubChemFP308': ('[#8&!H0]', 0),
|
| 163 |
+
'PubChemFP309': ('[#8]~[#8]', 0),
|
| 164 |
+
'PubChemFP310': ('[#8]~[Mg]', 0),
|
| 165 |
+
'PubChemFP311': ('[#8]~[Na]', 0),
|
| 166 |
+
'PubChemFP312': ('[#8]~[Al]', 0),
|
| 167 |
+
'PubChemFP313': ('[#8]~[#14]', 0),
|
| 168 |
+
'PubChemFP314': ('[#8]~[#15]', 0),
|
| 169 |
+
'PubChemFP315': ('[#8]~[K]', 0),
|
| 170 |
+
'PubChemFP316': ('[F]~[#15]', 0),
|
| 171 |
+
'PubChemFP317': ('[F]~[#16]', 0),
|
| 172 |
+
'PubChemFP318': ('[Al&!H0]', 0),
|
| 173 |
+
'PubChemFP319': ('[Al]~[Cl]', 0),
|
| 174 |
+
'PubChemFP320': ('[#14&!H0]', 0),
|
| 175 |
+
'PubChemFP321': ('[#14]~[#14]', 0),
|
| 176 |
+
'PubChemFP322': ('[#14]~[Cl]', 0),
|
| 177 |
+
'PubChemFP323': ('[#15&!H0]', 0),
|
| 178 |
+
'PubChemFP324': ('[#15]~[#15]', 0),
|
| 179 |
+
'PubChemFP325': ('[#33&!H0]', 0),
|
| 180 |
+
'PubChemFP326': ('[#33]~[#33]', 0),
|
| 181 |
+
'PubChemFP327': ('[#6](~Br)(~[#6])', 0),
|
| 182 |
+
'PubChemFP328': ('[#6](~Br)(~[#6])(~[#6])', 0),
|
| 183 |
+
'PubChemFP329': ('[#6&!H0]~[Br]', 0),
|
| 184 |
+
'PubChemFP330': ('[#6](~[Br])(:[c])', 0),
|
| 185 |
+
'PubChemFP331': ('[#6](~[Br])(:[n])', 0),
|
| 186 |
+
'PubChemFP332': ('[#6](~[#6])(~[#6])', 0),
|
| 187 |
+
'PubChemFP333': ('[#6](~[#6])(~[#6])(~[#6])', 0),
|
| 188 |
+
'PubChemFP334': ('[#6](~[#6])(~[#6])(~[#6])(~[#6])', 0),
|
| 189 |
+
'PubChemFP335': ('[#6H1](~[#6])(~[#6])(~[#6])', 0),
|
| 190 |
+
'PubChemFP336': ('[#6](~[#6])(~[#6])(~[#6])(~[#7])', 0),
|
| 191 |
+
'PubChemFP337': ('[#6](~[#6])(~[#6])(~[#6])(~[#8])', 0),
|
| 192 |
+
'PubChemFP338': ('[#6H1](~[#6])(~[#6])(~[#7])', 0),
|
| 193 |
+
'PubChemFP339': ('[#6H1](~[#6])(~[#6])(~[#8])', 0),
|
| 194 |
+
'PubChemFP340': ('[#6](~[#6])(~[#6])(~[#7])', 0),
|
| 195 |
+
'PubChemFP341': ('[#6](~[#6])(~[#6])(~[#8])', 0),
|
| 196 |
+
'PubChemFP342': ('[#6](~[#6])(~[Cl])', 0),
|
| 197 |
+
'PubChemFP343': ('[#6&!H0](~[#6])(~[Cl])', 0),
|
| 198 |
+
'PubChemFP344': ('[#6H,#6H2,#6H3,#6H4]~[#6]', 0),
|
| 199 |
+
'PubChemFP345': ('[#6&!H0](~[#6])(~[#7])', 0),
|
| 200 |
+
'PubChemFP346': ('[#6&!H0](~[#6])(~[#8])', 0),
|
| 201 |
+
'PubChemFP347': ('[#6H1](~[#6])(~[#8])(~[#8])', 0),
|
| 202 |
+
'PubChemFP348': ('[#6&!H0](~[#6])(~[#15])', 0),
|
| 203 |
+
'PubChemFP349': ('[#6&!H0](~[#6])(~[#16])', 0),
|
| 204 |
+
'PubChemFP350': ('[#6](~[#6])(~[I])', 0),
|
| 205 |
+
'PubChemFP351': ('[#6](~[#6])(~[#7])', 0),
|
| 206 |
+
'PubChemFP352': ('[#6](~[#6])(~[#8])', 0),
|
| 207 |
+
'PubChemFP353': ('[#6](~[#6])(~[#16])', 0),
|
| 208 |
+
'PubChemFP354': ('[#6](~[#6])(~[#14])', 0),
|
| 209 |
+
'PubChemFP355': ('[#6](~[#6])(:c)', 0),
|
| 210 |
+
'PubChemFP356': ('[#6](~[#6])(:c)(:c)', 0),
|
| 211 |
+
'PubChemFP357': ('[#6](~[#6])(:c)(:n)', 0),
|
| 212 |
+
'PubChemFP358': ('[#6](~[#6])(:n)', 0),
|
| 213 |
+
'PubChemFP359': ('[#6](~[#6])(:n)(:n)', 0),
|
| 214 |
+
'PubChemFP360': ('[#6](~[Cl])(~[Cl])', 0),
|
| 215 |
+
'PubChemFP361': ('[#6&!H0](~[Cl])', 0),
|
| 216 |
+
'PubChemFP362': ('[#6](~[Cl])(:c)', 0),
|
| 217 |
+
'PubChemFP363': ('[#6](~[F])(~[F])', 0),
|
| 218 |
+
'PubChemFP364': ('[#6](~[F])(:c)', 0),
|
| 219 |
+
'PubChemFP365': ('[#6&!H0](~[#7])', 0),
|
| 220 |
+
'PubChemFP366': ('[#6&!H0](~[#8])', 0),
|
| 221 |
+
'PubChemFP367': ('[#6&!H0](~[#8])(~[#8])', 0),
|
| 222 |
+
'PubChemFP368': ('[#6&!H0](~[#16])', 0),
|
| 223 |
+
'PubChemFP369': ('[#6&!H0](~[#14])', 0),
|
| 224 |
+
'PubChemFP370': ('[#6&!H0]:c', 0),
|
| 225 |
+
'PubChemFP371': ('[#6&!H0](:c)(:c)', 0),
|
| 226 |
+
'PubChemFP372': ('[#6&!H0](:c)(:n)', 0),
|
| 227 |
+
'PubChemFP373': ('[#6&!H0](:n)', 0),
|
| 228 |
+
'PubChemFP374': ('[#6H3]', 0),
|
| 229 |
+
'PubChemFP375': ('[#6](~[#7])(~[#7])', 0),
|
| 230 |
+
'PubChemFP376': ('[#6](~[#7])(:c)', 0),
|
| 231 |
+
'PubChemFP377': ('[#6](~[#7])(:c)(:c)', 0),
|
| 232 |
+
'PubChemFP378': ('[#6](~[#7])(:c)(:n)', 0),
|
| 233 |
+
'PubChemFP379': ('[#6](~[#7])(:n)', 0),
|
| 234 |
+
'PubChemFP380': ('[#6](~[#8])(~[#8])', 0),
|
| 235 |
+
'PubChemFP381': ('[#6](~[#8])(:c)', 0),
|
| 236 |
+
'PubChemFP382': ('[#6](~[#8])(:c)(:c)', 0),
|
| 237 |
+
'PubChemFP383': ('[#6](~[#16])(:c)', 0),
|
| 238 |
+
'PubChemFP384': ('[#6](:c)(:c)', 0),
|
| 239 |
+
'PubChemFP385': ('[#6](:c)(:c)(:c)', 0),
|
| 240 |
+
'PubChemFP386': ('[#6](:c)(:c)(:n)', 0),
|
| 241 |
+
'PubChemFP387': ('[#6](:c)(:n)', 0),
|
| 242 |
+
'PubChemFP388': ('[#6](:c)(:n)(:n)', 0),
|
| 243 |
+
'PubChemFP389': ('[#6](:n)(:n)', 0),
|
| 244 |
+
'PubChemFP390': ('[#7](~[#6])(~[#6])', 0),
|
| 245 |
+
'PubChemFP391': ('[#7](~[#6])(~[#6])(~[#6])', 0),
|
| 246 |
+
'PubChemFP392': ('[#7&!H0](~[#6])(~[#6])', 0),
|
| 247 |
+
'PubChemFP393': ('[#7&!H0](~[#6])', 0),
|
| 248 |
+
'PubChemFP394': ('[#7&!H0](~[#6])(~[#7])', 0),
|
| 249 |
+
'PubChemFP395': ('[#7](~[#6])(~[#8])', 0),
|
| 250 |
+
'PubChemFP396': ('[#7](~[#6])(:c)', 0),
|
| 251 |
+
'PubChemFP397': ('[#7](~[#6])(:c)(:c)', 0),
|
| 252 |
+
'PubChemFP398': ('[#7&!H0](~[#7])', 0),
|
| 253 |
+
'PubChemFP399': ('[#7&!H0](:c)', 0),
|
| 254 |
+
'PubChemFP400': ('[#7&!H0](:c)(:c)', 0),
|
| 255 |
+
'PubChemFP401': ('[#7](~[#8])(~[#8])', 0),
|
| 256 |
+
'PubChemFP402': ('[#7](~[#8])(:o)', 0),
|
| 257 |
+
'PubChemFP403': ('[#7](:c)(:c)', 0),
|
| 258 |
+
'PubChemFP404': ('[#7](:c)(:c)(:c)', 0),
|
| 259 |
+
'PubChemFP405': ('[#8](~[#6])(~[#6])', 0),
|
| 260 |
+
'PubChemFP406': ('[#8&!H0](~[#6])', 0),
|
| 261 |
+
'PubChemFP407': ('[#8](~[#6])(~[#15])', 0),
|
| 262 |
+
'PubChemFP408': ('[#8&!H0](~[#16])', 0),
|
| 263 |
+
'PubChemFP409': ('[#8](:c)(:c)', 0),
|
| 264 |
+
'PubChemFP410': ('[#15](~[#6])(~[#6])', 0),
|
| 265 |
+
'PubChemFP411': ('[#15](~[#8])(~[#8])', 0),
|
| 266 |
+
'PubChemFP412': ('[#16](~[#6])(~[#6])', 0),
|
| 267 |
+
'PubChemFP413': ('[#16&!H0](~[#6])', 0),
|
| 268 |
+
'PubChemFP414': ('[#16](~[#6])(~[#8])', 0),
|
| 269 |
+
'PubChemFP415': ('[#14](~[#6])(~[#6])', 0),
|
| 270 |
+
'PubChemFP416': ('[#6]=,:[#6]', 0),
|
| 271 |
+
'PubChemFP417': ('[#6]#[#6]', 0),
|
| 272 |
+
'PubChemFP418': ('[#6]=,:[#7]', 0),
|
| 273 |
+
'PubChemFP419': ('[#6]#[#7]', 0),
|
| 274 |
+
'PubChemFP420': ('[#6]=,:[#8]', 0),
|
| 275 |
+
'PubChemFP421': ('[#6]=,:[#16]', 0),
|
| 276 |
+
'PubChemFP422': ('[#7]=,:[#7]', 0),
|
| 277 |
+
'PubChemFP423': ('[#7]=,:[#8]', 0),
|
| 278 |
+
'PubChemFP424': ('[#7]=,:[#15]', 0),
|
| 279 |
+
'PubChemFP425': ('[#15]=,:[#8]', 0),
|
| 280 |
+
'PubChemFP426': ('[#15]=,:[#15]', 0),
|
| 281 |
+
'PubChemFP427': ('[#6](#[#6])(-,:[#6])', 0),
|
| 282 |
+
'PubChemFP428': ('[#6&!H0](#[#6])', 0),
|
| 283 |
+
'PubChemFP429': ('[#6](#[#7])(-,:[#6])', 0),
|
| 284 |
+
'PubChemFP430': ('[#6](-,:[#6])(-,:[#6])(=,:[#6])', 0),
|
| 285 |
+
'PubChemFP431': ('[#6](-,:[#6])(-,:[#6])(=,:[#7])', 0),
|
| 286 |
+
'PubChemFP432': ('[#6](-,:[#6])(-,:[#6])(=,:[#8])', 0),
|
| 287 |
+
'PubChemFP433': ('[#6](-,:[#6])([Cl])(=,:[#8])', 0),
|
| 288 |
+
'PubChemFP434': ('[#6&!H0](-,:[#6])(=,:[#6])', 0),
|
| 289 |
+
'PubChemFP435': ('[#6&!H0](-,:[#6])(=,:[#7])', 0),
|
| 290 |
+
'PubChemFP436': ('[#6&!H0](-,:[#6])(=,:[#8])', 0),
|
| 291 |
+
'PubChemFP437': ('[#6](-,:[#6])(-,:[#7])(=,:[#6])', 0),
|
| 292 |
+
'PubChemFP438': ('[#6](-,:[#6])(-,:[#7])(=,:[#7])', 0),
|
| 293 |
+
'PubChemFP439': ('[#6](-,:[#6])(-,:[#7])(=,:[#8])', 0),
|
| 294 |
+
'PubChemFP440': ('[#6](-,:[#6])(-,:[#8])(=,:[#8])', 0),
|
| 295 |
+
'PubChemFP441': ('[#6](-,:[#6])(=,:[#6])', 0),
|
| 296 |
+
'PubChemFP442': ('[#6](-,:[#6])(=,:[#7])', 0),
|
| 297 |
+
'PubChemFP443': ('[#6](-,:[#6])(=,:[#8])', 0),
|
| 298 |
+
'PubChemFP444': ('[#6]([Cl])(=,:[#8])', 0),
|
| 299 |
+
'PubChemFP445': ('[#6&!H0](-,:[#7])(=,:[#6])', 0),
|
| 300 |
+
'PubChemFP446': ('[#6&!H0](=,:[#6])', 0),
|
| 301 |
+
'PubChemFP447': ('[#6&!H0](=,:[#7])', 0),
|
| 302 |
+
'PubChemFP448': ('[#6&!H0](=,:[#8])', 0),
|
| 303 |
+
'PubChemFP449': ('[#6](-,:[#7])(=,:[#6])', 0),
|
| 304 |
+
'PubChemFP450': ('[#6](-,:[#7])(=,:[#7])', 0),
|
| 305 |
+
'PubChemFP451': ('[#6](-,:[#7])(=,:[#8])', 0),
|
| 306 |
+
'PubChemFP452': ('[#6](-,:[#8])(=,:[#8])', 0),
|
| 307 |
+
'PubChemFP453': ('[#7](-,:[#6])(=,:[#6])', 0),
|
| 308 |
+
'PubChemFP454': ('[#7](-,:[#6])(=,:[#8])', 0),
|
| 309 |
+
'PubChemFP455': ('[#7](-,:[#8])(=,:[#8])', 0),
|
| 310 |
+
'PubChemFP456': ('[#15](-,:[#8])(=,:[#8])', 0),
|
| 311 |
+
'PubChemFP457': ('[#16](-,:[#6])(=,:[#8])', 0),
|
| 312 |
+
'PubChemFP458': ('[#16](-,:[#8])(=,:[#8])', 0),
|
| 313 |
+
'PubChemFP459': ('[#16](=,:[#8])(=,:[#8])', 0),
|
| 314 |
+
'PubChemFP460': ('[#6]-,:[#6]-,:[#6]#[#6]', 0),
|
| 315 |
+
'PubChemFP461': ('[#8]-,:[#6]-,:[#6]=,:[#7]', 0),
|
| 316 |
+
'PubChemFP462': ('[#8]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 317 |
+
'PubChemFP463': ('[#7]:[#6]-,:[#16&!H0]', 0),
|
| 318 |
+
'PubChemFP464': ('[#7]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 319 |
+
'PubChemFP465': ('[#8]=,:[#16]-,:[#6]-,:[#6]', 0),
|
| 320 |
+
'PubChemFP466': ('[#7]#[#6]-,:[#6]=,:[#6]', 0),
|
| 321 |
+
'PubChemFP467': ('[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
| 322 |
+
'PubChemFP468': ('[#8]=,:[#16]-,:[#6]-,:[#7]', 0),
|
| 323 |
+
'PubChemFP469': ('[#16]-,:[#16]-,:[#6]:[#6]', 0),
|
| 324 |
+
'PubChemFP470': ('[#6]:[#6]-,:[#6]=,:[#6]', 0),
|
| 325 |
+
'PubChemFP471': ('[#16]:[#6]:[#6]:[#6]', 0),
|
| 326 |
+
'PubChemFP472': ('[#6]:[#7]:[#6]-,:[#6]', 0),
|
| 327 |
+
'PubChemFP473': ('[#16]-,:[#6]:[#7]:[#6]', 0),
|
| 328 |
+
'PubChemFP474': ('[#16]:[#6]:[#6]:[#7]', 0),
|
| 329 |
+
'PubChemFP475': ('[#16]-,:[#6]=,:[#7]-,:[#6]', 0),
|
| 330 |
+
'PubChemFP476': ('[#6]-,:[#8]-,:[#6]=,:[#6]', 0),
|
| 331 |
+
'PubChemFP477': ('[#7]-,:[#7]-,:[#6]:[#6]', 0),
|
| 332 |
+
'PubChemFP478': ('[#16]-,:[#6]=,:[#7&!H0]', 0),
|
| 333 |
+
'PubChemFP479': ('[#16]-,:[#6]-,:[#16]-,:[#6]', 0),
|
| 334 |
+
'PubChemFP480': ('[#6]:[#16]:[#6]-,:[#6]', 0),
|
| 335 |
+
'PubChemFP481': ('[#8]-,:[#16]-,:[#6]:[#6]', 0),
|
| 336 |
+
'PubChemFP482': ('[#6]:[#7]-,:[#6]:[#6]', 0),
|
| 337 |
+
'PubChemFP483': ('[#7]-,:[#16]-,:[#6]:[#6]', 0),
|
| 338 |
+
'PubChemFP484': ('[#7]-,:[#6]:[#7]:[#6]', 0),
|
| 339 |
+
'PubChemFP485': ('[#7]:[#6]:[#6]:[#7]', 0),
|
| 340 |
+
'PubChemFP486': ('[#7]-,:[#6]:[#7]:[#7]', 0),
|
| 341 |
+
'PubChemFP487': ('[#7]-,:[#6]=,:[#7]-,:[#6]', 0),
|
| 342 |
+
'PubChemFP488': ('[#7]-,:[#6]=,:[#7&!H0]', 0),
|
| 343 |
+
'PubChemFP489': ('[#7]-,:[#6]-,:[#16]-,:[#6]', 0),
|
| 344 |
+
'PubChemFP490': ('[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 345 |
+
'PubChemFP491': ('[#6]-,:[#7]:[#6&!H0]', 0),
|
| 346 |
+
'PubChemFP492': ('[#7]-,:[#6]:[#8]:[#6]', 0),
|
| 347 |
+
'PubChemFP493': ('[#8]=,:[#6]-,:[#6]:[#6]', 0),
|
| 348 |
+
'PubChemFP494': ('[#8]=,:[#6]-,:[#6]:[#7]', 0),
|
| 349 |
+
'PubChemFP495': ('[#6]-,:[#7]-,:[#6]:[#6]', 0),
|
| 350 |
+
'PubChemFP496': ('[#7]:[#7]-,:[#6&!H0]', 0),
|
| 351 |
+
'PubChemFP497': ('[#8]-,:[#6]:[#6]:[#7]', 0),
|
| 352 |
+
'PubChemFP498': ('[#8]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 353 |
+
'PubChemFP499': ('[#7]-,:[#6]:[#6]:[#7]', 0),
|
| 354 |
+
'PubChemFP500': ('[#6]-,:[#16]-,:[#6]:[#6]', 0),
|
| 355 |
+
'PubChemFP501': ('[Cl]-,:[#6]:[#6]-,:[#6]', 0),
|
| 356 |
+
'PubChemFP502': ('[#7]-,:[#6]=,:[#6&!H0]', 0),
|
| 357 |
+
'PubChemFP503': ('[Cl]-,:[#6]:[#6&!H0]', 0),
|
| 358 |
+
'PubChemFP504': ('[#7]:[#6]:[#7]-,:[#6]', 0),
|
| 359 |
+
'PubChemFP505': ('[Cl]-,:[#6]:[#6]-,:[#8]', 0),
|
| 360 |
+
'PubChemFP506': ('[#6]-,:[#6]:[#7]:[#6]', 0),
|
| 361 |
+
'PubChemFP507': ('[#6]-,:[#6]-,:[#16]-,:[#6]', 0),
|
| 362 |
+
'PubChemFP508': ('[#16]=,:[#6]-,:[#7]-,:[#6]', 0),
|
| 363 |
+
'PubChemFP509': ('[Br]-,:[#6]:[#6]-,:[#6]', 0),
|
| 364 |
+
'PubChemFP510': ('[#7&!H0]-,:[#7&!H0]', 0),
|
| 365 |
+
'PubChemFP511': ('[#16]=,:[#6]-,:[#7&!H0]', 0),
|
| 366 |
+
'PubChemFP512': ('[#6]-,:[#33]-[#8&!H0]', 0),
|
| 367 |
+
'PubChemFP513': ('[#16]:[#6]:[#6&!H0]', 0),
|
| 368 |
+
'PubChemFP514': ('[#8]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 369 |
+
'PubChemFP515': ('[#7]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 370 |
+
'PubChemFP516': ('[#6H,#6H2,#6H3]=,:[#6H,#6H2,#6H3]', 0),
|
| 371 |
+
'PubChemFP517': ('[#7]-,:[#7]-,:[#6]-,:[#7]', 0),
|
| 372 |
+
'PubChemFP518': ('[#8]=,:[#6]-,:[#7]-,:[#7]', 0),
|
| 373 |
+
'PubChemFP519': ('[#7]=,:[#6]-,:[#7]-,:[#6]', 0),
|
| 374 |
+
'PubChemFP520': ('[#6]=,:[#6]-,:[#6]:[#6]', 0),
|
| 375 |
+
'PubChemFP521': ('[#6]:[#7]-,:[#6&!H0]', 0),
|
| 376 |
+
'PubChemFP522': ('[#6]-,:[#7]-,:[#7&!H0]', 0),
|
| 377 |
+
'PubChemFP523': ('[#7]:[#6]:[#6]-,:[#6]', 0),
|
| 378 |
+
'PubChemFP524': ('[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 379 |
+
'PubChemFP525': ('[#33]-,:[#6]:[#6&!H0]', 0),
|
| 380 |
+
'PubChemFP526': ('[Cl]-,:[#6]:[#6]-,:[Cl]', 0),
|
| 381 |
+
'PubChemFP527': ('[#6]:[#6]:[#7&!H0]', 0),
|
| 382 |
+
'PubChemFP528': ('[#7&!H0]-,:[#6&!H0]', 0),
|
| 383 |
+
'PubChemFP529': ('[Cl]-,:[#6]-,:[#6]-,:[Cl]', 0),
|
| 384 |
+
'PubChemFP530': ('[#7]:[#6]-,:[#6]:[#6]', 0),
|
| 385 |
+
'PubChemFP531': ('[#16]-,:[#6]:[#6]-,:[#6]', 0),
|
| 386 |
+
'PubChemFP532': ('[#16]-,:[#6]:[#6&!H0]', 0),
|
| 387 |
+
'PubChemFP533': ('[#16]-,:[#6]:[#6]-,:[#7]', 0),
|
| 388 |
+
'PubChemFP534': ('[#16]-,:[#6]:[#6]-,:[#8]', 0),
|
| 389 |
+
'PubChemFP535': ('[#8]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 390 |
+
'PubChemFP536': ('[#8]=,:[#6]-,:[#6]-,:[#7]', 0),
|
| 391 |
+
'PubChemFP537': ('[#8]=,:[#6]-,:[#6]-,:[#8]', 0),
|
| 392 |
+
'PubChemFP538': ('[#7]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 393 |
+
'PubChemFP539': ('[#7]=,:[#6]-,:[#6&!H0]', 0),
|
| 394 |
+
'PubChemFP540': ('[#6]-,:[#7]-,:[#6&!H0]', 0),
|
| 395 |
+
'PubChemFP541': ('[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
| 396 |
+
'PubChemFP542': ('[#8]-,:[#6]:[#6&!H0]', 0),
|
| 397 |
+
'PubChemFP543': ('[#8]-,:[#6]:[#6]-,:[#7]', 0),
|
| 398 |
+
'PubChemFP544': ('[#8]-,:[#6]:[#6]-,:[#8]', 0),
|
| 399 |
+
'PubChemFP545': ('[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
| 400 |
+
'PubChemFP546': ('[#7]-,:[#6]:[#6&!H0]', 0),
|
| 401 |
+
'PubChemFP547': ('[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
| 402 |
+
'PubChemFP548': ('[#8]-,:[#6]-,:[#6]:[#6]', 0),
|
| 403 |
+
'PubChemFP549': ('[#7]-,:[#6]-,:[#6]:[#6]', 0),
|
| 404 |
+
'PubChemFP550': ('[Cl]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 405 |
+
'PubChemFP551': ('[Cl]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 406 |
+
'PubChemFP552': ('[#6]:[#6]-,:[#6]:[#6]', 0),
|
| 407 |
+
'PubChemFP553': ('[#8]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 408 |
+
'PubChemFP554': ('[Br]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 409 |
+
'PubChemFP555': ('[#7]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 410 |
+
'PubChemFP556': ('[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 411 |
+
'PubChemFP557': ('[#7]:[#6]-,:[#8&!H0]', 0),
|
| 412 |
+
'PubChemFP558': ('[#8]=,:[#7]-,:c:c', 0),
|
| 413 |
+
'PubChemFP559': ('[#8]-,:[#6]-,:[#7&!H0]', 0),
|
| 414 |
+
'PubChemFP560': ('[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 415 |
+
'PubChemFP561': ('[Cl]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 416 |
+
'PubChemFP562': ('[Br]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 417 |
+
'PubChemFP563': ('[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 418 |
+
'PubChemFP564': ('[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 419 |
+
'PubChemFP565': ('[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 420 |
+
'PubChemFP566': ('[#8]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 421 |
+
'PubChemFP567': ('[#8]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 422 |
+
'PubChemFP568': ('N#[#6]-,:[#6]-,:[#6]', 0),
|
| 423 |
+
'PubChemFP569': ('[#7]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 424 |
+
'PubChemFP570': ('[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 425 |
+
'PubChemFP571': ('[#6&!H0]-,:[#8&!H0]', 0),
|
| 426 |
+
'PubChemFP572': ('n:c:n:c', 0),
|
| 427 |
+
'PubChemFP573': ('[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 428 |
+
'PubChemFP574': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 429 |
+
'PubChemFP575': ('[#8]-,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
| 430 |
+
'PubChemFP576': ('[#7]=,:[#6]-,:[#6]:[#6&!H0]', 0),
|
| 431 |
+
'PubChemFP577': ('c:c-,:[#7]-,:c:c', 0),
|
| 432 |
+
'PubChemFP578': ('[#6]-,:[#6]:[#6]-,:c:c', 0),
|
| 433 |
+
'PubChemFP579': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 434 |
+
'PubChemFP580': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 435 |
+
'PubChemFP581': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 436 |
+
'PubChemFP582': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 437 |
+
'PubChemFP583': ('[Cl]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 438 |
+
'PubChemFP584': ('c:c-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 439 |
+
'PubChemFP585': ('[#6]-,:[#6]:[#6]-,:[#7]-,:[#6]', 0),
|
| 440 |
+
'PubChemFP586': ('[#6]-,:[#16]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 441 |
+
'PubChemFP587': ('[#7]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
| 442 |
+
'PubChemFP588': ('[#8]=,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 443 |
+
'PubChemFP589': ('[#6]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 444 |
+
'PubChemFP590': ('[#6]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
| 445 |
+
'PubChemFP591': ('[Cl]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 446 |
+
'PubChemFP592': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 447 |
+
'PubChemFP593': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 448 |
+
'PubChemFP594': ('[#6]-,:[#8]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 449 |
+
'PubChemFP595': ('c:c-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 450 |
+
'PubChemFP596': ('[#7]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 451 |
+
'PubChemFP597': ('[#8]=,:[#6]-,:[#6]-,:c:c', 0),
|
| 452 |
+
'PubChemFP598': ('[Cl]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
| 453 |
+
'PubChemFP599': ('[#6H,#6H2,#6H3]-,:[#6]=,:[#6H,#6H2,#6H3]', 0),
|
| 454 |
+
'PubChemFP600': ('[#7]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
| 455 |
+
'PubChemFP601': ('[#7]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
| 456 |
+
'PubChemFP602': ('[#8]=,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 457 |
+
'PubChemFP603': ('[#6]-,:c:c:[#6]-,:[#6]', 0),
|
| 458 |
+
'PubChemFP604': ('[#6]-,:[#8]-,:[#6]-,:[#6]:c', 0),
|
| 459 |
+
'PubChemFP605': ('[#8]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 460 |
+
'PubChemFP606': ('[#8]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 461 |
+
'PubChemFP607': ('[#7]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 462 |
+
'PubChemFP608': ('[#6]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 463 |
+
'PubChemFP609': ('[Cl]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 464 |
+
'PubChemFP610': ('[#6]-,:[#8]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 465 |
+
'PubChemFP611': ('[#7]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 466 |
+
'PubChemFP612': ('[#7]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 467 |
+
'PubChemFP613': ('[#6]-,:[#7]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 468 |
+
'PubChemFP614': ('[#6]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 469 |
+
'PubChemFP615': ('[#7]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 470 |
+
'PubChemFP616': ('c:c:n:n:c', 0),
|
| 471 |
+
'PubChemFP617': ('[#6]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
| 472 |
+
'PubChemFP618': ('c:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 473 |
+
'PubChemFP619': ('[#8]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 474 |
+
'PubChemFP620': ('c:c-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 475 |
+
'PubChemFP621': ('[#7]-,:[#6]:c:c:n', 0),
|
| 476 |
+
'PubChemFP622': ('[#8]=,:[#6]-,:[#8]-,:[#6]:c', 0),
|
| 477 |
+
'PubChemFP623': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 478 |
+
'PubChemFP624': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#7]', 0),
|
| 479 |
+
'PubChemFP625': ('[#8]=,:[#6]-,:[#6]:[#6]-,:[#8]', 0),
|
| 480 |
+
'PubChemFP626': ('[#6]-,:[#8]-,:[#6]:[#6]-,:[#6]', 0),
|
| 481 |
+
'PubChemFP627': ('[#8]=,:[#33]-,:[#6]:c:c', 0),
|
| 482 |
+
'PubChemFP628': ('[#6]-,:[#7]-,:[#6]-,:[#6]:c', 0),
|
| 483 |
+
'PubChemFP629': ('[#16]-,:[#6]:c:c-,:[#7]', 0),
|
| 484 |
+
'PubChemFP630': ('[#8]-,:[#6]:[#6]-,:[#8]-,:[#6]', 0),
|
| 485 |
+
'PubChemFP631': ('[#8]-,:[#6]:[#6]-,:[#8&!H0]', 0),
|
| 486 |
+
'PubChemFP632': ('[#6]-,:[#6]-,:[#8]-,:[#6]:c', 0),
|
| 487 |
+
'PubChemFP633': ('[#7]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 488 |
+
'PubChemFP634': ('[#6]-,:[#6]-,:[#6]:[#6]-,:[#6]', 0),
|
| 489 |
+
'PubChemFP635': ('[#7]-,:[#7]-,:[#6]-,:[#7&!H0]', 0),
|
| 490 |
+
'PubChemFP636': ('[#6]-,:[#7]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 491 |
+
'PubChemFP637': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 492 |
+
'PubChemFP638': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 493 |
+
'PubChemFP639': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 494 |
+
'PubChemFP640': ('[#6]=,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 495 |
+
'PubChemFP641': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#6]', 0),
|
| 496 |
+
'PubChemFP642': ('[#8]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 497 |
+
'PubChemFP643': ('[#6&!H0]-,:[#6]-,:[#7&!H0]', 0),
|
| 498 |
+
'PubChemFP644': ('[#6]-,:[#6]=,:[#7]-,:[#7]-,:[#6]', 0),
|
| 499 |
+
'PubChemFP645': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 500 |
+
'PubChemFP646': ('[#8]=,:[#6]-,:[#7]-,:[#6&!H0]', 0),
|
| 501 |
+
'PubChemFP647': ('[#8]=,:[#6]-,:[#7]-,:[#6]-,:[#7]', 0),
|
| 502 |
+
'PubChemFP648': ('[#8]=,:[#7]-,:[#6]:[#6]-,:[#7]', 0),
|
| 503 |
+
'PubChemFP649': ('[#8]=,:[#7]-,:c:c-,:[#8]', 0),
|
| 504 |
+
'PubChemFP650': ('[#8]=,:[#6]-,:[#7]-,:[#6]=,:[#8]', 0),
|
| 505 |
+
'PubChemFP651': ('[#8]-,:[#6]:[#6]:[#6]-,:[#6]', 0),
|
| 506 |
+
'PubChemFP652': ('[#8]-,:[#6]:[#6]:[#6]-,:[#7]', 0),
|
| 507 |
+
'PubChemFP653': ('[#8]-,:[#6]:[#6]:[#6]-,:[#8]', 0),
|
| 508 |
+
'PubChemFP654': ('[#7]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 509 |
+
'PubChemFP655': ('[#8]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 510 |
+
'PubChemFP656': ('[#6]-,:[#6]-,:[#7]-,:[#6]-,:[#6]', 0),
|
| 511 |
+
'PubChemFP657': ('[#6]-,:[#7]-,:[#6]:[#6]-,:[#6]', 0),
|
| 512 |
+
'PubChemFP658': ('[#6]-,:[#6]-,:[#16]-,:[#6]-,:[#6]', 0),
|
| 513 |
+
'PubChemFP659': ('[#8]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 514 |
+
'PubChemFP660': ('[#6]-,:[#6]=,:[#6]-,:[#6]-,:[#6]', 0),
|
| 515 |
+
'PubChemFP661': ('[#8]-,:[#6]-,:[#8]-,:[#6]-,:[#6]', 0),
|
| 516 |
+
'PubChemFP662': ('[#8]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 517 |
+
'PubChemFP663': ('[#8]-,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
| 518 |
+
'PubChemFP664': ('[#6]-,:[#6]=,:[#6]-,:[#6]=,:[#6]', 0),
|
| 519 |
+
'PubChemFP665': ('[#7]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 520 |
+
'PubChemFP666': ('[#6]=,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 521 |
+
'PubChemFP667': ('[#6]=,:[#6]-,:[#6]-,:[#8&!H0]', 0),
|
| 522 |
+
'PubChemFP668': ('[#6]-,:[#6]:[#6]-,:[#6]-,:[#6]', 0),
|
| 523 |
+
'PubChemFP669': ('[Cl]-,:[#6]:[#6]-,:[#6]=,:[#8]', 0),
|
| 524 |
+
'PubChemFP670': ('[Br]-,:[#6]:c:c-,:[#6]', 0),
|
| 525 |
+
'PubChemFP671': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 526 |
+
'PubChemFP672': ('[#8]=,:[#6]-,:[#6]=,:[#6&!H0]', 0),
|
| 527 |
+
'PubChemFP673': ('[#8]=,:[#6]-,:[#6]=,:[#6]-,:[#7]', 0),
|
| 528 |
+
'PubChemFP674': ('[#7]-,:[#6]-,:[#7]-,:[#6]:c', 0),
|
| 529 |
+
'PubChemFP675': ('[Br]-,:[#6]-,:[#6]-,:[#6]:c', 0),
|
| 530 |
+
'PubChemFP676': ('[#7]#[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 531 |
+
'PubChemFP677': ('[#6]-,:[#6]=,:[#6]-,:[#6]:c', 0),
|
| 532 |
+
'PubChemFP678': ('[#6]-,:[#6]-,:[#6]=,:[#6]-,:[#6]', 0),
|
| 533 |
+
'PubChemFP679': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 534 |
+
'PubChemFP680': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 535 |
+
'PubChemFP681': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 536 |
+
'PubChemFP682': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 537 |
+
'PubChemFP683': ('[#7]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 538 |
+
'PubChemFP684': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 539 |
+
'PubChemFP685': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 540 |
+
'PubChemFP686': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 541 |
+
'PubChemFP687': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 542 |
+
'PubChemFP688': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 543 |
+
'PubChemFP689': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 544 |
+
'PubChemFP690': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 545 |
+
'PubChemFP691': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 546 |
+
'PubChemFP692': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 547 |
+
'PubChemFP693': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]', 0),
|
| 548 |
+
'PubChemFP694': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]=,:[#8]', 0),
|
| 549 |
+
'PubChemFP695': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]', 0),
|
| 550 |
+
'PubChemFP696': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 551 |
+
'PubChemFP697': ('[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
| 552 |
+
'PubChemFP698': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 553 |
+
'PubChemFP699': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]', 0),
|
| 554 |
+
'PubChemFP700': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#8]-,:[#6]', 0),
|
| 555 |
+
'PubChemFP701': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
| 556 |
+
'PubChemFP702': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#7]-,:[#6]', 0),
|
| 557 |
+
'PubChemFP703': ('[#8]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
| 558 |
+
'PubChemFP704': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 559 |
+
'PubChemFP705': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#8])-,:[#6]', 0),
|
| 560 |
+
'PubChemFP706': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](=,:[#8])-,:[#6]', 0),
|
| 561 |
+
'PubChemFP707': ('[#8]=,:[#6]-,:[#6]-,:[#6]-,:[#6]-,:[#6](-,:[#7])-,:[#6]', 0),
|
| 562 |
+
'PubChemFP708': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
| 563 |
+
'PubChemFP709': ('[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]-,:[#6]', 0),
|
| 564 |
+
'PubChemFP710': ('[#6]-,:[#6]-,:[#6](-,:[#6])-,:[#6]-,:[#6]', 0),
|
| 565 |
+
'PubChemFP711': ('[#6]-,:[#6](-,:[#6])(-,:[#6])-,:[#6]-,:[#6]', 0),
|
| 566 |
+
'PubChemFP712': ('[#6]-,:[#6](-,:[#6])-,:[#6](-,:[#6])-,:[#6]', 0),
|
| 567 |
+
'PubChemFP713': ('[#6]c1ccc([#6])cc1', 0),
|
| 568 |
+
'PubChemFP714': ('[#6]c1ccc([#8])cc1', 0),
|
| 569 |
+
'PubChemFP715': ('[#6]c1ccc([#16])cc1', 0),
|
| 570 |
+
'PubChemFP716': ('[#6]c1ccc([#7])cc1', 0),
|
| 571 |
+
'PubChemFP717': ('[#6]c1ccc(Cl)cc1', 0),
|
| 572 |
+
'PubChemFP718': ('[#6]c1ccc(Br)cc1', 0),
|
| 573 |
+
'PubChemFP719': ('[#8]c1ccc([#8])cc1', 0),
|
| 574 |
+
'PubChemFP720': ('[#8]c1ccc([#16])cc1', 0),
|
| 575 |
+
'PubChemFP721': ('[#8]c1ccc([#7])cc1', 0),
|
| 576 |
+
'PubChemFP722': ('[#8]c1ccc(Cl)cc1', 0),
|
| 577 |
+
'PubChemFP723': ('[#8]c1ccc(Br)cc1', 0),
|
| 578 |
+
'PubChemFP724': ('[#16]c1ccc([#16])cc1', 0),
|
| 579 |
+
'PubChemFP725': ('[#16]c1ccc([#7])cc1', 0),
|
| 580 |
+
'PubChemFP726': ('[#16]c1ccc(Cl)cc1', 0),
|
| 581 |
+
'PubChemFP727': ('[#16]c1ccc(Br)cc1', 0),
|
| 582 |
+
'PubChemFP728': ('[#7]c1ccc([#7])cc1', 0),
|
| 583 |
+
'PubChemFP729': ('[#7]c1ccc(Cl)cc1', 0),
|
| 584 |
+
'PubChemFP730': ('[#7]c1ccc(Br)cc1', 0),
|
| 585 |
+
'PubChemFP731': ('Clc1ccc(Cl)cc1', 0),
|
| 586 |
+
'PubChemFP732': ('Clc1ccc(Br)cc1', 0),
|
| 587 |
+
'PubChemFP733': ('Brc1ccc(Br)cc1', 0),
|
| 588 |
+
'PubChemFP734': ('[#6]c1cc([#6])ccc1', 0),
|
| 589 |
+
'PubChemFP735': ('[#6]c1cc([#8])ccc1', 0),
|
| 590 |
+
'PubChemFP736': ('[#6]c1cc([#16])ccc1', 0),
|
| 591 |
+
'PubChemFP737': ('[#6]c1cc([#7])ccc1', 0),
|
| 592 |
+
'PubChemFP738': ('[#6]c1cc(Cl)ccc1', 0),
|
| 593 |
+
'PubChemFP739': ('[#6]c1cc(Br)ccc1', 0),
|
| 594 |
+
'PubChemFP740': ('[#8]c1cc([#8])ccc1', 0),
|
| 595 |
+
'PubChemFP741': ('[#8]c1cc([#16])ccc1', 0),
|
| 596 |
+
'PubChemFP742': ('[#8]c1cc([#7])ccc1', 0),
|
| 597 |
+
'PubChemFP743': ('[#8]c1cc(Cl)ccc1', 0),
|
| 598 |
+
'PubChemFP744': ('[#8]c1cc(Br)ccc1', 0),
|
| 599 |
+
'PubChemFP745': ('[#16]c1cc([#16])ccc1', 0),
|
| 600 |
+
'PubChemFP746': ('[#16]c1cc([#7])ccc1', 0),
|
| 601 |
+
'PubChemFP747': ('[#16]c1cc(Cl)ccc1', 0),
|
| 602 |
+
'PubChemFP748': ('[#16]c1cc(Br)ccc1', 0),
|
| 603 |
+
'PubChemFP749': ('[#7]c1cc([#7])ccc1', 0),
|
| 604 |
+
'PubChemFP750': ('[#7]c1cc(Cl)ccc1', 0),
|
| 605 |
+
'PubChemFP751': ('[#7]c1cc(Br)ccc1', 0),
|
| 606 |
+
'PubChemFP752': ('Clc1cc(Cl)ccc1', 0),
|
| 607 |
+
'PubChemFP753': ('Clc1cc(Br)ccc1', 0),
|
| 608 |
+
'PubChemFP754': ('Brc1cc(Br)ccc1', 0),
|
| 609 |
+
'PubChemFP755': ('[#6]c1c([#6])cccc1', 0),
|
| 610 |
+
'PubChemFP756': ('[#6]c1c([#8])cccc1', 0),
|
| 611 |
+
'PubChemFP757': ('[#6]c1c([#16])cccc1', 0),
|
| 612 |
+
'PubChemFP758': ('[#6]c1c([#7])cccc1', 0),
|
| 613 |
+
'PubChemFP759': ('[#6]c1c(Cl)cccc1', 0),
|
| 614 |
+
'PubChemFP760': ('[#6]c1c(Br)cccc1', 0),
|
| 615 |
+
'PubChemFP761': ('[#8]c1c([#8])cccc1', 0),
|
| 616 |
+
'PubChemFP762': ('[#8]c1c([#16])cccc1', 0),
|
| 617 |
+
'PubChemFP763': ('[#8]c1c([#7])cccc1', 0),
|
| 618 |
+
'PubChemFP764': ('[#8]c1c(Cl)cccc1', 0),
|
| 619 |
+
'PubChemFP765': ('[#8]c1c(Br)cccc1', 0),
|
| 620 |
+
'PubChemFP766': ('[#16]c1c([#16])cccc1', 0),
|
| 621 |
+
'PubChemFP767': ('[#16]c1c([#7])cccc1', 0),
|
| 622 |
+
'PubChemFP768': ('[#16]c1c(Cl)cccc1', 0),
|
| 623 |
+
'PubChemFP769': ('[#16]c1c(Br)cccc1', 0),
|
| 624 |
+
'PubChemFP770': ('[#7]c1c([#7])cccc1', 0),
|
| 625 |
+
'PubChemFP771': ('[#7]c1c(Cl)cccc1', 0),
|
| 626 |
+
'PubChemFP772': ('[#7]c1c(Br)cccc1', 0),
|
| 627 |
+
'PubChemFP773': ('Clc1c(Cl)cccc1', 0),
|
| 628 |
+
'PubChemFP774': ('Clc1c(Br)cccc1', 0),
|
| 629 |
+
'PubChemFP775': ('Brc1c(Br)cccc1', 0),
|
| 630 |
+
'PubChemFP776': ('[#6][#6]1[#6][#6][#6]([#6])[#6][#6]1', 0),
|
| 631 |
+
'PubChemFP777': ('[#6][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
| 632 |
+
'PubChemFP778': ('[#6][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
| 633 |
+
'PubChemFP779': ('[#6][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 634 |
+
'PubChemFP780': ('[#6][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 635 |
+
'PubChemFP781': ('[#6][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 636 |
+
'PubChemFP782': ('[#8][#6]1[#6][#6][#6]([#8])[#6][#6]1', 0),
|
| 637 |
+
'PubChemFP783': ('[#8][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
| 638 |
+
'PubChemFP784': ('[#8][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 639 |
+
'PubChemFP785': ('[#8][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 640 |
+
'PubChemFP786': ('[#8][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 641 |
+
'PubChemFP787': ('[#16][#6]1[#6][#6][#6]([#16])[#6][#6]1', 0),
|
| 642 |
+
'PubChemFP788': ('[#16][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 643 |
+
'PubChemFP789': ('[#16][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 644 |
+
'PubChemFP790': ('[#16][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 645 |
+
'PubChemFP791': ('[#7][#6]1[#6][#6][#6]([#7])[#6][#6]1', 0),
|
| 646 |
+
'PubChemFP792': ('[#7][#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 647 |
+
'PubChemFP793': ('[#7][#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 648 |
+
'PubChemFP794': ('Cl[#6]1[#6][#6][#6](Cl)[#6][#6]1', 0),
|
| 649 |
+
'PubChemFP795': ('Cl[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 650 |
+
'PubChemFP796': ('Br[#6]1[#6][#6][#6](Br)[#6][#6]1', 0),
|
| 651 |
+
'PubChemFP797': ('[#6][#6]1[#6][#6]([#6])[#6][#6][#6]1', 0),
|
| 652 |
+
'PubChemFP798': ('[#6][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
| 653 |
+
'PubChemFP799': ('[#6][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
| 654 |
+
'PubChemFP800': ('[#6][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 655 |
+
'PubChemFP801': ('[#6][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 656 |
+
'PubChemFP802': ('[#6][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 657 |
+
'PubChemFP803': ('[#8][#6]1[#6][#6]([#8])[#6][#6][#6]1', 0),
|
| 658 |
+
'PubChemFP804': ('[#8][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
| 659 |
+
'PubChemFP805': ('[#8][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 660 |
+
'PubChemFP806': ('[#8][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 661 |
+
'PubChemFP807': ('[#8][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 662 |
+
'PubChemFP808': ('[#16][#6]1[#6][#6]([#16])[#6][#6][#6]1', 0),
|
| 663 |
+
'PubChemFP809': ('[#16][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 664 |
+
'PubChemFP810': ('[#16][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 665 |
+
'PubChemFP811': ('[#16][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 666 |
+
'PubChemFP812': ('[#7][#6]1[#6][#6]([#7])[#6][#6][#6]1', 0),
|
| 667 |
+
'PubChemFP813': ('[#7][#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 668 |
+
'PubChemFP814': ('[#7][#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 669 |
+
'PubChemFP815': ('Cl[#6]1[#6][#6](Cl)[#6][#6][#6]1', 0),
|
| 670 |
+
'PubChemFP816': ('Cl[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 671 |
+
'PubChemFP817': ('Br[#6]1[#6][#6](Br)[#6][#6][#6]1', 0),
|
| 672 |
+
'PubChemFP818': ('[#6][#6]1[#6]([#6])[#6][#6][#6][#6]1', 0),
|
| 673 |
+
'PubChemFP819': ('[#6][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
| 674 |
+
'PubChemFP820': ('[#6][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
| 675 |
+
'PubChemFP821': ('[#6][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 676 |
+
'PubChemFP822': ('[#6][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 677 |
+
'PubChemFP823': ('[#6][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 678 |
+
'PubChemFP824': ('[#8][#6]1[#6]([#8])[#6][#6][#6][#6]1', 0),
|
| 679 |
+
'PubChemFP825': ('[#8][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
| 680 |
+
'PubChemFP826': ('[#8][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 681 |
+
'PubChemFP827': ('[#8][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 682 |
+
'PubChemFP828': ('[#8][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 683 |
+
'PubChemFP829': ('[#16][#6]1[#6]([#16])[#6][#6][#6][#6]1', 0),
|
| 684 |
+
'PubChemFP830': ('[#16][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 685 |
+
'PubChemFP831': ('[#16][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 686 |
+
'PubChemFP832': ('[#16][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 687 |
+
'PubChemFP833': ('[#7][#6]1[#6]([#7])[#6][#6][#6][#6]1', 0),
|
| 688 |
+
'PubChemFP834': ('[#7][#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 689 |
+
'PubChemFP835': ('[#7][#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 690 |
+
'PubChemFP836': ('Cl[#6]1[#6](Cl)[#6][#6][#6][#6]1', 0),
|
| 691 |
+
'PubChemFP837': ('Cl[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 692 |
+
'PubChemFP838': ('Br[#6]1[#6](Br)[#6][#6][#6][#6]1', 0),
|
| 693 |
+
'PubChemFP839': ('[#6][#6]1[#6][#6]([#6])[#6][#6]1', 0),
|
| 694 |
+
'PubChemFP840': ('[#6][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
| 695 |
+
'PubChemFP841': ('[#6][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
| 696 |
+
'PubChemFP842': ('[#6][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 697 |
+
'PubChemFP843': ('[#6][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 698 |
+
'PubChemFP844': ('[#6][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 699 |
+
'PubChemFP845': ('[#8][#6]1[#6][#6]([#8])[#6][#6]1', 0),
|
| 700 |
+
'PubChemFP846': ('[#8][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
| 701 |
+
'PubChemFP847': ('[#8][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 702 |
+
'PubChemFP848': ('[#8][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 703 |
+
'PubChemFP849': ('[#8][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 704 |
+
'PubChemFP850': ('[#16][#6]1[#6][#6]([#16])[#6][#6]1', 0),
|
| 705 |
+
'PubChemFP851': ('[#16][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 706 |
+
'PubChemFP852': ('[#16][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 707 |
+
'PubChemFP853': ('[#16][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 708 |
+
'PubChemFP854': ('[#7][#6]1[#6][#6]([#7])[#6][#6]1', 0),
|
| 709 |
+
'PubChemFP855': ('[#7][#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 710 |
+
'PubChemFP856': ('[#7][#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 711 |
+
'PubChemFP857': ('Cl[#6]1[#6][#6](Cl)[#6][#6]1', 0),
|
| 712 |
+
'PubChemFP858': ('Cl[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 713 |
+
'PubChemFP859': ('Br[#6]1[#6][#6](Br)[#6][#6]1', 0),
|
| 714 |
+
'PubChemFP860': ('[#6][#6]1[#6]([#6])[#6][#6][#6]1', 0),
|
| 715 |
+
'PubChemFP861': ('[#6][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
| 716 |
+
'PubChemFP862': ('[#6][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
| 717 |
+
'PubChemFP863': ('[#6][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 718 |
+
'PubChemFP864': ('[#6][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 719 |
+
'PubChemFP865': ('[#6][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 720 |
+
'PubChemFP866': ('[#8][#6]1[#6]([#8])[#6][#6][#6]1', 0),
|
| 721 |
+
'PubChemFP867': ('[#8][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
| 722 |
+
'PubChemFP868': ('[#8][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 723 |
+
'PubChemFP869': ('[#8][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 724 |
+
'PubChemFP870': ('[#8][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 725 |
+
'PubChemFP871': ('[#16][#6]1[#6]([#16])[#6][#6][#6]1', 0),
|
| 726 |
+
'PubChemFP872': ('[#16][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 727 |
+
'PubChemFP873': ('[#16][#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 728 |
+
'PubChemFP874': ('[#16][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 729 |
+
'PubChemFP875': ('[#7][#6]1[#6]([#7])[#6][#6][#6]1', 0),
|
| 730 |
+
'PubChemFP876': ('[#7][#6]1[#6](Cl)[#6][#6]1', 0),
|
| 731 |
+
'PubChemFP877': ('[#7][#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 732 |
+
'PubChemFP878': ('Cl[#6]1[#6](Cl)[#6][#6][#6]1', 0),
|
| 733 |
+
'PubChemFP879': ('Cl[#6]1[#6](Br)[#6][#6][#6]1', 0),
|
| 734 |
+
'PubChemFP880': ('Br[#6]1[#6](Br)[#6][#6][#6]1', 0)}
|
deepscreen/data/featurizers/fingerprint/torsions.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rdkit.Chem.AtomPairs import Torsions
|
| 2 |
+
from rdkit.Chem import DataStructs
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
_type = 'topological-based'
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def GetTorsionFPs(mol, nBits=2048, binary=True):
    '''
    Hashed topological-torsion fingerprint of a molecule as a NumPy array.

    Parameters
    ----------
    mol
        RDKit molecule to fingerprint.
    nBits : int
        Fingerprint length, forwarded to
        ``Torsions.GetHashedTopologicalTorsionFingerprint``.
    binary : bool
        If True the returned array has dtype ``np.bool_``; otherwise
        ``np.int8``.

    Returns
    -------
    numpy.ndarray
        The array filled in by ``DataStructs.ConvertToNumpyArray``.
    '''
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits=nBits)
    # The destination starts empty; ConvertToNumpyArray fills it from `fp`,
    # so only the dtype has to be chosen here.
    arr = np.zeros((0,), dtype=np.bool_ if binary else np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
|
deepscreen/data/featurizers/graph.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import networkx as nx
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
from rdkit import Chem
|
| 5 |
+
from torch_geometric.utils import from_smiles
|
| 6 |
+
from torch_geometric.data import Data
|
| 7 |
+
|
| 8 |
+
from deepscreen.data.featurizers.categorical import one_of_k_encoding_unk, one_of_k_encoding
|
| 9 |
+
from deepscreen.utils import get_logger
|
| 10 |
+
|
| 11 |
+
log = get_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def atom_features(atom, explicit_H=False, use_chirality=True):
    """
    Build the feature vector of a single RDKit atom.

    Adapted from TransformerCPI 2.0.

    The vector is the concatenation, in order, of:

    * element symbol encoding over 10 classes (last class is 'other'),
    * degree encoding over 0..6 (7 entries),
    * formal charge and number of radical electrons (2 entries),
    * hybridization encoding over 6 classes (last class is 'other'),
    * aromaticity flag (1 entry)                      -> 26 entries,
    * total hydrogen count encoding over 0..4 (5 entries), skipped when
      ``explicit_H`` is True                          -> 31 entries,
    * CIP code encoding over ['R', 'S'] (2 entries) plus the
      ``_ChiralityPossible`` flag (1 entry), skipped when ``use_chirality``
      is False                                        -> 34 entries.

    Parameters
    ----------
    atom
        RDKit atom.
    explicit_H : bool
        Set for molecules with explicit hydrogens (QM8, QM9) to avoid
        calling ``GetTotalNumHs``.
    use_chirality : bool
        Whether to append the three chirality entries.

    Returns
    -------
    numpy.ndarray
        1-D array of the concatenated features.
    """
    symbol = ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I', 'other']  # 10-dim
    degree = [0, 1, 2, 3, 4, 5, 6]  # 7-dim
    hybridization_type = [Chem.rdchem.HybridizationType.SP,
                          Chem.rdchem.HybridizationType.SP2,
                          Chem.rdchem.HybridizationType.SP3,
                          Chem.rdchem.HybridizationType.SP3D,
                          Chem.rdchem.HybridizationType.SP3D2,
                          'other']  # 6-dim

    # 10+7+2+6+1=26
    results = one_of_k_encoding_unk(atom.GetSymbol(), symbol) + \
        one_of_k_encoding(atom.GetDegree(), degree) + \
        [atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] + \
        one_of_k_encoding_unk(atom.GetHybridization(), hybridization_type) + [atom.GetIsAromatic()]

    # In case of explicit hydrogen(QM8, QM9), avoid calling `GetTotalNumHs`
    # 26+5=31
    if not explicit_H:
        results = results + one_of_k_encoding_unk(atom.GetTotalNumHs(),
                                                  [0, 1, 2, 3, 4])
    # 31+3=34
    if use_chirality:
        # Atoms without an assigned CIP code get [False, False]. Testing for
        # the property explicitly (instead of a bare `except:`) keeps any
        # unrelated error visible.
        if atom.HasProp('_CIPCode'):
            cip_code = one_of_k_encoding_unk(atom.GetProp('_CIPCode'), ['R', 'S'])
        else:
            cip_code = [False, False]
        results = results + cip_code + [atom.HasProp('_ChiralityPossible')]

    return np.array(results)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def bond_features(bond):
    """
    Encode an RDKit bond as a 6-entry array of flags.

    Entries, in order: bond type is SINGLE, is DOUBLE, is TRIPLE,
    is AROMATIC, ``bond.GetIsConjugated()``, ``bond.IsInRing()``.
    """
    bond_type = bond.GetBondType()
    candidate_types = (
        Chem.rdchem.BondType.SINGLE,
        Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE,
        Chem.rdchem.BondType.AROMATIC,
    )
    flags = [bond_type == candidate for candidate in candidate_types]
    flags.append(bond.GetIsConjugated())
    flags.append(bond.IsInRing())
    return np.array(flags)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def smiles_to_graph_pyg(smiles):
    """
    Featurize a SMILES string with PyTorch Geometric's own ``from_smiles``.

    Returns whatever ``from_smiles`` produces, or ``None`` (after logging a
    warning) if the conversion raises any exception.
    """
    try:
        graph = from_smiles(smiles)
    except Exception as e:
        log.warning(f"Failed to featurize the following SMILES to graph: {smiles} due to {str(e)}")
        return None
    return graph
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def smiles_to_graph(smiles, atom_features: callable = atom_features):
    """
    Convert a SMILES string to a PyTorch Geometric ``Data`` graph using a
    custom per-atom featurizer.

    Parameters
    ----------
    smiles : str
        SMILES string, parsed with ``Chem.MolFromSmiles``.
    atom_features : callable
        Called once per atom; defaults to this module's ``atom_features``.

    Returns
    -------
    Data or None
        ``Data(x=..., edge_index=...)``, or ``None`` (after logging a
        warning) if anything in the conversion raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)

        # Every atom's feature vector is divided by the sum of its entries.
        node_rows = []
        for atom in mol.GetAtoms():
            raw = atom_features(atom)
            node_rows.append(raw / sum(raw))
        node_matrix = np.array(node_rows)

        bond_pairs = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
        # Going through an undirected graph and back yields both directions
        # of every bond.
        directed = nx.Graph(bond_pairs).to_directed()

        if len(bond_pairs) != 0:
            edge_index = [[src, dst] for src, dst in directed.edges]
        else:
            # Molecules without bonds get a single (0, 0) edge.
            edge_index = [[0, 0]]

        return Data(x=torch.Tensor(node_matrix),
                    edge_index=torch.LongTensor(edge_index).transpose(0, 1))

    except Exception as e:
        log.warning(f"Failed to convert SMILES ({smiles}) to graph due to {str(e)}")
        return None
|
| 99 |
+
# features = []
|
| 100 |
+
# for atom in mol.GetAtoms():
|
| 101 |
+
# feature = atom_features(atom)
|
| 102 |
+
# features.append(feature / sum(feature))
|
| 103 |
+
#
|
| 104 |
+
# edge_indices = []
|
| 105 |
+
# for bond in mol.GetBonds():
|
| 106 |
+
# i = bond.GetBeginAtomIdx()
|
| 107 |
+
# j = bond.GetEndAtomIdx()
|
| 108 |
+
# edge_indices += [[i, j], [j, i]]
|
| 109 |
+
#
|
| 110 |
+
# edge_index = torch.tensor(edge_indices)
|
| 111 |
+
# edge_index = edge_index.t().to(torch.long).view(2, -1)
|
| 112 |
+
#
|
| 113 |
+
# if edge_index.numel() > 0: # Sort indices.
|
| 114 |
+
# perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
|
| 115 |
+
# edge_index = edge_index[:, perm]
|
| 116 |
+
#
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def smiles_to_mol_features(smiles, num_atom_feat: callable):
    """
    Featurize a SMILES string into an atom-feature matrix and an adjacency
    matrix.

    NOTE(review): the ``num_atom_feat`` argument is never read. The feature
    width is measured from ``atom_features`` applied to the first atom
    instead. Confirm with callers whether the argument was meant to be used.

    Returns
    -------
    tuple or None
        ``(atom_feat, adj_mat)`` where ``atom_feat`` has one row per atom and
        ``adj_mat`` is ``Chem.GetAdjacencyMatrix(mol)`` as an array; ``None``
        (after logging a warning) if anything raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        feature_width = len(atom_features(mol.GetAtoms()[0]))
        atom_feat = np.zeros((mol.GetNumAtoms(), feature_width))
        for atom in mol.GetAtoms():
            atom_feat[atom.GetIdx(), :] = atom_features(atom)
        adj_mat = np.array(Chem.GetAdjacencyMatrix(mol))

        return atom_feat, adj_mat

    except Exception as e:
        log.warning(f"Failed to featurize the following SMILES to molecular features: {smiles} due to {str(e)}")
        return None
|
deepscreen/data/featurizers/monn.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from rdkit.Chem import MolFromSmiles
|
| 3 |
+
|
| 4 |
+
from deepscreen.data.featurizers.categorical import FASTA_VOCAB, fasta_to_label
|
| 5 |
+
from deepscreen.data.featurizers.graph import atom_features, bond_features
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_mask(arr):
    """
    Return a mask of shape ``(1, len(arr))`` with a 1 for every element.

    The previous version called ``np.zeros(1, len(arr))`` (which passes the
    length as the dtype) and wrote to row 1 of a one-row array, so it could
    not run.

    Parameters
    ----------
    arr
        Any sized sequence or array; only its length is used.

    Returns
    -------
    numpy.ndarray
        Float array of ones with shape ``(1, len(arr))``.
    """
    return np.ones((1, len(arr)))
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def add_index(input_array, ebd_size):
    """
    Flatten a batched index array and shift each batch item by its offset.

    Every entry belonging to batch item ``b`` is increased by
    ``b * ebd_size``.

    The previous version multiplied a ``range`` by an integer, which only
    worked on Python 2 lists and raises ``TypeError`` on Python 3.

    Parameters
    ----------
    input_array
        3-D array of shape ``(batch_size, n_vertex, n_nbs)``.
    ebd_size : int
        Offset added per batch item.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``batch_size * n_vertex * n_nbs``.
    """
    batch_size, n_vertex, n_nbs = np.shape(input_array)
    # One offset per batch item, repeated for each of its entries.
    offsets = np.repeat(np.arange(batch_size) * ebd_size, n_vertex * n_nbs)
    return np.asarray(input_array).reshape(-1) + offsets
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# TODO fix padding and masking
|
| 24 |
+
def _feature_matrix(n_rows, items, featurize):
    """Place ``featurize(item)`` in row ``item.GetIdx()`` of a float matrix.

    If ``items`` is empty the 1-D ``(n_rows,)`` zero placeholder is returned.
    """
    matrix = np.zeros((n_rows,))
    for item in items:
        row = np.asarray(featurize(item), dtype=float).reshape(-1)
        if matrix.ndim == 1:
            # Width is only known once the first feature vector is seen.
            matrix = np.zeros((n_rows, row.size))
        matrix[item.GetIdx()] = row
    return matrix


def drug_featurizer(smiles, max_neighbors=6):
    """
    Featurize one SMILES string into MONN-style graph arrays.

    NOTE(review): this function is still marked "TODO fix padding and
    masking" in the file. As originally written it could not run: it stored
    feature vectors into scalar slots, used float counters as indices
    (hidden by a bare ``except``), passed 2-D arrays to ``add_index`` and
    returned five values on failure but six on success. The shapes below are
    what this version produces for a single, unpadded molecule -- confirm
    them against the consuming model.

    Parameters
    ----------
    smiles : str
        SMILES string, parsed with ``MolFromSmiles``.
    max_neighbors : int
        Number of neighbour slots reserved per atom.

    Returns
    -------
    tuple
        ``(vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj,
        neighbor_mask)``:

        * ``vertex_mask`` -- ``get_mask(feat_atoms)``.
        * ``feat_atoms`` -- one ``atom_features`` row per atom.
        * ``feat_bonds`` -- one ``bond_features`` row per bond (a one-element
          zero placeholder if the molecule has no bonds).
        * ``atom_adj`` / ``bond_adj`` -- flattened
          ``n_atoms * max_neighbors`` arrays of neighbouring atom / bond
          indices; unused slots are 0.
        * ``neighbor_mask`` -- ``(n_atoms, max_neighbors)`` array with 1 in
          every used slot.

        Six empty lists are returned when an atom has more than
        ``max_neighbors`` bonded neighbours.

    Raises
    ------
    ValueError
        If ``smiles`` cannot be parsed.
    """
    mol = MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # convert molecule to GNN input
    n_atoms = mol.GetNumAtoms()
    n_bonds = max(mol.GetNumBonds(), 1)  # keep at least one bond row

    feat_atoms = _feature_matrix(n_atoms, mol.GetAtoms(), atom_features)
    feat_bonds = _feature_matrix(n_bonds, mol.GetBonds(), bond_features)

    # Index and counter arrays must be integer typed to be usable as indices.
    atom_adj = np.zeros((n_atoms, max_neighbors), dtype=np.int64)
    bond_adj = np.zeros((n_atoms, max_neighbors), dtype=np.int64)
    n_neighbors = np.zeros((n_atoms,), dtype=np.int64)

    for bond in mol.GetBonds():
        a1 = bond.GetBeginAtom().GetIdx()
        a2 = bond.GetEndAtom().GetIdx()
        idx = bond.GetIdx()
        if n_neighbors[a1] >= max_neighbors or n_neighbors[a2] >= max_neighbors:
            # More neighbours than reserved slots: signal failure with the
            # same arity as the success return.
            return [], [], [], [], [], []
        atom_adj[a1, n_neighbors[a1]] = a2
        atom_adj[a2, n_neighbors[a2]] = a1
        bond_adj[a1, n_neighbors[a1]] = idx
        bond_adj[a2, n_neighbors[a2]] = idx
        n_neighbors[a1] += 1
        n_neighbors[a2] += 1

    # Slot j of atom i is in use exactly when j < n_neighbors[i].
    neighbor_mask = (np.arange(max_neighbors) < n_neighbors[:, np.newaxis]).astype(float)

    vertex_mask = get_mask(feat_atoms)
    # vertex = pack_1d(feat_atoms)
    # edge = pack_1d(feat_bonds)
    # atom_adj = pack_2d(atom_adj)
    # bond_adj = pack_2d(bond_adj)
    # nbs_mask = pack_2d(n_neighbors_mat)

    # add_index expects (batch, n_vertex, n_nbs); a single molecule is a
    # batch of one, for which the added offset is zero.
    # NOTE(review): the ebd_size values only matter once real batching exists.
    atom_adj = add_index(atom_adj[np.newaxis], n_atoms)
    bond_adj = add_index(bond_adj[np.newaxis], n_bonds)

    return vertex_mask, feat_atoms, feat_bonds, atom_adj, bond_adj, neighbor_mask
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# TODO WIP the pairwise_label matrix probably should be generated beforehand and stored as an extra label in the dataset
|
| 75 |
+
def get_pairwise_label(pdbid, interaction_dict, mol):
    """
    Build the ligand-atom x protein-residue interaction matrix for one entry.

    Parameters
    ----------
    pdbid
        Key into ``interaction_dict``.
    interaction_dict : dict
        Maps a PDB id to a record with the keys read here: ``atom_element``,
        ``atom_name``, ``uniprot_seq``, ``atom_bond_type`` (pairs of
        ``(atom_name, bond_type)``) and ``residue_bond_type`` (pairs of
        ``(seq_idx, bond_type)``).
    mol
        Molecule whose atoms (via ``GetAtoms()`` / ``GetSymbol()``) must
        match the non-hydrogen entries of ``atom_element``.

    Returns
    -------
    tuple
        ``(True, pairwise_mat)`` when at least one interaction is recorded,
        where ``pairwise_mat`` is an int32 array of shape
        ``(n_non_H_atoms, len(uniprot_seq))`` with 1 wherever an atom and a
        residue share a bond type; otherwise ``(False, np.zeros((1, 1)))``.

    Raises
    ------
    AssertionError
        If the non-hydrogen elements of the record do not match ``mol``.
    ValueError
        If an atom name in ``atom_bond_type`` is not a non-hydrogen atom of
        the record (raised by ``list.index``).
    """
    if pdbid in interaction_dict:
        record = interaction_dict[pdbid]
        sdf_element = np.array([atom.GetSymbol().upper() for atom in mol.GetAtoms()])
        atom_element = np.array(record['atom_element'], dtype=str)
        atom_name_list = np.array(record['atom_name'], dtype=str)
        nonH_position = np.where(atom_element != 'H')[0]
        # array_equal also fails cleanly when the two atom counts differ.
        assert np.array_equal(atom_element[nonH_position], sdf_element), \
            f"Heavy-atom elements of {pdbid} do not match the molecule"

        atom_name_list = atom_name_list[nonH_position].tolist()
        pairwise_mat = np.zeros((len(nonH_position), len(record['uniprot_seq'])), dtype=np.int32)
        for atom_name, bond_type in record['atom_bond_type']:
            atom_idx = atom_name_list.index(str(atom_name))
            for seq_idx, bond_type_seq in record['residue_bond_type']:
                if bond_type == bond_type_seq:
                    pairwise_mat[atom_idx, seq_idx] = 1
        if pairwise_mat.any():
            return True, pairwise_mat
    return False, np.zeros((1, 1))
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def protein_featurizer(fasta):
    """
    Encode a FASTA sequence with ``fasta_to_label`` and build its mask with
    ``get_mask``.

    Returns
    -------
    tuple
        ``(seq_mask, sequence)``.
    """
    encoded = fasta_to_label(fasta)
    # pad proteins and make masks
    mask = get_mask(encoded)
    return mask, encoded
|
deepscreen/data/featurizers/token.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import collections
|
| 2 |
+
from importlib import resources
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
from typing import Optional, List
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from transformers import BertTokenizer
|
| 9 |
+
|
| 10 |
+
SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
|
| 11 |
+
# \[[^\]]+\] # match anything inside square brackets
|
| 12 |
+
# |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p # match elements
|
| 13 |
+
# |\(|\) # match parentheses
|
| 14 |
+
# |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2} # match various symbols
|
| 15 |
+
# |[0-9] # match digits
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def sequence_to_kmers(sequence, k=3):
    """ Divide a string into its overlapping k-mers.

    Parameters:
        sequence (string)
        k (int), default 3; must be at least 1
    Returns:
        List of the overlapping length-k substrings of `sequence`, in order.
        Empty when `sequence` is shorter than k.
    Raises:
        ValueError: if k is smaller than 1 (the slices would otherwise be
        empty or wrap around silently).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    return [sequence[start:start + k] for start in range(len(sequence) - k + 1)]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def sequence_to_word_embedding(sequence, model):
    """Get protein embedding: map the sequence's 3-mers to a (num_word, dim) matrix.

    Each 3-mer from ``sequence_to_kmers`` is looked up as ``model.wv[kmer]``.
    A 3-mer whose lookup raises ``KeyError`` keeps an all-zero row.

    ``dim`` is ``model.wv.vector_size`` when that attribute exists, and 100
    (the previously hard-coded width) otherwise.
    NOTE(review): presumably ``model`` is a gensim Word2Vec-style model --
    confirm against callers.
    """
    kmers = sequence_to_kmers(sequence)
    dim = getattr(model.wv, 'vector_size', 100)
    vec = np.zeros((len(kmers), dim))
    for row, word in enumerate(kmers):
        try:
            vec[row] = model.wv[word]
        except KeyError:
            # Out-of-vocabulary k-mer: leave its row as zeros.
            pass
    return vec
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def sequence_to_token_ids(sequence, tokenizer):
    """Encode `sequence` with `tokenizer.encode` and return the ids as a NumPy array."""
    return np.array(tokenizer.encode(sequence))
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# def sequence_to_token_ids(sequence, tokenizer, max_length: int):
|
| 50 |
+
# token_ids = tokenizer.encode(sequence)
|
| 51 |
+
# length = min(max_length, len(token_ids))
|
| 52 |
+
#
|
| 53 |
+
# token_ids_padded = np.zeros(max_length, dtype='int')
|
| 54 |
+
# token_ids_padded[:length] = token_ids[:length]
|
| 55 |
+
#
|
| 56 |
+
# return token_ids_padded
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class SmilesTokenizer(BertTokenizer):
    """
    Adapted from https://github.com/deepchem/deepchem/.

    Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
    implementation found in Huggingface's transformers library. SMILES strings are split into
    tokens with the tokenization SMILES regex developed by Schwaller et al., and the tokens are
    mapped to ids through a one-token-per-line vocabulary file.

    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------
    >>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
    >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
    [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]


    References
    ----------
    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
       Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
       Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Note
    ----
    This class requires huggingface's transformers and tokenizers libraries to be installed.
    """

    def __init__(
            self,
            vocab_file: str = 'resources/vocabs/smiles.txt',
            regex_pattern: str = SMI_REGEX_PATTERN,
            # unk_token="[UNK]",
            # sep_token="[SEP]",
            # pad_token="[PAD]",
            # cls_token="[CLS]",
            # mask_token="[MASK]",
            **kwargs):
        """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a SMILES character per line vocabulary file.
            Defaults to 'resources/vocabs/smiles.txt'.
        regex_pattern: str
            Regex used by the BasicSmilesTokenizer to split a SMILES string into tokens.
        **kwargs
            Forwarded unchanged to ``BertTokenizer.__init__``.

        Raises
        ------
        ValueError
            If ``vocab_file`` is not an existing file.
        """

        super().__init__(vocab_file, **kwargs)

        # NOTE(review): the base class presumably validates and loads the vocab file already;
        # the statements below then re-load it and replace the base-class attributes -- confirm.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocab file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Position of the last "[unused..." entry in the vocabulary, or 0 when there is none.
        unused_indexes = [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")]
        self.highest_unused_index = 0 if len(unused_indexes) == 0 else max(unused_indexes)
        # Reverse mapping (id -> token) used by `_convert_id_to_token`.
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        # Replace the base-class basic tokenizer with the regex-based SMILES splitter.
        self.basic_tokenizer = BasicSmilesTokenizer(regex_pattern=regex_pattern)

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary."""
        return len(self.vocab)

    @property
    def vocab_list(self):
        """Vocabulary tokens as a list, in vocabulary order."""
        return list(self.vocab.keys())

    def _tokenize(self, text: str, max_seq_length: int = 512, **kwargs):
        """Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        max_seq_length: int
            Upper bound on the sequence length; at most ``max_seq_length - 2`` tokens are kept
            (presumably to leave room for the [CLS] and [SEP] special tokens).
        **kwargs
            Accepted but not used.
        """

        max_len_single_sentence = max_seq_length - 2
        split_tokens = [
            token for token in self.basic_tokenizer.tokenize(text)
            [:max_len_single_sentence]
        ]
        return split_tokens

    def _convert_token_to_id(self, token: str):
        """Converts a token (str/unicode) in an id using the vocab.

        Tokens missing from the vocab map to the id of ``self.unk_token``.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """

        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int):
        """Converts an index (integer) in a token (string/unicode) using the vocab.

        Unknown indices map to ``self.unk_token``.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """

        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]):
        """Converts a sequence of tokens (string) in a single string.

        Tokens are joined with single spaces, every " ##" occurrence is removed
        and surrounding whitespace is stripped.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """

        out_string: str = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def add_special_tokens_ids_single_sequence(self,
                                               token_ids: List[Optional[int]]):
        """Adds special tokens to a sequence for sequence classification tasks.

        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        """

        return [self.cls_token_id] + token_ids + [self.sep_token_id]

    def add_special_tokens_single_sequence(self, tokens: List[str]):
        """Adds special tokens to a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.
        """
        return [self.cls_token] + tokens + [self.sep_token]

    def add_special_tokens_ids_sequence_pair(
            self, token_ids_0: List[Optional[int]],
            token_ids_1: List[Optional[int]]) -> List[Optional[int]]:
        """Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).
        token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        return cls + token_ids_0 + sep + token_ids_1 + sep

    def add_padding_tokens(self,
                           token_ids: List[Optional[int]],
                           length: int,
                           right: bool = True) -> List[Optional[int]]:
        """Adds padding tokens to return a sequence of length ``length``.
        By default padding tokens are added to the right of the sequence.
        When ``token_ids`` already has ``length`` or more entries, no padding is added
        and the ids are returned without truncation.

        Parameters
        ----------
        token_ids: list[optional[int]]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        length: int
            Target length of the padded sequence.
        right: bool, default True
            Pad on the right when True, on the left otherwise.

        Returns
        -------
        List[int]
        """
        padding = [self.pad_token_id] * (length - len(token_ids))

        if right:
            return token_ids + padding
        else:
            return padding + token_ids
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class BasicSmilesTokenizer(object):
    """
    Adapted from https://github.com/deepchem/deepchem/.
    Regex-only SMILES tokenizer built on the pattern developed by Schwaller et. al.
    Use it when SMILES tokenization is needed without depending on the HuggingFace
    transformers library.

    Examples
    --------
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']


    References
    ----------
    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
       ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
       1572-1583 DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """Constructs a BasicSMILESTokenizer.

        Parameters
        ----------
        regex_pattern: string
            SMILES token regex; kept as ``self.regex_pattern`` and compiled
            once into ``self.regex``.
        """
        compiled = re.compile(regex_pattern)
        self.regex_pattern = regex_pattern
        self.regex = compiled

    def tokenize(self, text):
        """Return every regex match found in ``text``, in order, as a list of tokens."""
        return list(self.regex.findall(text))
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered mapping.

    Each line, with its trailing newline removed, becomes a key whose value is
    the zero-based line number. The file is read as UTF-8.
    """
    with open(vocab_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    return collections.OrderedDict(
        (line.rstrip("\n"), line_number) for line_number, line in enumerate(lines)
    )
|
deepscreen/data/single_entity.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from itertools import product
|
| 2 |
+
from numbers import Number
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any, Dict, Optional, Sequence, Union, Literal
|
| 5 |
+
|
| 6 |
+
# import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from lightning import LightningDataModule
|
| 9 |
+
from sklearn.base import TransformerMixin
|
| 10 |
+
from torch.utils.data import Dataset, DataLoader, random_split
|
| 11 |
+
|
| 12 |
+
from deepscreen.data.utils.dataset import SingleEntitySingleTargetDataset, BaseEntityDataset
|
| 13 |
+
from deepscreen.data.utils.label import label_transform
|
| 14 |
+
from deepscreen.data.utils.collator import collate_fn
|
| 15 |
+
from deepscreen.data.utils.sampler import SafeBatchSampler
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class EntityDataModule(LightningDataModule):
    """
    DataModule for single-entity datasets.

    A DataModule implements 5 key methods:

        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc.
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc.
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html

    Loaded datasets are exposed as ``data_train``, ``data_val``, ``data_test``
    and ``data_predict``; each is ``None`` until it has been loaded.
    """

    def __init__(
            self,
            dataset: type[BaseEntityDataset],
            task: Literal['regression', 'binary', 'multiclass'],
            n_classes: Optional[int],
            train: bool,
            batch_size: int,
            num_workers: int = 0,
            thresholds: Optional[Union[Number, Sequence[Number]]] = None,
            pin_memory: bool = False,
            data_dir: str = "data/",
            data_file: Optional[str] = None,
            train_val_test_split: Optional[Union[Sequence[Number], Sequence[str]]] = None,
            split: Optional[callable] = random_split,
    ):
        """Validate the configuration and eagerly load pre-split datasets.

        Three configurations are accepted:

        * training from one file: ``train=True``, ``data_file`` and ``split`` set,
          ``train_val_test_split`` a sequence of numbers; the file is loaded and
          split later, in ``setup()``;
        * training from three files: ``train=True``, ``data_file=None``,
          ``split=None``, ``train_val_test_split`` three file names relative to
          ``data_dir``; the files are loaded immediately;
        * testing/predicting: ``train=False``, only ``data_file`` set
          (``split=None`` and no ``train_val_test_split``); the file is loaded
          immediately and used for both test and predict.

        :param dataset: callable invoked as ``dataset(dataset_path=...)`` to build a dataset.
        :param task: stored in ``hparams``; not otherwise used by this class.
        :param n_classes: stored in ``hparams``; not otherwise used by this class.
        :param train: selects training (True) or testing/predicting (False) mode.
        :param batch_size: batch size handed to ``SafeBatchSampler``.
        :param num_workers: ``DataLoader`` worker count.
        :param thresholds: stored in ``hparams``; not otherwise used by this class.
        :param pin_memory: ``DataLoader`` ``pin_memory`` flag.
        :param data_dir: directory that file names are resolved against.
        :param data_file: name of the single data file, if any.
        :param train_val_test_split: split lengths (numbers) or three file names.
        :param split: callable invoked as ``split(dataset=..., lengths=...)``.
        :raises ValueError: if the arguments match none of the configurations above.
        """
        super().__init__()
        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        # data processing
        self.split = split

        # Define every dataset slot up front so setup() and the dataloaders can
        # rely on the attributes existing, whichever configuration was chosen.
        self.data_train = None
        self.data_val = None
        self.data_test = None
        self.data_predict = None

        if train:
            if all([data_file, split]):
                if not self._is_sequence_of(train_val_test_split, Number):
                    raise ValueError('`train_val_test_split` must be a sequence of 3 numbers '
                                     '(float for percentages and int for sample numbers) if '
                                     '`data_file` and `split` have been specified.')
                # The single file is loaded and split in setup().
            elif (self._is_sequence_of(train_val_test_split, str)
                  and len(train_val_test_split) == 3
                  and not any([data_file, split])):
                self.data_train = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[0]))
                self.data_val = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[1]))
                self.data_test = dataset(dataset_path=str(Path(data_dir) / train_val_test_split[2]))
            else:
                raise ValueError('For training (train=True), you must specify either '
                                 '`dataset_name` and `split` with `train_val_test_split` of 3 numbers or '
                                 'solely `train_val_test_split` of 3 data file names.')
        else:
            if data_file and not any([split, train_val_test_split]):
                self.data_test = self.data_predict = dataset(dataset_path=str(Path(data_dir) / data_file))
            else:
                raise ValueError("For testing/predicting (train=False), you must specify only `data_file` without "
                                 "`train_val_test_split` or `split`")

    @staticmethod
    def _is_sequence_of(values, kind) -> bool:
        """Return True if ``values`` is a non-empty, non-string sequence whose items are all ``kind``."""
        if values is None or isinstance(values, str):
            return False
        return len(values) > 0 and all(isinstance(value, kind) for value in values)

    def prepare_data(self):
        """
        Download data if needed.
        Do not use it to assign state (e.g., self.x = x).
        """

    def setup(self, stage: Optional[str] = None, encoding: str = None):
        """
        Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.
        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute data splitting twice.

        Nothing happens when any dataset has already been loaded, either in
        ``__init__`` or by an earlier ``setup()`` call. ``stage`` and ``encoding``
        are accepted but not used.
        """
        # load and split datasets only if not loaded in initialization
        loaded = (self.data_train, self.data_val, self.data_test, self.data_predict)
        # Compare against None: truth-testing a dataset calls len(), so an empty
        # dataset would be mistaken for "not loaded".
        if any(data is not None for data in loaded):
            return

        # NOTE(review): built with the `dataset` factory exactly as __init__ does;
        # `task`, `n_classes` and `thresholds` are assumed to be bound into that
        # factory already -- confirm against the caller/config.
        dataset = self.hparams.dataset(
            dataset_path=str(Path(self.hparams.data_dir) / self.hparams.data_file)
        )

        if self.hparams.train:
            self.data_train, self.data_val, self.data_test = self.split(
                dataset=dataset,
                lengths=self.hparams.train_val_test_split
            )
        else:
            self.data_test = self.data_predict = dataset

    def _dataloader(self, data, shuffle: bool) -> DataLoader:
        """Build a ``DataLoader`` over ``data``, batched by a ``SafeBatchSampler``."""
        return DataLoader(
            dataset=data,
            batch_sampler=SafeBatchSampler(
                data_source=data,
                batch_size=self.hparams.batch_size,
                shuffle=shuffle),
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            collate_fn=collate_fn,
            # Persistent workers are only enabled when worker processes exist.
            persistent_workers=self.hparams.num_workers > 0
        )

    def train_dataloader(self):
        """Return the shuffled dataloader over ``self.data_train``."""
        return self._dataloader(self.data_train, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled dataloader over ``self.data_val``."""
        return self._dataloader(self.data_val, shuffle=False)

    def test_dataloader(self):
        """Return the unshuffled dataloader over ``self.data_test``."""
        return self._dataloader(self.data_test, shuffle=False)

    def predict_dataloader(self):
        """Return the unshuffled dataloader over ``self.data_predict``."""
        return self._dataloader(self.data_predict, shuffle=False)

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass
|
deepscreen/data/utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Sequence, TypeVar, Union
|
| 2 |
+
|
| 3 |
+
from deepscreen.data.utils.collator import collate_fn
|
| 4 |
+
from deepscreen.data.utils.label import label_transform
|
| 5 |
+
from deepscreen.data.utils.sampler import SafeBatchSampler
|
| 6 |
+
|
| 7 |
+
T = TypeVar('T')
|
| 8 |
+
FlexibleIterable = Union[T, Sequence[T], Dict[str, T]]
|
deepscreen/data/utils/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (701 Bytes). View file
|
|
|
deepscreen/data/utils/__pycache__/collator.cpython-311.pyc
ADDED
|
Binary file (4.97 kB). View file
|
|
|
deepscreen/data/utils/__pycache__/label.cpython-311.pyc
ADDED
|
Binary file (4.88 kB). View file
|
|
|
deepscreen/data/utils/__pycache__/sampler.cpython-311.pyc
ADDED
|
Binary file (3.56 kB). View file
|
|
|
deepscreen/data/utils/__pycache__/split.cpython-311.pyc
ADDED
|
Binary file (5.68 kB). View file
|
|
|
deepscreen/data/utils/collator.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Define collate functions for new data types here
|
| 3 |
+
"""
|
| 4 |
+
from functools import partial
|
| 5 |
+
from itertools import chain
|
| 6 |
+
|
| 7 |
+
import dgl
|
| 8 |
+
import torch
|
| 9 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 10 |
+
from torch.utils.data._utils.collate import default_collate_fn_map, collate_tensor_fn, collate
|
| 11 |
+
import torch_geometric
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def collate_pyg_fn(batch, collate_fn_map=None):
    """
    PyG graph collation: merge a list of PyG data objects into one batch object
    through ``torch_geometric.data.Batch.from_data_list``.

    :param batch: list of PyG data objects to merge.
    :param collate_fn_map: not used here; presumably kept because PyTorch's ``collate``
        passes it to every type-specific collate function -- confirm.
    :return: the result of ``Batch.from_data_list(batch)``.
    """
    return torch_geometric.data.Batch.from_data_list(batch)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def collate_dgl_fn(batch, collate_fn_map=None):
    """
    DGL graph collation: merge a list of DGL graphs into one batched graph
    through ``dgl.batch``.

    :param batch: list of DGL graphs to merge.
    :param collate_fn_map: not used here; presumably kept because PyTorch's ``collate``
        passes it to every type-specific collate function -- confirm.
    :return: the result of ``dgl.batch(batch)``.
    """
    return dgl.batch(batch)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def pad_collate_tensor_fn(batch, padding_value=0.0, collate_fn_map=None):
    """
    Collate tensors whose first dimension may differ, padding where needed.

    Similar to pad_packed_sequence(pack_sequence(batch, enforce_sorted=False), batch_first=True),
    but additionally supports padding a list of square Tensors of size ``(L x L x ...)``.

    Tensors with equal first-dimension sizes are collated without padding.
    Otherwise ``pad_sequence`` is tried first; if it raises ``RuntimeError``,
    every dimension of every tensor is padded at its end up to the per-dimension
    maximum over the batch before collating.

    :param batch: list of tensors to collate.
    :param padding_value: fill value for padded positions.
    :param collate_fn_map: not used by this function.
    :return: padded_batch, lengths (first-dimension size of each original tensor)
    """
    first_dim_sizes = [item.size(0) for item in batch]
    same_length = all(size == first_dim_sizes[0] for size in first_dim_sizes[1:])

    if same_length:
        collated = collate_tensor_fn(batch)
    else:
        try:
            # Only the first dimension differs: pad_sequence handles it.
            collated = pad_sequence(batch, batch_first=True, padding_value=padding_value)
        except RuntimeError:
            # Other dimensions differ too: grow each one to the batch-wide maximum.
            n_dims = batch[0].dim()
            target_sizes = [max(item.size(d) for item in batch) for d in range(n_dims)]
            padded_items = []
            for item in batch:
                # F.pad takes (before, after) pairs starting from the LAST dimension.
                pad_spec = []
                for d in reversed(range(item.dim())):
                    pad_spec.extend((0, target_sizes[d] - item.size(d)))
                padded_items.append(
                    torch.nn.functional.pad(item, tuple(pad_spec), mode='constant', value=padding_value)
                )
            collated = collate_tensor_fn(padded_items)

    # Return the padded batch tensor and the lengths
    return collated, torch.as_tensor(first_dim_sizes)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Join custom collate functions with the default collation map of PyTorch
|
| 61 |
+
COLLATE_FN_MAP = default_collate_fn_map | {
|
| 62 |
+
torch_geometric.data.data.BaseData: collate_pyg_fn,
|
| 63 |
+
dgl.DGLGraph: collate_dgl_fn,
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def collate_fn(batch, automatic_padding=False, padding_value=0):
    """
    Collate a batch with PyTorch's ``collate`` and this module's ``COLLATE_FN_MAP``.

    :param batch: list of samples to collate.
    :param automatic_padding: if True, tensors are collated by
        ``pad_collate_tensor_fn`` for this call instead of the map's default
        tensor handler.
    :param padding_value: fill value used when ``automatic_padding`` is True.
    :return: the collated batch.
    """
    if automatic_padding:
        # Build a per-call map rather than updating COLLATE_FN_MAP in place:
        # an in-place update would keep padding (and this padding value) active
        # for every later call, including calls with automatic_padding=False.
        collate_fn_map = COLLATE_FN_MAP | {
            torch.Tensor: partial(pad_collate_tensor_fn, padding_value=padding_value),
        }
    else:
        collate_fn_map = COLLATE_FN_MAP
    return collate(batch, collate_fn_map=collate_fn_map)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# class VariableLengthSequence(torch.Tensor):
|
| 76 |
+
# """
|
| 77 |
+
# A custom PyTorch Tensor class that is similar to PackedSequence, except it can be directly used as a batch tensor,
|
| 78 |
+
# and it has an attribute called lengths, which signifies the length of each original sequence in the batch.
|
| 79 |
+
# """
|
| 80 |
+
#
|
| 81 |
+
# def __new__(cls, data, lengths):
|
| 82 |
+
# """
|
| 83 |
+
# Creates a new VariableLengthSequence object from the given data and lengths.
|
| 84 |
+
# Args:
|
| 85 |
+
# data (torch.Tensor): The batch collated tensor of shape (batch_size, max_length, *).
|
| 86 |
+
# lengths (torch.Tensor): The lengths of each original sequence in the batch of shape (batch_size,).
|
| 87 |
+
# Returns:
|
| 88 |
+
# VariableLengthSequence: A new VariableLengthSequence object.
|
| 89 |
+
# """
|
| 90 |
+
# # Check the validity of the inputs
|
| 91 |
+
# assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
|
| 92 |
+
# assert isinstance(lengths, torch.Tensor), "lengths must be a torch.Tensor"
|
| 93 |
+
# assert data.dim() >= 2, "data must have at least two dimensions"
|
| 94 |
+
# assert lengths.dim() == 1, "lengths must have one dimension"
|
| 95 |
+
# assert data.size(0) == lengths.size(0), "data and lengths must have the same batch size"
|
| 96 |
+
# assert lengths.min() > 0, "lengths must be positive"
|
| 97 |
+
# assert lengths.max() <= data.size(1), "lengths must not exceed the max length of data"
|
| 98 |
+
#
|
| 99 |
+
# # Create a new tensor object from data
|
| 100 |
+
# obj = super().__new__(cls, data)
|
| 101 |
+
#
|
| 102 |
+
# # Set the lengths attribute
|
| 103 |
+
# obj.lengths = lengths
|
| 104 |
+
#
|
| 105 |
+
# return obj
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# class VariableLengthSequence(torch.Tensor):
|
| 109 |
+
# _lengths = torch.Tensor()
|
| 110 |
+
#
|
| 111 |
+
# def __new__(cls, data, lengths, *args, **kwargs):
|
| 112 |
+
# self = super().__new__(cls, data, *args, **kwargs)
|
| 113 |
+
# self.lengths = lengths
|
| 114 |
+
# return self
|
| 115 |
+
#
|
| 116 |
+
# def clone(self, *args, **kwargs):
|
| 117 |
+
# return VariableLengthSequence(super().clone(*args, **kwargs), self.lengths.clone())
|
| 118 |
+
#
|
| 119 |
+
# def new_empty(self, *size):
|
| 120 |
+
# return VariableLengthSequence(super().new_empty(*size), self.lengths)
|
| 121 |
+
#
|
| 122 |
+
# def to(self, *args, **kwargs):
|
| 123 |
+
# return VariableLengthSequence(super().to(*args, **kwargs), self.lengths.to(*args, **kwargs))
|
| 124 |
+
#
|
| 125 |
+
# def __format__(self, format_spec):
|
| 126 |
+
# # Convert self to a string or a number here, depending on what you need
|
| 127 |
+
# return self.item().__format__(format_spec)
|
| 128 |
+
#
|
| 129 |
+
# @property
|
| 130 |
+
# def lengths(self):
|
| 131 |
+
# return self._lengths
|
| 132 |
+
#
|
| 133 |
+
# @lengths.setter
|
| 134 |
+
# def lengths(self, lengths):
|
| 135 |
+
# self._lengths = lengths
|
| 136 |
+
#
|
| 137 |
+
# def cpu(self, *args, **kwargs):
|
| 138 |
+
# return VariableLengthSequence(super().cpu(*args, **kwargs), self.lengths.cpu(*args, **kwargs))
|
| 139 |
+
#
|
| 140 |
+
# def cuda(self, *args, **kwargs):
|
| 141 |
+
# return VariableLengthSequence(super().cuda(*args, **kwargs), self.lengths.cuda(*args, **kwargs))
|
| 142 |
+
#
|
| 143 |
+
# def pin_memory(self):
|
| 144 |
+
# return VariableLengthSequence(super().pin_memory(), self.lengths.pin_memory())
|
| 145 |
+
#
|
| 146 |
+
# def share_memory_(self):
|
| 147 |
+
# super().share_memory_()
|
| 148 |
+
# self.lengths.share_memory_()
|
| 149 |
+
# return self
|
| 150 |
+
#
|
| 151 |
+
# def detach_(self, *args, **kwargs):
|
| 152 |
+
# super().detach_(*args, **kwargs)
|
| 153 |
+
# self.lengths.detach_(*args, **kwargs)
|
| 154 |
+
# return self
|
| 155 |
+
#
|
| 156 |
+
# def detach(self, *args, **kwargs):
|
| 157 |
+
# return VariableLengthSequence(super().detach(*args, **kwargs), self.lengths.detach(*args, **kwargs))
|
| 158 |
+
#
|
| 159 |
+
# def record_stream(self, *args, **kwargs):
|
| 160 |
+
# super().record_stream(*args, **kwargs)
|
| 161 |
+
# self.lengths.record_stream(*args, **kwargs)
|
| 162 |
+
# return self
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# @classmethod
|
| 166 |
+
# def __torch_function__(cls, func, types, args=(), kwargs=None):
|
| 167 |
+
# return super().__torch_function__(func, types, args, kwargs) \
|
| 168 |
+
# if cls.lengths is not None else torch.Tensor.__torch_function__(func, types, args, kwargs)
|
deepscreen/data/utils/dataset.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from numbers import Number
|
| 2 |
+
from typing import Literal, Union, Sequence
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.base import TransformerMixin
|
| 6 |
+
from sklearn.exceptions import NotFittedError
|
| 7 |
+
from sklearn.utils.validation import check_is_fitted
|
| 8 |
+
from torch.utils.data import Dataset
|
| 9 |
+
|
| 10 |
+
from deepscreen.data.utils import label_transform, FlexibleIterable
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class BaseEntityDataset(Dataset):
    """Base dataset backed by a CSV table whose columns are selected by name prefix.

    Only columns whose names start with one of ``use_col_prefixes`` are read.
    Columns starting with ``Y`` are read as ``float32``; all others as ``string``.
    The table is stored in ``self.df`` and the column names are grouped into
    ``label_cols`` (``Y``), ``label_unit_cols`` (``U``), ``entity_id_cols``
    (``ID``) and ``entity_cols`` (``X``). Subclasses must implement ``__getitem__``.
    """

    def __init__(
            self,
            dataset_path: str,
            use_col_prefixes=('X', 'Y', 'ID', 'U')
    ):
        def is_selected(column_name):
            return column_name.startswith(use_col_prefixes)

        # Read only the header row first to decide which columns to load and with which dtype.
        header = pd.read_csv(dataset_path, header=0, nrows=0, usecols=is_selected)
        column_dtypes = {}
        for name in header.columns:
            column_dtypes[name] = 'float32' if name.startswith('Y') else 'string'

        # Read the full table, restricted to the selected columns.
        table = pd.read_csv(
            dataset_path,
            header=0,
            usecols=header.columns,
            dtype=column_dtypes
        )

        def columns_starting_with(prefix):
            return [name for name in table.columns if name.startswith(prefix)]

        self.df = table
        self.label_cols = columns_starting_with('Y')
        self.label_unit_cols = columns_starting_with('U')
        self.entity_id_cols = columns_starting_with('ID')
        self.entity_cols = columns_starting_with('X')

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Not implemented here; subclasses define how a row becomes a sample."""
        raise NotImplementedError
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# TODO test transform
|
| 48 |
+
class SingleEntitySingleTargetDataset(BaseEntityDataset):
|
| 49 |
+
def __init__(
|
| 50 |
+
self,
|
| 51 |
+
dataset_path: str,
|
| 52 |
+
task: Literal['regression', 'binary', 'multiclass'],
|
| 53 |
+
n_classes: int,
|
| 54 |
+
featurizer: callable,
|
| 55 |
+
transformer: TransformerMixin = None,
|
| 56 |
+
thresholds: Union[Number, Sequence[Number]] = None,
|
| 57 |
+
discard_intermediate: bool = None,
|
| 58 |
+
forward_fill: bool = True
|
| 59 |
+
):
|
| 60 |
+
super().__init__(dataset_path)
|
| 61 |
+
|
| 62 |
+
assert len(self.entity_cols) == 1, 'The dataset contains more than 1 entity column (starting with `X`).'
|
| 63 |
+
if len(self.label_cols) >= 0:
|
| 64 |
+
assert len(self.label_cols) == 1, 'The dataset contains more than 1 label column (starting with `Y`).'
|
| 65 |
+
# Remove trailing `1`s in column names for flexibility
|
| 66 |
+
self.df.columns = self.df.columns.str.rstrip('1')
|
| 67 |
+
|
| 68 |
+
# Forward-fill non-label columns
|
| 69 |
+
nonlabel_cols = self.label_unit_cols + self.entity_id_cols + self.entity_cols
|
| 70 |
+
if forward_fill:
|
| 71 |
+
self.df[nonlabel_cols] = self.df[nonlabel_cols].ffill(axis=0)
|
| 72 |
+
|
| 73 |
+
# Process target labels for training/testing if exist
|
| 74 |
+
if self.label_cols:
|
| 75 |
+
# Transform target labels
|
| 76 |
+
self.df[self.label_cols] = self.df[self.label_cols].apply(
|
| 77 |
+
label_transform,
|
| 78 |
+
units=self.df.get('U', None),
|
| 79 |
+
thresholds=thresholds,
|
| 80 |
+
discard_intermediate=discard_intermediate).astype('float32')
|
| 81 |
+
|
| 82 |
+
# Filter out rows with a NaN in Y (missing values); use inplace to save memory
|
| 83 |
+
self.df.dropna(subset=self.label_cols, inplace=True)
|
| 84 |
+
|
| 85 |
+
# Validate target labels
|
| 86 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
| 87 |
+
match task:
|
| 88 |
+
case 'regression':
|
| 89 |
+
assert all(self.df['Y'].apply(lambda x: isinstance(x, Number))), \
|
| 90 |
+
f"Y for task `regression` must be numeric; got {set(self.df['Y'].apply(type))}."
|
| 91 |
+
case 'binary':
|
| 92 |
+
assert all(self.df['Y'].isin([0, 1])), \
|
| 93 |
+
f"Y for task `binary` (classification) must be 0 or 1, but Y got {pd.unique(self.df['Y'])}." \
|
| 94 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
| 95 |
+
case 'multiclass':
|
| 96 |
+
assert n_classes >= 3, f'n_classes for task `multiclass` (classification) must be at least 3.'
|
| 97 |
+
assert all(self.df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
|
| 98 |
+
f"``Y` for task `multiclass` (classification) must be non-negative integers, " \
|
| 99 |
+
f"but `Y` got {pd.unique(self.df['Y'])}." \
|
| 100 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
| 101 |
+
target_n_unique = self.df['Y'].nunique()
|
| 102 |
+
assert target_n_unique == n_classes, \
|
| 103 |
+
f"You have set n_classes for task `multiclass` (classification) task to {n_classes}, " \
|
| 104 |
+
f"but `Y` has {target_n_unique} unique labels."
|
| 105 |
+
|
| 106 |
+
if transformer:
|
| 107 |
+
self.df['X'] = self.df['X'].apply(featurizer)
|
| 108 |
+
try:
|
| 109 |
+
check_is_fitted(transformer)
|
| 110 |
+
self.df['X'] = list(transformer.transform(self.df['X']))
|
| 111 |
+
except NotFittedError:
|
| 112 |
+
self.df['X'] = list(transformer.fit_transform(self.df['X']))
|
| 113 |
+
|
| 114 |
+
# Skip sample-wise feature extraction because it has already been done dataset-wise
|
| 115 |
+
self.featurizer = lambda x: x
|
| 116 |
+
|
| 117 |
+
self.featurizer = featurizer
|
| 118 |
+
self.n_classes = n_classes
|
| 119 |
+
self.df['ID'] = self.df.get('ID', self.df['X'])
|
| 120 |
+
|
| 121 |
+
def __getitem__(self, idx):
|
| 122 |
+
sample = self.df.loc[idx]
|
| 123 |
+
return {
|
| 124 |
+
'X': self.featurizer(sample['X']),
|
| 125 |
+
'ID': sample['ID'],
|
| 126 |
+
'Y': sample.get('Y')
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# TODO WIP
|
| 131 |
+
class MultiEntityMultiTargetDataset(BaseEntityDataset):
|
| 132 |
+
def __init__(
|
| 133 |
+
self,
|
| 134 |
+
dataset_path: str,
|
| 135 |
+
task: FlexibleIterable[Literal['regression', 'binary', 'multiclass']],
|
| 136 |
+
n_class: FlexibleIterable[int],
|
| 137 |
+
featurizers: FlexibleIterable[callable],
|
| 138 |
+
thresholds: FlexibleIterable[Union[Number, Sequence[Number]]] = None,
|
| 139 |
+
discard_intermediate: FlexibleIterable[bool] = None,
|
| 140 |
+
):
|
| 141 |
+
super().__init__(dataset_path)
|
| 142 |
+
label_col_prefix = tuple('Y')
|
| 143 |
+
nonlabel_col_prefixes = tuple(('X', 'ID', 'U'))
|
| 144 |
+
allowed_col_prefixes = label_col_prefix + nonlabel_col_prefixes
|
| 145 |
+
|
| 146 |
+
# Read the headers first to filter columns and create column dtype dict
|
| 147 |
+
df = pd.read_csv(
|
| 148 |
+
dataset_path,
|
| 149 |
+
header=0, nrows=0,
|
| 150 |
+
usecols=lambda col: col.startswith(allowed_col_prefixes)
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# Read the whole table
|
| 154 |
+
df = pd.read_csv(
|
| 155 |
+
dataset_path,
|
| 156 |
+
header=0,
|
| 157 |
+
usecols=df.columns,
|
| 158 |
+
dtype={col: 'float32' if col.startswith('Y') else 'string' for col in df.columns}
|
| 159 |
+
)
|
| 160 |
+
label_cols = [col for col in df.columns if col.startswith(label_col_prefix)]
|
| 161 |
+
nonlabel_cols = [col for col in df.columns if col.startswith(nonlabel_col_prefixes)]
|
| 162 |
+
self.entity_cols = [col for col in nonlabel_cols if col.startswith('X')]
|
| 163 |
+
|
| 164 |
+
# Forward-fill all non-label columns
|
| 165 |
+
df[nonlabel_cols] = df[nonlabel_cols].ffill(axis=0)
|
| 166 |
+
|
| 167 |
+
# Process target labels for training/testing
|
| 168 |
+
if label_cols:
|
| 169 |
+
# Transform target labels
|
| 170 |
+
df[label_cols] = df[label_cols].apply(label_transform, units=df.get('U', None), thresholds=thresholds,
|
| 171 |
+
discard_intermediate=discard_intermediate).astype('float32')
|
| 172 |
+
|
| 173 |
+
# Filter out rows with a NaN in Y (missing values)
|
| 174 |
+
df.dropna(subset=label_cols, inplace=True)
|
| 175 |
+
|
| 176 |
+
# Validate target labels
|
| 177 |
+
# TODO: check sklearn.utils.multiclass.check_classification_targets
|
| 178 |
+
# WIP
|
| 179 |
+
match task:
|
| 180 |
+
case 'regression':
|
| 181 |
+
assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
|
| 182 |
+
f"Y for task `regression` must be numeric; got {set(df['Y'].apply(type))}."
|
| 183 |
+
case 'binary':
|
| 184 |
+
assert all(df['Y'].isin([0, 1])), \
|
| 185 |
+
f"Y for task `binary` must be 0 or 1, but Y got {pd.unique(df['Y'])}." \
|
| 186 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
| 187 |
+
case 'multiclass':
|
| 188 |
+
assert len(label_cols) == len(n_class), \
|
| 189 |
+
(f'Data table has {len(label_cols)} label columns (`Y*`) but you have specified '
|
| 190 |
+
f'n_class of length {len(n_class)} for task `multiclass`.')
|
| 191 |
+
for label, n in zip(df[label_cols], n_class):
|
| 192 |
+
assert n >= 3, f'n_class for task `multiclass` must be at least 3.'
|
| 193 |
+
assert all(label.apply(lambda x: x.is_integer() and x >= 0)), \
|
| 194 |
+
f"Y for task `multiclass` must be non-negative integers, " \
|
| 195 |
+
f"but Y got {pd.unique(label)}." \
|
| 196 |
+
"\nYou may set `thresholds` to discretize continuous labels."
|
| 197 |
+
target_n_unique = label.nunique()
|
| 198 |
+
assert target_n_unique == n, \
|
| 199 |
+
f"You have set n_classes for task `multiclass` task to {n}, " \
|
| 200 |
+
f"but Y has {target_n_unique} unique labels."
|
| 201 |
+
|
| 202 |
+
self.df = df
|
| 203 |
+
self.featurizers = featurizers
|
| 204 |
+
self.n_class = n_class
|
| 205 |
+
|
| 206 |
+
def __len__(self):
|
| 207 |
+
return len(self.df.index)
|
| 208 |
+
|
| 209 |
+
# WIP
|
| 210 |
+
def __getitem__(self, idx):
|
| 211 |
+
sample = self.df.loc[idx]
|
| 212 |
+
return {
|
| 213 |
+
'X': [featurizer(x) for featurizer, x in zip(self.featurizers, sample[self.entity_cols])],
|
| 214 |
+
'ID': sample.get('ID', sample['X']),
|
| 215 |
+
'Y': sample.get('Y')
|
| 216 |
+
}
|
deepscreen/data/utils/label.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from numbers import Number
|
| 2 |
+
from typing import Optional, Union
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from deepscreen.utils import get_logger
|
| 7 |
+
|
| 8 |
+
log = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
# Converters from a concentration expressed in the keyed unit to the p scale (-log10 of molar).
# 'p' means the value is already on the p scale and is returned unchanged.
MOLARITY_TO_POTENCY = {
    'p': lambda x: x,
    'M': lambda x: -np.log10(x),
    'mM': lambda x: -np.log10(x) + 3,
    '\u03bcM': lambda x: -np.log10(x) + 6,  # Greek small letter mu
    '\u00b5M': lambda x: -np.log10(x) + 6,  # micro sign; looks identical to the Greek mu
    'uM': lambda x: -np.log10(x) + 6,  # ASCII spelling of micromolar
    'nM': lambda x: -np.log10(x) + 9,
    'pM': lambda x: -np.log10(x) + 12,
    'fM': lambda x: -np.log10(x) + 15,
}


# TODO rewrite for swifter.apply
def molar_to_p(labels, units):
    """Convert concentration labels to the p scale (-log10[M]).

    :param labels: a sequence of numeric labels
    :param units: either one unit applied to every label, or a sequence with one unit per label;
        each unit must be a key of ``MOLARITY_TO_POTENCY``
    :return: a numpy array of the converted labels
    :raises ValueError: if a unit is unknown, or if ``labels`` and ``units`` differ in length
    """
    allowed_units = f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}."

    # A single unit applies to all labels and can be converted in one vectorized call.
    if isinstance(units, str):
        if units not in MOLARITY_TO_POTENCY:
            raise ValueError(allowed_units)
        return np.asarray(MOLARITY_TO_POTENCY[units](np.asarray(labels, dtype=float)))

    units = list(units)
    if len(units) != len(labels):
        raise ValueError(f"Got {len(labels)} labels but {len(units)} units.")
    # Validate every unit (the original tested the whole sequence for dict membership).
    if any(unit not in MOLARITY_TO_POTENCY for unit in units):
        raise ValueError(allowed_units)

    # Pair each label with its own unit (the original iterated over the 2-tuple itself).
    return np.array([MOLARITY_TO_POTENCY[unit](label) for label, unit in zip(labels, units)])
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def label_discretize(labels, thresholds):
    """Map continuous labels to discrete levels, in descending order of the label value.

    The level of a label is the number of thresholds strictly greater than it: labels at or
    above the greatest threshold get level 0 and labels below the lowest threshold get the
    highest level. With a single threshold this yields 1 for labels below it and 0 otherwise.

    A scalar threshold and a one-element sequence are treated identically. Missing labels
    (NaN) stay NaN instead of being assigned to a level; in that case the result is a float
    array, otherwise it is an integer array.

    :param labels: array-like of numeric labels
    :param thresholds: a number or a sequence of numbers, in any order
    :return: a numpy array of discrete levels
    """
    values = np.asarray(labels, dtype=float)
    # Sort ascending so `np.digitize` counts the thresholds that are <= each label.
    bins = np.sort(np.atleast_1d(thresholds))
    levels = bins.size - np.digitize(values, bins)

    # `np.digitize` places NaN past the last bin, which would silently label it as level 0.
    missing = np.isnan(values)
    if missing.any():
        levels = np.where(missing, np.nan, levels)

    return levels
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def label_transform(
        labels,
        units: Optional[list[str]],
        thresholds: Optional[Union[float, list[Number]]],
        discard_intermediate: Optional[bool]
):
    """Convert labels of all units to p scale (-log10[M]) and discretize them if specified.

    :param labels: a sequence of labels, continuous or binary values
    :type labels: array_like
    :param units: a sequence of label units; allowed values are the keys of `MOLARITY_TO_POTENCY`.
        `None` or an empty sequence skips the unit conversion.
    :type units: array_like, optional
    :param thresholds: discretization threshold(s) for affinity labels, in p scale (-log10[M]).
        A single number maps affinities below it to 1 and otherwise to 0.
        A tuple of two or more thresholds maps affinities to multiple discrete levels descendingly, assigning
        values below the lowest threshold to the highest level (e.g. 2) and values above the greatest threshold to 0.
        `None` or an empty sequence skips the discretization.
    :type thresholds: list, float, optional
    :param discard_intermediate: whether to discard the intermediate (indeterminate) level if provided an odd
        number of thresholds (>=3)
    :type discard_intermediate: bool
    :return: a numpy array of affinity labels in p scale (-log10[M]) or discrete labels; the input is
        returned unchanged when neither units nor thresholds are given
    """
    # Test for "provided" explicitly: truthiness is ambiguous for pandas Series / numpy arrays
    # and would also treat a legitimate threshold of 0 as "not provided".
    if units is not None and np.size(units) > 0:
        labels = molar_to_p(labels, units)

    if thresholds is not None and np.size(thresholds) > 0:
        labels = label_discretize(labels, thresholds)
        if discard_intermediate:
            # `np.size` also works for a scalar threshold, where `len()` would raise `TypeError`.
            n_thresholds = np.size(thresholds)
            # NOTE(review): an odd number of thresholds yields an even number of levels, so
            # there is no exact middle level; the documented contract is kept as-is -- confirm.
            assert n_thresholds % 2 == 1 and n_thresholds >= 3, \
                "Must give an odd number of (at least 3) thresholds to discard the intermediate level."
            intermediate_level = n_thresholds // 2
            # Work on a float copy: NaN cannot be stored in an integer array.
            labels = np.array(labels, dtype=float)
            # Make the intermediate-level labels NaN (which will be filtered out later)
            labels[labels == intermediate_level] = np.nan
            # Reduce all levels above the intermediate level by 1
            labels[labels > intermediate_level] -= 1

    return labels
|
| 93 |
+
|
deepscreen/data/utils/sampler.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Mapping, Iterable
|
| 2 |
+
|
| 3 |
+
from torch.utils.data import BatchSampler, RandomSampler, SequentialSampler
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SafeBatchSampler(BatchSampler):
    """
    A safe `batch_sampler` that skips samples with `None` values, supports shuffling, and keep a fixed batch size.

    Every index drawn from the underlying sampler is looked up in `data_source`; the index is only
    added to a batch if the sample is usable:

    * a mapping is usable if none of its values is `None`;
    * a list or tuple is usable if none of its elements is `None`;
    * any other object is usable if it is not `None` itself.

    Note that each sample is therefore loaded once here in addition to the load done by the
    `DataLoader`, and that `len()` (inherited from `BatchSampler`) does not account for skipped samples.

    Args:
        data_source (Dataset): The dataset to sample from.
        batch_size (int): The size of each batch.
        drop_last (bool): Whether to drop the last batch if its size is smaller than `batch_size`. Defaults to `False`.
        shuffle (bool, optional): Whether to shuffle the data before sampling. Defaults to `True`.
        sampler (Sampler, optional): A sampler to draw indices from; overrides `shuffle` when given.

    Example:
        >>> dataloader = DataLoader(dataset, batch_sampler=SafeBatchSampler(dataset, batch_size, drop_last, shuffle))
    """

    def __init__(self, data_source, batch_size: int, drop_last: bool = False, shuffle: bool = True, sampler=None):
        if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
                batch_size <= 0:
            raise ValueError(f"batch_size should be a positive integer value, but got batch_size={batch_size}")
        if not isinstance(drop_last, bool):
            raise ValueError(f"drop_last should be a boolean value, but got drop_last={drop_last}")

        # Compare with `None` instead of relying on truthiness: a sampler that defines
        # `__len__` and is empty would otherwise be silently replaced.
        if sampler is None:
            if shuffle:
                sampler = RandomSampler(data_source)  # type: ignore[arg-type]
            else:
                sampler = SequentialSampler(data_source)  # type: ignore[arg-type]

        super().__init__(sampler, batch_size, drop_last)
        self.data_source = data_source

    @staticmethod
    def _is_complete(sample) -> bool:
        """Return whether `sample` contains no `None` value (see the class docstring)."""
        if isinstance(sample, Mapping):
            fields = sample.values()
        elif isinstance(sample, (list, tuple)):
            fields = sample
        else:
            # Scalars, strings, arrays/tensors and `None` itself are checked as a whole
            # rather than iterated element by element.
            fields = [sample]
        return all(field is not None for field in fields)

    def __iter__(self):
        """Yield lists of indices of usable samples, each of length `batch_size`.

        The last, shorter batch is yielded only if `drop_last` is false.
        """
        batch = []
        for idx in self.sampler:
            if not self._is_complete(self.data_source[idx]):
                continue
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []

        if batch and not self.drop_last:
            yield batch
|