diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,2957 +1,2961 @@
-import glob
-import smtplib
-from datetime import datetime, timedelta
-import itertools
-import textwrap
-from email.mime.multipart import MIMEMultipart
-from email.mime.text import MIMEText
-from email.utils import formatdate, make_msgid
-from functools import cache
-from math import pi
-from time import sleep, time
-from uuid import uuid4
-
-import io
-import os
-from pathlib import Path
-import sys
-
-import pytz
-from Bio import SeqIO
-from Bio.Align import PairwiseAligner
-from email_validator import validate_email, EmailNotValidError
-import gradio as gr
-import hydra
-import pandas as pd
-from pandarallel import pandarallel
-import requests
-from rdkit.DataStructs import BulkTanimotoSimilarity
-from requests.adapters import HTTPAdapter, Retry
-from markdown import markdown
-from rdkit import Chem
-from rdkit.Chem import AllChem, Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen
-from rdkit.Chem.Features.ShowFeats import _featColors
-from rdkit.Chem.Scaffolds import MurckoScaffold
-import py3Dmol
-
-from bokeh.models import Legend, NumberFormatter, BooleanFormatter, HTMLTemplateFormatter, LegendItem
-from bokeh.palettes import Category20c_20
-from bokeh.plotting import figure
-from bokeh.transform import cumsum
-from bokeh.resources import INLINE
-import seaborn as sns
-import panel as pn
-
-from apscheduler.schedulers.background import BackgroundScheduler
-from tinydb import TinyDB, Query
-
-#import swifter
-from tqdm.auto import tqdm
-
-from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
-from deepscreen.predict import predict
-
-sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
-import sascorer
-
-DATASET_MAX_LEN = 10_240
-SERVER_DATA_DIR = os.getenv('DATA') # '/data'
-DB_EXPIRY = timedelta(hours=48).total_seconds()
-
-CSS = """
-.help-tip {
- position: absolute;
- display: inline-block;
- top: 16px;
- right: 0px;
- text-align: center;
- border-radius: 40%;
- /* border: 2px solid darkred; background-color: #8B0000;*/
- width: 24px;
- height: 24px;
- font-size: 16px;
- line-height: 26px;
- cursor: default;
- transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
- z-index: 100 !important;
-}
-
-.help-tip:hover {
- cursor: pointer;
- /*background-color: #ccc;*/
-}
-
-.help-tip:before {
- content: '?';
- font-weight: 700;
- color: #8B0000;
- z-index: 100 !important;
-}
-
-.help-tip p {
- visibility: hidden;
- opacity: 0;
- text-align: left;
- background-color: #EFDDE3;
- padding: 20px;
- width: 300px;
- position: absolute;
- border-radius: 4px;
- right: -4px;
- color: #494F5A;
- font-size: 13px;
- line-height: normal;
- transform: scale(0.7);
- transform-origin: 100% 0%;
- transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
- z-index: 100;
-}
-
-.help-tip:hover p {
- cursor: default;
- visibility: visible;
- opacity: 1;
- transform: scale(1.0);
-}
-
-.help-tip p:before {
- position: absolute;
- content: '';
- width: 0;
- height: 0;
- border: 6px solid transparent;
- border-bottom-color: #EFDDE3;
- right: 10px;
- top: -12px;
-}
-
-.help-tip p:after {
- width: 100%;
- height: 40px;
- content: '';
- position: absolute;
- top: -5px;
- left: 0;
- z-index: 101;
-}
-
-.upload_button {
- background-color: #008000;
-}
-
-.absolute {
- position: absolute;
-}
-
-.example {
-    padding: 0;
-    background: none;
-    border: none;
-    text-decoration: underline;
-    box-shadow: none;
-    text-align: left !important;
-    display: inline-block !important;
-}
-
-footer {
-    visibility: hidden;
-}
-"""
-
-
-class View3DmolCell(py3Dmol.view):
- def __init__(self, width=320, height=200):
- divid = "3dmolviewer_UNIQUEID"
- self.uniqueid = None
- if isinstance(width, int):
- width = '%dpx' % width
- if isinstance(height, int):
- height = '%dpx' % height
-        self.startjs = '''...'''  # full HTML/JS viewer template elided in this diff
-
-
-def HelpTip(text):
-    # Renders the hover help tooltip styled by the .help-tip CSS rules above
-    return gr.HTML(
-        f'<div class="help-tip"><p>{text}</p></div>',
-    )
-
-
-TASK_MAP = {
- 'Compound-Protein Interaction': 'DTI',
- 'Compound-Protein Binding Affinity': 'DTA',
-}
-
-TASK_METRIC_MAP = {
- 'DTI': 'AUROC',
- 'DTA': 'CI',
- 'Compound-Protein Interaction': 'AUROC',
- 'Compound-Protein Binding Affinity': 'CI',
- 'CPI': 'DTI',
- 'CPA': 'DTA',
-}
-
-PRESET_MAP = {
- 'DeepDTA': 'deep_dta',
- 'DeepConvDTI': 'deep_conv_dti',
- 'GraphDTA': 'graph_dta',
- 'MGraphDTA': 'm_graph_dta',
- 'HyperAttentionDTI': 'hyper_attention_dti',
- 'MolTrans': 'mol_trans',
- 'TransformerCPI': 'transformer_cpi',
- 'TransformerCPI2': 'transformer_cpi_2',
- 'DrugBAN': 'drug_ban',
- 'DrugVQA-Seq': 'drug_vqa'
-}
-
-TARGET_FAMILY_MAP = {
- 'General': 'general',
- 'Kinase': 'kinase',
- 'Non-Kinase Enzyme': 'non_kinase_enzyme',
- 'Membrane Receptor': 'membrane_receptor',
- 'Nuclear Receptor': 'nuclear_receptor',
- 'Ion Channel': 'ion_channel',
- 'Others': 'others',
- # 'general': 'general',
- # 'kinase': 'kinase',
- # 'non-kinase enzyme': 'non_kinase_enzyme',
- # 'membrane receptor': 'membrane_receptor',
- # 'nuclear Receptor': 'nuclear_receptor',
- # 'ion channel': 'ion_channel',
- # 'others': 'others',
-}
-
-TARGET_LIBRARY_MAP = {
- 'DrugBank (Human)': 'drugbank_targets.csv',
- 'ChEMBL33 (Human)': 'ChEMBL33_human_proteins.csv',
-}
-
-DRUG_LIBRARY_MAP = {
- 'DrugBank (Human)': 'drugbank_compounds.csv',
- 'Drug Repurposing Hub': 'drug_repurposing_hub.csv',
- 'Enamine Discovery Diversity Set (DDS-10)': 'Enamine_Discovery_Diversity_Set_10_10240cmpds_20240130.csv',
- 'Enamine Phenotypic Screening Library (PSL-5760)': 'Enamine_Phenotypic_Screening_Library_plated_5760cmds_2020_07_20.csv'
-}
-
-COLUMN_ALIASES = {
- 'X1': 'Compound SMILES',
- 'X2': 'Target FASTA',
- 'ID1': 'Compound ID',
- 'ID2': 'Target ID',
- 'Y': 'Actual CPI/CPA',
- 'Y^': 'Predicted CPI/CPA',
-}
-
-DRUG_SCRENN_CPI_OPTS = [
- 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
- 'Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target',
- 'Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound',
-]
-
-DRUG_SCRENN_CPA_OPTS = [
- 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
-]
-
-TARGET_IDENTIFY_CPI_OPTS = [
- 'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
- 'Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound',
- 'Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target',
-]
-
-TARGET_IDENTIFY_CPA_OPTS = [
- 'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
-]
-
-pd.set_option('display.float_format', '{:.3f}'.format)
-PandasTools.molRepresentation = 'svg'
-PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
-PandasTools.drawOptions.clearBackground = False
-PandasTools.drawOptions.bondLineWidth = 1
-PandasTools.drawOptions.explicitMethyl = True
-PandasTools.drawOptions.singleColourWedgeBonds = True
-PandasTools.drawOptions.useCDKAtomPalette()
-PandasTools.molSize = (100, 64)
-
-
-def remove_job_record(job_id):
- # Delete the job from the database
-    Job = Query()
-    db.remove(Job.id == job_id)
- # Delete the corresponding files
- files = glob.glob(f"{SERVER_DATA_DIR}/{job_id}*")
- for file_path in files:
- if os.path.exists(file_path):
- os.remove(file_path)
-
-
-def check_expiry():
- Job = Query()
- jobs = db.all()
-
- for job in jobs:
- # Check if the job has expired
- if job['status'] != 'RUNNING':
- expiry_time = job['expiry_time'] if job['expiry_time'] is not None else job['start_time'] + DB_EXPIRY
- if expiry_time < time():
- # Delete the job from the database
- db.remove(Job.id == job['id'])
- # Delete the corresponding file
- files = glob.glob(f"{SERVER_DATA_DIR}/{job['id']}*")
- for file_path in files:
- if os.path.exists(file_path):
- os.remove(file_path)
- elif job['status'] == 'RUNNING' and time() - job['start_time'] > 4 * 60 * 60: # 4 hours
- # Mark the job as failed
- db.update({'status': 'FAILED',
- 'error': 'Job has timed out by exceeding the maximum running time of 4 hours.'},
- Job.id == job['id'])
- if job.get('email'):
- send_email(job)
-
-
-def smiles_to_ecfp(smiles):
- mol = Chem.MolFromSmiles(smiles)
- if mol:
- ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
- else:
- ecfp = []
- return ecfp
-
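-# Illustrative usage of smiles_to_ecfp (comment-only sketch; the SMILES inputs are
-# assumed examples, not app data):
-#   fp = smiles_to_ecfp('CC(=O)Oc1ccccc1C(=O)O')         # aspirin -> 2048-bit ECFP4
-#   BulkTanimotoSimilarity(fp, [smiles_to_ecfp('CCO')])  # -> [similarity in [0, 1]]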
-
-def max_tanimoto_similarity(smi, seen_smiles_with_fp):
- if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
- return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
-
- if smi in seen_smiles_with_fp['X1'].values:
- compound = smi
- if 'ID1' in seen_smiles_with_fp.columns:
- id1 = seen_smiles_with_fp.loc[seen_smiles_with_fp['X1'] == smi, 'ID1'].values[0]
- if pd.notnull(id1) and id1 != '':
- compound = id1
- return {'Max. Tanimoto Similarity': 1, 'Max. Tanimoto Similarity Compound': compound}
-
- mol = Chem.MolFromSmiles(smi)
- if mol is None:
- return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
-
- mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
- sims = pd.Series(BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'].values)).to_numpy()
- idx = sims.argmax()
- compound = seen_smiles_with_fp.iloc[idx]['X1']
- if 'ID1' in seen_smiles_with_fp.columns:
- id1 = seen_smiles_with_fp.iloc[idx]['ID1']
- if pd.notnull(id1) and id1 != '':
- compound = id1
-
- return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
-
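-# Note: Tanimoto similarity of two fingerprint bit vectors A and B is
-# |A AND B| / |A OR B|, so 1.0 above indicates an exact (or fingerprint-identical)
-# match against the seen-compound table.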
-
-def alignment_score(query, target):
- aligner = PairwiseAligner()
- aligner.mode = 'local'
- alignment = aligner.align(query, target)
- return alignment.score / max(len(query), len(target))
-
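-# alignment_score normalizes the local alignment score by the longer sequence length;
-# with PairwiseAligner's default scoring (match = 1, no mismatch or gap rewards), this
-# approximates the fraction of identical residues, i.e. a sequence identity in [0, 1].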
-
-def max_sequence_identity(seq, seen_fastas):
- if seq is None or seen_fastas is None or seen_fastas.empty:
- return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}
-
- if seq in seen_fastas['X2'].values:
- target = seq
- if 'ID2' in seen_fastas.columns:
- id2 = seen_fastas.loc[seen_fastas['X2'] == seq, 'ID2'].values[0]
- if pd.notnull(id2) and id2 != '':
- target = id2
- return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
-
- cached_alignment_score = cache(alignment_score)
- max_iden = 0
- target = None
- for fasta in seen_fastas['X2'].values:
- identity = cached_alignment_score(seq, fasta)
-
- if identity > max_iden:
- max_iden = identity
- target = fasta
- if 'ID2' in seen_fastas.columns:
- id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
- if pd.notnull(id2) and id2 != '':
- target = id2
- if max_iden == 1:
- break
-
- cached_alignment_score.cache_clear()
- return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
-
-
-def get_seen_smiles(family, task):
- if family == 'General':
- family = 'all_families_full'
- else:
- family = TARGET_FAMILY_MAP[family.title()]
- seen_smiles = pd.read_csv(
- f'data/benchmarks/seen_compounds/{family}_{task.lower()}_random_split.csv')
- return seen_smiles
-
-
-def get_seen_fastas(family, task):
- if family == 'General':
- family = 'all_families_full'
- else:
- family = TARGET_FAMILY_MAP[family.title()]
- seen_fastas = pd.read_csv(
- f'data/benchmarks/seen_targets/{family}_{task.lower()}_random_split.csv')
- return seen_fastas
-
-
-@cache
-def get_fasta_family_map():
- usecols = ['X2', 'ID2', 'Target Family']
- fasta_family_map = pd.concat([
- pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv', usecols=usecols),
- pd.read_csv('data/target_libraries/idmapping_not_in_chembl.csv', usecols=usecols)
- ]).drop_duplicates(subset=['X2'], keep='first')
- return fasta_family_map
-
-
-def lipinski(mol):
- """
- Lipinski's rules:
- Hydrogen bond donors <= 5
- Hydrogen bond acceptors <= 10
- Molecular weight <= 500 daltons
- logP <= 5
- """
- return (
- Lipinski.NumHDonors(mol) <= 5 and
- Lipinski.NumHAcceptors(mol) <= 10 and
- Descriptors.MolWt(mol) <= 500 and
- Crippen.MolLogP(mol) <= 5
- )
-
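-# Illustrative check (assumed example molecule, not app data):
-#   lipinski(Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O'))  # aspirin -> True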
-
-def reos(mol):
- """
- Rapid Elimination Of Swill filter:
- Molecular weight between 200 and 500
- LogP between -5.0 and +5.0
- H-bond donor count between 0 and 5
- H-bond acceptor count between 0 and 10
- Formal charge between -2 and +2
- Rotatable bond count between 0 and 8
- Heavy atom count between 15 and 50
- """
- return (
- 200 <= Descriptors.MolWt(mol) <= 500 and
- -5.0 <= Crippen.MolLogP(mol) <= 5.0 and
- 0 <= Lipinski.NumHDonors(mol) <= 5 and
- 0 <= Lipinski.NumHAcceptors(mol) <= 10 and
- -2 <= rdmolops.GetFormalCharge(mol) <= 2 and
- 0 <= rdMolDescriptors.CalcNumRotatableBonds(mol) <= 8 and
- 15 <= rdMolDescriptors.CalcNumHeavyAtoms(mol) <= 50
- )
-
-
-def ghose(mol):
- """
- Ghose drug like filter:
- Molecular weight between 160 and 480
- LogP between -0.4 and +5.6
- Atom count between 20 and 70
- Molar refractivity between 40 and 130
- """
- return (
- 160 <= Descriptors.MolWt(mol) <= 480 and
- -0.4 <= Crippen.MolLogP(mol) <= 5.6 and
- 20 <= rdMolDescriptors.CalcNumAtoms(mol) <= 70 and
- 40 <= Crippen.MolMR(mol) <= 130
- )
-
-
-def veber(mol):
- """
- The Veber filter is a rule of thumb filter for orally active drugs described in
- Veber et al., J Med Chem. 2002; 45(12): 2615-23.:
- Rotatable bonds <= 10
- Topological polar surface area <= 140
- """
- return (
- rdMolDescriptors.CalcNumRotatableBonds(mol) <= 10 and
- rdMolDescriptors.CalcTPSA(mol) <= 140
- )
-
-
-def rule_of_three(mol):
- """
- Rule of Three filter (Congreve et al., Drug Discov. Today. 8 (19): 876–7, (2003).):
- Molecular weight <= 300
- LogP <= 3
- H-bond donor <= 3
- H-bond acceptor count <= 3
- Rotatable bond count <= 3
- """
- return (
- Descriptors.MolWt(mol) <= 300 and
- Crippen.MolLogP(mol) <= 3 and
- Lipinski.NumHDonors(mol) <= 3 and
- Lipinski.NumHAcceptors(mol) <= 3 and
- rdMolDescriptors.CalcNumRotatableBonds(mol) <= 3
- )
-
-
-@cache
-def load_smarts_patterns(smarts_path):
- # Load the CSV file containing SMARTS patterns
- smarts_df = pd.read_csv(Path(smarts_path))
- # Convert all SMARTS patterns to molecules
- smarts_mols = [Chem.MolFromSmarts(smarts) for smarts in smarts_df['smarts']]
- return smarts_mols
-
-
-def smarts_filter(mol, smarts_mols):
- for smarts_mol in smarts_mols:
- if smarts_mol is not None and mol.HasSubstructMatch(smarts_mol):
- return False
- return True
-
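-# smarts_filter returns False as soon as any SMARTS pattern matches, so each wrapper
-# below (pains, mlsmr, dundee, glaxo, bms) returns True only for compounds that pass
-# the corresponding structural-alert screen.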
-
-def pains(mol):
- smarts_mols = load_smarts_patterns("data/filters/pains.csv")
- return smarts_filter(mol, smarts_mols)
-
-
-def mlsmr(mol):
- smarts_mols = load_smarts_patterns("data/filters/mlsmr.csv")
- return smarts_filter(mol, smarts_mols)
-
-
-def dundee(mol):
- smarts_mols = load_smarts_patterns("data/filters/dundee.csv")
- return smarts_filter(mol, smarts_mols)
-
-
-def glaxo(mol):
- smarts_mols = load_smarts_patterns("data/filters/glaxo.csv")
- return smarts_filter(mol, smarts_mols)
-
-
-def bms(mol):
- smarts_mols = load_smarts_patterns("data/filters/bms.csv")
- return smarts_filter(mol, smarts_mols)
-
-
-SCORE_MAP = {
- 'SAscore': sascorer.calculateScore,
- 'LogP': Crippen.MolLogP,
- 'Molecular Weight': Descriptors.MolWt,
- 'Number of Atoms': rdMolDescriptors.CalcNumAtoms,
- 'Number of Heavy Atoms': rdMolDescriptors.CalcNumHeavyAtoms,
- 'Molar Refractivity': Crippen.MolMR,
- 'H-Bond Donor Count': Lipinski.NumHDonors,
- 'H-Bond Acceptor Count': Lipinski.NumHAcceptors,
- 'Rotatable Bond Count': rdMolDescriptors.CalcNumRotatableBonds,
- 'Topological Polar Surface Area': rdMolDescriptors.CalcTPSA,
-}
-
-FILTER_MAP = {
- # TODO support number_of_violations
- 'REOS': reos,
- "Lipinski's Rule of Five": lipinski,
- 'Ghose': ghose,
- 'Rule of Three': rule_of_three,
- 'Veber': veber,
- 'PAINS': pains,
- 'MLSMR': mlsmr,
- 'Dundee': dundee,
- 'Glaxo': glaxo,
- 'BMS': bms,
-}
-
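-# Illustrative use of the two maps above (comment-only sketch; the SMILES is an
-# assumed example):
-#   mol = Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O')
-#   {name: fn(mol) for name, fn in FILTER_MAP.items()}  # filter name -> True/False
-#   {name: fn(mol) for name, fn in SCORE_MAP.items()}   # score name -> numeric value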
-
-def validate_columns(df, mandatory_cols):
- missing_cols = [col for col in mandatory_cols if col not in df.columns]
- if missing_cols:
- error_message = (f"The following mandatory columns are missing "
- f"in the uploaded dataset: {str(mandatory_cols).strip('[]')}.")
- raise ValueError(error_message)
- else:
- return
-
-
-def process_target_fasta(sequence):
- try:
- if sequence:
- lines = sequence.strip().split("\n")
- if lines[0].startswith(">"):
- lines = lines[1:]
- return ''.join(lines).split(">")[0].strip()
- # record = list(SeqIO.parse(io.StringIO(sequence), "fasta"))[0]
- # return str(record.seq)
- else:
- raise ValueError('Empty FASTA sequence.')
- except Exception as e:
- raise gr.Error(f'Failed to process FASTA due to error: {str(e)}')
-
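-# Illustrative behavior of process_target_fasta (assumed example input):
-#   process_target_fasta('>sp|Q16539|MK14_HUMAN\nMSQERPTFYR\nQELNKTIWEV')
-#   # -> 'MSQERPTFYRQELNKTIWEV' (header dropped, lines joined, first record only)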
-
-def send_email(job_info):
- if job_info.get('email'):
- try:
- email_info = job_info.copy()
- email_serv = os.getenv('EMAIL_SERV')
- email_port = os.getenv('EMAIL_PORT')
- email_addr = os.getenv('EMAIL_ADDR')
- email_pass = os.getenv('EMAIL_PASS')
- email_form = os.getenv('EMAIL_FORM')
- email_subj = os.getenv('EMAIL_SUBJ')
-
- for key, value in email_info.items():
- if key.endswith("time") and value:
- email_info[key] = ts_to_str(value, get_timezone_by_ip(email_info['ip']))
-
- server = smtplib.SMTP(email_serv, int(email_port))
- # server.starttls()
-
- server.login(email_addr, email_pass)
- msg = MIMEMultipart("alternative")
- msg["From"] = email_addr
- msg["To"] = email_info['email']
- msg["Subject"] = email_subj.format(**email_info)
- msg["Date"] = formatdate(localtime=True)
- msg["Message-ID"] = make_msgid()
-
- msg.attach(MIMEText(markdown(email_form.format(**email_info)), 'html'))
- msg.attach(MIMEText(email_form.format(**email_info), 'plain'))
-
- server.sendmail(email_addr, email_info['email'], msg.as_string())
- server.quit()
- gr.Info('Email notification sent.')
- except Exception as e:
- gr.Warning('Failed to send email notification due to error: ' + str(e))
-
-
-def check_user_running_job(email, request):
- message = ("You already have a running prediction job (ID: {id}) under this {reason}. "
- "Please wait for it to complete before submitting another job.")
- try:
- # check if a job is running for the email
- Job = Query()
- if email:
- job = db.search((Job.email == email) & (Job.status == "RUNNING"))
- if job:
- return message.format(id=job[0]['id'], reason="email")
- # check if a job is running for the session
- elif request.cookies:
- for key, value in request.cookies.items():
- job = db.search((Job.cookies[key] == value) & (Job.status == "RUNNING"))
- if job:
- return message.format(id=job[0]['id'], reason="session")
- # check if a job is running for the IP
- else:
- job = db.search((Job.IP == request.client.host) & (Job.status == "RUNNING"))
- if job:
- return message.format(id=job[0]['id'], reason="IP")
-
- return False
- except Exception as e:
- raise gr.Error(f'Failed to validate user running jobs due to error: {str(e)}')
-
-
-def get_timezone_by_ip(ip):
- try:
- data = session.get(f'https://worldtimeapi.org/api/ip/{ip}').json()
- return data['timezone']
- except Exception:
- return 'UTC'
-
-
-def ts_to_str(timestamp, timezone):
- # Create a timezone-aware datetime object from the UNIX timestamp
- dt = datetime.fromtimestamp(timestamp, pytz.utc)
-
- # Convert the timezone-aware datetime object to the target timezone
- target_timezone = pytz.timezone(timezone)
- localized_dt = dt.astimezone(target_timezone)
-
- # Format the datetime object to the specified string format
- return localized_dt.strftime('%Y-%m-%d %H:%M:%S (%Z%z)')
-
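-# Illustrative conversion (assumed example values):
-#   ts_to_str(0, 'UTC')  # -> '1970-01-01 00:00:00 (UTC+0000)'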
-
-def lookup_job(job_id):
- gr.Info('Start querying the job database...')
- stop = False
- retry = 0
- while not stop:
- try:
- sleep(5)
- Job = Query()
- jobs = db.search((Job.id == job_id))
- if jobs:
- job = jobs[0]
- job_status = job['status']
- job_type = job['type']
-                error = job.get('error')
- start_time = ts_to_str(job['start_time'], get_timezone_by_ip(job['ip']))
- if job.get('end_time'):
- end_time = ts_to_str(job['end_time'], get_timezone_by_ip(job['ip']))
- if job.get('expiry_time'):
- expiry_time = ts_to_str(job['expiry_time'], get_timezone_by_ip(job['ip']))
- if job_status == "RUNNING":
- yield {
- pred_lookup_status: f'''
-Your **{job_type}** job (ID: **{job_id}**) started at
-**{start_time}** and is **RUNNING...**
-
-It may take from a few minutes up to a few hours depending on the prediction dataset, the model, and the queue status.
-You may keep the page open and wait for job completion, or close the page and revisit later to look up the job status
-using the job ID. You will also receive an email notification once the job is done, if you provided an email address.
-''',
- pred_lookup_btn: gr.Button(visible=False),
- pred_lookup_stop_btn: gr.Button(visible=True)
- }
- if job_status == "COMPLETED":
- stop = True
- msg = f"Your {job_type} job (ID: {job_id}) has been **COMPLETED**"
- msg += f" at {end_time}" if job.get('end_time') else ""
- msg += f" and the results will expire by {expiry_time}." if job.get('expiry_time') else "."
- msg += f' Redirecting to the report page...'
-
- gr.Info(msg)
- yield {
- pred_lookup_status: msg,
- pred_lookup_btn: gr.Button(visible=True),
- pred_lookup_stop_btn: gr.Button(visible=False),
- tabs: gr.Tabs(selected='Chemical Property Report'),
- file_for_report: job['output_file']
- }
- if job_status == "FAILED":
- stop = True
- msg = f'Your {job_type} job (ID: {job_id}) has **FAILED**'
- msg += f' at {end_time}' if job.get('end_time') else ''
-                        msg += f' due to error: {error}.' if error else '.'
- gr.Info(msg)
- yield {
- pred_lookup_status: msg,
- pred_lookup_btn: gr.Button(visible=True),
- pred_lookup_stop_btn: gr.Button(visible=False),
- tabs: gr.Tabs(selected='Prediction Status Lookup'),
- }
- else:
- stop = (retry > 3)
- if not stop:
- msg = f'Job ID {job_id} not found. Retrying... ({retry})'
- else:
- msg = f'Job ID {job_id} not found after {retry} retries. Please check the job ID and try again.'
- gr.Info(msg)
- retry += 1
- yield {
- pred_lookup_status: msg,
- pred_lookup_btn: gr.Button(visible=True),
- pred_lookup_stop_btn: gr.Button(visible=False),
- tabs: gr.Tabs(selected='Prediction Status Lookup'),
- }
-
- except Exception as e:
- raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
-
-
-def apply_advanced_opts(prediction_df, opts, df_training):
- # Advanced options for Drug Hit Screening
- if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
- x2 = prediction_df['X2'].iloc[0]
-
- prediction_df[[
- 'Max. Sequence Identity to Training Targets',
- 'Max. Id. Training Target'
- ]] = pd.Series(max_sequence_identity(x2, df_training))
-
- if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
- x2 = prediction_df['X2'].iloc[0]
- pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
- pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
-
- @cache
- def max_sim(smiles):
- return max_tanimoto_similarity(smiles, pos_compounds_df)
-
- prediction_df[[
- 'Max. Tanimoto Similarity to Known Ligands',
- 'Max. Sim. Ligand'
- ]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
-
- max_sim.cache_clear()
-
- if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
- x2 = prediction_df['X2'].iloc[0]
- prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
-
- @cache
- def max_id(compound):
- pos_targets_df = df_training.loc[df_training['X1'] == compound]
- return max_sequence_identity(x2, pos_targets_df)
-
- prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
- 'Max. Id. Target']] = (
- prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
- )
- prediction_df.drop(['X1^'], axis=1, inplace=True)
-
- max_id.cache_clear()
-
- # Advanced options for Target Protein Identification
- if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
- x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
- if x1 not in df_training['X1'].values:
- df_training['FP'] = df_training['X1'].parallel_apply(smiles_to_ecfp)
-
- prediction_df[[
- 'Max. Tanimoto Similarity to Training Compounds',
- 'Max. Sim. Training Compound'
- ]] = pd.Series(max_tanimoto_similarity(x1, df_training))
-
- if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
- x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
- pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
-
- @cache
- def max_id(fasta):
- return max_sequence_identity(fasta, pos_targets_df)
-
- prediction_df[[
- 'Max. Sequence Identity to Known Targets of Input Compound',
- 'Max. Id. Target'
- ]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
-
- max_id.cache_clear()
-
- if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
- x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
-
- @cache
- def max_sim(fasta):
- pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
- if x1 not in pos_targets_df['X1'].values:
- pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp)
- return max_tanimoto_similarity(x1, pos_targets_df)
-
- prediction_df[[
- 'Max. Tanimoto Similarity to Known Ligands of Identified Target',
- 'Max. Sim. Ligand'
- ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
-
- max_sim.cache_clear()
-
- return prediction_df
-
-
-def submit_predict(predict_filepath, task, preset, target_family, opts, job_info):
- job_id = job_info['id']
- status = job_info['status']
- send_email(job_info)
- db.insert(job_info)
- error = None
- task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
- predictions_file = None
- df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
- df_training['X1^'] = df_training['X1']
- orig_df = pd.read_csv(predict_filepath)
- alignment_df = get_fasta_family_map()
- prediction_df = pd.DataFrame()
-
- @cache
- def detect_family(query):
- # Check for an exact match first
- exact_match = alignment_df[alignment_df['X2'] == query]
- if not exact_match.empty:
- row = exact_match.iloc[0]
- return row['Target Family']
- # If no exact match, then calculate alignment score
- else:
- aligner = PairwiseAligner()
- aligner.mode = 'local'
-
- def align_score(target):
- alignment = aligner.align(query, target)
- return alignment.score / max(len(query), len(target))
-
- alignment_df['score'] = alignment_df['X2'].apply(align_score)
- row = alignment_df.loc[alignment_df['score'].idxmax()]
- return row['Target Family']
-
- if 'Target Family' not in orig_df.columns:
- orig_df['Target Family'] = None
- if orig_df['Target Family'].isna().any():
- orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
- orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
- )
- orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
- detect_family.cache_clear()
-
- orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize)
-
- orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
- annotated_df = orig_df[~orig_df['Y'].isna()].copy()
- annotated_df.rename(columns={'Y': 'Y^'}, inplace=True)
- annotated_df['Source'] = 'Database'
- columns_to_drop = ['X1^', 'Compound', 'Scaffold', 'Scaffold SMILES']
- columns_to_drop = [col for col in columns_to_drop if col in annotated_df.columns]
- annotated_df.drop(columns_to_drop, axis=1, inplace=True)
-
- # Save the unannotated data
- unannotated_df = orig_df[orig_df['Y'].isna()].drop(['Y'], axis=1)
- if not unannotated_df.empty:
- unannotated_df.to_csv(predict_filepath, index=False, na_rep='')
-    else:
-        # All input pairs are already annotated in the training database, so skip prediction
-        # and write the annotations out directly (file name pattern assumed, mirroring the
-        # predictions file written below).
-        predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_annotated_predictions.csv'
-        annotated_df.to_csv(predictions_file, index=False, na_rep='')
-        status = "COMPLETED"
-        return {run_state: False}
-
- columns_to_drop = ['ID1', 'X1^', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^']
- columns_to_drop = [col for col in columns_to_drop if col in orig_df.columns]
- orig_df.drop(columns_to_drop, axis=1, inplace=True)
-
- try:
- if target_family != 'Family-Specific Auto-Recommendation':
- target_family_value = TARGET_FAMILY_MAP[target_family.title()]
- task_value = TASK_MAP[task]
- preset_value = PRESET_MAP[preset]
- predictions_file = (f'{SERVER_DATA_DIR}/'
- f'{job_id}_{task_file_abbr[task]}_{preset}_{target_family_value}_predictions.csv')
-
- cfg = hydra.compose(
- config_name="webserver_inference",
- overrides=[f"task={task_value}",
- f"preset={preset_value}",
- f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family_value}.ckpt",
- f"data.data_file='{str(predict_filepath)}'"])
-
- predictions, _ = predict(cfg)
- predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
- predictions['Source'] = f'Predicted ({preset} {target_family})'
- df_list = [prediction_df, predictions]
- prediction_df = pd.concat([df for df in df_list if not df.empty])
-
- else:
- predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_family-recommended_predictions.csv'
- task_value = TASK_MAP[task]
- score = TASK_METRIC_MAP[task]
- benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
- predict_df = pd.read_csv(predict_filepath)
-
- for family, subset in predict_df.groupby('Target Family'):
- predict_subset_filepath = os.path.join(
- os.path.dirname(predict_filepath), f'{job_id}_{family}_input.csv'
- )
- subset.to_csv(predict_subset_filepath, index=False, na_rep='')
-
- seen_compounds = get_seen_smiles(family, task_value)['X1'].values
- if subset['X1^'].iloc[0] in seen_compounds:
- scenario = "Seen Compound"
- else:
- scenario = "Unseen Compound"
-
- filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
- & (benchmark_df['Scenario'] == scenario)
- & (benchmark_df['Type'] == 'Family')]
-
- seen_compounds = get_seen_smiles('General', task_value)['X1'].values
- if subset['X1^'].iloc[0] in seen_compounds:
- scenario = "Seen Compound"
- else:
- scenario = "Unseen Compound"
-
- filtered_df = pd.concat([
- filtered_df,
- benchmark_df[(benchmark_df['Family'] == family.title())
- & (benchmark_df['Scenario'] == scenario)
- & (benchmark_df['Type'] == 'General')]
- ])
-
- row = filtered_df.loc[filtered_df[score].idxmax()]
- preset_value = PRESET_MAP[row['Model']]
- target_family = TARGET_FAMILY_MAP[family.title()] if row['Type'] == 'Family' else 'general'
- cfg = hydra.compose(
- config_name="webserver_inference",
- overrides=[f"task={task_value}",
- f"preset={preset_value}",
- f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
- f"data.data_file='{str(predict_subset_filepath)}'"])
-
- predictions, _ = predict(cfg)
- predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
- predictions['Source'] = (f'Predicted ({row["Model"]} '
- f'{family.title() if row["Type"] == "Family" else "General"})')
- df_list = [prediction_df, predictions]
- prediction_df = pd.concat([df for df in df_list if not df.empty])
-
- prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
- df_list = [prediction_df, annotated_df]
- prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
-
- prediction_df = apply_advanced_opts(prediction_df, opts, df_training)
-
- prediction_df.drop(['N', 'FP'], axis=1, errors='ignore').to_csv(predictions_file, index=False, na_rep='')
- status = 'COMPLETED'
-
- return {run_state: False}
-
- except Exception as e:
- gr.Warning(f"Prediction job failed due to error: {str(e)}")
- status = "FAILED"
- predictions_file = None
- error = str(e)
- return {run_state: False}
-
- finally:
- Job = Query()
- job_query = (Job.id == job_id)
-
- end_time = time()
- expiry_time = end_time + DB_EXPIRY
-
- db.update({'end_time': end_time,
- 'expiry_time': expiry_time,
- 'status': status,
- 'error': error,
- 'input_file': predict_filepath,
- 'output_file': predictions_file},
- job_query)
- if job_info := db.search(job_query)[0]:
- if job_info.get('email'):
- send_email(job_info)
-
-
-def update_df(file, progress=gr.Progress(track_tqdm=True)):
- if file and Path(file).is_file():
-        task = None
-        job = None
-        opts = []
-
- if "_CPI_" in str(file):
- task = 'Compound-Protein Interaction'
- elif "_CPA_" in str(file):
- task = 'Compound-Protein Binding Affinity'
-
- df = pd.read_csv(file)
-
- if 'N' in df.columns:
- df.set_index('N', inplace=True)
-
- if not any(col in ['X1', 'X2'] for col in df.columns):
- gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
- return {analyze_btn: gr.Button(interactive=False)}
-
- if 'X1' in df.columns:
- if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
- df['Compound'] = df['X1'].parallel_apply(
- lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
- df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
- df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
- df['Pharmacophore'] = None
- if task == 'Compound-Protein Binding Affinity':
-                # Convert Y^ from pIC50 to IC50 in nM: pIC50 = -log10(IC50 [M]), so IC50 [nM] = 10 ** (9 - pIC50)
- if 'Y^' in df.columns:
- df['Y^'] = 10 ** (-df['Y^']) * 1e9
-
- n_compound = df['X1'].nunique()
- n_protein = df['X2'].nunique()
-
- if n_compound == 1 and n_protein >= 2:
- job = 'Target Protein Identification'
- if task == 'Compound-Protein Interaction':
- opts = TARGET_IDENTIFY_CPI_OPTS
- elif task == 'Compound-Protein Binding Affinity':
- opts = TARGET_IDENTIFY_CPA_OPTS
- if n_compound >= 2 and n_protein == 1:
- job = 'Drug Hit Screening'
- if task == 'Compound-Protein Interaction':
- opts = DRUG_SCRENN_CPI_OPTS
- elif task == 'Compound-Protein Binding Affinity':
- opts = DRUG_SCRENN_CPA_OPTS
-
- return {
- html_report: create_html_report(df, file=None, task=task),
- raw_df: df,
- report_df: df.copy(),
- analyze_btn: gr.Button(interactive=True),
- report_task: task,
-            job_opts: gr.CheckboxGroup(
-                label=f'{job} Advanced Options',
-                choices=opts, visible=True
-            ) if job and opts else gr.CheckboxGroup(visible=False),
- }
- else:
- return {analyze_btn: gr.Button(interactive=False)}
-
-
-def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(track_tqdm=True)):
- df_html = df.copy(deep=True)
- column_aliases = COLUMN_ALIASES.copy()
- cols_left = list(pd.Index([
- 'ID1', 'ID2', 'Compound', 'Scaffold', 'Pharmacophore', 'X1', 'Scaffold SMILES', 'X2', 'Y^'
- ]).intersection(df_html.columns))
- # cols_right = list(pd.Index(['X1', 'X2']).intersection(df_html.columns))
- # df_html = df_html[cols_left + (df_html.columns.drop(cols_left + cols_right).tolist()) + cols_right]
- df_html = df_html[cols_left + df_html.columns.drop(cols_left).tolist()]
-
- if isinstance(task, str):
- column_aliases.update({
- 'Y^': 'Interaction Probability' if task == 'Compound-Protein Interaction'
- else 'Binding Affinity (IC50 [nM])'
- })
-
- ascending = True if column_aliases['Y^'] == 'Binding Affinity (IC50 [nM])' else False
- df_html = df_html.sort_values(
- [col for col in ['Y^'] if col in df_html.columns], ascending=ascending
- )
-
- if not file:
-        df_html = df_html.iloc[:30]
-
- # Remove repeated info for one-against-N tasks to save visual and physical space
- job = 'Chemical Property'
- unique_entity = 'Unique Entity'
- unique_df = None
- category = None
- columns_unique = None
-
- if 'Exclude Pharmacophore 3D' not in opts:
- df_html['Pharmacophore'] = df_html['Compound'].parallel_apply(
- lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
-
- if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
- df_html['Compound'] = df_html['Compound'].parallel_apply(
- lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
- else:
- df_html.drop(['Compound'], axis=1, inplace=True)
-
- if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
- df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
- lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
- else:
- df_html.drop(['Scaffold'], axis=1, inplace=True)
-
- if 'X1' in df_html.columns and 'X2' in df_html.columns:
- n_compound = df_html['X1'].nunique()
- n_protein = df_html['X2'].nunique()
-
- if n_compound == 1 and n_protein >= 2:
- unique_entity = 'Compound of Interest'
- if any(col in df_html.columns for col in ['Y^', 'Y']):
- job = 'Target Protein Identification'
- category = 'Target Family'
- columns_unique = df_html.columns.isin(
- ['ID1', 'Compound', 'Scaffold', 'X1', 'Scaffold SMILES', 'Pharmacophore',
- 'Max. Tanimoto Similarity to Training Compounds', 'Max. Sim. Training Compound']
- + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys())
- )
-
- elif n_compound >= 2 and n_protein == 1:
- unique_entity = 'Target of Interest'
- if any(col in df_html.columns for col in ['Y^', 'Y']):
- job = 'Drug Hit Screening'
- category = 'Scaffold SMILES'
- columns_unique = df_html.columns.isin(
- ['X2', 'ID2', 'Max. Sequence Identity to Training Targets', 'Max. Id. Training Target']
- )
-
- elif 'Y^' in df_html.columns:
- job = 'Interaction Pair Inference'
-
- df_html.rename(columns=column_aliases, inplace=True)
- df_html.index.name = 'Index'
- if 'Target FASTA' in df_html.columns:
- df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
- lambda x: wrap_text(x) if not pd.isna(x) else x)
-
- num_cols = df_html.select_dtypes('number').columns
- num_col_colors = sns.color_palette('husl', len(num_cols))
- bool_cols = df_html.select_dtypes(bool).columns
- bool_col_colors = {True: 'lightgreen', False: 'lightpink'}
-
- if columns_unique is not None:
- unique_df = df_html.loc[:, columns_unique].iloc[[0]].copy()
- df_html = df_html.loc[:, ~columns_unique]
- df_html.dropna(how='all', axis=1, inplace=True)
- unique_df.dropna(how='all', axis=1, inplace=True)
-
- if not file:
- if 'Compound ID' in df_html.columns:
- df_html.drop(['Compound SMILES'], axis=1, inplace=True)
- if 'Target ID' in df_html.columns:
- df_html.drop(['Target FASTA'], axis=1, inplace=True)
- if 'Target FASTA' in df_html.columns:
- df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
- lambda x: wrap_text(x) if not pd.isna(x) else x)
- if 'Scaffold SMILES' in df_html.columns:
- df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
-
-        # FIXME: Temporarily drop pharmacophore column before an image solution is found
-        if 'Pharmacophore' in df_html.columns:
-            df_html.drop(['Pharmacophore'], axis=1, inplace=True)
-        if unique_df is not None and 'Pharmacophore' in unique_df.columns:
-            unique_df.drop(['Pharmacophore'], axis=1, inplace=True)
-
-        styled_df = df_html.fillna('').style.format(precision=3)
-
-        for i, col in enumerate(num_cols):
-            cmap = sns.light_palette(num_col_colors[i], as_cmap=True)
-            if col in df_html.columns:
-                if col not in ['Binding Affinity (IC50 [nM])']:
-                    cmap.set_bad('white')
-                    styled_df = styled_df.background_gradient(
-                        subset=[col], cmap=cmap)
-                else:
-                    # Reversed colormap: lower IC50 (stronger binding) gets the deeper color
-                    cmap = cmap.reversed()
-                    cmap.set_bad('white')
-                    styled_df = styled_df.background_gradient(
-                        subset=[col], cmap=cmap)
-
-        if any(df_html.columns.isin(bool_cols)):
-            styled_df.applymap(lambda val: f'background-color: {bool_col_colors[val]}', subset=bool_cols)
-
-        table_html = styled_df.to_html()
-        unique_html = ''
-        if unique_df is not None:
-            if 'Target FASTA' in unique_df.columns:
-                unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '<br>')
-
-            if 'Max. Sequence Identity to Training Targets' in unique_df.columns:
-                # Add an alert emoji for sequence identity below 0.85
-                if unique_df['Max. Sequence Identity to Training Targets'].iloc[0] < 0.85:
-                    unique_df['Max. Sequence Identity to Training Targets'] = (
-                        unique_df['Max. Sequence Identity to Training Targets'].apply(
-                            lambda x: f'{x:.3f}'
-                                      f' ⚠️ Lower than recommended (0.85)'
-                                      f' - predictive reliability may be compromised'
-                        )
-                    )
-
-            if 'Max. Tanimoto Similarity to Training Compounds' in unique_df.columns:
-                # Add an alert emoji for Tanimoto similarity below 0.85
-                if unique_df['Max. Tanimoto Similarity to Training Compounds'].iloc[0] < 0.85:
-                    unique_df['Max. Tanimoto Similarity to Training Compounds'] = (
-                        unique_df['Max. Tanimoto Similarity to Training Compounds'].apply(
-                            lambda x: f'{x:.3f}'
-                                      f' ⚠️ Lower than recommended (0.85)'
-                                      f' - predictive reliability may be compromised'
-                        )
-                    )
-
-            if any(unique_df.columns.isin(bool_cols)):
-                unique_df = unique_df.style.applymap(
-                    lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols)
-            unique_html = (f'<div>'
-                           f'{unique_df.to_html(escape=False, index=False)}</div>')
-
-        return (f'<h2>{job} Report Preview (Top 30 Records)</h2>'
-                f'<div>{unique_html}</div>'
-                f'<div>{table_html}</div>')
-
- else:
-        image_zoom_formatter = HTMLTemplateFormatter(
-            template='<div class="image-zoom-viewer"><%= value %></div>')
-        uniprot_id_formatter = HTMLTemplateFormatter(
-            # Link target assumed: UniProt entry page
-            template='<% if (value == value) { '  # Check if value is not NaN
-                     'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
-                     # Check if value is a valid UniProt ID
-                     '{ %><a href="https://www.uniprot.org/uniprotkb/<%= value %>" target="_blank">'
-                     '<%= value %></a><% '
-                     # Else treat it as a sequence or other plain-text string, line-wrapping every 60 characters
-                     '} else { %><%= value.match(/.{1,60}/g).join("<br>") '
-                     '%><% } %><% } else { %><% } %>'  # Output empty string if value is NaN
-        )
-        pubchem_id_formatter = HTMLTemplateFormatter(
-            # Link target assumed: PubChem compound search
-            template='<% if (value == value) { '  # Check if value is not NaN
-                     '%><a href="https://pubchem.ncbi.nlm.nih.gov/#query=<%= value %>" target="_blank">'
-                     '<%= value %></a>'
-                     '<% } else { %><% } %>'  # Output empty string if value is NaN
-        )
-        alert_emoji_formatter = HTMLTemplateFormatter(
-            template='<% if (value < 0.85) { '
-                     '%><%= value %> '
-                     '⚠️ Lower than recommended (0.85) - predictive reliability may be compromised<% '
-                     '} else { %><%= value %><% } %>'
-        )
- bool_formatters = {col: BooleanFormatter() for col in bool_cols}
- float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
- other_formatters = {
- 'Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
- 'Compound': image_zoom_formatter,
- 'Scaffold': image_zoom_formatter,
- 'Pharmacophore': {'type': 'executeScriptFormatter'},
- 'Target FASTA': {'type': 'textarea', 'width': 60},
- 'Target ID': uniprot_id_formatter,
- 'Compound ID': pubchem_id_formatter,
- 'Max. Sim. Ligand': pubchem_id_formatter,
- 'Max. Id. Target': uniprot_id_formatter,
- 'Max. Sim. Training Compound': pubchem_id_formatter,
- 'Max. Id. Training Target': uniprot_id_formatter,
- 'Max. Sequence Identity to Training Targets': alert_emoji_formatter,
- 'Max. Sequence Identity to Known Targets of Hit Compound': alert_emoji_formatter,
- }
- formatters = {**bool_formatters, **float_formatters, **other_formatters}
-
- # html = df.to_html(file)
- # return html
-
- report_table = pn.widgets.Tabulator(
- df_html, formatters=formatters,
- frozen_columns=[
- 'Index', 'Target ID', 'Compound ID', 'Compound'
- ],
- disabled=True, sizing_mode='stretch_both', pagination='local', page_size=10
- )
-
-        for i, col in enumerate(num_cols):
-            if col == 'Interaction Probability':
-                continue  # already rendered with a progress-bar formatter
-            cmap = sns.light_palette(num_col_colors[i], as_cmap=True)
-            if col == 'Binding Affinity (IC50 [nM])':
-                # Reversed colormap: lower IC50 (stronger binding) gets the deeper color
-                cmap = cmap.reversed()
-            cmap.set_bad(color='white')
-            report_table.style.background_gradient(
-                subset=df_html.columns == col, cmap=cmap)
-
- pie_charts = {}
- for y in df_html.columns.intersection(['Interaction Probability', 'Binding Affinity (IC50 [nM])']):
- pie_charts[y] = []
- for k in [10, 30, 100]:
- if k < len(df_html):
- pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=k))
- pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=len(df_html)))
-
- # Remove keys with empty values
- pie_charts = {k: v for k, v in pie_charts.items() if any(v)}
-
- panel_css = """
- .tabulator {
- font-family: Courier New !important;
- font-weight: normal !important;
- font-size: 12px !important;
- }
-
- .tabulator-cell {
- overflow: visible !important;
- align-content: center !important;
- }
-
- .tabulator-cell:hover {
- z-index: 1000 !important;
- }
-
- .image-zoom-viewer {
- display: inline-block;
- overflow: visible;
- z-index: 1000;
- }
-
- .image-zoom-viewer::after {
- content: "";
- top: 0;
- left: 0;
- width: 100%;
- height: 100%;
- pointer-events: none;
- }
-
- .image-zoom-viewer:hover::after {
- pointer-events: all;
- }
-
- /* When hovering over the container, scale its child (the SVG) */
- .tabulator-cell:hover .image-zoom-viewer svg {
- padding: 3px;
- position: absolute;
- background-color: rgba(250, 250, 250, 0.854);
- box-shadow: 0 0 10px rgba(0, 0, 0, 0.618);
- border-radius: 3px;
- transform: scale(3); /* Scale up the SVG */
- transition: transform 0.3s ease;
- pointer-events: none; /* Prevents the SVG from blocking mouse interactions */
- z-index: 1000;
- }
- """
-
- pn.extension(
- raw_css=[panel_css],
- js_files={'panel_custom': 'static/panel.js', '3Dmol': 'static/3Dmol-min.js'},
- # js_modules={'3Dmol': 'static/3Dmol-min.js'},
- inline=True,
- )
-
- template = pn.template.VanillaTemplate(
- title=f'DeepSEQreen {job} Report',
- sidebar=[],
- favicon='deepseqreen.ico',
- logo='deepseqreen.svg',
- header_background='#F3F5F7',
- header_color='#4372c4',
- busy_indicator=None,
- )
-
- stats_pane = pn.Row()
- if unique_df is not None:
- unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width',
- show_index=False, disabled=True,
- frozen_columns=['Compound ID', 'Compound', 'Target ID'])
- # if pie_charts:
- # unique_table.width = 640
- stats_pane.append(pn.Column(f'### {unique_entity}', unique_table))
- if pie_charts:
- for score_name, figure_list in pie_charts.items():
- stats_pane.append(
- pn.Column(f'### {category} by Top {score_name}',
- pn.Tabs(*figure_list, tabs_location='above'))
- # pn.Card(pn.Row(v), title=f'{category} by Top {k}')
- )
-
- if stats_pane:
- template.main.append(pn.Card(stats_pane,
- sizing_mode='stretch_width', title='Summary Statistics', margin=10))
-
- template.main.append(
- pn.Card(report_table, title=f'{job} Results', # width=1200,
- margin=10)
- )
-
- template.save(file, title=f'DeepSEQreen {job} Report', resources=INLINE)
- return file
-
-
-def create_pie_chart(df, category, value, top_k):
- if category not in df or value not in df:
- return
- top_k_df = df.nlargest(top_k, value)
- category_counts = top_k_df[category].value_counts()
- data = pd.DataFrame({category: category_counts.index, 'value': category_counts.values})
-
- data['proportion'] = data['value'] / data['value'].sum()
- # Merge rows with proportion less than 0.2% into one row
- mask = data['proportion'] < 0.002
- if any(mask):
- merged_row = data[mask].sum()
- merged_row[category] = '...'
- data = pd.concat([data[~mask], pd.DataFrame(merged_row).T])
- data['angle'] = data['proportion'] * 2 * pi
-
- color_dict = {cat: color for cat, color in
- zip(df[category].unique(),
- (Category20c_20 * (len(df[category].unique()) // 20 + 1))[:len(df[category].unique())])}
- color_dict['...'] = '#636363'
- data['color'] = data[category].map(color_dict)
-
- tooltips = [
- (f"{category}", f"@{{{category}}}"),
- ("Count", "@value"),
- ("Percentage", "@proportion{0.0%}")
- ]
-
- if category == 'Scaffold SMILES' and 'Scaffold' in df.columns:
- data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
- left_on='Scaffold SMILES', right_on='Scaffold SMILES')
- tooltips.append(("Scaffold", "
@{Scaffold}{safe}
"))
- p = figure(height=384, width=960, name=f"Top {top_k}" if top_k < len(df) else 'All', sizing_mode='stretch_height',
- toolbar_location=None, tools="hover", tooltips=tooltips, x_range=(-0.4, 0.4))
-
- def truncate_label(label, max_length=60):
- return label if len(label) <= max_length else label[:max_length] + "..."
-
- data['legend_field'] = data[category].apply(truncate_label)
-
- p.add_layout(Legend(padding=0, margin=0), 'right')
- p.wedge(x=0, y=1, radius=0.3,
- start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
- line_color="white", fill_color='color', legend_field='legend_field', source=data)
-
- # Limit the number of legend items to 20 and add "..." if there are more than 20 items
- if len(p.legend.items) > 20:
- new_legend_items = p.legend.items[:20]
- new_legend_items.append(LegendItem(label="..."))
- p.legend.items = new_legend_items
-
- p.legend.label_text_font_size = "10pt"
- p.legend.label_text_font = "courier"
- p.axis.axis_label = None
- p.axis.visible = False
- p.grid.grid_line_color = None
- p.outline_line_width = 0
- p.min_border = 0
- p.margin = 0
-
- return p
-
-
-def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progress(track_tqdm=True)):
- df_report = df.copy()
- try:
- for filter_name in filter_list:
- df_report[filter_name] = df_report['Compound'].parallel_apply(
- lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
-
- for score_name in score_list:
- df_report[score_name] = df_report['Compound'].parallel_apply(
- lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
-
- if opt_list:
- df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
- df_report = apply_advanced_opts(df_report, opt_list, df_training)
-
- return (create_html_report(df_report, file=None, task=task), df_report,
- gr.File(visible=False), gr.File(visible=False))
-
- except Exception as e:
- gr.Warning(f'Failed to report results due to error: {str(e)}')
- return None, None, None, None
-
-
-def wrap_text(text, line_length=60):
- if isinstance(text, str):
- wrapper = textwrap.TextWrapper(width=line_length)
- if text.startswith('>'):
- sections = text.split('>')
- wrapped_sections = []
- for section in sections:
- if not section:
- continue
- lines = section.split('\n')
- seq_header = lines[0]
- wrapped_seq = wrapper.fill(''.join(lines[1:]))
- wrapped_sections.append(f">{seq_header}\n{wrapped_seq}")
- return '\n'.join(wrapped_sections)
- else:
- return wrapper.fill(text)
- else:
- return text
-
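-# Illustrative behavior of wrap_text (assumed example input): a 70-residue sequence
-# is wrapped at 60 characters while the '>' header line is preserved:
-#   wrap_text('>sp|Q16539\n' + 'M' * 70)
-#   # -> '>sp|Q16539\n' + 'M' * 60 + '\n' + 'M' * 10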
-
-def unwrap_text(text):
-    return text.strip().replace('\n', '')
-
-
-def drug_library_from_sdf(sdf_path):
- return PandasTools.LoadSDF(
- sdf_path,
- smilesName='X1', molColName='Compound', includeFingerprints=True
- )
-
-
-def process_target_library_upload(library_upload):
- if library_upload.endswith('.csv'):
- df = pd.read_csv(library_upload)
- elif library_upload.endswith('.fasta'):
- df = target_library_from_fasta(library_upload)
- else:
- raise gr.Error('Currently only CSV and FASTA files are supported as target libraries.')
- validate_columns(df, ['X2'])
- return df
-
-
-def process_drug_library_upload(library_upload):
- if library_upload.endswith('.csv'):
- df = pd.read_csv(library_upload)
- elif library_upload.endswith('.sdf'):
- df = drug_library_from_sdf(library_upload)
- else:
- raise gr.Error('Currently only CSV and SDF files are supported as drug libraries.')
- validate_columns(df, ['X1'])
- return df
-
-
-def target_library_from_fasta(fasta_path):
- records = list(SeqIO.parse(fasta_path, "fasta"))
- id2 = [record.id for record in records]
- seq = [str(record.seq) for record in records]
- df = pd.DataFrame({'ID2': id2, 'X2': seq})
- return df
-
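-# Illustrative result of target_library_from_fasta (assumed example file contents):
-# a FASTA with records '>P1\nAAAA' and '>P2\nCCCC' yields
-#   pd.DataFrame({'ID2': ['P1', 'P2'], 'X2': ['AAAA', 'CCCC']})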
-
-theme = gr.themes.Base(spacing_size="sm", text_size='md', font=gr.themes.GoogleFont("Roboto")).set(
- background_fill_primary='#eef3f9',
- background_fill_secondary='white',
- checkbox_label_background_fill='#eef3f9',
- checkbox_label_background_fill_hover='#dfe6f0',
- checkbox_background_color='white',
- checkbox_border_color='#4372c4',
- border_color_primary='#4372c4',
- border_color_accent='#2e6ab5',
- button_primary_background_fill='#2e6ab4',
- button_primary_text_color='white',
- body_text_color='#28496F',
- block_background_fill='#fbfcfd',
- block_title_text_color='#28496F',
- block_label_text_color='#28496F',
- block_info_text_color='#505358',
- block_border_color=None,
- # input_border_color='#4372c4',
- # panel_border_color='#4372c4',
- input_background_fill='#F1F2F4',
-)
-
-with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48 * 3600)) as demo:
- run_state = gr.State(value=False)
- screen_flag = gr.State(value=False)
- identify_flag = gr.State(value=False)
- infer_flag = gr.State(value=False)
-
- with gr.Tabs() as tabs:
- with gr.TabItem(label='Drug Hit Screening', id='Drug Hit Screening'):
-            gr.Markdown('''
-            # Drug Hit Screening
-
-            To predict interactions or binding affinities of a single target against a compound library.
-            ''')
- with gr.Row():
- with gr.Column():
- HelpTip(
- "Enter (paste) a amino acid sequence below manually or upload a FASTA file. "
- "If multiple entities are in the FASTA, only the first will be used. "
- "Alternatively, enter a Uniprot ID or gene symbol with organism and click Query for "
- "the sequence."
- )
- target_input_type = gr.Dropdown(
- label='Step 1. Select Target Input Type and Input',
- choices=['Sequence', 'UniProt ID', 'Gene symbol'],
- info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
- value='Sequence',
- scale=4, interactive=True
- )
-
- with gr.Row():
- target_id = gr.Textbox(show_label=False, visible=False,
- interactive=True, scale=4,
- info='Enter a UniProt ID and query.')
- target_gene = gr.Textbox(
- show_label=False, visible=False,
- interactive=True, scale=4,
- info='Enter a gene symbol and query. The first record will be used.')
- target_organism = gr.Textbox(
- info='Organism scientific name (default: Homo sapiens).',
- placeholder='Homo sapiens', show_label=False,
- visible=False, interactive=True, scale=4, )
- target_upload_btn = gr.UploadButton(label='Upload a FASTA File', type='binary',
- visible=True, variant='primary',
- size='lg')
- target_paste_markdown = gr.Button(value='OR Paste Your Sequence Below',
- variant='secondary')
- target_query_btn = gr.Button(value='Query the Sequence', variant='primary',
- visible=False, scale=4)
- # with gr.Row():
- # example_uniprot = gr.Button(value='Example: Q16539', elem_classes='example', visible=False)
- # example_gene = gr.Button(value='Example: MAPK14', elem_classes='example', visible=False)
- example_fasta = gr.Button(value='Example: MAPK14 (Q16539)', elem_classes='example')
- target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
- # with gr.Row():
- # with gr.Column():
- # with gr.Column():
- # gr.File(label='Example FASTA file',
- # value='data/examples/MAPK14.fasta', interactive=False)
-
- with gr.Row():
- with gr.Column(min_width=200):
- HelpTip(
- "Click Auto-detect to identify the protein family using sequence alignment. "
- "This optional step allows applying a family-specific model instead of a all-family "
- "model (general). "
- "Manually select general if the alignment results are unsatisfactory."
- )
- drug_screen_target_family = gr.Dropdown(
- choices=list(TARGET_FAMILY_MAP.keys()),
- value='General',
- label='Step 2. Select Target Family (Optional)', interactive=True)
- target_family_detect_btn = gr.Button(value='OR Let Us Auto-Detect for You',
- variant='primary')
- with gr.Column(min_width=200):
- HelpTip(
- "Interaction prediction provides you binding probability score between the target of "
- "interest and each compound in the library, "
- "while affinity prediction directly estimates their binding strength measured using "
- "half maximal inhibitory concentration (IC
50) in units of nM."
- )
- drug_screen_task = gr.Dropdown(
- list(TASK_MAP.keys()),
- label='Step 3. Select a Prediction Task',
- value='Compound-Protein Interaction')
- with gr.Column(min_width=200):
- HelpTip(
- "Select your preferred model, or click Recommend for the best-performing model based "
- "on the selected task, family, and whether the target was trained. "
- "Please refer to documentation for detailed benchmark results."
- )
- drug_screen_preset = gr.Dropdown(
- list(PRESET_MAP.keys()),
- label='Step 4. Select a Preset Model')
- screen_preset_recommend_btn = gr.Button(
- value='OR Let Us Recommend for You', variant='primary')
-
- with gr.Row():
- with gr.Column():
- HelpTip(
- "Select a preset compound library (e.g., DrugBank). "
- "Alternatively, upload a CSV file with a column named X1 containing compound SMILES, "
- "or use an SDF file (Max. 10,000 compounds per task). Example CSV and SDF files are "
- "provided below and can be downloaded by clicking the lower right corner."
- )
- drug_library = gr.Dropdown(
- label='Step 5. Select a Preset Compound Library',
- choices=list(DRUG_LIBRARY_MAP.keys()))
- with gr.Row():
- gr.File(label='Example SDF compound library',
- value='data/examples/compound_library.sdf', interactive=False)
- gr.File(label='Example CSV compound library',
- value='data/examples/compound_library.csv', interactive=False)
- drug_library_upload_btn = gr.UploadButton(
- label='OR Upload Your Own Library', variant='primary')
- drug_library_upload = gr.File(label='Custom compound library file', visible=False)
-
- with gr.Column():
- HelpTip("""
-
Max. Sequence Identity between the Input Target and Targets in the Training Set:
-this serves as an indicator of the predictioon applicability/reliability –
-higher similarities indicate more reliable predictions (preferably > 0.85).
-
Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target:
-this serves as an indicator of both the confidence level and novelty of the predicted hit compounds –
-higher similarities suggest greater confidence, while lower Tanimoto similarities may indicate the novelty
-of the identified hit compounds compared to known drugs or true interacting compounds of the input target.
-
Max. Sequence Identity between the Input Target and Known Targets of Hit Compound:
-this serves as an additional indicator of the confidence level of the predicted hit compounds –
-higher identities usually lead to greater confidence in the predictions.
-""")
- drug_screen_opts = gr.CheckboxGroup(
- label="Step 6. Select Advanced Options",
- value=DRUG_SCRENN_CPI_OPTS[0],
- choices=DRUG_SCRENN_CPI_OPTS,
- info="Advanced features - may increase the job computation time. "
- "See the Help Tip on the right or the Documentation for detailed explanation.",
-
- )
- with gr.Row():
- with gr.Column():
- drug_screen_email = gr.Textbox(
- label='Step 7. Input Your Email Address (Optional)',
- info="Your email address will be used to notify you of the status of your job. "
- "If you cannot receive the email, please check your spam/junk folder."
- )
-
- with gr.Row(visible=True):
- with gr.Row():
- drug_screen_clr_btn = gr.ClearButton(size='lg')
- drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
-
- screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
-
- with gr.TabItem(label='Target Protein Identification', id='Target Protein Identification'):
- gr.Markdown('''
- # Target Protein Identification
-
-
- To predict interactions or binding affinities of a single compound against a protein library.
-
- ''')
- with gr.Column() as identify_page:
- with gr.Row():
- with gr.Column():
- HelpTip(
- "Enter (paste) a compound SMILES below manually or upload a SDF file. "
- "If multiple entities are in the SDF, only the first will be used. "
- "SMILES can be obtained by searching for the compound of interest in databases such "
- "as NCBI, PubChem and and ChEMBL."
- )
- compound_type = gr.Dropdown(
- label='Step 1. Select Compound Input Type and Input',
- choices=['SMILES', 'SDF'],
- info='Enter (paste) a SMILES string or upload an SDF file to convert to SMILES.',
- value='SMILES',
- interactive=True)
- compound_upload_btn = gr.UploadButton(
- label='OR Upload an SDF File', variant='primary', type='binary', visible=False)
-
- compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
- example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
-
- with gr.Row():
- with gr.Column(visible=True):
- HelpTip(
- "By default, models trained on all protein families (general) will be applied. "
- "If you upload a target library containing proteins all in the same family, "
- "you may manually select a Target Family."
- )
- # target_identify_target_family = gr.Dropdown(
- # choices=['Family-Specific Auto-Recommendation'] + list(TARGET_FAMILY_MAP.keys()),
- # value='Family-Specific Auto-Recommendation',
- # label='Step 2. Select Target Family')
- target_identify_target_family = gr.Dropdown(
- choices=['General'],
- value='General',
- label='Step 2. Select Target Family')
- with gr.Column():
- HelpTip(
- "Interaction prediction provides you binding probability score between the target of "
- "interest and each compound in the library, while affinity prediction directly "
- "estimates their binding strength measured using "
- "half maximal inhibitory concentration (IC
50) in units of nM."
- )
- target_identify_task = gr.Dropdown(
- list(TASK_MAP.keys()),
- label='Step 3. Select a Prediction Task',
- value='Compound-Protein Interaction')
-
- with gr.Column():
- HelpTip(
- "Select your preferred model, or click Recommend for the best-performing model based "
- "on the selected task and whether the compound was trained. By default, General-trained "
- "model is used for Target Protein Identification. "
- "Please refer to the documentation for detailed benchmark results."
- )
- # target_identify_preset = gr.Dropdown(
- # choices=['Family-Specific Auto-Recommendation'] + list(PRESET_MAP.keys()),
- # value='Family-Specific Auto-Recommendation',
- # label='Step 4. Select a Preset Model')
- target_identify_preset = gr.Dropdown(
- choices=['DeepConvDTI', 'DrugBAN', 'HyperAttentionDTI'],
- value='DrugBAN',
- label='Step 4. Select a Preset Model')
- identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
- variant='primary')
- with gr.Row():
- with gr.Column():
- HelpTip(
- "Select a preset target library (e.g., ChEMBL33_human_proteins). "
- "Alternatively, upload a CSV file with a column named X2 containing target protein "
- "sequences, or use an FASTA file (Max. 10,000 targets per task). "
- "Example CSV and SDF files are provided below "
- "and can be downloaded by clicking the lower right corner."
- )
- target_library = gr.Dropdown(
- label='Step 5. Select a Preset Target Library',
- choices=list(TARGET_LIBRARY_MAP.keys()))
- with gr.Row():
- gr.File(label='Example FASTA target library',
- value='data/examples/target_library.fasta', interactive=False)
- gr.File(label='Example CSV target library',
- value='data/examples/target_library.csv', interactive=False)
- target_library_upload_btn = gr.UploadButton(
- label='OR Upload Your Own Library', variant='primary')
- target_library_upload = gr.File(label='Custom target library file', visible=False)
- with gr.Column():
- HelpTip("""
-
Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set:
-this serves as an indicator of prediction applicability and reliability –
-higher similarities indicates more reliable predictions (ideally > 0.85).
-
Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound:
-this serves as an indicator of prediction confidence for the potential targets –
-higher similarities typically imply higher confidence levels.
-
Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target:
-this serves as an additional indicator of the confidence level in the predicted potential targets –
-higher similarities usually correspond to greater prediction confidence.
-""")
- target_identify_opts = gr.CheckboxGroup(
- choices=TARGET_IDENTIFY_CPI_OPTS,
- value=TARGET_IDENTIFY_CPI_OPTS[0],
- label='Step 6. Select Advanced Options',
- info="Advanced features - may increase the job computation time. "
- "See the Help Tip on the right or the Documentation for detailed explanation."
- )
- with gr.Row():
- with gr.Column():
- target_identify_email = gr.Textbox(
- label='Step 7. Input Your Email Address (Optional)',
- info="Your email address will be used to notify you of the status of your job. "
- "If you cannot receive the email, please check your spam/junk folder."
- )
-
- with gr.Row(visible=True):
- target_identify_clr_btn = gr.ClearButton(size='lg')
- target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
- size='lg')
-
- identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
-
- with gr.TabItem(label='Interaction Pair Inference', id='Interaction Pair Inference'):
- gr.Markdown('''
- # Interaction Pair Inference
-
-
- To predict interactions or binding affinities for up to
- 10,000 compound-protein pairs.
- ''')
- HelpTip(
- "A custom interation pair dataset can be a CSV file with 2 required columns "
- "(X1 for smiles and X2 for sequences) "
- "and optionally 2 ID columns (ID1 for compound ID and ID2 for target ID), "
- "or generated from a FASTA file containing multiple "
- "sequences and a SDF file containing multiple compounds. "
- "Currently, a maximum of 10,000 pairs is supported, "
- "which means that the size of CSV file or "
- "the product of the two library sizes should not exceed 10,000."
- )
- infer_type = gr.Dropdown(
- choices=['Upload a CSV file containing paired compound-protein data',
- 'Upload a compound library and a target library'],
- label='Step 1. Select Pair Input Type and Input',
- value='Upload a CSV file containing paired compound-protein data')
- with gr.Column() as pair_upload:
- gr.File(
- label="Example CSV dataset",
- value="data/examples/interaction_pair_inference.csv",
- interactive=False
- )
- with gr.Row():
- infer_csv_prompt = gr.Button(
- value="Upload Your Own Dataset Below",
- variant='secondary')
- with gr.Column():
- infer_pair = gr.File(
- label='Upload CSV File Containing Paired Records',
- file_count="single",
- type='filepath',
- visible=True
- )
- with gr.Column(visible=False) as pair_generate:
- with gr.Row():
- gr.File(
- label='Example SDF compound library',
- value='data/examples/compound_library.sdf',
- interactive=False
- )
- gr.File(
- label='Example FASTA target library',
- value='data/examples/target_library.fasta',
- interactive=False
- )
- with gr.Row():
- gr.File(
- label='Example CSV compound library',
- value='data/examples/compound_library.csv',
- interactive=False
- )
- gr.File(
- label='Example CSV target library',
- value='data/examples/target_library.csv',
- interactive=False
- )
- with gr.Row():
- infer_library_prompt = gr.Button(
- value="Upload Your Own Libraries Below",
- visible=False,
- variant='secondary'
- )
- with gr.Row():
- infer_drug = gr.File(
- label='Upload SDF/CSV File Containing Multiple Compounds',
- file_count="single",
- type='filepath'
- )
- infer_target = gr.File(
- label='Upload FASTA/CSV File Containing Multiple Targets',
- file_count="single",
- type='filepath'
- )
-
- with gr.Row():
- with gr.Column(min_width=200):
- HelpTip(
- "By default, models trained on all protein families (general) will be applied. "
- "If the proteins in the target library of interest "
- "all belong to the same protein family, manually selecting the family is supported."
- )
-
- pair_infer_target_family = gr.Dropdown(
- choices=list(TARGET_FAMILY_MAP.keys()),
- value='General',
- label='Step 2. Select Target Family (Optional)'
- )
-
- with gr.Column(min_width=200):
- HelpTip(
- "Interaction prediction provides you binding probability score "
- "between the target of interest and each compound in the library, "
- "while affinity prediction directly estimates their binding strength "
- "measured using half maximal inhibitory concentration (IC
50) in units of nM."
- )
- pair_infer_task = gr.Dropdown(
- list(TASK_MAP.keys()),
- label='Step 3. Select a Prediction Task',
- value='Compound-Protein Interaction'
- )
-
- with gr.Column(min_width=200):
- HelpTip(
- "Select your preferred model. Please refer to documentation for detailed benchmark results."
- )
- pair_infer_preset = gr.Dropdown(
- list(PRESET_MAP.keys()),
- label='Step 4. Select a Preset Model'
- )
- # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
- # variant='primary')
- pair_infer_opts = gr.CheckboxGroup(visible=False)
-
- with gr.Row():
- pair_infer_email = gr.Textbox(
- label='Step 5. Input Your Email Address (Optional)',
- info="Your email address will be used to notify you of the status of your job. "
- "If you cannot receive the email, please check your spam/junk folder.")
-
- with gr.Row(visible=True):
- pair_infer_clr_btn = gr.ClearButton(size='lg')
- pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
-
- infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
-
- with gr.TabItem(label='Chemical Property Report', id='Chemical Property Report'):
- gr.Markdown('''
- # Chemical Property Report
-
- To compute chemical properties for the predictions of Drug Hit Screening,
- Target Protein Identification, and Interaction Pair Inference.
-
- You may also upload your own dataset using a CSV file containing
- one required column `X1` for compound SMILES.
-
- The page shows only a preview report displaying at most 30 records
- (with top predicted CPI/CPA if reporting results from a prediction job).
-
- Please first `Preview` the report, then `Generate` and download a CSV report
- or an interactive HTML report below if you wish to access the full report.
- ''')
- raw_df = gr.State(value=pd.DataFrame())
- report_df = gr.State(value=pd.DataFrame())
- with gr.Row():
- with gr.Column(scale=1):
- file_for_report = gr.File(interactive=True, type='filepath')
- report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False,
- value='Compound-Protein Interaction',
- label='Specify the Task Labels in the Uploaded Dataset')
- with gr.Column(scale=2):
- with gr.Column():
- with gr.Row():
- scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
- filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
- job_opts = gr.CheckboxGroup(visible=False)
-
- with gr.Accordion('Report Generation Options', open=True):
- with gr.Row():
- csv_sep = gr.Radio(label='CSV Delimiter',
- choices=['Comma', 'Tab'], value='Comma')
- html_opts = gr.CheckboxGroup(label='HTML Report Options',
- choices=[
- 'Exclude Molecular Graph',
- 'Exclude Scaffold Graph',
- 'Exclude Pharmacophore 3D'
- ])
-
- with gr.Row():
- report_clr_btn = gr.ClearButton(size='lg')
- analyze_btn = gr.Button('Calculate Properties and Preview', variant='primary',
- size='lg', interactive=False)
-
- with gr.Row():
- with gr.Column(scale=3):
- html_report = gr.HTML() # label='Results', visible=True)
- ranking_pie_chart = gr.Plot(visible=False)
-
- with gr.Row():
- with gr.Column():
- csv_generate = gr.Button(value='Generate CSV Report',
- interactive=False, variant='primary')
- csv_download_file = gr.File(label='Download CSV Report', visible=False)
- with gr.Column():
- html_generate = gr.Button(value='Generate HTML Report',
- interactive=False, variant='primary')
- html_download_file = gr.File(label='Download HTML Report', visible=False)
-
- with gr.TabItem(label='Prediction Status Lookup', id='Prediction Status Lookup'):
- gr.Markdown('''
- # Prediction Status Lookup
-
- To check the status of an in-progress or historical job using the job ID and retrieve the predictions
- if the job has completed. Note that predictions are only kept for 48 hours after job completion.
-
- You will be redirected to Chemical Property Report for carrying out further analysis and
- generating the full report when the job is done. If the Lookup fails to respond, please wait for a
- few minutes and refresh the page to try again.
- ''')
- with gr.Column():
- pred_lookup_id = gr.Textbox(
- label='Input Your Job ID', placeholder='e.g., e9dfd149-3f5c-48a6-b797-c27d027611ac',
- info="Your job ID is a UUID4 string that you receive after submitting a job on the "
- "page or in the email notification.")
- pred_lookup_btn = gr.Button(value='Lookup the Job Status', variant='primary', visible=True)
- pred_lookup_stop_btn = gr.Button(value='Stop Tracking', variant='stop', visible=False)
- pred_lookup_status = gr.Markdown()
-
- # retrieve_email = gr.Textbox(label='Step 2. Input Your Email Address', placeholder='e.g.,
-
-
- def target_input_type_select(input_type):
- match input_type:
- case 'UniProt ID':
- return [gr.Dropdown(info=''),
- gr.UploadButton(visible=False),
- gr.Textbox(visible=True, value=''),
- gr.Textbox(visible=False, value=''),
- gr.Textbox(visible=False, value=''),
- gr.Button(visible=True),
- gr.Code(value=''),
- gr.Button(visible=False)]
- case 'Gene symbol':
- return [gr.Dropdown(info=''),
- gr.UploadButton(visible=False),
- gr.Textbox(visible=False, value=''),
- gr.Textbox(visible=True, value=''),
- gr.Textbox(visible=True, value=''),
- gr.Button(visible=True),
- gr.Code(value=''),
- gr.Button(visible=False)]
- case 'Sequence':
- return [gr.Dropdown(info='Enter (paste) a FASTA string below manually or upload a FASTA file.'),
- gr.UploadButton(visible=True),
- gr.Textbox(visible=False, value=''),
- gr.Textbox(visible=False, value=''),
- gr.Textbox(visible=False, value=''),
- gr.Button(visible=False),
- gr.Code(value=''),
- gr.Button(visible=True)]
-
-
- target_input_type.select(
- fn=target_input_type_select,
- inputs=target_input_type,
- outputs=[
- target_input_type, target_upload_btn,
- target_id, target_gene, target_organism, target_query_btn,
- target_fasta, target_paste_markdown
- ],
- show_progress='hidden'
- )
-
-
- def uniprot_query(input_type, uid, gene, organism='Human'):
- uniprot_endpoint = 'https://rest.uniprot.org/uniprotkb/{query}'
- fasta_rec = ''
-
- match input_type:
- case 'UniProt ID':
- query = f"{uid.strip()}.fasta"
- case 'Gene symbol':
- organism = organism if organism else 'Human'
- query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
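- # Illustrative resulting request URLs (built exactly as above):
- #   UniProt ID 'Q16539' -> https://rest.uniprot.org/uniprotkb/Q16539.fasta
- #   Gene 'MAPK14' + organism 'Homo sapiens' ->
- #   https://rest.uniprot.org/uniprotkb/search?query=organism_name:Homo sapiens+AND+gene:MAPK14&format=fasta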
-
- try:
- fasta = session.get(uniprot_endpoint.format(query=query))
- fasta.raise_for_status()
- if fasta.text:
- fasta_rec = next(SeqIO.parse(io.StringIO(fasta.text), format='fasta'))
- fasta_rec = f">{fasta_rec.description}\n{fasta_rec.seq}"
-
- except Exception as e:
- # Use a non-raising warning here; the return in finally would swallow a
- # raised exception anyway.
- gr.Warning(f"Failed to query FASTA from UniProt database due to {str(e)}")
- finally:
- return fasta_rec
-
-
- def process_fasta_upload(fasta_upload):
- fasta = ''
- try:
- fasta = fasta_upload.decode()
- except Exception as e:
- gr.Warning(f"Please upload a valid FASTA file. Error: {str(e)}")
- return fasta
-
-
- target_upload_btn.upload(
- fn=process_fasta_upload, inputs=target_upload_btn, outputs=target_fasta
- ).then(
- fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden'
- )
- target_query_btn.click(
- fn=uniprot_query, inputs=[target_input_type, target_id, target_gene, target_organism], outputs=target_fasta
- ).then(
- fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden'
- )
-
-
- def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
- try:
- aligner = PairwiseAligner(mode='local')
- alignment_df = get_fasta_family_map()
-
- processed_fasta = process_target_fasta(fasta)
-
- # Check for an exact match first
- exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
- if not exact_match.empty:
- row = exact_match.iloc[0]
- family = str(row['Target Family']).title()
- return gr.Dropdown(
- value=family,
- info=f"Reason: Exact match found with {row['ID2']} from family {family}")
-
- # If no exact match, then calculate alignment score
- def align_score(query):
- alignment = aligner.align(processed_fasta, query)
- return alignment.score / max(len(processed_fasta), len(query))
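- # align_score normalizes the local alignment score by the longer of the two
- # sequences, yielding a rough identity-like value in [0, 1].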
-
- alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
- row = alignment_df.loc[alignment_df['score'].idxmax()]
- family = str(row['Target Family']).title()
- return gr.Dropdown(value=family,
- info=f"Reason: Best sequence identity ({row['score']}) "
- f"with {row['ID2']} from family {family}")
- except Exception as e:
- gr.Warning("Failed to detect the protein family due to error: " + str(e))
-
-
- target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)
-
- # target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden')
- target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden')
-
- drug_library_upload_btn.upload(fn=lambda x: [
- x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
- ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
-
- drug_screen_task.select(
- fn=lambda task, opts: gr.CheckboxGroup(choices=DRUG_SCRENN_CPA_OPTS)
- if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup(
- choices=DRUG_SCRENN_CPI_OPTS, value=DRUG_SCRENN_CPI_OPTS[0]),
- inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
- show_progress='hidden'
- )
-
- target_identify_task.select(
- fn=lambda task, opts: gr.CheckboxGroup(choices=TARGET_IDENTIFY_CPA_OPTS)
- if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup(
- choices=TARGET_IDENTIFY_CPI_OPTS, value=TARGET_IDENTIFY_CPI_OPTS[0]),
- inputs=[target_identify_task, target_identify_opts], outputs=target_identify_opts,
- show_progress='hidden'
- )
-
- def example_fill(input_type):
- return {target_id: 'Q16539',
- target_gene: 'MAPK14',
- target_organism: 'Human',
- target_fasta: """
->sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3
-MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ
-SIIHAKRTYRELRLLKHMKHENVIGLLDVFTPARSLEEFNDVYLVTHLMGADLNNIVKCQ
-KLTDDHVQFLIYQILRGLKYIHSADIIHRDLKPSNLAVNEDCELKILDFGLARHTDDEMT
-GYVATRWYRAPEIMLNWMHYNQTVDIWSVGCIMAELLTGRTLFPGTDHIDQLKLILRLVG
-TPGAELLKKISSESARNYIQSLTQMPKMNFANVFIGANPLAVDLLEKMLVLDSDKRITAA
-QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
-"""}
-
-
- example_fasta.click(fn=example_fill, inputs=target_input_type, outputs=[
- target_id, target_gene, target_organism, target_fasta], show_progress='hidden')
-
-
- def screen_recommend_model(fasta, family, task):
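- """
- Recommend the best preset model for Drug Hit Screening: determine whether the
- input target was seen during training, then pick the benchmark row (matching
- family/scenario/type) with the highest metric for the selected task.
- """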
- task = TASK_MAP[task]
- score = TASK_METRIC_MAP[task]
- benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv')
-
- if not fasta:
- gr.Warning('Please enter a valid FASTA for model recommendation.')
- return [None, family]
-
- if family == 'General':
- seen_targets = get_seen_fastas('General', task)['X2'].values
- if process_target_fasta(fasta) in seen_targets:
- scenario = "Seen Target"
- else:
- scenario = "Unseen Target"
- filtered_df = benchmark_df[(benchmark_df['Family'] == 'All Families')
- & (benchmark_df['Scenario'] == scenario)
- & (benchmark_df['Type'] == 'General')]
-
- else:
- seen_targets_general = get_seen_fastas('General', task)['X2'].values
- if process_target_fasta(fasta) in seen_targets_general:
- scenario_general = "Seen Target"
- else:
- scenario_general = "Unseen Target"
-
- seen_targets_family = get_seen_fastas(family, task)['X2'].values
- if process_target_fasta(fasta) in seen_targets_family:
- scenario_family = "Seen Target"
- else:
- scenario_family = "Unseen Target"
-
- filtered_df_general = benchmark_df[(benchmark_df['Family'] == family)
- & (benchmark_df['Scenario'] == scenario_general)
- & (benchmark_df['Type'] == 'General')]
- filtered_df_family = benchmark_df[(benchmark_df['Family'] == family)
- & (benchmark_df['Scenario'] == scenario_family)
- & (benchmark_df['Type'] == 'Family')]
- filtered_df = pd.concat([filtered_df_general, filtered_df_family])
-
- row = filtered_df.loc[filtered_df[score].idxmax()]
- if row['Scenario'] == 'Seen Target':
- scenario = "Seen Target (>=0.85 sequence identity)"
- elif row['Scenario'] == 'Unseen Target':
- scenario = "Unseen Target (<0.85 sequence identity)"
-
- return {drug_screen_preset:
- gr.Dropdown(value=row['Model'],
- info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
- f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
- drug_screen_target_family:
- gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
-
-
- screen_preset_recommend_btn.click(
- fn=screen_recommend_model,
- inputs=[target_fasta, drug_screen_target_family, drug_screen_task],
- outputs=[drug_screen_preset, drug_screen_target_family],
- show_progress='hidden'
- )
-
-
- def compound_input_type_select(input_type):
- match input_type:
- case 'SMILES':
- return gr.Button(visible=False)
- case 'SDF':
- return gr.Button(visible=True)
-
-
- compound_type.select(fn=compound_input_type_select,
- inputs=compound_type, outputs=compound_upload_btn, show_progress='hidden')
-
-
- def compound_upload_process(input_type, input_upload):
- smiles = ''
- try:
- match input_type:
- case 'SMILES':
- smiles = input_upload.decode()
- case 'SDF':
- suppl = Chem.ForwardSDMolSupplier(io.BytesIO(input_upload))
- smiles = Chem.MolToSmiles(next(suppl))
- except Exception as e:
- gr.Warning(f"Please upload a valid {input_type} file. Error: {str(e)}")
- return smiles
-
-
- compound_upload_btn.upload(fn=compound_upload_process,
- inputs=[compound_type, compound_upload_btn],
- outputs=compound_smiles)
-
- example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=compound_smiles, show_progress='hidden')
-
- target_library_upload_btn.upload(fn=lambda x: [
- x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
- ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
-
-
- def identify_recommend_model(smiles, family, task):
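- """
- Recommend the best preset model for Target Protein Identification: check
- whether the canonicalized input SMILES was seen during training, then pick
- the best-scoring General-type benchmark row for that scenario.
- """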
- task = TASK_MAP[task]
- score = TASK_METRIC_MAP[task]
- benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv')
-
- if not smiles:
- gr.Warning('Please enter a valid SMILES for model recommendation.')
- return None
- if family == 'Family-Specific Auto-Recommendation':
- return 'Family-Specific Auto-Recommendation'
-
- if family == 'General':
- seen_compounds = pd.read_csv(
- f'data/benchmarks/seen_compounds/all_families_full_{task.lower()}_random_split.csv')
- family = 'All Families'
-
- else:
- seen_compounds = pd.read_csv(
- f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
-
- if rdkit_canonicalize(smiles) in seen_compounds['X1'].values:
- scenario = "Seen Compound"
- else:
- scenario = "Unseen Compound"
-
- filtered_df = benchmark_df[(benchmark_df['Family'] == family)
- & (benchmark_df['Scenario'] == scenario)
- & (benchmark_df['Type'] == 'General')]
-
- row = filtered_df.loc[filtered_df[score].idxmax()]
-
- return gr.Dropdown(value=row['Model'],
- info=f"Reason: {scenario} in training; choosing the model "
- f"with the best {score} in the {scenario} scenario.")
-
-
- identify_preset_recommend_btn.click(fn=identify_recommend_model,
- inputs=[compound_smiles, target_identify_target_family, target_identify_task],
- outputs=target_identify_preset, show_progress='hidden')
-
-
- def infer_type_change(upload_type):
- match upload_type:
- case "Upload a compound library and a target library":
- return {
- pair_upload: gr.Column(visible=False),
- pair_generate: gr.Column(visible=True),
- infer_pair: None,
- infer_drug: None,
- infer_target: None,
- infer_csv_prompt: gr.Button(visible=False),
- infer_library_prompt: gr.Button(visible=True),
- }
- case "Upload a CSV file containing paired compound-protein data":
- return {
- pair_upload: gr.Column(visible=True),
- pair_generate: gr.Column(visible=False),
- infer_pair: None,
- infer_drug: None,
- infer_target: None,
- infer_csv_prompt: gr.Button(visible=True),
- infer_library_prompt: gr.Button(visible=False),
- }
-
-
- infer_type.select(fn=infer_type_change, inputs=infer_type,
- outputs=[pair_upload, pair_generate, infer_pair, infer_drug, infer_target,
- infer_csv_prompt, infer_library_prompt],
- show_progress='hidden')
-
-
- def common_input_validate(state, preset, email, request):
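- """
- Shared validation for all submission handlers: require a selected model,
- normalize the optional email address, and reject the submission if a job is
- already running for this session or user.
- """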
- gr.Info('Start processing inputs...')
- if not preset:
- raise gr.Error('Please select a model.')
-
- if email:
- try:
- email_info = validate_email(email, check_deliverability=False)
- email = email_info.normalized
- except EmailNotValidError as e:
- raise gr.Error(f"Invalid email address: {str(e)}.")
-
- if state:
- raise gr.Error(f"You already have a running prediction job (ID: {state['id']}) under this session. "
- "Please wait for it to complete before submitting another job.")
-
- if check := check_user_running_job(email, request):
- raise gr.Error(check)
-
- return state, preset, email
-
-
- def common_job_initiate(job_id, job_type, email, request, task):
- gr.Info('Finished processing inputs. Initiating the prediction job... '
- 'You will be redirected to Prediction Status Lookup once the job has been submitted.')
- job_info = {'id': job_id,
- 'type': job_type,
- 'task': task,
- 'status': 'RUNNING',
- 'email': email,
- 'ip': request.headers.get('x-forwarded-for', request.client.host),
- 'cookies': dict(request.cookies),
- 'start_time': time(),
- 'end_time': None,
- 'expiry_time': None,
- 'error': None}
- # db.insert(job_info)
- return job_info
-
-
- def drug_screen_validate(fasta, library, library_upload, preset, task, email, state,
- request: gr.Request, progress=gr.Progress(track_tqdm=True)):
- state, preset, email = common_input_validate(state, preset, email, request)
-
- fasta = process_target_fasta(fasta)
- err = validate_seq_str(fasta, FASTA_PAT)
- if err:
- raise gr.Error(f'Found error(s) in your Target FASTA input: {err}')
- if not library:
- raise gr.Error('Please select or upload a compound library.')
- if library in DRUG_LIBRARY_MAP.keys():
- screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
- else:
- screen_df = process_drug_library_upload(library_upload)
- if len(screen_df) >= DATASET_MAX_LEN:
- raise gr.Error(f'The uploaded compound library has more records '
- f'than the allowed maximum {DATASET_MAX_LEN}.')
-
- screen_df['X2'] = fasta
-
- job_id = str(uuid4())
- temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
- screen_df.to_csv(temp_file, index=False, na_rep='')
- if temp_file.is_file():
- job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
- return {screen_data_for_predict: str(temp_file),
- run_state: job_info}
- else:
- raise gr.Error('System failed to create temporary files. Please try again later.')
-
-
- def target_identify_validate(smiles, library, library_upload, preset, task, email, state,
- request: gr.Request, progress=gr.Progress(track_tqdm=True)):
- state, preset, email = common_input_validate(state, preset, email, request)
-
- smiles = smiles.strip()
- err = validate_seq_str(smiles, SMILES_PAT)
- if err:
- raise gr.Error(f'Found error(s) in your Compound SMILES input: {err}')
- if not library:
- raise gr.Error('Please select or upload a target library.')
- if library in TARGET_LIBRARY_MAP.keys():
- identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
- else:
- identify_df = process_target_library_upload(library_upload)
- if len(identify_df) >= DATASET_MAX_LEN:
- raise gr.Error(f'The uploaded target library has more records '
- f'than the allowed maximum {DATASET_MAX_LEN}.')
- identify_df['X1'] = smiles
-
- job_id = str(uuid4())
- temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
- identify_df.to_csv(temp_file, index=False, na_rep='')
- if temp_file.is_file():
- job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
- return {identify_data_for_predict: str(temp_file),
- run_state: job_info}
- else:
- raise gr.Error('System failed to create temporary files. Please try again later.')
-
-
- def pair_infer_validate(drug_target_pair_upload, drug_upload, target_upload, preset, task, email, state,
- request: gr.Request, progress=gr.Progress(track_tqdm=True)):
- state, preset, email = common_input_validate(state, preset, email, request)
-
- job_id = str(uuid4())
- if drug_target_pair_upload:
- infer_df = pd.read_csv(drug_target_pair_upload)
- validate_columns(infer_df, ['X1', 'X2'])
-
- infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
- validate_seq_str, regex=SMILES_PAT)
- if not infer_df['X1_ERR'].isna().all():
- raise ValueError(
- f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
-
- infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
- validate_seq_str, regex=FASTA_PAT)
- if not infer_df['X2_ERR'].isna().all():
- raise ValueError(
- f"Encountered invalid FASTA:\n{infer_df[~infer_df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
-
- temp_file = Path(drug_target_pair_upload).resolve()
-
- elif drug_upload and target_upload:
- drug_df = process_drug_library_upload(drug_upload)
- target_df = process_target_library_upload(target_upload)
-
- drug_df.drop_duplicates(subset=['X1'], inplace=True)
- target_df.drop_duplicates(subset=['X2'], inplace=True)
-
- infer_df = pd.DataFrame(list(itertools.product(drug_df['X1'], target_df['X2'])),
- columns=['X1', 'X2'])
- infer_df = infer_df.merge(drug_df, on='X1').merge(target_df, on='X2')
-
- if len(infer_df) >= DATASET_MAX_LEN:
- raise gr.Error(f'The uploaded/generated compound-protein pair dataset has more records '
- f'than the allowed maximum {DATASET_MAX_LEN}.')
-
- temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
- infer_df.to_csv(temp_file, index=False, na_rep='')
-
- else:
- raise gr.Error('Please upload a compound-protein pair dataset, or '
- 'both a compound library and a target library.')
-
- if temp_file.is_file():
- job_info = common_job_initiate(job_id, 'Interaction Pair Inference', email, request, task)
- return {infer_data_for_predict: str(temp_file),
- run_state: job_info}
- else:
- raise gr.Error('System failed to create temporary files. Please try again later.')
-
-
- def fill_job_id(job_info):
- try:
- return job_info['id']
- except Exception as e:
- gr.Warning(f'Failed to fetch job ID due to error: {str(e)}')
- return ''
-
-
- drug_screen_click = drug_screen_btn.click(
- fn=drug_screen_validate,
- inputs=[target_fasta, drug_library, drug_library_upload, drug_screen_preset, drug_screen_task,
- drug_screen_email, run_state],
- outputs=[screen_data_for_predict, run_state],
- concurrency_limit=2,
- )
-
- drug_screen_lookup = drug_screen_click.success(
- fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
- ).then(
- fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id]
- ).then(
- fn=lookup_job,
- inputs=[pred_lookup_id],
- outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
- show_progress='minimal',
- concurrency_limit=100,
- )
-
- # drug_screen_click.success(
- # fn=send_email,
- # inputs=[run_state]
- # )
-
- drug_screen_click.success(
- fn=submit_predict,
- inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
- drug_screen_target_family, drug_screen_opts, run_state, ],
- outputs=[run_state, ]
- )
-
- drug_screen_clr_btn.click(
- lambda: ['General'] + [[]] + [None] * 5,
- outputs=[drug_screen_target_family, drug_screen_opts,
- target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email],
- show_progress='hidden'
- )
-
- target_identify_clr_btn.click(
- lambda: ['General'] + [[]] + [None] * 5,
- outputs=[target_identify_target_family, target_identify_opts,
- compound_smiles, target_identify_preset, target_library, target_library_upload, target_identify_email],
- show_progress='hidden'
- )
-
- pair_infer_clr_btn.click(
- lambda: ['General'] + [None] * 5,
- outputs=[pair_infer_target_family,
- infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_email],
- show_progress='hidden'
- )
-
- report_clr_btn.click(
- lambda: [[]] * 3 + [None] * 3 +
- [gr.Button(interactive=False)] * 3 +
- [gr.File(visible=False, value=None)] * 2 +
- [gr.Dropdown(visible=False, value=None), gr.HTML(value=''), gr.CheckboxGroup(visible=False)],
- outputs=[
- scores, filters, html_opts,
- file_for_report, raw_df, report_df,
- csv_generate, html_generate, analyze_btn,
- csv_download_file, html_download_file,
- report_task, html_report, job_opts
- ],
- show_progress='hidden'
- )
-
-
- def update_preset(family, preset):
- if family == 'Family-Specific Auto-Recommendation':
- return 'Family-Specific Auto-Recommendation'
- elif preset == 'Family-Specific Auto-Recommendation':
- return None
- else:
- return preset
-
-
- def update_family(family, preset):
- if preset == 'Family-Specific Auto-Recommendation':
- return 'Family-Specific Auto-Recommendation'
- elif family == 'Family-Specific Auto-Recommendation':
- return None
- else:
- return family
-
-
- target_identify_target_family.change(
- fn=update_preset, inputs=[target_identify_target_family, target_identify_preset],
- outputs=target_identify_preset, show_progress='hidden')
- target_identify_preset.change(
- fn=update_family, inputs=[target_identify_target_family, target_identify_preset],
- outputs=target_identify_target_family, show_progress='hidden')
-
- target_identify_click = target_identify_btn.click(
- fn=target_identify_validate,
- inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
- target_identify_email, run_state],
- outputs=[identify_data_for_predict, run_state],
- concurrency_limit=2,
- )
-
- target_identify_lookup = target_identify_click.success(
- fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
- ).then(
- fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id]
- ).then(
- fn=lookup_job,
- inputs=[pred_lookup_id],
- outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
- show_progress='minimal',
- concurrency_limit=100
- )
-
- # target_identify_click.success(
- # fn=send_email,
- # inputs=[run_state]
- # )
-
- target_identify_click.success(
- fn=submit_predict,
- inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
- target_identify_target_family, target_identify_opts, run_state, ], # , target_identify_email],
- outputs=[run_state, ]
- )
-
- pair_infer_click = pair_infer_btn.click(
- fn=pair_infer_validate,
- inputs=[infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_task,
- pair_infer_email, run_state],
- outputs=[infer_data_for_predict, run_state],
- concurrency_limit=2,
- )
-
- pair_infer_lookup = pair_infer_click.success(
- fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
- ).then(
- fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id]
- ).then(
- fn=lookup_job,
- inputs=[pred_lookup_id],
- outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
- show_progress='minimal',
- concurrency_limit=100
- )
-
- # pair_infer_click.success(
- # fn=send_email,
- # inputs=[run_state]
- # )
-
- pair_infer_click.success(
- fn=submit_predict,
- inputs=[infer_data_for_predict, pair_infer_task, pair_infer_preset,
- pair_infer_target_family, pair_infer_opts, run_state, ], # , pair_infer_email],
- outputs=[run_state, ]
- )
-
- pred_lookup_click = pred_lookup_btn.click(
- fn=lookup_job,
- inputs=[pred_lookup_id],
- outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
- show_progress='minimal',
- cancels=[drug_screen_lookup, target_identify_lookup, pair_infer_lookup],
- concurrency_limit=100,
- )
-
- pred_lookup_stop_btn.click(
- fn=lambda: [gr.Button(visible=True), gr.Button(visible=False)],
- outputs=[pred_lookup_btn, pred_lookup_stop_btn],
- cancels=[pred_lookup_click, drug_screen_lookup, target_identify_lookup, pair_infer_lookup],
- concurrency_limit=None,
- )
-
-
- def inquire_task(df):
- if 'Y^' in df.columns:
- label = 'predicted CPI/CPA labels (`Y^`)'
- return {report_task: gr.Dropdown(visible=True,
- info=f'Found {label} in your uploaded dataset. '
- 'Is it compound-protein interaction or binding affinity?'),
- html_report: ''}
- else:
- return {report_task: gr.Dropdown(visible=False)}
-
- report_df_change = file_for_report.change(
- fn=update_df, inputs=file_for_report, outputs=[
- html_report, raw_df, report_df, analyze_btn, report_task, job_opts
- ],
- concurrency_limit=100,
- ).success(
- fn=lambda: [gr.Button(interactive=True)] * 3 +
- [gr.File(visible=False, value=None)] * 2,
- outputs=[
- csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file
- ],
- )
-
- file_for_report.upload(
- # fn=update_df, inputs=file_for_report, outputs=[
- # html_report, raw_df, report_df, analyze_btn, report_task, job_opts
- # ],
- # cancels=[report_df_change],
- # concurrency_limit=100,
- # ).success(
- fn=inquire_task, inputs=[raw_df],
- outputs=[report_task, html_report],
- )
-
- file_for_report.clear(
- fn=lambda: [gr.Button(interactive=False)] * 3 +
- [gr.File(visible=False, value=None)] * 2 +
- [gr.Dropdown(visible=False, value=None), '', gr.CheckboxGroup(visible=False)],
- cancels=[report_df_change],
- outputs=[
- csv_generate, html_generate, analyze_btn,
- csv_download_file, html_download_file,
- report_task, html_report, job_opts
- ]
- )
-
- analyze_btn.click(
- fn=submit_report, inputs=[raw_df, scores, filters, job_opts, report_task], outputs=[
- html_report, report_df, csv_download_file, html_download_file]
- ).success(
- fn=lambda: [gr.Button(interactive=True)] * 2,
- outputs=[csv_generate, html_generate],
- concurrency_limit=100,
- )
-
-
- def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
- csv_sep_map = {
- 'Comma': ',',
- 'Tab': '\t',
- }
- y_colname = 'Y^'
- if isinstance(task, str):
- if task == 'Compound-Protein Interaction':
- y_colname = 'Y_prob'
- elif task == 'Compound-Protein Binding Affinity':
- y_colname = 'Y_IC50'
- try:
- now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
- filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
- df.rename(columns={'Y^': y_colname}).drop(
- labels=['Compound', 'Scaffold'], axis=1
- ).to_csv(filename, index=False, na_rep='', sep=csv_sep_map[sep])
-
- return gr.File(filename, visible=True)
- except Exception as e:
- gr.Warning(f"Failed to generate CSV due to error: {str(e)}")
- return None
-
-
- def create_html_report_file(df, file_report, task, opts, progress=gr.Progress(track_tqdm=True)):
- try:
- now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
- filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.html"
- create_html_report(df, filename, task, opts)
- return gr.File(filename, visible=True)
- except Exception as e:
- gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
- return None
-
-
- # html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
-
- csv_generate.click(
- lambda: gr.File(visible=True), outputs=csv_download_file,
- ).then(
- fn=create_csv_report_file, inputs=[report_df, file_for_report, report_task, csv_sep],
- outputs=csv_download_file, show_progress='full'
- )
- html_generate.click(
- lambda: gr.File(visible=True), outputs=html_download_file,
- ).then(
- fn=create_html_report_file, inputs=[report_df, file_for_report, report_task, html_opts],
- outputs=html_download_file, show_progress='full'
- )
-
-if __name__ == "__main__":
- pandarallel.initialize()
-
- hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
-
- session = requests.Session()
- ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
- session.mount('http://', ADAPTER)
- session.mount('https://', ADAPTER)
-
- db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
- # Set all RUNNING jobs to FAILED at TinyDB initialization
- Job = Query()
- jobs = db.all()
- for job in jobs:
- if job['status'] == 'RUNNING':
- db.update({'status': 'FAILED'}, Job.id == job['id'])
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(check_expiry, 'interval', hours=1, timezone=pytz.utc)
- scheduler.start()
-
- demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
+import glob
+import smtplib
+from datetime import datetime, timedelta
+import itertools
+import textwrap
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.utils import formatdate, make_msgid
+from functools import cache
+from math import pi
+from time import sleep, time
+from uuid import uuid4
+
+import io
+import os
+from pathlib import Path
+import sys
+
+import pytz
+from Bio import SeqIO
+from Bio.Align import PairwiseAligner
+from email_validator import validate_email, EmailNotValidError
+import gradio as gr
+import hydra
+import pandas as pd
+from pandarallel import pandarallel
+import requests
+from rdkit.DataStructs import BulkTanimotoSimilarity
+from requests.adapters import HTTPAdapter, Retry
+from markdown import markdown
+from rdkit import Chem
+from rdkit.Chem import AllChem, Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen
+from rdkit.Chem.Features.ShowFeats import _featColors
+from rdkit.Chem.Scaffolds import MurckoScaffold
+import py3Dmol
+
+from bokeh.models import Legend, NumberFormatter, BooleanFormatter, HTMLTemplateFormatter, LegendItem
+from bokeh.palettes import Category20c_20
+from bokeh.plotting import figure
+from bokeh.transform import cumsum
+from bokeh.resources import INLINE
+import seaborn as sns
+import panel as pn
+
+from apscheduler.schedulers.background import BackgroundScheduler
+from tinydb import TinyDB, Query
+
+#import swifter
+from tqdm.auto import tqdm
+
+from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
+from deepscreen.predict import predict
+
+sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
+import sascorer
+
+DATASET_MAX_LEN = 10_240
+SERVER_DATA_DIR = os.getenv('DATA') # '/data'
+DB_EXPIRY = timedelta(hours=48).total_seconds()
+
+CSS = """
+.help-tip {
+ position: absolute;
+ display: inline-block;
+ top: 16px;
+ right: 0px;
+ text-align: center;
+ border-radius: 40%;
+ /* border: 2px solid darkred; background-color: #8B0000;*/
+ width: 24px;
+ height: 24px;
+ font-size: 16px;
+ line-height: 26px;
+ cursor: default;
+ transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
+ z-index: 100 !important;
+}
+
+.help-tip:hover {
+ cursor: pointer;
+ /*background-color: #ccc;*/
+}
+
+.help-tip:before {
+ content: '?';
+ font-weight: 700;
+ color: #8B0000;
+ z-index: 100 !important;
+}
+
+.help-tip p {
+ visibility: hidden;
+ opacity: 0;
+ text-align: left;
+ background-color: #EFDDE3;
+ padding: 20px;
+ width: 300px;
+ position: absolute;
+ border-radius: 4px;
+ right: -4px;
+ color: #494F5A;
+ font-size: 13px;
+ line-height: normal;
+ transform: scale(0.7);
+ transform-origin: 100% 0%;
+ transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
+ z-index: 100;
+}
+
+.help-tip:hover p {
+ cursor: default;
+ visibility: visible;
+ opacity: 1;
+ transform: scale(1.0);
+}
+
+.help-tip p:before {
+ position: absolute;
+ content: '';
+ width: 0;
+ height: 0;
+ border: 6px solid transparent;
+ border-bottom-color: #EFDDE3;
+ right: 10px;
+ top: -12px;
+}
+
+.help-tip p:after {
+ width: 100%;
+ height: 40px;
+ content: '';
+ position: absolute;
+ top: -5px;
+ left: 0;
+ z-index: 101;
+}
+
+.upload_button {
+ background-color: #008000;
+}
+
+.absolute {
+ position: absolute;
+}
+
+.example {
+padding: 0;
+background: none;
+border: none;
+text-decoration: underline;
+box-shadow: none;
+text-align: left !important;
+display: inline-block !important;
+}
+
+footer {
+visibility: hidden
+}
+"""
+
+
+class View3DmolCell(py3Dmol.view):
+ def __init__(self, width=320, height=200):
+ divid = "3dmolviewer_UNIQUEID"
+ self.uniqueid = None
+ if isinstance(width, int):
+ width = '%dpx' % width
+ if isinstance(height, int):
+ height = '%dpx' % height
+ # Start markup: a container div for the viewer followed by an opening script
+ # tag (closed by endjs), mirroring py3Dmol.view's default layout.
+ self.startjs = '<div id="%s" style="position: relative; width: %s; height: %s;"></div>\n' % (divid, width, height)
+ self.startjs += '<script>\n'
+
+ self.updatejs = ''
+ self.endjs = '</script>\n'
+ self.viewergrid = None
+
+ self.startjs += 'viewer_UNIQUEID = $3Dmol.createViewer(document.getElementById("%s"),{backgroundColor:"white"});\n' % divid
+ self.startjs += "viewer_UNIQUEID.zoomTo();\n"
+ self.endjs = "viewer_UNIQUEID.render();\n" + self.endjs
+
+
+FEAT_FACTORY = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
+
+
+def rgb_to_hex(rgb):
+ rgb = tuple(round(i * 255) for i in rgb)
+ return '#{:02x}{:02x}{:02x}'.format(rgb[0], rgb[1], rgb[2])
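+# Example: rgb_to_hex((1.0, 0.5, 0.0)) == '#ff8000'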
+
+
+def mol_to_pharm3d(mol, mode='html'):
+ try:
+ # AllChem.Compute2DCoords(mol)
+ mol = Chem.AddHs(mol)
+ params = AllChem.ETKDGv3()
+ params.randomSeed = 0xf00d # for reproducibility
+ AllChem.EmbedMolecule(mol, params)
+
+ feats = FEAT_FACTORY.GetFeaturesForMol(mol)
+
+ view = View3DmolCell(width=320, height=200)
+ for feat in feats:
+ pos = feat.GetPos()
+ color = _featColors.get(feat.GetFamily(), (.5, .5, .5))
+ view.addSphere({
+ 'center': {'x': pos.x, 'y': pos.y, 'z': pos.z},
+ 'radius': 0.5,
+ 'color': rgb_to_hex(color)
+ })
+
+ mol_block = Chem.MolToMolBlock(mol)
+ view.addModel(mol_block, 'sdf')
+ view.setStyle({'stick': {}})
+ view.zoomTo()
+
+ if mode == 'html':
+ return view.write_html()
+ # case 'png':
+ # return view.png()
+ except Exception:
+ return None
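+
+# Usage sketch (illustrative SMILES; returns embeddable HTML, or None on failure):
+#   html = mol_to_pharm3d(Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O'))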
+
+
+class HelpTip:
+ def __new__(cls, text):
+ return gr.HTML(
+ # elem_classes="absolute",
+ value=f'<div class="help-tip"><p>{text}</p></div>',
+ )
+
+
+TASK_MAP = {
+ 'Compound-Protein Interaction': 'DTI',
+ 'Compound-Protein Binding Affinity': 'DTA',
+}
+
+TASK_METRIC_MAP = {
+ 'DTI': 'AUROC',
+ 'DTA': 'CI',
+ 'Compound-Protein Interaction': 'AUROC',
+ 'Compound-Protein Binding Affinity': 'CI',
+ 'CPI': 'DTI',
+ 'CPA': 'DTA',
+}
+
+PRESET_MAP = {
+ 'DeepDTA': 'deep_dta',
+ 'DeepConvDTI': 'deep_conv_dti',
+ 'GraphDTA': 'graph_dta',
+ 'MGraphDTA': 'm_graph_dta',
+ 'HyperAttentionDTI': 'hyper_attention_dti',
+ 'MolTrans': 'mol_trans',
+ 'TransformerCPI': 'transformer_cpi',
+ 'TransformerCPI2': 'transformer_cpi_2',
+ 'DrugBAN': 'drug_ban',
+ 'DrugVQA-Seq': 'drug_vqa'
+}
+
+TARGET_FAMILY_MAP = {
+ 'General': 'general',
+ 'Kinase': 'kinase',
+ 'Non-Kinase Enzyme': 'non_kinase_enzyme',
+ 'Membrane Receptor': 'membrane_receptor',
+ 'Nuclear Receptor': 'nuclear_receptor',
+ 'Ion Channel': 'ion_channel',
+ 'Others': 'others',
+ # 'general': 'general',
+ # 'kinase': 'kinase',
+ # 'non-kinase enzyme': 'non_kinase_enzyme',
+ # 'membrane receptor': 'membrane_receptor',
+ # 'nuclear Receptor': 'nuclear_receptor',
+ # 'ion channel': 'ion_channel',
+ # 'others': 'others',
+}
+
+TARGET_LIBRARY_MAP = {
+ 'DrugBank (Human)': 'drugbank_targets.csv',
+ 'ChEMBL33 (Human)': 'ChEMBL33_human_proteins.csv',
+}
+
+DRUG_LIBRARY_MAP = {
+ 'DrugBank (Human)': 'drugbank_compounds.csv',
+ 'Drug Repurposing Hub': 'drug_repurposing_hub.csv',
+ 'Enamine Discovery Diversity Set (DDS-10)': 'Enamine_Discovery_Diversity_Set_10_10240cmpds_20240130.csv',
+ 'Enamine Phenotypic Screening Library (PSL-5760)': 'Enamine_Phenotypic_Screening_Library_plated_5760cmds_2020_07_20.csv'
+}
+
+COLUMN_ALIASES = {
+ 'X1': 'Compound SMILES',
+ 'X2': 'Target FASTA',
+ 'ID1': 'Compound ID',
+ 'ID2': 'Target ID',
+ 'Y': 'Actual CPI/CPA',
+ 'Y^': 'Predicted CPI/CPA',
+}
+
+DRUG_SCRENN_CPI_OPTS = [
+ 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
+ 'Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target',
+ 'Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound',
+]
+
+DRUG_SCRENN_CPA_OPTS = [
+ 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
+]
+
+TARGET_IDENTIFY_CPI_OPTS = [
+ 'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
+ 'Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound',
+ 'Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target',
+]
+
+TARGET_IDENTIFY_CPA_OPTS = [
+ 'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
+]
+
+pd.set_option('display.float_format', '{:.3f}'.format)
+PandasTools.molRepresentation = 'svg'
+PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
+PandasTools.drawOptions.clearBackground = False
+PandasTools.drawOptions.bondLineWidth = 1
+PandasTools.drawOptions.explicitMethyl = True
+PandasTools.drawOptions.singleColourWedgeBonds = True
+PandasTools.drawOptions.useCDKAtomPalette()
+PandasTools.molSize = (100, 64)
+
+
+def remove_job_record(job_id):
+ # Delete the job from the database
+ db.remove(Job.id == job_id)
+ # Delete the corresponding files
+ files = glob.glob(f"{SERVER_DATA_DIR}/{job_id}*")
+ for file_path in files:
+ if os.path.exists(file_path):
+ os.remove(file_path)
+
+
+def check_expiry():
+ Job = Query()
+ jobs = db.all()
+
+ for job in jobs:
+ # Check if the job has expired
+ if job['status'] != 'RUNNING':
+ expiry_time = job['expiry_time'] if job['expiry_time'] is not None else job['start_time'] + DB_EXPIRY
+ if expiry_time < time():
+ # Delete the job from the database
+ db.remove(Job.id == job['id'])
+ # Delete the corresponding file
+ files = glob.glob(f"{SERVER_DATA_DIR}/{job['id']}*")
+ for file_path in files:
+ if os.path.exists(file_path):
+ os.remove(file_path)
+ elif job['status'] == 'RUNNING' and time() - job['start_time'] > 4 * 60 * 60: # 4 hours
+ # Mark the job as failed
+ db.update({'status': 'FAILED',
+ 'error': 'Job has timed out by exceeding the maximum running time of 4 hours.'},
+ Job.id == job['id'])
+ if job.get('email'):
+ send_email(job)
+
+
+def smiles_to_ecfp(smiles):
+ mol = Chem.MolFromSmiles(smiles)
+ if mol:
+ ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
+ else:
+ ecfp = []
+ return ecfp
+
+
+def max_tanimoto_similarity(smi, seen_smiles_with_fp):
+ if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
+ return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
+
+ if smi in seen_smiles_with_fp['X1'].values:
+ compound = smi
+ if 'ID1' in seen_smiles_with_fp.columns:
+ id1 = seen_smiles_with_fp.loc[seen_smiles_with_fp['X1'] == smi, 'ID1'].values[0]
+ if pd.notnull(id1) and id1 != '':
+ compound = id1
+ return {'Max. Tanimoto Similarity': 1, 'Max. Tanimoto Similarity Compound': compound}
+
+ mol = Chem.MolFromSmiles(smi)
+ if mol is None:
+ return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
+
+ mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
+ sims = pd.Series(BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'].values)).to_numpy()
+ idx = sims.argmax()
+ compound = seen_smiles_with_fp.iloc[idx]['X1']
+ if 'ID1' in seen_smiles_with_fp.columns:
+ id1 = seen_smiles_with_fp.iloc[idx]['ID1']
+ if pd.notnull(id1) and id1 != '':
+ compound = id1
+
+ return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
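+
+# Usage sketch (hypothetical library dataframe): the function expects a
+# precomputed fingerprint column 'FP', which can be built with smiles_to_ecfp:
+#   library_df['FP'] = library_df['X1'].apply(smiles_to_ecfp)
+#   max_tanimoto_similarity('CCO', library_df)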
+
+
+def alignment_score(query, target):
+ aligner = PairwiseAligner()
+ aligner.mode = 'local'
+ alignment = aligner.align(query, target)
+ return alignment.score / max(len(query), len(target))
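+
+# Note: dividing the local alignment score by the longer sequence length yields
+# a crude identity-like ratio in [0, 1] (with the default match score of 1),
+# not a true percent identity.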
+
+
+def max_sequence_identity(seq, seen_fastas):
+ if seq is None or seen_fastas is None or seen_fastas.empty:
+ return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}
+
+ if seq in seen_fastas['X2'].values:
+ target = seq
+ if 'ID2' in seen_fastas.columns:
+ id2 = seen_fastas.loc[seen_fastas['X2'] == seq, 'ID2'].values[0]
+ if pd.notnull(id2) and id2 != '':
+ target = id2
+ return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
+
+ cached_alignment_score = cache(alignment_score)
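+ # Memoize alignment scores for this call only; the cache is cleared before
+ # returning so sequences are not retained across requests.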
+ max_iden = 0
+ target = None
+ for fasta in seen_fastas['X2'].values:
+ identity = cached_alignment_score(seq, fasta)
+
+ if identity > max_iden:
+ max_iden = identity
+ target = fasta
+ if 'ID2' in seen_fastas.columns:
+ id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
+ if pd.notnull(id2) and id2 != '':
+ target = id2
+ if max_iden == 1:
+ break
+
+ cached_alignment_score.cache_clear()
+ return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
+
+
+def get_seen_smiles(family, task):
+ if family == 'General':
+ family = 'all_families_full'
+ else:
+ family = TARGET_FAMILY_MAP[family.title()]
+ seen_smiles = pd.read_csv(
+ f'data/benchmarks/seen_compounds/{family}_{task.lower()}_random_split.csv')
+ return seen_smiles
+
+
+def get_seen_fastas(family, task):
+ if family == 'General':
+ family = 'all_families_full'
+ else:
+ family = TARGET_FAMILY_MAP[family.title()]
+ seen_fastas = pd.read_csv(
+ f'data/benchmarks/seen_targets/{family}_{task.lower()}_random_split.csv')
+ return seen_fastas
+
+
+@cache
+def get_fasta_family_map():
+ usecols = ['X2', 'ID2', 'Target Family']
+ fasta_family_map = pd.concat([
+ pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv', usecols=usecols),
+ pd.read_csv('data/target_libraries/idmapping_not_in_chembl.csv', usecols=usecols)
+ ]).drop_duplicates(subset=['X2'], keep='first')
+ return fasta_family_map
+
+
+def lipinski(mol):
+ """
+ Lipinski's rules:
+ Hydrogen bond donors <= 5
+ Hydrogen bond acceptors <= 10
+ Molecular weight <= 500 daltons
+ logP <= 5
+ """
+ return (
+ Lipinski.NumHDonors(mol) <= 5 and
+ Lipinski.NumHAcceptors(mol) <= 10 and
+ Descriptors.MolWt(mol) <= 500 and
+ Crippen.MolLogP(mol) <= 5
+ )
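+
+
+# Quick sanity check (illustrative): aspirin satisfies all four criteria, so
+#   lipinski(Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O'))  # -> True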
+
+
+def reos(mol):
+ """
+ Rapid Elimination Of Swill filter:
+ Molecular weight between 200 and 500
+ LogP between -5.0 and +5.0
+ H-bond donor count between 0 and 5
+ H-bond acceptor count between 0 and 10
+ Formal charge between -2 and +2
+ Rotatable bond count between 0 and 8
+ Heavy atom count between 15 and 50
+ """
+ return (
+ 200 <= Descriptors.MolWt(mol) <= 500 and
+ -5.0 <= Crippen.MolLogP(mol) <= 5.0 and
+ 0 <= Lipinski.NumHDonors(mol) <= 5 and
+ 0 <= Lipinski.NumHAcceptors(mol) <= 10 and
+ -2 <= rdmolops.GetFormalCharge(mol) <= 2 and
+ 0 <= rdMolDescriptors.CalcNumRotatableBonds(mol) <= 8 and
+ 15 <= rdMolDescriptors.CalcNumHeavyAtoms(mol) <= 50
+ )
+
+
+def ghose(mol):
+ """
+ Ghose drug like filter:
+ Molecular weight between 160 and 480
+ LogP between -0.4 and +5.6
+ Atom count between 20 and 70
+ Molar refractivity between 40 and 130
+ """
+ return (
+ 160 <= Descriptors.MolWt(mol) <= 480 and
+ -0.4 <= Crippen.MolLogP(mol) <= 5.6 and
+ 20 <= rdMolDescriptors.CalcNumAtoms(mol) <= 70 and
+ 40 <= Crippen.MolMR(mol) <= 130
+ )
+
+
+def veber(mol):
+ """
+ The Veber filter is a rule of thumb filter for orally active drugs described in
+ Veber et al., J Med Chem. 2002; 45(12): 2615-23.:
+ Rotatable bonds <= 10
+ Topological polar surface area <= 140
+ """
+ return (
+ rdMolDescriptors.CalcNumRotatableBonds(mol) <= 10 and
+ rdMolDescriptors.CalcTPSA(mol) <= 140
+ )
+
+
+def rule_of_three(mol):
+ """
+ Rule of Three filter (Congreve et al., Drug Discov. Today. 8 (19): 876–7, (2003).):
+ Molecular weight <= 300
+ LogP <= 3
+ H-bond donor <= 3
+ H-bond acceptor count <= 3
+ Rotatable bond count <= 3
+ """
+ return (
+ Descriptors.MolWt(mol) <= 300 and
+ Crippen.MolLogP(mol) <= 3 and
+ Lipinski.NumHDonors(mol) <= 3 and
+ Lipinski.NumHAcceptors(mol) <= 3 and
+ rdMolDescriptors.CalcNumRotatableBonds(mol) <= 3
+ )
+
+
+@cache
+def load_smarts_patterns(smarts_path):
+ # Load the CSV file containing SMARTS patterns
+ smarts_df = pd.read_csv(Path(smarts_path))
+ # Convert all SMARTS patterns to molecules
+ smarts_mols = [Chem.MolFromSmarts(smarts) for smarts in smarts_df['smarts']]
+ return smarts_mols
+
+
+def smarts_filter(mol, smarts_mols):
+ for smarts_mol in smarts_mols:
+ if smarts_mol is not None and mol.HasSubstructMatch(smarts_mol):
+ return False
+ return True
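+
+
+# The structural-alert filters below (PAINS, MLSMR, Dundee, Glaxo, BMS) all reduce to
+# this check: a molecule passes (True) only if it matches none of the alert SMARTS, e.g.
+#   smarts_filter(mol, load_smarts_patterns('data/filters/pains.csv'))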
+
+
+def pains(mol):
+ smarts_mols = load_smarts_patterns("data/filters/pains.csv")
+ return smarts_filter(mol, smarts_mols)
+
+
+def mlsmr(mol):
+ smarts_mols = load_smarts_patterns("data/filters/mlsmr.csv")
+ return smarts_filter(mol, smarts_mols)
+
+
+def dundee(mol):
+ smarts_mols = load_smarts_patterns("data/filters/dundee.csv")
+ return smarts_filter(mol, smarts_mols)
+
+
+def glaxo(mol):
+ smarts_mols = load_smarts_patterns("data/filters/glaxo.csv")
+ return smarts_filter(mol, smarts_mols)
+
+
+def bms(mol):
+ smarts_mols = load_smarts_patterns("data/filters/bms.csv")
+ return smarts_filter(mol, smarts_mols)
+
+
+SCORE_MAP = {
+ 'SAscore': sascorer.calculateScore,
+ 'LogP': Crippen.MolLogP,
+ 'Molecular Weight': Descriptors.MolWt,
+ 'Number of Atoms': rdMolDescriptors.CalcNumAtoms,
+ 'Number of Heavy Atoms': rdMolDescriptors.CalcNumHeavyAtoms,
+ 'Molar Refractivity': Crippen.MolMR,
+ 'H-Bond Donor Count': Lipinski.NumHDonors,
+ 'H-Bond Acceptor Count': Lipinski.NumHAcceptors,
+ 'Rotatable Bond Count': rdMolDescriptors.CalcNumRotatableBonds,
+ 'Topological Polar Surface Area': rdMolDescriptors.CalcTPSA,
+}
+
+FILTER_MAP = {
+ # TODO support number_of_violations
+ 'REOS': reos,
+ "Lipinski's Rule of Five": lipinski,
+ 'Ghose': ghose,
+ 'Rule of Three': rule_of_three,
+ 'Veber': veber,
+ 'PAINS': pains,
+ 'MLSMR': mlsmr,
+ 'Dundee': dundee,
+ 'Glaxo': glaxo,
+ 'BMS': bms,
+}
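+
+
+# Both maps are applied per molecule when building reports; a minimal sketch:
+#   mol = Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O')
+#   scores = {name: fn(mol) for name, fn in SCORE_MAP.items()}   # numeric descriptors
+#   passes = {name: fn(mol) for name, fn in FILTER_MAP.items()}  # boolean filters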
+
+
+def validate_columns(df, mandatory_cols):
+ missing_cols = [col for col in mandatory_cols if col not in df.columns]
+ if missing_cols:
+ error_message = (f"The following mandatory columns are missing "
+ f"in the uploaded dataset: {str(mandatory_cols).strip('[]')}.")
+ raise ValueError(error_message)
+ else:
+ return
+
+
+def process_target_fasta(sequence):
+ try:
+ if sequence:
+ lines = sequence.strip().split("\n")
+ if lines[0].startswith(">"):
+ lines = lines[1:]
+ return ''.join(lines).split(">")[0].strip()
+ else:
+ raise ValueError('Empty FASTA sequence.')
+ except Exception as e:
+ raise gr.Error(f'Failed to process FASTA due to error: {str(e)}')
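+
+
+# Example (illustrative): header lines are stripped and only the first record is kept:
+#   process_target_fasta('>sp|Q16539|MK14_HUMAN\nMSQERPTFYR\n>second\nAAAA')
+#   # -> 'MSQERPTFYR'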
+
+
+def send_email(job_info):
+ if job_info.get('email'):
+ try:
+ email_info = job_info.copy()
+ email_serv = os.getenv('EMAIL_SERV')
+ email_port = os.getenv('EMAIL_PORT')
+ email_addr = os.getenv('EMAIL_ADDR')
+ email_pass = os.getenv('EMAIL_PASS')
+ email_form = os.getenv('EMAIL_FORM')
+ email_subj = os.getenv('EMAIL_SUBJ')
+
+ for key, value in email_info.items():
+ if key.endswith("time") and value:
+ email_info[key] = ts_to_str(value, get_timezone_by_ip(email_info['ip']))
+
+ server = smtplib.SMTP(email_serv, int(email_port))
+ # server.starttls()
+
+ server.login(email_addr, email_pass)
+ msg = MIMEMultipart("alternative")
+ msg["From"] = email_addr
+ msg["To"] = email_info['email']
+ msg["Subject"] = email_subj.format(**email_info)
+ msg["Date"] = formatdate(localtime=True)
+ msg["Message-ID"] = make_msgid()
+
+ msg.attach(MIMEText(markdown(email_form.format(**email_info)), 'html'))
+ msg.attach(MIMEText(email_form.format(**email_info), 'plain'))
+
+ server.sendmail(email_addr, email_info['email'], msg.as_string())
+ server.quit()
+ gr.Info('Email notification sent.')
+ except Exception as e:
+ gr.Warning('Failed to send email notification due to error: ' + str(e))
+
+
+def check_user_running_job(email, request):
+ message = ("You already have a running prediction job (ID: {id}) under this {reason}. "
+ "Please wait for it to complete before submitting another job.")
+ try:
+ # check if a job is running for the email
+ Job = Query()
+ if email:
+ job = db.search((Job.email == email) & (Job.status == "RUNNING"))
+ if job:
+ return message.format(id=job[0]['id'], reason="email")
+ # check if a job is running for the session
+ elif request.cookies:
+ for key, value in request.cookies.items():
+ job = db.search((Job.cookies[key] == value) & (Job.status == "RUNNING"))
+ if job:
+ return message.format(id=job[0]['id'], reason="session")
+ # check if a job is running for the IP
+ else:
+ job = db.search((Job.IP == request.client.host) & (Job.status == "RUNNING"))
+ if job:
+ return message.format(id=job[0]['id'], reason="IP")
+
+ return False
+ except Exception as e:
+ raise gr.Error(f'Failed to validate user running jobs due to error: {str(e)}')
+
+
+def get_timezone_by_ip(ip):
+ try:
+ data = session.get(f'https://worldtimeapi.org/api/ip/{ip}').json()
+ return data['timezone']
+ except Exception:
+ return 'UTC'
+
+
+def ts_to_str(timestamp, timezone):
+ # Create a timezone-aware datetime object from the UNIX timestamp
+ dt = datetime.fromtimestamp(timestamp, pytz.utc)
+
+ # Convert the timezone-aware datetime object to the target timezone
+ target_timezone = pytz.timezone(timezone)
+ localized_dt = dt.astimezone(target_timezone)
+
+ # Format the datetime object to the specified string format
+ return localized_dt.strftime('%Y-%m-%d %H:%M:%S (%Z%z)')
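+
+
+# Example (illustrative):
+#   ts_to_str(1704067200, 'Europe/London')  # -> '2024-01-01 00:00:00 (GMT+0000)'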
+
+
+def lookup_job(job_id):
+ gr.Info('Start querying the job database...')
+ stop = False
+ retry = 0
+ while not stop:
+ try:
+ sleep(5)
+ Job = Query()
+ jobs = db.search((Job.id == job_id))
+ if jobs:
+ job = jobs[0]
+ job_status = job['status']
+ job_type = job['type']
+                error = job.get('error')
+ start_time = ts_to_str(job['start_time'], get_timezone_by_ip(job['ip']))
+ if job.get('end_time'):
+ end_time = ts_to_str(job['end_time'], get_timezone_by_ip(job['ip']))
+ if job.get('expiry_time'):
+ expiry_time = ts_to_str(job['expiry_time'], get_timezone_by_ip(job['ip']))
+ if job_status == "RUNNING":
+ yield {
+ pred_lookup_status: f'''
+Your **{job_type}** job (ID: **{job_id}**) started at
+**{start_time}** and is **RUNNING...**
+
+It might take from a few minutes up to a few hours depending on the prediction dataset, the model, and the queue status.
+You may keep the page open and wait for job completion, or close the page and revisit later to look up the job status
+using the job ID. If you provided an email address, you will also receive a notification once the job is done.
+''',
+ pred_lookup_btn: gr.Button(visible=False),
+ pred_lookup_stop_btn: gr.Button(visible=True)
+ }
+ if job_status == "COMPLETED":
+ stop = True
+ msg = f"Your {job_type} job (ID: {job_id}) has been **COMPLETED**"
+ msg += f" at {end_time}" if job.get('end_time') else ""
+ msg += f" and the results will expire by {expiry_time}." if job.get('expiry_time') else "."
+                    msg += ' Redirecting to the report page...'
+
+ gr.Info(msg)
+ yield {
+ pred_lookup_status: msg,
+ pred_lookup_btn: gr.Button(visible=True),
+ pred_lookup_stop_btn: gr.Button(visible=False),
+ tabs: gr.Tabs(selected='Chemical Property Report'),
+ file_for_report: job['output_file']
+ }
+ if job_status == "FAILED":
+ stop = True
+ msg = f'Your {job_type} job (ID: {job_id}) has **FAILED**'
+ msg += f' at {end_time}' if job.get('end_time') else ''
+                    msg += f' due to error: {error}.' if error else '.'
+ gr.Info(msg)
+ yield {
+ pred_lookup_status: msg,
+ pred_lookup_btn: gr.Button(visible=True),
+ pred_lookup_stop_btn: gr.Button(visible=False),
+ tabs: gr.Tabs(selected='Prediction Status Lookup'),
+ }
+ else:
+ stop = (retry > 3)
+ if not stop:
+ msg = f'Job ID {job_id} not found. Retrying... ({retry})'
+ else:
+ msg = f'Job ID {job_id} not found after {retry} retries. Please check the job ID and try again.'
+ gr.Info(msg)
+ retry += 1
+ yield {
+ pred_lookup_status: msg,
+ pred_lookup_btn: gr.Button(visible=True),
+ pred_lookup_stop_btn: gr.Button(visible=False),
+ tabs: gr.Tabs(selected='Prediction Status Lookup'),
+ }
+
+ except Exception as e:
+ raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
+
+
+def apply_advanced_opts(prediction_df, opts, df_training):
+ # Advanced options for Drug Hit Screening
+ if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
+ x2 = prediction_df['X2'].iloc[0]
+
+ prediction_df[[
+ 'Max. Sequence Identity to Training Targets',
+ 'Max. Id. Training Target'
+ ]] = pd.Series(max_sequence_identity(x2, df_training))
+
+ if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
+ x2 = prediction_df['X2'].iloc[0]
+ pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
+ pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
+
+ @cache
+ def max_sim(smiles):
+ return max_tanimoto_similarity(smiles, pos_compounds_df)
+
+ prediction_df[[
+ 'Max. Tanimoto Similarity to Known Ligands',
+ 'Max. Sim. Ligand'
+ ]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
+
+ max_sim.cache_clear()
+
+ if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
+ x2 = prediction_df['X2'].iloc[0]
+ prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
+
+ @cache
+ def max_id(compound):
+ pos_targets_df = df_training.loc[df_training['X1'] == compound]
+ return max_sequence_identity(x2, pos_targets_df)
+
+ prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
+ 'Max. Id. Target']] = (
+ prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
+ )
+ prediction_df.drop(['X1^'], axis=1, inplace=True)
+
+ max_id.cache_clear()
+
+ # Advanced options for Target Protein Identification
+ if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
+ x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+ if x1 not in df_training['X1'].values:
+ df_training['FP'] = df_training['X1'].parallel_apply(smiles_to_ecfp)
+
+ prediction_df[[
+ 'Max. Tanimoto Similarity to Training Compounds',
+ 'Max. Sim. Training Compound'
+ ]] = pd.Series(max_tanimoto_similarity(x1, df_training))
+
+ if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
+ x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+ pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
+
+ @cache
+ def max_id(fasta):
+ return max_sequence_identity(fasta, pos_targets_df)
+
+ prediction_df[[
+ 'Max. Sequence Identity to Known Targets of Input Compound',
+ 'Max. Id. Target'
+ ]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
+
+ max_id.cache_clear()
+
+ if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
+ x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+
+ @cache
+ def max_sim(fasta):
+ pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
+ if x1 not in pos_targets_df['X1'].values:
+ pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp)
+ return max_tanimoto_similarity(x1, pos_targets_df)
+
+ prediction_df[[
+ 'Max. Tanimoto Similarity to Known Ligands of Identified Target',
+ 'Max. Sim. Ligand'
+ ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
+
+ max_sim.cache_clear()
+
+ return prediction_df
+
+
+def submit_predict(predict_filepath, task, preset, target_family, opts, job_info):
+ job_id = job_info['id']
+ status = job_info['status']
+ send_email(job_info)
+ db.insert(job_info)
+ error = None
+ task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
+ predictions_file = None
+ df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
+ df_training['X1^'] = df_training['X1']
+ orig_df = pd.read_csv(predict_filepath)
+ alignment_df = get_fasta_family_map()
+ prediction_df = pd.DataFrame()
+
+ @cache
+ def detect_family(query):
+ # Check for an exact match first
+ exact_match = alignment_df[alignment_df['X2'] == query]
+ if not exact_match.empty:
+ row = exact_match.iloc[0]
+ return row['Target Family']
+ # If no exact match, then calculate alignment score
+ else:
+ aligner = PairwiseAligner()
+ aligner.mode = 'local'
+
+ def align_score(target):
+ alignment = aligner.align(query, target)
+ return alignment.score / max(len(query), len(target))
+
+ alignment_df['score'] = alignment_df['X2'].apply(align_score)
+ row = alignment_df.loc[alignment_df['score'].idxmax()]
+ return row['Target Family']
+
+ if 'Target Family' not in orig_df.columns:
+ orig_df['Target Family'] = None
+ if orig_df['Target Family'].isna().any():
+ if orig_df['X2'].nunique() > 1:
+ orig_df = orig_df.reset_index(drop=True)
+ orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
+ orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
+ )
+ else:
+ orig_df['Target Family'] = detect_family(orig_df['X2'].iloc[0])
+ orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
+ detect_family.cache_clear()
+
+ orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize)
+
+ orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
+ annotated_df = orig_df[~orig_df['Y'].isna()].copy()
+ annotated_df.rename(columns={'Y': 'Y^'}, inplace=True)
+ annotated_df['Source'] = 'Database'
+ columns_to_drop = ['X1^', 'Compound', 'Scaffold', 'Scaffold SMILES']
+ columns_to_drop = [col for col in columns_to_drop if col in annotated_df.columns]
+ annotated_df.drop(columns_to_drop, axis=1, inplace=True)
+
+ # Save the unannotated data
+ unannotated_df = orig_df[orig_df['Y'].isna()].drop(['Y'], axis=1)
+ if not unannotated_df.empty:
+ unannotated_df.to_csv(predict_filepath, index=False, na_rep='')
+ else:
+        # All records were found in the training database; write them out directly
+        # (the results file naming scheme here is assumed to mirror the one used below)
+        predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_annotated_predictions.csv'
+        annotated_df.to_csv(predictions_file, index=False, na_rep='')
+        status = "COMPLETED"
+        return {run_state: False}
+
+ columns_to_drop = ['ID1', 'X1^', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^']
+ columns_to_drop = [col for col in columns_to_drop if col in orig_df.columns]
+ orig_df.drop(columns_to_drop, axis=1, inplace=True)
+
+ try:
+ if target_family != 'Family-Specific Auto-Recommendation':
+ target_family_value = TARGET_FAMILY_MAP[target_family.title()]
+ task_value = TASK_MAP[task]
+ preset_value = PRESET_MAP[preset]
+ predictions_file = (f'{SERVER_DATA_DIR}/'
+ f'{job_id}_{task_file_abbr[task]}_{preset}_{target_family_value}_predictions.csv')
+
+ cfg = hydra.compose(
+ config_name="webserver_inference",
+ overrides=[f"task={task_value}",
+ f"preset={preset_value}",
+ f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family_value}.ckpt",
+ f"data.data_file='{str(predict_filepath)}'"])
+
+ predictions, _ = predict(cfg)
+ predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
+ predictions['Source'] = f'Predicted ({preset} {target_family})'
+ df_list = [prediction_df, predictions]
+ prediction_df = pd.concat([df for df in df_list if not df.empty])
+
+ else:
+ predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_family-recommended_predictions.csv'
+ task_value = TASK_MAP[task]
+ score = TASK_METRIC_MAP[task]
+ benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
+ predict_df = pd.read_csv(predict_filepath)
+
+ for family, subset in predict_df.groupby('Target Family'):
+ predict_subset_filepath = os.path.join(
+ os.path.dirname(predict_filepath), f'{job_id}_{family}_input.csv'
+ )
+ subset.to_csv(predict_subset_filepath, index=False, na_rep='')
+
+ seen_compounds = get_seen_smiles(family, task_value)['X1'].values
+ if subset['X1^'].iloc[0] in seen_compounds:
+ scenario = "Seen Compound"
+ else:
+ scenario = "Unseen Compound"
+
+ filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
+ & (benchmark_df['Scenario'] == scenario)
+ & (benchmark_df['Type'] == 'Family')]
+
+ seen_compounds = get_seen_smiles('General', task_value)['X1'].values
+ if subset['X1^'].iloc[0] in seen_compounds:
+ scenario = "Seen Compound"
+ else:
+ scenario = "Unseen Compound"
+
+ filtered_df = pd.concat([
+ filtered_df,
+ benchmark_df[(benchmark_df['Family'] == family.title())
+ & (benchmark_df['Scenario'] == scenario)
+ & (benchmark_df['Type'] == 'General')]
+ ])
+
+ row = filtered_df.loc[filtered_df[score].idxmax()]
+ preset_value = PRESET_MAP[row['Model']]
+ target_family = TARGET_FAMILY_MAP[family.title()] if row['Type'] == 'Family' else 'general'
+ cfg = hydra.compose(
+ config_name="webserver_inference",
+ overrides=[f"task={task_value}",
+ f"preset={preset_value}",
+ f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
+ f"data.data_file='{str(predict_subset_filepath)}'"])
+
+ predictions, _ = predict(cfg)
+ predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
+ predictions['Source'] = (f'Predicted ({row["Model"]} '
+ f'{family.title() if row["Type"] == "Family" else "General"})')
+ df_list = [prediction_df, predictions]
+ prediction_df = pd.concat([df for df in df_list if not df.empty])
+
+ prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
+ df_list = [prediction_df, annotated_df]
+ prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
+
+ prediction_df = apply_advanced_opts(prediction_df, opts, df_training)
+
+ prediction_df.drop(['N', 'FP'], axis=1, errors='ignore').to_csv(predictions_file, index=False, na_rep='')
+ status = 'COMPLETED'
+
+ return {run_state: False}
+
+ except Exception as e:
+ gr.Warning(f"Prediction job failed due to error: {str(e)}")
+ status = "FAILED"
+ predictions_file = None
+ error = str(e)
+ return {run_state: False}
+
+ finally:
+ Job = Query()
+ job_query = (Job.id == job_id)
+
+ end_time = time()
+ expiry_time = end_time + DB_EXPIRY
+
+ db.update({'end_time': end_time,
+ 'expiry_time': expiry_time,
+ 'status': status,
+ 'error': error,
+ 'input_file': predict_filepath,
+ 'output_file': predictions_file},
+ job_query)
+        # Guard against an empty result instead of indexing unconditionally
+        if results := db.search(job_query):
+            job_info = results[0]
+            if job_info.get('email'):
+                send_email(job_info)
+
+
+def update_df(file, progress=gr.Progress(track_tqdm=True)):
+ if file and Path(file).is_file():
+        task = None
+        job = None
+        opts = []  # advanced options for the detected job type, if any
+
+ if "_CPI_" in str(file):
+ task = 'Compound-Protein Interaction'
+ elif "_CPA_" in str(file):
+ task = 'Compound-Protein Binding Affinity'
+
+ df = pd.read_csv(file)
+
+ if 'N' in df.columns:
+ df.set_index('N', inplace=True)
+
+ if not any(col in ['X1', 'X2'] for col in df.columns):
+ gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
+ return {analyze_btn: gr.Button(interactive=False)}
+
+ if 'X1' in df.columns:
+ if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
+ df['Compound'] = df['X1'].parallel_apply(
+ lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
+ df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
+ df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
+ df['Pharmacophore'] = None
+ if task == 'Compound-Protein Binding Affinity':
+                # Convert Y^ from pIC50 (dimensionless, molar-based) to IC50 in nM
+ if 'Y^' in df.columns:
+ df['Y^'] = 10 ** (-df['Y^']) * 1e9
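+                    # e.g., pIC50 = 7 -> 10 ** -7 M = 1e-7 * 1e9 nM = 100 nM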
+
+ n_compound = df['X1'].nunique()
+ n_protein = df['X2'].nunique()
+
+ if n_compound == 1 and n_protein >= 2:
+ job = 'Target Protein Identification'
+ if task == 'Compound-Protein Interaction':
+ opts = TARGET_IDENTIFY_CPI_OPTS
+ elif task == 'Compound-Protein Binding Affinity':
+ opts = TARGET_IDENTIFY_CPA_OPTS
+ if n_compound >= 2 and n_protein == 1:
+ job = 'Drug Hit Screening'
+ if task == 'Compound-Protein Interaction':
+ opts = DRUG_SCRENN_CPI_OPTS
+ elif task == 'Compound-Protein Binding Affinity':
+ opts = DRUG_SCRENN_CPA_OPTS
+
+ return {
+ html_report: create_html_report(df, file=None, task=task),
+ raw_df: df,
+ report_df: df.copy(),
+ analyze_btn: gr.Button(interactive=True),
+ report_task: task,
+ job_opts: gr.CheckboxGroup(
+ label=f'{job} Advanced Options',
+ choices=opts, visible=True
+ ) if job else gr.CheckboxGroup(visible=False),
+ }
+ else:
+ return {analyze_btn: gr.Button(interactive=False)}
+
+
+def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(track_tqdm=True)):
+ df_html = df.copy(deep=True)
+ column_aliases = COLUMN_ALIASES.copy()
+ cols_left = list(pd.Index([
+ 'ID1', 'ID2', 'Compound', 'Scaffold', 'Pharmacophore', 'X1', 'Scaffold SMILES', 'X2', 'Y^'
+ ]).intersection(df_html.columns))
+ # cols_right = list(pd.Index(['X1', 'X2']).intersection(df_html.columns))
+ # df_html = df_html[cols_left + (df_html.columns.drop(cols_left + cols_right).tolist()) + cols_right]
+ df_html = df_html[cols_left + df_html.columns.drop(cols_left).tolist()]
+
+ if isinstance(task, str):
+ column_aliases.update({
+ 'Y^': 'Interaction Probability' if task == 'Compound-Protein Interaction'
+ else 'Binding Affinity (IC50 [nM])'
+ })
+
+        ascending = column_aliases['Y^'] == 'Binding Affinity (IC50 [nM])'
+ df_html = df_html.sort_values(
+ [col for col in ['Y^'] if col in df_html.columns], ascending=ascending
+ )
+
+ if not file:
+        df_html = df_html.iloc[:30]  # preview shows at most 30 records
+
+ # Remove repeated info for one-against-N tasks to save visual and physical space
+ job = 'Chemical Property'
+ unique_entity = 'Unique Entity'
+ unique_df = None
+ category = None
+ columns_unique = None
+
+    if 'Compound' in df_html.columns and 'Exclude Pharmacophore 3D' not in opts:
+        df_html['Pharmacophore'] = df_html['Compound'].parallel_apply(
+            lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
+
+ if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
+ df_html['Compound'] = df_html['Compound'].parallel_apply(
+ lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
+ else:
+ df_html.drop(['Compound'], axis=1, inplace=True)
+
+ if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
+ df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
+ lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
+ else:
+ df_html.drop(['Scaffold'], axis=1, inplace=True)
+
+ if 'X1' in df_html.columns and 'X2' in df_html.columns:
+ n_compound = df_html['X1'].nunique()
+ n_protein = df_html['X2'].nunique()
+
+ if n_compound == 1 and n_protein >= 2:
+ unique_entity = 'Compound of Interest'
+ if any(col in df_html.columns for col in ['Y^', 'Y']):
+ job = 'Target Protein Identification'
+ category = 'Target Family'
+ columns_unique = df_html.columns.isin(
+ ['ID1', 'Compound', 'Scaffold', 'X1', 'Scaffold SMILES', 'Pharmacophore',
+ 'Max. Tanimoto Similarity to Training Compounds', 'Max. Sim. Training Compound']
+ + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys())
+ )
+
+ elif n_compound >= 2 and n_protein == 1:
+ unique_entity = 'Target of Interest'
+ if any(col in df_html.columns for col in ['Y^', 'Y']):
+ job = 'Drug Hit Screening'
+ category = 'Scaffold SMILES'
+ columns_unique = df_html.columns.isin(
+ ['X2', 'ID2', 'Max. Sequence Identity to Training Targets', 'Max. Id. Training Target']
+ )
+
+ elif 'Y^' in df_html.columns:
+ job = 'Interaction Pair Inference'
+
+ df_html.rename(columns=column_aliases, inplace=True)
+ df_html.index.name = 'Index'
+ if 'Target FASTA' in df_html.columns:
+ df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
+ lambda x: wrap_text(x) if not pd.isna(x) else x)
+
+ num_cols = df_html.select_dtypes('number').columns
+ num_col_colors = sns.color_palette('husl', len(num_cols))
+ bool_cols = df_html.select_dtypes(bool).columns
+ bool_col_colors = {True: 'lightgreen', False: 'lightpink'}
+
+ if columns_unique is not None:
+ unique_df = df_html.loc[:, columns_unique].iloc[[0]].copy()
+ df_html = df_html.loc[:, ~columns_unique]
+ df_html.dropna(how='all', axis=1, inplace=True)
+ unique_df.dropna(how='all', axis=1, inplace=True)
+
+ if not file:
+ if 'Compound ID' in df_html.columns:
+ df_html.drop(['Compound SMILES'], axis=1, inplace=True)
+ if 'Target ID' in df_html.columns:
+ df_html.drop(['Target FASTA'], axis=1, inplace=True)
+ if 'Target FASTA' in df_html.columns:
+ df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
+ lambda x: wrap_text(x) if not pd.isna(x) else x)
+ if 'Scaffold SMILES' in df_html.columns:
+ df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
+
+ # FIXME: Temporarily drop pharmacophore column before an image solution is found
+ if 'Pharmacophore' in df_html.columns:
+ df_html.drop(['Pharmacophore'], axis=1, inplace=True)
+ if unique_df is not None and 'Pharmacophore' in unique_df.columns:
+ unique_df.drop(['Pharmacophore'], axis=1, inplace=True)
+
+ styled_df = df_html.fillna('').style.format(precision=3)
+
+    for i, col in enumerate(num_cols):
+        if col not in df_html.columns:
+            continue
+        cmap = sns.light_palette(num_col_colors[i], as_cmap=True)
+        if col == 'Binding Affinity (IC50 [nM])':
+            cmap = cmap.reversed()  # lower IC50 (stronger binding) is better
+        cmap.set_bad('white')
+        styled_df = styled_df.background_gradient(subset=[col], cmap=cmap)
+
+ if any(df_html.columns.isin(bool_cols)):
+        styled_df = styled_df.applymap(
+            lambda val: f'background-color: {bool_col_colors[val]}', subset=bool_cols)
+
+ table_html = styled_df.to_html()
+ unique_html = ''
+ if unique_df is not None:
+ if 'Target FASTA' in unique_df.columns:
+            unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '<br>')
+
+ if 'Max. Sequence Identity to Training Targets' in unique_df.columns:
+ # Add alert emoji for sequence identity below 0.85
+ if unique_df['Max. Sequence Identity to Training Targets'].iloc[0] < 0.85:
+ unique_df['Max. Sequence Identity to Training Targets'] = (
+ unique_df['Max. Sequence Identity to Training Targets'].apply(
+                            lambda x: f'{x:.3f}'
+                                      f' ⚠️ Lower than recommended (0.85)'
+                                      f' - predictive reliability may be compromised'
+                        )
+                    )
+
+ if 'Max. Tanimoto Similarity to Training Compounds' in unique_df.columns:
+                # Add alert emoji for Tanimoto similarity below 0.85
+                if unique_df['Max. Tanimoto Similarity to Training Compounds'].iloc[0] < 0.85:
+                    unique_df['Max. Tanimoto Similarity to Training Compounds'] = (
+                        unique_df['Max. Tanimoto Similarity to Training Compounds'].apply(
+                            lambda x: f'{x:.3f}'
+                                      f' ⚠️ Lower than recommended (0.85)'
+                                      f' - predictive reliability may be compromised'
+                        )
+                    )
+
+ if any(unique_df.columns.isin(bool_cols)):
+ unique_df = unique_df.style.applymap(
+ lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols)
+            unique_html = f'<div>{unique_df.to_html(escape=False, index=False)}</div>'
+
+    return (f'<h2>{job} Report Preview (Top 30 Records)</h2>'
+            f'<div>{unique_html}</div>'
+            f'<div>{table_html}</div>')
+
+ else:
+        # Link targets below are assumed reconstructions: UniProt entry page and
+        # PubChem search URL patterns
+        image_zoom_formatter = HTMLTemplateFormatter(
+            template='<div class="image-zoom-viewer"><%= value %></div>')
+        uniprot_id_formatter = HTMLTemplateFormatter(
+            template='<% if (value == value) { '  # Check if value is not NaN
+                     'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
+                     # Check if value is a valid UniProt ID
+                     '{ %><a href="https://www.uniprot.org/uniprotkb/<%= value %>" target="_blank">'
+                     '<%= value %></a><% '
+                     # Else treat it as a sequence or other plain-text string, line-wrapping every 60 characters
+                     '} else { %><%= value.match(/.{1,60}/g).join("<br>") '
+                     '%><% } %><% } else { %><% } %>'  # Output empty string if value is NaN
+        )
+        pubchem_id_formatter = HTMLTemplateFormatter(
+            template='<% if (value == value) { '  # Check if value is not NaN
+                     '%><a href="https://pubchem.ncbi.nlm.nih.gov/#query=<%= value %>" target="_blank">'
+                     '<%= value %></a>'
+                     '<% } else { %><% } %>'  # Output empty string if value is NaN
+        )
+        alert_emoji_formatter = HTMLTemplateFormatter(
+            template='<% if (value < 0.85) { '
+                     '%><%= value %> '
+                     '⚠️ Lower than recommended (0.85) - predictive reliability may be compromised<% '
+                     '} else { %><%= value %><% } %>'
+        )
+ bool_formatters = {col: BooleanFormatter() for col in bool_cols}
+ float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
+ other_formatters = {
+ 'Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
+ 'Compound': image_zoom_formatter,
+ 'Scaffold': image_zoom_formatter,
+ 'Pharmacophore': {'type': 'executeScriptFormatter'},
+ 'Target FASTA': {'type': 'textarea', 'width': 60},
+ 'Target ID': uniprot_id_formatter,
+ 'Compound ID': pubchem_id_formatter,
+ 'Max. Sim. Ligand': pubchem_id_formatter,
+ 'Max. Id. Target': uniprot_id_formatter,
+ 'Max. Sim. Training Compound': pubchem_id_formatter,
+ 'Max. Id. Training Target': uniprot_id_formatter,
+ 'Max. Sequence Identity to Training Targets': alert_emoji_formatter,
+ 'Max. Sequence Identity to Known Targets of Hit Compound': alert_emoji_formatter,
+ }
+ formatters = {**bool_formatters, **float_formatters, **other_formatters}
+
+ # html = df.to_html(file)
+ # return html
+
+ report_table = pn.widgets.Tabulator(
+ df_html, formatters=formatters,
+ frozen_columns=[
+ 'Index', 'Target ID', 'Compound ID', 'Compound'
+ ],
+ disabled=True, sizing_mode='stretch_both', pagination='local', page_size=10
+ )
+
+        for i, col in enumerate(num_cols):
+            if col == 'Interaction Probability':
+                continue  # already rendered by the progress-bar formatter
+            cmap = sns.light_palette(num_col_colors[i], as_cmap=True)
+            if col == 'Binding Affinity (IC50 [nM])':
+                cmap = cmap.reversed()  # lower IC50 (stronger binding) is better
+            cmap.set_bad(color='white')
+            report_table.style.background_gradient(
+                subset=df_html.columns == col, cmap=cmap)
+
+ pie_charts = {}
+ for y in df_html.columns.intersection(['Interaction Probability', 'Binding Affinity (IC50 [nM])']):
+ pie_charts[y] = []
+ for k in [10, 30, 100]:
+ if k < len(df_html):
+ pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=k))
+ pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=len(df_html)))
+
+ # Remove keys with empty values
+ pie_charts = {k: v for k, v in pie_charts.items() if any(v)}
+
+ panel_css = """
+ .tabulator {
+ font-family: Courier New !important;
+ font-weight: normal !important;
+ font-size: 12px !important;
+ }
+
+ .tabulator-cell {
+ overflow: visible !important;
+ align-content: center !important;
+ }
+
+ .tabulator-cell:hover {
+ z-index: 1000 !important;
+ }
+
+ .image-zoom-viewer {
+ display: inline-block;
+ overflow: visible;
+ z-index: 1000;
+ }
+
+ .image-zoom-viewer::after {
+ content: "";
+ top: 0;
+ left: 0;
+ width: 100%;
+ height: 100%;
+ pointer-events: none;
+ }
+
+ .image-zoom-viewer:hover::after {
+ pointer-events: all;
+ }
+
+ /* When hovering over the container, scale its child (the SVG) */
+ .tabulator-cell:hover .image-zoom-viewer svg {
+ padding: 3px;
+ position: absolute;
+ background-color: rgba(250, 250, 250, 0.854);
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.618);
+ border-radius: 3px;
+ transform: scale(3); /* Scale up the SVG */
+ transition: transform 0.3s ease;
+ pointer-events: none; /* Prevents the SVG from blocking mouse interactions */
+ z-index: 1000;
+ }
+ """
+
+ pn.extension(
+ raw_css=[panel_css],
+ js_files={'panel_custom': 'static/panel.js', '3Dmol': 'static/3Dmol-min.js'},
+ # js_modules={'3Dmol': 'static/3Dmol-min.js'},
+ inline=True,
+ )
+
+ template = pn.template.VanillaTemplate(
+ title=f'DeepSEQreen {job} Report',
+ sidebar=[],
+ favicon='deepseqreen.ico',
+ logo='deepseqreen.svg',
+ header_background='#F3F5F7',
+ header_color='#4372c4',
+ busy_indicator=None,
+ )
+
+ stats_pane = pn.Row()
+ if unique_df is not None:
+ unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width',
+ show_index=False, disabled=True,
+ frozen_columns=['Compound ID', 'Compound', 'Target ID'])
+ # if pie_charts:
+ # unique_table.width = 640
+ stats_pane.append(pn.Column(f'### {unique_entity}', unique_table))
+ if pie_charts:
+ for score_name, figure_list in pie_charts.items():
+ stats_pane.append(
+ pn.Column(f'### {category} by Top {score_name}',
+ pn.Tabs(*figure_list, tabs_location='above'))
+ # pn.Card(pn.Row(v), title=f'{category} by Top {k}')
+ )
+
+ if stats_pane:
+ template.main.append(pn.Card(stats_pane,
+ sizing_mode='stretch_width', title='Summary Statistics', margin=10))
+
+ template.main.append(
+ pn.Card(report_table, title=f'{job} Results', # width=1200,
+ margin=10)
+ )
+
+ template.save(file, title=f'DeepSEQreen {job} Report', resources=INLINE)
+ return file
+
+
+def create_pie_chart(df, category, value, top_k):
+ if category not in df or value not in df:
+ return
+ top_k_df = df.nlargest(top_k, value)
+ category_counts = top_k_df[category].value_counts()
+ data = pd.DataFrame({category: category_counts.index, 'value': category_counts.values})
+
+ data['proportion'] = data['value'] / data['value'].sum()
+ # Merge rows with proportion less than 0.2% into one row
+ mask = data['proportion'] < 0.002
+ if any(mask):
+ merged_row = data[mask].sum()
+ merged_row[category] = '...'
+ data = pd.concat([data[~mask], pd.DataFrame(merged_row).T])
+ data['angle'] = data['proportion'] * 2 * pi
+
+ color_dict = {cat: color for cat, color in
+ zip(df[category].unique(),
+ (Category20c_20 * (len(df[category].unique()) // 20 + 1))[:len(df[category].unique())])}
+ color_dict['...'] = '#636363'
+ data['color'] = data[category].map(color_dict)
+
+ tooltips = [
+ (f"{category}", f"@{{{category}}}"),
+ ("Count", "@value"),
+ ("Percentage", "@proportion{0.0%}")
+ ]
+
+ if category == 'Scaffold SMILES' and 'Scaffold' in df.columns:
+ data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
+ left_on='Scaffold SMILES', right_on='Scaffold SMILES')
+ tooltips.append(("Scaffold", "
@{Scaffold}{safe}
"))
+ p = figure(height=384, width=960, name=f"Top {top_k}" if top_k < len(df) else 'All', sizing_mode='stretch_height',
+ toolbar_location=None, tools="hover", tooltips=tooltips, x_range=(-0.4, 0.4))
+
+ def truncate_label(label, max_length=60):
+ return label if len(label) <= max_length else label[:max_length] + "..."
+
+ data['legend_field'] = data[category].apply(truncate_label)
+
+ p.add_layout(Legend(padding=0, margin=0), 'right')
+ p.wedge(x=0, y=1, radius=0.3,
+ start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
+ line_color="white", fill_color='color', legend_field='legend_field', source=data)
+
+ # Limit the number of legend items to 20 and add "..." if there are more than 20 items
+ if len(p.legend.items) > 20:
+ new_legend_items = p.legend.items[:20]
+ new_legend_items.append(LegendItem(label="..."))
+ p.legend.items = new_legend_items
+
+ p.legend.label_text_font_size = "10pt"
+ p.legend.label_text_font = "courier"
+ p.axis.axis_label = None
+ p.axis.visible = False
+ p.grid.grid_line_color = None
+ p.outline_line_width = 0
+ p.min_border = 0
+ p.margin = 0
+
+ return p
+
+
+def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progress(track_tqdm=True)):
+ df_report = df.copy()
+ try:
+ for filter_name in filter_list:
+ df_report[filter_name] = df_report['Compound'].parallel_apply(
+ lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
+
+ for score_name in score_list:
+ df_report[score_name] = df_report['Compound'].parallel_apply(
+ lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
+
+ if opt_list:
+ df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
+ df_report = apply_advanced_opts(df_report, opt_list, df_training)
+
+ return (create_html_report(df_report, file=None, task=task), df_report,
+ gr.File(visible=False), gr.File(visible=False))
+
+ except Exception as e:
+ gr.Warning(f'Failed to report results due to error: {str(e)}')
+ return None, None, None, None
+
+
+def wrap_text(text, line_length=60):
+ if isinstance(text, str):
+ wrapper = textwrap.TextWrapper(width=line_length)
+ if text.startswith('>'):
+ sections = text.split('>')
+ wrapped_sections = []
+ for section in sections:
+ if not section:
+ continue
+ lines = section.split('\n')
+ seq_header = lines[0]
+ wrapped_seq = wrapper.fill(''.join(lines[1:]))
+ wrapped_sections.append(f">{seq_header}\n{wrapped_seq}")
+ return '\n'.join(wrapped_sections)
+ else:
+ return wrapper.fill(text)
+ else:
+ return text
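+
+
+# Example (illustrative): long FASTA bodies are wrapped at 60 characters per line,
+# keeping each '>' header on its own line:
+#   wrap_text('>sp|P1\n' + 'M' * 130)
+#   # -> '>sp|P1\n' + 'M' * 60 + '\n' + 'M' * 60 + '\n' + 'M' * 10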
+
+
+def unwrap_text(text):
+    return text.strip().replace('\n', '')
+
+
+def drug_library_from_sdf(sdf_path):
+ return PandasTools.LoadSDF(
+ sdf_path,
+ smilesName='X1', molColName='Compound', includeFingerprints=True
+ )
+
+
+def process_target_library_upload(library_upload):
+ if library_upload.endswith('.csv'):
+ df = pd.read_csv(library_upload)
+ elif library_upload.endswith('.fasta'):
+ df = target_library_from_fasta(library_upload)
+ else:
+ raise gr.Error('Currently only CSV and FASTA files are supported as target libraries.')
+ validate_columns(df, ['X2'])
+ return df
+
+
+def process_drug_library_upload(library_upload):
+ if library_upload.endswith('.csv'):
+ df = pd.read_csv(library_upload)
+ elif library_upload.endswith('.sdf'):
+ df = drug_library_from_sdf(library_upload)
+ else:
+ raise gr.Error('Currently only CSV and SDF files are supported as drug libraries.')
+ validate_columns(df, ['X1'])
+ return df
+
+
+def target_library_from_fasta(fasta_path):
+ records = list(SeqIO.parse(fasta_path, "fasta"))
+ id2 = [record.id for record in records]
+ seq = [str(record.seq) for record in records]
+ df = pd.DataFrame({'ID2': id2, 'X2': seq})
+ return df
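+
+
+# Example (illustrative): a FASTA with records '>P1\nMKT...' and '>P2\nAYI...' yields a
+# two-row DataFrame with columns ID2 (record IDs) and X2 (sequences).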
+
+
+theme = gr.themes.Base(spacing_size="sm", text_size='md', font=gr.themes.GoogleFont("Roboto")).set(
+ background_fill_primary='#eef3f9',
+ background_fill_secondary='white',
+ checkbox_label_background_fill='#eef3f9',
+ checkbox_label_background_fill_hover='#dfe6f0',
+ checkbox_background_color='white',
+ checkbox_border_color='#4372c4',
+ border_color_primary='#4372c4',
+ border_color_accent='#2e6ab5',
+ button_primary_background_fill='#2e6ab4',
+ button_primary_text_color='white',
+ body_text_color='#28496F',
+ block_background_fill='#fbfcfd',
+ block_title_text_color='#28496F',
+ block_label_text_color='#28496F',
+ block_info_text_color='#505358',
+ block_border_color=None,
+ # input_border_color='#4372c4',
+ # panel_border_color='#4372c4',
+ input_background_fill='#F1F2F4',
+)
+
+with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48 * 3600)) as demo:
+ run_state = gr.State(value=False)
+ screen_flag = gr.State(value=False)
+ identify_flag = gr.State(value=False)
+ infer_flag = gr.State(value=False)
+
+ with gr.Tabs() as tabs:
+ with gr.TabItem(label='Drug Hit Screening', id='Drug Hit Screening'):
+            gr.Markdown('''
+            # Drug Hit Screening
+
+            To predict interactions or binding affinities of a single target against a compound library.
+            ''')
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Enter (paste) a amino acid sequence below manually or upload a FASTA file. "
+ "If multiple entities are in the FASTA, only the first will be used. "
+ "Alternatively, enter a Uniprot ID or gene symbol with organism and click Query for "
+ "the sequence."
+ )
+ target_input_type = gr.Dropdown(
+ label='Step 1. Select Target Input Type and Input',
+ choices=['Sequence', 'UniProt ID', 'Gene symbol'],
+ info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
+ value='Sequence',
+ scale=4, interactive=True
+ )
+
+ with gr.Row():
+ target_id = gr.Textbox(show_label=False, visible=False,
+ interactive=True, scale=4,
+ info='Enter a UniProt ID and query.')
+ target_gene = gr.Textbox(
+ show_label=False, visible=False,
+ interactive=True, scale=4,
+ info='Enter a gene symbol and query. The first record will be used.')
+ target_organism = gr.Textbox(
+ info='Organism scientific name (default: Homo sapiens).',
+ placeholder='Homo sapiens', show_label=False,
+ visible=False, interactive=True, scale=4, )
+ target_upload_btn = gr.UploadButton(label='Upload a FASTA File', type='binary',
+ visible=True, variant='primary',
+ size='lg')
+ target_paste_markdown = gr.Button(value='OR Paste Your Sequence Below',
+ variant='secondary')
+ target_query_btn = gr.Button(value='Query the Sequence', variant='primary',
+ visible=False, scale=4)
+ # with gr.Row():
+ # example_uniprot = gr.Button(value='Example: Q16539', elem_classes='example', visible=False)
+ # example_gene = gr.Button(value='Example: MAPK14', elem_classes='example', visible=False)
+ example_fasta = gr.Button(value='Example: MAPK14 (Q16539)', elem_classes='example')
+ target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
+ # with gr.Row():
+ # with gr.Column():
+ # with gr.Column():
+ # gr.File(label='Example FASTA file',
+ # value='data/examples/MAPK14.fasta', interactive=False)
+
+ with gr.Row():
+ with gr.Column(min_width=200):
+ HelpTip(
+ "Click Auto-detect to identify the protein family using sequence alignment. "
+ "This optional step allows applying a family-specific model instead of a all-family "
+ "model (general). "
+ "Manually select general if the alignment results are unsatisfactory."
+ )
+ drug_screen_target_family = gr.Dropdown(
+ choices=list(TARGET_FAMILY_MAP.keys()),
+ value='General',
+ label='Step 2. Select Target Family (Optional)', interactive=True)
+ target_family_detect_btn = gr.Button(value='OR Let Us Auto-Detect for You',
+ variant='primary')
+ with gr.Column(min_width=200):
+ HelpTip(
+ "Interaction prediction provides you binding probability score between the target of "
+ "interest and each compound in the library, "
+ "while affinity prediction directly estimates their binding strength measured using "
+ "half maximal inhibitory concentration (IC
50) in units of nM."
+ )
+ drug_screen_task = gr.Dropdown(
+ list(TASK_MAP.keys()),
+ label='Step 3. Select a Prediction Task',
+ value='Compound-Protein Interaction')
+ with gr.Column(min_width=200):
+ HelpTip(
+ "Select your preferred model, or click Recommend for the best-performing model based "
+ "on the selected task, family, and whether the target was trained. "
+ "Please refer to documentation for detailed benchmark results."
+ )
+ drug_screen_preset = gr.Dropdown(
+ list(PRESET_MAP.keys()),
+ label='Step 4. Select a Preset Model')
+ screen_preset_recommend_btn = gr.Button(
+ value='OR Let Us Recommend for You', variant='primary')
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Select a preset compound library (e.g., DrugBank). "
+ "Alternatively, upload a CSV file with a column named X1 containing compound SMILES, "
+ "or use an SDF file (Max. 10,000 compounds per task). Example CSV and SDF files are "
+ "provided below and can be downloaded by clicking the lower right corner."
+ )
+ drug_library = gr.Dropdown(
+ label='Step 5. Select a Preset Compound Library',
+ choices=list(DRUG_LIBRARY_MAP.keys()))
+ with gr.Row():
+ gr.File(label='Example SDF compound library',
+ value='data/examples/compound_library.sdf', interactive=False)
+ gr.File(label='Example CSV compound library',
+ value='data/examples/compound_library.csv', interactive=False)
+ drug_library_upload_btn = gr.UploadButton(
+ label='OR Upload Your Own Library', variant='primary')
+ drug_library_upload = gr.File(label='Custom compound library file', visible=False)
+
+ with gr.Column():
+ HelpTip("""
+
Max. Sequence Identity between the Input Target and Targets in the Training Set:
+this serves as an indicator of the predictioon applicability/reliability –
+higher similarities indicate more reliable predictions (preferably > 0.85).
+
Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target:
+this serves as an indicator of both the confidence level and novelty of the predicted hit compounds –
+higher similarities suggest greater confidence, while lower Tanimoto similarities may indicate the novelty
+of the identified hit compounds compared to known drugs or true interacting compounds of the input target.
+
Max. Sequence Identity between the Input Target and Known Targets of Hit Compound:
+this serves as an additional indicator of the confidence level of the predicted hit compounds –
+higher identities usually lead to greater confidence in the predictions.
+""")
+ drug_screen_opts = gr.CheckboxGroup(
+ label="Step 6. Select Advanced Options",
+ value=DRUG_SCRENN_CPI_OPTS[0],
+ choices=DRUG_SCRENN_CPI_OPTS,
+ info="Advanced features - may increase the job computation time. "
+ "See the Help Tip on the right or the Documentation for detailed explanation.",
+
+ )
+ with gr.Row():
+ with gr.Column():
+ drug_screen_email = gr.Textbox(
+ label='Step 7. Input Your Email Address (Optional)',
+ info="Your email address will be used to notify you of the status of your job. "
+ "If you cannot receive the email, please check your spam/junk folder."
+ )
+
+ with gr.Row(visible=True):
+ with gr.Row():
+ drug_screen_clr_btn = gr.ClearButton(size='lg')
+ drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
+
+ screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
+
+ with gr.TabItem(label='Target Protein Identification', id='Target Protein Identification'):
+            gr.Markdown('''
+            # Target Protein Identification
+
+            To predict interactions or binding affinities of a single compound against a protein library.
+            ''')
+ with gr.Column() as identify_page:
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Enter (paste) a compound SMILES below manually or upload a SDF file. "
+ "If multiple entities are in the SDF, only the first will be used. "
+ "SMILES can be obtained by searching for the compound of interest in databases such "
+ "as NCBI, PubChem and and ChEMBL."
+ )
+ compound_type = gr.Dropdown(
+ label='Step 1. Select Compound Input Type and Input',
+ choices=['SMILES', 'SDF'],
+                            info='Enter (paste) a SMILES string or upload an SDF file to convert to SMILES.',
+ value='SMILES',
+ interactive=True)
+ compound_upload_btn = gr.UploadButton(
+                            label='OR Upload an SDF File', variant='primary', type='binary', visible=False)
+
+ compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
+ example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
+
+ with gr.Row():
+ with gr.Column(visible=True):
+ HelpTip(
+ "By default, models trained on all protein families (general) will be applied. "
+ "If you upload a target library containing proteins all in the same family, "
+ "you may manually select a Target Family."
+ )
+ # target_identify_target_family = gr.Dropdown(
+ # choices=['Family-Specific Auto-Recommendation'] + list(TARGET_FAMILY_MAP.keys()),
+ # value='Family-Specific Auto-Recommendation',
+ # label='Step 2. Select Target Family')
+ target_identify_target_family = gr.Dropdown(
+ choices=['General'],
+ value='General',
+ label='Step 2. Select Target Family')
+ with gr.Column():
+ HelpTip(
+ "Interaction prediction provides you binding probability score between the target of "
+ "interest and each compound in the library, while affinity prediction directly "
+ "estimates their binding strength measured using "
+ "half maximal inhibitory concentration (IC
50) in units of nM."
+ )
+ target_identify_task = gr.Dropdown(
+ list(TASK_MAP.keys()),
+ label='Step 3. Select a Prediction Task',
+ value='Compound-Protein Interaction')
+
+ with gr.Column():
+ HelpTip(
+ "Select your preferred model, or click Recommend for the best-performing model based "
+ "on the selected task and whether the compound was trained. By default, General-trained "
+ "model is used for Target Protein Identification. "
+ "Please refer to the documentation for detailed benchmark results."
+ )
+ # target_identify_preset = gr.Dropdown(
+ # choices=['Family-Specific Auto-Recommendation'] + list(PRESET_MAP.keys()),
+ # value='Family-Specific Auto-Recommendation',
+ # label='Step 4. Select a Preset Model')
+ target_identify_preset = gr.Dropdown(
+ choices=['DeepConvDTI', 'DrugBAN', 'HyperAttentionDTI'],
+ value='DrugBAN',
+ label='Step 4. Select a Preset Model')
+ identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
+ variant='primary')
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Select a preset target library (e.g., ChEMBL33_human_proteins). "
+ "Alternatively, upload a CSV file with a column named X2 containing target protein "
+ "sequences, or use an FASTA file (Max. 10,000 targets per task). "
+ "Example CSV and SDF files are provided below "
+ "and can be downloaded by clicking the lower right corner."
+ )
+ target_library = gr.Dropdown(
+ label='Step 5. Select a Preset Target Library',
+ choices=list(TARGET_LIBRARY_MAP.keys()))
+ with gr.Row():
+ gr.File(label='Example FASTA target library',
+ value='data/examples/target_library.fasta', interactive=False)
+ gr.File(label='Example CSV target library',
+ value='data/examples/target_library.csv', interactive=False)
+ target_library_upload_btn = gr.UploadButton(
+ label='OR Upload Your Own Library', variant='primary')
+ target_library_upload = gr.File(label='Custom target library file', visible=False)
+ with gr.Column():
+ HelpTip("""
+
Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set:
+this serves as an indicator of prediction applicability and reliability –
+higher similarities indicates more reliable predictions (ideally > 0.85).
+
Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound:
+this serves as an indicator of prediction confidence for the potential targets –
+higher similarities typically imply higher confidence levels.
+
Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target:
+this serves as an additional indicator of the confidence level in the predicted potential targets –
+higher similarities usually correspond to greater prediction confidence.
+""")
+ target_identify_opts = gr.CheckboxGroup(
+ choices=TARGET_IDENTIFY_CPI_OPTS,
+ value=TARGET_IDENTIFY_CPI_OPTS[0],
+ label='Step 6. Select Advanced Options',
+ info="Advanced features - may increase the job computation time. "
+ "See the Help Tip on the right or the Documentation for detailed explanation."
+ )
+ with gr.Row():
+ with gr.Column():
+ target_identify_email = gr.Textbox(
+ label='Step 7. Input Your Email Address (Optional)',
+ info="Your email address will be used to notify you of the status of your job. "
+ "If you cannot receive the email, please check your spam/junk folder."
+ )
+
+ with gr.Row(visible=True):
+ target_identify_clr_btn = gr.ClearButton(size='lg')
+ target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
+ size='lg')
+
+ identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
+
+ with gr.TabItem(label='Interaction Pair Inference', id='Interaction Pair Inference'):
+            gr.Markdown('''
+            # Interaction Pair Inference
+
+            To predict interactions or binding affinities for up to 10,000 compound-protein pairs.
+            ''')
+ HelpTip(
+ "A custom interation pair dataset can be a CSV file with 2 required columns "
+ "(X1 for smiles and X2 for sequences) "
+ "and optionally 2 ID columns (ID1 for compound ID and ID2 for target ID), "
+ "or generated from a FASTA file containing multiple "
+ "sequences and a SDF file containing multiple compounds. "
+ "Currently, a maximum of 10,000 pairs is supported, "
+ "which means that the size of CSV file or "
+ "the product of the two library sizes should not exceed 10,000."
+ )
+ infer_type = gr.Dropdown(
+ choices=['Upload a CSV file containing paired compound-protein data',
+ 'Upload a compound library and a target library'],
+ label='Step 1. Select Pair Input Type and Input',
+ value='Upload a CSV file containing paired compound-protein data')
+ with gr.Column() as pair_upload:
+ gr.File(
+ label="Example CSV dataset",
+ value="data/examples/interaction_pair_inference.csv",
+ interactive=False
+ )
+ with gr.Row():
+ infer_csv_prompt = gr.Button(
+ value="Upload Your Own Dataset Below",
+ variant='secondary')
+ with gr.Column():
+ infer_pair = gr.File(
+ label='Upload CSV File Containing Paired Records',
+ file_count="single",
+ type='filepath',
+ visible=True
+ )
+ with gr.Column(visible=False) as pair_generate:
+ with gr.Row():
+ gr.File(
+ label='Example SDF compound library',
+ value='data/examples/compound_library.sdf',
+ interactive=False
+ )
+ gr.File(
+ label='Example FASTA target library',
+ value='data/examples/target_library.fasta',
+ interactive=False
+ )
+ with gr.Row():
+ gr.File(
+ label='Example CSV compound library',
+ value='data/examples/compound_library.csv',
+ interactive=False
+ )
+ gr.File(
+ label='Example CSV target library',
+ value='data/examples/target_library.csv',
+ interactive=False
+ )
+ with gr.Row():
+ infer_library_prompt = gr.Button(
+ value="Upload Your Own Libraries Below",
+ visible=False,
+ variant='secondary'
+ )
+ with gr.Row():
+ infer_drug = gr.File(
+ label='Upload SDF/CSV File Containing Multiple Compounds',
+ file_count="single",
+ type='filepath'
+ )
+ infer_target = gr.File(
+ label='Upload FASTA/CSV File Containing Multiple Targets',
+ file_count="single",
+ type='filepath'
+ )
+
+ with gr.Row():
+ with gr.Column(min_width=200):
+ HelpTip(
+ "By default, models trained on all protein families (general) will be applied. "
+ "If the proteins in the target library of interest "
+ "all belong to the same protein family, manually selecting the family is supported."
+ )
+
+ pair_infer_target_family = gr.Dropdown(
+ choices=list(TARGET_FAMILY_MAP.keys()),
+ value='General',
+ label='Step 2. Select Target Family (Optional)'
+ )
+
+ with gr.Column(min_width=200):
+ HelpTip(
+ "Interaction prediction provides you binding probability score "
+ "between the target of interest and each compound in the library, "
+ "while affinity prediction directly estimates their binding strength "
+ "measured using half maximal inhibitory concentration (IC
50) in units of nM."
+ )
+ pair_infer_task = gr.Dropdown(
+ list(TASK_MAP.keys()),
+ label='Step 3. Select a Prediction Task',
+ value='Compound-Protein Interaction'
+ )
+
+ with gr.Column(min_width=200):
+ HelpTip(
+                            "Select your preferred model. Please refer to the documentation for detailed benchmark results."
+ )
+ pair_infer_preset = gr.Dropdown(
+ list(PRESET_MAP.keys()),
+ label='Step 4. Select a Preset Model'
+ )
+ # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
+ # variant='primary')
+ pair_infer_opts = gr.CheckboxGroup(visible=False)
+
+ with gr.Row():
+ pair_infer_email = gr.Textbox(
+ label='Step 5. Input Your Email Address (Optional)',
+                            info="Your email address will be used to notify you of the status of your job. "
+                                 "If you do not receive the email, please check your spam/junk folder.")
+
+ with gr.Row(visible=True):
+ pair_infer_clr_btn = gr.ClearButton(size='lg')
+ pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
+
+ infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
+
+ with gr.TabItem(label='Chemical Property Report', id='Chemical Property Report'):
+ gr.Markdown('''
+                    # Chemical Property Report
+
+ To compute chemical properties for the predictions of Drug Hit Screening,
+ Target Protein Identification, and Interaction Pair Inference.
+
+ You may also upload your own dataset using a CSV file containing
+ one required column `X1` for compound SMILES.
+
+                    The page shows only a preview report displaying at most 30 records
+                    (the top predicted CPI/CPA records when reporting results from a prediction job).
+
+                    To access the full report, first `Preview` it, then `Generate` and download
+                    a CSV report or an interactive HTML report below.
+ ''')
+ raw_df = gr.State(value=pd.DataFrame())
+ report_df = gr.State(value=pd.DataFrame())
+ with gr.Row():
+ with gr.Column(scale=1):
+ file_for_report = gr.File(interactive=True, type='filepath')
+ report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False,
+ value='Compound-Protein Interaction',
+ label='Specify the Task Labels in the Uploaded Dataset')
+ with gr.Column(scale=2):
+ with gr.Column():
+ with gr.Row():
+ scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
+ filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
+ job_opts = gr.CheckboxGroup(visible=False)
+
+                    with gr.Accordion('Report Generation Options', open=True):
+ with gr.Row():
+ csv_sep = gr.Radio(label='CSV Delimiter',
+ choices=['Comma', 'Tab'], value='Comma')
+ html_opts = gr.CheckboxGroup(label='HTML Report Options',
+ choices=[
+ 'Exclude Molecular Graph',
+ 'Exclude Scaffold Graph',
+ 'Exclude Pharmacophore 3D'
+ ])
+
+ with gr.Row():
+ report_clr_btn = gr.ClearButton(size='lg')
+ analyze_btn = gr.Button('Calculate Properties and Preview', variant='primary',
+ size='lg', interactive=False)
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ html_report = gr.HTML() # label='Results', visible=True)
+ ranking_pie_chart = gr.Plot(visible=False)
+
+ with gr.Row():
+ with gr.Column():
+ csv_generate = gr.Button(value='Generate CSV Report',
+ interactive=False, variant='primary')
+ csv_download_file = gr.File(label='Download CSV Report', visible=False)
+ with gr.Column():
+ html_generate = gr.Button(value='Generate HTML Report',
+ interactive=False, variant='primary')
+ html_download_file = gr.File(label='Download HTML Report', visible=False)
+
+ with gr.TabItem(label='Prediction Status Lookup', id='Prediction Status Lookup'):
+ gr.Markdown('''
+                    # Prediction Status Lookup
+
+                    To check the status of an in-progress or historical job using its job ID, and to retrieve
+                    the predictions once the job has completed. Note that predictions are kept for only 48 hours
+                    after job completion.
+
+                    When the job is done, you will be redirected to Chemical Property Report for further
+                    analysis and full report generation. If the lookup fails to respond, please wait a
+                    few minutes and refresh the page to try again.
+ ''')
+ with gr.Column():
+ pred_lookup_id = gr.Textbox(
+ label='Input Your Job ID', placeholder='e.g., e9dfd149-3f5c-48a6-b797-c27d027611ac',
+                    info="Your job ID is a UUID4 string shown on the page after you submit a job "
+                         "and included in the email notification.")
+ pred_lookup_btn = gr.Button(value='Lookup the Job Status', variant='primary', visible=True)
+ pred_lookup_stop_btn = gr.Button(value='Stop Tracking', variant='stop', visible=False)
+ pred_lookup_status = gr.Markdown()
+
+ # retrieve_email = gr.Textbox(label='Step 2. Input Your Email Address', placeholder='e.g.,
+
+
+ def target_input_type_select(input_type):
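+        # Returns updates for the 8 wired outputs, in order:
+        # [target_input_type, target_upload_btn, target_id, target_gene,
+        #  target_organism, target_query_btn, target_fasta, target_paste_markdown].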
+ match input_type:
+ case 'UniProt ID':
+ return [gr.Dropdown(info=''),
+ gr.UploadButton(visible=False),
+ gr.Textbox(visible=True, value=''),
+ gr.Textbox(visible=False, value=''),
+ gr.Textbox(visible=False, value=''),
+ gr.Button(visible=True),
+ gr.Code(value=''),
+ gr.Button(visible=False)]
+ case 'Gene symbol':
+ return [gr.Dropdown(info=''),
+ gr.UploadButton(visible=False),
+ gr.Textbox(visible=False, value=''),
+ gr.Textbox(visible=True, value=''),
+ gr.Textbox(visible=True, value=''),
+ gr.Button(visible=True),
+ gr.Code(value=''),
+ gr.Button(visible=False)]
+ case 'Sequence':
+ return [gr.Dropdown(info='Enter (paste) a FASTA string below manually or upload a FASTA file.'),
+ gr.UploadButton(visible=True),
+ gr.Textbox(visible=False, value=''),
+ gr.Textbox(visible=False, value=''),
+ gr.Textbox(visible=False, value=''),
+ gr.Button(visible=False),
+ gr.Code(value=''),
+ gr.Button(visible=True)]
+
+
+ target_input_type.select(
+ fn=target_input_type_select,
+ inputs=target_input_type,
+ outputs=[
+ target_input_type, target_upload_btn,
+ target_id, target_gene, target_organism, target_query_btn,
+ target_fasta, target_paste_markdown
+ ],
+ show_progress='hidden'
+ )
+
+
+ def uniprot_query(input_type, uid, gene, organism='Human'):
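+        # UniProt REST API: '<accession>.fasta' fetches a single record directly,
+        # while the gene-symbol route queries the search endpoint filtered by
+        # organism name, also returning FASTA.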
+ uniprot_endpoint = 'https://rest.uniprot.org/uniprotkb/{query}'
+ fasta_rec = ''
+
+ match input_type:
+ case 'UniProt ID':
+ query = f"{uid.strip()}.fasta"
+ case 'Gene symbol':
+ organism = organism if organism else 'Human'
+ query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
+
+ try:
+ fasta = session.get(uniprot_endpoint.format(query=query))
+ fasta.raise_for_status()
+ if fasta.text:
+ fasta_rec = next(SeqIO.parse(io.StringIO(fasta.text), format='fasta'))
+ fasta_rec = f">{fasta_rec.description}\n{fasta_rec.seq}"
+
+ except Exception as e:
+            # gr.Warning shows a toast without aborting; fall through and
+            # return whatever was fetched (possibly an empty string).
+            gr.Warning(f"Failed to query FASTA from UniProt database due to {str(e)}")
+        return fasta_rec
+
+
+ def process_fasta_upload(fasta_upload):
+ fasta = ''
+ try:
+ fasta = fasta_upload.decode()
+ except Exception as e:
+ gr.Warning(f"Please upload a valid FASTA file. Error: {str(e)}")
+ return fasta
+
+
+ target_upload_btn.upload(
+ fn=process_fasta_upload, inputs=target_upload_btn, outputs=target_fasta
+ ).then(
+ fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden'
+ )
+ target_query_btn.click(
+ fn=uniprot_query, inputs=[target_input_type, target_id, target_gene, target_organism], outputs=target_fasta
+ ).then(
+ fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden'
+ )
+
+
+ def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
+ try:
+ aligner = PairwiseAligner(mode='local')
+ alignment_df = get_fasta_family_map()
+
+ processed_fasta = process_target_fasta(fasta)
+
+ # Check for an exact match first
+ exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
+ if not exact_match.empty:
+ row = exact_match.iloc[0]
+ family = str(row['Target Family']).title()
+ return gr.Dropdown(
+ value=family,
+ info=f"Reason: Exact match found with {row['ID2']} from family {family}")
+
+ # If no exact match, then calculate alignment score
+ def align_score(query):
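+                # Normalize the local alignment score by the longer sequence length
+                # so scores are comparable across sequences of different lengths.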
+ alignment = aligner.align(processed_fasta, query)
+ return alignment.score / max(len(processed_fasta), len(query))
+
+ alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
+ row = alignment_df.loc[alignment_df['score'].idxmax()]
+ family = str(row['Target Family']).title()
+ return gr.Dropdown(value=family,
+                               info=f"Reason: Best sequence identity ({row['score']:.2f}) "
+ f"with {row['ID2']} from family {family}")
+ except Exception as e:
+ gr.Warning("Failed to detect the protein family due to error: " + str(e))
+
+
+ target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)
+
+ # target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden')
+ target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden')
+
+ drug_library_upload_btn.upload(fn=lambda x: [
+ x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
+ ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
+
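+    # Swap the extra-options checkbox choices to match the selected task;
+    # CPI options default to their first choice, CPA options start unselected.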
+ drug_screen_task.select(
+ fn=lambda task, opts: gr.CheckboxGroup(choices=DRUG_SCRENN_CPA_OPTS)
+ if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup(
+ choices=DRUG_SCRENN_CPI_OPTS, value=DRUG_SCRENN_CPI_OPTS[0]),
+ inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
+ show_progress='hidden'
+ )
+
+ target_identify_task.select(
+ fn=lambda task, opts: gr.CheckboxGroup(choices=TARGET_IDENTIFY_CPA_OPTS)
+ if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup(
+ choices=TARGET_IDENTIFY_CPI_OPTS, value=TARGET_IDENTIFY_CPI_OPTS[0]),
+ inputs=[target_identify_task, target_identify_opts], outputs=target_identify_opts,
+ show_progress='hidden'
+ )
+
+ def example_fill(input_type):
+ return {target_id: 'Q16539',
+ target_gene: 'MAPK14',
+ target_organism: 'Human',
+ target_fasta: """
+>sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3
+MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ
+SIIHAKRTYRELRLLKHMKHENVIGLLDVFTPARSLEEFNDVYLVTHLMGADLNNIVKCQ
+KLTDDHVQFLIYQILRGLKYIHSADIIHRDLKPSNLAVNEDCELKILDFGLARHTDDEMT
+GYVATRWYRAPEIMLNWMHYNQTVDIWSVGCIMAELLTGRTLFPGTDHIDQLKLILRLVG
+TPGAELLKKISSESARNYIQSLTQMPKMNFANVFIGANPLAVDLLEKMLVLDSDKRITAA
+QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
+"""}
+
+
+ example_fasta.click(fn=example_fill, inputs=target_input_type, outputs=[
+ target_id, target_gene, target_organism, target_fasta], show_progress='hidden')
+
+
+ def screen_recommend_model(fasta, family, task):
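+        # Recommend the preset with the best benchmark metric for this task,
+        # distinguishing whether the query target was seen during training and,
+        # for a specific family, comparing general- vs family-trained models.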
+ task = TASK_MAP[task]
+ score = TASK_METRIC_MAP[task]
+ benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv')
+
+ if not fasta:
+ gr.Warning('Please enter a valid FASTA for model recommendation.')
+ return [None, family]
+
+ if family == 'General':
+ seen_targets = get_seen_fastas('General', task)['X2'].values
+ if process_target_fasta(fasta) in seen_targets:
+ scenario = "Seen Target"
+ else:
+ scenario = "Unseen Target"
+ filtered_df = benchmark_df[(benchmark_df['Family'] == 'All Families')
+ & (benchmark_df['Scenario'] == scenario)
+ & (benchmark_df['Type'] == 'General')]
+
+ else:
+ seen_targets_general = get_seen_fastas('General', task)['X2'].values
+ if process_target_fasta(fasta) in seen_targets_general:
+ scenario_general = "Seen Target"
+ else:
+ scenario_general = "Unseen Target"
+
+ seen_targets_family = get_seen_fastas(family, task)['X2'].values
+ if process_target_fasta(fasta) in seen_targets_family:
+ scenario_family = "Seen Target"
+ else:
+ scenario_family = "Unseen Target"
+
+ filtered_df_general = benchmark_df[(benchmark_df['Family'] == family)
+ & (benchmark_df['Scenario'] == scenario_general)
+ & (benchmark_df['Type'] == 'General')]
+ filtered_df_family = benchmark_df[(benchmark_df['Family'] == family)
+ & (benchmark_df['Scenario'] == scenario_family)
+ & (benchmark_df['Type'] == 'Family')]
+ filtered_df = pd.concat([filtered_df_general, filtered_df_family])
+
+ row = filtered_df.loc[filtered_df[score].idxmax()]
+ if row['Scenario'] == 'Seen Target':
+ scenario = "Seen Target (>=0.85 sequence identity)"
+ elif row['Scenario'] == 'Unseen Target':
+ scenario = "Unseen Target (<0.85 sequence identity)"
+
+ return {drug_screen_preset:
+ gr.Dropdown(value=row['Model'],
+ info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
+ f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
+ drug_screen_target_family:
+ gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
+
+
+ screen_preset_recommend_btn.click(
+ fn=screen_recommend_model,
+ inputs=[target_fasta, drug_screen_target_family, drug_screen_task],
+ outputs=[drug_screen_preset, drug_screen_target_family],
+ show_progress='hidden'
+ )
+
+
+ def compound_input_type_select(input_type):
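+        # Show the upload button only for SDF input; SMILES is pasted directly.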
+ match input_type:
+ case 'SMILES':
+ return gr.Button(visible=False)
+ case 'SDF':
+ return gr.Button(visible=True)
+
+
+ compound_type.select(fn=compound_input_type_select,
+ inputs=compound_type, outputs=compound_upload_btn, show_progress='hidden')
+
+
+ def compound_upload_process(input_type, input_upload):
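+        # For SDF uploads, convert the first molecule in the file to SMILES.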
+ smiles = ''
+ try:
+ match input_type:
+ case 'SMILES':
+ smiles = input_upload.decode()
+ case 'SDF':
+ suppl = Chem.ForwardSDMolSupplier(io.BytesIO(input_upload))
+ smiles = Chem.MolToSmiles(next(suppl))
+ except Exception as e:
+ gr.Warning(f"Please upload a valid {input_type} file. Error: {str(e)}")
+ return smiles
+
+
+ compound_upload_btn.upload(fn=compound_upload_process,
+ inputs=[compound_type, compound_upload_btn],
+ outputs=compound_smiles)
+
+ example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=compound_smiles, show_progress='hidden')
+
+ target_library_upload_btn.upload(fn=lambda x: [
+ x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
+ ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
+
+
+ def identify_recommend_model(smiles, family, task):
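+        # Analogous to screen_recommend_model, but keyed on whether the query
+        # compound (canonical SMILES) was seen during training.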
+ task = TASK_MAP[task]
+ score = TASK_METRIC_MAP[task]
+ benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv')
+
+ if not smiles:
+ gr.Warning('Please enter a valid SMILES for model recommendation.')
+ return None
+ if family == 'Family-Specific Auto-Recommendation':
+ return 'Family-Specific Auto-Recommendation'
+
+ if family == 'General':
+ seen_compounds = pd.read_csv(
+ f'data/benchmarks/seen_compounds/all_families_full_{task.lower()}_random_split.csv')
+ family = 'All Families'
+
+ else:
+ seen_compounds = pd.read_csv(
+ f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
+
+ if rdkit_canonicalize(smiles) in seen_compounds['X1'].values:
+ scenario = "Seen Compound"
+ else:
+ scenario = "Unseen Compound"
+
+ filtered_df = benchmark_df[(benchmark_df['Family'] == family)
+ & (benchmark_df['Scenario'] == scenario)
+ & (benchmark_df['Type'] == 'General')]
+
+ row = filtered_df.loc[filtered_df[score].idxmax()]
+
+ return gr.Dropdown(value=row['Model'],
+ info=f"Reason: {scenario} in training; choosing the model "
+ f"with the best {score} in the {scenario} scenario.")
+
+
+ identify_preset_recommend_btn.click(fn=identify_recommend_model,
+ inputs=[compound_smiles, target_identify_target_family, target_identify_task],
+ outputs=target_identify_preset, show_progress='hidden')
+
+
+ def infer_type_change(upload_type):
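+        # Toggle between the two input layouts and clear previously uploaded
+        # files so stale inputs cannot leak into the next submission.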
+ match upload_type:
+ case "Upload a compound library and a target library":
+ return {
+ pair_upload: gr.Column(visible=False),
+ pair_generate: gr.Column(visible=True),
+ infer_pair: None,
+ infer_drug: None,
+ infer_target: None,
+ infer_csv_prompt: gr.Button(visible=False),
+ infer_library_prompt: gr.Button(visible=True),
+ }
+ case "Upload a CSV file containing paired compound-protein data":
+ return {
+ pair_upload: gr.Column(visible=True),
+ pair_generate: gr.Column(visible=False),
+ infer_pair: None,
+ infer_drug: None,
+ infer_target: None,
+ infer_csv_prompt: gr.Button(visible=True),
+ infer_library_prompt: gr.Button(visible=False),
+ }
+
+
+ infer_type.select(fn=infer_type_change, inputs=infer_type,
+ outputs=[pair_upload, pair_generate, infer_pair, infer_drug, infer_target,
+ infer_csv_prompt, infer_library_prompt],
+ show_progress='hidden')
+
+
+ def common_input_validate(state, preset, email, request):
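+        # Shared pre-submission checks: a model preset is selected, the email
+        # (if given) is well-formed, and neither this session nor this user
+        # already has a running job.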
+        gr.Info('Processing inputs...')
+ if not preset:
+ raise gr.Error('Please select a model.')
+
+ if email:
+ try:
+ email_info = validate_email(email, check_deliverability=False)
+ email = email_info.normalized
+ except EmailNotValidError as e:
+ raise gr.Error(f"Invalid email address: {str(e)}.")
+
+ if state:
+ raise gr.Error(f"You already have a running prediction job (ID: {state['id']}) under this session. "
+ "Please wait for it to complete before submitting another job.")
+
+ if check := check_user_running_job(email, request):
+ raise gr.Error(check)
+
+ return state, preset, email
+
+
+ def common_job_initiate(job_id, job_type, email, request, task):
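+        # Build the job metadata dict tracked in run_state; 'x-forwarded-for'
+        # is preferred over client.host so the real IP survives reverse proxies.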
+ gr.Info('Finished processing inputs. Initiating the prediction job... '
+ 'You will be redirected to Prediction Status Lookup once the job has been submitted.')
+ job_info = {'id': job_id,
+ 'type': job_type,
+ 'task': task,
+ 'status': 'RUNNING',
+ 'email': email,
+ 'ip': request.headers.get('x-forwarded-for', request.client.host),
+ 'cookies': dict(request.cookies),
+ 'start_time': time(),
+ 'end_time': None,
+ 'expiry_time': None,
+ 'error': None}
+ # db.insert(job_info)
+ return job_info
+
+
+ def drug_screen_validate(fasta, library, library_upload, preset, task, email, state,
+ request: gr.Request, progress=gr.Progress(track_tqdm=True)):
+ state, preset, email = common_input_validate(state, preset, email, request)
+
+ fasta = process_target_fasta(fasta)
+ err = validate_seq_str(fasta, FASTA_PAT)
+ if err:
+ raise gr.Error(f'Found error(s) in your Target FASTA input: {err}')
+ if not library:
+ raise gr.Error('Please select or upload a compound library.')
+ if library in DRUG_LIBRARY_MAP.keys():
+ screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
+ else:
+ screen_df = process_drug_library_upload(library_upload)
+ if len(screen_df) >= DATASET_MAX_LEN:
+ raise gr.Error(f'The uploaded compound library has more records '
+ f'than the allowed maximum {DATASET_MAX_LEN}.')
+
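+        # Broadcast the single target sequence across every compound row.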
+ screen_df['X2'] = fasta
+
+ job_id = str(uuid4())
+ temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
+ screen_df.to_csv(temp_file, index=False, na_rep='')
+ if temp_file.is_file():
+ job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
+ return {screen_data_for_predict: str(temp_file),
+ run_state: job_info}
+ else:
+ raise gr.Error('System failed to create temporary files. Please try again later.')
+
+
+ def target_identify_validate(smiles, library, library_upload, preset, task, email, state,
+ request: gr.Request, progress=gr.Progress(track_tqdm=True)):
+ state, preset, email = common_input_validate(state, preset, email, request)
+
+ smiles = smiles.strip()
+ err = validate_seq_str(smiles, SMILES_PAT)
+ if err:
+ raise gr.Error(f'Found error(s) in your Compound SMILES input: {err}')
+ if not library:
+ raise gr.Error('Please select or upload a target library.')
+ if library in TARGET_LIBRARY_MAP.keys():
+ identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
+ else:
+ identify_df = process_target_library_upload(library_upload)
+ if len(identify_df) >= DATASET_MAX_LEN:
+ raise gr.Error(f'The uploaded target library has more records '
+ f'than the allowed maximum {DATASET_MAX_LEN}.')
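+        # Broadcast the single compound SMILES across every target row.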
+ identify_df['X1'] = smiles
+
+ job_id = str(uuid4())
+ temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
+ identify_df.to_csv(temp_file, index=False, na_rep='')
+ if temp_file.is_file():
+ job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
+ return {identify_data_for_predict: str(temp_file),
+ run_state: job_info}
+ else:
+ raise gr.Error('System failed to create temporary files. Please try again later.')
+
+
+ def pair_infer_validate(drug_target_pair_upload, drug_upload, target_upload, preset, task, email, state,
+ request: gr.Request, progress=gr.Progress(track_tqdm=True)):
+ state, preset, email = common_input_validate(state, preset, email, request)
+
+ job_id = str(uuid4())
+ if drug_target_pair_upload:
+ infer_df = pd.read_csv(drug_target_pair_upload)
+ validate_columns(infer_df, ['X1', 'X2'])
+
+ infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
+ validate_seq_str, regex=SMILES_PAT)
+ if not infer_df['X1_ERR'].isna().all():
+                raise gr.Error(
+                    f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+
+ infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
+ validate_seq_str, regex=FASTA_PAT)
+ if not infer_df['X2_ERR'].isna().all():
+                raise gr.Error(
+                    f"Encountered invalid FASTA:\n{infer_df[~infer_df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
+
+ temp_file = Path(drug_target_pair_upload).resolve()
+
+ elif drug_upload and target_upload:
+ drug_df = process_drug_library_upload(drug_upload)
+ target_df = process_target_library_upload(target_upload)
+
+ drug_df.drop_duplicates(subset=['X1'], inplace=True)
+ target_df.drop_duplicates(subset=['X2'], inplace=True)
+
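+            # The cross product of the deduplicated libraries yields every
+            # compound-protein pair; the merges reattach any extra columns.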
+ infer_df = pd.DataFrame(list(itertools.product(drug_df['X1'], target_df['X2'])),
+ columns=['X1', 'X2'])
+ infer_df = infer_df.merge(drug_df, on='X1').merge(target_df, on='X2')
+
+ if len(infer_df) >= DATASET_MAX_LEN:
+ raise gr.Error(f'The uploaded/generated compound-protein pair dataset has more records '
+ f'than the allowed maximum {DATASET_MAX_LEN}.')
+
+ temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
+ infer_df.to_csv(temp_file, index=False, na_rep='')
+
+ else:
+            raise gr.Error('Please upload a compound-protein pair dataset, or '
+                           'both a compound library and a target library.')
+
+ if temp_file.is_file():
+ job_info = common_job_initiate(job_id, 'Interaction Pair Inference', email, request, task)
+ return {infer_data_for_predict: str(temp_file),
+ run_state: job_info}
+ else:
+ raise gr.Error('System failed to create temporary files. Please try again later.')
+
+
+ def fill_job_id(job_info):
+ try:
+ return job_info['id']
+ except Exception as e:
+ gr.Warning(f'Failed to fetch job ID due to error: {str(e)}')
+ return ''
+
+
+ drug_screen_click = drug_screen_btn.click(
+ fn=drug_screen_validate,
+ inputs=[target_fasta, drug_library, drug_library_upload, drug_screen_preset, drug_screen_task,
+ drug_screen_email, run_state],
+ outputs=[screen_data_for_predict, run_state],
+ concurrency_limit=2,
+ )
+
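+    # On successful submission: switch to the status tab, fill in the job ID,
+    # and start polling the job status (same pattern for all three job types).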
+ drug_screen_lookup = drug_screen_click.success(
+ fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
+ ).then(
+ fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id]
+ ).then(
+ fn=lookup_job,
+ inputs=[pred_lookup_id],
+ outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
+ show_progress='minimal',
+ concurrency_limit=100,
+ )
+
+ # drug_screen_click.success(
+ # fn=send_email,
+ # inputs=[run_state]
+ # )
+
+ drug_screen_click.success(
+ fn=submit_predict,
+ inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
+ drug_screen_target_family, drug_screen_opts, run_state, ],
+ outputs=[run_state, ]
+ )
+
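+    # The clear buttons reset values positionally: target family -> 'General',
+    # extra options -> [] (where applicable), and the remaining inputs -> None.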
+ drug_screen_clr_btn.click(
+ lambda: ['General'] + [[]] + [None] * 5,
+ outputs=[drug_screen_target_family, drug_screen_opts,
+ target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email],
+ show_progress='hidden'
+ )
+
+ target_identify_clr_btn.click(
+ lambda: ['General'] + [[]] + [None] * 5,
+ outputs=[target_identify_target_family, target_identify_opts,
+ compound_smiles, target_identify_preset, target_library, target_library_upload, target_identify_email],
+ show_progress='hidden'
+ )
+
+ pair_infer_clr_btn.click(
+ lambda: ['General'] + [None] * 5,
+ outputs=[pair_infer_target_family,
+ infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_email],
+ show_progress='hidden'
+ )
+
+ report_clr_btn.click(
+ lambda: [[]] * 3 + [None] * 3 +
+ [gr.Button(interactive=False)] * 3 +
+ [gr.File(visible=False, value=None)] * 2 +
+ [gr.Dropdown(visible=False, value=None), gr.HTML(value=''), gr.CheckboxGroup(visible=False)],
+ outputs=[
+ scores, filters, html_opts,
+ file_for_report, raw_df, report_df,
+ csv_generate, html_generate, analyze_btn,
+ csv_download_file, html_download_file,
+ report_task, html_report, job_opts
+ ],
+ show_progress='hidden'
+ )
+
+
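+    # Keep the family and preset dropdowns mutually consistent: selecting
+    # 'Family-Specific Auto-Recommendation' on either side mirrors it to the
+    # other; switching away on one side clears the stale value on the other.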
+ def update_preset(family, preset):
+ if family == 'Family-Specific Auto-Recommendation':
+ return 'Family-Specific Auto-Recommendation'
+ elif preset == 'Family-Specific Auto-Recommendation':
+ return None
+ else:
+ return preset
+
+
+ def update_family(family, preset):
+ if preset == 'Family-Specific Auto-Recommendation':
+ return 'Family-Specific Auto-Recommendation'
+ elif family == 'Family-Specific Auto-Recommendation':
+ return None
+ else:
+ return family
+
+
+ target_identify_target_family.change(
+ fn=update_preset, inputs=[target_identify_target_family, target_identify_preset],
+ outputs=target_identify_preset, show_progress='hidden')
+ target_identify_preset.change(
+ fn=update_family, inputs=[target_identify_target_family, target_identify_preset],
+ outputs=target_identify_target_family, show_progress='hidden')
+
+ target_identify_click = target_identify_btn.click(
+ fn=target_identify_validate,
+ inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
+ target_identify_email, run_state],
+ outputs=[identify_data_for_predict, run_state],
+ concurrency_limit=2,
+ )
+
+ target_identify_lookup = target_identify_click.success(
+ fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
+ ).then(
+ fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id]
+ ).then(
+ fn=lookup_job,
+ inputs=[pred_lookup_id],
+ outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
+ show_progress='minimal',
+ concurrency_limit=100
+ )
+
+ # target_identify_click.success(
+ # fn=send_email,
+ # inputs=[run_state]
+ # )
+
+ target_identify_click.success(
+ fn=submit_predict,
+ inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
+ target_identify_target_family, target_identify_opts, run_state, ], # , target_identify_email],
+ outputs=[run_state, ]
+ )
+
+ pair_infer_click = pair_infer_btn.click(
+ fn=pair_infer_validate,
+ inputs=[infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_task,
+ pair_infer_email, run_state],
+ outputs=[infer_data_for_predict, run_state],
+ concurrency_limit=2,
+ )
+
+ pair_infer_lookup = pair_infer_click.success(
+ fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
+ ).then(
+ fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id]
+ ).then(
+ fn=lookup_job,
+ inputs=[pred_lookup_id],
+ outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
+ show_progress='minimal',
+ concurrency_limit=100
+ )
+
+ # pair_infer_click.success(
+ # fn=send_email,
+ # inputs=[run_state]
+ # )
+
+ pair_infer_click.success(
+ fn=submit_predict,
+ inputs=[infer_data_for_predict, pair_infer_task, pair_infer_preset,
+ pair_infer_target_family, pair_infer_opts, run_state, ], # , pair_infer_email],
+ outputs=[run_state, ]
+ )
+
+ pred_lookup_click = pred_lookup_btn.click(
+ fn=lookup_job,
+ inputs=[pred_lookup_id],
+ outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
+ show_progress='minimal',
+ cancels=[drug_screen_lookup, target_identify_lookup, pair_infer_lookup],
+ concurrency_limit=100,
+ )
+
+ pred_lookup_stop_btn.click(
+ fn=lambda: [gr.Button(visible=True), gr.Button(visible=False)],
+ outputs=[pred_lookup_btn, pred_lookup_stop_btn],
+ cancels=[pred_lookup_click, drug_screen_lookup, target_identify_lookup, pair_infer_lookup],
+ concurrency_limit=None,
+ )
+
+
+ def inquire_task(df):
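+        # If the uploaded dataset already contains predictions ('Y^'), ask
+        # whether they are interaction probabilities or binding affinities.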
+ if 'Y^' in df.columns:
+ label = 'predicted CPI/CPA labels (`Y^`)'
+ return {report_task: gr.Dropdown(visible=True,
+ info=f'Found {label} in your uploaded dataset. '
+ 'Is it compound-protein interaction or binding affinity?'),
+ html_report: ''}
+ else:
+ return {report_task: gr.Dropdown(visible=False)}
+
+ report_df_change = file_for_report.change(
+ fn=update_df, inputs=file_for_report, outputs=[
+ html_report, raw_df, report_df, analyze_btn, report_task, job_opts
+ ],
+ concurrency_limit=100,
+ ).success(
+ fn=lambda: [gr.Button(interactive=True)] * 3 +
+ [gr.File(visible=False, value=None)] * 2,
+ outputs=[
+ csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file
+ ],
+ )
+
+ file_for_report.upload(
+ # fn=update_df, inputs=file_for_report, outputs=[
+ # html_report, raw_df, report_df, analyze_btn, report_task, job_opts
+ # ],
+ # cancels=[report_df_change],
+ # concurrency_limit=100,
+ # ).success(
+ fn=inquire_task, inputs=[raw_df],
+ outputs=[report_task, html_report],
+ )
+
+ file_for_report.clear(
+ fn=lambda: [gr.Button(interactive=False)] * 3 +
+ [gr.File(visible=False, value=None)] * 2 +
+ [gr.Dropdown(visible=False, value=None), '', gr.CheckboxGroup(visible=False)],
+ cancels=[report_df_change],
+ outputs=[
+ csv_generate, html_generate, analyze_btn,
+ csv_download_file, html_download_file,
+ report_task, html_report, job_opts
+ ]
+ )
+
+ analyze_btn.click(
+ fn=submit_report, inputs=[raw_df, scores, filters, job_opts, report_task], outputs=[
+ html_report, report_df, csv_download_file, html_download_file]
+ ).success(
+ fn=lambda: [gr.Button(interactive=True)] * 2,
+ outputs=[csv_generate, html_generate],
+ concurrency_limit=100,
+ )
+
+
+ def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
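+        # Rename the prediction column per task ('Y_prob' for interaction,
+        # 'Y_IC50' for affinity) and drop the molecular-image columns
+        # ('Compound', 'Scaffold'), which cannot be serialized to CSV.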
+ csv_sep_map = {
+ 'Comma': ',',
+ 'Tab': '\t',
+ }
+ y_colname = 'Y^'
+ if isinstance(task, str):
+ if task == 'Compound-Protein Interaction':
+ y_colname = 'Y_prob'
+ elif task == 'Compound-Protein Binding Affinity':
+ y_colname = 'Y_IC50'
+ try:
+ now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+ filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
+ df.rename(columns={'Y^': y_colname}).drop(
+ labels=['Compound', 'Scaffold'], axis=1
+ ).to_csv(filename, index=False, na_rep='', sep=csv_sep_map[sep])
+
+ return gr.File(filename, visible=True)
+ except Exception as e:
+ gr.Warning(f"Failed to generate CSV due to error: {str(e)}")
+ return None
+
+
+ def create_html_report_file(df, file_report, task, opts, progress=gr.Progress(track_tqdm=True)):
+ try:
+ now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+ filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.html"
+ create_html_report(df, filename, task, opts)
+ return gr.File(filename, visible=True)
+ except Exception as e:
+ gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
+ return None
+
+
+ # html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
+
+ csv_generate.click(
+ lambda: gr.File(visible=True), outputs=csv_download_file,
+ ).then(
+ fn=create_csv_report_file, inputs=[report_df, file_for_report, report_task, csv_sep],
+ outputs=csv_download_file, show_progress='full'
+ )
+ html_generate.click(
+ lambda: gr.File(visible=True), outputs=html_download_file,
+ ).then(
+ fn=create_html_report_file, inputs=[report_df, file_for_report, report_task, html_opts],
+ outputs=html_download_file, show_progress='full'
+ )
+
+if __name__ == "__main__":
+ pandarallel.initialize()
+
+ hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
+
+ session = requests.Session()
+ ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
+ session.mount('http://', ADAPTER)
+ session.mount('https://', ADAPTER)
+
+ db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
+ # Set all RUNNING jobs to FAILED at TinyDB initialization
+ Job = Query()
+ jobs = db.all()
+ for job in jobs:
+ if job['status'] == 'RUNNING':
+ db.update({'status': 'FAILED'}, Job.id == job['id'])
+
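+    # Hourly housekeeping via check_expiry (expected to clear jobs past the
+    # 48-hour DB_EXPIRY window).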
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(check_expiry, 'interval', hours=1, timezone=pytz.utc)
+ scheduler.start()
+
+ demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)