diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,2957 +1,2961 @@ -import glob -import smtplib -from datetime import datetime, timedelta -import itertools -import textwrap -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText -from email.utils import formatdate, make_msgid -from functools import cache -from math import pi -from time import sleep, time -from uuid import uuid4 - -import io -import os -from pathlib import Path -import sys - -import pytz -from Bio import SeqIO -from Bio.Align import PairwiseAligner -from email_validator import validate_email, EmailNotValidError -import gradio as gr -import hydra -import pandas as pd -from pandarallel import pandarallel -import requests -from rdkit.DataStructs import BulkTanimotoSimilarity -from requests.adapters import HTTPAdapter, Retry -from markdown import markdown -from rdkit import Chem -from rdkit.Chem import AllChem, Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen -from rdkit.Chem.Features.ShowFeats import _featColors -from rdkit.Chem.Scaffolds import MurckoScaffold -import py3Dmol - -from bokeh.models import Legend, NumberFormatter, BooleanFormatter, HTMLTemplateFormatter, LegendItem -from bokeh.palettes import Category20c_20 -from bokeh.plotting import figure -from bokeh.transform import cumsum -from bokeh.resources import INLINE -import seaborn as sns -import panel as pn - -from apscheduler.schedulers.background import BackgroundScheduler -from tinydb import TinyDB, Query - -#import swifter -from tqdm.auto import tqdm - -from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT -from deepscreen.predict import predict - -sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score')) -import sascorer - -DATASET_MAX_LEN = 10_240 -SERVER_DATA_DIR = os.getenv('DATA') # '/data' -DB_EXPIRY = timedelta(hours=48).total_seconds() - -CSS = """ -.help-tip { - position: absolute; - display: 
inline-block; - top: 16px; - right: 0px; - text-align: center; - border-radius: 40%; - /* border: 2px solid darkred; background-color: #8B0000;*/ - width: 24px; - height: 24px; - font-size: 16px; - line-height: 26px; - cursor: default; - transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1); - z-index: 100 !important; -} - -.help-tip:hover { - cursor: pointer; - /*background-color: #ccc;*/ -} - -.help-tip:before { - content: '?'; - font-weight: 700; - color: #8B0000; - z-index: 100 !important; -} - -.help-tip p { - visibility: hidden; - opacity: 0; - text-align: left; - background-color: #EFDDE3; - padding: 20px; - width: 300px; - position: absolute; - border-radius: 4px; - right: -4px; - color: #494F5A; - font-size: 13px; - line-height: normal; - transform: scale(0.7); - transform-origin: 100% 0%; - transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1); - z-index: 100; -} - -.help-tip:hover p { - cursor: default; - visibility: visible; - opacity: 1; - transform: scale(1.0); -} - -.help-tip p:before { - position: absolute; - content: ''; - width: 0; - height: 0; - border: 6px solid transparent; - border-bottom-color: #EFDDE3; - right: 10px; - top: -12px; -} - -.help-tip p:after { - width: 100%; - height: 40px; - content: ''; - position: absolute; - top: -5px; - left: 0; - z-index: 101; -} - -.upload_button { - background-color: #008000; -} - -.absolute { - position: absolute; -} - -.example { -padding: 0; -background: none; -border: none; -text-decoration: underline; -box-shadow: none; -text-align: left !important; -display: inline-block !important; -} - -footer { -visibility: hidden -} -""" - - -class View3DmolCell(py3Dmol.view): - def __init__(self, width=320, height=200): - divid = "3dmolviewer_UNIQUEID" - self.uniqueid = None - if isinstance(width, int): - width = '%dpx' % width - if isinstance(height, int): - height = '%dpx' % height - self.startjs = '''
-
def rgb_to_hex(rgb):
    """Convert an RGB colour given as floats to a ``#rrggbb`` hex string.

    Each of the first three components is scaled by 255, rounded, and clamped
    to the 0-255 range so that out-of-range inputs can never yield a malformed
    colour string (extra digits or a minus sign).

    Args:
        rgb: Sequence whose first three items are the red, green and blue
            channels, nominally in the range 0.0-1.0. Extra items are ignored.

    Returns:
        Lower-case hex colour string such as ``'#ff0080'``.

    Raises:
        IndexError: If fewer than three components are supplied.
    """
    channels = [min(255, max(0, round(component * 255))) for component in rgb]
    # Indexed access keeps the original behaviour: extras ignored, too few raises.
    return '#{:02x}{:02x}{:02x}'.format(channels[0], channels[1], channels[2])

{text}

', - ) - - -TASK_MAP = { - 'Compound-Protein Interaction': 'DTI', - 'Compound-Protein Binding Affinity': 'DTA', -} - -TASK_METRIC_MAP = { - 'DTI': 'AUROC', - 'DTA': 'CI', - 'Compound-Protein Interaction': 'AUROC', - 'Compound-Protein Binding Affinity': 'CI', - 'CPI': 'DTI', - 'CPA': 'DTA', -} - -PRESET_MAP = { - 'DeepDTA': 'deep_dta', - 'DeepConvDTI': 'deep_conv_dti', - 'GraphDTA': 'graph_dta', - 'MGraphDTA': 'm_graph_dta', - 'HyperAttentionDTI': 'hyper_attention_dti', - 'MolTrans': 'mol_trans', - 'TransformerCPI': 'transformer_cpi', - 'TransformerCPI2': 'transformer_cpi_2', - 'DrugBAN': 'drug_ban', - 'DrugVQA-Seq': 'drug_vqa' -} - -TARGET_FAMILY_MAP = { - 'General': 'general', - 'Kinase': 'kinase', - 'Non-Kinase Enzyme': 'non_kinase_enzyme', - 'Membrane Receptor': 'membrane_receptor', - 'Nuclear Receptor': 'nuclear_receptor', - 'Ion Channel': 'ion_channel', - 'Others': 'others', - # 'general': 'general', - # 'kinase': 'kinase', - # 'non-kinase enzyme': 'non_kinase_enzyme', - # 'membrane receptor': 'membrane_receptor', - # 'nuclear Receptor': 'nuclear_receptor', - # 'ion channel': 'ion_channel', - # 'others': 'others', -} - -TARGET_LIBRARY_MAP = { - 'DrugBank (Human)': 'drugbank_targets.csv', - 'ChEMBL33 (Human)': 'ChEMBL33_human_proteins.csv', -} - -DRUG_LIBRARY_MAP = { - 'DrugBank (Human)': 'drugbank_compounds.csv', - 'Drug Repurposing Hub': 'drug_repurposing_hub.csv', - 'Enamine Discovery Diversity Set (DDS-10)': 'Enamine_Discovery_Diversity_Set_10_10240cmpds_20240130.csv', - 'Enamine Phenotypic Screening Library (PSL-5760)': 'Enamine_Phenotypic_Screening_Library_plated_5760cmds_2020_07_20.csv' -} - -COLUMN_ALIASES = { - 'X1': 'Compound SMILES', - 'X2': 'Target FASTA', - 'ID1': 'Compound ID', - 'ID2': 'Target ID', - 'Y': 'Actual CPI/CPA', - 'Y^': 'Predicted CPI/CPA', -} - -DRUG_SCRENN_CPI_OPTS = [ - 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set', - 'Calculate Max. 
def remove_job_record(job_id):
    """Delete a job's database record together with its files on disk.

    Removes every record in ``db`` whose ``id`` equals *job_id*, then deletes
    each path matching ``{SERVER_DATA_DIR}/{job_id}*``.

    Args:
        job_id: Identifier of the job to purge.
    """
    # Build the query object locally rather than depending on a `Job` name
    # that this function never defined.
    Job = Query()
    db.remove(Job.id == job_id)
    for file_path in glob.glob(f"{SERVER_DATA_DIR}/{job_id}*"):
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # The file vanished between the glob and the removal; nothing to do.
            pass


def check_expiry():
    """Purge expired jobs and fail jobs that have been running for too long.

    For every record in ``db``:

    * A job that is not ``RUNNING`` is removed (record and files) once its
      ``expiry_time`` has passed. When no expiry time is stored,
      ``start_time + DB_EXPIRY`` is used instead.
    * A ``RUNNING`` job older than four hours is marked ``FAILED`` with a
      timeout error, and an email is sent if the job has an address.
    """
    max_running_seconds = 4 * 60 * 60  # 4 hours
    Job = Query()
    now = time()

    for job in db.all():
        if job['status'] != 'RUNNING':
            # `.get` tolerates records that never received an expiry time.
            expiry_time = job.get('expiry_time')
            if expiry_time is None:
                expiry_time = job['start_time'] + DB_EXPIRY
            if expiry_time < now:
                remove_job_record(job['id'])
        elif now - job['start_time'] > max_running_seconds:
            failure = {
                'status': 'FAILED',
                'error': 'Job has timed out by exceeding the maximum running time of 4 hours.',
            }
            db.update(failure, Job.id == job['id'])
            if job.get('email'):
                # `job` was read before the update; merge the new fields so the
                # notification reports FAILED instead of the stale RUNNING state.
                send_email({**job, **failure})
def alignment_score(query, target):
    """Return the local alignment score normalised by the longer sequence.

    Args:
        query: First sequence string.
        target: Second sequence string.

    Returns:
        ``PairwiseAligner`` local-mode score divided by
        ``max(len(query), len(target))``; ``0`` when either sequence is empty.
    """
    # An empty sequence cannot be aligned and would make the divisor zero.
    if not len(query) or not len(target):
        return 0
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    # `score` computes only the optimal score, without the alignments object.
    return aligner.score(query, target) / max(len(query), len(target))


def max_sequence_identity(seq, seen_fastas):
    """Find the sequence in *seen_fastas* that scores highest against *seq*.

    Args:
        seq: Query sequence string, or ``None``.
        seen_fastas: DataFrame with an ``X2`` column of sequences and,
            optionally, an ``ID2`` column of identifiers; may be ``None``.

    Returns:
        Dict with keys ``'Max. Sequence Identity'`` (best normalised score,
        ``1`` for an exact match, ``0`` when nothing can be compared) and
        ``'Max. Sequence Identity Target'`` (the ``ID2`` of the best sequence
        when present and non-empty, otherwise the sequence itself, or ``None``).
    """
    if seq is None or seen_fastas is None or seen_fastas.empty:
        return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}

    has_ids = 'ID2' in seen_fastas.columns

    def describe(fasta):
        # Prefer the first ID2 recorded for this sequence; fall back to the sequence.
        if has_ids:
            id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
            if pd.notnull(id2) and id2 != '':
                return id2
        return fasta

    if seq in seen_fastas['X2'].values:
        return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': describe(seq)}

    max_iden = 0
    best_fasta = None
    # Align each distinct sequence once; duplicates cannot change the maximum.
    for fasta in seen_fastas['X2'].drop_duplicates().values:
        identity = alignment_score(seq, fasta)
        if identity > max_iden:
            max_iden = identity
            best_fasta = fasta
            if max_iden == 1:
                break

    # Resolve the identifier once for the winner, not on every improvement.
    target = describe(best_fasta) if best_fasta is not None else None
    return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
def smarts_filter(mol, smarts_mols):
    """Tell whether *mol* is free of every substructure in *smarts_mols*.

    Args:
        mol: Molecule exposing ``HasSubstructMatch``.
        smarts_mols: Iterable of query molecules; ``None`` entries are skipped.

    Returns:
        ``False`` as soon as one query matches, ``True`` when none do.
    """
    return not any(
        pattern is not None and mol.HasSubstructMatch(pattern)
        for pattern in smarts_mols
    )
def validate_columns(df, mandatory_cols):
    """Ensure every mandatory column is present in *df*.

    Args:
        df: DataFrame to check.
        mandatory_cols: Iterable of column names that must exist.

    Raises:
        ValueError: Naming exactly the columns that are missing.
    """
    missing_cols = [col for col in mandatory_cols if col not in df.columns]
    if missing_cols:
        # Report only the absent columns, not the whole mandatory list.
        raise ValueError(
            f"The following mandatory columns are missing "
            f"in the uploaded dataset: {str(missing_cols).strip('[]')}."
        )


def process_target_fasta(sequence):
    """Extract the first sequence from FASTA-formatted (or raw) text.

    A leading ``>`` header line is dropped, the remaining lines are stripped
    and joined, and everything from the next ``>`` onwards is discarded.

    Args:
        sequence: FASTA text or a bare sequence string.

    Returns:
        The first record's sequence as a single string.

    Raises:
        gr.Error: If *sequence* is empty or cannot be processed.
    """
    try:
        if sequence:
            # `splitlines` plus per-line strip handles CRLF endings and padded
            # lines, which would otherwise leave `\r` or spaces in the sequence.
            lines = [line.strip() for line in sequence.strip().splitlines()]
            if lines[0].startswith(">"):
                lines = lines[1:]
            return ''.join(lines).split(">")[0].strip()
        else:
            raise ValueError('Empty FASTA sequence.')
    except Exception as e:
        raise gr.Error(f'Failed to process FASTA due to error: {str(e)}') from e
def check_user_running_job(email, request):
    """Report whether the requester already has a job in the RUNNING state.

    The requester is identified by the first available of: the supplied
    email, the request cookies, or the client host address.

    Args:
        email: Email address entered by the user; falsy when not provided.
        request: Incoming request exposing ``cookies`` and ``client.host``.

    Returns:
        A user-facing message naming the running job and the matching
        criterion, or ``False`` when no running job was found.

    Raises:
        gr.Error: If the job database cannot be queried.
    """
    message = ("You already have a running prediction job (ID: {id}) under this {reason}. "
               "Please wait for it to complete before submitting another job.")
    try:
        Job = Query()
        is_running = (Job.status == "RUNNING")

        if email:
            # check if a job is running for the email
            jobs = db.search((Job.email == email) & is_running)
            if jobs:
                return message.format(id=jobs[0]['id'], reason="email")
        elif request.cookies:
            # check if a job is running for the session
            for key, value in request.cookies.items():
                jobs = db.search((Job.cookies[key] == value) & is_running)
                if jobs:
                    return message.format(id=jobs[0]['id'], reason="session")
        else:
            # check if a job is running for the IP; other code in this file reads
            # the address from the lower-case `ip` key, so match that key as well
            # as the previously queried `IP`.
            host = request.client.host
            jobs = db.search(((Job.ip == host) | (Job.IP == host)) & is_running)
            if jobs:
                return message.format(id=jobs[0]['id'], reason="IP")

        return False
    except Exception as e:
        raise gr.Error(f'Failed to validate user running jobs due to error: {str(e)}') from e
def lookup_job(job_id):
    """Poll the job database and stream status updates for *job_id*.

    Generator used as a UI event handler. Each iteration sleeps five seconds,
    searches ``db`` for the job and yields a dict mapping UI components to
    their new values. Polling stops once the job is ``COMPLETED`` or
    ``FAILED``, or when the job is still not found after several retries.

    Args:
        job_id: Identifier of the job to look up.

    Yields:
        dict: Component updates (status text, button visibility, selected tab
        and, on completion, the output file for the report).

    Raises:
        gr.Error: If querying or formatting the job status fails.
    """
    gr.Info('Start querying the job database...')
    stop = False
    retry = 0
    while not stop:
        try:
            sleep(5)
            Job = Query()
            jobs = db.search((Job.id == job_id))
            if jobs:
                job = jobs[0]
                job_status = job['status']
                job_type = job['type']
                error = job.get('error')
                # Resolve the timezone once per poll: each lookup is an HTTP request.
                timezone = get_timezone_by_ip(job['ip'])
                start_time = ts_to_str(job['start_time'], timezone)
                end_time = ts_to_str(job['end_time'], timezone) if job.get('end_time') else None
                expiry_time = ts_to_str(job['expiry_time'], timezone) if job.get('expiry_time') else None

                if job_status == "RUNNING":
                    yield {
                        pred_lookup_status: f'''
Your **{job_type}** job (ID: **{job_id}**) started at
**{start_time}** and is **RUNNING...**

It might take a few minutes up to a few hours depending on the prediction dataset, the model, and the queue status.
You may keep the page open and wait for job completion, or close the page and revisit later to look up the job status
using the job id. You will also receive an email notification once the job is done.
''',
                        pred_lookup_btn: gr.Button(visible=False),
                        pred_lookup_stop_btn: gr.Button(visible=True)
                    }
                elif job_status == "COMPLETED":
                    stop = True
                    msg = f"Your {job_type} job (ID: {job_id}) has been **COMPLETED**"
                    msg += f" at {end_time}" if end_time else ""
                    msg += f" and the results will expire by {expiry_time}." if expiry_time else "."
                    msg += ' Redirecting to the report page...'

                    gr.Info(msg)
                    yield {
                        pred_lookup_status: msg,
                        pred_lookup_btn: gr.Button(visible=True),
                        pred_lookup_stop_btn: gr.Button(visible=False),
                        tabs: gr.Tabs(selected='Chemical Property Report'),
                        file_for_report: job['output_file']
                    }
                elif job_status == "FAILED":
                    stop = True
                    msg = f'Your {job_type} job (ID: {job_id}) has **FAILED**'
                    msg += f' at {end_time}' if end_time else ''
                    # Show the error whenever one was recorded; this was
                    # previously gated on the unrelated expiry time.
                    msg += f' due to error: {error}.' if error else '.'
                    gr.Info(msg)
                    yield {
                        pred_lookup_status: msg,
                        pred_lookup_btn: gr.Button(visible=True),
                        pred_lookup_stop_btn: gr.Button(visible=False),
                        tabs: gr.Tabs(selected='Prediction Status Lookup'),
                    }
            else:
                stop = (retry > 3)
                if not stop:
                    msg = f'Job ID {job_id} not found. Retrying... ({retry})'
                else:
                    msg = f'Job ID {job_id} not found after {retry} retries. Please check the job ID and try again.'
                gr.Info(msg)
                retry += 1
                yield {
                    pred_lookup_status: msg,
                    pred_lookup_btn: gr.Button(visible=True),
                    pred_lookup_stop_btn: gr.Button(visible=False),
                    tabs: gr.Tabs(selected='Prediction Status Lookup'),
                }

        except Exception as e:
            raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}') from e
Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts: - x2 = prediction_df['X2'].iloc[0] - pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy() - pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp) - - @cache - def max_sim(smiles): - return max_tanimoto_similarity(smiles, pos_compounds_df) - - prediction_df[[ - 'Max. Tanimoto Similarity to Known Ligands', - 'Max. Sim. Ligand' - ]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series) - - max_sim.cache_clear() - - if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts: - x2 = prediction_df['X2'].iloc[0] - prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize) - - @cache - def max_id(compound): - pos_targets_df = df_training.loc[df_training['X1'] == compound] - return max_sequence_identity(x2, pos_targets_df) - - prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound', - 'Max. Id. Target']] = ( - prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series) - ) - prediction_df.drop(['X1^'], axis=1, inplace=True) - - max_id.cache_clear() - - # Advanced options for Target Protein Identification - if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts: - x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0]) - if x1 not in df_training['X1'].values: - df_training['FP'] = df_training['X1'].parallel_apply(smiles_to_ecfp) - - prediction_df[[ - 'Max. Tanimoto Similarity to Training Compounds', - 'Max. Sim. Training Compound' - ]] = pd.Series(max_tanimoto_similarity(x1, df_training)) - - if "Calculate Max. 
Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts: - x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0]) - pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy() - - @cache - def max_id(fasta): - return max_sequence_identity(fasta, pos_targets_df) - - prediction_df[[ - 'Max. Sequence Identity to Known Targets of Input Compound', - 'Max. Id. Target' - ]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series) - - max_id.cache_clear() - - if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts: - x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0]) - - @cache - def max_sim(fasta): - pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy() - if x1 not in pos_targets_df['X1'].values: - pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp) - return max_tanimoto_similarity(x1, pos_targets_df) - - prediction_df[[ - 'Max. Tanimoto Similarity to Known Ligands of Identified Target', - 'Max. Sim. 
Ligand' - ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series) - - max_sim.cache_clear() - - return prediction_df - - -def submit_predict(predict_filepath, task, preset, target_family, opts, job_info): - job_id = job_info['id'] - status = job_info['status'] - send_email(job_info) - db.insert(job_info) - error = None - task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'} - predictions_file = None - df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv') - df_training['X1^'] = df_training['X1'] - orig_df = pd.read_csv(predict_filepath) - alignment_df = get_fasta_family_map() - prediction_df = pd.DataFrame() - - @cache - def detect_family(query): - # Check for an exact match first - exact_match = alignment_df[alignment_df['X2'] == query] - if not exact_match.empty: - row = exact_match.iloc[0] - return row['Target Family'] - # If no exact match, then calculate alignment score - else: - aligner = PairwiseAligner() - aligner.mode = 'local' - - def align_score(target): - alignment = aligner.align(query, target) - return alignment.score / max(len(query), len(target)) - - alignment_df['score'] = alignment_df['X2'].apply(align_score) - row = alignment_df.loc[alignment_df['score'].idxmax()] - return row['Target Family'] - - if 'Target Family' not in orig_df.columns: - orig_df['Target Family'] = None - if orig_df['Target Family'].isna().any(): - orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = ( - orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family) - ) - orig_df['Target Family'] = orig_df['Target Family'].str.capitalize() - detect_family.cache_clear() - - orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize) - - orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False) - annotated_df = orig_df[~orig_df['Y'].isna()].copy() - annotated_df.rename(columns={'Y': 'Y^'}, inplace=True) - 
annotated_df['Source'] = 'Database' - columns_to_drop = ['X1^', 'Compound', 'Scaffold', 'Scaffold SMILES'] - columns_to_drop = [col for col in columns_to_drop if col in annotated_df.columns] - annotated_df.drop(columns_to_drop, axis=1, inplace=True) - - # Save the unannotated data - unannotated_df = orig_df[orig_df['Y'].isna()].drop(['Y'], axis=1) - if not unannotated_df.empty: - unannotated_df.to_csv(predict_filepath, index=False, na_rep='') - else: - annotated_df.to_csv(predictions_file, index=False, na_rep='') - status = "COMPLETED" - return {run_state: False} - - columns_to_drop = ['ID1', 'X1^', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^'] - columns_to_drop = [col for col in columns_to_drop if col in orig_df.columns] - orig_df.drop(columns_to_drop, axis=1, inplace=True) - - try: - if target_family != 'Family-Specific Auto-Recommendation': - target_family_value = TARGET_FAMILY_MAP[target_family.title()] - task_value = TASK_MAP[task] - preset_value = PRESET_MAP[preset] - predictions_file = (f'{SERVER_DATA_DIR}/' - f'{job_id}_{task_file_abbr[task]}_{preset}_{target_family_value}_predictions.csv') - - cfg = hydra.compose( - config_name="webserver_inference", - overrides=[f"task={task_value}", - f"preset={preset_value}", - f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family_value}.ckpt", - f"data.data_file='{str(predict_filepath)}'"]) - - predictions, _ = predict(cfg) - predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True) - predictions['Source'] = f'Predicted ({preset} {target_family})' - df_list = [prediction_df, predictions] - prediction_df = pd.concat([df for df in df_list if not df.empty]) - - else: - predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_family-recommended_predictions.csv' - task_value = TASK_MAP[task] - score = TASK_METRIC_MAP[task] - benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv') - predict_df = 
pd.read_csv(predict_filepath) - - for family, subset in predict_df.groupby('Target Family'): - predict_subset_filepath = os.path.join( - os.path.dirname(predict_filepath), f'{job_id}_{family}_input.csv' - ) - subset.to_csv(predict_subset_filepath, index=False, na_rep='') - - seen_compounds = get_seen_smiles(family, task_value)['X1'].values - if subset['X1^'].iloc[0] in seen_compounds: - scenario = "Seen Compound" - else: - scenario = "Unseen Compound" - - filtered_df = benchmark_df[(benchmark_df['Family'] == family.title()) - & (benchmark_df['Scenario'] == scenario) - & (benchmark_df['Type'] == 'Family')] - - seen_compounds = get_seen_smiles('General', task_value)['X1'].values - if subset['X1^'].iloc[0] in seen_compounds: - scenario = "Seen Compound" - else: - scenario = "Unseen Compound" - - filtered_df = pd.concat([ - filtered_df, - benchmark_df[(benchmark_df['Family'] == family.title()) - & (benchmark_df['Scenario'] == scenario) - & (benchmark_df['Type'] == 'General')] - ]) - - row = filtered_df.loc[filtered_df[score].idxmax()] - preset_value = PRESET_MAP[row['Model']] - target_family = TARGET_FAMILY_MAP[family.title()] if row['Type'] == 'Family' else 'general' - cfg = hydra.compose( - config_name="webserver_inference", - overrides=[f"task={task_value}", - f"preset={preset_value}", - f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt", - f"data.data_file='{str(predict_subset_filepath)}'"]) - - predictions, _ = predict(cfg) - predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True) - predictions['Source'] = (f'Predicted ({row["Model"]} ' - f'{family.title() if row["Type"] == "Family" else "General"})') - df_list = [prediction_df, predictions] - prediction_df = pd.concat([df for df in df_list if not df.empty]) - - prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False) - df_list = [prediction_df, annotated_df] - prediction_df = pd.concat([df for df in 
def update_df(file, progress=gr.Progress(track_tqdm=True)):
    """Load a prediction CSV and build the updates for the report preview UI.

    The prediction task is inferred from the file name (``_CPI_`` or ``_CPA_``).
    The job type (target identification vs. drug hit screening) is inferred
    from the number of unique compounds (``X1``) and targets (``X2``).

    Args:
        file: Path of the CSV file to load. A falsy value or a path that is
            not an existing file disables the analyze button.
        progress: Gradio progress tracker (tracks tqdm progress bars).

    Returns:
        A dict mapping Gradio components to their new values. When the file is
        missing or contains neither ``X1`` nor ``X2``, only ``analyze_btn`` is
        updated (made non-interactive).
    """
    if not (file and Path(file).is_file()):
        return {analyze_btn: gr.Button(interactive=False)}

    cpi_task = 'Compound-Protein Interaction'
    cpa_task = 'Compound-Protein Binding Affinity'

    task = None
    if "_CPI_" in str(file):
        task = cpi_task
    elif "_CPA_" in str(file):
        task = cpa_task

    df = pd.read_csv(file)

    if 'N' in df.columns:
        df.set_index('N', inplace=True)

    if not any(col in ['X1', 'X2'] for col in df.columns):
        gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
        return {analyze_btn: gr.Button(interactive=False)}

    if 'X1' in df.columns:
        # Rebuild molecule objects unless the column already holds objects.
        if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
            df['Compound'] = df['X1'].parallel_apply(
                lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
        df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
        df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
        df['Pharmacophore'] = None

    if task == cpa_task:
        # Convert Y^ from pIC50 to IC50 in nM (10 ** -pIC50 scaled by 1e9;
        # presumably pIC50 is defined on molar IC50 - confirm with the model output).
        if 'Y^' in df.columns:
            df['Y^'] = 10 ** (-df['Y^']) * 1e9

    # The validation above only guarantees that ONE of X1/X2 exists, so a
    # missing column must count as zero entities instead of raising KeyError.
    n_compound = df['X1'].nunique() if 'X1' in df.columns else 0
    n_protein = df['X2'].nunique() if 'X2' in df.columns else 0

    job = None
    opts = None  # stays None when the task cannot be inferred from the file name
    if n_compound == 1 and n_protein >= 2:
        job = 'Target Protein Identification'
        opts = {
            cpi_task: TARGET_IDENTIFY_CPI_OPTS,
            cpa_task: TARGET_IDENTIFY_CPA_OPTS,
        }.get(task)
    elif n_compound >= 2 and n_protein == 1:
        job = 'Drug Hit Screening'
        opts = {
            cpi_task: DRUG_SCRENN_CPI_OPTS,
            cpa_task: DRUG_SCRENN_CPA_OPTS,
        }.get(task)

    # Only offer advanced options when both the job type and its option list
    # are known; otherwise keep the checkbox group hidden.
    if job and opts is not None:
        job_opts_update = gr.CheckboxGroup(
            label=f'{job} Advanced Options',
            choices=opts, visible=True
        )
    else:
        job_opts_update = gr.CheckboxGroup(visible=False)

    return {
        html_report: create_html_report(df, file=None, task=task),
        raw_df: df,
        report_df: df.copy(),
        analyze_btn: gr.Button(interactive=True),
        report_task: task,
        job_opts: job_opts_update,
    }
visual and physical space - job = 'Chemical Property' - unique_entity = 'Unique Entity' - unique_df = None - category = None - columns_unique = None - - if 'Exclude Pharmacophore 3D' not in opts: - df_html['Pharmacophore'] = df_html['Compound'].parallel_apply( - lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x) - - if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts: - df_html['Compound'] = df_html['Compound'].parallel_apply( - lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x) - else: - df_html.drop(['Compound'], axis=1, inplace=True) - - if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts: - df_html['Scaffold'] = df_html['Scaffold'].parallel_apply( - lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x) - else: - df_html.drop(['Scaffold'], axis=1, inplace=True) - - if 'X1' in df_html.columns and 'X2' in df_html.columns: - n_compound = df_html['X1'].nunique() - n_protein = df_html['X2'].nunique() - - if n_compound == 1 and n_protein >= 2: - unique_entity = 'Compound of Interest' - if any(col in df_html.columns for col in ['Y^', 'Y']): - job = 'Target Protein Identification' - category = 'Target Family' - columns_unique = df_html.columns.isin( - ['ID1', 'Compound', 'Scaffold', 'X1', 'Scaffold SMILES', 'Pharmacophore', - 'Max. Tanimoto Similarity to Training Compounds', 'Max. Sim. Training Compound'] - + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys()) - ) - - elif n_compound >= 2 and n_protein == 1: - unique_entity = 'Target of Interest' - if any(col in df_html.columns for col in ['Y^', 'Y']): - job = 'Drug Hit Screening' - category = 'Scaffold SMILES' - columns_unique = df_html.columns.isin( - ['X2', 'ID2', 'Max. Sequence Identity to Training Targets', 'Max. Id. 
Training Target'] - ) - - elif 'Y^' in df_html.columns: - job = 'Interaction Pair Inference' - - df_html.rename(columns=column_aliases, inplace=True) - df_html.index.name = 'Index' - if 'Target FASTA' in df_html.columns: - df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply( - lambda x: wrap_text(x) if not pd.isna(x) else x) - - num_cols = df_html.select_dtypes('number').columns - num_col_colors = sns.color_palette('husl', len(num_cols)) - bool_cols = df_html.select_dtypes(bool).columns - bool_col_colors = {True: 'lightgreen', False: 'lightpink'} - - if columns_unique is not None: - unique_df = df_html.loc[:, columns_unique].iloc[[0]].copy() - df_html = df_html.loc[:, ~columns_unique] - df_html.dropna(how='all', axis=1, inplace=True) - unique_df.dropna(how='all', axis=1, inplace=True) - - if not file: - if 'Compound ID' in df_html.columns: - df_html.drop(['Compound SMILES'], axis=1, inplace=True) - if 'Target ID' in df_html.columns: - df_html.drop(['Target FASTA'], axis=1, inplace=True) - if 'Target FASTA' in df_html.columns: - df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply( - lambda x: wrap_text(x) if not pd.isna(x) else x) - if 'Scaffold SMILES' in df_html.columns: - df_html.drop(['Scaffold SMILES'], axis=1, inplace=True) - - # FIXME: Temporarily drop pharmacophore column before an image solution is found - if 'Pharmacophore' in df_html.columns: - df_html.drop(['Pharmacophore'], axis=1, inplace=True) - if unique_df is not None and 'Pharmacophore' in unique_df.columns: - unique_df.drop(['Pharmacophore'], axis=1, inplace=True) - - styled_df = df_html.fillna('').style.format(precision=3) - - for i, col in enumerate(num_cols): - cmap = sns.light_palette(num_col_colors[i], as_cmap=True) - if col in df_html.columns: - if col not in ['Binding Affinity (IC50 [nM])']: - cmap.set_bad('white') - styled_df = styled_df.background_gradient( - subset=[col], cmap=cmap) - else: - cmap = cmap.reversed() - cmap.set_bad('white') - styled_df = 
styled_df.background_gradient( - subset=[col], cmap=cmap) - - if any(df_html.columns.isin(bool_cols)): - styled_df.applymap(lambda val: f'background-color: {bool_col_colors[val]}', subset=bool_cols) - - table_html = styled_df.to_html() - unique_html = '' - if unique_df is not None: - if 'Target FASTA' in unique_df.columns: - unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '
') - - if 'Max. Sequence Identity to Training Targets' in unique_df.columns: - # Add alert emoji for sequence identity below 0.85 - if unique_df['Max. Sequence Identity to Training Targets'].iloc[0] < 0.85: - unique_df['Max. Sequence Identity to Training Targets'] = ( - unique_df['Max. Sequence Identity to Training Targets'].apply( - lambda x: f'{x:.3f}' - f' ⚠️Lower than recommended (0.85)' - f' - predictive reliability may be compromised' - ) - ) - - if 'Max. Tanimoto Similarity to Training Compounds' in unique_df.columns: - # Add alert emoji for sequence identity below 0.85 - if unique_df['Max. Tanimoto Similarity to Training Compounds'].iloc[0] < 0.85: - unique_df['Max. Tanimoto Similarity to Training Compounds'] = ( - unique_df['Max. Tanimoto Similarity to Training Compounds'].apply( - lambda x: f'{x:.3f}' - f' ⚠️Lower than recommended (0.85)' - f' - predictive reliability may be compromised' - ) - ) - - if any(unique_df.columns.isin(bool_cols)): - unique_df = unique_df.style.applymap( - lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols) - unique_html = (f'
' - f'{unique_df.to_html(escape=False, index=False)}
') - - return (f'
{job} Report Preview (Top 30 Records)
' - f'
{unique_html}
' - f'
{table_html}
') - - else: - image_zoom_formatter = HTMLTemplateFormatter(template='
<%= value %>
') - uniprot_id_formatter = HTMLTemplateFormatter( - template='<% if (value == value) { ' # Check if value is not NaN - 'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) ' - # Check if value is a valid UniProt ID - '{ %><%= value %><% ' - # Else treat it as a sequence or other plain-text string, line-warping every 60 characters - '} else { %>
<%= value.match(/.{1,60}/g).join("
") ' - '%>
<% } %><% } else { %><% } %>' # Output empty string if value is NaN - ) - pubchem_id_formatter = HTMLTemplateFormatter( - template='<% if (value == value) { ' # Check if value is not NaN - '%><%= value %>' - '<% } else { %><% } %>' # Output empty string if value is NaN - ) - alert_emoji_formatter = HTMLTemplateFormatter( - template='<% if (value < 0.85) { ' - '%><%= value %> ' - '⚠️Lower than recommended (0.85) - predictive reliability may be compromised<% ' - '} else { %><%= value %><% } %>' - ) - bool_formatters = {col: BooleanFormatter() for col in bool_cols} - float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns} - other_formatters = { - 'Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True}, - 'Compound': image_zoom_formatter, - 'Scaffold': image_zoom_formatter, - 'Pharmacophore': {'type': 'executeScriptFormatter'}, - 'Target FASTA': {'type': 'textarea', 'width': 60}, - 'Target ID': uniprot_id_formatter, - 'Compound ID': pubchem_id_formatter, - 'Max. Sim. Ligand': pubchem_id_formatter, - 'Max. Id. Target': uniprot_id_formatter, - 'Max. Sim. Training Compound': pubchem_id_formatter, - 'Max. Id. Training Target': uniprot_id_formatter, - 'Max. Sequence Identity to Training Targets': alert_emoji_formatter, - 'Max. 
Sequence Identity to Known Targets of Hit Compound': alert_emoji_formatter, - } - formatters = {**bool_formatters, **float_formatters, **other_formatters} - - # html = df.to_html(file) - # return html - - report_table = pn.widgets.Tabulator( - df_html, formatters=formatters, - frozen_columns=[ - 'Index', 'Target ID', 'Compound ID', 'Compound' - ], - disabled=True, sizing_mode='stretch_both', pagination='local', page_size=10 - ) - - for i, col in enumerate(num_cols): - cmap = sns.light_palette(num_col_colors[i], as_cmap=True) - if col not in ['Binding Affinity (IC50 [nM])']: - if col not in ['Interaction Probability']: - cmap.set_bad(color='white') - report_table.style.background_gradient( - subset=df_html.columns == col, cmap=cmap) - else: - continue - else: - cmap = cmap.reversed() - cmap.set_bad(color='white') - report_table.style.background_gradient( - subset=df_html.columns == col, cmap=cmap) - - pie_charts = {} - for y in df_html.columns.intersection(['Interaction Probability', 'Binding Affinity (IC50 [nM])']): - pie_charts[y] = [] - for k in [10, 30, 100]: - if k < len(df_html): - pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=k)) - pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=len(df_html))) - - # Remove keys with empty values - pie_charts = {k: v for k, v in pie_charts.items() if any(v)} - - panel_css = """ - .tabulator { - font-family: Courier New !important; - font-weight: normal !important; - font-size: 12px !important; - } - - .tabulator-cell { - overflow: visible !important; - align-content: center !important; - } - - .tabulator-cell:hover { - z-index: 1000 !important; - } - - .image-zoom-viewer { - display: inline-block; - overflow: visible; - z-index: 1000; - } - - .image-zoom-viewer::after { - content: ""; - top: 0; - left: 0; - width: 100%; - height: 100%; - pointer-events: none; - } - - .image-zoom-viewer:hover::after { - pointer-events: all; - } - - /* When hovering over the 
container, scale its child (the SVG) */ - .tabulator-cell:hover .image-zoom-viewer svg { - padding: 3px; - position: absolute; - background-color: rgba(250, 250, 250, 0.854); - box-shadow: 0 0 10px rgba(0, 0, 0, 0.618); - border-radius: 3px; - transform: scale(3); /* Scale up the SVG */ - transition: transform 0.3s ease; - pointer-events: none; /* Prevents the SVG from blocking mouse interactions */ - z-index: 1000; - } - """ - - pn.extension( - raw_css=[panel_css], - js_files={'panel_custom': 'static/panel.js', '3Dmol': 'static/3Dmol-min.js'}, - # js_modules={'3Dmol': 'static/3Dmol-min.js'}, - inline=True, - ) - - template = pn.template.VanillaTemplate( - title=f'DeepSEQreen {job} Report', - sidebar=[], - favicon='deepseqreen.ico', - logo='deepseqreen.svg', - header_background='#F3F5F7', - header_color='#4372c4', - busy_indicator=None, - ) - - stats_pane = pn.Row() - if unique_df is not None: - unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width', - show_index=False, disabled=True, - frozen_columns=['Compound ID', 'Compound', 'Target ID']) - # if pie_charts: - # unique_table.width = 640 - stats_pane.append(pn.Column(f'### {unique_entity}', unique_table)) - if pie_charts: - for score_name, figure_list in pie_charts.items(): - stats_pane.append( - pn.Column(f'### {category} by Top {score_name}', - pn.Tabs(*figure_list, tabs_location='above')) - # pn.Card(pn.Row(v), title=f'{category} by Top {k}') - ) - - if stats_pane: - template.main.append(pn.Card(stats_pane, - sizing_mode='stretch_width', title='Summary Statistics', margin=10)) - - template.main.append( - pn.Card(report_table, title=f'{job} Results', # width=1200, - margin=10) - ) - - template.save(file, title=f'DeepSEQreen {job} Report', resources=INLINE) - return file - - -def create_pie_chart(df, category, value, top_k): - if category not in df or value not in df: - return - top_k_df = df.nlargest(top_k, value) - category_counts = top_k_df[category].value_counts() - 
data = pd.DataFrame({category: category_counts.index, 'value': category_counts.values}) - - data['proportion'] = data['value'] / data['value'].sum() - # Merge rows with proportion less than 0.2% into one row - mask = data['proportion'] < 0.002 - if any(mask): - merged_row = data[mask].sum() - merged_row[category] = '...' - data = pd.concat([data[~mask], pd.DataFrame(merged_row).T]) - data['angle'] = data['proportion'] * 2 * pi - - color_dict = {cat: color for cat, color in - zip(df[category].unique(), - (Category20c_20 * (len(df[category].unique()) // 20 + 1))[:len(df[category].unique())])} - color_dict['...'] = '#636363' - data['color'] = data[category].map(color_dict) - - tooltips = [ - (f"{category}", f"@{{{category}}}"), - ("Count", "@value"), - ("Percentage", "@proportion{0.0%}") - ] - - if category == 'Scaffold SMILES' and 'Scaffold' in df.columns: - data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left', - left_on='Scaffold SMILES', right_on='Scaffold SMILES') - tooltips.append(("Scaffold", "
@{Scaffold}{safe}
")) - p = figure(height=384, width=960, name=f"Top {top_k}" if top_k < len(df) else 'All', sizing_mode='stretch_height', - toolbar_location=None, tools="hover", tooltips=tooltips, x_range=(-0.4, 0.4)) - - def truncate_label(label, max_length=60): - return label if len(label) <= max_length else label[:max_length] + "..." - - data['legend_field'] = data[category].apply(truncate_label) - - p.add_layout(Legend(padding=0, margin=0), 'right') - p.wedge(x=0, y=1, radius=0.3, - start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'), - line_color="white", fill_color='color', legend_field='legend_field', source=data) - - # Limit the number of legend items to 20 and add "..." if there are more than 20 items - if len(p.legend.items) > 20: - new_legend_items = p.legend.items[:20] - new_legend_items.append(LegendItem(label="...")) - p.legend.items = new_legend_items - - p.legend.label_text_font_size = "10pt" - p.legend.label_text_font = "courier" - p.axis.axis_label = None - p.axis.visible = False - p.grid.grid_line_color = None - p.outline_line_width = 0 - p.min_border = 0 - p.margin = 0 - - return p - - -def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progress(track_tqdm=True)): - df_report = df.copy() - try: - for filter_name in filter_list: - df_report[filter_name] = df_report['Compound'].parallel_apply( - lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x) - - for score_name in score_list: - df_report[score_name] = df_report['Compound'].parallel_apply( - lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x) - - if opt_list: - df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv') - df_report = apply_advanced_opts(df_report, opt_list, df_training) - - return (create_html_report(df_report, file=None, task=task), df_report, - gr.File(visible=False), gr.File(visible=False)) - - except Exception as e: - gr.Warning(f'Failed to report results due to error: {str(e)}') - return None, None, 
def wrap_text(text, line_length=60):
    """Hard-wrap a sequence string, or each record of a FASTA string.

    Args:
        text: Plain string or FASTA-formatted string (starting with ``>``).
            Non-string values (e.g. NaN/None) are returned unchanged.
        line_length: Maximum number of characters per wrapped line.

    Returns:
        The wrapped string. For FASTA input every header line is kept as is
        and only the sequence lines of each record are re-wrapped.
    """
    if not isinstance(text, str):
        return text

    wrapper = textwrap.TextWrapper(width=line_length)
    if not text.startswith('>'):
        return wrapper.fill(text)

    wrapped_sections = []
    for section in text.split('>'):
        if not section:
            # Empty piece produced by the leading '>'.
            continue
        lines = section.split('\n')
        seq_header = lines[0]
        wrapped_seq = wrapper.fill(''.join(lines[1:]))
        wrapped_sections.append(f">{seq_header}\n{wrapped_seq}")
    return '\n'.join(wrapped_sections)


def unwrap_text(text):
    """Undo line wrapping: strip surrounding whitespace and drop all newlines.

    Non-string values are returned unchanged, mirroring ``wrap_text``.
    """
    if not isinstance(text, str):
        return text
    # Fixed: the original accessed ``text.strip.replece`` (method never called,
    # misspelled ``replace``) and therefore always raised AttributeError.
    return text.strip().replace('\n', '')


def drug_library_from_sdf(sdf_path):
    """Load a compound library from an SDF file into a DataFrame.

    SMILES are stored in column ``X1`` and molecule objects in ``Compound``.
    """
    return PandasTools.LoadSDF(
        sdf_path,
        smilesName='X1', molColName='Compound', includeFingerprints=True
    )


def process_target_library_upload(library_upload):
    """Read an uploaded target library (CSV or FASTA) and validate it.

    Args:
        library_upload: Path of the uploaded file.

    Returns:
        DataFrame that has passed ``validate_columns(df, ['X2'])``.

    Raises:
        gr.Error: If the file extension is neither ``.csv`` nor ``.fasta``.
    """
    if library_upload.endswith('.csv'):
        df = pd.read_csv(library_upload)
    elif library_upload.endswith('.fasta'):
        df = target_library_from_fasta(library_upload)
    else:
        raise gr.Error('Currently only CSV and FASTA files are supported as target libraries.')
    validate_columns(df, ['X2'])
    return df


def process_drug_library_upload(library_upload):
    """Read an uploaded compound library (CSV or SDF) and validate it.

    Args:
        library_upload: Path of the uploaded file.

    Returns:
        DataFrame that has passed ``validate_columns(df, ['X1'])``.

    Raises:
        gr.Error: If the file extension is neither ``.csv`` nor ``.sdf``.
    """
    if library_upload.endswith('.csv'):
        df = pd.read_csv(library_upload)
    elif library_upload.endswith('.sdf'):
        df = drug_library_from_sdf(library_upload)
    else:
        raise gr.Error('Currently only CSV and SDF files are supported as drug libraries.')
    validate_columns(df, ['X1'])
    return df


def target_library_from_fasta(fasta_path):
    """Parse a FASTA file into a DataFrame with record IDs and sequences.

    Returns:
        DataFrame with columns ``ID2`` (record id) and ``X2`` (sequence string).
    """
    records = list(SeqIO.parse(fasta_path, "fasta"))
    id2 = [record.id for record in records]
    seq = [str(record.seq) for record in records]
    df = pd.DataFrame({'ID2': id2, 'X2': seq})
    return df
checkbox_background_color='white', - checkbox_border_color='#4372c4', - border_color_primary='#4372c4', - border_color_accent='#2e6ab5', - button_primary_background_fill='#2e6ab4', - button_primary_text_color='white', - body_text_color='#28496F', - block_background_fill='#fbfcfd', - block_title_text_color='#28496F', - block_label_text_color='#28496F', - block_info_text_color='#505358', - block_border_color=None, - # input_border_color='#4372c4', - # panel_border_color='#4372c4', - input_background_fill='#F1F2F4', -) - -with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48 * 3600)) as demo: - run_state = gr.State(value=False) - screen_flag = gr.State(value=False) - identify_flag = gr.State(value=False) - infer_flag = gr.State(value=False) - - with gr.Tabs() as tabs: - with gr.TabItem(label='Drug Hit Screening', id='Drug Hit Screening'): - gr.Markdown(''' - #
Drug Hit Screening
- -
- To predict interactions or binding affinities of a single target against a compound library. -
- ''') - with gr.Row(): - with gr.Column(): - HelpTip( - "Enter (paste) a amino acid sequence below manually or upload a FASTA file. " - "If multiple entities are in the FASTA, only the first will be used. " - "Alternatively, enter a Uniprot ID or gene symbol with organism and click Query for " - "the sequence." - ) - target_input_type = gr.Dropdown( - label='Step 1. Select Target Input Type and Input', - choices=['Sequence', 'UniProt ID', 'Gene symbol'], - info='Enter (paste) a FASTA string below manually or upload a FASTA file.', - value='Sequence', - scale=4, interactive=True - ) - - with gr.Row(): - target_id = gr.Textbox(show_label=False, visible=False, - interactive=True, scale=4, - info='Enter a UniProt ID and query.') - target_gene = gr.Textbox( - show_label=False, visible=False, - interactive=True, scale=4, - info='Enter a gene symbol and query. The first record will be used.') - target_organism = gr.Textbox( - info='Organism scientific name (default: Homo sapiens).', - placeholder='Homo sapiens', show_label=False, - visible=False, interactive=True, scale=4, ) - target_upload_btn = gr.UploadButton(label='Upload a FASTA File', type='binary', - visible=True, variant='primary', - size='lg') - target_paste_markdown = gr.Button(value='OR Paste Your Sequence Below', - variant='secondary') - target_query_btn = gr.Button(value='Query the Sequence', variant='primary', - visible=False, scale=4) - # with gr.Row(): - # example_uniprot = gr.Button(value='Example: Q16539', elem_classes='example', visible=False) - # example_gene = gr.Button(value='Example: MAPK14', elem_classes='example', visible=False) - example_fasta = gr.Button(value='Example: MAPK14 (Q16539)', elem_classes='example') - target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5) - # with gr.Row(): - # with gr.Column(): - # with gr.Column(): - # gr.File(label='Example FASTA file', - # value='data/examples/MAPK14.fasta', interactive=False) - - with gr.Row(): - with 
gr.Column(min_width=200): - HelpTip( - "Click Auto-detect to identify the protein family using sequence alignment. " - "This optional step allows applying a family-specific model instead of a all-family " - "model (general). " - "Manually select general if the alignment results are unsatisfactory." - ) - drug_screen_target_family = gr.Dropdown( - choices=list(TARGET_FAMILY_MAP.keys()), - value='General', - label='Step 2. Select Target Family (Optional)', interactive=True) - target_family_detect_btn = gr.Button(value='OR Let Us Auto-Detect for You', - variant='primary') - with gr.Column(min_width=200): - HelpTip( - "Interaction prediction provides you binding probability score between the target of " - "interest and each compound in the library, " - "while affinity prediction directly estimates their binding strength measured using " - "half maximal inhibitory concentration (IC50) in units of nM." - ) - drug_screen_task = gr.Dropdown( - list(TASK_MAP.keys()), - label='Step 3. Select a Prediction Task', - value='Compound-Protein Interaction') - with gr.Column(min_width=200): - HelpTip( - "Select your preferred model, or click Recommend for the best-performing model based " - "on the selected task, family, and whether the target was trained. " - "Please refer to documentation for detailed benchmark results." - ) - drug_screen_preset = gr.Dropdown( - list(PRESET_MAP.keys()), - label='Step 4. Select a Preset Model') - screen_preset_recommend_btn = gr.Button( - value='OR Let Us Recommend for You', variant='primary') - - with gr.Row(): - with gr.Column(): - HelpTip( - "Select a preset compound library (e.g., DrugBank). " - "Alternatively, upload a CSV file with a column named X1 containing compound SMILES, " - "or use an SDF file (Max. 10,000 compounds per task). Example CSV and SDF files are " - "provided below and can be downloaded by clicking the lower right corner." - ) - drug_library = gr.Dropdown( - label='Step 5. 
Select a Preset Compound Library', - choices=list(DRUG_LIBRARY_MAP.keys())) - with gr.Row(): - gr.File(label='Example SDF compound library', - value='data/examples/compound_library.sdf', interactive=False) - gr.File(label='Example CSV compound library', - value='data/examples/compound_library.csv', interactive=False) - drug_library_upload_btn = gr.UploadButton( - label='OR Upload Your Own Library', variant='primary') - drug_library_upload = gr.File(label='Custom compound library file', visible=False) - - with gr.Column(): - HelpTip(""" -Max. Sequence Identity between the Input Target and Targets in the Training Set: -this serves as an indicator of the predictioon applicability/reliability – -higher similarities indicate more reliable predictions (preferably > 0.85).
-Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target: -this serves as an indicator of both the confidence level and novelty of the predicted hit compounds – -higher similarities suggest greater confidence, while lower Tanimoto similarities may indicate the novelty -of the identified hit compounds compared to known drugs or true interacting compounds of the input target.
-Max. Sequence Identity between the Input Target and Known Targets of Hit Compound: -this serves as an additional indicator of the confidence level of the predicted hit compounds – -higher identities usually lead to greater confidence in the predictions.
-""") - drug_screen_opts = gr.CheckboxGroup( - label="Step 6. Select Advanced Options", - value=DRUG_SCRENN_CPI_OPTS[0], - choices=DRUG_SCRENN_CPI_OPTS, - info="Advanced features - may increase the job computation time. " - "See the Help Tip on the right or the Documentation for detailed explanation.", - - ) - with gr.Row(): - with gr.Column(): - drug_screen_email = gr.Textbox( - label='Step 7. Input Your Email Address (Optional)', - info="Your email address will be used to notify you of the status of your job. " - "If you cannot receive the email, please check your spam/junk folder." - ) - - with gr.Row(visible=True): - with gr.Row(): - drug_screen_clr_btn = gr.ClearButton(size='lg') - drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg') - - screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath') - - with gr.TabItem(label='Target Protein Identification', id='Target Protein Identification'): - gr.Markdown(''' - #
Target Protein Identification
- -
- To predict interactions or binding affinities of a single compound against a protein library. -
- ''') - with gr.Column() as identify_page: - with gr.Row(): - with gr.Column(): - HelpTip( - "Enter (paste) a compound SMILES below manually or upload a SDF file. " - "If multiple entities are in the SDF, only the first will be used. " - "SMILES can be obtained by searching for the compound of interest in databases such " - "as NCBI, PubChem and and ChEMBL." - ) - compound_type = gr.Dropdown( - label='Step 1. Select Compound Input Type and Input', - choices=['SMILES', 'SDF'], - info='Enter (paste) an SMILES string or upload an SDF file to convert to SMILES.', - value='SMILES', - interactive=True) - compound_upload_btn = gr.UploadButton( - label='OR Upload a SDF File', variant='primary', type='binary', visible=False) - - compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5) - example_drug = gr.Button(value='Example: Aspirin', elem_classes='example') - - with gr.Row(): - with gr.Column(visible=True): - HelpTip( - "By default, models trained on all protein families (general) will be applied. " - "If you upload a target library containing proteins all in the same family, " - "you may manually select a Target Family." - ) - # target_identify_target_family = gr.Dropdown( - # choices=['Family-Specific Auto-Recommendation'] + list(TARGET_FAMILY_MAP.keys()), - # value='Family-Specific Auto-Recommendation', - # label='Step 2. Select Target Family') - target_identify_target_family = gr.Dropdown( - choices=['General'], - value='General', - label='Step 2. Select Target Family') - with gr.Column(): - HelpTip( - "Interaction prediction provides you binding probability score between the target of " - "interest and each compound in the library, while affinity prediction directly " - "estimates their binding strength measured using " - "half maximal inhibitory concentration (IC50) in units of nM." - ) - target_identify_task = gr.Dropdown( - list(TASK_MAP.keys()), - label='Step 3. 
Select a Prediction Task', - value='Compound-Protein Interaction') - - with gr.Column(): - HelpTip( - "Select your preferred model, or click Recommend for the best-performing model based " - "on the selected task and whether the compound was trained. By default, General-trained " - "model is used for Target Protein Identification. " - "Please refer to the documentation for detailed benchmark results." - ) - # target_identify_preset = gr.Dropdown( - # choices=['Family-Specific Auto-Recommendation'] + list(PRESET_MAP.keys()), - # value='Family-Specific Auto-Recommendation', - # label='Step 4. Select a Preset Model') - target_identify_preset = gr.Dropdown( - choices=['DeepConvDTI', 'DrugBAN', 'HyperAttentionDTI'], - value='DrugBAN', - label='Step 4. Select a Preset Model') - identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You', - variant='primary') - with gr.Row(): - with gr.Column(): - HelpTip( - "Select a preset target library (e.g., ChEMBL33_human_proteins). " - "Alternatively, upload a CSV file with a column named X2 containing target protein " - "sequences, or use an FASTA file (Max. 10,000 targets per task). " - "Example CSV and SDF files are provided below " - "and can be downloaded by clicking the lower right corner." - ) - target_library = gr.Dropdown( - label='Step 5. Select a Preset Target Library', - choices=list(TARGET_LIBRARY_MAP.keys())) - with gr.Row(): - gr.File(label='Example FASTA target library', - value='data/examples/target_library.fasta', interactive=False) - gr.File(label='Example CSV target library', - value='data/examples/target_library.csv', interactive=False) - target_library_upload_btn = gr.UploadButton( - label='OR Upload Your Own Library', variant='primary') - target_library_upload = gr.File(label='Custom target library file', visible=False) - with gr.Column(): - HelpTip(""" -Max. 
Tanimoto Similarity between the Input Compound and Compounds in the Training Set: -this serves as an indicator of prediction applicability and reliability – -higher similarities indicates more reliable predictions (ideally > 0.85).
-Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound: -this serves as an indicator of prediction confidence for the potential targets – -higher similarities typically imply higher confidence levels.
-Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target: -this serves as an additional indicator of the confidence level in the predicted potential targets – -higher similarities usually correspond to greater prediction confidence.
-""") - target_identify_opts = gr.CheckboxGroup( - choices=TARGET_IDENTIFY_CPI_OPTS, - value=TARGET_IDENTIFY_CPI_OPTS[0], - label='Step 6. Select Advanced Options', - info="Advanced features - may increase the job computation time. " - "See the Help Tip on the right or the Documentation for detailed explanation." - ) - with gr.Row(): - with gr.Column(): - target_identify_email = gr.Textbox( - label='Step 7. Input Your Email Address (Optional)', - info="Your email address will be used to notify you of the status of your job. " - "If you cannot receive the email, please check your spam/junk folder." - ) - - with gr.Row(visible=True): - target_identify_clr_btn = gr.ClearButton(size='lg') - target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary', - size='lg') - - identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath') - - with gr.TabItem(label='Interaction Pair Inference', id='Interaction Pair Inference'): - gr.Markdown(''' - #
Interaction Pair Inference
- -
To predict interactions or binding affinities between up to - 10,000 paired compound-protein data.
- ''') - HelpTip( - "A custom interation pair dataset can be a CSV file with 2 required columns " - "(X1 for smiles and X2 for sequences) " - "and optionally 2 ID columns (ID1 for compound ID and ID2 for target ID), " - "or generated from a FASTA file containing multiple " - "sequences and a SDF file containing multiple compounds. " - "Currently, a maximum of 10,000 pairs is supported, " - "which means that the size of CSV file or " - "the product of the two library sizes should not exceed 10,000." - ) - infer_type = gr.Dropdown( - choices=['Upload a CSV file containing paired compound-protein data', - 'Upload a compound library and a target library'], - label='Step 1. Select Pair Input Type and Input', - value='Upload a CSV file containing paired compound-protein data') - with gr.Column() as pair_upload: - gr.File( - label="Example CSV dataset", - value="data/examples/interaction_pair_inference.csv", - interactive=False - ) - with gr.Row(): - infer_csv_prompt = gr.Button( - value="Upload Your Own Dataset Below", - variant='secondary') - with gr.Column(): - infer_pair = gr.File( - label='Upload CSV File Containing Paired Records', - file_count="single", - type='filepath', - visible=True - ) - with gr.Column(visible=False) as pair_generate: - with gr.Row(): - gr.File( - label='Example SDF compound library', - value='data/examples/compound_library.sdf', - interactive=False - ) - gr.File( - label='Example FASTA target library', - value='data/examples/target_library.fasta', - interactive=False - ) - with gr.Row(): - gr.File( - label='Example CSV compound library', - value='data/examples/compound_library.csv', - interactive=False - ) - gr.File( - label='Example CSV target library', - value='data/examples/target_library.csv', - interactive=False - ) - with gr.Row(): - infer_library_prompt = gr.Button( - value="Upload Your Own Libraries Below", - visible=False, - variant='secondary' - ) - with gr.Row(): - infer_drug = gr.File( - label='Upload SDF/CSV File Containing 
Multiple Compounds', - file_count="single", - type='filepath' - ) - infer_target = gr.File( - label='Upload FASTA/CSV File Containing Multiple Targets', - file_count="single", - type='filepath' - ) - - with gr.Row(): - with gr.Column(min_width=200): - HelpTip( - "By default, models trained on all protein families (general) will be applied. " - "If the proteins in the target library of interest " - "all belong to the same protein family, manually selecting the family is supported." - ) - - pair_infer_target_family = gr.Dropdown( - choices=list(TARGET_FAMILY_MAP.keys()), - value='General', - label='Step 2. Select Target Family (Optional)' - ) - - with gr.Column(min_width=200): - HelpTip( - "Interaction prediction provides you binding probability score " - "between the target of interest and each compound in the library, " - "while affinity prediction directly estimates their binding strength " - "measured using half maximal inhibitory concentration (IC50) in units of nM." - ) - pair_infer_task = gr.Dropdown( - list(TASK_MAP.keys()), - label='Step 3. Select a Prediction Task', - value='Compound-Protein Interaction' - ) - - with gr.Column(min_width=200): - HelpTip( - "Select your preferred model. Please refer to documentation for detailed benchmark results." - ) - pair_infer_preset = gr.Dropdown( - list(PRESET_MAP.keys()), - label='Step 4. Select a Preset Model' - ) - # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You', - # variant='primary') - pair_infer_opts = gr.CheckboxGroup(visible=False) - - with gr.Row(): - pair_infer_email = gr.Textbox( - label='Step 5. Input Your Email Address (Optional)', - info="Your email address will be used to notify you of the status of your job. 
" - "If you cannot receive the email, please check your spam/junk folder.") - - with gr.Row(visible=True): - pair_infer_clr_btn = gr.ClearButton(size='lg') - pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg') - - infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False) - - with gr.TabItem(label='Chemical Property Report', id='Chemical Property Report'): - gr.Markdown(''' - #
Chemical Property Report
- - To compute chemical properties for the predictions of Drug Hit Screening, - Target Protein Identification, and Interaction Pair Inference. - - You may also upload your own dataset using a CSV file containing - one required column `X1` for compound SMILES. - - The page shows only a preview report displaying at most 30 records - (with top predicted CPI/CPA if reporting results from a prediction job). - - Please first `Preview` the report, then `Generate` and download a CSV report - or an interactive HTML report below if you wish to access the full report. - ''') - raw_df = gr.State(value=pd.DataFrame()) - report_df = gr.State(value=pd.DataFrame()) - with gr.Row(): - with gr.Column(scale=1): - file_for_report = gr.File(interactive=True, type='filepath') - report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, - value='Compound-Protein Interaction', - label='Specify the Task Labels in the Uploaded Dataset') - with gr.Column(scale=2): - with gr.Column(): - with gr.Row(): - scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores') - filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters') - job_opts = gr.CheckboxGroup(visible=False) - - with gr.Accordion('Report Generate Options', open=True): - with gr.Row(): - csv_sep = gr.Radio(label='CSV Delimiter', - choices=['Comma', 'Tab'], value='Comma') - html_opts = gr.CheckboxGroup(label='HTML Report Options', - choices=[ - 'Exclude Molecular Graph', - 'Exclude Scaffold Graph', - 'Exclude Pharmacophore 3D' - ]) - - with gr.Row(): - report_clr_btn = gr.ClearButton(size='lg') - analyze_btn = gr.Button('Calculate Properties and Preview', variant='primary', - size='lg', interactive=False) - - with gr.Row(): - with gr.Column(scale=3): - html_report = gr.HTML() # label='Results', visible=True) - ranking_pie_chart = gr.Plot(visible=False) - - with gr.Row(): - with gr.Column(): - csv_generate = gr.Button(value='Generate CSV Report', - interactive=False, variant='primary') - 
csv_download_file = gr.File(label='Download CSV Report', visible=False) - with gr.Column(): - html_generate = gr.Button(value='Generate HTML Report', - interactive=False, variant='primary') - html_download_file = gr.File(label='Download HTML Report', visible=False) - - with gr.TabItem(label='Prediction Status Lookup', id='Prediction Status Lookup'): - gr.Markdown(''' - #
Prediction Status Lookup
- - To check the status of an in-progress or historical job using the job ID and retrieve the predictions - if the job has completed. Note that predictions are only kept for 48 hours upon job completion. - - You will be redirected to Chemical Property Report for carrying out further analysis and - generating the full report when the job is done. If the Lookup fails to respond, please wait for a - few minutes and refresh the page to try again. - ''') - with gr.Column(): - pred_lookup_id = gr.Textbox( - label='Input Your Job ID', placeholder='e.g., e9dfd149-3f5c-48a6-b797-c27d027611ac', - info="Your job ID is a UUID4 string that you receive after submitting a job on the " - "page or in the email notification.") - pred_lookup_btn = gr.Button(value='Lookup the Job Status', variant='primary', visible=True) - pred_lookup_stop_btn = gr.Button(value='Stop Tracking', variant='stop', visible=False) - pred_lookup_status = gr.Markdown() - - # retrieve_email = gr.Textbox(label='Step 2. Input Your Email Address', placeholder='e.g., - - - def target_input_type_select(input_type): - match input_type: - case 'UniProt ID': - return [gr.Dropdown(info=''), - gr.UploadButton(visible=False), - gr.Textbox(visible=True, value=''), - gr.Textbox(visible=False, value=''), - gr.Textbox(visible=False, value=''), - gr.Button(visible=True), - gr.Code(value=''), - gr.Button(visible=False)] - case 'Gene symbol': - return [gr.Dropdown(info=''), - gr.UploadButton(visible=False), - gr.Textbox(visible=False, value=''), - gr.Textbox(visible=True, value=''), - gr.Textbox(visible=True, value=''), - gr.Button(visible=True), - gr.Code(value=''), - gr.Button(visible=False)] - case 'Sequence': - return [gr.Dropdown(info='Enter (paste) a FASTA string below manually or upload a FASTA file.'), - gr.UploadButton(visible=True), - gr.Textbox(visible=False, value=''), - gr.Textbox(visible=False, value=''), - gr.Textbox(visible=False, value=''), - gr.Button(visible=False), - gr.Code(value=''), - 
gr.Button(visible=True)] - - - target_input_type.select( - fn=target_input_type_select, - inputs=target_input_type, - outputs=[ - target_input_type, target_upload_btn, - target_id, target_gene, target_organism, target_query_btn, - target_fasta, target_paste_markdown - ], - show_progress='hidden' - ) - - - def uniprot_query(input_type, uid, gene, organism='Human'): - uniprot_endpoint = 'https://rest.uniprot.org/uniprotkb/{query}' - fasta_rec = '' - - match input_type: - case 'UniProt ID': - query = f"{uid.strip()}.fasta" - case 'Gene symbol': - organism = organism if organism else 'Human' - query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta' - - try: - fasta = session.get(uniprot_endpoint.format(query=query)) - fasta.raise_for_status() - if fasta.text: - fasta_rec = next(SeqIO.parse(io.StringIO(fasta.text), format='fasta')) - fasta_rec = f">{fasta_rec.description}\n{fasta_rec.seq}" - - except Exception as e: - raise gr.Warning(f"Failed to query FASTA from UniProt database due to {str(e)}") - finally: - return fasta_rec - - - def process_fasta_upload(fasta_upload): - fasta = '' - try: - fasta = fasta_upload.decode() - except Exception as e: - gr.Warning(f"Please upload a valid FASTA file. 
Error: {str(e)}") - return fasta - - - target_upload_btn.upload( - fn=process_fasta_upload, inputs=target_upload_btn, outputs=target_fasta - ).then( - fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden' - ) - target_query_btn.click( - fn=uniprot_query, inputs=[target_input_type, target_id, target_gene, target_organism], outputs=target_fasta - ).then( - fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden' - ) - - - def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)): - try: - aligner = PairwiseAligner(mode='local') - alignment_df = get_fasta_family_map() - - processed_fasta = process_target_fasta(fasta) - - # Check for an exact match first - exact_match = alignment_df[alignment_df['X2'] == processed_fasta] - if not exact_match.empty: - row = exact_match.iloc[0] - family = str(row['Target Family']).title() - return gr.Dropdown( - value=family, - info=f"Reason: Exact match found with {row['ID2']} from family {family}") - - # If no exact match, then calculate alignment score - def align_score(query): - alignment = aligner.align(processed_fasta, query) - return alignment.score / max(len(processed_fasta), len(query)) - - alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score) - row = alignment_df.loc[alignment_df['score'].idxmax()] - family = str(row['Target Family']).title() - return gr.Dropdown(value=family, - info=f"Reason: Best sequence identity ({row['score']}) " - f"with {row['ID2']} from family {family}") - except Exception as e: - gr.Warning("Failed to detect the protein family due to error: " + str(e)) - - - target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family) - - # target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden') - target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden') - - drug_library_upload_btn.upload(fn=lambda x: [ - 
x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name]) - ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library]) - - drug_screen_task.select( - fn=lambda task, opts: gr.CheckboxGroup(choices=DRUG_SCRENN_CPA_OPTS) - if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup( - choices=DRUG_SCRENN_CPI_OPTS, value=DRUG_SCRENN_CPI_OPTS[0]), - inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts, - show_progress='hidden' - ) - - target_identify_task.select( - fn=lambda task, opts: gr.CheckboxGroup(choices=TARGET_IDENTIFY_CPA_OPTS) - if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup( - choices=TARGET_IDENTIFY_CPI_OPTS, value=TARGET_IDENTIFY_CPI_OPTS[0]), - inputs=[target_identify_task, target_identify_opts], outputs=target_identify_opts, - show_progress='hidden' - ) - - def example_fill(input_type): - return {target_id: 'Q16539', - target_gene: 'MAPK14', - target_organism: 'Human', - target_fasta: """ ->sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3 -MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ -SIIHAKRTYRELRLLKHMKHENVIGLLDVFTPARSLEEFNDVYLVTHLMGADLNNIVKCQ -KLTDDHVQFLIYQILRGLKYIHSADIIHRDLKPSNLAVNEDCELKILDFGLARHTDDEMT -GYVATRWYRAPEIMLNWMHYNQTVDIWSVGCIMAELLTGRTLFPGTDHIDQLKLILRLVG -TPGAELLKKISSESARNYIQSLTQMPKMNFANVFIGANPLAVDLLEKMLVLDSDKRITAA -QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES -"""} - - - example_fasta.click(fn=example_fill, inputs=target_input_type, outputs=[ - target_id, target_gene, target_organism, target_fasta], show_progress='hidden') - - - def screen_recommend_model(fasta, family, task): - task = TASK_MAP[task] - score = TASK_METRIC_MAP[task] - benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv') - - if not fasta: - gr.Warning('Please enter a valid FASTA for model recommendation.') - return [None, family] - - if family == 'General': - 
seen_targets = get_seen_fastas('General', task)['X2'].values - if process_target_fasta(fasta) in seen_targets: - scenario = "Seen Target" - else: - scenario = "Unseen Target" - filtered_df = benchmark_df[(benchmark_df['Family'] == 'All Families') - & (benchmark_df['Scenario'] == scenario) - & (benchmark_df['Type'] == 'General')] - - else: - seen_targets_general = get_seen_fastas('General', task)['X2'].values - if process_target_fasta(fasta) in seen_targets_general: - scenario_general = "Seen Target" - else: - scenario_general = "Unseen Target" - - seen_targets_family = get_seen_fastas(family, task)['X2'].values - if process_target_fasta(fasta) in seen_targets_family: - scenario_family = "Seen Target" - else: - scenario_family = "Unseen Target" - - filtered_df_general = benchmark_df[(benchmark_df['Family'] == family) - & (benchmark_df['Scenario'] == scenario_general) - & (benchmark_df['Type'] == 'General')] - filtered_df_family = benchmark_df[(benchmark_df['Family'] == family) - & (benchmark_df['Scenario'] == scenario_family) - & (benchmark_df['Type'] == 'Family')] - filtered_df = pd.concat([filtered_df_general, filtered_df_family]) - - row = filtered_df.loc[filtered_df[score].idxmax()] - if row['Scenario'] == 'Seen Target': - scenario = "Seen Target (>=0.85 sequence identity)" - elif row['Scenario'] == 'Unseen Target': - scenario = "Unseen Target (<0.85 sequence identity)" - - return {drug_screen_preset: - gr.Dropdown(value=row['Model'], - info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained " - f"model with the best {score} in the {scenario} scenario on {row['Family']}."), - drug_screen_target_family: - gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)} - - - screen_preset_recommend_btn.click( - fn=screen_recommend_model, - inputs=[target_fasta, drug_screen_target_family, drug_screen_task], - outputs=[drug_screen_preset, drug_screen_target_family], - show_progress='hidden' - ) - - - def 
compound_input_type_select(input_type): - match input_type: - case 'SMILES': - return gr.Button(visible=False) - case 'SDF': - return gr.Button(visible=True) - - - compound_type.select(fn=compound_input_type_select, - inputs=compound_type, outputs=compound_upload_btn, show_progress='hidden') - - - def compound_upload_process(input_type, input_upload): - smiles = '' - try: - match input_type: - case 'SMILES': - smiles = input_upload.decode() - case 'SDF': - suppl = Chem.ForwardSDMolSupplier(io.BytesIO(input_upload)) - smiles = Chem.MolToSmiles(next(suppl)) - except Exception as e: - gr.Warning(f"Please upload a valid {input_type} file. Error: {str(e)}") - return smiles - - - compound_upload_btn.upload(fn=compound_upload_process, - inputs=[compound_type, compound_upload_btn], - outputs=compound_smiles) - - example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=compound_smiles, show_progress='hidden') - - target_library_upload_btn.upload(fn=lambda x: [ - x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name]) - ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library]) - - - def identify_recommend_model(smiles, family, task): - task = TASK_MAP[task] - score = TASK_METRIC_MAP[task] - benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv') - - if not smiles: - gr.Warning('Please enter a valid SMILES for model recommendation.') - return None - if family == 'Family-Specific Auto-Recommendation': - return 'Family-Specific Auto-Recommendation' - - if family == 'General': - seen_compounds = pd.read_csv( - f'data/benchmarks/seen_compounds/all_families_full_{task.lower()}_random_split.csv') - family = 'All Families' - - else: - seen_compounds = pd.read_csv( - f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv') - - if rdkit_canonicalize(smiles) in seen_compounds['X1'].values: - scenario = "Seen Compound" - else: - scenario = 
"Unseen Compound" - - filtered_df = benchmark_df[(benchmark_df['Family'] == family) - & (benchmark_df['Scenario'] == scenario) - & (benchmark_df['Type'] == 'General')] - - row = filtered_df.loc[filtered_df[score].idxmax()] - - return gr.Dropdown(value=row['Model'], - info=f"Reason: {scenario} in training; choosing the model " - f"with the best {score} in the {scenario} scenario.") - - - identify_preset_recommend_btn.click(fn=identify_recommend_model, - inputs=[compound_smiles, target_identify_target_family, target_identify_task], - outputs=target_identify_preset, show_progress='hidden') - - - def infer_type_change(upload_type): - match upload_type: - case "Upload a compound library and a target library": - return { - pair_upload: gr.Column(visible=False), - pair_generate: gr.Column(visible=True), - infer_pair: None, - infer_drug: None, - infer_target: None, - infer_csv_prompt: gr.Button(visible=False), - infer_library_prompt: gr.Button(visible=True), - } - case "Upload a CSV file containing paired compound-protein data": - return { - pair_upload: gr.Column(visible=True), - pair_generate: gr.Column(visible=False), - infer_pair: None, - infer_drug: None, - infer_target: None, - infer_csv_prompt: gr.Button(visible=True), - infer_library_prompt: gr.Button(visible=False), - } - - - infer_type.select(fn=infer_type_change, inputs=infer_type, - outputs=[pair_upload, pair_generate, infer_pair, infer_drug, infer_target, - infer_csv_prompt, infer_library_prompt], - show_progress='hidden') - - - def common_input_validate(state, preset, email, request): - gr.Info('Start processing inputs...') - if not preset: - raise gr.Error('Please select a model.') - - if email: - try: - email_info = validate_email(email, check_deliverability=False) - email = email_info.normalized - except EmailNotValidError as e: - raise gr.Error(f"Invalid email address: {str(e)}.") - - if state: - raise gr.Error(f"You already have a running prediction job (ID: {state['id']}) under this session. 
" - "Please wait for it to complete before submitting another job.") - - if check := check_user_running_job(email, request): - raise gr.Error(check) - - return state, preset, email - - - def common_job_initiate(job_id, job_type, email, request, task): - gr.Info('Finished processing inputs. Initiating the prediction job... ' - 'You will be redirected to Prediction Status Lookup once the job has been submitted.') - job_info = {'id': job_id, - 'type': job_type, - 'task': task, - 'status': 'RUNNING', - 'email': email, - 'ip': request.headers.get('x-forwarded-for', request.client.host), - 'cookies': dict(request.cookies), - 'start_time': time(), - 'end_time': None, - 'expiry_time': None, - 'error': None} - # db.insert(job_info) - return job_info - - - def drug_screen_validate(fasta, library, library_upload, preset, task, email, state, - request: gr.Request, progress=gr.Progress(track_tqdm=True)): - state, preset, email = common_input_validate(state, preset, email, request) - - fasta = process_target_fasta(fasta) - err = validate_seq_str(fasta, FASTA_PAT) - if err: - raise gr.Error(f'Found error(s) in your Target FASTA input: {err}') - if not library: - raise gr.Error('Please select or upload a compound library.') - if library in DRUG_LIBRARY_MAP.keys(): - screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library])) - else: - screen_df = process_drug_library_upload(library_upload) - if len(screen_df) >= DATASET_MAX_LEN: - raise gr.Error(f'The uploaded compound library has more records ' - f'than the allowed maximum {DATASET_MAX_LEN}.') - - screen_df['X2'] = fasta - - job_id = str(uuid4()) - temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve() - screen_df.to_csv(temp_file, index=False, na_rep='') - if temp_file.is_file(): - job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task) - return {screen_data_for_predict: str(temp_file), - run_state: job_info} - else: - raise gr.Error('System failed to create temporary 
files. Please try again later.') - - - def target_identify_validate(smiles, library, library_upload, preset, task, email, state, - request: gr.Request, progress=gr.Progress(track_tqdm=True)): - state, preset, email = common_input_validate(state, preset, email, request) - - smiles = smiles.strip() - err = validate_seq_str(smiles, SMILES_PAT) - if err: - raise gr.Error(f'Found error(s) in your Compound SMILES input: {err}') - if not library: - raise gr.Error('Please select or upload a target library.') - if library in TARGET_LIBRARY_MAP.keys(): - identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library])) - else: - identify_df = process_target_library_upload(library_upload) - if len(identify_df) >= DATASET_MAX_LEN: - raise gr.Error(f'The uploaded target library has more records ' - f'than the allowed maximum {DATASET_MAX_LEN}.') - identify_df['X1'] = smiles - - job_id = str(uuid4()) - temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve() - identify_df.to_csv(temp_file, index=False, na_rep='') - if temp_file.is_file(): - job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task) - return {identify_data_for_predict: str(temp_file), - run_state: job_info} - else: - raise gr.Error('System failed to create temporary files. 
Please try again later.') - - - def pair_infer_validate(drug_target_pair_upload, drug_upload, target_upload, preset, task, email, state, - request: gr.Request, progress=gr.Progress(track_tqdm=True)): - state, preset, email = common_input_validate(state, preset, email, request) - - job_id = str(uuid4()) - if drug_target_pair_upload: - infer_df = pd.read_csv(drug_target_pair_upload) - validate_columns(infer_df, ['X1', 'X2']) - - infer_df['X1_ERR'] = infer_df['X1'].parallel_apply( - validate_seq_str, regex=SMILES_PAT) - if not infer_df['X1_ERR'].isna().all(): - raise ValueError( - f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}") - - infer_df['X2_ERR'] = infer_df['X2'].parallel_apply( - validate_seq_str, regex=FASTA_PAT) - if not infer_df['X2_ERR'].isna().all(): - raise ValueError( - f"Encountered invalid FASTA:\n{infer_df[~infer_df['X2_ERR'].isna()][['X2', 'X2_ERR']]}") - - temp_file = Path(drug_target_pair_upload).resolve() - - elif drug_upload and target_upload: - drug_df = process_drug_library_upload(drug_upload) - target_df = process_target_library_upload(target_upload) - - drug_df.drop_duplicates(subset=['X1'], inplace=True) - target_df.drop_duplicates(subset=['X2'], inplace=True) - - infer_df = pd.DataFrame(list(itertools.product(drug_df['X1'], target_df['X2'])), - columns=['X1', 'X2']) - infer_df = infer_df.merge(drug_df, on='X1').merge(target_df, on='X2') - - if len(infer_df) >= DATASET_MAX_LEN: - raise gr.Error(f'The uploaded/generated compound-protein pair dataset has more records ' - f'than the allowed maximum {DATASET_MAX_LEN}.') - - temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve() - infer_df.to_csv(temp_file, index=False, na_rep='') - - else: - raise gr.Error('Should upload a compound-protein pair dataset, or ' - 'upload both a compound library and a target library.') - - if temp_file.is_file(): - job_info = common_job_initiate(job_id, 'Interaction Pair Inference', email, request, task) - return 
{infer_data_for_predict: str(temp_file), - run_state: job_info} - else: - raise gr.Error('System failed to create temporary files. Please try again later.') - - - def fill_job_id(job_info): - try: - return job_info['id'] - except Exception as e: - gr.Warning(f'Failed to fetch job ID due to error: {str(e)}') - return '' - - - drug_screen_click = drug_screen_btn.click( - fn=drug_screen_validate, - inputs=[target_fasta, drug_library, drug_library_upload, drug_screen_preset, drug_screen_task, - drug_screen_email, run_state], - outputs=[screen_data_for_predict, run_state], - concurrency_limit=2, - ) - - drug_screen_lookup = drug_screen_click.success( - fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs], - ).then( - fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id] - ).then( - fn=lookup_job, - inputs=[pred_lookup_id], - outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report], - show_progress='minimal', - concurrency_limit=100, - ) - - # drug_screen_click.success( - # fn=send_email, - # inputs=[run_state] - # ) - - drug_screen_click.success( - fn=submit_predict, - inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset, - drug_screen_target_family, drug_screen_opts, run_state, ], - outputs=[run_state, ] - ) - - drug_screen_clr_btn.click( - lambda: ['General'] + [[]] + [None] * 5, - outputs=[drug_screen_target_family, drug_screen_opts, - target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email], - show_progress='hidden' - ) - - target_identify_clr_btn.click( - lambda: ['General'] + [[]] + [None] * 5, - outputs=[target_identify_target_family, target_identify_opts, - compound_smiles, target_identify_preset, target_library, target_library_upload, target_identify_email], - show_progress='hidden' - ) - - pair_infer_clr_btn.click( - lambda: ['General'] + [None] * 5, - outputs=[pair_infer_target_family, - infer_pair, infer_drug, infer_target, pair_infer_preset, 
pair_infer_email], - show_progress='hidden' - ) - - report_clr_btn.click( - lambda: [[]] * 3 + [None] * 3 + - [gr.Button(interactive=False)] * 3 + - [gr.File(visible=False, value=None)] * 2 + - [gr.Dropdown(visible=False, value=None), gr.HTML(value=''), gr.CheckboxGroup(visible=False)], - outputs=[ - scores, filters, html_opts, - file_for_report, raw_df, report_df, - csv_generate, html_generate, analyze_btn, - csv_download_file, html_download_file, - report_task, html_report, job_opts - ], - show_progress='hidden' - ) - - - def update_preset(family, preset): - if family == 'Family-Specific Auto-Recommendation': - return 'Family-Specific Auto-Recommendation' - elif preset == 'Family-Specific Auto-Recommendation': - return None - else: - return preset - - - def update_family(family, preset): - if preset == 'Family-Specific Auto-Recommendation': - return 'Family-Specific Auto-Recommendation' - elif family == 'Family-Specific Auto-Recommendation': - return None - else: - return family - - - target_identify_target_family.change( - fn=update_preset, inputs=[target_identify_target_family, target_identify_preset], - outputs=target_identify_preset, show_progress='hidden') - target_identify_preset.change( - fn=update_family, inputs=[target_identify_target_family, target_identify_preset], - outputs=target_identify_target_family, show_progress='hidden') - - target_identify_click = target_identify_btn.click( - fn=target_identify_validate, - inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task, - target_identify_email, run_state], - outputs=[identify_data_for_predict, run_state], - concurrency_limit=2, - ) - - target_identify_lookup = target_identify_click.success( - fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs], - ).then( - fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id] - ).then( - fn=lookup_job, - inputs=[pred_lookup_id], - outputs=[pred_lookup_status, pred_lookup_btn, 
pred_lookup_stop_btn, tabs, file_for_report], - show_progress='minimal', - concurrency_limit=100 - ) - - # target_identify_click.success( - # fn=send_email, - # inputs=[run_state] - # ) - - target_identify_click.success( - fn=submit_predict, - inputs=[identify_data_for_predict, target_identify_task, target_identify_preset, - target_identify_target_family, target_identify_opts, run_state, ], # , target_identify_email], - outputs=[run_state, ] - ) - - pair_infer_click = pair_infer_btn.click( - fn=pair_infer_validate, - inputs=[infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_task, - pair_infer_email, run_state], - outputs=[infer_data_for_predict, run_state], - concurrency_limit=2, - ) - - pair_infer_lookup = pair_infer_click.success( - fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs], - ).then( - fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id] - ).then( - fn=lookup_job, - inputs=[pred_lookup_id], - outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report], - show_progress='minimal', - concurrency_limit=100 - ) - - # pair_infer_click.success( - # fn=send_email, - # inputs=[run_state] - # ) - - pair_infer_click.success( - fn=submit_predict, - inputs=[infer_data_for_predict, pair_infer_task, pair_infer_preset, - pair_infer_target_family, pair_infer_opts, run_state, ], # , pair_infer_email], - outputs=[run_state, ] - ) - - pred_lookup_click = pred_lookup_btn.click( - fn=lookup_job, - inputs=[pred_lookup_id], - outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report], - show_progress='minimal', - cancels=[drug_screen_lookup, target_identify_lookup, pair_infer_lookup], - concurrency_limit=100, - ) - - pred_lookup_stop_btn.click( - fn=lambda: [gr.Button(visible=True), gr.Button(visible=False)], - outputs=[pred_lookup_btn, pred_lookup_stop_btn], - cancels=[pred_lookup_click, drug_screen_lookup, target_identify_lookup, pair_infer_lookup], - 
concurrency_limit=None, - ) - - - def inquire_task(df): - if 'Y^' in df.columns: - label = 'predicted CPI/CPA labels (`Y^`)' - return {report_task: gr.Dropdown(visible=True, - info=f'Found {label} in your uploaded dataset. ' - 'Is it compound-protein interaction or binding affinity?'), - html_report: ''} - else: - return {report_task: gr.Dropdown(visible=False)} - - report_df_change = file_for_report.change( - fn=update_df, inputs=file_for_report, outputs=[ - html_report, raw_df, report_df, analyze_btn, report_task, job_opts - ], - concurrency_limit=100, - ).success( - fn=lambda: [gr.Button(interactive=True)] * 3 + - [gr.File(visible=False, value=None)] * 2, - outputs=[ - csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file - ], - ) - - file_for_report.upload( - # fn=update_df, inputs=file_for_report, outputs=[ - # html_report, raw_df, report_df, analyze_btn, report_task, job_opts - # ], - # cancels=[report_df_change], - # concurrency_limit=100, - # ).success( - fn=inquire_task, inputs=[raw_df], - outputs=[report_task, html_report], - ) - - file_for_report.clear( - fn=lambda: [gr.Button(interactive=False)] * 3 + - [gr.File(visible=False, value=None)] * 2 + - [gr.Dropdown(visible=False, value=None), '', gr.CheckboxGroup(visible=False)], - cancels=[report_df_change], - outputs=[ - csv_generate, html_generate, analyze_btn, - csv_download_file, html_download_file, - report_task, html_report, job_opts - ] - ) - - analyze_btn.click( - fn=submit_report, inputs=[raw_df, scores, filters, job_opts, report_task], outputs=[ - html_report, report_df, csv_download_file, html_download_file] - ).success( - fn=lambda: [gr.Button(interactive=True)] * 2, - outputs=[csv_generate, html_generate], - concurrency_limit=100, - ) - - - def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)): - csv_sep_map = { - 'Comma': ',', - 'Tab': '\t', - } - y_colname = 'Y^' - if isinstance(task, str): - if task == 'Compound-Protein 
Interaction': - y_colname = 'Y_prob' - elif task == 'Compound-Protein Binding Affinity': - y_colname = 'Y_IC50' - try: - now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv" - df.rename(columns={'Y^': y_colname}).drop( - labels=['Compound', 'Scaffold'], axis=1 - ).to_csv(filename, index=False, na_rep='', sep=csv_sep_map[sep]) - - return gr.File(filename, visible=True) - except Exception as e: - gr.Warning(f"Failed to generate CSV due to error: {str(e)}") - return None - - - def create_html_report_file(df, file_report, task, opts, progress=gr.Progress(track_tqdm=True)): - try: - now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.html" - create_html_report(df, filename, task, opts) - return gr.File(filename, visible=True) - except Exception as e: - gr.Warning(f"Failed to generate HTML due to error: {str(e)}") - return None - - - # html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate]) - - csv_generate.click( - lambda: gr.File(visible=True), outputs=csv_download_file, - ).then( - fn=create_csv_report_file, inputs=[report_df, file_for_report, report_task, csv_sep], - outputs=csv_download_file, show_progress='full' - ) - html_generate.click( - lambda: gr.File(visible=True), outputs=html_download_file, - ).then( - fn=create_html_report_file, inputs=[report_df, file_for_report, report_task, html_opts], - outputs=html_download_file, show_progress='full' - ) - -if __name__ == "__main__": - pandarallel.initialize() - - hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference") - - session = requests.Session() - ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])) - session.mount('http://', ADAPTER) - session.mount('https://', ADAPTER) - - db = 
TinyDB(f'{SERVER_DATA_DIR}/db.json') - # Set all RUNNING jobs to FAILED at TinyDB initialization - Job = Query() - jobs = db.all() - for job in jobs: - if job['status'] == 'RUNNING': - db.update({'status': 'FAILED'}, Job.id == job['id']) - - scheduler = BackgroundScheduler() - scheduler.add_job(check_expiry, 'interval', hours=1, timezone=pytz.utc) - scheduler.start() - - demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False) +import glob +import smtplib +from datetime import datetime, timedelta +import itertools +import textwrap +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from email.utils import formatdate, make_msgid +from functools import cache +from math import pi +from time import sleep, time +from uuid import uuid4 + +import io +import os +from pathlib import Path +import sys + +import pytz +from Bio import SeqIO +from Bio.Align import PairwiseAligner +from email_validator import validate_email, EmailNotValidError +import gradio as gr +import hydra +import pandas as pd +from pandarallel import pandarallel +import requests +from rdkit.DataStructs import BulkTanimotoSimilarity +from requests.adapters import HTTPAdapter, Retry +from markdown import markdown +from rdkit import Chem +from rdkit.Chem import AllChem, Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen +from rdkit.Chem.Features.ShowFeats import _featColors +from rdkit.Chem.Scaffolds import MurckoScaffold +import py3Dmol + +from bokeh.models import Legend, NumberFormatter, BooleanFormatter, HTMLTemplateFormatter, LegendItem +from bokeh.palettes import Category20c_20 +from bokeh.plotting import figure +from bokeh.transform import cumsum +from bokeh.resources import INLINE +import seaborn as sns +import panel as pn + +from apscheduler.schedulers.background import BackgroundScheduler +from tinydb import TinyDB, Query + +#import swifter +from tqdm.auto import tqdm + +from deepscreen.data.dti import 
validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT +from deepscreen.predict import predict + +sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score')) +import sascorer + +DATASET_MAX_LEN = 10_240 +SERVER_DATA_DIR = os.getenv('DATA') # '/data' +DB_EXPIRY = timedelta(hours=48).total_seconds() + +CSS = """ +.help-tip { + position: absolute; + display: inline-block; + top: 16px; + right: 0px; + text-align: center; + border-radius: 40%; + /* border: 2px solid darkred; background-color: #8B0000;*/ + width: 24px; + height: 24px; + font-size: 16px; + line-height: 26px; + cursor: default; + transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1); + z-index: 100 !important; +} + +.help-tip:hover { + cursor: pointer; + /*background-color: #ccc;*/ +} + +.help-tip:before { + content: '?'; + font-weight: 700; + color: #8B0000; + z-index: 100 !important; +} + +.help-tip p { + visibility: hidden; + opacity: 0; + text-align: left; + background-color: #EFDDE3; + padding: 20px; + width: 300px; + position: absolute; + border-radius: 4px; + right: -4px; + color: #494F5A; + font-size: 13px; + line-height: normal; + transform: scale(0.7); + transform-origin: 100% 0%; + transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1); + z-index: 100; +} + +.help-tip:hover p { + cursor: default; + visibility: visible; + opacity: 1; + transform: scale(1.0); +} + +.help-tip p:before { + position: absolute; + content: ''; + width: 0; + height: 0; + border: 6px solid transparent; + border-bottom-color: #EFDDE3; + right: 10px; + top: -12px; +} + +.help-tip p:after { + width: 100%; + height: 40px; + content: ''; + position: absolute; + top: -5px; + left: 0; + z-index: 101; +} + +.upload_button { + background-color: #008000; +} + +.absolute { + position: absolute; +} + +.example { +padding: 0; +background: none; +border: none; +text-decoration: underline; +box-shadow: none; +text-align: left !important; +display: inline-block !important; +} + +footer { +visibility: hidden +} +""" + + +class 
View3DmolCell(py3Dmol.view): + def __init__(self, width=320, height=200): + divid = "3dmolviewer_UNIQUEID" + self.uniqueid = None + if isinstance(width, int): + width = '%dpx' % width + if isinstance(height, int): + height = '%dpx' % height + self.startjs = '''
+
def rgb_to_hex(rgb):
    """Convert an RGB colour with float channels to a ``#rrggbb`` string.

    Args:
        rgb: Iterable of at least three numbers, nominally in [0, 1]. Only
            the first three channels are used, so RGBA tuples are accepted.

    Returns:
        Lower-case hex colour string such as ``'#ff0000'``.

    Channels outside [0, 1] are clamped: without clamping, 1.2 would be
    rendered as three hex digits and a negative channel with a minus sign,
    both of which are invalid colour codes.
    """
    red, green, blue = (
        min(255, max(0, round(channel * 255))) for channel in tuple(rgb)[:3]
    )
    return f'#{red:02x}{green:02x}{blue:02x}'


def mol_to_pharm3d(mol, mode='html'):
    """Render a 3D view of ``mol`` with its pharmacophore features as spheres.

    Args:
        mol: RDKit molecule; hydrogens are added and a conformer is embedded
            on a copy returned by ``Chem.AddHs``.
        mode: Output format. Only ``'html'`` is implemented.

    Returns:
        The HTML produced by the viewer, or ``None`` when the mode is not
        supported, embedding fails, or any step raises.
    """
    if mode != 'html':
        # Nothing else is implemented; skip the expensive 3D embedding.
        return None

    try:
        mol = Chem.AddHs(mol)
        params = AllChem.ETKDGv3()
        params.randomSeed = 0xf00d  # for reproducibility
        if AllChem.EmbedMolecule(mol, params) < 0:
            # EmbedMolecule returns -1 when no conformer could be generated;
            # without coordinates there is nothing to draw.
            return None

        view = View3DmolCell(width=320, height=200)
        for feat in FEAT_FACTORY.GetFeaturesForMol(mol):
            pos = feat.GetPos()
            # Feature families without a predefined colour are drawn grey.
            color = _featColors.get(feat.GetFamily(), (.5, .5, .5))
            view.addSphere({
                'center': {'x': pos.x, 'y': pos.y, 'z': pos.z},
                'radius': 0.5,
                'color': rgb_to_hex(color)
            })

        view.addModel(Chem.MolToMolBlock(mol), 'sdf')
        view.setStyle({'stick': {}})
        view.zoomTo()
        return view.write_html()
    except Exception:
        # Best-effort rendering: a molecule that cannot be drawn yields no view.
        return None

{text}

', + ) + + +TASK_MAP = { + 'Compound-Protein Interaction': 'DTI', + 'Compound-Protein Binding Affinity': 'DTA', +} + +TASK_METRIC_MAP = { + 'DTI': 'AUROC', + 'DTA': 'CI', + 'Compound-Protein Interaction': 'AUROC', + 'Compound-Protein Binding Affinity': 'CI', + 'CPI': 'DTI', + 'CPA': 'DTA', +} + +PRESET_MAP = { + 'DeepDTA': 'deep_dta', + 'DeepConvDTI': 'deep_conv_dti', + 'GraphDTA': 'graph_dta', + 'MGraphDTA': 'm_graph_dta', + 'HyperAttentionDTI': 'hyper_attention_dti', + 'MolTrans': 'mol_trans', + 'TransformerCPI': 'transformer_cpi', + 'TransformerCPI2': 'transformer_cpi_2', + 'DrugBAN': 'drug_ban', + 'DrugVQA-Seq': 'drug_vqa' +} + +TARGET_FAMILY_MAP = { + 'General': 'general', + 'Kinase': 'kinase', + 'Non-Kinase Enzyme': 'non_kinase_enzyme', + 'Membrane Receptor': 'membrane_receptor', + 'Nuclear Receptor': 'nuclear_receptor', + 'Ion Channel': 'ion_channel', + 'Others': 'others', + # 'general': 'general', + # 'kinase': 'kinase', + # 'non-kinase enzyme': 'non_kinase_enzyme', + # 'membrane receptor': 'membrane_receptor', + # 'nuclear Receptor': 'nuclear_receptor', + # 'ion channel': 'ion_channel', + # 'others': 'others', +} + +TARGET_LIBRARY_MAP = { + 'DrugBank (Human)': 'drugbank_targets.csv', + 'ChEMBL33 (Human)': 'ChEMBL33_human_proteins.csv', +} + +DRUG_LIBRARY_MAP = { + 'DrugBank (Human)': 'drugbank_compounds.csv', + 'Drug Repurposing Hub': 'drug_repurposing_hub.csv', + 'Enamine Discovery Diversity Set (DDS-10)': 'Enamine_Discovery_Diversity_Set_10_10240cmpds_20240130.csv', + 'Enamine Phenotypic Screening Library (PSL-5760)': 'Enamine_Phenotypic_Screening_Library_plated_5760cmds_2020_07_20.csv' +} + +COLUMN_ALIASES = { + 'X1': 'Compound SMILES', + 'X2': 'Target FASTA', + 'ID1': 'Compound ID', + 'ID2': 'Target ID', + 'Y': 'Actual CPI/CPA', + 'Y^': 'Predicted CPI/CPA', +} + +DRUG_SCRENN_CPI_OPTS = [ + 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set', + 'Calculate Max. 
def remove_job_record(job_id):
    """Delete a job's database record and every data file named after it.

    Args:
        job_id: Identifier stored in the record's ``id`` field; files are
            matched with the glob ``{SERVER_DATA_DIR}/{job_id}*``.
    """
    # Build the query locally instead of relying on a module-level ``Job``.
    db.remove(Query().id == job_id)
    for file_path in glob.glob(f"{SERVER_DATA_DIR}/{job_id}*"):
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone (e.g. removed between glob and remove).
            pass


def check_expiry():
    """Purge expired finished jobs and fail jobs that have run too long.

    Finished (non-``RUNNING``) jobs are removed once their ``expiry_time``
    has passed; when no expiry is recorded, ``start_time + DB_EXPIRY`` is
    used. ``RUNNING`` jobs older than four hours are marked ``FAILED`` and,
    if the job has an email, a notification is sent.
    """
    max_running_seconds = 4 * 60 * 60  # 4 hours
    now = time()

    for job in db.all():
        if job['status'] != 'RUNNING':
            # ``expiry_time`` may be missing or None on older/unfinished records.
            expiry_time = job.get('expiry_time')
            if expiry_time is None:
                expiry_time = job['start_time'] + DB_EXPIRY
            if expiry_time < now:
                remove_job_record(job['id'])
        elif now - job['start_time'] > max_running_seconds:
            failure = {
                'status': 'FAILED',
                'error': 'Job has timed out by exceeding the maximum running time of 4 hours.',
            }
            db.update(failure, Query().id == job['id'])
            if job.get('email'):
                # ``job`` was read before the update, so merge the failure
                # fields in; otherwise the email reports the stale state.
                send_email({**job, **failure})


def smiles_to_ecfp(smiles):
    """Return the 2048-bit radius-2 Morgan fingerprint of a SMILES string.

    Returns an empty list when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return []
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)


def max_tanimoto_similarity(smi, seen_smiles_with_fp):
    """Find the most similar reference compound to ``smi``.

    Args:
        smi: Query SMILES string, or ``None``.
        seen_smiles_with_fp: DataFrame with an ``X1`` SMILES column, an
            optional ``ID1`` column, and an ``FP`` fingerprint column (only
            read when ``smi`` is not literally present in ``X1``).

    Returns:
        Dict with keys ``'Max. Tanimoto Similarity'`` and
        ``'Max. Tanimoto Similarity Compound'``. The compound is reported by
        its ``ID1`` when that is present and non-empty, otherwise by SMILES.
        Similarity 0 / compound ``None`` is returned for missing input or an
        unparsable query.
    """
    def result(similarity, compound):
        return {'Max. Tanimoto Similarity': similarity,
                'Max. Tanimoto Similarity Compound': compound}

    if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
        return result(0, None)

    has_ids = 'ID1' in seen_smiles_with_fp.columns

    def label(row):
        # Prefer the human-readable ID over the raw SMILES when available.
        if has_ids:
            id1 = row['ID1']
            if pd.notnull(id1) and id1 != '':
                return id1
        return row['X1']

    # Exact string match short-circuits fingerprinting altogether.
    exact = seen_smiles_with_fp.loc[seen_smiles_with_fp['X1'] == smi]
    if not exact.empty:
        return result(1, label(exact.iloc[0]))

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return result(0, None)

    query_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    sims = BulkTanimotoSimilarity(query_fp, seen_smiles_with_fp['FP'].values)
    # First index holding the maximum, mirroring ``argmax`` semantics.
    best = max(range(len(sims)), key=sims.__getitem__)
    return result(sims[best], label(seen_smiles_with_fp.iloc[best]))
def alignment_score(query, target):
    """Return the local alignment score normalised by the longer sequence.

    Args:
        query: First sequence.
        target: Second sequence.

    Returns:
        ``score / max(len(query), len(target))``, or ``0.0`` when either
        sequence is empty (nothing can align, and the normalisation would
        otherwise divide by zero for two empty inputs).
    """
    if len(query) == 0 or len(target) == 0:
        return 0.0
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    alignment = aligner.align(query, target)
    return alignment.score / max(len(query), len(target))


def max_sequence_identity(seq, seen_fastas):
    """Find the reference sequence most similar to ``seq``.

    Args:
        seq: Query sequence string, or ``None``.
        seen_fastas: DataFrame with an ``X2`` sequence column and an
            optional ``ID2`` column.

    Returns:
        Dict with keys ``'Max. Sequence Identity'`` and
        ``'Max. Sequence Identity Target'``. The target is reported by its
        ``ID2`` when present and non-empty, otherwise by sequence. Identity 0
        / target ``None`` is returned for missing input.
    """
    def result(identity, target):
        return {'Max. Sequence Identity': identity,
                'Max. Sequence Identity Target': target}

    if seq is None or seen_fastas is None or seen_fastas.empty:
        return result(0, None)

    def label(fasta):
        # Prefer the ID of the first row carrying this sequence.
        if 'ID2' in seen_fastas.columns:
            id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
            if pd.notnull(id2) and id2 != '':
                return id2
        return fasta

    # Exact string match short-circuits the alignments altogether.
    if seq in seen_fastas['X2'].values:
        return result(1, label(seq))

    max_iden = 0
    best_fasta = None
    # Each distinct sequence is aligned once; this replaces the per-call
    # memoisation wrapper that was created and cleared on every invocation.
    for fasta in seen_fastas['X2'].unique():
        identity = alignment_score(seq, fasta)
        if identity > max_iden:
            max_iden = identity
            best_fasta = fasta
            if max_iden == 1:
                break

    # Resolve the label once, for the winner only.
    target = label(best_fasta) if best_fasta is not None else None
    return result(max_iden, target)


def _read_seen_split(kind, family, task):
    """Load the ``{kind}`` CSV of the random split for a family and task."""
    if family == 'General':
        family = 'all_families_full'
    else:
        family = TARGET_FAMILY_MAP[family.title()]
    return pd.read_csv(
        f'data/benchmarks/{kind}/{family}_{task.lower()}_random_split.csv')


def get_seen_smiles(family, task):
    """Load the compounds listed for the given target family and task.

    Raises:
        KeyError: If ``family`` is not ``'General'`` and has no entry in
            ``TARGET_FAMILY_MAP``.
    """
    return _read_seen_split('seen_compounds', family, task)


def get_seen_fastas(family, task):
    """Load the targets listed for the given target family and task.

    Raises:
        KeyError: If ``family`` is not ``'General'`` and has no entry in
            ``TARGET_FAMILY_MAP``.
    """
    return _read_seen_split('seen_targets', family, task)


@cache
def get_fasta_family_map():
    """Load the sequence -> target family table from both target libraries.

    Rows are de-duplicated on ``X2``, keeping the first occurrence. The
    result is cached, so every caller receives the same DataFrame object;
    in-place edits by one caller are visible to all later callers.
    """
    usecols = ['X2', 'ID2', 'Target Family']
    fasta_family_map = pd.concat([
        pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv', usecols=usecols),
        pd.read_csv('data/target_libraries/idmapping_not_in_chembl.csv', usecols=usecols)
    ]).drop_duplicates(subset=['X2'], keep='first')
    return fasta_family_map


def lipinski(mol):
    """
    Lipinski's rules:
    Hydrogen bond donors <= 5
    Hydrogen bond acceptors <= 10
    Molecular weight <= 500 daltons
    logP <= 5
    """
    return (
        Lipinski.NumHDonors(mol) <= 5 and
        Lipinski.NumHAcceptors(mol) <= 10 and
        Descriptors.MolWt(mol) <= 500 and
        Crippen.MolLogP(mol) <= 5
    )


def reos(mol):
    """
    Rapid Elimination Of Swill filter:
    Molecular weight between 200 and 500
    LogP between -5.0 and +5.0
    H-bond donor count between 0 and 5
    H-bond acceptor count between 0 and 10
    Formal charge between -2 and +2
    Rotatable bond count between 0 and 8
    Heavy atom count between 15 and 50
    """
    return (
        200 <= Descriptors.MolWt(mol) <= 500 and
        -5.0 <= Crippen.MolLogP(mol) <= 5.0 and
        0 <= Lipinski.NumHDonors(mol) <= 5 and
        0 <= Lipinski.NumHAcceptors(mol) <= 10 and
        -2 <= rdmolops.GetFormalCharge(mol) <= 2 and
        0 <= rdMolDescriptors.CalcNumRotatableBonds(mol) <= 8 and
        15 <= rdMolDescriptors.CalcNumHeavyAtoms(mol) <= 50
    )


def ghose(mol):
    """
    Ghose drug like filter:
    Molecular weight between 160 and 480
    LogP between -0.4 and +5.6
    Atom count between 20 and 70
    Molar refractivity between 40 and 130
    """
    return (
        160 <= Descriptors.MolWt(mol) <= 480 and
        -0.4 <= Crippen.MolLogP(mol) <= 5.6 and
        20 <= rdMolDescriptors.CalcNumAtoms(mol) <= 70 and
        40 <= Crippen.MolMR(mol) <= 130
    )
def veber(mol):
    """
    The Veber filter is a rule of thumb filter for orally active drugs described in
    Veber et al., J Med Chem. 2002; 45(12): 2615-23.:
    Rotatable bonds <= 10
    Topological polar surface area <= 140
    """
    return (
        rdMolDescriptors.CalcNumRotatableBonds(mol) <= 10 and
        rdMolDescriptors.CalcTPSA(mol) <= 140
    )


def rule_of_three(mol):
    """
    Rule of Three filter (Congreve et al., Drug Discov. Today. 8 (19): 876–7, (2003).):
    Molecular weight <= 300
    LogP <= 3
    H-bond donor <= 3
    H-bond acceptor count <= 3
    Rotatable bond count <= 3
    """
    return (
        Descriptors.MolWt(mol) <= 300 and
        Crippen.MolLogP(mol) <= 3 and
        Lipinski.NumHDonors(mol) <= 3 and
        Lipinski.NumHAcceptors(mol) <= 3 and
        rdMolDescriptors.CalcNumRotatableBonds(mol) <= 3
    )


@cache
def load_smarts_patterns(smarts_path):
    """Load and compile the SMARTS patterns listed in a CSV file.

    Args:
        smarts_path: Path of a CSV file with a ``smarts`` column.

    Returns:
        Tuple of compiled pattern molecules. Patterns RDKit cannot parse
        (``MolFromSmarts`` returns ``None``) are dropped here once instead of
        being skipped on every molecule. A tuple is returned because the
        result is cached and shared between all callers, so it must not be
        mutable.
    """
    smarts_df = pd.read_csv(Path(smarts_path))
    compiled = (Chem.MolFromSmarts(smarts) for smarts in smarts_df['smarts'])
    return tuple(pattern for pattern in compiled if pattern is not None)


def smarts_filter(mol, smarts_mols):
    """Return ``True`` when ``mol`` matches none of the given patterns.

    ``None`` entries in ``smarts_mols`` are ignored.
    """
    return not any(
        pattern is not None and mol.HasSubstructMatch(pattern)
        for pattern in smarts_mols
    )


def _passes_smarts_catalog(mol, catalog):
    """Check ``mol`` against the pattern file ``data/filters/{catalog}.csv``."""
    return smarts_filter(mol, load_smarts_patterns(f"data/filters/{catalog}.csv"))


def pains(mol):
    """Return ``True`` when ``mol`` matches no pattern in ``data/filters/pains.csv``."""
    return _passes_smarts_catalog(mol, "pains")


def mlsmr(mol):
    """Return ``True`` when ``mol`` matches no pattern in ``data/filters/mlsmr.csv``."""
    return _passes_smarts_catalog(mol, "mlsmr")


def dundee(mol):
    """Return ``True`` when ``mol`` matches no pattern in ``data/filters/dundee.csv``."""
    return _passes_smarts_catalog(mol, "dundee")


def glaxo(mol):
    """Return ``True`` when ``mol`` matches no pattern in ``data/filters/glaxo.csv``."""
    return _passes_smarts_catalog(mol, "glaxo")
def bms(mol):
    """Return ``True`` when ``mol`` matches no pattern in ``data/filters/bms.csv``."""
    smarts_mols = load_smarts_patterns("data/filters/bms.csv")
    return smarts_filter(mol, smarts_mols)


def validate_columns(df, mandatory_cols):
    """Ensure every mandatory column is present in ``df``.

    Args:
        df: DataFrame to check.
        mandatory_cols: Iterable of required column names.

    Raises:
        ValueError: Naming the columns that are actually missing (the
            message previously listed every mandatory column, present or not).
    """
    missing_cols = [col for col in mandatory_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(
            f"The following mandatory columns are missing "
            f"in the uploaded dataset: {str(missing_cols).strip('[]')}."
        )


def process_target_fasta(sequence):
    """Extract the first record's sequence from FASTA (or bare sequence) text.

    A leading ``>`` header line is dropped, the remaining lines are joined,
    and everything from the next ``>`` onwards (further records) is cut off.

    Args:
        sequence: FASTA-formatted text or a plain sequence.

    Returns:
        The sequence as a single string without line breaks.

    Raises:
        gr.Error: If ``sequence`` is empty or processing fails.
    """
    try:
        if not sequence:
            raise ValueError('Empty FASTA sequence.')
        # splitlines() handles \n, \r\n and \r alike; stripping each line
        # keeps stray carriage returns and padding out of the sequence.
        lines = [line.strip() for line in sequence.strip().splitlines()]
        if lines and lines[0].startswith(">"):
            lines = lines[1:]
        return ''.join(lines).split(">")[0].strip()
    except Exception as e:
        raise gr.Error(f'Failed to process FASTA due to error: {str(e)}')


def send_email(job_info):
    """Send a job notification email if the job has an ``email`` address.

    SMTP settings and the subject/body templates are read from the
    ``EMAIL_SERV``, ``EMAIL_PORT``, ``EMAIL_ADDR``, ``EMAIL_PASS``,
    ``EMAIL_FORM`` and ``EMAIL_SUBJ`` environment variables. The templates
    are filled with ``job_info``, whose non-empty ``*time`` fields are first
    rendered in the timezone looked up from the job's ``ip``.

    Failures are reported with a ``gr.Warning`` and never raised.
    """
    recipient = job_info.get('email')
    if not recipient:
        return

    try:
        email_info = job_info.copy()
        email_serv = os.getenv('EMAIL_SERV')
        email_port = os.getenv('EMAIL_PORT')
        email_addr = os.getenv('EMAIL_ADDR')
        email_pass = os.getenv('EMAIL_PASS')
        email_form = os.getenv('EMAIL_FORM')
        email_subj = os.getenv('EMAIL_SUBJ')

        time_keys = [key for key, value in email_info.items()
                     if key.endswith("time") and value]
        if time_keys:
            # One timezone lookup per email rather than one per time field.
            tz_name = get_timezone_by_ip(email_info['ip'])
            for key in time_keys:
                email_info[key] = ts_to_str(email_info[key], tz_name)

        body = email_form.format(**email_info)

        msg = MIMEMultipart("alternative")
        msg["From"] = email_addr
        msg["To"] = recipient
        msg["Subject"] = email_subj.format(**email_info)
        msg["Date"] = formatdate(localtime=True)
        msg["Message-ID"] = make_msgid()

        # In multipart/alternative the LAST part is the preferred rendering,
        # so the plain-text fallback goes first and the HTML version last.
        msg.attach(MIMEText(body, 'plain'))
        msg.attach(MIMEText(markdown(body), 'html'))

        # The context manager closes the connection even if login/send fails.
        # NOTE(review): STARTTLS is not enabled, so credentials travel as the
        # server's plain connection allows — confirm this is intended.
        with smtplib.SMTP(email_serv, int(email_port)) as server:
            server.login(email_addr, email_pass)
            server.sendmail(email_addr, recipient, msg.as_string())

        gr.Info('Email notification sent.')
    except Exception as e:
        gr.Warning('Failed to send email notification due to error: ' + str(e))
def check_user_running_job(email, request):
    """Report whether the requester already has a ``RUNNING`` job.

    The requester is identified by exactly one criterion, in this order of
    precedence: the given email, else the request cookies, else the client
    IP address.

    Args:
        email: Email address entered by the user (may be empty).
        request: Request object exposing ``cookies`` and ``client.host``.

    Returns:
        A user-facing message naming the running job, or ``False`` when no
        running job was found.

    Raises:
        gr.Error: If the database lookup fails.
    """
    message = ("You already have a running prediction job (ID: {id}) under this {reason}. "
               "Please wait for it to complete before submitting another job.")
    try:
        Job = Query()
        running = Job.status == "RUNNING"

        # check if a job is running for the email
        if email:
            job = db.search((Job.email == email) & running)
            if job:
                return message.format(id=job[0]['id'], reason="email")
        # check if a job is running for the session
        elif request.cookies:
            for key, value in request.cookies.items():
                job = db.search((Job.cookies[key] == value) & running)
                if job:
                    return message.format(id=job[0]['id'], reason="session")
        # check if a job is running for the IP
        else:
            host = request.client.host
            # Other code in this file reads the address from the lower-case
            # ``ip`` field of a job record; the legacy ``IP`` spelling is
            # still accepted so that no existing record is missed.
            job = db.search(((Job.ip == host) | (Job.IP == host)) & running)
            if job:
                return message.format(id=job[0]['id'], reason="IP")

        return False
    except Exception as e:
        raise gr.Error(f'Failed to validate user running jobs due to error: {str(e)}')


def get_timezone_by_ip(ip):
    """Look up the timezone name for an IP address via worldtimeapi.org.

    Returns:
        The ``timezone`` field of the API response, or ``'UTC'`` when the
        request fails, times out, or the response lacks that field.
    """
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the caller indefinitely.
        data = session.get(f'https://worldtimeapi.org/api/ip/{ip}', timeout=5).json()
        return data['timezone']
    except Exception:
        return 'UTC'


def ts_to_str(timestamp, timezone):
    """Format a UNIX timestamp as local time in the named timezone.

    Args:
        timestamp: Seconds since the epoch.
        timezone: Timezone name understood by ``pytz``; unknown names fall
            back to UTC instead of raising.

    Returns:
        String formatted as ``'%Y-%m-%d %H:%M:%S (%Z%z)'``.
    """
    try:
        target_timezone = pytz.timezone(timezone)
    except pytz.UnknownTimeZoneError:
        target_timezone = pytz.utc

    # Build an aware UTC datetime first, then convert to the target zone.
    localized_dt = datetime.fromtimestamp(timestamp, pytz.utc).astimezone(target_timezone)
    return localized_dt.strftime('%Y-%m-%d %H:%M:%S (%Z%z)')
def lookup_job(job_id):
    """Poll the job database every five seconds and stream UI updates.

    This is a generator: each ``yield`` is a dict mapping UI components to
    their new values. Polling stops when the job is ``COMPLETED`` or
    ``FAILED``, or when the job ID is still unknown after several retries.
    Any other status keeps polling without emitting an update.

    Args:
        job_id: Identifier stored in the record's ``id`` field.

    Raises:
        gr.Error: If querying or formatting the job record fails.
    """
    gr.Info('Start querying the job database...')
    stop = False
    retry = 0
    while not stop:
        try:
            sleep(5)
            jobs = db.search(Query().id == job_id)
            if jobs:
                job = jobs[0]
                job_status = job['status']
                job_type = job['type']
                error = job.get('error')
                # One timezone lookup per poll instead of one per timestamp.
                tz_name = get_timezone_by_ip(job['ip'])
                start_time = ts_to_str(job['start_time'], tz_name)
                end_time = ts_to_str(job['end_time'], tz_name) if job.get('end_time') else None
                expiry_time = ts_to_str(job['expiry_time'], tz_name) if job.get('expiry_time') else None

                if job_status == "RUNNING":
                    yield {
                        pred_lookup_status: f'''
Your **{job_type}** job (ID: **{job_id}**) started at
**{start_time}** and is **RUNNING...**

It might take a few minutes up to a few hours depending on the prediction dataset, the model, and the queue status.
You may keep the page open and wait for job completion, or close the page and revisit later to look up the job status
using the job id. You will also receive an email notification once the job is done.
''',
                        pred_lookup_btn: gr.Button(visible=False),
                        pred_lookup_stop_btn: gr.Button(visible=True)
                    }
                elif job_status == "COMPLETED":
                    stop = True
                    msg = f"Your {job_type} job (ID: {job_id}) has been **COMPLETED**"
                    msg += f" at {end_time}" if end_time else ""
                    msg += f" and the results will expire by {expiry_time}." if expiry_time else "."
                    msg += ' Redirecting to the report page...'

                    gr.Info(msg)
                    yield {
                        pred_lookup_status: msg,
                        pred_lookup_btn: gr.Button(visible=True),
                        pred_lookup_stop_btn: gr.Button(visible=False),
                        tabs: gr.Tabs(selected='Chemical Property Report'),
                        file_for_report: job['output_file']
                    }
                elif job_status == "FAILED":
                    stop = True
                    msg = f'Your {job_type} job (ID: {job_id}) has **FAILED**'
                    msg += f' at {end_time}' if end_time else ''
                    # Show the reason whenever one was recorded; this used to
                    # be gated on ``expiry_time``, which hid the error text.
                    msg += f' due to error: {error}.' if error else '.'
                    gr.Info(msg)
                    yield {
                        pred_lookup_status: msg,
                        pred_lookup_btn: gr.Button(visible=True),
                        pred_lookup_stop_btn: gr.Button(visible=False),
                        tabs: gr.Tabs(selected='Prediction Status Lookup'),
                    }
            else:
                # The record may not have been written yet; retry a few times.
                stop = (retry > 3)
                if not stop:
                    msg = f'Job ID {job_id} not found. Retrying... ({retry})'
                else:
                    msg = f'Job ID {job_id} not found after {retry} retries. Please check the job ID and try again.'
                gr.Info(msg)
                retry += 1
                yield {
                    pred_lookup_status: msg,
                    pred_lookup_btn: gr.Button(visible=True),
                    pred_lookup_stop_btn: gr.Button(visible=False),
                    tabs: gr.Tabs(selected='Prediction Status Lookup'),
                }

        except Exception as e:
            raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
def apply_advanced_opts(prediction_df, opts, df_training):
    """Append the similarity/identity columns requested in ``opts``.

    The first three options belong to drug hit screening and read the input
    target from the first row's ``X2``; the last three belong to target
    protein identification and read the input compound from the first row's
    ``X1``.

    Args:
        prediction_df: Predictions with ``X1`` and ``X2`` columns; modified
            in place and returned.
        opts: Collection of the selected option strings.
        df_training: Reference data with ``X1``, ``X2`` and ``Y`` columns. An
            ``FP`` column may be added to it in place by the training-set
            Tanimoto option.

    Returns:
        ``prediction_df`` with the extra columns.
    """
    def set_constant_columns(columns, result):
        # ``result`` is a two-entry dict (value, label). Each entry is
        # broadcast to its own column explicitly; assigning a ``pd.Series``
        # built from the dict to a list of columns relies on positional
        # ``Series[int]`` access, which pandas has deprecated.
        for column, value in zip(columns, result.values()):
            prediction_df[column] = value

    # Advanced options for Drug Hit Screening
    if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
        x2 = prediction_df['X2'].iloc[0]

        set_constant_columns(
            ['Max. Sequence Identity to Training Targets',
             'Max. Id. Training Target'],
            max_sequence_identity(x2, df_training),
        )

    if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
        x2 = prediction_df['X2'].iloc[0]
        # Known ligands: training rows for this target labelled Y == 1.
        pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
        pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)

        @cache
        def max_sim(smiles):
            return max_tanimoto_similarity(smiles, pos_compounds_df)

        prediction_df[[
            'Max. Tanimoto Similarity to Known Ligands',
            'Max. Sim. Ligand'
        ]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)

        max_sim.cache_clear()

    if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
        x2 = prediction_df['X2'].iloc[0]
        # Temporary canonical SMILES column used to match training compounds.
        prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)

        @cache
        def max_id(compound):
            pos_targets_df = df_training.loc[df_training['X1'] == compound]
            return max_sequence_identity(x2, pos_targets_df)

        prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
                       'Max. Id. Target']] = (
            prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
        )
        prediction_df.drop(['X1^'], axis=1, inplace=True)

        max_id.cache_clear()

    # Advanced options for Target Protein Identification
    if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
        x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
        # Fingerprints are only needed when there is no exact SMILES match,
        # because max_tanimoto_similarity returns early on an exact match.
        if x1 not in df_training['X1'].values:
            df_training['FP'] = df_training['X1'].parallel_apply(smiles_to_ecfp)

        set_constant_columns(
            ['Max. Tanimoto Similarity to Training Compounds',
             'Max. Sim. Training Compound'],
            max_tanimoto_similarity(x1, df_training),
        )

    if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
        x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
        # Known targets: training rows for this compound labelled Y == 1.
        pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()

        @cache
        def max_id(fasta):
            return max_sequence_identity(fasta, pos_targets_df)

        prediction_df[[
            'Max. Sequence Identity to Known Targets of Input Compound',
            'Max. Id. Target'
        ]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)

        max_id.cache_clear()

    if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
        x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])

        @cache
        def max_sim(fasta):
            pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
            # Same shortcut as above: skip fingerprinting on an exact match.
            if x1 not in pos_targets_df['X1'].values:
                pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp)
            return max_tanimoto_similarity(x1, pos_targets_df)

        prediction_df[[
            'Max. Tanimoto Similarity to Known Ligands of Identified Target',
            'Max. Sim. Ligand'
        ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)

        max_sim.cache_clear()

    return prediction_df
Ligand' + ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series) + + max_sim.cache_clear() + + return prediction_df + + +def submit_predict(predict_filepath, task, preset, target_family, opts, job_info): + job_id = job_info['id'] + status = job_info['status'] + send_email(job_info) + db.insert(job_info) + error = None + task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'} + predictions_file = None + df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv') + df_training['X1^'] = df_training['X1'] + orig_df = pd.read_csv(predict_filepath) + alignment_df = get_fasta_family_map() + prediction_df = pd.DataFrame() + + @cache + def detect_family(query): + # Check for an exact match first + exact_match = alignment_df[alignment_df['X2'] == query] + if not exact_match.empty: + row = exact_match.iloc[0] + return row['Target Family'] + # If no exact match, then calculate alignment score + else: + aligner = PairwiseAligner() + aligner.mode = 'local' + + def align_score(target): + alignment = aligner.align(query, target) + return alignment.score / max(len(query), len(target)) + + alignment_df['score'] = alignment_df['X2'].apply(align_score) + row = alignment_df.loc[alignment_df['score'].idxmax()] + return row['Target Family'] + + if 'Target Family' not in orig_df.columns: + orig_df['Target Family'] = None + if orig_df['Target Family'].isna().any(): + if orig_df['X2'].nunique() > 1: + orig_df = orig_df.reset_index(drop=True) + orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = ( + orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family) + ) + else: + orig_df['Target Family'] = detect_family(orig_df['X2'].iloc[0]) + orig_df['Target Family'] = orig_df['Target Family'].str.capitalize() + detect_family.cache_clear() + + orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize) + + orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], 
how='left', indicator=False) + annotated_df = orig_df[~orig_df['Y'].isna()].copy() + annotated_df.rename(columns={'Y': 'Y^'}, inplace=True) + annotated_df['Source'] = 'Database' + columns_to_drop = ['X1^', 'Compound', 'Scaffold', 'Scaffold SMILES'] + columns_to_drop = [col for col in columns_to_drop if col in annotated_df.columns] + annotated_df.drop(columns_to_drop, axis=1, inplace=True) + + # Save the unannotated data + unannotated_df = orig_df[orig_df['Y'].isna()].drop(['Y'], axis=1) + if not unannotated_df.empty: + unannotated_df.to_csv(predict_filepath, index=False, na_rep='') + else: + annotated_df.to_csv(predictions_file, index=False, na_rep='') + status = "COMPLETED" + return {run_state: False} + + columns_to_drop = ['ID1', 'X1^', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^'] + columns_to_drop = [col for col in columns_to_drop if col in orig_df.columns] + orig_df.drop(columns_to_drop, axis=1, inplace=True) + + try: + if target_family != 'Family-Specific Auto-Recommendation': + target_family_value = TARGET_FAMILY_MAP[target_family.title()] + task_value = TASK_MAP[task] + preset_value = PRESET_MAP[preset] + predictions_file = (f'{SERVER_DATA_DIR}/' + f'{job_id}_{task_file_abbr[task]}_{preset}_{target_family_value}_predictions.csv') + + cfg = hydra.compose( + config_name="webserver_inference", + overrides=[f"task={task_value}", + f"preset={preset_value}", + f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family_value}.ckpt", + f"data.data_file='{str(predict_filepath)}'"]) + + predictions, _ = predict(cfg) + predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True) + predictions['Source'] = f'Predicted ({preset} {target_family})' + df_list = [prediction_df, predictions] + prediction_df = pd.concat([df for df in df_list if not df.empty]) + + else: + predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_family-recommended_predictions.csv' + task_value = TASK_MAP[task] 
+ score = TASK_METRIC_MAP[task] + benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv') + predict_df = pd.read_csv(predict_filepath) + + for family, subset in predict_df.groupby('Target Family'): + predict_subset_filepath = os.path.join( + os.path.dirname(predict_filepath), f'{job_id}_{family}_input.csv' + ) + subset.to_csv(predict_subset_filepath, index=False, na_rep='') + + seen_compounds = get_seen_smiles(family, task_value)['X1'].values + if subset['X1^'].iloc[0] in seen_compounds: + scenario = "Seen Compound" + else: + scenario = "Unseen Compound" + + filtered_df = benchmark_df[(benchmark_df['Family'] == family.title()) + & (benchmark_df['Scenario'] == scenario) + & (benchmark_df['Type'] == 'Family')] + + seen_compounds = get_seen_smiles('General', task_value)['X1'].values + if subset['X1^'].iloc[0] in seen_compounds: + scenario = "Seen Compound" + else: + scenario = "Unseen Compound" + + filtered_df = pd.concat([ + filtered_df, + benchmark_df[(benchmark_df['Family'] == family.title()) + & (benchmark_df['Scenario'] == scenario) + & (benchmark_df['Type'] == 'General')] + ]) + + row = filtered_df.loc[filtered_df[score].idxmax()] + preset_value = PRESET_MAP[row['Model']] + target_family = TARGET_FAMILY_MAP[family.title()] if row['Type'] == 'Family' else 'general' + cfg = hydra.compose( + config_name="webserver_inference", + overrides=[f"task={task_value}", + f"preset={preset_value}", + f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt", + f"data.data_file='{str(predict_subset_filepath)}'"]) + + predictions, _ = predict(cfg) + predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True) + predictions['Source'] = (f'Predicted ({row["Model"]} ' + f'{family.title() if row["Type"] == "Family" else "General"})') + df_list = [prediction_df, predictions] + prediction_df = pd.concat([df for df in df_list if not df.empty]) + + prediction_df = prediction_df.merge(orig_df, 
on=['X1', 'X2'], how='left', indicator=False) + df_list = [prediction_df, annotated_df] + prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True) + + prediction_df = apply_advanced_opts(prediction_df, opts, df_training) + + prediction_df.drop(['N', 'FP'], axis=1, errors='ignore').to_csv(predictions_file, index=False, na_rep='') + status = 'COMPLETED' + + return {run_state: False} + + except Exception as e: + gr.Warning(f"Prediction job failed due to error: {str(e)}") + status = "FAILED" + predictions_file = None + error = str(e) + return {run_state: False} + + finally: + Job = Query() + job_query = (Job.id == job_id) + + end_time = time() + expiry_time = end_time + DB_EXPIRY + + db.update({'end_time': end_time, + 'expiry_time': expiry_time, + 'status': status, + 'error': error, + 'input_file': predict_filepath, + 'output_file': predictions_file}, + job_query) + if job_info := db.search(job_query)[0]: + if job_info.get('email'): + send_email(job_info) + + +def update_df(file, progress=gr.Progress(track_tqdm=True)): + if file and Path(file).is_file(): + task = None + job = None + + if "_CPI_" in str(file): + task = 'Compound-Protein Interaction' + elif "_CPA_" in str(file): + task = 'Compound-Protein Binding Affinity' + + df = pd.read_csv(file) + + if 'N' in df.columns: + df.set_index('N', inplace=True) + + if not any(col in ['X1', 'X2'] for col in df.columns): + gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.") + return {analyze_btn: gr.Button(interactive=False)} + + if 'X1' in df.columns: + if 'Compound' not in df.columns or df['Compound'].dtype != 'object': + df['Compound'] = df['X1'].parallel_apply( + lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles))) + df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol) + df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x)) + df['Pharmacophore'] = None + if task == 
'Compound-Protein Binding Affinity': + # Convert Y^ from pIC50 (nM) to IC50 (nM) + if 'Y^' in df.columns: + df['Y^'] = 10 ** (-df['Y^']) * 1e9 + + n_compound = df['X1'].nunique() + n_protein = df['X2'].nunique() + + if n_compound == 1 and n_protein >= 2: + job = 'Target Protein Identification' + if task == 'Compound-Protein Interaction': + opts = TARGET_IDENTIFY_CPI_OPTS + elif task == 'Compound-Protein Binding Affinity': + opts = TARGET_IDENTIFY_CPA_OPTS + if n_compound >= 2 and n_protein == 1: + job = 'Drug Hit Screening' + if task == 'Compound-Protein Interaction': + opts = DRUG_SCRENN_CPI_OPTS + elif task == 'Compound-Protein Binding Affinity': + opts = DRUG_SCRENN_CPA_OPTS + + return { + html_report: create_html_report(df, file=None, task=task), + raw_df: df, + report_df: df.copy(), + analyze_btn: gr.Button(interactive=True), + report_task: task, + job_opts: gr.CheckboxGroup( + label=f'{job} Advanced Options', + choices=opts, visible=True + ) if job else gr.CheckboxGroup(visible=False), + } + else: + return {analyze_btn: gr.Button(interactive=False)} + + +def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(track_tqdm=True)): + df_html = df.copy(deep=True) + column_aliases = COLUMN_ALIASES.copy() + cols_left = list(pd.Index([ + 'ID1', 'ID2', 'Compound', 'Scaffold', 'Pharmacophore', 'X1', 'Scaffold SMILES', 'X2', 'Y^' + ]).intersection(df_html.columns)) + # cols_right = list(pd.Index(['X1', 'X2']).intersection(df_html.columns)) + # df_html = df_html[cols_left + (df_html.columns.drop(cols_left + cols_right).tolist()) + cols_right] + df_html = df_html[cols_left + df_html.columns.drop(cols_left).tolist()] + + if isinstance(task, str): + column_aliases.update({ + 'Y^': 'Interaction Probability' if task == 'Compound-Protein Interaction' + else 'Binding Affinity (IC50 [nM])' + }) + + ascending = True if column_aliases['Y^'] == 'Binding Affinity (IC50 [nM])' else False + df_html = df_html.sort_values( + [col for col in ['Y^'] if col in 
df_html.columns], ascending=ascending + ) + + if not file: + df_html = df_html.iloc[:31] + + # Remove repeated info for one-against-N tasks to save visual and physical space + job = 'Chemical Property' + unique_entity = 'Unique Entity' + unique_df = None + category = None + columns_unique = None + + if 'Exclude Pharmacophore 3D' not in opts: + df_html['Pharmacophore'] = df_html['Compound'].parallel_apply( + lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x) + + if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts: + df_html['Compound'] = df_html['Compound'].parallel_apply( + lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x) + else: + df_html.drop(['Compound'], axis=1, inplace=True) + + if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts: + df_html['Scaffold'] = df_html['Scaffold'].parallel_apply( + lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x) + else: + df_html.drop(['Scaffold'], axis=1, inplace=True) + + if 'X1' in df_html.columns and 'X2' in df_html.columns: + n_compound = df_html['X1'].nunique() + n_protein = df_html['X2'].nunique() + + if n_compound == 1 and n_protein >= 2: + unique_entity = 'Compound of Interest' + if any(col in df_html.columns for col in ['Y^', 'Y']): + job = 'Target Protein Identification' + category = 'Target Family' + columns_unique = df_html.columns.isin( + ['ID1', 'Compound', 'Scaffold', 'X1', 'Scaffold SMILES', 'Pharmacophore', + 'Max. Tanimoto Similarity to Training Compounds', 'Max. Sim. Training Compound'] + + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys()) + ) + + elif n_compound >= 2 and n_protein == 1: + unique_entity = 'Target of Interest' + if any(col in df_html.columns for col in ['Y^', 'Y']): + job = 'Drug Hit Screening' + category = 'Scaffold SMILES' + columns_unique = df_html.columns.isin( + ['X2', 'ID2', 'Max. Sequence Identity to Training Targets', 'Max. Id. 
Training Target'] + ) + + elif 'Y^' in df_html.columns: + job = 'Interaction Pair Inference' + + df_html.rename(columns=column_aliases, inplace=True) + df_html.index.name = 'Index' + if 'Target FASTA' in df_html.columns: + df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply( + lambda x: wrap_text(x) if not pd.isna(x) else x) + + num_cols = df_html.select_dtypes('number').columns + num_col_colors = sns.color_palette('husl', len(num_cols)) + bool_cols = df_html.select_dtypes(bool).columns + bool_col_colors = {True: 'lightgreen', False: 'lightpink'} + + if columns_unique is not None: + unique_df = df_html.loc[:, columns_unique].iloc[[0]].copy() + df_html = df_html.loc[:, ~columns_unique] + df_html.dropna(how='all', axis=1, inplace=True) + unique_df.dropna(how='all', axis=1, inplace=True) + + if not file: + if 'Compound ID' in df_html.columns: + df_html.drop(['Compound SMILES'], axis=1, inplace=True) + if 'Target ID' in df_html.columns: + df_html.drop(['Target FASTA'], axis=1, inplace=True) + if 'Target FASTA' in df_html.columns: + df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply( + lambda x: wrap_text(x) if not pd.isna(x) else x) + if 'Scaffold SMILES' in df_html.columns: + df_html.drop(['Scaffold SMILES'], axis=1, inplace=True) + + # FIXME: Temporarily drop pharmacophore column before an image solution is found + if 'Pharmacophore' in df_html.columns: + df_html.drop(['Pharmacophore'], axis=1, inplace=True) + if unique_df is not None and 'Pharmacophore' in unique_df.columns: + unique_df.drop(['Pharmacophore'], axis=1, inplace=True) + + styled_df = df_html.fillna('').style.format(precision=3) + + for i, col in enumerate(num_cols): + cmap = sns.light_palette(num_col_colors[i], as_cmap=True) + if col in df_html.columns: + if col not in ['Binding Affinity (IC50 [nM])']: + cmap.set_bad('white') + styled_df = styled_df.background_gradient( + subset=[col], cmap=cmap) + else: + cmap = cmap.reversed() + cmap.set_bad('white') + styled_df = 
styled_df.background_gradient( + subset=[col], cmap=cmap) + + if any(df_html.columns.isin(bool_cols)): + styled_df.applymap(lambda val: f'background-color: {bool_col_colors[val]}', subset=bool_cols) + + table_html = styled_df.to_html() + unique_html = '' + if unique_df is not None: + if 'Target FASTA' in unique_df.columns: + unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '
') + + if 'Max. Sequence Identity to Training Targets' in unique_df.columns: + # Add alert emoji for sequence identity below 0.85 + if unique_df['Max. Sequence Identity to Training Targets'].iloc[0] < 0.85: + unique_df['Max. Sequence Identity to Training Targets'] = ( + unique_df['Max. Sequence Identity to Training Targets'].apply( + lambda x: f'{x:.3f}' + f' ⚠️Lower than recommended (0.85)' + f' - predictive reliability may be compromised' + ) + ) + + if 'Max. Tanimoto Similarity to Training Compounds' in unique_df.columns: + # Add alert emoji for Tanimoto similarity below 0.85 + if unique_df['Max. Tanimoto Similarity to Training Compounds'].iloc[0] < 0.85: + unique_df['Max. Tanimoto Similarity to Training Compounds'] = ( + unique_df['Max. Tanimoto Similarity to Training Compounds'].apply( + lambda x: f'{x:.3f}' + f' ⚠️Lower than recommended (0.85)' + f' - predictive reliability may be compromised' + ) + ) + + if any(unique_df.columns.isin(bool_cols)): + unique_df = unique_df.style.applymap( + lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols) + unique_html = (f'
' + f'{unique_df.to_html(escape=False, index=False)}
') + + return (f'
{job} Report Preview (Top 30 Records)
' + f'
{unique_html}
' + f'
{table_html}
') + + else: + image_zoom_formatter = HTMLTemplateFormatter(template='
<%= value %>
') + uniprot_id_formatter = HTMLTemplateFormatter( + template='<% if (value == value) { ' # Check if value is not NaN + 'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) ' + # Check if value is a valid UniProt ID + '{ %><%= value %><% ' + # Else treat it as a sequence or other plain-text string, line-warping every 60 characters + '} else { %>
<%= value.match(/.{1,60}/g).join("
") ' + '%>
<% } %><% } else { %><% } %>' # Output empty string if value is NaN + ) + pubchem_id_formatter = HTMLTemplateFormatter( + template='<% if (value == value) { ' # Check if value is not NaN + '%><%= value %>' + '<% } else { %><% } %>' # Output empty string if value is NaN + ) + alert_emoji_formatter = HTMLTemplateFormatter( + template='<% if (value < 0.85) { ' + '%><%= value %> ' + '⚠️Lower than recommended (0.85) - predictive reliability may be compromised<% ' + '} else { %><%= value %><% } %>' + ) + bool_formatters = {col: BooleanFormatter() for col in bool_cols} + float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns} + other_formatters = { + 'Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True}, + 'Compound': image_zoom_formatter, + 'Scaffold': image_zoom_formatter, + 'Pharmacophore': {'type': 'executeScriptFormatter'}, + 'Target FASTA': {'type': 'textarea', 'width': 60}, + 'Target ID': uniprot_id_formatter, + 'Compound ID': pubchem_id_formatter, + 'Max. Sim. Ligand': pubchem_id_formatter, + 'Max. Id. Target': uniprot_id_formatter, + 'Max. Sim. Training Compound': pubchem_id_formatter, + 'Max. Id. Training Target': uniprot_id_formatter, + 'Max. Sequence Identity to Training Targets': alert_emoji_formatter, + 'Max. 
Sequence Identity to Known Targets of Hit Compound': alert_emoji_formatter, + } + formatters = {**bool_formatters, **float_formatters, **other_formatters} + + # html = df.to_html(file) + # return html + + report_table = pn.widgets.Tabulator( + df_html, formatters=formatters, + frozen_columns=[ + 'Index', 'Target ID', 'Compound ID', 'Compound' + ], + disabled=True, sizing_mode='stretch_both', pagination='local', page_size=10 + ) + + for i, col in enumerate(num_cols): + cmap = sns.light_palette(num_col_colors[i], as_cmap=True) + if col not in ['Binding Affinity (IC50 [nM])']: + if col not in ['Interaction Probability']: + cmap.set_bad(color='white') + report_table.style.background_gradient( + subset=df_html.columns == col, cmap=cmap) + else: + continue + else: + cmap = cmap.reversed() + cmap.set_bad(color='white') + report_table.style.background_gradient( + subset=df_html.columns == col, cmap=cmap) + + pie_charts = {} + for y in df_html.columns.intersection(['Interaction Probability', 'Binding Affinity (IC50 [nM])']): + pie_charts[y] = [] + for k in [10, 30, 100]: + if k < len(df_html): + pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=k)) + pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=len(df_html))) + + # Remove keys with empty values + pie_charts = {k: v for k, v in pie_charts.items() if any(v)} + + panel_css = """ + .tabulator { + font-family: Courier New !important; + font-weight: normal !important; + font-size: 12px !important; + } + + .tabulator-cell { + overflow: visible !important; + align-content: center !important; + } + + .tabulator-cell:hover { + z-index: 1000 !important; + } + + .image-zoom-viewer { + display: inline-block; + overflow: visible; + z-index: 1000; + } + + .image-zoom-viewer::after { + content: ""; + top: 0; + left: 0; + width: 100%; + height: 100%; + pointer-events: none; + } + + .image-zoom-viewer:hover::after { + pointer-events: all; + } + + /* When hovering over the 
container, scale its child (the SVG) */ + .tabulator-cell:hover .image-zoom-viewer svg { + padding: 3px; + position: absolute; + background-color: rgba(250, 250, 250, 0.854); + box-shadow: 0 0 10px rgba(0, 0, 0, 0.618); + border-radius: 3px; + transform: scale(3); /* Scale up the SVG */ + transition: transform 0.3s ease; + pointer-events: none; /* Prevents the SVG from blocking mouse interactions */ + z-index: 1000; + } + """ + + pn.extension( + raw_css=[panel_css], + js_files={'panel_custom': 'static/panel.js', '3Dmol': 'static/3Dmol-min.js'}, + # js_modules={'3Dmol': 'static/3Dmol-min.js'}, + inline=True, + ) + + template = pn.template.VanillaTemplate( + title=f'DeepSEQreen {job} Report', + sidebar=[], + favicon='deepseqreen.ico', + logo='deepseqreen.svg', + header_background='#F3F5F7', + header_color='#4372c4', + busy_indicator=None, + ) + + stats_pane = pn.Row() + if unique_df is not None: + unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width', + show_index=False, disabled=True, + frozen_columns=['Compound ID', 'Compound', 'Target ID']) + # if pie_charts: + # unique_table.width = 640 + stats_pane.append(pn.Column(f'### {unique_entity}', unique_table)) + if pie_charts: + for score_name, figure_list in pie_charts.items(): + stats_pane.append( + pn.Column(f'### {category} by Top {score_name}', + pn.Tabs(*figure_list, tabs_location='above')) + # pn.Card(pn.Row(v), title=f'{category} by Top {k}') + ) + + if stats_pane: + template.main.append(pn.Card(stats_pane, + sizing_mode='stretch_width', title='Summary Statistics', margin=10)) + + template.main.append( + pn.Card(report_table, title=f'{job} Results', # width=1200, + margin=10) + ) + + template.save(file, title=f'DeepSEQreen {job} Report', resources=INLINE) + return file + + +def create_pie_chart(df, category, value, top_k): + if category not in df or value not in df: + return + top_k_df = df.nlargest(top_k, value) + category_counts = top_k_df[category].value_counts() + 
data = pd.DataFrame({category: category_counts.index, 'value': category_counts.values}) + + data['proportion'] = data['value'] / data['value'].sum() + # Merge rows with proportion less than 0.2% into one row + mask = data['proportion'] < 0.002 + if any(mask): + merged_row = data[mask].sum() + merged_row[category] = '...' + data = pd.concat([data[~mask], pd.DataFrame(merged_row).T]) + data['angle'] = data['proportion'] * 2 * pi + + color_dict = {cat: color for cat, color in + zip(df[category].unique(), + (Category20c_20 * (len(df[category].unique()) // 20 + 1))[:len(df[category].unique())])} + color_dict['...'] = '#636363' + data['color'] = data[category].map(color_dict) + + tooltips = [ + (f"{category}", f"@{{{category}}}"), + ("Count", "@value"), + ("Percentage", "@proportion{0.0%}") + ] + + if category == 'Scaffold SMILES' and 'Scaffold' in df.columns: + data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left', + left_on='Scaffold SMILES', right_on='Scaffold SMILES') + tooltips.append(("Scaffold", "
@{Scaffold}{safe}
")) + p = figure(height=384, width=960, name=f"Top {top_k}" if top_k < len(df) else 'All', sizing_mode='stretch_height', + toolbar_location=None, tools="hover", tooltips=tooltips, x_range=(-0.4, 0.4)) + + def truncate_label(label, max_length=60): + return label if len(label) <= max_length else label[:max_length] + "..." + + data['legend_field'] = data[category].apply(truncate_label) + + p.add_layout(Legend(padding=0, margin=0), 'right') + p.wedge(x=0, y=1, radius=0.3, + start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'), + line_color="white", fill_color='color', legend_field='legend_field', source=data) + + # Limit the number of legend items to 20 and add "..." if there are more than 20 items + if len(p.legend.items) > 20: + new_legend_items = p.legend.items[:20] + new_legend_items.append(LegendItem(label="...")) + p.legend.items = new_legend_items + + p.legend.label_text_font_size = "10pt" + p.legend.label_text_font = "courier" + p.axis.axis_label = None + p.axis.visible = False + p.grid.grid_line_color = None + p.outline_line_width = 0 + p.min_border = 0 + p.margin = 0 + + return p + + +def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progress(track_tqdm=True)): + df_report = df.copy() + try: + for filter_name in filter_list: + df_report[filter_name] = df_report['Compound'].parallel_apply( + lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x) + + for score_name in score_list: + df_report[score_name] = df_report['Compound'].parallel_apply( + lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x) + + if opt_list: + df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv') + df_report = apply_advanced_opts(df_report, opt_list, df_training) + + return (create_html_report(df_report, file=None, task=task), df_report, + gr.File(visible=False), gr.File(visible=False)) + + except Exception as e: + gr.Warning(f'Failed to report results due to error: {str(e)}') + return None, None, 
None, None + + +def wrap_text(text, line_length=60): + if isinstance(text, str): + wrapper = textwrap.TextWrapper(width=line_length) + if text.startswith('>'): + sections = text.split('>') + wrapped_sections = [] + for section in sections: + if not section: + continue + lines = section.split('\n') + seq_header = lines[0] + wrapped_seq = wrapper.fill(''.join(lines[1:])) + wrapped_sections.append(f">{seq_header}\n{wrapped_seq}") + return '\n'.join(wrapped_sections) + else: + return wrapper.fill(text) + else: + return text + + +def unwrap_text(text): + return text.strip().replace('\n', '') + + +def drug_library_from_sdf(sdf_path): + return PandasTools.LoadSDF( + sdf_path, + smilesName='X1', molColName='Compound', includeFingerprints=True + ) + + +def process_target_library_upload(library_upload): + if library_upload.endswith('.csv'): + df = pd.read_csv(library_upload) + elif library_upload.endswith('.fasta'): + df = target_library_from_fasta(library_upload) + else: + raise gr.Error('Currently only CSV and FASTA files are supported as target libraries.') + validate_columns(df, ['X2']) + return df + + +def process_drug_library_upload(library_upload): + if library_upload.endswith('.csv'): + df = pd.read_csv(library_upload) + elif library_upload.endswith('.sdf'): + df = drug_library_from_sdf(library_upload) + else: + raise gr.Error('Currently only CSV and SDF files are supported as drug libraries.') + validate_columns(df, ['X1']) + return df + + +def target_library_from_fasta(fasta_path): + records = list(SeqIO.parse(fasta_path, "fasta")) + id2 = [record.id for record in records] + seq = [str(record.seq) for record in records] + df = pd.DataFrame({'ID2': id2, 'X2': seq}) + return df + + +theme = gr.themes.Base(spacing_size="sm", text_size='md', font=gr.themes.GoogleFont("Roboto")).set( + background_fill_primary='#eef3f9', + background_fill_secondary='white', + checkbox_label_background_fill='#eef3f9', + checkbox_label_background_fill_hover='#dfe6f0', + 
checkbox_background_color='white', + checkbox_border_color='#4372c4', + border_color_primary='#4372c4', + border_color_accent='#2e6ab5', + button_primary_background_fill='#2e6ab4', + button_primary_text_color='white', + body_text_color='#28496F', + block_background_fill='#fbfcfd', + block_title_text_color='#28496F', + block_label_text_color='#28496F', + block_info_text_color='#505358', + block_border_color=None, + # input_border_color='#4372c4', + # panel_border_color='#4372c4', + input_background_fill='#F1F2F4', +) + +with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48 * 3600)) as demo: + run_state = gr.State(value=False) + screen_flag = gr.State(value=False) + identify_flag = gr.State(value=False) + infer_flag = gr.State(value=False) + + with gr.Tabs() as tabs: + with gr.TabItem(label='Drug Hit Screening', id='Drug Hit Screening'): + gr.Markdown(''' + #
Drug Hit Screening
+ +
+ To predict interactions or binding affinities of a single target against a compound library. +
+ ''') + with gr.Row(): + with gr.Column(): + HelpTip( + "Enter (paste) a amino acid sequence below manually or upload a FASTA file. " + "If multiple entities are in the FASTA, only the first will be used. " + "Alternatively, enter a Uniprot ID or gene symbol with organism and click Query for " + "the sequence." + ) + target_input_type = gr.Dropdown( + label='Step 1. Select Target Input Type and Input', + choices=['Sequence', 'UniProt ID', 'Gene symbol'], + info='Enter (paste) a FASTA string below manually or upload a FASTA file.', + value='Sequence', + scale=4, interactive=True + ) + + with gr.Row(): + target_id = gr.Textbox(show_label=False, visible=False, + interactive=True, scale=4, + info='Enter a UniProt ID and query.') + target_gene = gr.Textbox( + show_label=False, visible=False, + interactive=True, scale=4, + info='Enter a gene symbol and query. The first record will be used.') + target_organism = gr.Textbox( + info='Organism scientific name (default: Homo sapiens).', + placeholder='Homo sapiens', show_label=False, + visible=False, interactive=True, scale=4, ) + target_upload_btn = gr.UploadButton(label='Upload a FASTA File', type='binary', + visible=True, variant='primary', + size='lg') + target_paste_markdown = gr.Button(value='OR Paste Your Sequence Below', + variant='secondary') + target_query_btn = gr.Button(value='Query the Sequence', variant='primary', + visible=False, scale=4) + # with gr.Row(): + # example_uniprot = gr.Button(value='Example: Q16539', elem_classes='example', visible=False) + # example_gene = gr.Button(value='Example: MAPK14', elem_classes='example', visible=False) + example_fasta = gr.Button(value='Example: MAPK14 (Q16539)', elem_classes='example') + target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5) + # with gr.Row(): + # with gr.Column(): + # with gr.Column(): + # gr.File(label='Example FASTA file', + # value='data/examples/MAPK14.fasta', interactive=False) + + with gr.Row(): + with 
gr.Column(min_width=200): + HelpTip( + "Click Auto-detect to identify the protein family using sequence alignment. " + "This optional step allows applying a family-specific model instead of a all-family " + "model (general). " + "Manually select general if the alignment results are unsatisfactory." + ) + drug_screen_target_family = gr.Dropdown( + choices=list(TARGET_FAMILY_MAP.keys()), + value='General', + label='Step 2. Select Target Family (Optional)', interactive=True) + target_family_detect_btn = gr.Button(value='OR Let Us Auto-Detect for You', + variant='primary') + with gr.Column(min_width=200): + HelpTip( + "Interaction prediction provides you binding probability score between the target of " + "interest and each compound in the library, " + "while affinity prediction directly estimates their binding strength measured using " + "half maximal inhibitory concentration (IC50) in units of nM." + ) + drug_screen_task = gr.Dropdown( + list(TASK_MAP.keys()), + label='Step 3. Select a Prediction Task', + value='Compound-Protein Interaction') + with gr.Column(min_width=200): + HelpTip( + "Select your preferred model, or click Recommend for the best-performing model based " + "on the selected task, family, and whether the target was trained. " + "Please refer to documentation for detailed benchmark results." + ) + drug_screen_preset = gr.Dropdown( + list(PRESET_MAP.keys()), + label='Step 4. Select a Preset Model') + screen_preset_recommend_btn = gr.Button( + value='OR Let Us Recommend for You', variant='primary') + + with gr.Row(): + with gr.Column(): + HelpTip( + "Select a preset compound library (e.g., DrugBank). " + "Alternatively, upload a CSV file with a column named X1 containing compound SMILES, " + "or use an SDF file (Max. 10,000 compounds per task). Example CSV and SDF files are " + "provided below and can be downloaded by clicking the lower right corner." + ) + drug_library = gr.Dropdown( + label='Step 5. 
Select a Preset Compound Library', + choices=list(DRUG_LIBRARY_MAP.keys())) + with gr.Row(): + gr.File(label='Example SDF compound library', + value='data/examples/compound_library.sdf', interactive=False) + gr.File(label='Example CSV compound library', + value='data/examples/compound_library.csv', interactive=False) + drug_library_upload_btn = gr.UploadButton( + label='OR Upload Your Own Library', variant='primary') + drug_library_upload = gr.File(label='Custom compound library file', visible=False) + + with gr.Column(): + HelpTip(""" +Max. Sequence Identity between the Input Target and Targets in the Training Set: +this serves as an indicator of the prediction applicability/reliability – +higher similarities indicate more reliable predictions (preferably > 0.85).
+Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target: +this serves as an indicator of both the confidence level and novelty of the predicted hit compounds – +higher similarities suggest greater confidence, while lower Tanimoto similarities may indicate the novelty +of the identified hit compounds compared to known drugs or true interacting compounds of the input target.
+Max. Sequence Identity between the Input Target and Known Targets of Hit Compound: +this serves as an additional indicator of the confidence level of the predicted hit compounds – +higher identities usually lead to greater confidence in the predictions.
+""") + drug_screen_opts = gr.CheckboxGroup( + label="Step 6. Select Advanced Options", + value=DRUG_SCRENN_CPI_OPTS[0], + choices=DRUG_SCRENN_CPI_OPTS, + info="Advanced features - may increase the job computation time. " + "See the Help Tip on the right or the Documentation for detailed explanation.", + + ) + with gr.Row(): + with gr.Column(): + drug_screen_email = gr.Textbox( + label='Step 7. Input Your Email Address (Optional)', + info="Your email address will be used to notify you of the status of your job. " + "If you cannot receive the email, please check your spam/junk folder." + ) + + with gr.Row(visible=True): + with gr.Row(): + drug_screen_clr_btn = gr.ClearButton(size='lg') + drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg') + + screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath') + + with gr.TabItem(label='Target Protein Identification', id='Target Protein Identification'): + gr.Markdown(''' + #
Target Protein Identification
+ +
+ To predict interactions or binding affinities of a single compound against a protein library. +
+ ''') + with gr.Column() as identify_page: + with gr.Row(): + with gr.Column(): + HelpTip( + "Enter (paste) a compound SMILES below manually or upload a SDF file. " + "If multiple entities are in the SDF, only the first will be used. " + "SMILES can be obtained by searching for the compound of interest in databases such " + "as NCBI, PubChem and ChEMBL." + ) + compound_type = gr.Dropdown( + label='Step 1. Select Compound Input Type and Input', + choices=['SMILES', 'SDF'], + info='Enter (paste) an SMILES string or upload an SDF file to convert to SMILES.', + value='SMILES', + interactive=True) + compound_upload_btn = gr.UploadButton( + label='OR Upload a SDF File', variant='primary', type='binary', visible=False) + + compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5) + example_drug = gr.Button(value='Example: Aspirin', elem_classes='example') + + with gr.Row(): + with gr.Column(visible=True): + HelpTip( + "By default, models trained on all protein families (general) will be applied. " + "If you upload a target library containing proteins all in the same family, " + "you may manually select a Target Family." + ) + # target_identify_target_family = gr.Dropdown( + # choices=['Family-Specific Auto-Recommendation'] + list(TARGET_FAMILY_MAP.keys()), + # value='Family-Specific Auto-Recommendation', + # label='Step 2. Select Target Family') + target_identify_target_family = gr.Dropdown( + choices=['General'], + value='General', + label='Step 2. Select Target Family') + with gr.Column(): + HelpTip( + "Interaction prediction provides you binding probability score between the target of " + "interest and each compound in the library, while affinity prediction directly " + "estimates their binding strength measured using " + "half maximal inhibitory concentration (IC50) in units of nM." + ) + target_identify_task = gr.Dropdown( + list(TASK_MAP.keys()), + label='Step 3. 
Select a Prediction Task', + value='Compound-Protein Interaction') + + with gr.Column(): + HelpTip( + "Select your preferred model, or click Recommend for the best-performing model based " + "on the selected task and whether the compound was trained. By default, General-trained " + "model is used for Target Protein Identification. " + "Please refer to the documentation for detailed benchmark results." + ) + # target_identify_preset = gr.Dropdown( + # choices=['Family-Specific Auto-Recommendation'] + list(PRESET_MAP.keys()), + # value='Family-Specific Auto-Recommendation', + # label='Step 4. Select a Preset Model') + target_identify_preset = gr.Dropdown( + choices=['DeepConvDTI', 'DrugBAN', 'HyperAttentionDTI'], + value='DrugBAN', + label='Step 4. Select a Preset Model') + identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You', + variant='primary') + with gr.Row(): + with gr.Column(): + HelpTip( + "Select a preset target library (e.g., ChEMBL33_human_proteins). " + "Alternatively, upload a CSV file with a column named X2 containing target protein " + "sequences, or use an FASTA file (Max. 10,000 targets per task). " + "Example CSV and SDF files are provided below " + "and can be downloaded by clicking the lower right corner." + ) + target_library = gr.Dropdown( + label='Step 5. Select a Preset Target Library', + choices=list(TARGET_LIBRARY_MAP.keys())) + with gr.Row(): + gr.File(label='Example FASTA target library', + value='data/examples/target_library.fasta', interactive=False) + gr.File(label='Example CSV target library', + value='data/examples/target_library.csv', interactive=False) + target_library_upload_btn = gr.UploadButton( + label='OR Upload Your Own Library', variant='primary') + target_library_upload = gr.File(label='Custom target library file', visible=False) + with gr.Column(): + HelpTip(""" +Max. 
Tanimoto Similarity between the Input Compound and Compounds in the Training Set: +this serves as an indicator of prediction applicability and reliability – +higher similarities indicate more reliable predictions (ideally > 0.85).
+Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound: +this serves as an indicator of prediction confidence for the potential targets – +higher similarities typically imply higher confidence levels.
+Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target: +this serves as an additional indicator of the confidence level in the predicted potential targets – +higher similarities usually correspond to greater prediction confidence.
+""") + target_identify_opts = gr.CheckboxGroup( + choices=TARGET_IDENTIFY_CPI_OPTS, + value=TARGET_IDENTIFY_CPI_OPTS[0], + label='Step 6. Select Advanced Options', + info="Advanced features - may increase the job computation time. " + "See the Help Tip on the right or the Documentation for detailed explanation." + ) + with gr.Row(): + with gr.Column(): + target_identify_email = gr.Textbox( + label='Step 7. Input Your Email Address (Optional)', + info="Your email address will be used to notify you of the status of your job. " + "If you cannot receive the email, please check your spam/junk folder." + ) + + with gr.Row(visible=True): + target_identify_clr_btn = gr.ClearButton(size='lg') + target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary', + size='lg') + + identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath') + + with gr.TabItem(label='Interaction Pair Inference', id='Interaction Pair Inference'): + gr.Markdown(''' + #
Interaction Pair Inference
+ +
To predict interactions or binding affinities for up to + 10,000 compound-protein pairs.
+ ''') + HelpTip( + "A custom interation pair dataset can be a CSV file with 2 required columns " + "(X1 for smiles and X2 for sequences) " + "and optionally 2 ID columns (ID1 for compound ID and ID2 for target ID), " + "or generated from a FASTA file containing multiple " + "sequences and a SDF file containing multiple compounds. " + "Currently, a maximum of 10,000 pairs is supported, " + "which means that the size of CSV file or " + "the product of the two library sizes should not exceed 10,000." + ) + infer_type = gr.Dropdown( + choices=['Upload a CSV file containing paired compound-protein data', + 'Upload a compound library and a target library'], + label='Step 1. Select Pair Input Type and Input', + value='Upload a CSV file containing paired compound-protein data') + with gr.Column() as pair_upload: + gr.File( + label="Example CSV dataset", + value="data/examples/interaction_pair_inference.csv", + interactive=False + ) + with gr.Row(): + infer_csv_prompt = gr.Button( + value="Upload Your Own Dataset Below", + variant='secondary') + with gr.Column(): + infer_pair = gr.File( + label='Upload CSV File Containing Paired Records', + file_count="single", + type='filepath', + visible=True + ) + with gr.Column(visible=False) as pair_generate: + with gr.Row(): + gr.File( + label='Example SDF compound library', + value='data/examples/compound_library.sdf', + interactive=False + ) + gr.File( + label='Example FASTA target library', + value='data/examples/target_library.fasta', + interactive=False + ) + with gr.Row(): + gr.File( + label='Example CSV compound library', + value='data/examples/compound_library.csv', + interactive=False + ) + gr.File( + label='Example CSV target library', + value='data/examples/target_library.csv', + interactive=False + ) + with gr.Row(): + infer_library_prompt = gr.Button( + value="Upload Your Own Libraries Below", + visible=False, + variant='secondary' + ) + with gr.Row(): + infer_drug = gr.File( + label='Upload SDF/CSV File Containing 
Multiple Compounds', + file_count="single", + type='filepath' + ) + infer_target = gr.File( + label='Upload FASTA/CSV File Containing Multiple Targets', + file_count="single", + type='filepath' + ) + + with gr.Row(): + with gr.Column(min_width=200): + HelpTip( + "By default, models trained on all protein families (general) will be applied. " + "If the proteins in the target library of interest " + "all belong to the same protein family, manually selecting the family is supported." + ) + + pair_infer_target_family = gr.Dropdown( + choices=list(TARGET_FAMILY_MAP.keys()), + value='General', + label='Step 2. Select Target Family (Optional)' + ) + + with gr.Column(min_width=200): + HelpTip( + "Interaction prediction provides you binding probability score " + "between the target of interest and each compound in the library, " + "while affinity prediction directly estimates their binding strength " + "measured using half maximal inhibitory concentration (IC50) in units of nM." + ) + pair_infer_task = gr.Dropdown( + list(TASK_MAP.keys()), + label='Step 3. Select a Prediction Task', + value='Compound-Protein Interaction' + ) + + with gr.Column(min_width=200): + HelpTip( + "Select your preferred model. Please refer to documentation for detailed benchmark results." + ) + pair_infer_preset = gr.Dropdown( + list(PRESET_MAP.keys()), + label='Step 4. Select a Preset Model' + ) + # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You', + # variant='primary') + pair_infer_opts = gr.CheckboxGroup(visible=False) + + with gr.Row(): + pair_infer_email = gr.Textbox( + label='Step 5. Input Your Email Address (Optional)', + info="Your email address will be used to notify you of the status of your job. 
" + "If you cannot receive the email, please check your spam/junk folder.") + + with gr.Row(visible=True): + pair_infer_clr_btn = gr.ClearButton(size='lg') + pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg') + + infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False) + + with gr.TabItem(label='Chemical Property Report', id='Chemical Property Report'): + gr.Markdown(''' + #
Chemical Property Report
+ + To compute chemical properties for the predictions of Drug Hit Screening, + Target Protein Identification, and Interaction Pair Inference. + + You may also upload your own dataset using a CSV file containing + one required column `X1` for compound SMILES. + + The page shows only a preview report displaying at most 30 records + (with top predicted CPI/CPA if reporting results from a prediction job). + + Please first `Preview` the report, then `Generate` and download a CSV report + or an interactive HTML report below if you wish to access the full report. + ''') + raw_df = gr.State(value=pd.DataFrame()) + report_df = gr.State(value=pd.DataFrame()) + with gr.Row(): + with gr.Column(scale=1): + file_for_report = gr.File(interactive=True, type='filepath') + report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, + value='Compound-Protein Interaction', + label='Specify the Task Labels in the Uploaded Dataset') + with gr.Column(scale=2): + with gr.Column(): + with gr.Row(): + scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores') + filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters') + job_opts = gr.CheckboxGroup(visible=False) + + with gr.Accordion('Report Generate Options', open=True): + with gr.Row(): + csv_sep = gr.Radio(label='CSV Delimiter', + choices=['Comma', 'Tab'], value='Comma') + html_opts = gr.CheckboxGroup(label='HTML Report Options', + choices=[ + 'Exclude Molecular Graph', + 'Exclude Scaffold Graph', + 'Exclude Pharmacophore 3D' + ]) + + with gr.Row(): + report_clr_btn = gr.ClearButton(size='lg') + analyze_btn = gr.Button('Calculate Properties and Preview', variant='primary', + size='lg', interactive=False) + + with gr.Row(): + with gr.Column(scale=3): + html_report = gr.HTML() # label='Results', visible=True) + ranking_pie_chart = gr.Plot(visible=False) + + with gr.Row(): + with gr.Column(): + csv_generate = gr.Button(value='Generate CSV Report', + interactive=False, variant='primary') + 
csv_download_file = gr.File(label='Download CSV Report', visible=False) + with gr.Column(): + html_generate = gr.Button(value='Generate HTML Report', + interactive=False, variant='primary') + html_download_file = gr.File(label='Download HTML Report', visible=False) + + with gr.TabItem(label='Prediction Status Lookup', id='Prediction Status Lookup'): + gr.Markdown(''' + #
Prediction Status Lookup
+ + To check the status of an in-progress or historical job using the job ID and retrieve the predictions + if the job has completed. Note that predictions are only kept for 48 hours upon job completion. + + You will be redirected to Chemical Property Report for carrying out further analysis and + generating the full report when the job is done. If the Lookup fails to respond, please wait for a + few minutes and refresh the page to try again. + ''') + with gr.Column(): + pred_lookup_id = gr.Textbox( + label='Input Your Job ID', placeholder='e.g., e9dfd149-3f5c-48a6-b797-c27d027611ac', + info="Your job ID is a UUID4 string that you receive after submitting a job on the " + "page or in the email notification.") + pred_lookup_btn = gr.Button(value='Lookup the Job Status', variant='primary', visible=True) + pred_lookup_stop_btn = gr.Button(value='Stop Tracking', variant='stop', visible=False) + pred_lookup_status = gr.Markdown() + + # retrieve_email = gr.Textbox(label='Step 2. Input Your Email Address', placeholder='e.g., + + + def target_input_type_select(input_type): + match input_type: + case 'UniProt ID': + return [gr.Dropdown(info=''), + gr.UploadButton(visible=False), + gr.Textbox(visible=True, value=''), + gr.Textbox(visible=False, value=''), + gr.Textbox(visible=False, value=''), + gr.Button(visible=True), + gr.Code(value=''), + gr.Button(visible=False)] + case 'Gene symbol': + return [gr.Dropdown(info=''), + gr.UploadButton(visible=False), + gr.Textbox(visible=False, value=''), + gr.Textbox(visible=True, value=''), + gr.Textbox(visible=True, value=''), + gr.Button(visible=True), + gr.Code(value=''), + gr.Button(visible=False)] + case 'Sequence': + return [gr.Dropdown(info='Enter (paste) a FASTA string below manually or upload a FASTA file.'), + gr.UploadButton(visible=True), + gr.Textbox(visible=False, value=''), + gr.Textbox(visible=False, value=''), + gr.Textbox(visible=False, value=''), + gr.Button(visible=False), + gr.Code(value=''), + 
gr.Button(visible=True)] + + + target_input_type.select( + fn=target_input_type_select, + inputs=target_input_type, + outputs=[ + target_input_type, target_upload_btn, + target_id, target_gene, target_organism, target_query_btn, + target_fasta, target_paste_markdown + ], + show_progress='hidden' + ) + + + def uniprot_query(input_type, uid, gene, organism='Human'): + uniprot_endpoint = 'https://rest.uniprot.org/uniprotkb/{query}' + fasta_rec = '' + + match input_type: + case 'UniProt ID': + query = f"{uid.strip()}.fasta" + case 'Gene symbol': + organism = organism if organism else 'Human' + query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta' + + try: + fasta = session.get(uniprot_endpoint.format(query=query)) + fasta.raise_for_status() + if fasta.text: + fasta_rec = next(SeqIO.parse(io.StringIO(fasta.text), format='fasta')) + fasta_rec = f">{fasta_rec.description}\n{fasta_rec.seq}" + + except Exception as e: + raise gr.Warning(f"Failed to query FASTA from UniProt database due to {str(e)}") + finally: + return fasta_rec + + + def process_fasta_upload(fasta_upload): + fasta = '' + try: + fasta = fasta_upload.decode() + except Exception as e: + gr.Warning(f"Please upload a valid FASTA file. 
Error: {str(e)}") + return fasta + + + target_upload_btn.upload( + fn=process_fasta_upload, inputs=target_upload_btn, outputs=target_fasta + ).then( + fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden' + ) + target_query_btn.click( + fn=uniprot_query, inputs=[target_input_type, target_id, target_gene, target_organism], outputs=target_fasta + ).then( + fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden' + ) + + + def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)): + try: + aligner = PairwiseAligner(mode='local') + alignment_df = get_fasta_family_map() + + processed_fasta = process_target_fasta(fasta) + + # Check for an exact match first + exact_match = alignment_df[alignment_df['X2'] == processed_fasta] + if not exact_match.empty: + row = exact_match.iloc[0] + family = str(row['Target Family']).title() + return gr.Dropdown( + value=family, + info=f"Reason: Exact match found with {row['ID2']} from family {family}") + + # If no exact match, then calculate alignment score + def align_score(query): + alignment = aligner.align(processed_fasta, query) + return alignment.score / max(len(processed_fasta), len(query)) + + alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score) + row = alignment_df.loc[alignment_df['score'].idxmax()] + family = str(row['Target Family']).title() + return gr.Dropdown(value=family, + info=f"Reason: Best sequence identity ({row['score']}) " + f"with {row['ID2']} from family {family}") + except Exception as e: + gr.Warning("Failed to detect the protein family due to error: " + str(e)) + + + target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family) + + # target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden') + target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden') + + drug_library_upload_btn.upload(fn=lambda x: [ + 
x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name]) + ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library]) + + drug_screen_task.select( + fn=lambda task, opts: gr.CheckboxGroup(choices=DRUG_SCRENN_CPA_OPTS) + if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup( + choices=DRUG_SCRENN_CPI_OPTS, value=DRUG_SCRENN_CPI_OPTS[0]), + inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts, + show_progress='hidden' + ) + + target_identify_task.select( + fn=lambda task, opts: gr.CheckboxGroup(choices=TARGET_IDENTIFY_CPA_OPTS) + if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup( + choices=TARGET_IDENTIFY_CPI_OPTS, value=TARGET_IDENTIFY_CPI_OPTS[0]), + inputs=[target_identify_task, target_identify_opts], outputs=target_identify_opts, + show_progress='hidden' + ) + + def example_fill(input_type): + return {target_id: 'Q16539', + target_gene: 'MAPK14', + target_organism: 'Human', + target_fasta: """ +>sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3 +MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ +SIIHAKRTYRELRLLKHMKHENVIGLLDVFTPARSLEEFNDVYLVTHLMGADLNNIVKCQ +KLTDDHVQFLIYQILRGLKYIHSADIIHRDLKPSNLAVNEDCELKILDFGLARHTDDEMT +GYVATRWYRAPEIMLNWMHYNQTVDIWSVGCIMAELLTGRTLFPGTDHIDQLKLILRLVG +TPGAELLKKISSESARNYIQSLTQMPKMNFANVFIGANPLAVDLLEKMLVLDSDKRITAA +QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES +"""} + + + example_fasta.click(fn=example_fill, inputs=target_input_type, outputs=[ + target_id, target_gene, target_organism, target_fasta], show_progress='hidden') + + + def screen_recommend_model(fasta, family, task): + task = TASK_MAP[task] + score = TASK_METRIC_MAP[task] + benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv') + + if not fasta: + gr.Warning('Please enter a valid FASTA for model recommendation.') + return [None, family] + + if family == 'General': + 
seen_targets = get_seen_fastas('General', task)['X2'].values + if process_target_fasta(fasta) in seen_targets: + scenario = "Seen Target" + else: + scenario = "Unseen Target" + filtered_df = benchmark_df[(benchmark_df['Family'] == 'All Families') + & (benchmark_df['Scenario'] == scenario) + & (benchmark_df['Type'] == 'General')] + + else: + seen_targets_general = get_seen_fastas('General', task)['X2'].values + if process_target_fasta(fasta) in seen_targets_general: + scenario_general = "Seen Target" + else: + scenario_general = "Unseen Target" + + seen_targets_family = get_seen_fastas(family, task)['X2'].values + if process_target_fasta(fasta) in seen_targets_family: + scenario_family = "Seen Target" + else: + scenario_family = "Unseen Target" + + filtered_df_general = benchmark_df[(benchmark_df['Family'] == family) + & (benchmark_df['Scenario'] == scenario_general) + & (benchmark_df['Type'] == 'General')] + filtered_df_family = benchmark_df[(benchmark_df['Family'] == family) + & (benchmark_df['Scenario'] == scenario_family) + & (benchmark_df['Type'] == 'Family')] + filtered_df = pd.concat([filtered_df_general, filtered_df_family]) + + row = filtered_df.loc[filtered_df[score].idxmax()] + if row['Scenario'] == 'Seen Target': + scenario = "Seen Target (>=0.85 sequence identity)" + elif row['Scenario'] == 'Unseen Target': + scenario = "Unseen Target (<0.85 sequence identity)" + + return {drug_screen_preset: + gr.Dropdown(value=row['Model'], + info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained " + f"model with the best {score} in the {scenario} scenario on {row['Family']}."), + drug_screen_target_family: + gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)} + + + screen_preset_recommend_btn.click( + fn=screen_recommend_model, + inputs=[target_fasta, drug_screen_target_family, drug_screen_task], + outputs=[drug_screen_preset, drug_screen_target_family], + show_progress='hidden' + ) + + + def 
compound_input_type_select(input_type): + match input_type: + case 'SMILES': + return gr.Button(visible=False) + case 'SDF': + return gr.Button(visible=True) + + + compound_type.select(fn=compound_input_type_select, + inputs=compound_type, outputs=compound_upload_btn, show_progress='hidden') + + + def compound_upload_process(input_type, input_upload): + smiles = '' + try: + match input_type: + case 'SMILES': + smiles = input_upload.decode() + case 'SDF': + suppl = Chem.ForwardSDMolSupplier(io.BytesIO(input_upload)) + smiles = Chem.MolToSmiles(next(suppl)) + except Exception as e: + gr.Warning(f"Please upload a valid {input_type} file. Error: {str(e)}") + return smiles + + + compound_upload_btn.upload(fn=compound_upload_process, + inputs=[compound_type, compound_upload_btn], + outputs=compound_smiles) + + example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=compound_smiles, show_progress='hidden') + + target_library_upload_btn.upload(fn=lambda x: [ + x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name]) + ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library]) + + + def identify_recommend_model(smiles, family, task): + task = TASK_MAP[task] + score = TASK_METRIC_MAP[task] + benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv') + + if not smiles: + gr.Warning('Please enter a valid SMILES for model recommendation.') + return None + if family == 'Family-Specific Auto-Recommendation': + return 'Family-Specific Auto-Recommendation' + + if family == 'General': + seen_compounds = pd.read_csv( + f'data/benchmarks/seen_compounds/all_families_full_{task.lower()}_random_split.csv') + family = 'All Families' + + else: + seen_compounds = pd.read_csv( + f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv') + + if rdkit_canonicalize(smiles) in seen_compounds['X1'].values: + scenario = "Seen Compound" + else: + scenario = 
"Unseen Compound" + + filtered_df = benchmark_df[(benchmark_df['Family'] == family) + & (benchmark_df['Scenario'] == scenario) + & (benchmark_df['Type'] == 'General')] + + row = filtered_df.loc[filtered_df[score].idxmax()] + + return gr.Dropdown(value=row['Model'], + info=f"Reason: {scenario} in training; choosing the model " + f"with the best {score} in the {scenario} scenario.") + + + identify_preset_recommend_btn.click(fn=identify_recommend_model, + inputs=[compound_smiles, target_identify_target_family, target_identify_task], + outputs=target_identify_preset, show_progress='hidden') + + + def infer_type_change(upload_type): + match upload_type: + case "Upload a compound library and a target library": + return { + pair_upload: gr.Column(visible=False), + pair_generate: gr.Column(visible=True), + infer_pair: None, + infer_drug: None, + infer_target: None, + infer_csv_prompt: gr.Button(visible=False), + infer_library_prompt: gr.Button(visible=True), + } + case "Upload a CSV file containing paired compound-protein data": + return { + pair_upload: gr.Column(visible=True), + pair_generate: gr.Column(visible=False), + infer_pair: None, + infer_drug: None, + infer_target: None, + infer_csv_prompt: gr.Button(visible=True), + infer_library_prompt: gr.Button(visible=False), + } + + + infer_type.select(fn=infer_type_change, inputs=infer_type, + outputs=[pair_upload, pair_generate, infer_pair, infer_drug, infer_target, + infer_csv_prompt, infer_library_prompt], + show_progress='hidden') + + + def common_input_validate(state, preset, email, request): + gr.Info('Start processing inputs...') + if not preset: + raise gr.Error('Please select a model.') + + if email: + try: + email_info = validate_email(email, check_deliverability=False) + email = email_info.normalized + except EmailNotValidError as e: + raise gr.Error(f"Invalid email address: {str(e)}.") + + if state: + raise gr.Error(f"You already have a running prediction job (ID: {state['id']}) under this session. 
" + "Please wait for it to complete before submitting another job.") + + if check := check_user_running_job(email, request): + raise gr.Error(check) + + return state, preset, email + + + def common_job_initiate(job_id, job_type, email, request, task): + gr.Info('Finished processing inputs. Initiating the prediction job... ' + 'You will be redirected to Prediction Status Lookup once the job has been submitted.') + job_info = {'id': job_id, + 'type': job_type, + 'task': task, + 'status': 'RUNNING', + 'email': email, + 'ip': request.headers.get('x-forwarded-for', request.client.host), + 'cookies': dict(request.cookies), + 'start_time': time(), + 'end_time': None, + 'expiry_time': None, + 'error': None} + # db.insert(job_info) + return job_info + + + def drug_screen_validate(fasta, library, library_upload, preset, task, email, state, + request: gr.Request, progress=gr.Progress(track_tqdm=True)): + state, preset, email = common_input_validate(state, preset, email, request) + + fasta = process_target_fasta(fasta) + err = validate_seq_str(fasta, FASTA_PAT) + if err: + raise gr.Error(f'Found error(s) in your Target FASTA input: {err}') + if not library: + raise gr.Error('Please select or upload a compound library.') + if library in DRUG_LIBRARY_MAP.keys(): + screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library])) + else: + screen_df = process_drug_library_upload(library_upload) + if len(screen_df) >= DATASET_MAX_LEN: + raise gr.Error(f'The uploaded compound library has more records ' + f'than the allowed maximum {DATASET_MAX_LEN}.') + + screen_df['X2'] = fasta + + job_id = str(uuid4()) + temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve() + screen_df.to_csv(temp_file, index=False, na_rep='') + if temp_file.is_file(): + job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task) + return {screen_data_for_predict: str(temp_file), + run_state: job_info} + else: + raise gr.Error('System failed to create temporary 
files. Please try again later.') + + + def target_identify_validate(smiles, library, library_upload, preset, task, email, state, + request: gr.Request, progress=gr.Progress(track_tqdm=True)): + state, preset, email = common_input_validate(state, preset, email, request) + + smiles = smiles.strip() + err = validate_seq_str(smiles, SMILES_PAT) + if err: + raise gr.Error(f'Found error(s) in your Compound SMILES input: {err}') + if not library: + raise gr.Error('Please select or upload a target library.') + if library in TARGET_LIBRARY_MAP.keys(): + identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library])) + else: + identify_df = process_target_library_upload(library_upload) + if len(identify_df) >= DATASET_MAX_LEN: + raise gr.Error(f'The uploaded target library has more records ' + f'than the allowed maximum {DATASET_MAX_LEN}.') + identify_df['X1'] = smiles + + job_id = str(uuid4()) + temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve() + identify_df.to_csv(temp_file, index=False, na_rep='') + if temp_file.is_file(): + job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task) + return {identify_data_for_predict: str(temp_file), + run_state: job_info} + else: + raise gr.Error('System failed to create temporary files. 
Please try again later.') + + + def pair_infer_validate(drug_target_pair_upload, drug_upload, target_upload, preset, task, email, state, + request: gr.Request, progress=gr.Progress(track_tqdm=True)): + state, preset, email = common_input_validate(state, preset, email, request) + + job_id = str(uuid4()) + if drug_target_pair_upload: + infer_df = pd.read_csv(drug_target_pair_upload) + validate_columns(infer_df, ['X1', 'X2']) + + infer_df['X1_ERR'] = infer_df['X1'].parallel_apply( + validate_seq_str, regex=SMILES_PAT) + if not infer_df['X1_ERR'].isna().all(): + raise ValueError( + f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}") + + infer_df['X2_ERR'] = infer_df['X2'].parallel_apply( + validate_seq_str, regex=FASTA_PAT) + if not infer_df['X2_ERR'].isna().all(): + raise ValueError( + f"Encountered invalid FASTA:\n{infer_df[~infer_df['X2_ERR'].isna()][['X2', 'X2_ERR']]}") + + temp_file = Path(drug_target_pair_upload).resolve() + + elif drug_upload and target_upload: + drug_df = process_drug_library_upload(drug_upload) + target_df = process_target_library_upload(target_upload) + + drug_df.drop_duplicates(subset=['X1'], inplace=True) + target_df.drop_duplicates(subset=['X2'], inplace=True) + + infer_df = pd.DataFrame(list(itertools.product(drug_df['X1'], target_df['X2'])), + columns=['X1', 'X2']) + infer_df = infer_df.merge(drug_df, on='X1').merge(target_df, on='X2') + + if len(infer_df) >= DATASET_MAX_LEN: + raise gr.Error(f'The uploaded/generated compound-protein pair dataset has more records ' + f'than the allowed maximum {DATASET_MAX_LEN}.') + + temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve() + infer_df.to_csv(temp_file, index=False, na_rep='') + + else: + raise gr.Error('Should upload a compound-protein pair dataset, or ' + 'upload both a compound library and a target library.') + + if temp_file.is_file(): + job_info = common_job_initiate(job_id, 'Interaction Pair Inference', email, request, task) + return 
{infer_data_for_predict: str(temp_file), + run_state: job_info} + else: + raise gr.Error('System failed to create temporary files. Please try again later.') + + + def fill_job_id(job_info): + try: + return job_info['id'] + except Exception as e: + gr.Warning(f'Failed to fetch job ID due to error: {str(e)}') + return '' + + + drug_screen_click = drug_screen_btn.click( + fn=drug_screen_validate, + inputs=[target_fasta, drug_library, drug_library_upload, drug_screen_preset, drug_screen_task, + drug_screen_email, run_state], + outputs=[screen_data_for_predict, run_state], + concurrency_limit=2, + ) + + drug_screen_lookup = drug_screen_click.success( + fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs], + ).then( + fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id] + ).then( + fn=lookup_job, + inputs=[pred_lookup_id], + outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report], + show_progress='minimal', + concurrency_limit=100, + ) + + # drug_screen_click.success( + # fn=send_email, + # inputs=[run_state] + # ) + + drug_screen_click.success( + fn=submit_predict, + inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset, + drug_screen_target_family, drug_screen_opts, run_state, ], + outputs=[run_state, ] + ) + + drug_screen_clr_btn.click( + lambda: ['General'] + [[]] + [None] * 5, + outputs=[drug_screen_target_family, drug_screen_opts, + target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email], + show_progress='hidden' + ) + + target_identify_clr_btn.click( + lambda: ['General'] + [[]] + [None] * 5, + outputs=[target_identify_target_family, target_identify_opts, + compound_smiles, target_identify_preset, target_library, target_library_upload, target_identify_email], + show_progress='hidden' + ) + + pair_infer_clr_btn.click( + lambda: ['General'] + [None] * 5, + outputs=[pair_infer_target_family, + infer_pair, infer_drug, infer_target, pair_infer_preset, 
pair_infer_email], + show_progress='hidden' + ) + + report_clr_btn.click( + lambda: [[]] * 3 + [None] * 3 + + [gr.Button(interactive=False)] * 3 + + [gr.File(visible=False, value=None)] * 2 + + [gr.Dropdown(visible=False, value=None), gr.HTML(value=''), gr.CheckboxGroup(visible=False)], + outputs=[ + scores, filters, html_opts, + file_for_report, raw_df, report_df, + csv_generate, html_generate, analyze_btn, + csv_download_file, html_download_file, + report_task, html_report, job_opts + ], + show_progress='hidden' + ) + + + def update_preset(family, preset): + if family == 'Family-Specific Auto-Recommendation': + return 'Family-Specific Auto-Recommendation' + elif preset == 'Family-Specific Auto-Recommendation': + return None + else: + return preset + + + def update_family(family, preset): + if preset == 'Family-Specific Auto-Recommendation': + return 'Family-Specific Auto-Recommendation' + elif family == 'Family-Specific Auto-Recommendation': + return None + else: + return family + + + target_identify_target_family.change( + fn=update_preset, inputs=[target_identify_target_family, target_identify_preset], + outputs=target_identify_preset, show_progress='hidden') + target_identify_preset.change( + fn=update_family, inputs=[target_identify_target_family, target_identify_preset], + outputs=target_identify_target_family, show_progress='hidden') + + target_identify_click = target_identify_btn.click( + fn=target_identify_validate, + inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task, + target_identify_email, run_state], + outputs=[identify_data_for_predict, run_state], + concurrency_limit=2, + ) + + target_identify_lookup = target_identify_click.success( + fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs], + ).then( + fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id] + ).then( + fn=lookup_job, + inputs=[pred_lookup_id], + outputs=[pred_lookup_status, pred_lookup_btn, 
pred_lookup_stop_btn, tabs, file_for_report], + show_progress='minimal', + concurrency_limit=100 + ) + + # target_identify_click.success( + # fn=send_email, + # inputs=[run_state] + # ) + + target_identify_click.success( + fn=submit_predict, + inputs=[identify_data_for_predict, target_identify_task, target_identify_preset, + target_identify_target_family, target_identify_opts, run_state, ], # , target_identify_email], + outputs=[run_state, ] + ) + + pair_infer_click = pair_infer_btn.click( + fn=pair_infer_validate, + inputs=[infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_task, + pair_infer_email, run_state], + outputs=[infer_data_for_predict, run_state], + concurrency_limit=2, + ) + + pair_infer_lookup = pair_infer_click.success( + fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs], + ).then( + fn=fill_job_id, inputs=[run_state], outputs=[pred_lookup_id] + ).then( + fn=lookup_job, + inputs=[pred_lookup_id], + outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report], + show_progress='minimal', + concurrency_limit=100 + ) + + # pair_infer_click.success( + # fn=send_email, + # inputs=[run_state] + # ) + + pair_infer_click.success( + fn=submit_predict, + inputs=[infer_data_for_predict, pair_infer_task, pair_infer_preset, + pair_infer_target_family, pair_infer_opts, run_state, ], # , pair_infer_email], + outputs=[run_state, ] + ) + + pred_lookup_click = pred_lookup_btn.click( + fn=lookup_job, + inputs=[pred_lookup_id], + outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report], + show_progress='minimal', + cancels=[drug_screen_lookup, target_identify_lookup, pair_infer_lookup], + concurrency_limit=100, + ) + + pred_lookup_stop_btn.click( + fn=lambda: [gr.Button(visible=True), gr.Button(visible=False)], + outputs=[pred_lookup_btn, pred_lookup_stop_btn], + cancels=[pred_lookup_click, drug_screen_lookup, target_identify_lookup, pair_infer_lookup], + 
def inquire_task(df):
    """Ask which task produced the ``Y^`` column of an uploaded dataset.

    Registered on ``file_for_report.upload`` with ``raw_df`` as its input.

    Args:
        df: The uploaded dataset. A value without a ``columns`` attribute
            (for example ``None``) is treated as having no ``Y^`` column.

    Returns:
        A dict of component updates keyed by the components themselves: the
        ``report_task`` dropdown is shown (and ``html_report`` cleared) when
        ``Y^`` is present, otherwise the dropdown is hidden.
    """
    # Guard against the dataframe state not being populated yet.
    columns = getattr(df, 'columns', ())
    if 'Y^' in columns:
        label = 'predicted CPI/CPA labels (`Y^`)'
        return {report_task: gr.Dropdown(visible=True,
                                         info=f'Found {label} in your uploaded dataset. '
                                              'Is it compound-protein interaction or binding affinity?'),
                html_report: ''}
    return {report_task: gr.Dropdown(visible=False)}


def _report_output_path(file_report, extension):
    """Build the timestamped output path for a generated report file.

    Args:
        file_report: The source file, either an object exposing ``.name`` or
            a plain path string.
        extension: File extension without the leading dot.

    Returns:
        ``{SERVER_DATA_DIR}/{stem}_DeepSEQreen_report_{timestamp}.{extension}``
        where ``stem`` is taken from the source file name.
    """
    source_path = getattr(file_report, 'name', file_report)
    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    return f"{SERVER_DATA_DIR}/{Path(source_path).stem}_DeepSEQreen_report_{now}.{extension}"


def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
    """Write the report dataframe to a CSV file and return it for download.

    Args:
        df: Report dataframe to export.
        file_report: Source file whose stem is reused in the output name.
        task: Task name; the two known task strings rename ``Y^`` to
            ``Y_prob`` / ``Y_IC50``, anything else leaves ``Y^`` untouched.
        sep: ``'Comma'`` or ``'Tab'``.
        progress: Gradio progress tracker (not used directly in the body).

    Returns:
        A visible ``gr.File`` pointing at the written CSV, or ``None`` after
        raising a ``gr.Warning`` when generation fails.
    """
    csv_sep_map = {
        'Comma': ',',
        'Tab': '\t',
    }
    y_colname_map = {
        'Compound-Protein Interaction': 'Y_prob',
        'Compound-Protein Binding Affinity': 'Y_IC50',
    }
    # Only string tasks are looked up, so unhashable values keep the default.
    y_colname = y_colname_map.get(task, 'Y^') if isinstance(task, str) else 'Y^'
    try:
        filename = _report_output_path(file_report, 'csv')
        # errors='ignore': a dataframe lacking the `Compound` / `Scaffold`
        # columns must still be exportable instead of failing with KeyError.
        df.rename(columns={'Y^': y_colname}).drop(
            labels=['Compound', 'Scaffold'], axis=1, errors='ignore'
        ).to_csv(filename, index=False, na_rep='', sep=csv_sep_map[sep])

        return gr.File(filename, visible=True)
    # Broad on purpose: this is a UI callback boundary, so every failure is
    # reported to the user as a warning rather than crashing the event.
    except Exception as e:
        gr.Warning(f"Failed to generate CSV due to error: {str(e)}")
        return None


def create_html_report_file(df, file_report, task, opts, progress=gr.Progress(track_tqdm=True)):
    """Render the report dataframe to an HTML file and return it for download.

    Args:
        df: Report dataframe to export.
        file_report: Source file whose stem is reused in the output name.
        task: Task name forwarded to ``create_html_report``.
        opts: Options forwarded to ``create_html_report``.
        progress: Gradio progress tracker (not used directly in the body).

    Returns:
        A visible ``gr.File`` pointing at the written HTML, or ``None`` after
        raising a ``gr.Warning`` when generation fails.
    """
    try:
        filename = _report_output_path(file_report, 'html')
        create_html_report(df, filename, task, opts)
        return gr.File(filename, visible=True)
    # Broad on purpose: see create_csv_report_file.
    except Exception as e:
        gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
        return None
TinyDB(f'{SERVER_DATA_DIR}/db.json') + # Set all RUNNING jobs to FAILED at TinyDB initialization + Job = Query() + jobs = db.all() + for job in jobs: + if job['status'] == 'RUNNING': + db.update({'status': 'FAILED'}, Job.id == job['id']) + + scheduler = BackgroundScheduler() + scheduler.add_job(check_expiry, 'interval', hours=1, timezone=pytz.utc) + scheduler.start() + + demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)