diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,50 +1,2249 @@
-from email.utils import formatdate, make_msgid
+import concurrent.futures
+import glob
+import smtplib
+from datetime import datetime
+import itertools
+import textwrap
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
-import smtplib
+from email.utils import formatdate, make_msgid
+from math import pi
+from time import sleep, time
+from uuid import uuid4
+
+import io
+import os
+from pathlib import Path
+import sys
+
+import pytz
+from Bio import SeqIO
+from Bio.Align import PairwiseAligner
+from email_validator import validate_email, EmailNotValidError
+import gradio as gr
+import hydra
+import pandas as pd
+import requests
from markdown import markdown
+from rdkit.Chem.PandasTools import _MolPlusFingerprint
+from rdkit.Chem.rdMolDescriptors import CalcNumRotatableBonds, CalcNumHeavyAtoms, CalcNumAtoms, CalcTPSA
+from requests.adapters import HTTPAdapter, Retry
+from rdkit import Chem
+from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools
+from rdkit.Chem.Scaffolds import MurckoScaffold
+import seaborn as sns
+
+from bokeh.models import Legend, NumberFormatter, BooleanFormatter, HTMLTemplateFormatter, LegendItem
+from bokeh.palettes import Category20c_20
+from bokeh.plotting import figure
+from bokeh.transform import cumsum
+from bokeh.resources import INLINE
+import panel as pn
+
+from apscheduler.schedulers.background import BackgroundScheduler
+from tinydb import TinyDB, Query
+
+import swifter
+from tqdm.auto import tqdm
+
+from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
+from deepscreen.predict import predict
+sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
+import sascorer
-def send_email(receiver, job_info):
- email_serv = "smtpdm.aliyun.com" # "ciddr-lab.ac.cn" # "srvsmtp.xjtlu.edu.cn"
- email_port = 80 # 1025 # 587 # 25
- email_addr = "deepseqreen@ciddr-lab.ac.cn"
- email_pass = "ciddrw447JkpB"
- email_form = """
-Dear user,
+UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
+DATASET_MAX_LEN = 10_000
+SERVER_DATA_DIR = 'data' # '/data'
-Your DeepSEQreen job is {status}.
+CSS = """
+.help-tip {
+ position: absolute;
+ display: inline-block;
+ top: 16px;
+ right: 0px;
+ text-align: center;
+ border-radius: 40%;
+ /* border: 2px solid darkred; background-color: #8B0000;*/
+ width: 24px;
+ height: 24px;
+ font-size: 16px;
+ line-height: 26px;
+ cursor: default;
+ transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
+ z-index: 100 !important;
+}
-**Job details:**
+.help-tip:hover {
+ cursor: pointer;
+ /*background-color: #ccc;*/
+}
- - Job id: {id}
- - Job type: {type}
- - Start time: {start_time}
- - End time: {end_time}
- - Expiry time: {expiry_time}
- - Error: {error}
+.help-tip:before {
+ content: '?';
+ font-weight: 700;
+ color: #8B0000;
+ z-index: 100 !important;
+}
-Please visit the [DeepSEQreen web server](https://www.ciddr-lab.ac.cn/deepseqreen/) to check the job status or retrieve the results.
+.help-tip p {
+ visibility: hidden;
+ opacity: 0;
+ text-align: left;
+ background-color: #EFDDE3;
+ padding: 20px;
+ width: 300px;
+ position: absolute;
+ border-radius: 4px;
+ right: -4px;
+ color: #494F5A;
+ font-size: 13px;
+ line-height: normal;
+ transform: scale(0.7);
+ transform-origin: 100% 0%;
+ transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
+ z-index: 100;
+}
-Best,
+.help-tip:hover p {
+ cursor: default;
+ visibility: visible;
+ opacity: 1;
+ transform: scale(1.0);
+}
+
+.help-tip p:before {
+ position: absolute;
+ content: '';
+ width: 0;
+ height: 0;
+ border: 6px solid transparent;
+ border-bottom-color: #EFDDE3;
+ right: 10px;
+ top: -12px;
+}
+
+.help-tip p:after {
+ width: 100%;
+ height: 40px;
+ content: '';
+ position: absolute;
+ top: -5px;
+ left: 0;
+ z-index: 101;
+}
+
+.upload_button {
+ background-color: #008000;
+}
+
+.absolute {
+ position: absolute;
+}
+
+.example {
+padding: 0;
+background: none;
+border: none;
+text-decoration: underline;
+box-shadow: none;
+text-align: left !important;
+display: inline-block !important;
+}
+
+footer {
+visibility: hidden
+}
-CIDDR Team
"""
- server = smtplib.SMTP(email_serv, email_port)
- # server.starttls()
- server.login(email_addr, email_pass)
- msg = MIMEMultipart("alternative")
- msg["From"] = email_addr
- msg["To"] = receiver
- msg["Subject"] = f"DeepSEQreen Job {job_info['status']}: {job_info['id']}"
- msg["Date"] = formatdate(localtime=True)
- msg["Message-ID"] = make_msgid()
- msg.attach(MIMEText(markdown(email_form.format(**job_info)), 'html'))
- msg.attach(MIMEText(email_form.format(**job_info), 'plain'))
+class HelpTip:
+ def __new__(cls, text):
+ return gr.HTML(
+ # elem_classes="absolute",
+            value=f'<div class="help-tip"><p>{text}</p></div>',
+ )
+
+
+TASK_MAP = {
+ 'Compound-Protein Interaction': 'DTI',
+ 'Compound-Protein Binding Affinity': 'DTA',
+}
+
+TASK_METRIC_MAP = {
+ 'DTI': 'AUROC',
+ 'DTA': 'CI',
+}
+
+PRESET_MAP = {
+ 'DeepDTA': 'deep_dta',
+ 'DeepConvDTI': 'deep_conv_dti',
+ 'GraphDTA': 'graph_dta',
+ 'MGraphDTA': 'm_graph_dta',
+ 'HyperAttentionDTI': 'hyper_attention_dti',
+ 'MolTrans': 'mol_trans',
+ 'TransformerCPI': 'transformer_cpi',
+ 'TransformerCPI2': 'transformer_cpi_2',
+ 'DrugBAN': 'drug_ban',
+ 'DrugVQA-Seq': 'drug_vqa'
+}
+
+TARGET_FAMILY_MAP = {
+ 'General': 'general',
+ 'Kinase': 'kinase',
+ 'Non-Kinase Enzyme': 'non_kinase_enzyme',
+ 'Membrane Receptor': 'membrane_receptor',
+ 'Nuclear Receptor': 'nuclear_receptor',
+ 'Ion Channel': 'ion_channel',
+ 'Others': 'others',
+}
+
+TARGET_LIBRARY_MAP = {
+ 'DrugBank (Human)': 'drugbank_targets.csv',
+ 'ChEMBL33 (Human)': 'ChEMBL33_human_proteins.csv',
+}
+
+DRUG_LIBRARY_MAP = {
+ 'DrugBank (Human)': 'drugbank_compounds.csv',
+ 'Drug Repurposing Hub': 'drug_repurposing_hub.csv'
+}
+
+COLUMN_ALIASES = {
+ 'X1': 'Compound SMILES',
+ 'X2': 'Target FASTA',
+ 'ID1': 'Compound ID',
+ 'ID2': 'Target ID',
+ 'Y': 'Actual CPI/CPA',
+ 'Y^': 'Predicted CPI/CPA',
+}
+
+pd.set_option('display.float_format', '{:.3f}'.format)
+PandasTools.molRepresentation = 'svg'
+PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
+PandasTools.drawOptions.clearBackground = False
+PandasTools.drawOptions.bondLineWidth = 1
+PandasTools.drawOptions.explicitMethyl = True
+PandasTools.drawOptions.singleColourWedgeBonds = True
+PandasTools.drawOptions.useCDKAtomPalette()
+PandasTools.molSize = (128, 80)
+
+session = requests.Session()
+ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
+session.mount('http://', ADAPTER)
+session.mount('https://', ADAPTER)
+
+db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
+# Set all RUNNING jobs to FAILED at TinyDB initialization
+Job = Query()
+jobs = db.all()
+for job in jobs:
+ if job['status'] == 'RUNNING':
+ db.update({'status': 'FAILED'}, Job.id == job['id'])
+
+scheduler = BackgroundScheduler()
+
+
+def check_expiry():
+ Job = Query()
+ jobs = db.all()
+
+ for job in jobs:
+ # Check if the job has expired
+ if job['expiry_time'] < time():
+ # Delete the job from the database
+ db.remove(Job.id == job['id'])
+ # Delete the corresponding file
+            files = glob.glob(f"{SERVER_DATA_DIR}/{job['id']}*")
+ for file_path in files:
+ if os.path.exists(file_path):
+ os.remove(file_path)
+ elif job['status'] == 'RUNNING' and time() - job['start_time'] > 4 * 60 * 60: # 4 hours
+ # Mark the job as failed
+ db.update({'status': 'FAILED',
+ 'error': 'Job has timed out by exceeding the maximum running time of 4 hours.'},
+ Job.id == job['id'])
+ if job.get('email'):
+ send_email(job)
+
+
+scheduler.add_job(check_expiry, 'interval', hours=1)
+scheduler.start()
+
+
+def sa_score(mol):
+ return sascorer.calculateScore(mol)
+
+
+def mw(mol):
+ return Chem.Descriptors.MolWt(mol)
+
+
+def mr(mol):
+ return Crippen.MolMR(mol)
+
+
+def hbd(mol):
+ return Lipinski.NumHDonors(mol)
+
+
+def hba(mol):
+ return Lipinski.NumHAcceptors(mol)
+
+
+def logp(mol):
+ return Crippen.MolLogP(mol)
+
+
+def atom(mol):
+ return CalcNumAtoms(mol)
+
+
+def heavy_atom(mol):
+ return CalcNumHeavyAtoms(mol)
+
+
+def rotatable_bond(mol):
+ return CalcNumRotatableBonds((mol))
+
+
+def tpsa(mol):
+ return CalcTPSA((mol))
+
+
+def lipinski(mol):
+ """
+ Lipinski's rules:
+ Hydrogen bond donors <= 5
+ Hydrogen bond acceptors <= 10
+ Molecular weight <= 500 daltons
+ logP <= 5
+ """
+ if hbd(mol) > 5:
+ return False
+ elif hba(mol) > 10:
+ return False
+ elif mw(mol) > 500:
+ return False
+ elif logp(mol) > 5:
+ return False
+ else:
+ return True
+
+
+def reos(mol):
+ """
+ Rapid Elimination Of Swill filter:
+ Molecular weight between 200 and 500
+ LogP between -5.0 and +5.0
+ H-bond donor count between 0 and 5
+ H-bond acceptor count between 0 and 10
+ Formal charge between -2 and +2
+ Rotatable bond count between 0 and 8
+ Heavy atom count between 15 and 50
+ """
+ if not 200 < mw(mol) < 500:
+ return False
+ elif not -5.0 < logp(mol) < 5.0:
+ return False
+ elif not 0 < hbd(mol) < 5:
+ return False
+ elif not 0 < hba(mol) < 10:
+ return False
+ elif not 0 < rotatable_bond(mol) < 8:
+ return False
+ elif not 15 < heavy_atom(mol) < 50:
+ return False
+ else:
+ return True
+
+
+def ghose(mol):
+ """
+ Ghose drug like filter:
+ Molecular weight between 160 and 480
+ LogP between -0.4 and +5.6
+ Atom count between 20 and 70
+ Molar refractivity between 40 and 130
+ """
+ if not 160 < mw(mol) < 480:
+ return False
+ elif not -0.4 < logp(mol) < 5.6:
+ return False
+ elif not 20 < atom(mol) < 70:
+ return False
+ elif not 40 < mr(mol) < 130:
+ return False
+ else:
+ return True
+
+
+def veber(mol):
+ """
+ The Veber filter is a rule of thumb filter for orally active drugs described in
+ Veber et al., J Med Chem. 2002; 45(12): 2615-23.:
+ Rotatable bonds <= 10
+ Topological polar surface area <= 140
+ """
+ if not rotatable_bond(mol) <= 10:
+ return False
+ elif not tpsa(mol) <= 140:
+ return False
+ else:
+ return True
+
+
+def rule_of_three(mol):
+ """
+ Rule of Three filter (Congreve et al., Drug Discov. Today. 8 (19): 876–7, (2003).):
+ Molecular weight <= 300
+ LogP <= 3
+ H-bond donor <= 3
+ H-bond acceptor count <= 3
+ Rotatable bond count <= 3
+ """
+ if not mw(mol) <= 300:
+ return False
+ elif not logp(mol) <= 3:
+ return False
+ elif not hbd(mol) <= 3:
+ return False
+ elif not hba(mol) <= 3:
+ return False
+ elif not rotatable_bond(mol) <= 3:
+ return False
+ else:
+ return True
+
+
+# def smarts_filter():
+# alerts = Chem.MolFromSmarts("enter one smart here")
+# detected_alerts = []
+# for smiles in data['X1']:
+# mol = Chem.MolFromSmiles(smiles)
+# detected_alerts.append(mol.HasSubstructMatch(alerts))
+
+
+SCORE_MAP = {
+ 'SAscore': sa_score,
+ 'LogP': logp,
+ 'Molecular Weight': mw,
+ 'Number of Heavy Atoms': heavy_atom,
+ 'Molar Refractivity': mr,
+ 'H-Bond Donor Count': hbd,
+ 'H-Bond Acceptor Count': hba,
+ 'Rotatable Bond Count': rotatable_bond,
+ 'Topological Polar Surface Area': tpsa,
+}
+
+FILTER_MAP = {
+ # TODO support number_of_violations
+ 'REOS': reos,
+ "Lipinski's Rule of Five": lipinski,
+ 'Ghose': ghose,
+ 'Rule of Three': rule_of_three,
+ 'Veber': veber,
+ # 'PAINS': pains,
+}
+
+
+def validate_columns(df, mandatory_cols):
+ missing_cols = [col for col in mandatory_cols if col not in df.columns]
+ if missing_cols:
+        error_message = (f"The following mandatory columns are missing "
+                         f"in the uploaded dataset: {str(missing_cols).strip('[]')}.")
+ raise ValueError(error_message)
+ else:
+ return
+
+
+def process_target_fasta(sequence):
+ try:
+ if sequence:
+ lines = sequence.strip().split("\n")
+ if lines[0].startswith(">"):
+ lines = lines[1:]
+ return ''.join(lines).split(">")[0]
+ # record = list(SeqIO.parse(io.StringIO(sequence), "fasta"))[0]
+ # return str(record.seq)
+ else:
+ raise ValueError('Empty FASTA sequence.')
+ except Exception as e:
+ raise gr.Error(f'Failed to process FASTA due to error: {str(e)}')
+
+
+def send_email(job_info):
+ if job_info.get('email'):
+ try:
+ email_serv = os.getenv('EMAIL_SERV')
+ email_port = os.getenv('EMAIL_PORT')
+ email_addr = os.getenv('EMAIL_ADDR')
+ email_pass = os.getenv('EMAIL_PASS')
+ email_form = os.getenv('EMAIL_FORM')
+ email_subj = os.getenv('EMAIL_SUBJ')
+
+ for key, value in job_info.items():
+ if key.endswith("time") and value:
+ job_info[key] = datetime.fromtimestamp(value).strftime("%Y-%m-%d %H:%M:%S")
+
+ server = smtplib.SMTP(email_serv, int(email_port))
+ # server.starttls()
+
+ server.login(email_addr, email_pass)
+ msg = MIMEMultipart("alternative")
+ msg["From"] = email_addr
+ msg["To"] = job_info['email']
+ msg["Subject"] = email_subj.format(**job_info)
+ msg["Date"] = formatdate(localtime=True)
+ msg["Message-ID"] = make_msgid()
+
+ msg.attach(MIMEText(markdown(email_form.format(**job_info)), 'html'))
+ msg.attach(MIMEText(email_form.format(**job_info), 'plain'))
+
+ server.sendmail(email_addr, job_info['email'], msg.as_string())
+ server.quit()
+ gr.Info('Email notification sent successfully.')
+ except Exception as e:
+ gr.Warning('Failed to send email notification due to error: ' + str(e))
+ else:
+ gr.Info('You won\'t receive an email notification as you haven\'t provided an email address. '
+ 'Please make sure you take note of the job ID.')
+
+
+def check_user_running_job(email, request):
+ message = ("You already have a running prediction job (ID: {id}) under this {reason}. "
+ "Please wait for it to complete before submitting another job.")
+ try:
+ # with open('jobs.json', 'r') as f: # /data/
+ # # Load the JSON data from the file
+ # jobs = json.load(f)
+ #
+ # for job_id, job_info in jobs.items():
+ # # check if a job is running for the email
+ # if email:
+ # if job_info["email"] == email and job_info["status"] == "running":
+ # return message.format(id=job_id, reason="email")
+ # # check if a job is running for the session
+ # elif request.cookies:
+ # for key, value in job_info["cookies"].items() and job_info["status"] == "running":
+ # if key in request.cookies and request.cookies[key] == value:
+ # return message.format(id=job_id, reason="session")
+ # # check if a job is running for the IP
+ # else:
+ # if job_info["IP"] == request.client.host and job_info["status"] == "running":
+ # return message.format(id=job_id, reason="IP")
+ # check if a job is running for the email
+ Job = Query()
+ if email:
+ job = db.search((Job.email == email) & (Job.status == "RUNNING"))
+ if job:
+ return message.format(id=job[0]['id'], reason="email")
+ # check if a job is running for the session
+ elif request.cookies:
+ for key, value in request.cookies.items():
+ job = db.search((Job.cookies[key] == value) & (Job.status == "RUNNING"))
+ if job:
+ return message.format(id=job[0]['id'], reason="session")
+ # check if a job is running for the IP
+ else:
+ job = db.search((Job.IP == request.client.host) & (Job.status == "RUNNING"))
+ if job:
+ return message.format(id=job[0]['id'], reason="IP")
+
+ return False
+ except Exception as e:
+ raise gr.Error(f'Failed to validate user running jobs due to error: {str(e)}')
+
+
+def get_timezone_by_ip(ip):
+ try:
+ data = session.get(f'http://ip-api.com/json/{ip}').json()
+ return data['timezone']
+ except Exception:
+ return 'UTC'
+
+
+def ts_to_str(ts, timezone_str):
+ if isinstance(ts, str):
+ return ts
+ local_tz = pytz.timezone(timezone_str)
+ dt = datetime.fromtimestamp(ts)
+ dt = dt.replace(tzinfo=pytz.utc) # Set the datetime object to UTC
+ localized_dt = dt.astimezone(local_tz) # Convert the datetime object to the desired timezone
+ return localized_dt.strftime('%Y-%m-%d %H:%M:%S (%Z%z)')
+
+
+def lookup_job(job_id):
+ stop = False
+ while not stop:
+ try:
+ Job = Query()
+ jobs = db.search((Job.id == job_id))
+ if jobs:
+ job = jobs[0]
+ job_status = job['status']
+ job_type = job['type']
+ error = job['error']
+ start_time = ts_to_str(job['start_time'], get_timezone_by_ip(job['ip']))
+ if job.get('end_time'):
+ end_time = ts_to_str(job['end_time'], get_timezone_by_ip(job['ip']))
+ if job.get('expiry_time'):
+ expiry_time = ts_to_str(job['expiry_time'], get_timezone_by_ip(job['ip']))
+ if job_status == "RUNNING":
+ sleep(5)
+ yield {
+ pred_lookup_status: f'''
+Your **{job_type}** job (ID: {job_id}) started at
+**{start_time}** and is **RUNNING...**
+
+It might take a few minutes up to a few hours depending on the prediction dataset, the model, and the queue status.
+You may keep the page open and wait for the completion, or close the page and revisit later to look up the job status
+using the job id. You will also receive an email notification once the job is done.
+''',
+ pred_lookup_btn: gr.Button(visible=False),
+ pred_lookup_stop_btn: gr.Button(visible=True)
+ }
+ if job_status == "COMPLETED":
+ stop = True
+ yield {
+                        pred_lookup_status: (f'Your {job_type} job (ID: {job_id}) has been **COMPLETED**'
+                                             + (f' at {end_time}' if job.get('end_time') else '')
+                                             + (f', and the results will expire by {expiry_time}.'
+                                                if job.get('expiry_time') else '.')
+                                             + ' Redirecting to the report page...'),
+ pred_lookup_btn: gr.Button(visible=True),
+ pred_lookup_stop_btn: gr.Button(visible=False),
+ tabs: gr.Tabs(selected='Chemical Property Report'),
+ file_for_report: job['output_file']
+ }
+ if job_status == "FAILED":
+ stop = True
+ yield {
+                        pred_lookup_status: (f'Your {job_type} job (ID: {job_id}) has **FAILED**'
+                                             + (f' at {end_time}' if job.get('end_time') else '')
+                                             + (f' due to error: {error}.' if error else '.')),
+ pred_lookup_btn: gr.Button(visible=True),
+ pred_lookup_stop_btn: gr.Button(visible=False),
+ tabs: gr.Tabs(selected='Prediction Status Lookup'),
+ }
+ else:
+ stop = True
+ yield {
+ pred_lookup_status: f'Job ID {job_id} not found.',
+ pred_lookup_btn: gr.Button(visible=True),
+ pred_lookup_stop_btn: gr.Button(visible=False),
+ tabs: gr.Tabs(selected='Prediction Status Lookup'),
+ }
+
+ except Exception as e:
+ raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
+
+
+def submit_predict(predict_filepath, task, preset, target_family, state):
+ job_id = state['id']
+ status = "RUNNING"
+ error = None
+ task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
+ predictions_file = None
+ try:
+ target_family = TARGET_FAMILY_MAP[target_family]
+
+ predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_{preset}_{target_family}_predictions.csv'
+
+ task = TASK_MAP[task]
+ preset = PRESET_MAP[preset]
+
+ prediction_df = pd.DataFrame()
+ with hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference"):
+ cfg = hydra.compose(
+ config_name="webserver_inference",
+ overrides=[f"task={task}",
+ f"preset={preset}",
+ f"ckpt_path=resources/checkpoints/{preset}-{task}-{target_family}.ckpt",
+ f"data.data_file='{str(predict_filepath)}'"])
+ # with concurrent.futures.ThreadPoolExecutor() as executor:
+ # future = executor.submit(predict, cfg)
+ # try:
+ # predictions, _ = future.result(timeout=4*60*60)
+ # except concurrent.futures.TimeoutError:
+ # raise gr.Error("Prediction timed out.")
+ predictions, _ = predict(cfg)
+ predictions = [pd.DataFrame(prediction) for prediction in predictions]
+ prediction_df = pd.concat([prediction_df, pd.concat(predictions, ignore_index=True)])
+ prediction_df.set_index('N', inplace=True)
+ orig_df = pd.read_csv(
+ predict_filepath,
+ usecols=lambda x: x not in ['X1', 'ID1', 'Compound', 'Scaffold', 'Scaffold SMILES',
+ 'X2', 'ID2',
+ 'Y', 'Y^']
+ )
+ prediction_df = pd.merge(prediction_df, orig_df, left_index=True, right_index=True, how='left')
+
+ prediction_df.to_csv(predictions_file)
+ status = "COMPLETED"
+
+ return {run_state: False}
+ except Exception as e:
+ gr.Warning(f"Prediction job failed due to error: {str(e)}")
+ status = "FAILED"
+ predictions_file = None
+ error = str(e)
+ return {run_state: False}
+ finally:
+ Job = Query()
+ job_query = (Job.id == job_id)
+
+ end_time = time()
+ expiry_time = end_time + 48 * 60 * 60 # Add 48 hours
+
+ db.update({'end_time': end_time,
+ 'expiry_time': expiry_time,
+ 'status': status,
+ 'error': error,
+ 'input_file': predict_filepath,
+ 'output_file': predictions_file},
+ job_query)
+        if job_info := next(iter(db.search(job_query)), None):
+ if job_info.get('email'):
+ send_email(job_info)
+
+
+def update_df(file, progress=gr.Progress(track_tqdm=True)):
+ if file and Path(file).is_file():
+ task = None
+ if "_CPI_" in str(file):
+ task = 'Compound-Protein Interaction'
+ elif "_CPA_" in str(file):
+ task = 'Compound-Protein Binding Affinity'
+
+ df = pd.read_csv(file)
+ if 'N' in df.columns:
+ df.set_index('N', inplace=True)
+ if not any(col in ['X1', 'X2'] for col in df.columns):
+ gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
+ return {analyze_btn: gr.Button(interactive=False)}
+ if 'X1' in df.columns:
+ df['Scaffold SMILES'] = df['X1'].swifter.progress_bar(
+ desc=f"Calculating scaffold...").apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
+ df['Scaffold'] = df['Scaffold SMILES'].swifter.progress_bar(
+ desc='Generating scaffold graphs...').apply(
+ lambda smiles: _MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
+ # Add a new column with RDKit molecule objects
+ if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
+ df['Compound'] = df['X1'].swifter.progress_bar(
+ desc='Generating molecular graphs...').apply(
+ lambda smiles: _MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
+
+ # DF_FOR_REPORT = df.copy()
+
+ # pie_chart = None
+ # value = None
+ # if 'Y^' in DF_FOR_REPORT.columns:
+ # value = 'Y^'
+ # elif 'Y' in DF_FOR_REPORT.columns:
+ # value = 'Y'
+
+ # if value:
+ # if DF_FOR_REPORT['X1'].nunique() > 1 >= DF_FOR_REPORT['X2'].nunique():
+ # pie_chart = create_pie_chart(DF_FOR_REPORT, category='Scaffold SMILES', value=value, top_k=100)
+ # elif DF_FOR_REPORT['X2'].nunique() > 1 >= DF_FOR_REPORT['X1'].nunique():
+ # pie_chart = create_pie_chart(DF_FOR_REPORT, category='Target family', value=value, top_k=100)
+
+ return {html_report: create_html_report(df, file=None, task=task),
+ raw_df: df,
+ report_df: df.copy(),
+ analyze_btn: gr.Button(interactive=True),
+ report_task: gr.Dropdown(value=task)} # pie_chart
+ else:
+ return {analyze_btn: gr.Button(interactive=False)}
+
+
+def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm=True)):
+ df_html = df.copy(deep=True)
+ column_aliases = COLUMN_ALIASES.copy()
+ cols_left = list(pd.Index(
+ ['ID1', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^']).intersection(df_html.columns))
+ cols_right = list(pd.Index(['X1', 'X2']).intersection(df_html.columns))
+ df_html = df_html[cols_left + (df_html.columns.drop(cols_left + cols_right).tolist()) + cols_right]
+
+ if isinstance(task, str):
+ column_aliases.update({
+ 'Y': 'Actual Interaction Probability' if task == 'Compound-Protein Interaction'
+ else 'Actual Binding Affinity',
+ 'Y^': 'Predicted Interaction Probability' if task == 'Compound-Protein Interaction'
+ else 'Predicted Binding Affinity'
+ })
+
+ ascending = True if column_aliases['Y^'] == 'Predicted Binding Affinity' else False
+ df_html = df_html.sort_values(
+ [col for col in ['Y', 'Y^'] if col in df_html.columns], ascending=ascending
+ )
+
+ if not file:
+ df_html = df_html.iloc[:31]
+
+ # Remove repeated info for one-against-N tasks to save visual and physical space
+ job = 'Chemical Property'
+ unique_entity = 'Unique Entity'
+ unique_df = None
+ category = None
+ columns_unique = None
+ if 'X1' in df_html.columns and 'X2' in df_html.columns:
+ n_compound = df_html['X1'].nunique()
+ n_protein = df_html['X2'].nunique()
+
+ if n_compound == 1 and n_protein >= 2:
+ unique_entity = 'Compound of Interest'
+ if any(col in df_html.columns for col in ['Y^', 'Y']):
+ job = 'Target Protein Identification'
+ category = 'Target Family'
+ columns_unique = df_html.columns.isin(['X1', 'ID1', 'Scaffold', 'Compound', 'Scaffold SMILES']
+ + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys()))
+
+ elif n_compound >= 2 and n_protein == 1:
+ unique_entity = 'Target of Interest'
+ if any(col in df_html.columns for col in ['Y^', 'Y']):
+ job = 'Drug Hit Screening'
+ category = 'Scaffold SMILES'
+ columns_unique = df_html.columns.isin(['X2', 'ID2'])
+
+ elif 'Y^' in df_html.columns:
+ job = 'Interaction Pair Inference'
+ if 'Compound' in df_html.columns:
+ df_html['Compound'] = df_html['Compound'].swifter.progress_bar(
+ desc='Generating compound graph...').apply(
+ lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
+ if 'Scaffold' in df_html.columns:
+ df_html['Scaffold'] = df_html['Scaffold'].swifter.progress_bar(
+ desc='Generating scaffold graph...').apply(
+ lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
+
+ df_html.rename(columns=column_aliases, inplace=True)
+ df_html.index.name = 'Index'
+ if 'Target FASTA' in df_html.columns:
+ df_html['Target FASTA'] = df_html['Target FASTA'].swifter.progress_bar(
+ desc='Processing FASTA...').apply(
+ lambda x: wrap_text(x) if not pd.isna(x) else x)
+
+ num_cols = df_html.select_dtypes('number').columns
+ num_col_colors = sns.color_palette('husl', len(num_cols))
+ bool_cols = df_html.select_dtypes(bool).columns
+ bool_col_colors = {True: 'lightgreen', False: 'lightpink'}
+
+ if columns_unique is not None:
+ unique_df = df_html.loc[:, columns_unique].iloc[[0]].copy()
+ df_html = df_html.loc[:, ~columns_unique]
+
+ if not file:
+ if 'Compound ID' in df_html.columns:
+ df_html.drop(['Compound SMILES'], axis=1, inplace=True)
+ if 'Target ID' in df_html.columns:
+ df_html.drop(['Target FASTA'], axis=1, inplace=True)
+ if 'Target FASTA' in df_html.columns:
+ df_html['Target FASTA'] = df_html['Target FASTA'].swifter.progress_bar(
+ desc='Processing FASTA...').apply(
+ lambda x: wrap_text(x) if not pd.isna(x) else x)
+ if 'Scaffold SMILES' in df_html.columns:
+ df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
+ styled_df = df_html.style.format(precision=3)
+
+ for i, col in enumerate(num_cols):
+ if col in df_html.columns:
+ if col not in ['Predicted Binding Affinity', 'Actual Binding Affinity']:
+ styled_df = styled_df.background_gradient(
+ subset=[col], cmap=sns.light_palette(num_col_colors[i], as_cmap=True))
+ else:
+ styled_df = styled_df.background_gradient(
+ subset=[col], cmap=sns.light_palette(num_col_colors[i], as_cmap=True).reversed())
+
+ if any(df_html.columns.isin(bool_cols)):
+ styled_df.applymap(lambda val: f'background-color: {bool_col_colors[val]}', subset=bool_cols)
+
+ table_html = styled_df.to_html()
+ unique_html = ''
+ if unique_df is not None:
+ if 'Target FASTA' in unique_df.columns:
+            unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '<br>')
+ if any(unique_df.columns.isin(bool_cols)):
+ unique_df = unique_df.style.applymap(
+ lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols)
+            unique_html = (f'<div>'
+                           f'{unique_df.to_html(escape=False, index=False)}</div>')
+
+    return (f'<h2>{job} Report Preview (Top 30 Records)</h2>'
+            f'<div>{unique_html}</div>'
+            f'<div>{table_html}</div>')
+
+ else:
+ bool_formatters = {col: BooleanFormatter() for col in bool_cols}
+ float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
+ other_formatters = {
+ 'Predicted Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
+ 'Actual Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
+            'Compound': HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>'),
+            'Scaffold': HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>'),
+ 'Target FASTA': {'type': 'textarea', 'width': 60},
+ 'Target ID': HTMLTemplateFormatter(
+ template='
<%= value %>'),
+ 'Compound ID': HTMLTemplateFormatter(
+ template='
<%= value %>')
+ }
+ formatters = {**bool_formatters, **float_formatters, **other_formatters}
+
+ # html = df.to_html(file)
+ # return html
+
+ report_table = pn.widgets.Tabulator(
+ df_html, formatters=formatters,
+ frozen_columns=['Index', 'Target ID', 'Compound ID', 'Compound', 'Scaffold'],
+ disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
+
+ for i, col in enumerate(num_cols):
+ if col not in ['Predicted Binding Affinity', 'Actual Binding Affinity']:
+ if col not in ['Predicted Interaction Probability', 'Actual Interaction Probability']:
+ report_table.style.background_gradient(
+ subset=df_html.columns == col, cmap=sns.light_palette(num_col_colors[i], as_cmap=True))
+ else:
+ continue
+ else:
+ report_table.style.background_gradient(
+ subset=df_html.columns == col, cmap=sns.light_palette(num_col_colors[i], as_cmap=True).reversed())
+
+ pie_charts = {}
+ for y in df_html.columns.intersection(['Predicted Interaction Probability', 'Actual Interaction Probability',
+ 'Predicted Binding Affinity', 'Actual Binding Affinity']):
+ pie_charts[y] = []
+ for k in [10, 30, 100]:
+ if k < len(df_html):
+ pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=k))
+ pie_charts[y].append(create_pie_chart(df_html, category=category, value=y, top_k=len(df_html)))
+
+ # Remove keys with empty values
+ pie_charts = {k: v for k, v in pie_charts.items() if any(v)}
+
+ pn_css = """
+ .tabulator {
+ font-family: Courier New !important;
+ font-weight: normal !important;
+ font-size: 12px !important;
+ }
+
+ .tabulator-cell {
+ overflow: visible !important;
+ }
+
+ .tabulator-cell:hover {
+ z-index: 1000 !important;
+ }
+
+ .tabulator-cell.tabulator-frozen:hover {
+ z-index: 1000 !important;
+ }
+
+ .image-zoom-viewer {
+ display: inline-block;
+ overflow: visible;
+ z-index: 1000;
+ }
+
+ .image-zoom-viewer::after {
+ content: "";
+ top: 0;
+ left: 0;
+ width: 100%;
+ height: 100%;
+ pointer-events: none;
+ }
+
+ .image-zoom-viewer:hover::after {
+ pointer-events: all;
+ }
+
+ /* When hovering over the container, scale its child (the SVG) */
+ .tabulator-cell:hover .image-zoom-viewer svg {
+ padding: 3px;
+ position: absolute;
+ background-color: rgba(250, 250, 250, 0.854);
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.618);
+ border-radius: 3px;
+ transform: scale(3); /* Scale up the SVG */
+ transition: transform 0.3s ease;
+ pointer-events: none; /* Prevents the SVG from blocking mouse interactions */
+ z-index: 1000;
+ }
+
+ .image-zoom-viewer svg {
+ display: block; /* SVG is a block-level element for proper scaling */
+ z-index: 1000;
+ }
+
+ .image-zoom-viewer:hover {
+ z-index: 1000;
+ }
+ """
+
+ pn.extension(raw_css=[pn_css])
+
+ template = pn.template.VanillaTemplate(
+ title=f'DeepSEQreen {job} Report',
+ sidebar=[],
+ favicon='deepseqreen.svg',
+ logo='deepseqreen.svg',
+ header_background='#F3F5F7',
+ header_color='#4372c4',
+ busy_indicator=None,
+ )
+
+ stats_pane = pn.Row()
+ if unique_df is not None:
+ unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width',
+ show_index=False, disabled=True,
+ frozen_columns=['Compound ID', 'Compound', 'Scaffold'])
+ # if pie_charts:
+ # unique_table.width = 640
+ stats_pane.append(pn.Column(f'### {unique_entity}', unique_table))
+ if pie_charts:
+ for score_name, figure_list in pie_charts.items():
+ stats_pane.append(
+ pn.Column(f'### {category} by Top {score_name}',
+ pn.Tabs(*figure_list, tabs_location='above'))
+ # pn.Card(pn.Row(v), title=f'{category} by Top {k}')
+ )
+
+ if stats_pane:
+ template.main.append(pn.Card(stats_pane,
+ sizing_mode='stretch_width', title='Summary Statistics', margin=10))
+
+ template.main.append(
+ pn.Card(report_table, title=f'{job} Results', # width=1200,
+ margin=10)
+ )
+
+ template.save(file, resources=INLINE)
+ return file
+
+
+def create_pie_chart(df, category, value, top_k):
+ if category not in df or value not in df:
+ return
+ top_k_df = df.nlargest(top_k, value)
+ category_counts = top_k_df[category].value_counts()
+ data = pd.DataFrame({category: category_counts.index, 'value': category_counts.values})
+
+ data['proportion'] = data['value'] / data['value'].sum()
+ # Merge rows with proportion less than 0.2% into one row
+ mask = data['proportion'] < 0.002
+ if any(mask):
+ merged_row = data[mask].sum()
+ merged_row[category] = '...'
+ data = pd.concat([data[~mask], pd.DataFrame(merged_row).T])
+ data['angle'] = data['proportion'] * 2 * pi
+
+ color_dict = {cat: color for cat, color in
+ zip(df[category].unique(),
+ (Category20c_20 * (len(df[category].unique()) // 20 + 1))[:len(df[category].unique())])}
+ color_dict['...'] = '#636363'
+ data['color'] = data[category].map(color_dict)
+
+ tooltips = [
+ (f"{category}", f"@{{{category}}}"),
+ ("Count", "@value"),
+ ("Percentage", "@proportion{0.0%}")
+ ]
+
+ if category == 'Scaffold SMILES':
+ data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
+ left_on='Scaffold SMILES', right_on='Scaffold SMILES')
+        tooltips.append(("Scaffold", "<div>@{Scaffold}{safe}</div>"))
+ p = figure(height=384, width=960, name=f"Top {top_k}" if top_k < len(df) else 'All', sizing_mode='stretch_height',
+ toolbar_location=None, tools="hover", tooltips=tooltips, x_range=(-0.4, 0.4))
+
+ def truncate_label(label, max_length=60):
+ return label if len(label) <= max_length else label[:max_length] + "..."
+
+ data['legend_field'] = data[category].apply(truncate_label)
+
+ p.add_layout(Legend(padding=0, margin=0), 'right')
+ p.wedge(x=0, y=1, radius=0.3,
+ start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
+ line_color="white", fill_color='color', legend_field='legend_field', source=data)
+
+ # Limit the number of legend items to 20 and add "..." if there are more than 20 items
+ if len(p.legend.items) > 20:
+ new_legend_items = p.legend.items[:20]
+ new_legend_items.append(LegendItem(label="..."))
+ p.legend.items = new_legend_items
+
+ p.legend.label_text_font_size = "10pt"
+ p.legend.label_text_font = "courier"
+ p.axis.axis_label = None
+ p.axis.visible = False
+ p.grid.grid_line_color = None
+ p.outline_line_width = 0
+ p.min_border = 0
+ p.margin = 0
+
+ return p
+
+
def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_tqdm=True)):
    """Annotate the prediction dataframe with the requested chemical filters and
    scores, then render the preview HTML report.

    Returns (report_html, annotated_df, csv_file_update, html_file_update);
    on any failure a warning toast is shown and four Nones are returned.
    """
    annotated = df.copy()
    try:
        # Filters first, then scores, matching the column order users expect.
        # NaN compounds (unparseable SMILES) pass through unchanged.
        for name in filter_list:
            annotated[name] = annotated['Compound'].swifter.progress_bar(
                desc=f"Calculating {name}").apply(
                lambda compound, fn=FILTER_MAP[name]: compound if pd.isna(compound) else fn(compound))

        for name in score_list:
            annotated[name] = annotated['Compound'].swifter.progress_bar(
                desc=f"Calculating {name}").apply(
                lambda compound, fn=SCORE_MAP[name]: compound if pd.isna(compound) else fn(compound))

        return (create_html_report(annotated, file=None, task=task), annotated,
                gr.File(visible=False), gr.File(visible=False))

    except Exception as e:
        gr.Warning(f'Failed to report results due to error: {str(e)}')
        return None, None, None, None
+
+
def wrap_text(text, line_length=60):
    """Hard-wrap a string to `line_length` columns for display.

    FASTA input (starting with '>') is handled per record: each header line is
    kept intact and its sequence is re-flowed to `line_length` characters.
    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    wrapper = textwrap.TextWrapper(width=line_length)

    if not text.startswith('>'):
        return wrapper.fill(text)

    wrapped_records = []
    for record in text.split('>'):
        if not record:
            continue
        header, _, body = record.partition('\n')
        # Collapse the sequence to one line before re-wrapping it.
        wrapped_records.append(f">{header}\n{wrapper.fill(body.replace(chr(10), ''))}")
    return '\n'.join(wrapped_records)
+
+
def unwrap_text(text):
    """Collapse a wrapped string back onto a single line (inverse of wrap_text).

    Bug fix: the original `text.strip.replece('\\n', '')` never called `strip`
    (missing parentheses) and misspelled `replace`, raising AttributeError on
    every call.
    """
    return text.strip().replace('\n', '')
+
+
def drug_library_from_sdf(sdf_path):
    """Load an SDF compound library into a DataFrame with SMILES in column 'X1'
    and the RDKit molecule (with fingerprint) in column 'Compound'."""
    library_df = PandasTools.LoadSDF(sdf_path,
                                     smilesName='X1',
                                     molColName='Compound',
                                     includeFingerprints=True)
    return library_df
+
+
def process_target_library_upload(library_upload):
    """Read an uploaded target library (CSV or FASTA) into a DataFrame and
    validate that it contains the required 'X2' (sequence) column."""
    # Extension -> loader dispatch; insertion order fixes the check order.
    loaders = {'.csv': pd.read_csv, '.fasta': target_library_from_fasta}
    for extension, loader in loaders.items():
        if library_upload.endswith(extension):
            library_df = loader(library_upload)
            break
    else:
        raise gr.Error('Currently only CSV and FASTA files are supported as target libraries.')
    validate_columns(library_df, ['X2'])
    return library_df
+
+
def process_drug_library_upload(library_upload):
    """Read an uploaded compound library (CSV or SDF) into a DataFrame and
    validate that it contains the required 'X1' (SMILES) column."""
    # Extension -> loader dispatch; insertion order fixes the check order.
    loaders = {'.csv': pd.read_csv, '.sdf': drug_library_from_sdf}
    for extension, loader in loaders.items():
        if library_upload.endswith(extension):
            library_df = loader(library_upload)
            break
    else:
        raise gr.Error('Currently only CSV and SDF files are supported as drug libraries.')
    validate_columns(library_df, ['X1'])
    return library_df
+
+
def target_library_from_fasta(fasta_path):
    """Parse a FASTA file into a DataFrame with record IDs in 'ID2' and
    sequences in 'X2'."""
    rows = [(record.id, str(record.seq)) for record in SeqIO.parse(fasta_path, "fasta")]
    return pd.DataFrame(rows, columns=['ID2', 'X2'])
+
+
# Shared Gradio theme for the whole app: blue (#4372c4) accents and borders on
# a light grey-blue (#dfe6f0) background with white input/panel fills.
theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
    background_fill_primary='#dfe6f0',
    background_fill_secondary='white',
    checkbox_label_background_fill='#dfe6f0',
    checkbox_label_background_fill_hover='#dfe6f0',
    checkbox_background_color='white',
    checkbox_border_color='#4372c4',
    border_color_primary='#4372c4',
    border_color_accent='#4372c4',
    button_primary_background_fill='#4372c4',
    button_primary_text_color='white',
    button_secondary_border_color='#4372c4',
    body_text_color='#4372c4',
    block_title_text_color='#4372c4',
    block_label_text_color='#4372c4',
    block_info_text_color='#505358',
    block_border_color=None,
    input_border_color='#4372c4',
    panel_border_color='#4372c4',
    input_background_fill='white',
    code_background_fill='white',
)
+
+with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48 * 3600)) as demo:
+ run_state = gr.State(value=False)
+ screen_flag = gr.State(value=False)
+ identify_flag = gr.State(value=False)
+ infer_flag = gr.State(value=False)
+
+ with gr.Tabs() as tabs:
+ with gr.TabItem(label='Drug Hit Screening', id='Drug Hit Screening'):
+ gr.Markdown('''
+ #
Drug Hit Screening
+
+
+ To predict interactions or binding affinities of a single target against a compound library.
+
+ ''')
+ with gr.Blocks() as screen_block:
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Enter (paste) a amino acid sequence below manually or upload a FASTA file. "
+ "If multiple entities are in the FASTA, only the first will be used. "
+ "Alternatively, enter a Uniprot ID or gene symbol with organism and click Query for "
+ "the sequence."
+ )
+ target_input_type = gr.Dropdown(
+ label='Step 1. Select Target Input Type and Input',
+ choices=['Sequence', 'UniProt ID', 'Gene symbol'],
+ info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
+ value='Sequence',
+ scale=4, interactive=True
+ )
+
+ with gr.Row():
+ target_id = gr.Textbox(show_label=False, visible=False,
+ interactive=True, scale=4,
+ info='Enter a UniProt ID and query.')
+ target_gene = gr.Textbox(
+ show_label=False, visible=False,
+ interactive=True, scale=4,
+ info='Enter a gene symbol and query.')
+ target_organism = gr.Textbox(
+ info='Organism scientific name (default: Homo sapiens).',
+ placeholder='Homo sapiens', show_label=False,
+ visible=False, interactive=True, scale=4, )
+ target_upload_btn = gr.UploadButton(label='Upload a FASTA File', type='binary',
+ visible=True, variant='primary',
+ size='lg')
+ target_paste_markdown = gr.Button(value='OR Paste Your Sequence Below',
+ variant='secondary')
+ target_query_btn = gr.Button(value='Query the Sequence', variant='primary',
+ visible=False, scale=4)
+ # with gr.Row():
+ # example_uniprot = gr.Button(value='Example: Q16539', elem_classes='example', visible=False)
+ # example_gene = gr.Button(value='Example: MAPK14', elem_classes='example', visible=False)
+ example_fasta = gr.Button(value='Example: MAPK14 (Q16539)', elem_classes='example')
+ target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
+ # with gr.Row():
+ # with gr.Column():
+ # with gr.Column():
+ # gr.File(label='Example FASTA file',
+ # value='data/examples/MAPK14.fasta', interactive=False)
+
+ with gr.Row():
+ with gr.Column(min_width=200):
+ HelpTip(
+ "Click Auto-detect to identify the protein family using sequence alignment. "
+ "This optional step allows applying a family-specific model instead of a all-family "
+ "model (general). "
+ "Manually select general if the alignment results are unsatisfactory."
+ )
+ drug_screen_target_family = gr.Dropdown(
+ choices=list(TARGET_FAMILY_MAP.keys()),
+ value='General',
+ label='Step 2. Select Target Family (Optional)', interactive=True)
+ target_family_detect_btn = gr.Button(value='OR Let Us Auto-Detect for You',
+ variant='primary')
+ with gr.Column(min_width=200):
+ HelpTip(
+ "Interaction prediction provides you binding probability score between the target of "
+ "interest and each compound in the library, "
+ "while affinity prediction directly estimates their binding strength measured using "
+ "IC50."
+ )
+ drug_screen_task = gr.Dropdown(
+ list(TASK_MAP.keys()),
+ label='Step 3. Select the Prediction Task',
+ value='Compound-Protein Interaction')
+ with gr.Column(min_width=200):
+ HelpTip(
+ "Select your preferred model, or click Recommend for the best-performing model based "
+ "on the selected task, family, and whether the target was trained. "
+ "Please refer to documentation for detailed benchmark results."
+ )
+ drug_screen_preset = gr.Dropdown(
+ list(PRESET_MAP.keys()),
+ label='Step 4. Select a Preset Model')
+ screen_preset_recommend_btn = gr.Button(
+ value='OR Let Us Recommend for You', variant='primary')
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Select a preset compound library (e.g., DrugBank). "
+ "Alternatively, upload a CSV file with a column named X1 containing compound SMILES, "
+ "or use an SDF file (Max. 10,000 compounds per task). Example CSV and SDF files are "
+ "provided below and can be downloaded by clicking the lower right corner."
+ )
+ drug_library = gr.Dropdown(
+ label='Step 5. Select a Preset Compound Library',
+ choices=list(DRUG_LIBRARY_MAP.keys()))
+ with gr.Row():
+ gr.File(label='Example SDF compound library',
+ value='data/examples/compound_library.sdf', interactive=False)
+ gr.File(label='Example CSV compound library',
+ value='data/examples/compound_library.csv', interactive=False)
+ drug_library_upload_btn = gr.UploadButton(
+ label='OR Upload Your Own Library', variant='primary')
+ drug_library_upload = gr.File(label='Custom compound library file', visible=False)
+ with gr.Row():
+ with gr.Column():
+ drug_screen_email = gr.Textbox(
+ label='Step 6. Input Your Email Address (Optional)',
+ info="Your email address will be used to notify you of the status of your job. "
+ "If you cannot receive the email, please check your spam/junk folder."
+ )
+
+ with gr.Row(visible=True):
+ with gr.Column():
+ # drug_screen_clr_btn = gr.ClearButton(size='lg')
+ drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
+ # TODO Modify the pd df directly with df['X2'] = target
+
+ screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
+
+ with gr.TabItem(label='Target Protein Identification', id='Target Protein Identification'):
+ gr.Markdown('''
+ #
Target Protein Identification
+
+
+ To predict interactions or binding affinities of a single compound against a protein library.
+
+ ''')
+ with gr.Blocks() as identify_block:
+ with gr.Column() as identify_page:
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Enter (paste) a compound SMILES below manually or upload a SDF file. "
+ "If multiple entities are in the SDF, only the first will be used. "
+ "SMILES can be obtained by searching for the compound of interest in databases such "
+ "as NCBI, PubChem and and ChEMBL."
+ )
+ compound_type = gr.Dropdown(
+ label='Step 1. Select Compound Input Type and Input',
+ choices=['SMILES', 'SDF'],
+ info='Enter (paste) an SMILES string or upload an SDF file to convert to SMILES.',
+ value='SMILES',
+ interactive=True)
+ compound_upload_btn = gr.UploadButton(label='OR Upload a SDF File', variant='primary',
+ type='binary', visible=False)
+
+ compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
+ example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
+
+ with gr.Row():
+ with gr.Column(visible=False):
+ HelpTip(
+ "By default, models trained on all protein families (general) will be applied. "
+ # "If the proteins in the target library of interest all belong to the same protein "
+ # "family, manually selecting the family is supported."
+ )
+ target_identify_target_family = gr.Dropdown(
+ choices=['General'], value='General',
+ label='Step 2. Select Target Family (Optional)')
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Select a preset target library (e.g., ChEMBL33_human_proteins). "
+ "Alternatively, upload a CSV file with a column named X2 containing target protein "
+ "sequences, or use an FASTA file (Max. 10,000 targets per task). "
+ "Example CSV and SDF files are provided below "
+ "and can be downloaded by clicking the lower right corner."
+ )
+ target_library = gr.Dropdown(label='Step 3. Select a Preset Target Library',
+ choices=list(TARGET_LIBRARY_MAP.keys()))
+ with gr.Row():
+ gr.File(label='Example FASTA target library',
+ value='data/examples/target_library.fasta', interactive=False)
+ gr.File(label='Example CSV target library',
+ value='data/examples/target_library.csv', interactive=False)
+ target_library_upload_btn = gr.UploadButton(
+ label='OR Upload Your Own Library', variant='primary')
+ target_library_upload = gr.File(label='Custom target library file', visible=False)
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Interaction prediction provides you binding probability score between the target of "
+ "interest and each compound in the library, "
+ "while affinity prediction directly estimates their binding strength measured using "
+ "IC50."
+ )
+ target_identify_task = gr.Dropdown(
+ list(TASK_MAP.keys()),
+ label='Step 4. Select the Prediction Task You Want to Conduct',
+ value='Compound-Protein Interaction')
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Select your preferred model, or click Recommend for the best-performing model based "
+ "on the selected task, family, and whether the compound was trained. "
+ "Please refer to documentation for detailed benchamrk results."
+ )
+ target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()),
+ label='Step 5. Select a Preset Model')
+ identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
+ variant='primary')
+
+ with gr.Row():
+ with gr.Column():
+ target_identify_email = gr.Textbox(
+ label='Step 6. Input Your Email Address (Optional)',
+ info="Your email address will be used to notify you of the status of your job. "
+ "If you cannot receive the email, please check your spam/junk folder."
+ )
+
+ with gr.Row(visible=True):
+ # target_identify_clr_btn = gr.ClearButton(size='lg')
+ target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
+ size='lg')
+
+ identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
+
+ with gr.TabItem(label='Interaction Pair Inference', id='Interaction Pair Inference'):
+ gr.Markdown('''
+ #
Interaction Pair Inference
+
+
To predict interactions or binding affinities between up to
+ 10,000 paired compound-protein data.
+ ''')
+ with gr.Blocks() as infer_block:
+ HelpTip(
+ "A custom interation pair dataset can be a CSV file with 2 required columns "
+ "(X1 for smiles and X2 for sequences) "
+ "and optionally 2 ID columns (ID1 for compound ID and ID2 for target ID), "
+ "or generated from a FASTA file containing multiple "
+ "sequences and a SDF file containing multiple compounds. "
+ "Currently, a maximum of 10,000 pairs is supported, "
+ "which means that the size of CSV file or "
+ "the product of the two library sizes should not exceed 10,000."
+ )
+ infer_type = gr.Dropdown(
+ choices=['Upload a CSV file containing paired compound-protein data',
+ 'Upload a compound library and a target library'],
+ label='Step 1. Select Pair Input Type and Input',
+ value='Upload a CSV file containing paired compound-protein data')
+ with gr.Column() as pair_upload:
+ gr.File(label="Example CSV dataset",
+ value="data/examples/interaction_pair_inference.csv",
+ interactive=False)
+ with gr.Row():
+ infer_csv_prompt = gr.Button(
+ value="Upload Your Own Dataset Below",
+ variant='secondary')
+ with gr.Column():
+ infer_pair = gr.File(
+ label='Upload CSV File Containing Paired Records',
+ file_count="single", type='filepath', visible=True)
+ with gr.Column(visible=False) as pair_generate:
+ with gr.Row():
+ gr.File(label='Example SDF compound library',
+ value='data/examples/compound_library.sdf', interactive=False)
+ gr.File(label='Example FASTA target library',
+ value='data/examples/target_library.fasta', interactive=False)
+ with gr.Row():
+ gr.File(label='Example CSV compound library',
+ value='data/examples/compound_library.csv', interactive=False)
+ gr.File(label='Example CSV target library',
+ value='data/examples/target_library.csv', interactive=False)
+ with gr.Row():
+ infer_library_prompt = gr.Button(
+ value="Upload Your Own Libraries Below",
+ visible=False, variant='secondary')
+ with gr.Row():
+ infer_drug = gr.File(label='Upload SDF/CSV File Containing Multiple Compounds',
+ file_count="single", type='filepath')
+ infer_target = gr.File(label='Upload FASTA/CSV File Containing Multiple Targets',
+ file_count="single", type='filepath')
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "By default, models trained on all protein families (general) will be applied. "
+ "If the proteins in the target library of interest "
+ "all belong to the same protein family, manually selecting the family is supported."
+ )
+ pair_infer_target_family = gr.Dropdown(choices=list(TARGET_FAMILY_MAP.keys()),
+ value='General',
+ label='Step 2. Select Target Family (Optional)')
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip(
+ "Interaction prediction provides you binding probability score "
+ "between the target of interest and each compound in the library, "
+ "while affinity prediction directly estimates their binding strength "
+ "measured using IC50."
+ )
+ pair_infer_task = gr.Dropdown(
+ list(TASK_MAP.keys()),
+ label='Step 3. Select the Prediction Task You Want to Conduct',
+ value='Compound-Protein Interaction')
+
+ with gr.Row():
+ with gr.Column():
+ HelpTip("Select your preferred model. "
+ "Please refer to documentation for detailed benchmark results."
+ )
+ pair_infer_preset = gr.Dropdown(list(PRESET_MAP.keys()),
+ label='Step 4. Select a Preset Model')
+ # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
+ # variant='primary')
+
+ with gr.Row():
+ pair_infer_email = gr.Textbox(
+ label='Step 5. Input Your Email Address (Optional)',
+ info="Your email address will be used to notify you of the status of your job. "
+ "If you cannot receive the email, please check your spam/junk folder.")
+
+ with gr.Row(visible=True):
+ # pair_infer_clr_btn = gr.ClearButton(size='lg')
+ pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
+
+ infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
+
+ with gr.TabItem(label='Chemical Property Report', id='Chemical Property Report'):
+ gr.Markdown('''
+ #
Chemical Property Report
+
+ To compute chemical properties for the predictions of Drug Hit Screening,
+ Target Protein Identification, and Interaction Pair Inference.
+
+ You may also upload your own dataset using a CSV file containing
+ one required column `X1` for compound SMILES.
+
+ The page shows only a preview report displaying at most 30 records
+ (with top predicted CPI/CPA if reporting results from a prediction job).
+
+ Please first `Preview` the report, then `Generate` and download a CSV report
+ or an interactive HTML report below if you wish to access the full report.
+ ''')
+ with gr.Blocks() as report_block:
+ with gr.Row():
+ with gr.Column():
+ file_for_report = gr.File(interactive=True, type='filepath')
+ report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, value=None,
+ label='Specify the Task for the Labels in the Upload Dataset')
+ raw_df = gr.State(value=pd.DataFrame())
+ report_df = gr.State(value=pd.DataFrame())
+ scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Scores')
+ filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Filters')
+
+ with gr.Row():
+ # clear_btn = gr.ClearButton(size='lg')
+ analyze_btn = gr.Button('Preview Top 30 Records', variant='primary', size='lg',
+ interactive=False)
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ html_report = gr.HTML() # label='Results', visible=True)
+ ranking_pie_chart = gr.Plot(visible=False)
+
+ with gr.Row():
+ with gr.Column():
+ csv_generate = gr.Button(value='Generate CSV Report',
+ interactive=False, variant='primary')
+ csv_download_file = gr.File(label='Download CSV Report', visible=False)
+ with gr.Column():
+ html_generate = gr.Button(value='Generate HTML Report',
+ interactive=False, variant='primary')
+ html_download_file = gr.File(label='Download HTML Report', visible=False)
+
+ with gr.TabItem(label='Prediction Status Lookup', id='Prediction Status Lookup'):
+ gr.Markdown('''
+ #
Prediction Status Lookup
+
+ To check the status of an in-progress or historical job using the job ID and retrieve the predictions
+ if the job has completed. Note that predictions are only kept for 48 hours upon job completion.
+
+ You will be redirected to Chemical Property Report for carrying out further analysis and
+ generating the full report if the job is done.
+ ''')
+ with gr.Blocks() as lookup_block:
+ with gr.Column():
+ pred_lookup_id = gr.Textbox(
+ label='Input Your Job ID', placeholder='e.g., e9dfd149-3f5c-48a6-b797-c27d027611ac',
+ info="Your job ID is a UUID4 string that you receive after submitting a job on the "
+ "page or in the email notification.")
+ pred_lookup_btn = gr.Button(value='Lookup the Job Status', variant='primary', visible=True)
+ pred_lookup_stop_btn = gr.Button(value='Stop Tracking', variant='stop', visible=False)
+ pred_lookup_status = gr.Markdown()
+
+ # retrieve_email = gr.Textbox(label='Step 2. Input Your Email Address', placeholder='e.g.,
+
+
def target_input_type_select(input_type):
    """Show/hide the target input widgets to match the chosen input type,
    clearing every text field in the process.

    Returns updates for: [input-type dropdown, FASTA upload button, UniProt ID
    box, gene box, organism box, query button, FASTA code box, paste prompt].
    """
    # Per-type visibility flags:
    # (upload_btn, uniprot_id, gene, organism, query_btn, paste_prompt)
    visibility = {
        'UniProt ID':  (False, True, False, False, True, False),
        'Gene symbol': (False, False, True, True, True, False),
        'Sequence':    (True, False, False, False, False, True),
    }.get(input_type)
    if visibility is None:
        return None
    upload, uid, gene, organism, query, paste = visibility
    dropdown_info = ('Enter (paste) a FASTA string below manually or upload a FASTA file.'
                     if input_type == 'Sequence' else '')
    return [gr.Dropdown(info=dropdown_info),
            gr.UploadButton(visible=upload),
            gr.Textbox(visible=uid, value=''),
            gr.Textbox(visible=gene, value=''),
            gr.Textbox(visible=organism, value=''),
            gr.Button(visible=query),
            gr.Code(value=''),
            gr.Button(visible=paste)]
+
+
# Swap the visible target-input widgets whenever the input-type dropdown changes.
target_input_type.select(
    fn=target_input_type_select,
    inputs=target_input_type,
    outputs=[
        target_input_type, target_upload_btn,
        target_id, target_gene, target_organism, target_query_btn,
        target_fasta, target_paste_markdown
    ],
    show_progress='hidden'
)
+
+
+ def uniprot_query(input_type, uid, gene, organism='Human'):
+ fasta_seq = ''
+
+ match input_type:
+ case 'UniProt ID':
+ query = f"{uid.strip()}.fasta"
+ case 'Gene symbol':
+ organism = organism if organism else 'Human'
+ query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
+
+ try:
+ fasta = session.get(UNIPROT_ENDPOINT.format(query=query))
+ fasta.raise_for_status()
+ fasta_seq = fasta.text
+ except Exception as e:
+ raise gr.Warning(f"Failed to query FASTA from UniProt database due to {str(e)}")
+ finally:
+ return fasta_seq
+
+
def process_fasta_upload(fasta_upload):
    """Decode an uploaded FASTA file (raw bytes) to text.

    Shows a warning toast and returns '' when the bytes cannot be decoded.
    """
    try:
        return fasta_upload.decode()
    except Exception as e:
        gr.Warning(f"Please upload a valid FASTA file. Error: {str(e)}")
        return ''
+
+
# Wire the FASTA upload button and the UniProt query button to the FASTA box.
target_upload_btn.upload(fn=process_fasta_upload, inputs=target_upload_btn, outputs=target_fasta)
target_query_btn.click(uniprot_query,
                       inputs=[target_input_type, target_id, target_gene, target_organism],
                       outputs=target_fasta)
+
+
def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
    """Guess the query target's protein family by local BLASTP-style alignment
    against the ChEMBL33 single-protein reference library, returning a Dropdown
    update with the best-scoring family and the reasoning."""
    aligner = PairwiseAligner(scoring='blastp', mode='local')
    reference_df = pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv')

    reference_df['score'] = reference_df['X2'].swifter.progress_bar(
        desc="Detecting protein family of the target...").apply(
        lambda candidate: aligner.align(process_target_fasta(fasta), candidate).score)

    best = reference_df.loc[reference_df['score'].idxmax()]
    return gr.Dropdown(value=best['protein_family'].capitalize(),
                       info=f"Reason: Best BLASTP score ({best['score']}) "
                            f"with {best['ID2']} from family {best['protein_family']}")
+
+
# Auto-detect the protein family from the FASTA input.
target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)

# target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden')
# Re-wrap the FASTA text for display when the box loses focus.
target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress='hidden')

# Register an uploaded compound library file and add its name to the dropdown choices.
drug_library_upload_btn.upload(fn=lambda x: [
    x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
+
+
def example_fill(input_type):
    """Fill the target inputs with the example protein MAPK14 (UniProt Q16539).

    Keys are the Gradio components captured from the enclosing Blocks scope,
    so this return value is a component-update mapping.
    """
    return {target_id: 'Q16539',
            target_gene: 'MAPK14',
            target_organism: 'Human',
            target_fasta: """
>sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3
MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ
SIIHAKRTYRELRLLKHMKHENVIGLLDVFTPARSLEEFNDVYLVTHLMGADLNNIVKCQ
KLTDDHVQFLIYQILRGLKYIHSADIIHRDLKPSNLAVNEDCELKILDFGLARHTDDEMT
GYVATRWYRAPEIMLNWMHYNQTVDIWSVGCIMAELLTGRTLFPGTDHIDQLKLILRLVG
TPGAELLKKISSESARNYIQSLTQMPKMNFANVFIGANPLAVDLLEKMLVLDSDKRITAA
QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
"""}
+
+
+ example_fasta.click(fn=example_fill, inputs=target_input_type, outputs=[
+ target_id, target_gene, target_organism, target_fasta], show_progress='hidden')
+
+
def screen_recommend_model(fasta, family, task):
    """Recommend the best preset model for Drug Hit Screening.

    Looks up benchmark metrics for the selected task and returns Dropdown
    updates for the model preset and (possibly reset) target family, choosing
    the model with the best task metric for the query target's seen/unseen
    training scenario.
    """
    task = TASK_MAP[task]
    score = TASK_METRIC_MAP[task]
    benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv')

    if not fasta:
        gr.Warning('Please enter a valid FASTA for model recommendation.')
        return [None, family]

    def target_scenario(split):
        # Classify the query target as seen/unseen against a training split.
        # Deduplicates the three identical read-and-check passages of the
        # original implementation.
        seen_targets = pd.read_csv(
            f'data/benchmarks/seen_targets/{split}_{task.lower()}_random_split.csv')
        if process_target_fasta(fasta) in seen_targets['X2'].values:
            return "Seen Target"
        return "Unseen Target"

    if family == 'General':
        filtered_df = benchmark_df[(benchmark_df['Family'] == 'All Families')
                                   & (benchmark_df['Scenario'] == target_scenario('all_families_full'))
                                   & (benchmark_df['Type'] == 'General')]
    else:
        # Compare the general model (scenario judged on the all-family split)
        # with the family-specific model (scenario judged on the family split).
        filtered_df_general = benchmark_df[(benchmark_df['Family'] == family)
                                           & (benchmark_df['Scenario'] == target_scenario('all_families_full'))
                                           & (benchmark_df['Type'] == 'General')]
        filtered_df_family = benchmark_df[(benchmark_df['Family'] == family)
                                          & (benchmark_df['Scenario'] == target_scenario(TARGET_FAMILY_MAP[family]))
                                          & (benchmark_df['Type'] == 'Family')]
        filtered_df = pd.concat([filtered_df_general, filtered_df_family])

    row = filtered_df.loc[filtered_df[score].idxmax()]

    return {drug_screen_preset:
                gr.Dropdown(value=row['Model'],
                            info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
                                 f"model with the best {score} ({float(row[score]):.3f}) "
                                 f"in the {row['Scenario']} scenario on {row['Family']}."),
            drug_screen_target_family:
                gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
+
+
# Recommend the best screening model for the current FASTA/family/task selection.
screen_preset_recommend_btn.click(fn=screen_recommend_model,
                                  inputs=[target_fasta, drug_screen_target_family, drug_screen_task],
                                  outputs=[drug_screen_preset, drug_screen_target_family], show_progress='hidden')
+
+
def compound_input_type_select(input_type):
    """Show the SDF upload button only when the compound input type is 'SDF'."""
    if input_type not in ('SMILES', 'SDF'):
        return None
    return gr.Button(visible=input_type == 'SDF')
+
+
# Show/hide the SDF upload button to match the compound input type.
compound_type.select(fn=compound_input_type_select,
                     inputs=compound_type, outputs=compound_upload_btn, show_progress='hidden')
+
+
def compound_upload_process(input_type, input_upload):
    """Convert an uploaded compound file (raw bytes) to a SMILES string.

    For 'SMILES' the bytes are decoded verbatim; for 'SDF' only the first
    molecule is converted. On failure a warning toast is shown and '' returned.
    """
    try:
        if input_type == 'SMILES':
            return input_upload.decode()
        if input_type == 'SDF':
            first_mol = next(Chem.ForwardSDMolSupplier(io.BytesIO(input_upload)))
            return Chem.MolToSmiles(first_mol)
    except Exception as e:
        gr.Warning(f"Please upload a valid {input_type} file. Error: {str(e)}")
    return ''
+
+
# Convert an uploaded compound file to SMILES and display it in the code box.
compound_upload_btn.upload(fn=compound_upload_process,
                           inputs=[compound_type, compound_upload_btn],
                           outputs=compound_smiles)

# Example compound: aspirin.
example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=compound_smiles, show_progress='hidden')

# Register an uploaded target library file and add its name to the dropdown choices.
target_library_upload_btn.upload(fn=lambda x: [
    x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
+
+
def identify_recommend_model(smiles, task):
    """Recommend the best preset model for Target Protein Identification.

    Picks the general-type model with the best task metric in the Seen/Unseen
    Compound scenario, depending on whether the canonicalized `smiles`
    appeared in training. Returns a gr.Dropdown update, or None when no
    SMILES was given.
    """
    task = TASK_MAP[task]
    score = TASK_METRIC_MAP[task]
    benchmark_df = pd.read_csv(f'data/benchmarks/{task}_test_metrics.csv')

    if not smiles:
        gr.Warning('Please enter a valid SMILES for model recommendation.')
        return None

    seen_drugs = pd.read_csv(
        f'data/benchmarks/seen_drugs/all_families_full_{task.lower()}_random_split.csv')
    if rdkit_canonicalize(smiles) in seen_drugs['X1'].values:
        scenario = "Seen Compound"
    else:
        scenario = "Unseen Compound"

    filtered_df = benchmark_df[(benchmark_df['Family'] == 'All Families')
                               & (benchmark_df['Scenario'] == scenario)
                               & (benchmark_df['Type'] == 'General')]

    row = filtered_df.loc[filtered_df[score].idxmax()]

    # Bug fix: the format spec was ':3f' (field width 3, default 6 decimals),
    # not ':.3f' (three decimal places) as used by screen_recommend_model.
    return gr.Dropdown(value=row['Model'],
                       info=f"Reason: {scenario} in training; choosing the model "
                            f"with the best {score} ({float(row[score]):.3f}) "
                            f"in the {scenario} scenario.")
+
+
# Recommend the best identification model for the current SMILES/task selection.
identify_preset_recommend_btn.click(fn=identify_recommend_model,
                                    inputs=[compound_smiles, target_identify_task],
                                    outputs=target_identify_preset, show_progress='hidden')
+
+
+ def infer_type_change(upload_type):
+ match upload_type:
+ case "Upload a compound library and a target library":
+ return {
+ pair_upload: gr.Column(visible=False),
+ pair_generate: gr.Column(visible=True),
+ infer_pair: None,
+ infer_drug: None,
+ infer_target: None,
+ infer_csv_prompt: gr.Button(visible=False),
+ infer_library_prompt: gr.Button(visible=True),
+ }
+ match upload_type:
+ case "Upload a CSV file containing paired compound-protein data":
+ return {
+ pair_upload: gr.Column(visible=True),
+ pair_generate: gr.Column(visible=False),
+ infer_pair: None,
+ infer_drug: None,
+ infer_target: None,
+ infer_csv_prompt: gr.Button(visible=True),
+ infer_library_prompt: gr.Button(visible=False),
+ }
+
+
# Toggle the pair-CSV vs two-library upload panels when the input type changes.
infer_type.select(fn=infer_type_change, inputs=infer_type,
                  outputs=[pair_upload, pair_generate, infer_pair, infer_drug, infer_target,
                           infer_csv_prompt, infer_library_prompt])
+
+
def common_input_validate(state, preset, email, request):
    """Validation shared by all three job-submission flows.

    Checks that a model preset is chosen, normalizes the (optional) email,
    and rejects the submission when this session or this user already has a
    running job. Returns (state, preset, normalized_email); raises gr.Error
    on any violation.
    """
    if not preset:
        raise gr.Error('Please select a model.')

    if email:
        try:
            email = validate_email(email, check_deliverability=False).normalized
        except EmailNotValidError as e:
            raise gr.Error(f"Invalid email address: {str(e)}.")

    if state:
        raise gr.Error(f"You already have a running prediction job (ID: {state['id']}) under this session. "
                       "Please wait for it to complete before submitting another job.")

    running_job_msg = check_user_running_job(email, request)
    if running_job_msg:
        raise gr.Error(running_job_msg)

    return state, preset, email
+
+
def common_job_initiate(job_id, job_type, email, request, task):
    """Persist a freshly validated job in the DB and return its record.

    Records the requester's IP and cookies alongside the job metadata;
    end/expiry times and the error slot start empty and are filled in later
    by the prediction pipeline.
    """
    gr.Info('Finished input validation. Initiating the prediction job... '
            'You will be redirected to Prediction Status Lookup after the job is submitted.')
    job_info = dict(
        id=job_id,
        type=job_type,
        task=task,
        status='RUNNING',
        email=email,
        ip=str(request.client.host),
        cookies=dict(request.cookies),
        start_time=time(),
        end_time=None,
        expiry_time=None,
        error=None,
    )
    db.insert(job_info)
    return job_info
+
+
def drug_screen_validate(fasta, library, library_upload, preset, task, email, state,
                         request: gr.Request, progress=gr.Progress(track_tqdm=True)):
    """Validate Drug Hit Screening inputs and stage the job's input CSV.

    Checks the target FASTA and the chosen/uploaded compound library,
    pairs every compound with the single target, writes the dataset to the
    server data dir, and registers the job. Returns updates for
    screen_data_for_predict and run_state; raises gr.Error on any failure.
    """
    state, preset, email = common_input_validate(state, preset, email, request)

    fasta = process_target_fasta(fasta)
    err = validate_seq_str(fasta, FASTA_PAT)
    if err:
        raise gr.Error(f'Found error(s) in your Target FASTA input: {err}')
    if not library:
        raise gr.Error('Please select or upload a compound library.')
    if library in DRUG_LIBRARY_MAP:
        screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
    else:
        screen_df = process_drug_library_upload(library_upload)
    # `>` (was `>=`): a library of exactly DATASET_MAX_LEN records is within
    # the "allowed maximum" that the error message promises.
    if len(screen_df) > DATASET_MAX_LEN:
        raise gr.Error(f'The uploaded compound library has more records '
                       f'than the allowed maximum {DATASET_MAX_LEN}.')

    # Screening pairs every library compound (X1) against this one target.
    screen_df['X2'] = fasta

    job_id = str(uuid4())
    temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
    screen_df.to_csv(temp_file, index=False)
    if temp_file.is_file():
        job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
        return {screen_data_for_predict: str(temp_file),
                run_state: job_info}
    else:
        raise gr.Error('System failed to create temporary files. Please try again later.')
+
+
def target_identify_validate(smiles, library, library_upload, preset, task, email, state,
                             request: gr.Request, progress=gr.Progress(track_tqdm=True)):
    """Validate Target Protein Identification inputs and stage the dataset.

    Checks the compound SMILES and the chosen/uploaded target library,
    pairs the single compound with every target, writes the dataset to the
    server data dir, and registers the job. Returns updates for
    identify_data_for_predict and run_state; raises gr.Error on any failure.
    """
    state, preset, email = common_input_validate(state, preset, email, request)

    smiles = smiles.strip()
    err = validate_seq_str(smiles, SMILES_PAT)
    if err:
        raise gr.Error(f'Found error(s) in your Compound SMILES input: {err}')
    if not library:
        raise gr.Error('Please select or upload a target library.')
    if library in TARGET_LIBRARY_MAP:
        identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
    else:
        identify_df = process_target_library_upload(library_upload)
    # `>` (was `>=`): a library of exactly DATASET_MAX_LEN records is within
    # the "allowed maximum" that the error message promises.
    if len(identify_df) > DATASET_MAX_LEN:
        raise gr.Error(f'The uploaded target library has more records '
                       f'than the allowed maximum {DATASET_MAX_LEN}.')
    # Identification pairs this one compound (X1) against every target (X2).
    identify_df['X1'] = smiles

    job_id = str(uuid4())
    temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
    identify_df.to_csv(temp_file, index=False)
    if temp_file.is_file():
        job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
        return {identify_data_for_predict: str(temp_file),
                run_state: job_info}
    else:
        raise gr.Error('System failed to create temporary files. Please try again later.')
+
+
def pair_infer_validate(drug_target_pair_upload, drug_upload, target_upload, preset, task, email, state,
                        request: gr.Request, progress=gr.Progress(track_tqdm=True)):
    """Validate Interaction Pair Inference inputs and stage the dataset.

    Accepts either a ready-made compound-protein pair CSV, or a compound
    library plus a target library whose cross product is generated here.
    Returns updates for infer_data_for_predict and run_state; raises
    gr.Error on any invalid input.
    """
    state, preset, email = common_input_validate(state, preset, email, request)

    job_id = str(uuid4())
    if drug_target_pair_upload:
        infer_df = pd.read_csv(drug_target_pair_upload)
        validate_columns(infer_df, ['X1', 'X2'])

        infer_df['X1_ERR'] = infer_df['X1'].swifter.progress_bar(desc="Validating SMILES...").apply(
            validate_seq_str, regex=SMILES_PAT)
        if not infer_df['X1_ERR'].isna().all():
            # gr.Error (was ValueError) so the message reaches the UI like
            # every other validation failure in this app.
            raise gr.Error(
                f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")

        infer_df['X2_ERR'] = infer_df['X2'].swifter.progress_bar(desc="Validating FASTA...").apply(
            validate_seq_str, regex=FASTA_PAT)
        if not infer_df['X2_ERR'].isna().all():
            raise gr.Error(
                f"Encountered invalid FASTA:\n{infer_df[~infer_df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")

        temp_file = Path(drug_target_pair_upload).resolve()
        needs_write = False

    elif drug_upload and target_upload:
        drug_df = process_drug_library_upload(drug_upload)
        target_df = process_target_library_upload(target_upload)

        drug_df.drop_duplicates(subset=['X1'], inplace=True)
        target_df.drop_duplicates(subset=['X2'], inplace=True)

        # Cross product of unique compounds and targets, re-joined with each
        # library's remaining metadata columns.
        infer_df = pd.DataFrame(list(itertools.product(drug_df['X1'], target_df['X2'])),
                                columns=['X1', 'X2'])
        infer_df = infer_df.merge(drug_df, on='X1').merge(target_df, on='X2')

        temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
        needs_write = True

    else:
        raise gr.Error('Should upload a compound-protein pair dataset, or '
                       'upload both a compound library and a target library.')

    # Size limit now guards BOTH paths (the original only checked the
    # generated cross product, letting oversized uploads through); `>` so a
    # dataset of exactly DATASET_MAX_LEN records is still accepted.
    if len(infer_df) > DATASET_MAX_LEN:
        raise gr.Error(f'The uploaded/generated compound-protein pair dataset has more records '
                       f'than the allowed maximum {DATASET_MAX_LEN}.')

    if needs_write:
        infer_df.to_csv(temp_file, index=False)

    if temp_file.is_file():
        job_info = common_job_initiate(job_id, 'Interaction Pair Inference', email, request, task)
        return {infer_data_for_predict: str(temp_file),
                run_state: job_info}
    else:
        raise gr.Error('System failed to create temporary files. Please try again later.')
+
+
# --- Drug Hit Screening submission pipeline ---
# The three .success() handlers below all fire on this one validation event.
# `drug_screen_lookup` is kept as a handle so the stop button can cancel the
# status-polling chain.
drug_screen_click = drug_screen_btn.click(
    fn=drug_screen_validate,
    inputs=[target_fasta, drug_library, drug_library_upload, drug_screen_preset, drug_screen_task,
            drug_screen_email, run_state],
    outputs=[screen_data_for_predict, run_state]
)

# On successful validation: switch to the status tab, copy the new job id
# into the lookup box, then start polling the job status.
drug_screen_lookup = drug_screen_click.success(
    fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
).then(
    fn=lambda x: x['id'], inputs=[run_state], outputs=[pred_lookup_id]
).then(
    fn=lookup_job,
    inputs=[pred_lookup_id],
    outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
    show_progress='hidden'
)

# Fire-and-forget email notification that the job was submitted.
drug_screen_click.success(
    fn=send_email,
    inputs=[run_state]
)

# Kick off the actual prediction run.
drug_screen_click.success(
    fn=submit_predict,
    inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
            drug_screen_target_family, run_state, ],
    outputs=[run_state, ]
)
+
# --- Target Protein Identification submission pipeline ---
# Mirrors the drug-screening wiring: validate, then on success redirect to
# the status tab, notify by email, and launch the prediction.
target_identify_click = target_identify_btn.click(
    fn=target_identify_validate,
    inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
            target_identify_email, run_state],
    outputs=[identify_data_for_predict, run_state]
)

# Kept as a handle so the stop button can cancel this polling chain.
target_identify_lookup = target_identify_click.success(
    fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
).then(
    fn=lambda x: x['id'], inputs=[run_state], outputs=[pred_lookup_id]
).then(
    fn=lookup_job,
    inputs=[pred_lookup_id],
    outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
    show_progress='hidden'
)

# Fire-and-forget email notification that the job was submitted.
target_identify_click.success(
    fn=send_email,
    inputs=[run_state]
)

# Kick off the actual prediction run.
target_identify_click.success(
    fn=submit_predict,
    inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
            target_identify_target_family, run_state, ],  # , target_identify_email],
    outputs=[run_state, ]
)
+
# --- Interaction Pair Inference submission pipeline ---
# queue=False: validation runs immediately instead of entering the queue.
pair_infer_click = pair_infer_btn.click(
    fn=pair_infer_validate,
    inputs=[infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_task,
            pair_infer_email, run_state],
    outputs=[infer_data_for_predict, run_state],
    queue=False
)

# Kept as a handle so the stop button can cancel this polling chain.
pair_infer_lookup = pair_infer_click.success(
    fn=lambda: gr.Tabs(selected='Prediction Status Lookup'), outputs=[tabs],
).then(
    fn=lambda x: x['id'], inputs=[run_state], outputs=[pred_lookup_id]
).then(
    fn=lookup_job,
    inputs=[pred_lookup_id],
    outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
    show_progress='hidden'
)

# Fire-and-forget email notification that the job was submitted.
pair_infer_click.success(
    fn=send_email,
    inputs=[run_state]
)

# Kick off the actual prediction run.
pair_infer_click.success(
    fn=submit_predict,
    inputs=[infer_data_for_predict, pair_infer_task, pair_infer_preset,
            pair_infer_target_family, run_state, ],  # , pair_infer_email],
    outputs=[run_state, ]
)
+
# Manual job-status lookup; kept as a handle so the stop button can cancel it.
pred_lookup_click = pred_lookup_btn.click(
    fn=lookup_job,
    inputs=[pred_lookup_id],
    outputs=[pred_lookup_status, pred_lookup_btn, pred_lookup_stop_btn, tabs, file_for_report],
    show_progress='hidden'
)

# Stop button: restore the lookup/stop button pair and cancel every polling
# chain (manual lookup plus the three auto-started post-submission lookups).
pred_lookup_stop_btn.click(
    fn=lambda: [gr.Button(visible=True), gr.Button(visible=False)],
    outputs=[pred_lookup_btn, pred_lookup_stop_btn],
    cancels=[pred_lookup_click, drug_screen_lookup, target_identify_lookup, pair_infer_lookup],
    queue=False
)
+
+
def inquire_task(df):
    """Decide whether the user must be asked which task an uploaded report
    dataset belongs to, based on its label column.

    A 'Y' (actual) or 'Y^' (predicted) column triggers the task dropdown
    and disables analysis/export until the user answers; otherwise the
    analysis controls are enabled immediately.
    """
    cols = df.columns
    label = None
    if 'Y' in cols:
        label = 'actual CPI/CPA labels (`Y`)'
    elif 'Y^' in cols:
        label = 'predicted CPI/CPA labels (`Y^`)'

    if label is None:
        # No label column found: nothing to ask, unlock the controls.
        return {analyze_btn: gr.Button(interactive=True),
                csv_generate: gr.Button(interactive=True),
                html_generate: gr.Button(interactive=True)}

    return {report_task: gr.Dropdown(visible=True,
                                     info=f'Found {label} in your uploaded dataset. '
                                          'Is it compound-protein interaction or binding affinity?'),
            html_report: '',
            analyze_btn: gr.Button(interactive=False),
            csv_generate: gr.Button(interactive=False),
            html_generate: gr.Button(interactive=False)}
+
+
# Re-parse the report dataframe whenever the file value changes.
report_df_change = file_for_report.change(
    fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task]
)

# A fresh upload supersedes any in-flight change handler (cancels it), then
# asks the user which task the dataset's label column belongs to.
file_for_report.upload(
    fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task],
    cancels=[report_df_change]
).then(
    fn=inquire_task, inputs=[raw_df],
    outputs=[report_task, html_report, analyze_btn, csv_generate, html_generate],
)

# Clearing the file hides every report control and resets their values.
file_for_report.clear(
    fn=lambda: [gr.Button(visible=False)] * 2 +
               [gr.File(visible=False, value=None)] * 2 +
               [gr.Dropdown(visible=False, value=None),
                gr.HTML(visible=False),
                gr.Button(interactive=False)],
    outputs=[
        csv_generate, html_generate, csv_download_file, html_download_file, report_task, html_report, analyze_btn
    ])

# Run the analysis; on success re-enable both export buttons.
analyze_btn.click(fn=submit_report, inputs=[raw_df, scores, filters, report_task], outputs=[
    html_report, report_df, csv_download_file, html_download_file
]).success(fn=lambda: [gr.Button(interactive=True)] * 2,
           outputs=[csv_generate, html_generate])

# Choosing a task answer unlocks the analyze button.
report_task.select(fn=lambda: gr.Button(interactive=True),
                   outputs=analyze_btn)
+
+
def create_csv_report_file(df, file_report, progress=gr.Progress(track_tqdm=True)):
    """Export the analysis dataframe as a downloadable CSV report.

    Drops the 'Compound' and 'Scaffold' columns (presumably rendered
    structure depictions, not CSV-friendly data — confirm against the
    report builder) before writing. Returns a gr.File pointing at the
    written report, or None after a user-facing warning on failure.
    """
    try:
        now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename = f"/data/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
        # errors='ignore' (was a plain drop): a dataset lacking these columns
        # previously raised KeyError and silently killed the CSV export.
        df.drop(columns=['Compound', 'Scaffold'], errors='ignore').to_csv(filename, index=False)

        return gr.File(filename)
    except Exception as e:
        gr.Warning(f"Failed to generate CSV due to error: {str(e)}")
        return None
+
+
def create_html_report_file(df, file_report, progress=gr.Progress(track_tqdm=True)):
    """Render the analysis dataframe into a downloadable HTML report.

    Returns a visible gr.File for the generated report, or None after a
    user-facing warning if generation fails.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        report_path = f"/data/{Path(file_report.name).stem}_DeepSEQreen_report_{timestamp}.html"
        create_html_report(df, report_path)
        return gr.File(report_path, visible=True)
    except Exception as e:
        gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
        return None
+
+
# A fresh HTML report re-exposes both export buttons.
html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
# Each export flow: hide its button, reveal the file slot, then generate.
csv_generate.click(
    lambda: [gr.Button(visible=False), gr.File(visible=True)], outputs=[csv_generate, csv_download_file],
).then(fn=create_csv_report_file, inputs=[report_df, file_for_report],
       outputs=csv_download_file, show_progress='full')
html_generate.click(
    lambda: [gr.Button(visible=False), gr.File(visible=True)], outputs=[html_generate, html_download_file],
).then(fn=create_html_report_file, inputs=[report_df, file_for_report],
       outputs=html_download_file, show_progress='full')
- server.sendmail(email_addr, receiver, msg.as_string())
- server.quit()
if __name__ == "__main__":
    # Per-tab request queues. The three prediction tabs are capped at 2
    # concurrent jobs while reporting allows 10 — presumably because
    # predictions are far heavier; confirm before retuning these limits.
    screen_block.queue(default_concurrency_limit=2, max_size=10)
    identify_block.queue(default_concurrency_limit=2, max_size=10)
    infer_block.queue(default_concurrency_limit=2, max_size=10)
    report_block.queue(default_concurrency_limit=10, max_size=10)
-send_email('xinran.qin19@student.xjtlu.edu.cn', {'id': 'a1b2c3d', 'type': 'Drug Hit Screening', 'status': 'RUNNING', 'start_time': '2021-10-10 10:00:00', 'end_time': 'TBD', 'expiry_time': 'TBD', 'error': 'TBD'})
\ No newline at end of file
+ demo.launch(show_api=False)