libokj committed on
Commit
f789b0f
·
1 Parent(s): ae135d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +617 -326
app.py CHANGED
@@ -11,30 +11,31 @@ import pathlib
11
  from pathlib import Path
12
  import sys
13
 
14
- from Bio import AlignIO, SeqIO
 
15
  # from email_validator import validate_email
16
  import gradio as gr
17
  import hydra
18
  import pandas as pd
19
  import plotly.express as px
20
  import requests
 
21
  from requests.adapters import HTTPAdapter, Retry
22
  from rdkit import Chem
23
- from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools
24
  from rdkit.Chem.Scaffolds import MurckoScaffold
25
  import seaborn as sns
26
 
27
  import swifter
28
  from tqdm.auto import tqdm
29
 
30
- from deepscreen.data.dti import rdkit_canonicalize, validate_seq_str, FASTA_PAT, SMILES_PAT
31
  from deepscreen.predict import predict
32
 
33
  sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
34
  import sascorer
35
 
36
  ROOT = Path.cwd()
37
- DATA_PATH = Path("./") # Path("/data")
38
 
39
  DF_FOR_REPORT = pd.DataFrame()
40
 
@@ -56,6 +57,7 @@ SESSION.mount('https://', ADAPTER)
56
  # SCHEDULER = BackgroundScheduler()
57
 
58
  UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
 
59
  CSS = """
60
  .help-tip {
61
  position: absolute;
@@ -63,11 +65,11 @@ CSS = """
63
  top: 0px;
64
  right: 0px;
65
  text-align: center;
66
- background-color: #29b6f6;
67
- border-radius: 50%;
68
  width: 24px;
69
  height: 24px;
70
- font-size: 12px;
71
  line-height: 26px;
72
  cursor: default;
73
  transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
@@ -75,13 +77,13 @@ CSS = """
75
 
76
  .help-tip:hover {
77
  cursor: pointer;
78
- background-color: #ccc;
79
  }
80
 
81
  .help-tip:before {
82
  content: '?';
83
  font-weight: 700;
84
- color: #fff;
85
  z-index: 100;
86
  }
87
 
@@ -89,13 +91,13 @@ CSS = """
89
  visibility: hidden;
90
  opacity: 0;
91
  text-align: left;
92
- background-color: #039be5;
93
  padding: 20px;
94
  width: 300px;
95
  position: absolute;
96
  border-radius: 4px;
97
  right: -4px;
98
- color: #fff;
99
  font-size: 13px;
100
  line-height: normal;
101
  transform: scale(0.7);
@@ -117,7 +119,7 @@ CSS = """
117
  width: 0;
118
  height: 0;
119
  border: 6px solid transparent;
120
- border-bottom-color: #039be5;
121
  right: 10px;
122
  top: -12px;
123
  }
@@ -131,16 +133,6 @@ CSS = """
131
  left: 0;
132
  }
133
 
134
- .help-tip a {
135
- color: #fff;
136
- font-weight: 700;
137
- }
138
-
139
- .help-tip a:hover, .help-tip a:focus {
140
- color: #fff;
141
- text-decoration: underline;
142
- }
143
-
144
  .upload_button {
145
  background-color: #008000;
146
  }
@@ -174,46 +166,131 @@ class HelpTip:
174
 
175
 
176
  def sa_score(row):
177
- return sascorer.calculateScore((row['Compound']))
178
 
179
 
180
  def mw(row):
181
- return Chem.Descriptors.MolWt((row['Compound']))
 
 
 
 
182
 
183
 
184
  def hbd(row):
185
- return Lipinski.NumHDonors((row['Compound']))
186
 
187
 
188
  def hba(row):
189
- return Lipinski.NumHAcceptors((row['Compound']))
190
 
191
 
192
  def logp(row):
193
- return Crippen.MolLogP((row['Compound']))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
 
196
  SCORE_MAP = {
197
  'SAscore': sa_score,
198
- 'RAscore': None, # https://github.com/reymond-group/RAscore
199
- 'SCScore': None, # https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622
200
- 'LogP': logp, # https://www.rdkit.org/docs/source/rdkit.Chem.Crippen.html
201
- 'MW': mw, # https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors.html
202
- 'HBD': hbd, # https://www.rdkit.org/docs/source/rdkit.Chem.Lipinski.html
203
- 'HBA': hba, # https://www.rdkit.org/docs/source/rdkit.Chem.Lipinski.html
204
- 'TopoPSA': None, # http://mordred-descriptor.github.io/documentation/master/api/mordred.TopoPSA.html
205
  }
206
 
207
  FILTER_MAP = {
208
- 'PAINS filter': None,
209
- "Lipinski's rule of five": None, # https://gist.github.com/strets123/fdc4db6d450b66345f46
210
- 'ADMET filter': None,
211
- 'TCL filter': None
 
 
212
  }
213
 
214
  TASK_MAP = {
215
- 'Drug-target interaction': 'binary',
216
- 'Drug-target binding affinity': 'regression',
217
  }
218
 
219
  PRESET_MAP = {
@@ -231,22 +308,21 @@ PRESET_MAP = {
231
 
232
  TARGET_FAMILY_MAP = {
233
  'General': 'general',
234
- 'Kinase': 'kinases',
235
- 'Non-kinase enzyme': 'non-kinase_enzymes',
236
- 'Membrane receptor': 'membrane_receptors',
237
- 'Nuclear receptor': 'nuclear_receptors',
238
- 'Ion channel': 'ion_channels',
239
- 'Other protein targets': 'other_protein_targets',
240
  }
241
 
242
  TARGET_LIBRARY_MAP = {
243
- # 'STITCH': 'stitch.csv',
244
- 'ChEMBL33 (all species)': 'ChEMBL33_all_spe_single_prot_info.csv',
245
- 'DrugBank (Human)': 'drugbank_human_py_annot.csv',
246
  }
247
 
248
  DRUG_LIBRARY_MAP = {
249
- # 'ChEMBL': 'chembl.csv',
250
  'DrugBank (Human)': 'drugbank_human_py_annot.csv',
251
  }
252
 
@@ -257,21 +333,28 @@ MODE_LIST = [
257
  ]
258
 
259
  COLUMN_ALIASES = {
260
- 'X1': 'Drug SMILES',
261
  'X2': 'Target FASTA',
262
- 'ID1': 'Drug ID',
263
  'ID2': 'Target ID',
264
  }
265
 
266
- URL = "https://ciddr-lab.ac.cn/deepseqreen"
267
-
268
 
269
  def validate_columns(df, mandatory_cols):
270
  missing_cols = [col for col in mandatory_cols if col not in df.columns]
271
  if missing_cols:
272
  error_message = (f"The following mandatory columns are missing "
273
  f"in the uploaded dataset: {str(['X1', 'X2']).strip('[]')}.")
274
- raise gr.Error(error_message)
 
 
 
 
 
 
 
 
 
275
 
276
 
277
  def send_email(receiver, msg):
@@ -280,40 +363,48 @@ def send_email(receiver, msg):
280
 
281
  def submit_predict(predict_filepath, task, preset, target_family, flag, progress=gr.Progress(track_tqdm=True)):
282
  if flag:
283
- job_id = flag
284
- global COLUMN_ALIASES
285
- task = TASK_MAP[task]
286
- preset = PRESET_MAP[preset]
287
- target_family = TARGET_FAMILY_MAP[target_family]
288
- # email_hash = hashlib.sha256(email.encode()).hexdigest()
289
- COLUMN_ALIASES = COLUMN_ALIASES | {
290
- 'Y': 'Actual interaction' if task == 'binary' else 'Actual affinity',
291
- 'Y^': 'Predicted interaction' if task == 'binary' else 'Predicted affinity'
292
- }
293
-
294
- # target_family_list = [target_family]
295
- # for family in target_family_list:
296
-
297
- # try:
298
- prediction_df = pd.DataFrame()
299
- with hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference"):
300
- cfg = hydra.compose(
301
- config_name="webserver_inference",
302
- overrides=[f"task={task}",
303
- f"preset={preset}",
304
- f"ckpt_path=resources/checkpoints/{preset}-{task}-{target_family}.ckpt",
305
- f"data.data_file='{str(predict_filepath)}'"])
306
-
307
- predictions, _ = predict(cfg)
308
- predictions = [pd.DataFrame(prediction) for prediction in predictions]
309
- prediction_df = pd.concat([prediction_df, pd.concat(predictions, ignore_index=True)])
310
-
311
- predictions_file = f'{job_id}_predictions.csv'
312
- prediction_df.to_csv(predictions_file)
313
-
314
- return [gr.Markdown(visible=True),
315
- gr.File(predictions_file),
316
- gr.State(False)]
 
 
 
 
 
 
 
 
317
  #
318
  # except Exception as e:
319
  # raise gr.Error(str(e))
@@ -405,18 +496,18 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
405
  elif 'Y' in DF_FOR_REPORT.columns:
406
  value = 'Y'
407
 
408
- if value:
409
- if DF_FOR_REPORT['X1'].nunique() > 1 >= DF_FOR_REPORT['X2'].nunique():
410
- pie_chart = create_pie_chart(DF_FOR_REPORT, category='Scaffold SMILES', value=value, top_k=100)
411
- elif DF_FOR_REPORT['X2'].nunique() > 1 >= DF_FOR_REPORT['X1'].nunique():
412
- pie_chart = create_pie_chart(DF_FOR_REPORT, category='Target family', value=value, top_k=100)
413
 
414
  return create_html_report(DF_FOR_REPORT), pie_chart
415
  else:
416
  return gr.HTML(''), gr.Plot()
417
 
418
 
419
- def create_html_report(df, progress=gr.Progress(track_tqdm=True)):
420
  cols_left = ['ID2', 'Y', 'Y^', 'ID1', 'Compound', 'Scaffold', 'Scaffold SMILES', ]
421
  cols_right = ['X1', 'X2']
422
  cols_left = [col for col in cols_left if col in df.columns]
@@ -435,8 +526,12 @@ def create_html_report(df, progress=gr.Progress(track_tqdm=True)):
435
  # Return the DataFrame as HTML
436
  PandasTools.RenderImagesInAllDataFrames(images=True)
437
 
438
- html = df.to_html()
439
- return f'<div style="overflow:auto; height: 500px;">{html}</div>'
 
 
 
 
440
  # return gr.HTML(pn.widgets.Tabulator(df).embed())
441
 
442
 
@@ -495,45 +590,46 @@ def submit_report(score_list, filter_list, progress=gr.Progress(track_tqdm=True)
495
  df = DF_FOR_REPORT.copy()
496
  try:
497
  for filter_name in filter_list:
498
- pass
 
499
 
500
  for score_name in score_list:
501
  df[score_name] = df.swifter.progress_bar(desc=f"Calculating {score_name}").apply(
502
  SCORE_MAP[score_name], axis=1)
503
 
504
- pie_chart = None
505
- value = None
506
- if 'Y^' in df.columns:
507
- value = 'Y^'
508
- elif 'Y' in df.columns:
509
- value = 'Y'
510
-
511
- if value:
512
- if df['X1'].nunique() > 1 >= df['X2'].nunique():
513
- pie_chart = create_pie_chart(df, category='Scaffold SMILES', value=value, top_k=100)
514
- elif df['X2'].nunique() > 1 >= df['X1'].nunique():
515
- pie_chart = create_pie_chart(df, category='Target famiy', value=value, top_k=100)
516
 
517
- return create_html_report(df), pie_chart
518
 
519
  except Exception as e:
520
  raise gr.Error(str(e))
521
 
522
 
523
- def check_job_status(job_id):
524
- job_lock = DATA_PATH / f"{job_id}.lock"
525
- job_file = DATA_PATH / f"{job_id}.csv"
526
- if job_lock.is_file():
527
- return {gr.Markdown(f"Your job ({job_id}) is still running... "
528
- f"You may stay on this page or come back later to retrieve the results "
529
- f"Once you receive our email notification."),
530
- None,
531
- None
532
- }
533
- elif job_file.is_file():
534
- return {gr.Markdown(f"Your job ({job_id}) is done! Redirecting you to generate reports..."),
535
- gr.Tabs(selected=3),
536
- gr.File(str(job_lock))}
537
 
538
 
539
  def wrap_text(text, line_length=60):
@@ -602,60 +698,70 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
602
  with gr.Blocks() as screen_block:
603
  with gr.Column() as screen_page:
604
  with gr.Row():
605
- with gr.Column(scale=4, variant='panel'):
606
- target_fasta = gr.Code(label='Target sequence FASTA',
607
- interactive=True, lines=5)
608
- example_target = gr.Button(value='Example: Human MAPK14', elem_id='example')
609
  with gr.Row():
610
- with gr.Column(scale=1):
611
- with gr.Group():
612
- with gr.Row():
613
- target_input_type = gr.Radio(label='Target input type',
614
- choices=['Sequence', 'UniProt ID', 'Gene symbol'],
615
- value='Sequence')
616
- target_query = gr.Textbox(label='UniProt ID/Accession',
617
- visible=False, interactive=True)
618
- target_upload_btn = gr.UploadButton(label='Upload a FASTA file',
619
- type='binary',
620
- visible=True, variant='primary',
621
- size='lg', elem_classes="upload_button")
622
- target_query_btn = gr.Button(value='Query the sequence', variant='primary',
623
- elem_classes='upload_button', visible=False)
624
-
625
- with gr.Column(scale=1):
626
- with gr.Row():
627
- with gr.Group():
628
- drug_screen_target_family = gr.Dropdown(
629
- choices=list(TARGET_FAMILY_MAP.keys()),
630
- value='General',
631
- label='Target family', interactive=True)
632
- # with gr.Column(scale=1, min_width=24):
633
- auto_detect_btn = gr.Button(value='Auto-detect', variant='primary')
634
- HelpTip(
635
- "Target amino acid sequence in the FASTA format. Alternatively, you may use a "
636
- "UniProt ID/accession to query UniProt database for the sequence of your target"
637
- "of interest. You can also search on databases like UniProt, RCSB PDB, "
638
- "NCBI Protein for the FASTA string representing your target of interest. If "
639
- "the input FASTA contains multiple entities, only the first one will be used."
640
- )
641
-
642
- with gr.Column(variant='panel'):
643
- with gr.Group():
644
- drug_library = gr.Radio(label='Drug library',
645
- choices=list(DRUG_LIBRARY_MAP.keys()) + ['Upload a drug library'])
646
- drug_library_upload = gr.File(label='Custom drug library file', visible=True)
647
-
648
- with gr.Row(variant='panel'):
649
- drug_screen_task = gr.Radio(list(TASK_MAP.keys()), label='Task',
650
- value='Drug-target interaction')
651
-
652
- with gr.Column(scale=2):
653
- with gr.Group():
654
- drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Model')
655
- recommend_btn = gr.Button(value='Recommend a model', variant='primary')
 
 
 
 
 
 
 
 
 
 
 
 
656
  HelpTip("We recommend the appropriate model for your use case based on model performance "
657
- "in drug-target interaction or binding affinity prediction "
658
- "benchmarked on different target families and real-world data scenarios.")
 
659
 
660
  # drug_screen_email = gr.Textbox(
661
  # label='Email (optional)',
@@ -663,8 +769,8 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
663
  # )
664
 
665
  with gr.Row(visible=True):
666
- drug_screen_clr_btn = gr.ClearButton()
667
- drug_screen_btn = gr.Button(value='SCREEN', variant='primary')
668
  # TODO Modify the pd df directly with df['X2'] = target
669
 
670
  screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
@@ -685,37 +791,45 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
685
  with gr.Blocks() as identify_block:
686
  with gr.Column() as identify_page:
687
  with gr.Row():
688
- with gr.Group():
689
- drug_type = gr.Dropdown(label='Drug input type',
690
- choices=['SMILES', 'SDF'],
691
- value='SMILES',
692
- scale=1,
693
- interactive=True)
694
- drug_upload = gr.UploadButton(label='⤒ Upload a file')
695
- drug_smiles = gr.Code(label='Drug canonical SMILES', interactive=True, scale=5, lines=5)
696
- with gr.Column(scale=1):
697
  HelpTip(
698
- """Drug molecule in the SMILES format. You may search on databases like
699
- NCBI PubChem, ChEMBL, and DrugBank for the SMILES strings
700
- representing your drugs of interest.
 
701
  """
702
  )
703
- example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
 
 
704
 
705
- with gr.Column(variant='panel'):
706
- with gr.Group():
707
- target_library = gr.Radio(label='Target library',
708
- choices=list(TARGET_LIBRARY_MAP.keys()) + ['Upload a target library'])
709
- target_library_upload = gr.File(label='Custom target library file', visible=True)
710
 
711
- with gr.Row(visible=True):
712
- target_identify_task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
713
- HelpTip("Choose a preset model for making the predictions.")
714
- target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
715
- HelpTip("Choose the protein family of your target.")
716
- target_identify_target_family = gr.Dropdown(choices=['General'],
717
- value='General',
718
- label='Target family')
 
 
 
 
 
 
 
 
 
719
 
720
  # with gr.Row():
721
  # target_identify_email = gr.Textbox(
@@ -724,8 +838,8 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
724
  # )
725
 
726
  with gr.Row(visible=True):
727
- target_identify_clr_btn = gr.ClearButton()
728
- target_identify_btn = gr.Button(value='IDENTIFY', variant='primary')
729
 
730
  identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
731
  identify_waiting = gr.Markdown(f"Your job is running... It might take a few minutes."
@@ -763,8 +877,8 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
763
  # )
764
 
765
  with gr.Row(visible=True):
766
- pair_infer_clr_btn = gr.ClearButton()
767
- pair_infer_btn = gr.Button(value='INFER', variant='primary')
768
 
769
  infer_waiting = gr.Markdown(f"Your job is running... It might take a few minutes."
770
  f"When it's done, you will be redirected to the report page. "
@@ -783,7 +897,7 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
783
  ''')
784
  with gr.Row():
785
  file_for_report = gr.File(interactive=True, type='filepath')
786
- # df_original = gr.Dataframe(type="pandas", interactive=False, visible=False)
787
  scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Scores')
788
  filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Filters')
789
 
@@ -797,68 +911,105 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
797
  ranking_pie_chart = gr.Plot(visible=False)
798
 
799
  with gr.Row():
800
- csv_download_btn = gr.Button('Download report (HTML)', variant='primary')
801
- html_download_btn = gr.Button('Download raw data (CSV)', variant='primary')
 
 
 
 
802
 
803
 
804
  def target_input_type_select(input_type):
805
  match input_type:
806
  case 'UniProt ID':
807
- return [gr.UploadButton(visible=False),
808
- gr.Textbox(visible=True, label='UniProt ID/accession', info=None, value=''),
809
- gr.Button(visible=True)]
 
 
 
 
810
  case 'Gene symbol':
811
- return [gr.UploadButton(visible=False),
812
- gr.Textbox(visible=True, label='Gene symbol/name', info='Organism: human', value=''),
813
- gr.Button(visible=True)]
 
 
 
 
814
  case 'Sequence':
815
- return [gr.UploadButton(visible=True),
816
- gr.Textbox(visible=False), gr.Button(visible=False)]
817
-
818
-
819
- target_input_type.select(fn=target_input_type_select,
820
- inputs=target_input_type, outputs=[target_upload_btn, target_query, target_query_btn],
821
- show_progress=False)
 
 
 
 
 
 
 
 
 
 
822
 
823
 
824
- def uniprot_query(query, input_type):
825
  fasta_seq = ''
826
- query = query.strip()
827
 
828
  match input_type:
829
  case 'UniProt ID':
830
- query = f"{query.strip()}.fasta"
831
  case 'Gene symbol':
832
- query = f'search?query=organism_id:9606+AND+gene:{query}&format=fasta'
833
 
834
  try:
835
  fasta = SESSION.get(UNIPROT_ENDPOINT.format(query=query))
836
  fasta.raise_for_status()
837
  fasta_seq = fasta.text
838
  except Exception as e:
839
- raise gr.Warning(f"Failed to query FASTA from UniProt due to {str(e)}")
840
  finally:
841
  return fasta_seq
842
 
843
 
844
  target_upload_btn.upload(fn=lambda x: x.decode(), inputs=target_upload_btn, outputs=target_fasta)
845
- target_query_btn.click(uniprot_query, inputs=[target_query, target_input_type], outputs=target_fasta)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
 
847
  target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress=False)
848
  target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress=False)
849
- drug_smiles.focus(fn=wrap_text, inputs=drug_smiles, outputs=drug_smiles, show_progress=False)
850
- drug_smiles.blur(fn=wrap_text, inputs=drug_smiles, outputs=drug_smiles, show_progress=False)
 
 
851
 
852
 
853
  def example_fill(input_type):
854
- match input_type:
855
- case 'UniProt ID':
856
- query = 'Q16539'
857
- case 'Gene symbol':
858
- query = 'MAPK14'
859
- case _:
860
- query = ''
861
- return {target_query: query,
862
  target_fasta: """
863
  >sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3
864
  MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ
@@ -870,101 +1021,218 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
870
  """}
871
 
872
 
873
- example_target.click(fn=example_fill, inputs=target_input_type,
874
- outputs=[target_query, target_fasta], show_progress=False)
875
- example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=drug_smiles, show_progress=False)
876
 
877
 
878
- def drug_screen_validate(fasta, library, library_upload, state):
879
- if not state:
880
- def process_target_fasta(sequence):
881
- lines = sequence.strip().split("\n")
882
- if lines[0].startswith(">"):
883
- lines = lines[1:]
884
- return ''.join(lines).split(">")[0]
885
-
886
- fasta = process_target_fasta(fasta)
887
- err = validate_seq_str(fasta, FASTA_PAT)
888
- if err:
889
- raise gr.Error(f'Found error(s) in your target fasta input: {err}')
890
-
891
- if library in DRUG_LIBRARY_MAP.keys():
892
- screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
893
- else:
894
- screen_df = pd.read_csv(library_upload)
895
- validate_columns(screen_df, ['X1'])
896
-
897
- screen_df['X2'] = fasta
898
-
899
- job_id = uuid4()
900
- temp_file = Path(f'{job_id}_temp.csv').resolve()
901
- screen_df.to_csv(temp_file)
902
- if temp_file.is_file():
903
- return {screen_data_for_predict: str(temp_file),
904
- screen_flag: job_id,
905
- run_state: job_id}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
907
  else:
908
  gr.Warning('You have another prediction job '
909
  '(drug hit screening, target protein identification, or interation pair inference) '
910
  'running in the session right now. '
911
  'Please submit another job when your current job has finished.')
912
- return {screen_flag: False}
 
913
 
914
- def target_identify_validate(smiles, library, library_upload, state):
915
- if not state:
916
- err = validate_seq_str(smiles, SMILES_PAT)
917
- if err:
918
- raise gr.Error(f'Found error(s) in your compound SMILES input: {err}')
919
-
920
- if library in TARGET_LIBRARY_MAP.keys():
921
- identify_df = pd.read_csv(TARGET_LIBRARY_MAP['target_library'])
922
- else:
923
- identify_df = pd.read_csv(library_upload)
924
- validate_columns(identify_df, ['X2'])
925
-
926
- identify_df['X1'] = smiles
927
-
928
- job_id = uuid4()
929
- temp_file = Path(f'{job_id}_temp.csv').resolve()
930
- identify_df.to_csv(temp_file)
931
- if temp_file.is_file():
932
- return {identify_data_for_predict: str(temp_file),
933
- identify_flag: gr.State(job_id),
934
- run_state: gr.State(job_id)}
935
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
  else:
937
  gr.Warning('You have another prediction job '
938
  '(drug hit screening, target protein identification, or interation pair inference) '
939
  'running in the session right now. '
940
  'Please submit another job when your current job has finished.')
941
- return {identify_flag: False}
 
 
942
 
943
 
944
- def pair_infer_validate(drug_target_pair_upload, run_state):
945
- if not run_state:
946
- df = pd.read_csv(drug_target_pair_upload)
947
- validate_columns(df, ['X1', 'X2'])
948
- df['X1_ERR'] = df['X1'].swifter.apply(
949
- validate_seq_str, regex=SMILES_PAT)
950
- df['X2_ERR'] = df['X2'].swifter.apply(
951
- validate_seq_str, regex=FASTA_PAT)
952
-
953
- if not df['X1_ERR'].isna().all():
954
- raise gr.Error(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
955
- if not df['X2_ERR'].isna().all():
956
- raise gr.Error(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
957
-
958
- job_id = uuid4()
959
- return {infer_flag: gr.State(job_id),
960
- run_state: gr.State(job_id)}
 
 
 
 
 
 
961
 
962
  else:
963
  gr.Warning('You have another prediction job '
964
  '(drug hit screening, target protein identification, or interation pair inference) '
965
  'running in the session right now. '
966
  'Please submit another job when your current job has finished.')
967
- return {infer_flag: False}
 
968
 
969
 
970
  drug_screen_btn.click(
@@ -980,25 +1248,25 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
980
  drug_screen_target_family, screen_flag], # , drug_screen_email],
981
  outputs=[file_for_report, run_state]
982
  ).then(
983
- fn=lambda: [gr.Column(visible=True), gr.Markdown(visible=False)],
984
- outputs=[screen_page, screen_waiting]
985
  )
986
 
987
  target_identify_btn.click(
988
  fn=target_identify_validate,
989
- inputs=[drug_smiles, target_library, target_library_upload, run_state], # , drug_screen_email],
990
  outputs=[identify_data_for_predict, identify_flag, run_state]
991
  ).then(
992
- fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
993
- outputs=[identify_page, identify_waiting]
994
  ).then(
995
  fn=submit_predict,
996
  inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
997
  target_identify_target_family, identify_flag], # , target_identify_email],
998
  outputs=[file_for_report, run_state]
999
  ).then(
1000
- fn=lambda: [gr.Column(visible=True), gr.Markdown(visible=False)],
1001
- outputs=[identify_page, identify_waiting]
1002
  )
1003
 
1004
  pair_infer_btn.click(
@@ -1020,9 +1288,36 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1020
 
1021
  # TODO background job from these 3 pipelines to update file_for_report
1022
 
1023
- file_for_report.change(fn=update_df, inputs=file_for_report, outputs=[html_report, ranking_pie_chart])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1024
 
1025
- analyze_btn.click(fn=submit_report, inputs=[scores, filters], outputs=[html_report, ranking_pie_chart])
 
1026
 
1027
  # screen_waiting.change(fn=check_job_status, inputs=run_state, outputs=[pair_waiting, tabs, file_for_report],
1028
  # every=5)
@@ -1043,9 +1338,5 @@ if __name__ == "__main__":
1043
  # SCHEDULER.start()
1044
 
1045
  demo.launch(
1046
- # debug=True,
1047
  show_api=False,
1048
- # favicon_path=,
1049
- # inline=False
1050
- debug=True
1051
  )
 
11
  from pathlib import Path
12
  import sys
13
 
14
+ import numpy as np
15
+ from Bio.Align import PairwiseAligner
16
  # from email_validator import validate_email
17
  import gradio as gr
18
  import hydra
19
  import pandas as pd
20
  import plotly.express as px
21
  import requests
22
+ from rdkit.Chem.rdMolDescriptors import CalcNumRotatableBonds, CalcNumHeavyAtoms, CalcNumAtoms
23
  from requests.adapters import HTTPAdapter, Retry
24
  from rdkit import Chem
25
+ from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools, AllChem
26
  from rdkit.Chem.Scaffolds import MurckoScaffold
27
  import seaborn as sns
28
 
29
  import swifter
30
  from tqdm.auto import tqdm
31
 
32
+ from deepscreen.data.dti import validate_seq_str, FASTA_PAT, SMILES_PAT
33
  from deepscreen.predict import predict
34
 
35
  sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
36
  import sascorer
37
 
38
  ROOT = Path.cwd()
 
39
 
40
  DF_FOR_REPORT = pd.DataFrame()
41
 
 
57
  # SCHEDULER = BackgroundScheduler()
58
 
59
  UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
60
+
61
  CSS = """
62
  .help-tip {
63
  position: absolute;
 
65
  top: 0px;
66
  right: 0px;
67
  text-align: center;
68
+ border-radius: 40%;
69
+ /* border: 2px solid darkred; background-color: #8B0000;*/
70
  width: 24px;
71
  height: 24px;
72
+ font-size: 16px;
73
  line-height: 26px;
74
  cursor: default;
75
  transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
 
77
 
78
  .help-tip:hover {
79
  cursor: pointer;
80
+ /*background-color: #ccc;*/
81
  }
82
 
83
  .help-tip:before {
84
  content: '?';
85
  font-weight: 700;
86
+ color: #8B0000;
87
  z-index: 100;
88
  }
89
 
 
91
  visibility: hidden;
92
  opacity: 0;
93
  text-align: left;
94
+ background-color: #EFDDE3;
95
  padding: 20px;
96
  width: 300px;
97
  position: absolute;
98
  border-radius: 4px;
99
  right: -4px;
100
+ color: #494F5A;
101
  font-size: 13px;
102
  line-height: normal;
103
  transform: scale(0.7);
 
119
  width: 0;
120
  height: 0;
121
  border: 6px solid transparent;
122
+ border-bottom-color: #EFDDE3;
123
  right: 10px;
124
  top: -12px;
125
  }
 
133
  left: 0;
134
  }
135
 
 
 
 
 
 
 
 
 
 
 
136
  .upload_button {
137
  background-color: #008000;
138
  }
 
166
 
167
 
168
  def sa_score(row):
169
+ return sascorer.calculateScore(row['Compound'])
170
 
171
 
172
  def mw(row):
173
+ return Chem.Descriptors.MolWt(row['Compound'])
174
+
175
+
176
def mr(row):
    """Return the molar refractivity (``Crippen.MolMR``) of ``row['Compound']``."""
    compound = row['Compound']
    return Crippen.MolMR(compound)
178
 
179
 
180
  def hbd(row):
181
+ return Lipinski.NumHDonors(row['Compound'])
182
 
183
 
184
  def hba(row):
185
+ return Lipinski.NumHAcceptors(row['Compound'])
186
 
187
 
188
  def logp(row):
189
+ return Crippen.MolLogP(row['Compound'])
190
+
191
+
192
def atom(row):
    """Return the atom count (``CalcNumAtoms``) of ``row['Compound']``."""
    compound = row['Compound']
    return CalcNumAtoms(compound)
194
+
195
+
196
def heavy_atom(row):
    """Return the heavy-atom count (``CalcNumHeavyAtoms``) of ``row['Compound']``."""
    compound = row['Compound']
    return CalcNumHeavyAtoms(compound)
198
+
199
+
200
def rotatable_bond(row):
    """Return the rotatable-bond count (``CalcNumRotatableBonds``) of ``row['Compound']``."""
    compound = row['Compound']
    return CalcNumRotatableBonds(compound)
202
+
203
+
204
def lipinski(row):
    """Check a compound row against Lipinski's rules.

    The row passes (``True``) when none of these limits is exceeded:
        Hydrogen bond donors <= 5
        Hydrogen bond acceptors <= 10
        Molecular weight <= 500 daltons
        logP <= 5

    Descriptors are evaluated lazily in the order listed, so evaluation stops
    at the first violated rule.
    """
    violates_rule = (
        hbd(row) > 5
        or hba(row) > 10
        or mw(row) > 500
        or logp(row) > 5
    )
    return not violates_rule
222
+
223
+
224
def reos(row):
    """Rapid Elimination Of Swill (REOS) filter.

    The row passes (``True``) when every property lies inside its range,
    bounds included:
        Molecular weight between 200 and 500
        LogP between -5.0 and +5.0
        H-bond donor count between 0 and 5
        H-bond acceptor count between 0 and 10
        Formal charge between -2 and +2
        Rotatable bond count between 0 and 8
        Heavy atom count between 15 and 50

    Checks are evaluated lazily in the order listed, so evaluation stops at
    the first property that falls outside its range.
    """
    # Inclusive comparisons: with strict bounds a compound with e.g. zero
    # H-bond donors or zero rotatable bonds was rejected although the
    # documented ranges start at 0.
    return (
        200 <= mw(row) <= 500
        and -5.0 <= logp(row) <= 5.0
        and 0 <= hbd(row) <= 5
        and 0 <= hba(row) <= 10
        # The formal-charge criterion is part of the documented filter but was
        # never evaluated before.
        and -2 <= Chem.GetFormalCharge(row['Compound']) <= 2
        and 0 <= rotatable_bond(row) <= 8
        and 15 <= heavy_atom(row) <= 50
    )
249
+
250
+
251
def ghose(row):
    """Ghose drug-like filter.

    The row passes (``True``) when every property lies inside its range,
    bounds included:
        Molecular weight between 160 and 480
        LogP between -0.4 and +5.6
        Atom count between 20 and 70
        Molar refractivity between 40 and 130

    Checks are evaluated lazily in the order listed, so evaluation stops at
    the first property that falls outside its range.
    """
    # Inclusive comparisons: strict bounds excluded the documented endpoints
    # (e.g. a compound with exactly 20 or 70 atoms).
    return (
        160 <= mw(row) <= 480
        and -0.4 <= logp(row) <= 5.6
        and 20 <= atom(row) <= 70
        and 40 <= mr(row) <= 130
    )
269
 
270
 
271
  # Score label -> per-row function computing that score. The keys populate the
  # 'Scores' checkbox group and become the names of the computed columns.
  SCORE_MAP = {
272
  'SAscore': sa_score,
273
+ 'LogP': logp,
274
+ 'Molecular weight': mw,
275
+ 'Molar refractivity': mr,
276
+ 'H-bond donor count': hbd,
277
+ 'H-Bond acceptor count': hba,
278
+ 'Rotatable bond count': rotatable_bond,
279
+ # 'TopoPSA': None,
280
  }
281
 
282
  # Filter label -> per-row predicate returning True when the compound passes.
  # The keys populate the 'Filters' checkbox group; the commented entries have
  # no implementation in this file yet.
  FILTER_MAP = {
283
+ 'REOS': reos,
284
+ "Lipinski's rule of 5": lipinski,
285
+ 'Ghose': ghose,
286
+ # 'Rule of 3': rule_of_3,
287
+ # 'Veber': veber,
288
+ # 'PAINS': pains,
289
  }
290
 
291
  # Task label shown in the UI -> short task code. submit_predict passes the code
  # to the hydra `task=` override and embeds it in the checkpoint file name.
  TASK_MAP = {
292
+ 'Drug-target interaction': 'DTI',
293
+ 'Drug-target binding affinity': 'DTA',
294
  }
295
 
296
  PRESET_MAP = {
 
308
 
309
  TARGET_FAMILY_MAP = {
310
  'General': 'general',
311
+ 'Kinase': 'kinase',
312
+ 'Non-kinase enzyme': 'enzyme',
313
+ 'Membrane receptor': 'membrane',
314
+ 'Nuclear receptor': 'nuclear',
315
+ 'Ion channel': 'ion',
316
+ 'Other protein targets': 'others',
317
  }
318
 
319
  TARGET_LIBRARY_MAP = {
320
+ 'ChEMBL33 (all species)': 'ChEMBL33_all_spe_single_prot_info.csv.csv',
321
+ 'STITCH': 'stitch.csv',
322
+ 'Drug Repurposing Hub': 'drug_repurposing_hub.csv',
323
  }
324
 
325
  DRUG_LIBRARY_MAP = {
 
326
  'DrugBank (Human)': 'drugbank_human_py_annot.csv',
327
  }
328
 
 
333
  ]
334
 
335
  # Raw dataset column name -> readable label. submit_predict rebinds this
  # global at run time to add task-specific labels for the 'Y' and 'Y^' columns.
  COLUMN_ALIASES = {
336
+ 'X1': 'Compound SMILES',
337
  'X2': 'Target FASTA',
338
+ 'ID1': 'Compound ID',
339
  'ID2': 'Target ID',
340
  }
341
 
 
 
342
 
343
  def validate_columns(df, mandatory_cols):
344
  missing_cols = [col for col in mandatory_cols if col not in df.columns]
345
  if missing_cols:
346
  error_message = (f"The following mandatory columns are missing "
347
  f"in the uploaded dataset: {str(['X1', 'X2']).strip('[]')}.")
348
+ raise ValueError(error_message)
349
+ else:
350
+ return
351
+
352
+
353
def process_target_fasta(sequence):
    """Extract the first sequence from FASTA-formatted (or bare sequence) text.

    A leading ``>`` header line is dropped, the remaining lines are joined, and
    everything from the next ``>`` onwards (i.e. any further records) is
    discarded.

    Args:
        sequence: FASTA text or a plain sequence; may be empty or ``None``.

    Returns:
        The first record's sequence as a single string without line breaks or
        surrounding whitespace; ``''`` for empty input.
    """
    if not sequence:
        return ''
    # splitlines() handles '\n', '\r\n' and '\r' alike; stripping each line
    # keeps stray carriage returns and padding out of the sequence.
    lines = [line.strip() for line in sequence.strip().splitlines()]
    if lines and lines[0].startswith('>'):
        lines = lines[1:]
    return ''.join(lines).split('>')[0]
358
 
359
 
360
  def send_email(receiver, msg):
 
363
 
364
def submit_predict(predict_filepath, task, preset, target_family, flag, progress=gr.Progress(track_tqdm=True)):
    """Run model inference for a validated job and save the predictions as CSV.

    Args:
        predict_filepath: Path of the CSV file holding the pairs to predict.
        task: Display name of the prediction task (a key of ``TASK_MAP``).
        preset: Display name of the preset model (a key of ``PRESET_MAP``).
        target_family: Display name of the target family
            (a key of ``TARGET_FAMILY_MAP``).
        flag: Job id produced by the validation step; a falsy value means
            validation failed and nothing is run.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        A two-item list ``[predictions_file_or_None, False]``. The second item
        resets the running-job state.
    """
    global COLUMN_ALIASES

    if not flag:
        return [None, False]

    try:
        job_id = flag
        task = TASK_MAP[task]
        preset = PRESET_MAP[preset]
        target_family = TARGET_FAMILY_MAP[target_family]

        # TASK_MAP yields 'DTI' / 'DTA', so the label choice must test for
        # 'DTI'; the former comparison with 'binary' could never be true.
        is_interaction = task == 'DTI'
        COLUMN_ALIASES = COLUMN_ALIASES | {
            'Y': 'Actual interaction' if is_interaction else 'Actual affinity',
            'Y^': 'Predicted interaction' if is_interaction else 'Predicted affinity',
        }

        with hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference"):
            cfg = hydra.compose(
                config_name="webserver_inference",
                overrides=[f"task={task}",
                           f"preset={preset}",
                           f"ckpt_path=resources/checkpoints/{preset}-{task}-{target_family}.ckpt",
                           f"data.data_file='{str(predict_filepath)}'"])

            predictions, _ = predict(cfg)
            prediction_df = pd.concat(
                [pd.DataFrame(prediction) for prediction in predictions],
                ignore_index=True)

        # Pair inference jobs never write an input file to temp/, so make sure
        # the directory exists before saving the predictions.
        Path('temp').mkdir(parents=True, exist_ok=True)
        predictions_file = f'temp/{job_id}_predictions.csv'
        prediction_df.to_csv(predictions_file, index=False)

        return [predictions_file, False]
    except Exception as e:
        # Surface the failure to the user and release the running-job state.
        gr.Warning(f"Prediction job failed due to error: {str(e)}")
        return [None, False]
408
  #
409
  # except Exception as e:
410
  # raise gr.Error(str(e))
 
496
  elif 'Y' in DF_FOR_REPORT.columns:
497
  value = 'Y'
498
 
499
+ # if value:
500
+ # if DF_FOR_REPORT['X1'].nunique() > 1 >= DF_FOR_REPORT['X2'].nunique():
501
+ # pie_chart = create_pie_chart(DF_FOR_REPORT, category='Scaffold SMILES', value=value, top_k=100)
502
+ # elif DF_FOR_REPORT['X2'].nunique() > 1 >= DF_FOR_REPORT['X1'].nunique():
503
+ # pie_chart = create_pie_chart(DF_FOR_REPORT, category='Target family', value=value, top_k=100)
504
 
505
  return create_html_report(DF_FOR_REPORT), pie_chart
506
  else:
507
  return gr.HTML(''), gr.Plot()
508
 
509
 
510
+ def create_html_report(df, file=None, progress=gr.Progress(track_tqdm=True)):
511
  cols_left = ['ID2', 'Y', 'Y^', 'ID1', 'Compound', 'Scaffold', 'Scaffold SMILES', ]
512
  cols_right = ['X1', 'X2']
513
  cols_left = [col for col in cols_left if col in df.columns]
 
526
  # Return the DataFrame as HTML
527
  PandasTools.RenderImagesInAllDataFrames(images=True)
528
 
529
+ if not file:
530
+ html = df.to_html()
531
+ return f'<div style="overflow:auto; height: 500px;">{html}</div>'
532
+ else:
533
+ html = df.to_html(file)
534
+ return html
535
  # return gr.HTML(pn.widgets.Tabulator(df).embed())
536
 
537
 
 
590
  df = DF_FOR_REPORT.copy()
591
  try:
592
  for filter_name in filter_list:
593
+ df[filter_name] = df.swifter.progress_bar(desc=f"Calculating {filter_name}").apply(
594
+ FILTER_MAP[filter_name], axis=1)
595
 
596
  for score_name in score_list:
597
  df[score_name] = df.swifter.progress_bar(desc=f"Calculating {score_name}").apply(
598
  SCORE_MAP[score_name], axis=1)
599
 
600
+ # pie_chart = None
601
+ # value = None
602
+ # if 'Y^' in df.columns:
603
+ # value = 'Y^'
604
+ # elif 'Y' in df.columns:
605
+ # value = 'Y'
606
+ #
607
+ # if value:
608
+ # if df['X1'].nunique() > 1 >= df['X2'].nunique():
609
+ # pie_chart = create_pie_chart(df, category='Scaffold SMILES', value=value, top_k=100)
610
+ # elif df['X2'].nunique() > 1 >= df['X1'].nunique():
611
+ # pie_chart = create_pie_chart(df, category='Target family', value=value, top_k=100)
612
 
613
+ return create_html_report(df), df # pie_chart
614
 
615
  except Exception as e:
616
  raise gr.Error(str(e))
617
 
618
 
619
+ # def check_job_status(job_id):
620
+ # job_lock = DATA_PATH / f"{job_id}.lock"
621
+ # job_file = DATA_PATH / f"{job_id}.csv"
622
+ # if job_lock.is_file():
623
+ # return {gr.Markdown(f"Your job ({job_id}) is still running... "
624
+ # f"You may stay on this page or come back later to retrieve the results "
625
+ # f"Once you receive our email notification."),
626
+ # None,
627
+ # None
628
+ # }
629
+ # elif job_file.is_file():
630
+ # return {gr.Markdown(f"Your job ({job_id}) is done! Redirecting you to generate reports..."),
631
+ # gr.Tabs(selected=3),
632
+ # gr.File(str(job_lock))}
633
 
634
 
635
  def wrap_text(text, line_length=60):
 
698
  with gr.Blocks() as screen_block:
699
  with gr.Column() as screen_page:
700
  with gr.Row():
701
+ with gr.Column():
 
 
 
702
  with gr.Row():
703
+ target_input_type = gr.Dropdown(
704
+ label='Target Input Type',
705
+ choices=['Sequence', 'UniProt ID', 'Gene symbol'],
706
+ info='Enter (paste) a FASTA string below manually or upload a FASTA file.',
707
+ value='Sequence',
708
+ scale=3, interactive=True
709
+ )
710
+ target_id = gr.Textbox(show_label=False, visible=False,
711
+ interactive=True, scale=4,
712
+ info='Query a sequence on UniProt with a UniProt ID.')
713
+ target_gene = gr.Textbox(
714
+ show_label=False, visible=False,
715
+ interactive=True, scale=4,
716
+ info='Query a sequence on UniProt with a gene symbol.')
717
+ target_organism = gr.Textbox(
718
+ info='Organism common name or scientific name (default: human).',
719
+ placeholder='Human', show_label=False,
720
+ visible=False, interactive=True, scale=4, )
721
+ HelpTip(
722
+ "Target amino acid sequence in the FASTA format. Alternatively, you may use a "
723
+ "UniProt ID/accession to query UniProt database for the sequence of your "
724
+ "target of interest. If the input FASTA contains multiple entities, "
725
+ "only the first one will be used."
726
+ )
727
+ with gr.Column():
728
+ drug_screen_target_family = gr.Dropdown(
729
+ choices=list(TARGET_FAMILY_MAP.keys()),
730
+ value='General',
731
+ label='Select Input Protein Family (Optional)', interactive=True)
732
+ # with gr.Column(scale=1, min_width=24):
733
+ HelpTip(
734
+ "Identify the protein family by conducting sequence alignment. "
735
+ "You may select General if you find the alignment score unsatisfactory."
736
+ )
737
+ with gr.Row():
738
+ with gr.Column():
739
+ target_upload_btn = gr.UploadButton(label='Upload a FASTA file', type='binary',
740
+ visible=True, variant='primary',
741
+ size='lg')
742
+ target_query_btn = gr.Button(value='Query the sequence', variant='primary',
743
+ visible=False)
744
+ target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
745
+
746
+ target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
747
+ example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
748
+
749
+ with gr.Row():
750
+ with gr.Column():
751
+ drug_library = gr.Dropdown(label='Select a Compound Library',
752
+ choices=list(DRUG_LIBRARY_MAP.keys()))
753
+ drug_library_upload_btn = gr.UploadButton(
754
+ label='Upload a custom library', variant='primary')
755
+ drug_library_upload = gr.File(label='Custom drug library file', visible=False)
756
+ drug_screen_task = gr.Dropdown(list(TASK_MAP.keys()), label='Select a Prediction Task',
757
+ value='Drug-target interaction')
758
+ with gr.Column():
759
+ drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Select a Preset Model')
760
+ screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
761
  HelpTip("We recommend the appropriate model for your use case based on model performance "
762
+ "in drug-target interaction or binding affinity prediction. "
763
+ "The models were benchmarked on different target families "
764
+ "and real-world data scenarios.")
765
 
766
  # drug_screen_email = gr.Textbox(
767
  # label='Email (optional)',
 
769
  # )
770
 
771
  with gr.Row(visible=True):
772
+ drug_screen_clr_btn = gr.ClearButton(size='lg')
773
+ drug_screen_btn = gr.Button(value='SCREEN', variant='primary', size='lg')
774
  # TODO Modify the pd df directly with df['X2'] = target
775
 
776
  screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
 
791
  with gr.Blocks() as identify_block:
792
  with gr.Column() as identify_page:
793
  with gr.Row():
794
+ with gr.Column():
795
+ compound_type = gr.Dropdown(
796
+ label='Compound Input Type',
797
+ choices=['SMILES', 'SDF'],
798
+ info='Enter (paste) an SMILES string or upload an SMI file.',
799
+ value='SMILES',
800
+ interactive=True)
801
+ compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
 
802
  HelpTip(
803
+ """Compound molecule in the SMILES format. You may input the SMILES string directly,
804
+ upload an SMI file, or upload an SDF file to convert to SMILES. Alternatively,
805
+ you may search on databases like NCBI PubChem, ChEMBL, and DrugBank for the SMILES
806
+ representing your drug of interest.
807
  """
808
  )
809
+ with gr.Column():
810
+ target_identify_target_family = gr.Dropdown(choices=['General'], value='General',
811
+ label='Target Protein Family')
812
 
813
+ compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
814
+ example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
 
 
 
815
 
816
+ with gr.Row():
817
+ with gr.Column():
818
+ target_library = gr.Dropdown(label='Select a Target Library',
819
+ choices=list(TARGET_LIBRARY_MAP.keys()))
820
+ target_library_upload_btn = gr.UploadButton(
821
+ label='Upload a custom library', variant='primary')
822
+ target_library_upload = gr.File(label='Custom target library file', visible=False)
823
+ target_identify_task = gr.Dropdown(list(TASK_MAP.keys()), label='Select a Prediction Task',
824
+ value='Drug-target interaction')
825
+
826
+ with gr.Column():
827
+ target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
828
+ identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
829
+ HelpTip("We recommend the appropriate model for your use case based on model performance "
830
+ "in drug-target interaction or binding affinity prediction. "
831
+ "The models were benchmarked on different target families "
832
+ "and real-world data scenarios.")
833
 
834
  # with gr.Row():
835
  # target_identify_email = gr.Textbox(
 
838
  # )
839
 
840
  with gr.Row(visible=True):
841
+ target_identify_clr_btn = gr.ClearButton(size='lg')
842
+ target_identify_btn = gr.Button(value='IDENTIFY', variant='primary', size='lg')
843
 
844
  identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
845
  identify_waiting = gr.Markdown(f"Your job is running... It might take a few minutes."
 
877
  # )
878
 
879
  with gr.Row(visible=True):
880
+ pair_infer_clr_btn = gr.ClearButton(size='lg')
881
+ pair_infer_btn = gr.Button(value='INFER', variant='primary', size='lg')
882
 
883
  infer_waiting = gr.Markdown(f"Your job is running... It might take a few minutes."
884
  f"When it's done, you will be redirected to the report page. "
 
897
  ''')
898
  with gr.Row():
899
  file_for_report = gr.File(interactive=True, type='filepath')
900
+ df_raw = gr.Dataframe(type="pandas", interactive=False, visible=False)
901
  scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Scores')
902
  filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Filters')
903
 
 
911
  ranking_pie_chart = gr.Plot(visible=False)
912
 
913
  with gr.Row():
914
+ with gr.Column():
915
+ csv_generate = gr.Button(value='Generate raw data (CSV)')
916
+ csv_download_file = gr.File(label='Download raw data (CSV)', visible=False)
917
+ with gr.Column():
918
+ html_generate = gr.Button(value='Generate report (HTML)')
919
+ html_download_file = gr.File(label='Download report (HTML)', visible=False)
920
 
921
 
922
  def target_input_type_select(input_type):
923
  match input_type:
924
  case 'UniProt ID':
925
+ return [gr.Dropdown(info=''),
926
+ gr.UploadButton(visible=False),
927
+ gr.Textbox(visible=True, value=''),
928
+ gr.Textbox(visible=False, value=''),
929
+ gr.Textbox(visible=False, value=''),
930
+ gr.Button(visible=True),
931
+ gr.Code(interactive=False, value='')]
932
  case 'Gene symbol':
933
+ return [gr.Dropdown(info=''),
934
+ gr.UploadButton(visible=False),
935
+ gr.Textbox(visible=False, value=''),
936
+ gr.Textbox(visible=True, value=''),
937
+ gr.Textbox(visible=True, value=''),
938
+ gr.Button(visible=True),
939
+ gr.Code(interactive=False, value='')]
940
  case 'Sequence':
941
+ return [gr.Dropdown(info='Enter (paste) a FASTA string below manually or upload a FASTA file.'),
942
+ gr.UploadButton(visible=True),
943
+ gr.Textbox(visible=False, value=''),
944
+ gr.Textbox(visible=False, value=''),
945
+ gr.Textbox(visible=False, value=''),
946
+ gr.Button(visible=False),
947
+ gr.Code(interactive=True, value='')]
948
+
949
+
950
+ target_input_type.select(
951
+ fn=target_input_type_select,
952
+ inputs=target_input_type,
953
+ outputs=[
954
+ target_input_type, target_upload_btn, target_id, target_gene, target_organism, target_query_btn
955
+ ],
956
+ show_progress=False
957
+ )
958
 
959
 
960
+ def uniprot_query(input_type, uid, gene, organism='Human'):
961
  fasta_seq = ''
 
962
 
963
  match input_type:
964
  case 'UniProt ID':
965
+ query = f"{uid.strip()}.fasta"
966
  case 'Gene symbol':
967
+ query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
968
 
969
  try:
970
  fasta = SESSION.get(UNIPROT_ENDPOINT.format(query=query))
971
  fasta.raise_for_status()
972
  fasta_seq = fasta.text
973
  except Exception as e:
974
+ raise gr.Warning(f"Failed to query FASTA from UniProt database due to {str(e)}")
975
  finally:
976
  return fasta_seq
977
 
978
 
979
  target_upload_btn.upload(fn=lambda x: x.decode(), inputs=target_upload_btn, outputs=target_fasta)
980
+ target_query_btn.click(uniprot_query,
981
+ inputs=[target_input_type, target_id, target_gene, target_organism],
982
+ outputs=target_fasta)
983
+
984
+
985
def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
    """Suggest a protein family for the target by local sequence alignment.

    The target sequence is aligned (BLASTP scoring, local mode) against every
    sequence in the ChEMBL33 target library, and the family of the
    best-scoring library entry is proposed.

    Args:
        fasta: Target FASTA text as entered by the user.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        A dropdown update selecting the detected family, with the best score
        and matching target ID given as the reason.
    """
    aligner = PairwiseAligner(scoring='blastp', mode='local')
    alignment_df = pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv')

    # The query sequence is the same for every library row: parse it once
    # instead of once per alignment.
    target_sequence = process_target_fasta(fasta)

    def align_score(query):
        return aligner.align(target_sequence, query).score

    alignment_df['score'] = alignment_df['X2'].swifter.progress_bar(
        desc="Detecting protein family of the target...").apply(align_score)
    row = alignment_df.loc[alignment_df['score'].idxmax()]
    return gr.Dropdown(value=row['protein_family'].capitalize(),
                       info=f"Reason: Best BLASTP score ({row['score']}) with {row['ID2']} from family {row['protein_family']}")
997
+
998
+
999
+ target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)
1000
 
1001
  target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress=False)
1002
  target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress=False)
1003
+
1004
+ drug_library_upload_btn.upload(fn=lambda x: [
1005
+ x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
1006
+ ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
1007
 
1008
 
1009
  def example_fill(input_type):
1010
+ return {target_id: 'Q16539',
1011
+ target_gene: 'MAPK14',
1012
+ target_organism: 'Human',
 
 
 
 
 
1013
  target_fasta: """
1014
  >sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3
1015
  MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ
 
1021
  """}
1022
 
1023
 
1024
+ example_fasta.click(fn=example_fill, inputs=target_input_type,
1025
+ outputs=[target_id, target_gene, target_organism, target_fasta], show_progress=False)
 
1026
 
1027
 
1028
def screen_recommend_model(fasta, family, task):
    """Recommend a preset model for compound screening against a target.

    The target is classified as seen or unseen in the training data of the
    selected task, and the preset with the best benchmark score for that
    scenario and target family is recommended.

    Args:
        fasta: Target FASTA text as entered by the user.
        family: Display name of the selected target family.
        task: Display name of the prediction task (a key of ``TASK_MAP``).

    Returns:
        A dropdown update selecting the recommended preset, with the reason
        shown as its info text.

    Raises:
        gr.Error: If no benchmark row matches the selected combination.
    """
    task = TASK_MAP[task]
    if task == 'DTI':
        train = pd.read_csv('data/benchmarks/all_families_reduced_dti_train.csv')
        score = 'AUROC'
    elif task == 'DTA':
        train = pd.read_csv('data/benchmarks/all_families_reduced_dta_train.csv')
        score = 'CI'

    # `value in series` tests the index labels, not the values, so compare
    # against the column values explicitly. The FASTA text is reduced to the
    # bare sequence first, as is done before screening.
    target_sequence = process_target_fasta(fasta)
    if (train['X2'] == target_sequence).any():
        scenario = "Seen target"
    else:
        scenario = "Unseen target"
    benchmark_df = pd.read_csv('data/benchmarks/compound_screen.csv')

    # The general model is selected by the *family* choice; `task` only ever
    # holds 'DTI' or 'DTA' at this point.
    if family == 'General':
        filtered_df = benchmark_df[(benchmark_df['Task'] == task)
                                   & (benchmark_df['Target.family'] == 'All families reduced')
                                   & (benchmark_df['Scenario'] == 'Random split')
                                   & (benchmark_df['all'] == True)]
    else:
        filtered_df = benchmark_df[(benchmark_df['Task'] == task)
                                   & (benchmark_df['Target.family'] == family)
                                   & (benchmark_df['Scenario'] == scenario)
                                   & (benchmark_df['all'] == False)]

    if filtered_df.empty:
        # idxmax() on an empty frame raises an opaque ValueError.
        raise gr.Error('No benchmark result is available for the selected task and target family.')
    row = filtered_df.loc[filtered_df[score].idxmax()]

    return gr.Dropdown(value=row['preset'],
                       info=f"Reason: {scenario} in the training dataset; we recommend the model "
                            f"with the best {score} ({float(row[score]):.3f}) "
                            f"in the {scenario.lower()} scenario on {family.lower()} family.")
1059
+
1060
+
1061
+ screen_preset_recommend_btn.click(fn=screen_recommend_model,
1062
+ inputs=[target_fasta, drug_screen_target_family, drug_screen_task],
1063
+ outputs=drug_screen_preset)
1064
+
1065
+
1066
+ # compound_smiles.focus(fn=wrap_text, inputs=compound_smiles, outputs=compound_smiles, show_progress=False)
1067
+ # compound_smiles.blur(fn=wrap_text, inputs=compound_smiles, outputs=compound_smiles, show_progress=False)
1068
+
1069
def compound_input_type_select(input_type):
    """Update the hint of the compound input-type dropdown.

    Args:
        input_type: 'SMILES' or 'SDF'.

    Returns:
        A dropdown update carrying the hint for the selected input type, or
        None for any other value.
    """
    hints = {
        'SMILES': 'Input an SMILES string or upload an SMI file',
        'SDF': 'Convert the first molecule in an SDF file to SMILES',
    }
    if input_type not in hints:
        return None
    return gr.Dropdown(info=hints[input_type])
1075
+
1076
+
1077
+ compound_type.select(fn=compound_input_type_select,
1078
+ inputs=compound_type, outputs=compound_type, show_progress=False)
1079
+
1080
+
1081
+ def compound_upload_process(input_type, input_upload):
1082
+ match input_type:
1083
+ case 'SMILES':
1084
+ return input_upload.decode()
1085
+ case 'SDF':
1086
+ suppl = Chem.ForwardSDMolSupplier(io.BytesIO(input_upload))
1087
+ return Chem.MolToSmiles(next(suppl))
1088
+
1089
+
1090
+ compound_upload_btn.upload(fn=compound_upload_process,
1091
+ inputs=[compound_type, compound_upload_btn],
1092
+ outputs=compound_smiles)
1093
 
1094
+ example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=compound_smiles, show_progress=False)
1095
+
1096
+ target_library_upload_btn.upload(fn=lambda x: [
1097
+ x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
1098
+ ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
1099
+
1100
+
1101
def identify_recommend_model(smiles, task):
    """Recommend a preset model for target identification of a compound.

    The compound is classified as seen or unseen in the training data of the
    selected task, and the preset with the best benchmark score for that
    scenario is recommended.

    Args:
        smiles: Compound SMILES as entered by the user.
        task: Display name of the prediction task (a key of ``TASK_MAP``).

    Returns:
        A dropdown update selecting the recommended preset, with the reason
        shown as its info text.

    Raises:
        gr.Error: If no benchmark row matches the selected task and scenario.
    """
    if task == 'Drug-target interaction':
        train = pd.read_csv('data/benchmarks/all_families_reduced_dti_train.csv')
        score = 'AUROC'
    elif task == 'Drug-target binding affinity':
        train = pd.read_csv('data/benchmarks/all_families_reduced_dta_train.csv')
        score = 'CI'
    task = TASK_MAP[task]

    # `value in series` tests the index labels, not the values, so compare
    # against the column values explicitly.
    if (train['X1'] == smiles.strip()).any():
        scenario = "Seen drug"
    else:
        scenario = "Unseen drug"
    benchmark_df = pd.read_csv('data/benchmarks/target_identification.csv')

    filtered_df = benchmark_df[(benchmark_df['Task'] == task)
                               & (benchmark_df['Scenario'] == scenario)]
    if filtered_df.empty:
        # idxmax() on an empty frame raises an opaque ValueError.
        raise gr.Error('No benchmark result is available for the selected task.')
    row = filtered_df.loc[filtered_df[score].idxmax()]

    return gr.Dropdown(value=row['preset'],
                       info=f"Reason: {scenario} in the training dataset; choosing the model "
                            f"with the best {score} ({row[score]}) "
                            f"in the {scenario.lower()} scenario.")


# The recommendation belongs to the target identification tab, so it must
# update that tab's preset dropdown rather than the drug screening one.
identify_preset_recommend_btn.click(fn=identify_recommend_model,
                                    inputs=[compound_smiles, target_identify_task],
                                    outputs=target_identify_preset)
1128
+
1129
+
1130
def drug_screen_validate(fasta, library, library_upload, state, progress=gr.Progress(track_tqdm=True)):
    """Validate drug screening inputs and stage the dataset for prediction.

    Args:
        fasta: Target FASTA text as entered by the user.
        library: Selected compound library name (a key of ``DRUG_LIBRARY_MAP``
            or the name of an uploaded file).
        library_upload: Path of the uploaded custom library, if any.
        state: Id of the job currently running in this session, or a falsy
            value when none is running.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        A dict of component updates. On success it carries the staged input
        file and the new job id; otherwise the screen flag is set to False.
    """
    if state:
        gr.Warning('You have another prediction job '
                   '(drug hit screening, target protein identification, or interation pair inference) '
                   'running in the session right now. '
                   'Please submit another job when your current job has finished.')
        return {screen_flag: False,
                run_state: state}

    try:
        fasta = process_target_fasta(fasta)
        err = validate_seq_str(fasta, FASTA_PAT)
        if err:
            raise ValueError(f'Found error(s) in your target fasta input: {err}')

        if library in DRUG_LIBRARY_MAP:
            screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
        elif library_upload:
            screen_df = pd.read_csv(library_upload)
        else:
            # Without this check pandas fails with an opaque error on None.
            raise ValueError('Please select a compound library or upload a custom one.')
        validate_columns(screen_df, ['X1'])

        # Pair every compound of the library with the single query target.
        screen_df['X2'] = fasta

        job_id = uuid4()
        temp_file = Path(f'temp/{job_id}_input.csv').resolve()
        # The temp directory is not guaranteed to exist on a fresh deployment.
        temp_file.parent.mkdir(parents=True, exist_ok=True)
        screen_df.to_csv(temp_file, index=False)
        if not temp_file.is_file():
            raise SystemError('Failed to create temporary files. Please try again later.')
        return {screen_data_for_predict: str(temp_file),
                screen_flag: job_id,
                run_state: job_id}
    except Exception as e:
        gr.Warning(f'Failed to submit the job due to error: {str(e)}')
        return {screen_flag: False,
                run_state: False}
1165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1166
 
1167
def target_identify_validate(smiles, library, library_upload, state, progress=gr.Progress(track_tqdm=True)):
    """Validate target identification inputs and stage the dataset.

    Args:
        smiles: Compound SMILES as entered by the user.
        library: Selected target library name (a key of ``TARGET_LIBRARY_MAP``
            or the name of an uploaded file).
        library_upload: Path of the uploaded custom library, if any.
        state: Id of the job currently running in this session, or a falsy
            value when none is running.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        A dict of component updates. On success it carries the staged input
        file and the new job id; otherwise the identify flag is set to False.
    """
    if state:
        gr.Warning('You have another prediction job '
                   '(drug hit screening, target protein identification, or interation pair inference) '
                   'running in the session right now. '
                   'Please submit another job when your current job has finished.')
        return {identify_flag: False,
                run_state: state}

    try:
        smiles = smiles.strip()
        err = validate_seq_str(smiles, SMILES_PAT)
        if err:
            raise ValueError(f'Found error(s) in your compound SMILES input: {err}')

        if library in TARGET_LIBRARY_MAP:
            # Look up the *selected* library (the literal key 'target_library'
            # does not exist) inside the bundled target library directory.
            identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
        elif library_upload:
            identify_df = pd.read_csv(library_upload)
        else:
            # Without this check pandas fails with an opaque error on None.
            raise ValueError('Please select a target library or upload a custom one.')
        validate_columns(identify_df, ['X2'])

        # Pair every target of the library with the single query compound.
        identify_df['X1'] = smiles

        job_id = uuid4()
        temp_file = Path(f'temp/{job_id}_input.csv').resolve()
        # The temp directory is not guaranteed to exist on a fresh deployment.
        temp_file.parent.mkdir(parents=True, exist_ok=True)
        identify_df.to_csv(temp_file, index=False)
        if not temp_file.is_file():
            raise SystemError('Failed to create temporary files. Please try again later.')
        return {identify_data_for_predict: str(temp_file),
                identify_flag: job_id,
                run_state: job_id}
    except Exception as e:
        gr.Warning(f'Failed to submit the job due to error: {str(e)}')
        return {identify_flag: False,
                run_state: False}
1203
 
1204
 
1205
def pair_infer_validate(drug_target_pair_upload, state, progress=gr.Progress(track_tqdm=True)):
    """Validate an uploaded compound-target pair dataset before inference.

    Args:
        drug_target_pair_upload: Path of the uploaded CSV with 'X1' (SMILES)
            and 'X2' (FASTA) columns.
        state: Id of the job currently running in this session, or a falsy
            value when none is running.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        A dict of component updates: the new job id for both the infer flag
        and the run state on success, otherwise the infer flag set to False.
    """
    if state:
        gr.Warning('You have another prediction job '
                   '(drug hit screening, target protein identification, or interation pair inference) '
                   'running in the session right now. '
                   'Please submit another job when your current job has finished.')
        return {infer_flag: False,
                run_state: state}

    # (column, label used in messages, validation pattern), checked in order.
    checks = (
        ('X1', 'SMILES', SMILES_PAT),
        ('X2', 'FASTA', FASTA_PAT),
    )
    try:
        df = pd.read_csv(drug_target_pair_upload)
        validate_columns(df, ['X1', 'X2'])

        for column, label, pattern in checks:
            error_column = f'{column}_ERR'
            df[error_column] = df[column].swifter.progress_bar(desc=f"Validating {label}...").apply(
                validate_seq_str, regex=pattern)
            has_error = ~df[error_column].isna()
            if has_error.any():
                raise ValueError(f"Encountered invalid {label}:\n{df[has_error][[column, error_column]]}")

        job_id = uuid4()
        return {infer_flag: job_id,
                run_state: job_id}
    except Exception as e:
        gr.Warning(f'Failed to submit the job due to error: {str(e)}')
        return {infer_flag: False,
                run_state: False}
1236
 
1237
 
1238
  drug_screen_btn.click(
 
1248
  drug_screen_target_family, screen_flag], # , drug_screen_email],
1249
  outputs=[file_for_report, run_state]
1250
  ).then(
1251
+ fn=lambda: [gr.Column(visible=True), gr.Markdown(visible=False), gr.Tabs(selected=3)],
1252
+ outputs=[screen_page, screen_waiting, tabs]
1253
  )
1254
 
1255
  target_identify_btn.click(
1256
  fn=target_identify_validate,
1257
+ inputs=[compound_smiles, target_library, target_library_upload, run_state], # , drug_screen_email],
1258
  outputs=[identify_data_for_predict, identify_flag, run_state]
1259
  ).then(
1260
+ fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True), gr.Tabs(selected=3)],
1261
+ outputs=[identify_page, identify_waiting, tabs]
1262
  ).then(
1263
  fn=submit_predict,
1264
  inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
1265
  target_identify_target_family, identify_flag], # , target_identify_email],
1266
  outputs=[file_for_report, run_state]
1267
  ).then(
1268
+ fn=lambda: [gr.Column(visible=True), gr.Markdown(visible=False), gr.Tabs(selected=3)],
1269
+ outputs=[identify_page, identify_waiting, tabs]
1270
  )
1271
 
1272
  pair_infer_btn.click(
 
1288
 
1289
  # TODO background job from these 3 pipelines to update file_for_report
1290
 
1291
+ file_for_report.change(fn=update_df, inputs=file_for_report, outputs=[
1292
+ html_report,
1293
+ df_raw,
1294
+ # ranking_pie_chart
1295
+ ])
1296
+ analyze_btn.click(fn=submit_report, inputs=[scores, filters], outputs=[
1297
+ html_report,
1298
+ df_raw,
1299
+ # ranking_pie_chart
1300
+ ])
1301
+
1302
+
1303
def create_csv_raw_file(df, file_report):
    """Write the report data to a timestamped CSV file offered for download.

    Args:
        df: DataFrame holding the raw report data.
        file_report: The file the report was generated from, given either as
            a path string or as an object with a ``name`` attribute.

    Returns:
        A visible file component pointing at the generated CSV.
    """
    from datetime import datetime

    # A file component configured with type='filepath' delivers a plain path
    # string, which has no `.name`; accept both forms.
    source_path = Path(getattr(file_report, 'name', file_report))
    now = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    filename = f"reports/{source_path.stem}_DeepSEQreen_report_{now}.csv"
    # The reports directory is not guaranteed to exist on a fresh deployment.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(filename, index=False)
    return gr.File(filename, visible=True)
1309
+
1310
+
1311
def create_html_report_file(df, file_report):
    """Write the report to a timestamped HTML file offered for download.

    Args:
        df: DataFrame holding the report data.
        file_report: The file the report was generated from, given either as
            a path string or as an object with a ``name`` attribute.

    Returns:
        A visible file component pointing at the generated HTML report.
    """
    from datetime import datetime

    # A file component configured with type='filepath' delivers a plain path
    # string, which has no `.name`; accept both forms.
    source_path = Path(getattr(file_report, 'name', file_report))
    now = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    # The report is HTML, so it must not be saved with a ".csv" extension.
    filename = f"reports/{source_path.stem}_DeepSEQreen_report_{now}.html"
    # The reports directory is not guaranteed to exist on a fresh deployment.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    create_html_report(df, filename)
    return gr.File(filename, visible=True)
1317
+
1318
 
1319
+ csv_generate.click(fn=create_csv_raw_file, inputs=[df_raw, file_for_report], outputs=csv_download_file)
1320
+ html_generate.click(fn=create_html_report_file, inputs=[df_raw, file_for_report], outputs=html_download_file)
1321
 
1322
  # screen_waiting.change(fn=check_job_status, inputs=run_state, outputs=[pair_waiting, tabs, file_for_report],
1323
  # every=5)
 
1338
  # SCHEDULER.start()
1339
 
1340
  demo.launch(
 
1341
  show_api=False,
 
 
 
1342
  )