libokj committed on
Commit
2a4780f
·
1 Parent(s): 5cb3cd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -86
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import hashlib
 
2
  import json
3
  import textwrap
4
  import threading
@@ -20,7 +21,7 @@ import hydra
20
  import pandas as pd
21
  import plotly.express as px
22
  import requests
23
- from rdkit.Chem.rdMolDescriptors import CalcNumRotatableBonds, CalcNumHeavyAtoms, CalcNumAtoms
24
  from requests.adapters import HTTPAdapter, Retry
25
  from rdkit import Chem
26
  from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools, AllChem
@@ -59,11 +60,13 @@ SESSION.mount('https://', ADAPTER)
59
 
60
  UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
61
 
 
 
62
  CSS = """
63
  .help-tip {
64
  position: absolute;
65
  display: inline-block;
66
- top: 24px;
67
  right: 0px;
68
  text-align: center;
69
  border-radius: 40%;
@@ -204,6 +207,10 @@ def rotatable_bond(row):
204
  return CalcNumRotatableBonds((row['Compound']))
205
 
206
 
 
 
 
 
207
  def lipinski(row):
208
  """
209
  Lipinski's rules:
@@ -271,24 +278,72 @@ def ghose(row):
271
  return True
272
 
273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  SCORE_MAP = {
275
  'SAscore': sa_score,
276
  'LogP': logp,
277
  'Molecular weight': mw,
 
278
  'Molar refractivity': mr,
279
  'H-bond donor count': hbd,
280
  'H-Bond acceptor count': hba,
281
  'Rotatable bond count': rotatable_bond,
282
- # 'TopoPSA': None,
283
  }
284
 
285
  FILTER_MAP = {
 
286
  'REOS': reos,
287
- "Lipinski's rule of 5": lipinski,
288
  'Ghose': ghose,
289
- # 'Rule of 3': rule_of_3,
290
- # 'Veber': veber,
291
- # 'PAINS': pains,
292
  }
293
 
294
  TASK_MAP = {
@@ -348,10 +403,12 @@ def validate_columns(df, mandatory_cols):
348
 
349
 
350
def process_target_fasta(sequence):
    """Extract the first sequence from a FASTA-formatted string.

    Surrounding whitespace is stripped, a leading ``>`` header line (if any)
    is dropped, the remaining lines are concatenated, and everything from the
    next ``>`` onwards (i.e. any further records) is discarded.
    """
    text = sequence.strip()
    if text.startswith(">"):
        # Discard the header line; keep whatever follows the first newline.
        _, _, text = text.partition("\n")
    joined = text.replace("\n", "")
    first_record, _, _ = joined.partition(">")
    return first_record
 
 
355
 
356
 
357
  def send_email(receiver, msg):
@@ -480,10 +537,11 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
480
  df['Scaffold SMILES'] = df['X1'].swifter.progress_bar(
481
  desc=f"Calculating scaffold...").apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
482
  # Add a new column with RDKit molecule objects
483
- PandasTools.AddMoleculeColumnToFrame(df, smilesCol='X1', molCol='Compound',
484
- includeFingerprints=False)
 
485
  PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Scaffold SMILES', molCol='Scaffold',
486
- includeFingerprints=False)
487
  DF_FOR_REPORT = df.copy()
488
 
489
  # pie_chart = None
@@ -665,6 +723,43 @@ def smiles_from_sdf(sdf_path):
665
  return Chem.MolToSmiles(suppl[0])
666
 
667
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
  theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
669
  background_fill_primary='#dfe6f0',
670
  background_fill_secondary='#dfe6f0',
@@ -697,10 +792,10 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
697
  with gr.Tabs() as tabs:
698
  with gr.TabItem(label='Drug hit screening', id=0):
699
  gr.Markdown('''
700
- # <center>DeepSEQreen Drug Hit Screening</center>
701
- <center>
702
- To predict interactions/binding affinities of a single target against a library of drugs.
703
- </center>
704
  ''')
705
  with gr.Blocks() as screen_block:
706
  with gr.Column() as screen_page:
@@ -751,13 +846,18 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
751
  visible=False)
752
  target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
753
 
754
- target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
755
  example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
756
 
757
  with gr.Row():
758
  with gr.Column():
759
  drug_library = gr.Dropdown(label='Select a Compound Library',
760
  choices=list(DRUG_LIBRARY_MAP.keys()))
 
 
 
 
 
761
  drug_library_upload_btn = gr.UploadButton(
762
  label='Upload a custom library', variant='primary')
763
  drug_library_upload = gr.File(label='Custom drug library file', visible=False)
@@ -771,7 +871,6 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
771
  drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Select a Preset Model')
772
  screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
773
 
774
-
775
  # drug_screen_email = gr.Textbox(
776
  # label='Email (optional)',
777
  # info="Your email will be used to send you notifications when your job finishes."
@@ -791,11 +890,24 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
791
 
792
  with gr.TabItem(label='Target protein identification', id=1):
793
  gr.Markdown('''
794
- # <center>DeepSEQreen Target Protein Identification</center>
795
-
796
- <center>
797
- To predict interactions/binding affinities of a single drug against a library of targets.
798
- </center>
 
 
 
 
 
 
 
 
 
 
 
 
 
799
  ''')
800
  with gr.Blocks() as identify_block:
801
  with gr.Column() as identify_page:
@@ -816,9 +928,8 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
816
  interactive=True)
817
  compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
818
 
819
- with gr.Column():
820
- target_identify_target_family = gr.Dropdown(choices=['General'], value='General',
821
- label='Target Protein Family')
822
 
823
  compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
824
  example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
@@ -827,6 +938,11 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
827
  with gr.Column():
828
  target_library = gr.Dropdown(label='Select a Target Library',
829
  choices=list(TARGET_LIBRARY_MAP.keys()))
 
 
 
 
 
830
  target_library_upload_btn = gr.UploadButton(
831
  label='Upload a custom library', variant='primary')
832
  target_library_upload = gr.File(label='Custom target library file', visible=False)
@@ -841,7 +957,6 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
841
  target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
842
  identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
843
 
844
-
845
  # with gr.Row():
846
  # target_identify_email = gr.Textbox(
847
  # label='Email (optional)',
@@ -861,41 +976,51 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
861
  gr.Markdown('''
862
  # <center>DeepSEQreen Interaction Pair Inference</center>
863
  <center>To predict interactions/binding affinities between any drug-target pairs.</center>
864
- ''')
865
- with gr.Blocks() as infer_block:
866
- with gr.Column() as infer_page:
867
- with gr.Column() as custom_upload:
868
- gr.Markdown("""
869
- Please upload a custom dataset CSV file with 2 required string columns and optionally 2 ID columns:
870
 
871
  <b>X1</b>: the SMILES string of a compound\n
872
  <b>X2</b>: the FASTA sequence of a target\n
873
- <b>ID1</b>: the ID (PubChem or any arbitrary unique identifier) of a compound\n
874
- <b>ID22</b>: the ID (UniProt or any arbitrary unique identifier) of a target
875
 
876
- Example:
877
 
878
  | X1 | X2 | ID1 | ID2 |
879
  |---------------------------------------- |---------------|--------------|--------|
880
  | CCOC(=O)Nc1ccc(NCc2ccc(F)cc2)cc1N | MVQKSRNGGV... | CHEMBL41355 | O88943 |
881
  | CCCCCc1cc(O)c(C/C=C(\C)CCC=C(C)C)c(O)c1 | MTSPSSSPVF... | CHEMBL497318 | Q9Y5S1 |
882
- """)
 
 
 
 
 
 
883
  gr.File(label="Example custom dataset",
884
  value="data/examples/interaction_pair_inference.csv",
885
  interactive=False)
886
  with gr.Column():
887
  infer_data_for_predict = gr.File(
888
- label='Custom dataset file', file_count="single", type='filepath', visible=True)
889
  with gr.Column() as pair_generate:
890
- gr.Markdown("""
891
- Upload a SDF file which contains multiple compounds of interest and a FASTA file which contains multiple targets of
892
- interest. All combinations of drug-target pairs from these two files will be automatically generated and submitted to
893
- a prediction job.
894
- """)
895
- pair_sdf = gr.File(label='SDF file containing multiple compounds')
896
- pair_fasta = gr.File(label='FASTA file containing multiple targets')
897
-
898
-
 
 
 
 
 
 
899
 
900
  with gr.Row(visible=True):
901
  pair_infer_task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
@@ -925,7 +1050,9 @@ a prediction job.
925
  # <center>DeepSEQreen Chemical Property Report</center>
926
  <center>
927
  To compute chemical properties for the predictions of drug hit screening,
928
- target protein identification, and interaction pair inference. You may also upload
 
 
929
  your own dataset. The page shows only a preview report displaying at most 30 records
930
  (with top predicted DTI/DTA if reporting results from a prediction job). For a full report, please
931
  generate and download a raw data CSV or interactive table HTML file below.
@@ -1002,6 +1129,7 @@ a prediction job.
1002
  case 'UniProt ID':
1003
  query = f"{uid.strip()}.fasta"
1004
  case 'Gene symbol':
 
1005
  query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
1006
 
1007
  try:
@@ -1031,7 +1159,8 @@ a prediction job.
1031
  desc="Detecting protein family of the target...").apply(align_score)
1032
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1033
  return gr.Dropdown(value=row['protein_family'].capitalize(),
1034
- info=f"Reason: Best BLASTP score ({row['score']}) with {row['ID2']} from family {row['protein_family']}")
 
1035
 
1036
 
1037
  target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)
@@ -1132,6 +1261,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1132
  x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
1133
  ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
1134
 
 
1135
  def identify_recommend_model(smiles, task):
1136
  if task == 'Drug-target interaction':
1137
  train = pd.read_csv('data/benchmarks/all_families_reduced_dti_train.csv')
@@ -1161,6 +1291,31 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1161
  outputs=target_identify_preset)
1162
 
1163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1164
  def drug_screen_validate(fasta, library, library_upload, state, progress=gr.Progress(track_tqdm=True)):
1165
  if not state:
1166
  try:
@@ -1171,14 +1326,10 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1171
  if library in DRUG_LIBRARY_MAP.keys():
1172
  screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
1173
  else:
1174
- if library_upload.endswith('.csv'):
1175
- screen_df = pd.read_csv(library_upload)
1176
- elif library_upload.endswith('.sdf'):
1177
- screen_df = PandasTools.LoadSDF(library_upload,
1178
- smilesName='X1', molColName='Compound', includeFingerprints=True)
1179
- else:
1180
- raise gr.Error('Currently only CSV and SDF files are supported.')
1181
- validate_columns(screen_df, ['X1'])
1182
 
1183
  screen_df['X2'] = fasta
1184
 
@@ -1214,17 +1365,10 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1214
  if library in TARGET_LIBRARY_MAP.keys():
1215
  identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
1216
  else:
1217
- if library_upload.endswith('.csv'):
1218
- identify_df = pd.read_csv(library_upload)
1219
- elif library_upload.endswith('.fasta'):
1220
- records = list(SeqIO.parse(library_upload, "fasta"))
1221
- id2 = [record.id for record in records]
1222
- seq = [str(record.seq) for record in records]
1223
- identify_df = pd.DataFrame({'ID2': id2, 'X2': seq})
1224
- else:
1225
- raise 'Currently only csv and fasta files are supported.'
1226
- validate_columns(identify_df, ['X2'])
1227
-
1228
  identify_df['X1'] = smiles
1229
 
1230
  job_id = uuid4()
@@ -1250,25 +1394,57 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1250
  # return {identify_flag: False}
1251
 
1252
 
1253
- def pair_infer_validate(drug_target_pair_upload, state, progress=gr.Progress(track_tqdm=True)):
 
1254
  if not state:
1255
  try:
1256
- df = pd.read_csv(drug_target_pair_upload)
1257
- validate_columns(df, ['X1', 'X2'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1258
 
1259
- df['X1_ERR'] = df['X1'].swifter.progress_bar(desc="Validating SMILES...").apply(
1260
- validate_seq_str, regex=SMILES_PAT)
1261
- if not df['X1_ERR'].isna().all():
1262
- raise ValueError(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
1263
 
1264
- df['X2_ERR'] = df['X2'].swifter.progress_bar(desc="Validating FASTA...").apply(
1265
- validate_seq_str, regex=FASTA_PAT)
1266
- if not df['X2_ERR'].isna().all():
1267
- raise ValueError(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1268
 
1269
- job_id = uuid4()
1270
- return {infer_flag: job_id,
1271
- run_state: job_id}
1272
  except Exception as e:
1273
  gr.Warning(f'Failed to submit the job due to error: {str(e)}')
1274
  return {infer_flag: False,
@@ -1319,8 +1495,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1319
 
1320
  pair_infer_btn.click(
1321
  fn=pair_infer_validate,
1322
- inputs=[infer_data_for_predict, run_state], # , drug_screen_email],
1323
- outputs=[infer_flag, run_state]
1324
  ).then(
1325
  fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
1326
  outputs=[infer_page, infer_waiting]
 
1
  import hashlib
2
+ import itertools
3
  import json
4
  import textwrap
5
  import threading
 
21
  import pandas as pd
22
  import plotly.express as px
23
  import requests
24
+ from rdkit.Chem.rdMolDescriptors import CalcNumRotatableBonds, CalcNumHeavyAtoms, CalcNumAtoms, CalcTPSA
25
  from requests.adapters import HTTPAdapter, Retry
26
  from rdkit import Chem
27
  from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools, AllChem
 
60
 
61
  UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
62
 
63
+ CUSTOM_DATASET_MAX_LEN = 10_000
64
+
65
  CSS = """
66
  .help-tip {
67
  position: absolute;
68
  display: inline-block;
69
+ top: 16px;
70
  right: 0px;
71
  text-align: center;
72
  border-radius: 40%;
 
207
  return CalcNumRotatableBonds((row['Compound']))
208
 
209
 
210
def tpsa(row):
    """Return the topological polar surface area for the molecule in ``row``.

    The molecule object is read from ``row['Compound']`` and handed to RDKit's
    ``CalcTPSA``.
    """
    molecule = row['Compound']
    return CalcTPSA(molecule)
212
+
213
+
214
  def lipinski(row):
215
  """
216
  Lipinski's rules:
 
278
  return True
279
 
280
 
281
def veber(row):
    """
    Veber filter, a rule-of-thumb filter for orally active drugs
    (Veber et al., J Med Chem. 2002; 45(12): 2615-23.):
        Rotatable bonds <= 10
        Topological polar surface area <= 140

    Returns True when ``row`` satisfies both criteria, False otherwise.
    """
    # `and` short-circuits, so TPSA is only computed when the rotatable-bond
    # criterion has already passed.
    return bool(rotatable_bond(row) <= 10 and tpsa(row) <= 140)
294
+
295
+
296
def rule_of_three(row):
    """
    Rule of Three filter (Congreve et al., Drug Discov. Today. 8 (19): 876–7, (2003).):
        Molecular weight <= 300
        LogP <= 3
        H-bond donor <= 3
        H-bond acceptor count <= 3
        Rotatable bond count <= 3

    Returns True when ``row`` satisfies every criterion, False otherwise.
    """
    # (descriptor function, inclusive upper bound), checked in this order.
    criteria = (
        (mw, 300),
        (logp, 3),
        (hbd, 3),
        (hba, 3),
        (rotatable_bond, 3),
    )
    # The generator keeps evaluation lazy: descriptors after the first
    # violated criterion are never computed.
    return all(descriptor(row) <= limit for descriptor, limit in criteria)
317
+
318
+
319
+ # def smarts_filter():
320
+ # alerts = Chem.MolFromSmarts("enter one smart here")
321
+ # detected_alerts = []
322
+ # for smiles in data['X1']:
323
+ # mol = Chem.MolFromSmiles(smiles)
324
+ # detected_alerts.append(mol.HasSubstructMatch(alerts))
325
+
326
+
327
# Display name of each chemical property score -> function computing it.
# The functions visible in this file take a single `row` argument and read the
# molecule from row['Compound'] (presumably applied row-wise to the report
# DataFrame -- confirm against the caller).
SCORE_MAP = {
    'SAscore': sa_score,
    'LogP': logp,
    'Molecular weight': mw,
    'Number of heavy atoms': heavy_atom,
    'Molar refractivity': mr,
    'H-bond donor count': hbd,
    'H-Bond acceptor count': hba,
    'Rotatable bond count': rotatable_bond,
    'Topological polar surface area': tpsa,
}

# Display name of each drug-likeness filter -> predicate function.
# The predicates visible in this file take a `row` and return True/False.
FILTER_MAP = {
    # TODO support number_of_violations
    'REOS': reos,
    "Lipinski's Rule of Five": lipinski,
    'Ghose': ghose,
    'Rule of Three': rule_of_three,
    'Veber': veber,
    'PAINS': pains,
}
348
 
349
  TASK_MAP = {
 
403
 
404
 
405
def process_target_fasta(sequence):
    """Return the amino-acid sequence of the first record in a FASTA string.

    Args:
        sequence: Text pasted or fetched by the user. It may be a proper
            FASTA document (one or more ``>`` headed records) or a bare
            sequence without any header line.

    Returns:
        The first record's sequence as a plain string. Header-less input is
        returned with all whitespace removed. An empty string is returned
        when no sequence can be extracted.
    """
    import io  # local import: `io` is not guaranteed to be imported at file level

    text = sequence.strip()
    if not text.startswith('>'):
        # A bare sequence has no FASTA record for SeqIO to yield, so handle
        # it directly instead of failing on an empty parse result.
        return ''.join(text.split())

    # SeqIO.parse returns an iterator (not a list), so it cannot be indexed
    # with [0]; take the first record with next() instead.
    record = next(SeqIO.parse(io.StringIO(text), "fasta"), None)
    return str(record.seq) if record is not None else ''
412
 
413
 
414
  def send_email(receiver, msg):
 
537
  df['Scaffold SMILES'] = df['X1'].swifter.progress_bar(
538
  desc=f"Calculating scaffold...").apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
539
  # Add a new column with RDKit molecule objects
540
+ if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
541
+ PandasTools.AddMoleculeColumnToFrame(df, smilesCol='X1', molCol='Compound',
542
+ includeFingerprints=True)
543
  PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Scaffold SMILES', molCol='Scaffold',
544
+ includeFingerprints=True)
545
  DF_FOR_REPORT = df.copy()
546
 
547
  # pie_chart = None
 
723
  return Chem.MolToSmiles(suppl[0])
724
 
725
 
726
def drug_library_from_sdf(sdf_path):
    """Load an SDF file as a drug-library DataFrame via RDKit ``PandasTools``.

    SMILES strings are written to column ``X1`` and the molecule objects to
    column ``Compound``; fingerprints are included.
    """
    load_options = {
        'smilesName': 'X1',
        'molColName': 'Compound',
        'includeFingerprints': True,
    }
    return PandasTools.LoadSDF(sdf_path, **load_options)
731
+
732
+
733
def process_target_library_upload(library_upload):
    """Load and validate an uploaded custom target library.

    Args:
        library_upload: Path (string) of the uploaded file. ``.csv`` files are
            read with pandas; ``.fasta`` files are converted with
            ``target_library_from_fasta``.

    Returns:
        The loaded DataFrame, which is guaranteed (via ``validate_columns``)
        to contain the mandatory ``X2`` column.

    Raises:
        gr.Error: If the file extension is neither ``.csv`` nor ``.fasta``.
    """
    if library_upload.endswith('.csv'):
        identify_df = pd.read_csv(library_upload)
    elif library_upload.endswith('.fasta'):
        identify_df = target_library_from_fasta(library_upload)
    else:
        raise gr.Error('Currently only CSV and FASTA files are supported as target libraries.')
    validate_columns(identify_df, ['X2'])
    # Return the parsed table, not the path: callers take len() of the result
    # and assign/deduplicate columns on it.
    return identify_df
742
+
743
+
744
def process_drug_library_upload(library_upload):
    """Load and validate an uploaded custom drug library.

    Args:
        library_upload: Path (string) of the uploaded file. ``.csv`` files are
            read with pandas; ``.sdf`` files are converted with
            ``drug_library_from_sdf``.

    Returns:
        The loaded DataFrame, which is guaranteed (via ``validate_columns``)
        to contain the mandatory ``X1`` column.

    Raises:
        gr.Error: If the file extension is neither ``.csv`` nor ``.sdf``.
    """
    if library_upload.endswith('.csv'):
        screen_df = pd.read_csv(library_upload)
    elif library_upload.endswith('.sdf'):
        screen_df = drug_library_from_sdf(library_upload)
    else:
        raise gr.Error('Currently only CSV and SDF files are supported as drug libraries.')
    validate_columns(screen_df, ['X1'])
    # Return the parsed table, not the path: callers take len() of the result
    # and assign/deduplicate columns on it.
    return screen_df
753
+
754
+
755
def target_library_from_fasta(fasta_path):
    """Build a target-library DataFrame from a FASTA file.

    Each FASTA record becomes one row: the record id goes to column ``ID2``
    and the sequence (as a plain string) to column ``X2``.
    """
    rows = [(entry.id, str(entry.seq)) for entry in SeqIO.parse(fasta_path, "fasta")]
    return pd.DataFrame(rows, columns=['ID2', 'X2'])
761
+
762
+
763
  theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
764
  background_fill_primary='#dfe6f0',
765
  background_fill_secondary='#dfe6f0',
 
792
  with gr.Tabs() as tabs:
793
  with gr.TabItem(label='Drug hit screening', id=0):
794
  gr.Markdown('''
795
+ # <center>DeepSEQreen Drug Hit Screening</center>
796
+ <center>
797
+ To predict interactions/binding affinities of a single target against a library of drugs.
798
+ </center>
799
  ''')
800
  with gr.Blocks() as screen_block:
801
  with gr.Column() as screen_page:
 
846
  visible=False)
847
  target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
848
 
849
+ target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5, max_lines=5)
850
  example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
851
 
852
  with gr.Row():
853
  with gr.Column():
854
  drug_library = gr.Dropdown(label='Select a Compound Library',
855
  choices=list(DRUG_LIBRARY_MAP.keys()))
856
+ with gr.Row():
857
+ gr.File(label='Example SDF drug library',
858
+ value='data/examples/drug_library.sdf', interactive=False)
859
+ gr.File(label='Example CSV drug library',
860
+ value='data/examples/drug_library.csv', interactive=False)
861
  drug_library_upload_btn = gr.UploadButton(
862
  label='Upload a custom library', variant='primary')
863
  drug_library_upload = gr.File(label='Custom drug library file', visible=False)
 
871
  drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Select a Preset Model')
872
  screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
873
 
 
874
  # drug_screen_email = gr.Textbox(
875
  # label='Email (optional)',
876
  # info="Your email will be used to send you notifications when your job finishes."
 
890
 
891
  with gr.TabItem(label='Target protein identification', id=1):
892
  gr.Markdown('''
893
+ # <center>DeepSEQreen Target Protein Identification</center>
894
+
895
+ <center>
896
+ To predict interactions/binding affinities of a single drug against a library of protein targets.
897
+ </center>
898
+
899
+ ℹ️ A custom target library can be a FASTA file with a single or multiple amino acid sequences,
900
+ or a CSV file with a required FASTA string column and optionally an ID column:
901
+
902
+ <b>X2</b>: the FASTA sequence of a target\n
903
+ <b>ID2</b> (optional): the ID (UniProt or any arbitrary unique identifier) of a target\n
904
+
905
+ Example CSV target library:
906
+
907
+ | X2 | ID2 |
908
+ |---------------|--------|
909
+ | MVQKSRNGGV... | O88943 |
910
+ | MTSPSSSPVF... | Q9Y5S1 |
911
  ''')
912
  with gr.Blocks() as identify_block:
913
  with gr.Column() as identify_page:
 
928
  interactive=True)
929
  compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
930
 
931
+ target_identify_target_family = gr.Dropdown(choices=['General'], value='General',
932
+ label='Target Protein Family')
 
933
 
934
  compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
935
  example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
 
938
  with gr.Column():
939
  target_library = gr.Dropdown(label='Select a Target Library',
940
  choices=list(TARGET_LIBRARY_MAP.keys()))
941
+ with gr.Row():
942
+ gr.File(label='Example FASTA target library',
943
+ value='data/examples/target_library.fasta', interactive=False)
944
+ gr.File(label='Example CSV target library',
945
+ value='data/examples/target_library.csv', interactive=False)
946
  target_library_upload_btn = gr.UploadButton(
947
  label='Upload a custom library', variant='primary')
948
  target_library_upload = gr.File(label='Custom target library file', visible=False)
 
957
  target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
958
  identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
959
 
 
960
  # with gr.Row():
961
  # target_identify_email = gr.Textbox(
962
  # label='Email (optional)',
 
976
  gr.Markdown('''
977
  # <center>DeepSEQreen Interaction Pair Inference</center>
978
  <center>To predict interactions/binding affinities between any drug-target pairs.</center>
979
+
980
+ ℹ️ A custom interaction pair dataset can be generated from a FASTA file containing multiple sequences
981
+ and a SDF file containing multiple compounds (for predicting DTI/DTA of all possible combinations of
982
+ drug-target pairs), or a CSV file with 2 required string columns and optionally 2 ID columns:
 
 
983
 
984
  <b>X1</b>: the SMILES string of a compound\n
985
  <b>X2</b>: the FASTA sequence of a target\n
986
+ <b>ID1</b> (optional): the ID (PubChem or any arbitrary unique identifier) of a compound\n
987
+ <b>ID2</b> (optional): the ID (UniProt or any arbitrary unique identifier) of a target
988
 
989
+ Example CSV interaction pair dataset:
990
 
991
  | X1 | X2 | ID1 | ID2 |
992
  |---------------------------------------- |---------------|--------------|--------|
993
  | CCOC(=O)Nc1ccc(NCc2ccc(F)cc2)cc1N | MVQKSRNGGV... | CHEMBL41355 | O88943 |
994
  | CCCCCc1cc(O)c(C/C=C(\C)CCC=C(C)C)c(O)c1 | MTSPSSSPVF... | CHEMBL497318 | Q9Y5S1 |
995
+ ''')
996
+ with gr.Blocks() as infer_block:
997
+ with gr.Column() as infer_page:
998
+ infer_type = gr.Dropdown(choices=['Upload a drug library and a target library',
999
+ 'Upload a CSV interaction pair dataset'],
1000
+ value='Upload a drug library and a target library')
1001
+ with gr.Column() as pair_upload:
1002
  gr.File(label="Example custom dataset",
1003
  value="data/examples/interaction_pair_inference.csv",
1004
  interactive=False)
1005
  with gr.Column():
1006
  infer_data_for_predict = gr.File(
1007
+ label='Upload a custom dataset', file_count="single", type='filepath', visible=True)
1008
  with gr.Column() as pair_generate:
1009
+ with gr.Row():
1010
+ gr.File(label='Example SDF drug library',
1011
+ value='data/examples/drug_library.sdf', interactive=False)
1012
+ gr.File(label='Example FASTA target library',
1013
+ value='data/examples/target_library.fasta', interactive=False)
1014
+ with gr.Row():
1015
+ gr.File(label='Example CSV drug library',
1016
+ value='data/examples/drug_library.csv', interactive=False)
1017
+ gr.File(label='Example CSV target library',
1018
+ value='data/examples/target_library.csv', interactive=False)
1019
+ with gr.Row():
1020
+ infer_drug = gr.File(label='SDF/CSV file containing multiple compounds',
1021
+ file_count="single", type='filepath')
1022
+ infer_target = gr.File(label='FASTA/CSV file containing multiple targets',
1023
+ file_count="single", type='filepath')
1024
 
1025
  with gr.Row(visible=True):
1026
  pair_infer_task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
 
1050
  # <center>DeepSEQreen Chemical Property Report</center>
1051
  <center>
1052
  To compute chemical properties for the predictions of drug hit screening,
1053
+ target protein identification, and interaction pair inference.
1054
+
1055
+ You may also upload
1056
  your own dataset. The page shows only a preview report displaying at most 30 records
1057
  (with top predicted DTI/DTA if reporting results from a prediction job). For a full report, please
1058
  generate and download a raw data CSV or interactive table HTML file below.
 
1129
  case 'UniProt ID':
1130
  query = f"{uid.strip()}.fasta"
1131
  case 'Gene symbol':
1132
+ organism = organism if organism else 'Human'
1133
  query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
1134
 
1135
  try:
 
1159
  desc="Detecting protein family of the target...").apply(align_score)
1160
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1161
  return gr.Dropdown(value=row['protein_family'].capitalize(),
1162
+ info=f"Reason: Best BLASTP score ({row['score']}) "
1163
+ f"with {row['ID2']} from family {row['protein_family']}")
1164
 
1165
 
1166
  target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)
 
1261
  x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
1262
  ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
1263
 
1264
+
1265
  def identify_recommend_model(smiles, task):
1266
  if task == 'Drug-target interaction':
1267
  train = pd.read_csv('data/benchmarks/all_families_reduced_dti_train.csv')
 
1291
  outputs=target_identify_preset)
1292
 
1293
 
1294
def infer_type_change(upload_type):
    """Switch the pair-inference input panel to match the chosen upload mode.

    Shows either the library-pair panel (``pair_generate``) or the CSV dataset
    panel (``pair_upload``) and clears all three file inputs. Returns ``None``
    when ``upload_type`` is not one of the two known choices.
    """
    if upload_type == "Upload a drug library and a target library":
        show_upload, show_generate = False, True
    elif upload_type == "Upload a CSV interaction pair dataset":
        show_upload, show_generate = True, False
    else:
        return None

    return {
        pair_upload: gr.Column(visible=show_upload),
        pair_generate: gr.Column(visible=show_generate),
        # Reset previously selected files whenever the mode changes.
        infer_data_for_predict: None,
        infer_drug: None,
        infer_target: None,
    }
1313
+
1314
+
1315
+ infer_type.select(fn=infer_type_change, inputs=infer_type,
1316
+ outputs=[pair_upload, pair_generate, infer_data_for_predict, infer_drug, infer_target])
1317
+
1318
+
1319
  def drug_screen_validate(fasta, library, library_upload, state, progress=gr.Progress(track_tqdm=True)):
1320
  if not state:
1321
  try:
 
1326
  if library in DRUG_LIBRARY_MAP.keys():
1327
  screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
1328
  else:
1329
+ screen_df = process_drug_library_upload(library_upload)
1330
+ if len(screen_df) >= CUSTOM_DATASET_MAX_LEN:
1331
+ raise gr.Error(f'The uploaded drug library has more records '
1332
+ f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
 
 
 
 
1333
 
1334
  screen_df['X2'] = fasta
1335
 
 
1365
  if library in TARGET_LIBRARY_MAP.keys():
1366
  identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
1367
  else:
1368
+ identify_df = process_target_library_upload(library_upload)
1369
+ if len(identify_df) >= CUSTOM_DATASET_MAX_LEN:
1370
+ raise gr.Error(f'The uploaded target library has more records '
1371
+ f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
 
 
 
 
 
 
 
1372
  identify_df['X1'] = smiles
1373
 
1374
  job_id = uuid4()
 
1394
  # return {identify_flag: False}
1395
 
1396
 
1397
+ def pair_infer_validate(drug_target_pair_upload, drug_upload, target_upload, state,
1398
+ progress=gr.Progress(track_tqdm=True)):
1399
  if not state:
1400
  try:
1401
+ job_id = uuid4()
1402
+ if drug_target_pair_upload:
1403
+ infer_df = pd.read_csv(drug_target_pair_upload)
1404
+ validate_columns(infer_df, ['X1', 'X2'])
1405
+
1406
+ infer_df['X1_ERR'] = infer_df['X1'].swifter.progress_bar(desc="Validating SMILES...").apply(
1407
+ validate_seq_str, regex=SMILES_PAT)
1408
+ if not infer_df['X1_ERR'].isna().all():
1409
+ raise ValueError(
1410
+ f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
1411
+
1412
+ infer_df['X2_ERR'] = infer_df['X2'].swifter.progress_bar(desc="Validating FASTA...").apply(
1413
+ validate_seq_str, regex=FASTA_PAT)
1414
+ if not infer_df['X2_ERR'].isna().all():
1415
+ raise ValueError(
1416
+ f"Encountered invalid FASTA:\n{infer_df[~infer_df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
1417
+
1418
+ return {infer_data_for_predict: str(drug_target_pair_upload),
1419
+ infer_flag: job_id,
1420
+ run_state: job_id}
1421
 
1422
+ elif drug_upload and target_upload:
1423
+ drug_df = process_drug_library_upload(drug_upload)
1424
+ target_df = process_target_library_upload(target_upload)
 
1425
 
1426
+ drug_df.drop_duplicates(subset=['X1'], inplace=True)
1427
+ target_df.drop_duplicates(subset=['X2'], inplace=True)
1428
+
1429
+ infer_df = pd.DataFrame(list(itertools.product(drug_df['X1'], target_df['X2'])),
1430
+ columns=['X1', 'X2'])
1431
+ infer_df = infer_df.merge(drug_df, on='X1').merge(target_df, on='X2')
1432
+
1433
+ temp_file = Path(f'temp/{job_id}_input.csv').resolve()
1434
+ infer_df.to_csv(temp_file, index=False)
1435
+ if temp_file.is_file():
1436
+ return {infer_data_for_predict: str(temp_file),
1437
+ infer_flag: job_id,
1438
+ run_state: job_id}
1439
+
1440
+ else:
1441
+ raise gr.Error('Should upload a drug-target pair dataset,or '
1442
+ 'upload both a drug library and a target library.')
1443
+
1444
+ if len(infer_df) >= CUSTOM_DATASET_MAX_LEN:
1445
+ raise gr.Error(f'The uploaded/generated drug-target pair dataset has more records '
1446
+ f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
1447
 
 
 
 
1448
  except Exception as e:
1449
  gr.Warning(f'Failed to submit the job due to error: {str(e)}')
1450
  return {infer_flag: False,
 
1495
 
1496
  pair_infer_btn.click(
1497
  fn=pair_infer_validate,
1498
+ inputs=[infer_data_for_predict, infer_drug, infer_target, run_state], # , drug_screen_email],
1499
+ outputs=[infer_data_for_predict, infer_flag, run_state]
1500
  ).then(
1501
  fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
1502
  outputs=[infer_page, infer_waiting]