libokj committed on
Commit
51a0841
·
1 Parent(s): 41c7c53

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -76
app.py CHANGED
@@ -29,7 +29,7 @@ from rdkit.DataStructs import BulkTanimotoSimilarity
29
  from requests.adapters import HTTPAdapter, Retry
30
  from markdown import markdown
31
  from rdkit import Chem, DataStructs
32
- from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen, AllChem
33
  from rdkit.Chem.Scaffolds import MurckoScaffold
34
  import seaborn as sns
35
 
@@ -231,6 +231,16 @@ COLUMN_ALIASES = {
231
  'Y^': 'Predicted CPI/CPA',
232
  }
233
 
 
 
 
 
 
 
 
 
 
 
234
  pd.set_option('display.float_format', '{:.3f}'.format)
235
  PandasTools.molRepresentation = 'svg'
236
  PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
@@ -277,6 +287,15 @@ def check_expiry():
277
  send_email(job)
278
 
279
 
 
 
 
 
 
 
 
 
 
280
  def max_tanimoto_similarity(smi, seen_smiles_with_fp):
281
  if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
282
  return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
@@ -294,7 +313,7 @@ def max_tanimoto_similarity(smi, seen_smiles_with_fp):
294
  return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
295
 
296
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
297
- sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
298
  idx = sims.argmax()
299
  compound = seen_smiles_with_fp.iloc[idx]['X1']
300
  if 'ID1' in seen_smiles_with_fp.columns:
@@ -658,6 +677,7 @@ def lookup_job(job_id):
658
  retry = 0
659
  while not stop:
660
  try:
 
661
  Job = Query()
662
  jobs = db.search((Job.id == job_id))
663
  if jobs:
@@ -671,7 +691,6 @@ def lookup_job(job_id):
671
  if job.get('expiry_time'):
672
  expiry_time = ts_to_str(job['expiry_time'], get_timezone_by_ip(job['ip']))
673
  if job_status == "RUNNING":
674
- sleep(5)
675
  yield {
676
  pred_lookup_status: f'''
677
  Your **{job_type}** job (ID: **{job_id}**) started at
@@ -713,10 +732,12 @@ using the job id. You will also receive an email notification once the job is do
713
  }
714
  else:
715
  stop = (retry > 3)
716
- msg = f'Job ID {job_id} not found. Retrying... ({retry})'
 
 
 
717
  gr.Info(msg)
718
  retry += 1
719
- sleep(5)
720
  yield {
721
  pred_lookup_status: msg,
722
  pred_lookup_btn: gr.Button(visible=True),
@@ -864,10 +885,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
864
  if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
865
  for family in prediction_df['Target Family'].unique():
866
  family_smiles_df = get_seen_smiles(family=family, task=task_value)
867
- family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
868
- lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
869
- Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
870
- )
871
 
872
  @cache
873
  def max_sim(smi):
@@ -881,12 +899,13 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
881
 
882
  if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
883
  x2 = prediction_df['X2'].iloc[0]
884
- pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)]
885
- pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(
886
- lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
887
- Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
888
- )
889
- max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=pos_compounds_df))
 
890
  prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
891
  prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
892
  )
@@ -904,6 +923,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
904
  prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
905
  prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
906
  )
 
907
 
908
  calculate_max_sequence_identity.cache_clear()
909
 
@@ -946,7 +966,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
946
  'error': error,
947
  'input_file': predict_filepath,
948
  'output_file': predictions_file},
949
- job_query)
950
  if job_info := db.search(job_query)[0]:
951
  if job_info.get('email'):
952
  send_email(job_info)
@@ -981,21 +1001,6 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
981
  if 'Y^' in df.columns:
982
  df['Y^'] = 10 ** (-df['Y^'])
983
 
984
- # DF_FOR_REPORT = df.copy()
985
-
986
- # pie_chart = None
987
- # value = None
988
- # if 'Y^' in DF_FOR_REPORT.columns:
989
- # value = 'Y^'
990
- # elif 'Y' in DF_FOR_REPORT.columns:
991
- # value = 'Y'
992
-
993
- # if value:
994
- # if DF_FOR_REPORT['X1'].nunique() > 1 >= DF_FOR_REPORT['X2'].nunique():
995
- # pie_chart = create_pie_chart(DF_FOR_REPORT, category='Scaffold SMILES', value=value, top_k=100)
996
- # elif DF_FOR_REPORT['X2'].nunique() > 1 >= DF_FOR_REPORT['X1'].nunique():
997
- # pie_chart = create_pie_chart(DF_FOR_REPORT, category='Target family', value=value, top_k=100)
998
-
999
  return {html_report: create_html_report(df, file=None, task=task),
1000
  raw_df: df,
1001
  report_df: df.copy(),
@@ -1121,23 +1126,30 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1121
  f'<div style="overflow:auto; height: 300px; font-family: Courier !important;">{table_html}</div>')
1122
 
1123
  else:
 
 
 
 
 
 
 
 
 
 
 
 
1124
  bool_formatters = {col: BooleanFormatter() for col in bool_cols}
1125
  float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
1126
  other_formatters = {
1127
  'Predicted Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
1128
  'Actual Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
1129
- 'Compound': HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>'),
1130
- 'Scaffold': HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>'),
1131
  'Target FASTA': {'type': 'textarea', 'width': 60},
1132
- 'Target ID': HTMLTemplateFormatter(
1133
- template='<a href="<% '
1134
- 'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
1135
- '{ %>https://www.uniprot.org/uniprotkb/<%= value %><% } '
1136
- 'else { %>https://www.uniprot.org/uniprotkb?query=<%= value %><% } '
1137
- '%>" target="_blank"><%= value %></a>'),
1138
- 'Compound ID': HTMLTemplateFormatter(
1139
- template='<a href="https://pubchem.ncbi.nlm.nih.gov/compound/<%= value %>" '
1140
- 'target="_blank"><%= value %></a>')
1141
  }
1142
  formatters = {**bool_formatters, **float_formatters, **other_formatters}
1143
 
@@ -1161,8 +1173,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1161
  subset=df_html.columns == col, cmap=sns.light_palette(num_col_colors[i], as_cmap=True).reversed())
1162
 
1163
  pie_charts = {}
1164
- for y in df_html.columns.intersection(['Predicted Interaction Probability', 'Actual Interaction Probability',
1165
- 'Predicted Binding Affinity', 'Actual Binding Affinity']):
1166
  pie_charts[y] = []
1167
  for k in [10, 30, 100]:
1168
  if k < len(df_html):
@@ -1348,19 +1359,6 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
1348
  df_report[score_name] = df_report['Compound'].parallel_apply(
1349
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1350
 
1351
- # pie_chart = None
1352
- # value = None
1353
- # if 'Y^' in df.columns:
1354
- # value = 'Y^'
1355
- # elif 'Y' in df.columns:
1356
- # value = 'Y'
1357
- #
1358
- # if value:
1359
- # if df['X1'].nunique() > 1 >= df['X2'].nunique():
1360
- # pie_chart = create_pie_chart(df, category='Scaffold SMILES', value=value, top_k=100)
1361
- # elif df['X2'].nunique() > 1 >= df['X1'].nunique():
1362
- # pie_chart = create_pie_chart(df, category='Target family', value=value, top_k=100)
1363
-
1364
  return (create_html_report(df_report, file=None, task=task), df_report,
1365
  gr.File(visible=False), gr.File(visible=False))
1366
 
@@ -1572,17 +1570,13 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1572
 
1573
  drug_screen_opts = gr.CheckboxGroup(
1574
  label="Step 6. Select Additional Options",
1575
- choices=[
1576
- 'Include Compound Max. Tanimoto Similarity to Training Compounds',
1577
- 'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
1578
- 'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
1579
- ],
1580
- info="These are experimental features and may increase the job computation time. "
1581
  "Compound Max. Tanimoto Similarity to Training Compounds and "
1582
  "Target Max. Sequence Identity to Known Interacting Targets of Compound "
1583
- "are indicative of the predictive reliability of the model (the higher the more reliable), "
1584
- "while Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
1585
- "is indicative of the novelty of the compound (the lower the more novel)."
1586
  )
1587
  with gr.Row():
1588
  with gr.Column():
@@ -1687,9 +1681,9 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1687
  target_identify_opts = gr.CheckboxGroup(
1688
  ['Include Target Max. Sequence Identity to Training Targets'],
1689
  label='Step 6. Select Additional Options',
1690
- info="These are experimental features and may increase the job computation time. "
1691
- "Target Max. Sequence Identity to Training Targets is indicative of the "
1692
- "predictive reliability of the model (the higher the more reliable)."
1693
  )
1694
  with gr.Row():
1695
  with gr.Column():
@@ -2015,10 +2009,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
2015
  ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
2016
 
2017
  drug_screen_task.select(
2018
- fn=lambda task, opts: [opt for opt in opts if opt not in [
2019
- 'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
2020
- 'Include Target Max. Sequence Identity to Known Interacting Targets of Compound'
2021
- ]] if task == 'Compound-Protein Binding Affinity' else opts,
2022
  inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
2023
  show_progress='hidden'
2024
  )
@@ -2089,9 +2081,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2089
  scenario = "Unseen Target (<0.85 sequence identity)"
2090
 
2091
  return {drug_screen_preset:
2092
- gr.Dropdown(value=row['Model'],
2093
- info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
2094
- f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
2095
  drug_screen_target_family:
2096
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
2097
 
 
29
  from requests.adapters import HTTPAdapter, Retry
30
  from markdown import markdown
31
  from rdkit import Chem, DataStructs
32
+ from rdkit.Chem import AllChem, Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen
33
  from rdkit.Chem.Scaffolds import MurckoScaffold
34
  import seaborn as sns
35
 
 
231
  'Y^': 'Predicted CPI/CPA',
232
  }
233
 
234
+ DRUG_SCRENN_CPI_OPTS = [
235
+ 'Include Compound Max. Tanimoto Similarity to Training Compounds',
236
+ 'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
237
+ 'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
238
+ ]
239
+
240
+ DRUG_SCRENN_CPA_OPTS = [
241
+ 'Include Compound Max. Tanimoto Similarity to Training Compounds',
242
+ ]
243
+
244
  pd.set_option('display.float_format', '{:.3f}'.format)
245
  PandasTools.molRepresentation = 'svg'
246
  PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
 
287
  send_email(job)
288
 
289
 
290
def smiles_to_ecfp(smiles):
    """Convert a SMILES string into a 2048-bit, radius-2 Morgan fingerprint.

    Args:
        smiles: SMILES string of a compound. Non-string values (``None``,
            ``NaN`` from an empty DataFrame cell, numbers, ...) are accepted
            and treated as "no structure available".

    Returns:
        The value of ``AllChem.GetMorganFingerprintAsBitVect(mol, radius=2,
        nBits=2048)`` for the parsed molecule, or an empty list when
        ``smiles`` is not a string or cannot be parsed into a molecule.
    """
    # Mirror the `smi is None` guard used by max_tanimoto_similarity: a missing
    # value must map to the same "no fingerprint" sentinel as an unparseable
    # SMILES rather than being handed to RDKit.
    # NOTE(review): MolFromSmiles is expected to reject non-string input with
    # an exception instead of returning None - confirm against the RDKit docs.
    if not isinstance(smiles, str):
        return []

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # Parsing failed; keep the original empty-list sentinel so callers
        # that already expect it keep working.
        return []
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
297
+
298
+
299
  def max_tanimoto_similarity(smi, seen_smiles_with_fp):
300
  if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
301
  return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
 
313
  return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
314
 
315
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
316
+ sims = pd.Series(BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'].values)).to_numpy()
317
  idx = sims.argmax()
318
  compound = seen_smiles_with_fp.iloc[idx]['X1']
319
  if 'ID1' in seen_smiles_with_fp.columns:
 
677
  retry = 0
678
  while not stop:
679
  try:
680
+ sleep(5)
681
  Job = Query()
682
  jobs = db.search((Job.id == job_id))
683
  if jobs:
 
691
  if job.get('expiry_time'):
692
  expiry_time = ts_to_str(job['expiry_time'], get_timezone_by_ip(job['ip']))
693
  if job_status == "RUNNING":
 
694
  yield {
695
  pred_lookup_status: f'''
696
  Your **{job_type}** job (ID: **{job_id}**) started at
 
732
  }
733
  else:
734
  stop = (retry > 3)
735
+ if not stop:
736
+ msg = f'Job ID {job_id} not found. Retrying... ({retry})'
737
+ else:
738
+ msg = f'Job ID {job_id} not found after {retry} retries. Please check the job ID and try again.'
739
  gr.Info(msg)
740
  retry += 1
 
741
  yield {
742
  pred_lookup_status: msg,
743
  pred_lookup_btn: gr.Button(visible=True),
 
885
  if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
886
  for family in prediction_df['Target Family'].unique():
887
  family_smiles_df = get_seen_smiles(family=family, task=task_value)
888
+ family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(smiles_to_ecfp)
 
 
 
889
 
890
  @cache
891
  def max_sim(smi):
 
899
 
900
  if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
901
  x2 = prediction_df['X2'].iloc[0]
902
+ pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
903
+ pos_compounds_df['FP'] = pos_compounds_df['X1'].apply(smiles_to_ecfp)
904
+
905
+ @cache
906
+ def max_sim(smiles):
907
+ return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
908
+
909
  prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
910
  prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
911
  )
 
923
  prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
924
  prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
925
  )
926
+ prediction_df.drop(['N'], axis=1, inplace=True)
927
 
928
  calculate_max_sequence_identity.cache_clear()
929
 
 
966
  'error': error,
967
  'input_file': predict_filepath,
968
  'output_file': predictions_file},
969
+ job_query)
970
  if job_info := db.search(job_query)[0]:
971
  if job_info.get('email'):
972
  send_email(job_info)
 
1001
  if 'Y^' in df.columns:
1002
  df['Y^'] = 10 ** (-df['Y^'])
1003
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1004
  return {html_report: create_html_report(df, file=None, task=task),
1005
  raw_df: df,
1006
  report_df: df.copy(),
 
1126
  f'<div style="overflow:auto; height: 300px; font-family: Courier !important;">{table_html}</div>')
1127
 
1128
  else:
1129
+ image_zoom_formatter = HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>')
1130
+ uniprot_id_formatter = HTMLTemplateFormatter(
1131
+ template='<a href="<% '
1132
+ 'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
1133
+ '{ %>https://www.uniprot.org/uniprotkb/<%= value %><% } '
1134
+ 'else { %><textarea style="width: 60ch;"><%= value %></textarea><% } '
1135
+ '%>" target="_blank"><%= value %></a>'
1136
+ )
1137
+ pubchem_id_formatter = HTMLTemplateFormatter(
1138
+ template='<a href="https://pubchem.ncbi.nlm.nih.gov/#query=<%= value %>" '
1139
+ 'target="_blank"><%= value %></a>'
1140
+ )
1141
  bool_formatters = {col: BooleanFormatter() for col in bool_cols}
1142
  float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
1143
  other_formatters = {
1144
  'Predicted Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
1145
  'Actual Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
1146
+ 'Compound': image_zoom_formatter,
1147
+ 'Scaffold': image_zoom_formatter,
1148
  'Target FASTA': {'type': 'textarea', 'width': 60},
1149
+ 'Target ID': uniprot_id_formatter,
1150
+ 'Compound ID': pubchem_id_formatter,
1151
+ 'Max. Sequence Identity Target': uniprot_id_formatter,
1152
+ 'Max. Tanimoto Similarity Compound': pubchem_id_formatter,
 
 
 
 
 
1153
  }
1154
  formatters = {**bool_formatters, **float_formatters, **other_formatters}
1155
 
 
1173
  subset=df_html.columns == col, cmap=sns.light_palette(num_col_colors[i], as_cmap=True).reversed())
1174
 
1175
  pie_charts = {}
1176
+ for y in df_html.columns.intersection(['Interaction Probability', 'Binding Affinity']):
 
1177
  pie_charts[y] = []
1178
  for k in [10, 30, 100]:
1179
  if k < len(df_html):
 
1359
  df_report[score_name] = df_report['Compound'].parallel_apply(
1360
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1362
  return (create_html_report(df_report, file=None, task=task), df_report,
1363
  gr.File(visible=False), gr.File(visible=False))
1364
 
 
1570
 
1571
  drug_screen_opts = gr.CheckboxGroup(
1572
  label="Step 6. Select Additional Options",
1573
+ choices=DRUG_SCRENN_CPI_OPTS,
1574
+ info="Experimental features - may increase the job computation time. "
 
 
 
 
1575
  "Compound Max. Tanimoto Similarity to Training Compounds and "
1576
  "Target Max. Sequence Identity to Known Interacting Targets of Compound "
1577
+ "suggest predictive reliability (higher values - greater reliability). "
1578
+ "Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
1579
+ "suggests the novelty of the compound (lower values - higher novelty)."
1580
  )
1581
  with gr.Row():
1582
  with gr.Column():
 
1681
  target_identify_opts = gr.CheckboxGroup(
1682
  ['Include Target Max. Sequence Identity to Training Targets'],
1683
  label='Step 6. Select Additional Options',
1684
+ info="Experimental features - may increase the job computation time. "
1685
+ "Target Max. Sequence Identity to Training Targets suggests "
1686
+ "predictive reliability (higher values - greater reliability)."
1687
  )
1688
  with gr.Row():
1689
  with gr.Column():
 
2009
  ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
2010
 
2011
  drug_screen_task.select(
2012
+ fn=lambda task, opts: gr.CheckboxGroup(choices=DRUG_SCRENN_CPA_OPTS)
2013
+ if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup(choices=DRUG_SCRENN_CPI_OPTS),
 
 
2014
  inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
2015
  show_progress='hidden'
2016
  )
 
2081
  scenario = "Unseen Target (<0.85 sequence identity)"
2082
 
2083
  return {drug_screen_preset:
2084
+ gr.Dropdown(value=row['Model'],
2085
+ info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
2086
+ f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
2087
  drug_screen_target_family:
2088
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
2089