libokj committed on
Commit
f0342df
·
1 Parent(s): c359763

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -186
app.py CHANGED
@@ -45,7 +45,7 @@ import panel as pn
45
  from apscheduler.schedulers.background import BackgroundScheduler
46
  from tinydb import TinyDB, Query
47
 
48
- # import swifter
49
  from tqdm.auto import tqdm
50
 
51
  from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
@@ -156,7 +156,6 @@ display: inline-block !important;
156
  footer {
157
  visibility: hidden
158
  }
159
-
160
  """
161
 
162
 
@@ -192,7 +191,11 @@ def rgb_to_hex(rgb):
192
  def mol_to_pharm3d(mol, mode='html'):
193
  if mol is None:
194
  return
195
- AllChem.Compute2DCoords(mol)
 
 
 
 
196
 
197
  feats = FEAT_FACTORY.GetFeaturesForMol(mol)
198
 
@@ -291,13 +294,23 @@ COLUMN_ALIASES = {
291
  }
292
 
293
  DRUG_SCRENN_CPI_OPTS = [
294
- 'Include Compound Max. Tanimoto Similarity to Training Compounds',
295
- 'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
296
- 'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
297
  ]
298
 
299
  DRUG_SCRENN_CPA_OPTS = [
300
- 'Include Compound Max. Tanimoto Similarity to Training Compounds',
 
 
 
 
 
 
 
 
 
 
301
  ]
302
 
303
  pd.set_option('display.float_format', '{:.3f}'.format)
@@ -383,6 +396,13 @@ def max_tanimoto_similarity(smi, seen_smiles_with_fp):
383
  return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
384
 
385
 
 
 
 
 
 
 
 
386
  def max_sequence_identity(seq, seen_fastas):
387
  if seq is None or seen_fastas is None or seen_fastas.empty:
388
  return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}
@@ -395,20 +415,12 @@ def max_sequence_identity(seq, seen_fastas):
395
  target = id2
396
  return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
397
 
398
- aligner = PairwiseAligner()
399
- aligner.mode = 'local'
400
  max_iden = 0
401
  target = None
402
  for fasta in seen_fastas['X2'].values:
403
- alignment = aligner.align(seq, fasta)
404
- identity = alignment.score / max(len(seq), len(fasta))
405
- if identity == 1:
406
- target = fasta
407
- if 'ID2' in seen_fastas.columns:
408
- id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
409
- if pd.notnull(id2) and id2 != '':
410
- target = id2
411
- return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
412
  if identity > max_iden:
413
  max_iden = identity
414
  target = fasta
@@ -416,7 +428,10 @@ def max_sequence_identity(seq, seen_fastas):
416
  id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
417
  if pd.notnull(id2) and id2 != '':
418
  target = id2
 
 
419
 
 
420
  return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
421
 
422
 
@@ -846,12 +861,12 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
846
  orig_df['Target Family'] = None
847
  if orig_df['Target Family'].isna().any():
848
  orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
849
- orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
850
  )
851
  orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
852
  detect_family.cache_clear()
853
 
854
- orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize)
855
 
856
  orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
857
  annotated_df = orig_df[~orig_df['Y'].isna()].copy()
@@ -952,66 +967,88 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
952
  df_list = [prediction_df, annotated_df]
953
  prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
954
 
955
- if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
956
- for family in prediction_df['Target Family'].unique():
957
- family_smiles_df = get_seen_smiles(family=family, task=task_value)
958
- family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(smiles_to_ecfp)
959
-
960
- @cache
961
- def max_sim(smi):
962
- return max_tanimoto_similarity(smi, family_smiles_df)['Max. Tanimoto Similarity']
963
 
964
- prediction_df.loc[
965
- prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
966
- prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
967
- )
968
- max_sim.cache_clear()
969
 
970
- if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
971
  x2 = prediction_df['X2'].iloc[0]
972
  pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
973
- pos_compounds_df['FP'] = pos_compounds_df['X1'].apply(smiles_to_ecfp)
974
 
975
  @cache
976
  def max_sim(smiles):
977
  return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
978
 
979
- prediction_df[['Max. Tanimoto Similarity to Known Target Ligands',
980
- 'Max. Tanimoto Similarity Target Ligand']] = (
981
- prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
982
- )
 
983
  max_sim.cache_clear()
984
 
985
- if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
986
  x2 = prediction_df['X2'].iloc[0]
987
- prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
988
 
989
  @cache
990
- def calculate_max_sequence_identity(compound):
991
- compound_targets = df_training.loc[df_training['X1'] == compound]
992
- return max_sequence_identity(x2, seen_fastas=compound_targets)
993
 
994
- prediction_df[['Max. Sequence Identity to Known Ligand Targets',
995
- 'Max. Sequence Identity Ligand Target']] = (
996
- prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
997
  )
998
  prediction_df.drop(['X1^'], axis=1, inplace=True)
999
 
1000
- calculate_max_sequence_identity.cache_clear()
1001
 
1002
- if "Include Target Max. Sequence Identity to Training Targets" in opts:
1003
- for family in prediction_df['Target Family'].unique():
1004
- family_fastas_df = get_seen_fastas(family=family, task=task_value)
 
1005
 
1006
- @cache
1007
- def max_id(seq):
1008
- return max_sequence_identity(seq, seen_fastas=family_fastas_df)['Max. Sequence Identity']
 
1009
 
1010
- prediction_df.loc[
1011
- prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
1012
- prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
1013
- )
1014
- max_id.cache_clear()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1015
 
1016
  prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
1017
  status = "COMPLETED"
@@ -1063,10 +1100,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
1063
 
1064
  if 'X1' in df.columns:
1065
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
1066
- df['Compound'] = df['X1'].parallel_apply(
1067
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
1068
- df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
1069
- df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
1070
 
1071
  if task == 'Compound-Protein Binding Affinity':
1072
  # Convert Y^ from pIC50 to IC50
@@ -1114,17 +1151,17 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1114
  columns_unique = None
1115
 
1116
  if 'Exclude Pharmacophore 3D' not in opts:
1117
- df_html['Pharmacophore'] = df_html['Compound'].parallel_apply(
1118
  lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
1119
 
1120
  if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
1121
- df_html['Compound'] = df_html['Compound'].parallel_apply(
1122
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1123
  else:
1124
  df_html.drop(['Compound'], axis=1, inplace=True)
1125
 
1126
  if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
1127
- df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
1128
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1129
  else:
1130
  df_html.drop(['Scaffold'], axis=1, inplace=True)
@@ -1138,15 +1175,20 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1138
  if any(col in df_html.columns for col in ['Y^', 'Y']):
1139
  job = 'Target Protein Identification'
1140
  category = 'Target Family'
1141
- columns_unique = df_html.columns.isin(['X1', 'ID1', 'Scaffold', 'Compound', 'Scaffold SMILES']
1142
- + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys()))
 
 
 
1143
 
1144
  elif n_compound >= 2 and n_protein == 1:
1145
  unique_entity = 'Target of Interest'
1146
  if any(col in df_html.columns for col in ['Y^', 'Y']):
1147
  job = 'Drug Hit Screening'
1148
  category = 'Scaffold SMILES'
1149
- columns_unique = df_html.columns.isin(['X2', 'ID2'])
 
 
1150
 
1151
  elif 'Y^' in df_html.columns:
1152
  job = 'Interaction Pair Inference'
@@ -1154,7 +1196,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1154
  df_html.rename(columns=column_aliases, inplace=True)
1155
  df_html.index.name = 'Index'
1156
  if 'Target FASTA' in df_html.columns:
1157
- df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
1158
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1159
 
1160
  num_cols = df_html.select_dtypes('number').columns
@@ -1172,7 +1214,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1172
  if 'Target ID' in df_html.columns:
1173
  df_html.drop(['Target FASTA'], axis=1, inplace=True)
1174
  if 'Target FASTA' in df_html.columns:
1175
- df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
1176
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1177
  if 'Scaffold SMILES' in df_html.columns:
1178
  df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
@@ -1248,9 +1290,9 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1248
 
1249
  report_table = pn.widgets.Tabulator(
1250
  df_html, formatters=formatters,
1251
- frozen_columns=[col for col in df_html.columns if col in [
1252
- 'Target ID', 'Compound ID', 'Compound'
1253
- ]],
1254
  disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
1255
 
1256
  for i, col in enumerate(num_cols):
@@ -1279,71 +1321,15 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1279
  # Remove keys with empty values
1280
  pie_charts = {k: v for k, v in pie_charts.items() if any(v)}
1281
 
1282
- pn_css = """
1283
- .tabulator {
1284
- font-family: Courier New !important;
1285
- font-weight: normal !important;
1286
- font-size: 12px !important;
1287
- }
1288
-
1289
- .tabulator-cell {
1290
- overflow: visible !important;
1291
- }
1292
-
1293
- .tabulator-cell:hover {
1294
- z-index: 1000 !important;
1295
- }
1296
-
1297
- .tabulator-cell.tabulator-frozen:hover {
1298
- z-index: 1000 !important;
1299
- }
1300
-
1301
- .image-zoom-viewer {
1302
- display: inline-block;
1303
- overflow: visible;
1304
- z-index: 1000;
1305
- }
1306
-
1307
- .image-zoom-viewer::after {
1308
- content: "";
1309
- top: 0;
1310
- left: 0;
1311
- width: 100%;
1312
- height: 100%;
1313
- pointer-events: none;
1314
- }
1315
-
1316
- .image-zoom-viewer:hover::after {
1317
- pointer-events: all;
1318
- }
1319
-
1320
- /* When hovering over the container, scale its child (the SVG) */
1321
- .tabulator-cell:hover .image-zoom-viewer svg {
1322
- padding: 3px;
1323
- position: absolute;
1324
- background-color: rgba(250, 250, 250, 0.854);
1325
- box-shadow: 0 0 10px rgba(0, 0, 0, 0.618);
1326
- border-radius: 3px;
1327
- transform: scale(3); /* Scale up the SVG */
1328
- transition: transform 0.3s ease;
1329
- pointer-events: none; /* Prevents the SVG from blocking mouse interactions */
1330
- z-index: 1000;
1331
- }
1332
-
1333
- .image-zoom-viewer svg {
1334
- display: block; /* SVG is a block-level element for proper scaling */
1335
- z-index: 1000;
1336
- }
1337
-
1338
- .image-zoom-viewer:hover {
1339
- z-index: 1000;
1340
- }
1341
- """
1342
-
1343
- pn.extension(raw_css=[pn_css], js_files={
1344
- '3Dmol': './3Dmol-min.js',
1345
- 'panel_custom': './panel.js',
1346
- })
1347
 
1348
  template = pn.template.VanillaTemplate(
1349
  title=f'DeepSEQreen {job} Report',
@@ -1359,7 +1345,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1359
  if unique_df is not None:
1360
  unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width',
1361
  show_index=False, disabled=True,
1362
- frozen_columns=['Compound ID', 'Compound', 'Scaffold'])
1363
  # if pie_charts:
1364
  # unique_table.width = 640
1365
  stats_pane.append(pn.Column(f'### {unique_entity}', unique_table))
@@ -1451,11 +1437,11 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
1451
  df_report = df.copy()
1452
  try:
1453
  for filter_name in filter_list:
1454
- df_report[filter_name] = df_report['Compound'].parallel_apply(
1455
  lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
1456
 
1457
  for score_name in score_list:
1458
- df_report[score_name] = df_report['Compound'].parallel_apply(
1459
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1460
 
1461
  return (create_html_report(df_report, file=None, task=task), df_report,
@@ -1667,16 +1653,25 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1667
  label='OR Upload Your Own Library', variant='primary')
1668
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1669
 
1670
- drug_screen_opts = gr.CheckboxGroup(
1671
- label="Step 6. Select Additional Options",
1672
- choices=DRUG_SCRENN_CPI_OPTS,
1673
- info="Experimental features - may increase the job computation time. "
1674
- "Compound Max. Tanimoto Similarity to Training Compounds and "
1675
- "Target Max. Sequence Identity to Known Interacting Targets of Compound "
1676
- "suggest predictive reliability (higher values - greater reliability)."
1677
- "Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
1678
- "suggests the novelty of the compound (lower values - higher novelty)."
1679
- )
 
 
 
 
 
 
 
 
 
1680
  with gr.Row():
1681
  with gr.Column():
1682
  drug_screen_email = gr.Textbox(
@@ -1777,14 +1772,24 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1777
  target_library_upload_btn = gr.UploadButton(
1778
  label='OR Upload Your Own Library', variant='primary')
1779
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1780
-
1781
- target_identify_opts = gr.CheckboxGroup(
1782
- ['Include Target Max. Sequence Identity to Training Targets'],
1783
- label='Step 6. Select Additional Options',
1784
- info="Experimental features - may increase the job computation time. "
1785
- "Target Max. Sequence Identity to Training Targets suggest "
1786
- "predictive reliability (higher values - greater reliability)."
1787
- )
 
 
 
 
 
 
 
 
 
 
1788
  with gr.Row():
1789
  with gr.Column():
1790
  target_identify_email = gr.Textbox(
@@ -1823,9 +1828,11 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1823
  label='Step 1. Select Pair Input Type and Input',
1824
  value='Upload a CSV file containing paired compound-protein data')
1825
  with gr.Column() as pair_upload:
1826
- gr.File(label="Example CSV dataset",
1827
- value="data/examples/interaction_pair_inference.csv",
1828
- interactive=False)
 
 
1829
  with gr.Row():
1830
  infer_csv_prompt = gr.Button(
1831
  value="Upload Your Own Dataset Below",
@@ -1833,27 +1840,50 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1833
  with gr.Column():
1834
  infer_pair = gr.File(
1835
  label='Upload CSV File Containing Paired Records',
1836
- file_count="single", type='filepath', visible=True)
 
 
 
1837
  with gr.Column(visible=False) as pair_generate:
1838
  with gr.Row():
1839
- gr.File(label='Example SDF compound library',
1840
- value='data/examples/compound_library.sdf', interactive=False)
1841
- gr.File(label='Example FASTA target library',
1842
- value='data/examples/target_library.fasta', interactive=False)
 
 
 
 
 
 
1843
  with gr.Row():
1844
- gr.File(label='Example CSV compound library',
1845
- value='data/examples/compound_library.csv', interactive=False)
1846
- gr.File(label='Example CSV target library',
1847
- value='data/examples/target_library.csv', interactive=False)
 
 
 
 
 
 
1848
  with gr.Row():
1849
  infer_library_prompt = gr.Button(
1850
  value="Upload Your Own Libraries Below",
1851
- visible=False, variant='secondary')
 
 
1852
  with gr.Row():
1853
- infer_drug = gr.File(label='Upload SDF/CSV File Containing Multiple Compounds',
1854
- file_count="single", type='filepath')
1855
- infer_target = gr.File(label='Upload FASTA/CSV File Containing Multiple Targets',
1856
- file_count="single", type='filepath')
 
 
 
 
 
 
1857
 
1858
  with gr.Row():
1859
  with gr.Column(min_width=200):
@@ -1862,10 +1892,12 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1862
  "If the proteins in the target library of interest "
1863
  "all belong to the same protein family, manually selecting the family is supported."
1864
  )
 
1865
  pair_infer_target_family = gr.Dropdown(
1866
  choices=list(TARGET_FAMILY_MAP.keys()),
1867
  value='General',
1868
- label='Step 2. Select Target Family (Optional)')
 
1869
 
1870
  with gr.Column(min_width=200):
1871
  HelpTip(
@@ -1877,15 +1909,17 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1877
  pair_infer_task = gr.Dropdown(
1878
  list(TASK_MAP.keys()),
1879
  label='Step 3. Select a Prediction Task',
1880
- value='Compound-Protein Interaction')
 
1881
 
1882
  with gr.Column(min_width=200):
1883
- HelpTip("Select your preferred model. "
1884
- "Please refer to documentation for detailed benchmark results."
1885
- )
1886
  pair_infer_preset = gr.Dropdown(
1887
  list(PRESET_MAP.keys()),
1888
- label='Step 4. Select a Preset Model')
 
1889
  # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
1890
  # variant='primary')
1891
  pair_infer_opts = gr.CheckboxGroup(visible=False)
@@ -2093,7 +2127,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
2093
  alignment = aligner.align(processed_fasta, query)
2094
  return alignment.score / max(len(processed_fasta), len(query))
2095
 
2096
- alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
2097
  row = alignment_df.loc[alignment_df['score'].idxmax()]
2098
  family = str(row['Target Family']).title()
2099
  return gr.Dropdown(value=family,
@@ -2119,6 +2153,12 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
2119
  show_progress='hidden'
2120
  )
2121
 
 
 
 
 
 
 
2122
 
2123
  def example_fill(input_type):
2124
  return {target_id: 'Q16539',
@@ -2419,13 +2459,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2419
  infer_df = pd.read_csv(drug_target_pair_upload)
2420
  validate_columns(infer_df, ['X1', 'X2'])
2421
 
2422
- infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
2423
  validate_seq_str, regex=SMILES_PAT)
2424
  if not infer_df['X1_ERR'].isna().all():
2425
  raise ValueError(
2426
  f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
2427
 
2428
- infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
2429
  validate_seq_str, regex=FASTA_PAT)
2430
  if not infer_df['X2_ERR'].isna().all():
2431
  raise ValueError(
@@ -2757,7 +2797,7 @@ if __name__ == "__main__":
2757
  db.update({'status': 'FAILED'}, Job.id == job['id'])
2758
 
2759
  scheduler = BackgroundScheduler()
2760
- scheduler.add_job(check_expiry, 'interval', hours=1)
2761
  scheduler.start()
2762
 
2763
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
 
45
  from apscheduler.schedulers.background import BackgroundScheduler
46
  from tinydb import TinyDB, Query
47
 
48
+ import swifter
49
  from tqdm.auto import tqdm
50
 
51
  from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
 
156
  footer {
157
  visibility: hidden
158
  }
 
159
  """
160
 
161
 
 
191
  def mol_to_pharm3d(mol, mode='html'):
192
  if mol is None:
193
  return
194
+ # AllChem.Compute2DCoords(mol)
195
+ mol = Chem.AddHs(mol)
196
+ params = AllChem.ETKDGv3()
197
+ params.randomSeed = 0xf00d # for reproducibility
198
+ AllChem.EmbedMolecule(mol, params)
199
 
200
  feats = FEAT_FACTORY.GetFeaturesForMol(mol)
201
 
 
294
  }
295
 
296
  DRUG_SCRENN_CPI_OPTS = [
297
+ 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
298
+ 'Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target',
299
+ 'Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound',
300
  ]
301
 
302
  DRUG_SCRENN_CPA_OPTS = [
303
+ 'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
304
+ ]
305
+
306
+ TARGET_IDENTIFY_CPI_OPTS = [
307
+ 'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
308
+ 'Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound',
309
+ 'Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target',
310
+ ]
311
+
312
+ TARGET_IDENTIFY_CPA_OPTS = [
313
+ 'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
314
  ]
315
 
316
  pd.set_option('display.float_format', '{:.3f}'.format)
 
396
  return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
397
 
398
 
399
def alignment_score(query, target):
    """Return the local-alignment score of two sequences, normalised by length.

    The raw score comes from ``PairwiseAligner`` in ``'local'`` mode with all
    other parameters left at their defaults (presumably Biopython's
    ``Bio.Align.PairwiseAligner`` -- the import is not visible here; confirm).
    It is divided by the length of the longer sequence, so a shorter sequence
    fully contained in a longer one scores below 1.

    Args:
        query: First sequence (any sized sequence accepted by the aligner).
        target: Second sequence.

    Returns:
        The normalised score as a float. ``0.0`` is returned when either
        sequence is empty, since there is nothing to align and the
        normalising length could otherwise be zero.
    """
    # Guard first: an empty sequence cannot be aligned, and two empty
    # sequences would make the denominator zero.
    if not len(query) or not len(target):
        return 0.0

    aligner = PairwiseAligner()
    aligner.mode = 'local'
    # `score` computes the optimal score directly, without materialising the
    # alignments object that `align` builds just to read `.score` from it.
    return aligner.score(query, target) / max(len(query), len(target))
404
+
405
+
406
  def max_sequence_identity(seq, seen_fastas):
407
  if seq is None or seen_fastas is None or seen_fastas.empty:
408
  return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}
 
415
  target = id2
416
  return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
417
 
418
+ cached_alignment_score = cache(alignment_score)
 
419
  max_iden = 0
420
  target = None
421
  for fasta in seen_fastas['X2'].values:
422
+ identity = cached_alignment_score(seq, fasta)
423
+
 
 
 
 
 
 
 
424
  if identity > max_iden:
425
  max_iden = identity
426
  target = fasta
 
428
  id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
429
  if pd.notnull(id2) and id2 != '':
430
  target = id2
431
+ if max_iden == 1:
432
+ break
433
 
434
+ cached_alignment_score.cache_clear()
435
  return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
436
 
437
 
 
861
  orig_df['Target Family'] = None
862
  if orig_df['Target Family'].isna().any():
863
  orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
864
+ orig_df.loc[orig_df['Target Family'].isna(), 'X2'].swifter.apply(detect_family)
865
  )
866
  orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
867
  detect_family.cache_clear()
868
 
869
+ orig_df['X1^'] = orig_df['X1'].swifter.apply(rdkit_canonicalize)
870
 
871
  orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
872
  annotated_df = orig_df[~orig_df['Y'].isna()].copy()
 
967
  df_list = [prediction_df, annotated_df]
968
  prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
969
 
970
+ # Advanced options for Drug Hit Screening
971
+ if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
972
+ x2 = prediction_df['X2'].iloc[0]
 
 
 
 
 
973
 
974
+ prediction_df[[
975
+ 'Max. Sequence Identity to Training Targets',
976
+ 'Max. Id. Training Target'
977
+ ]] = pd.Series(max_sequence_identity(x2, df_training))
 
978
 
979
+ if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
980
  x2 = prediction_df['X2'].iloc[0]
981
  pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
982
+ pos_compounds_df['FP'] = pos_compounds_df['X1'].swifter.apply(smiles_to_ecfp)
983
 
984
  @cache
985
  def max_sim(smiles):
986
  return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
987
 
988
+ prediction_df[[
989
+ 'Max. Tanimoto Similarity to Known Ligands',
990
+ 'Max. Sim. Ligand'
991
+ ]] = prediction_df['X1'].swifter.apply(max_sim).apply(pd.Series)
992
+
993
  max_sim.cache_clear()
994
 
995
+ if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
996
  x2 = prediction_df['X2'].iloc[0]
997
+ prediction_df['X1^'] = prediction_df['X1'].swifter.apply(rdkit_canonicalize)
998
 
999
  @cache
1000
+ def max_id(compound):
1001
+ pos_targets_df = df_training.loc[df_training['X1'] == compound]
1002
+ return max_sequence_identity(x2, seen_fastas=pos_targets_df)
1003
 
1004
+ prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
1005
+ 'Max. Id. Target']] = (
1006
+ prediction_df['X1^'].swifter.apply(max_id).apply(pd.Series)
1007
  )
1008
  prediction_df.drop(['X1^'], axis=1, inplace=True)
1009
 
1010
+ max_id.cache_clear()
1011
 
1012
+ # Advanced options for Target Protein Identification
1013
+ if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
1014
+ x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
1015
+ prediction_df['FP'] = prediction_df['X1'].swifter.apply(smiles_to_ecfp)
1016
 
1017
+ prediction_df[[
1018
+ 'Max. Tanimoto Similarity to Training Compounds',
1019
+ 'Max. Sim. Training Compound'
1020
+ ]] = pd.Series(max_tanimoto_similarity(x1, df_training))
1021
 
1022
+ if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
1023
+ x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
1024
+ pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
1025
+
1026
+ @cache
1027
+ def max_id(fasta):
1028
+ return max_sequence_identity(fasta, seen_fastas=pos_targets_df)
1029
+
1030
+ prediction_df[[
1031
+ 'Max. Sequence Identity to Known Targets of Input Compound',
1032
+ 'Max. Id. Target'
1033
+ ]] = prediction_df['X2'].swifter.apply(max_id).apply(pd.Series)
1034
+
1035
+ max_id.cache_clear()
1036
+
1037
+ if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
1038
+ x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
1039
+
1040
+ @cache
1041
+ def max_sim(fasta):
1042
+ pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
1043
+ pos_targets_df['FP'] = pos_targets_df['X1'].swifter.apply(smiles_to_ecfp)
1044
+ return max_tanimoto_similarity(x1, seen_smiles_with_fp=pos_targets_df)
1045
+
1046
+ prediction_df[[
1047
+ 'Max. Tanimoto Similarity to Known Ligands of Identified Target',
1048
+ 'Max. Sim. Ligand'
1049
+ ]] = prediction_df['X2'].swifter.apply(max_sim).apply(pd.Series)
1050
+
1051
+ max_sim.cache_clear()
1052
 
1053
  prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
1054
  status = "COMPLETED"
 
1100
 
1101
  if 'X1' in df.columns:
1102
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
1103
+ df['Compound'] = df['X1'].swifter.apply(
1104
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
1105
+ df['Scaffold'] = df['Compound'].swifter.apply(MurckoScaffold.GetScaffoldForMol)
1106
+ df['Scaffold SMILES'] = df['Scaffold'].swifter.apply(lambda x: Chem.MolToSmiles(x))
1107
 
1108
  if task == 'Compound-Protein Binding Affinity':
1109
  # Convert Y^ from pIC50 to IC50
 
1151
  columns_unique = None
1152
 
1153
  if 'Exclude Pharmacophore 3D' not in opts:
1154
+ df_html['Pharmacophore'] = df_html['Compound'].swifter.apply(
1155
  lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
1156
 
1157
  if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
1158
+ df_html['Compound'] = df_html['Compound'].swifter.apply(
1159
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1160
  else:
1161
  df_html.drop(['Compound'], axis=1, inplace=True)
1162
 
1163
  if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
1164
+ df_html['Scaffold'] = df_html['Scaffold'].swifter.apply(
1165
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1166
  else:
1167
  df_html.drop(['Scaffold'], axis=1, inplace=True)
 
1175
  if any(col in df_html.columns for col in ['Y^', 'Y']):
1176
  job = 'Target Protein Identification'
1177
  category = 'Target Family'
1178
+ columns_unique = df_html.columns.isin(
1179
+ ['ID1', 'Pharmacophore', 'Compound', 'Scaffold', 'X1', 'Scaffold SMILES',
1180
+ 'Max. Tanimoto Similarity to Training Compounds', 'Max. Sim. Training Compound']
1181
+ + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys())
1182
+ )
1183
 
1184
  elif n_compound >= 2 and n_protein == 1:
1185
  unique_entity = 'Target of Interest'
1186
  if any(col in df_html.columns for col in ['Y^', 'Y']):
1187
  job = 'Drug Hit Screening'
1188
  category = 'Scaffold SMILES'
1189
+ columns_unique = df_html.columns.isin(
1190
+ ['X2', 'ID2', 'Max. Sequence Identity to Training Targets', 'Max. Id. Training Target']
1191
+ )
1192
 
1193
  elif 'Y^' in df_html.columns:
1194
  job = 'Interaction Pair Inference'
 
1196
  df_html.rename(columns=column_aliases, inplace=True)
1197
  df_html.index.name = 'Index'
1198
  if 'Target FASTA' in df_html.columns:
1199
+ df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
1200
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1201
 
1202
  num_cols = df_html.select_dtypes('number').columns
 
1214
  if 'Target ID' in df_html.columns:
1215
  df_html.drop(['Target FASTA'], axis=1, inplace=True)
1216
  if 'Target FASTA' in df_html.columns:
1217
+ df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
1218
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1219
  if 'Scaffold SMILES' in df_html.columns:
1220
  df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
 
1290
 
1291
  report_table = pn.widgets.Tabulator(
1292
  df_html, formatters=formatters,
1293
+ frozen_columns=[
1294
+ 'Index', 'Target ID', 'Compound ID', 'Compound'
1295
+ ],
1296
  disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
1297
 
1298
  for i, col in enumerate(num_cols):
 
1321
  # Remove keys with empty values
1322
  pie_charts = {k: v for k, v in pie_charts.items() if any(v)}
1323
 
1324
+ pn.extension(
1325
+ css_files=[
1326
+ './static/panel.css',
1327
+ ],
1328
+ js_files={
1329
+ '3Dmol': './static/3Dmol-min.js',
1330
+ 'panel_custom': './static/panel.js'
1331
+ }
1332
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1333
 
1334
  template = pn.template.VanillaTemplate(
1335
  title=f'DeepSEQreen {job} Report',
 
1345
  if unique_df is not None:
1346
  unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width',
1347
  show_index=False, disabled=True,
1348
+ frozen_columns=['Compound ID', 'Compound', 'Target ID'])
1349
  # if pie_charts:
1350
  # unique_table.width = 640
1351
  stats_pane.append(pn.Column(f'### {unique_entity}', unique_table))
 
1437
  df_report = df.copy()
1438
  try:
1439
  for filter_name in filter_list:
1440
+ df_report[filter_name] = df_report['Compound'].swifter.apply(
1441
  lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
1442
 
1443
  for score_name in score_list:
1444
+ df_report[score_name] = df_report['Compound'].swifter.apply(
1445
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1446
 
1447
  return (create_html_report(df_report, file=None, task=task), df_report,
 
1653
  label='OR Upload Your Own Library', variant='primary')
1654
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1655
 
1656
+ with gr.Column():
1657
# Help text explaining the optional applicability/confidence indicators
# offered by the drug-screening options below (rendered as HTML).
HelpTip("""
<b>Max. Sequence Identity between the Input Target and Targets in the Training Set</b>:
this serves as an indicator of the prediction applicability/reliability –
higher similarities indicate more reliable predictions (preferably > 0.85).<br>
<b>Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target</b>:
this serves as an indicator of both the confidence level and novelty of the predicted hit compounds –
higher similarities suggest greater confidence, while lower Tanimoto similarities may indicate the novelty
of the identified hit compounds compared to known drugs or true interacting compounds of the input target.<br>
<b>Max. Sequence Identity between the Input Target and Known Targets of Hit Compound</b>:
this serves as an additional indicator of the confidence level of the predicted hit compounds –
higher identities usually lead to greater confidence in the predictions.<br>
""")
1669
# Optional, experimental extra analyses for a drug-screening job.
drug_screen_opts = gr.CheckboxGroup(
    label="Step 6. Select Additional Options",
    choices=DRUG_SCRENN_CPI_OPTS,
    # Adjacent string literals are concatenated verbatim, so the first one
    # needs a trailing space to keep the two sentences apart.
    info="Experimental features - may increase the job computation time. "
         "See the Help Tip on the right or the Documentation for detailed explanation."
)
1675
  with gr.Row():
1676
  with gr.Column():
1677
  drug_screen_email = gr.Textbox(
 
1772
  target_library_upload_btn = gr.UploadButton(
1773
  label='OR Upload Your Own Library', variant='primary')
1774
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1775
+ with gr.Column():
1776
# Help text explaining the optional applicability/confidence indicators
# offered by the target-identification options below (rendered as HTML).
HelpTip("""
<b>Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set</b>:
this serves as an indicator of prediction applicability and reliability –
higher similarities indicate more reliable predictions (ideally > 0.85).<br>
<b>Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound</b>:
this serves as an indicator of prediction confidence for the potential targets –
higher similarities typically imply higher confidence levels.<br>
<b>Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target</b>:
this serves as an additional indicator of the confidence level in the predicted potential targets –
higher similarities usually correspond to greater prediction confidence.<br>
""")
1787
+ target_identify_opts = gr.CheckboxGroup(
1788
+ choices=TARGET_IDENTIFY_CPI_OPTS,
1789
+ label='Step 6. Select Additional Options',
1790
+ info="Experimental features - may increase the job computation time. "
1791
+ "See the Help Tip on the right or the Documentation for detailed explanation."
1792
+ )
1793
  with gr.Row():
1794
  with gr.Column():
1795
  target_identify_email = gr.Textbox(
 
1828
  label='Step 1. Select Pair Input Type and Input',
1829
  value='Upload a CSV file containing paired compound-protein data')
1830
  with gr.Column() as pair_upload:
1831
+ gr.File(
1832
+ label="Example CSV dataset",
1833
+ value="data/examples/interaction_pair_inference.csv",
1834
+ interactive=False
1835
+ )
1836
  with gr.Row():
1837
  infer_csv_prompt = gr.Button(
1838
  value="Upload Your Own Dataset Below",
 
1840
  with gr.Column():
1841
  infer_pair = gr.File(
1842
  label='Upload CSV File Containing Paired Records',
1843
+ file_count="single",
1844
+ type='filepath',
1845
+ visible=True
1846
+ )
1847
  with gr.Column(visible=False) as pair_generate:
1848
  with gr.Row():
1849
+ gr.File(
1850
+ label='Example SDF compound library',
1851
+ value='data/examples/compound_library.sdf',
1852
+ interactive=False
1853
+ )
1854
+ gr.File(
1855
+ label='Example FASTA target library',
1856
+ value='data/examples/target_library.fasta',
1857
+ interactive=False
1858
+ )
1859
  with gr.Row():
1860
+ gr.File(
1861
+ label='Example CSV compound library',
1862
+ value='data/examples/compound_library.csv',
1863
+ interactive=False
1864
+ )
1865
+ gr.File(
1866
+ label='Example CSV target library',
1867
+ value='data/examples/target_library.csv',
1868
+ interactive=False
1869
+ )
1870
  with gr.Row():
1871
  infer_library_prompt = gr.Button(
1872
  value="Upload Your Own Libraries Below",
1873
+ visible=False,
1874
+ variant='secondary'
1875
+ )
1876
  with gr.Row():
1877
+ infer_drug = gr.File(
1878
+ label='Upload SDF/CSV File Containing Multiple Compounds',
1879
+ file_count="single",
1880
+ type='filepath'
1881
+ )
1882
+ infer_target = gr.File(
1883
+ label='Upload FASTA/CSV File Containing Multiple Targets',
1884
+ file_count="single",
1885
+ type='filepath'
1886
+ )
1887
 
1888
  with gr.Row():
1889
  with gr.Column(min_width=200):
 
1892
  "If the proteins in the target library of interest "
1893
  "all belong to the same protein family, manually selecting the family is supported."
1894
  )
1895
+
1896
  pair_infer_target_family = gr.Dropdown(
1897
  choices=list(TARGET_FAMILY_MAP.keys()),
1898
  value='General',
1899
+ label='Step 2. Select Target Family (Optional)'
1900
+ )
1901
 
1902
  with gr.Column(min_width=200):
1903
  HelpTip(
 
1909
  pair_infer_task = gr.Dropdown(
1910
  list(TASK_MAP.keys()),
1911
  label='Step 3. Select a Prediction Task',
1912
+ value='Compound-Protein Interaction'
1913
+ )
1914
 
1915
  with gr.Column(min_width=200):
1916
+ HelpTip(
1917
+ "Select your preferred model. Please refer to documentation for detailed benchmark results."
1918
+ )
1919
  pair_infer_preset = gr.Dropdown(
1920
  list(PRESET_MAP.keys()),
1921
+ label='Step 4. Select a Preset Model'
1922
+ )
1923
  # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
1924
  # variant='primary')
1925
  pair_infer_opts = gr.CheckboxGroup(visible=False)
 
2127
  alignment = aligner.align(processed_fasta, query)
2128
  return alignment.score / max(len(processed_fasta), len(query))
2129
 
2130
+ alignment_df['score'] = alignment_df['X2'].swifter.apply(align_score)
2131
  row = alignment_df.loc[alignment_df['score'].idxmax()]
2132
  family = str(row['Target Family']).title()
2133
  return gr.Dropdown(value=family,
 
2153
  show_progress='hidden'
2154
  )
2155
 
2156
# Swap the available extra options when the prediction task changes.
# Both branches use the target-identification option lists so the choices
# stay consistent with the list `target_identify_opts` is created with;
# the drug-screening list must not leak into this tab.
target_identify_task.select(
    # `opts` is unused but kept because two inputs are wired to this callback.
    fn=lambda task, opts: gr.CheckboxGroup(choices=TARGET_IDENTIFY_CPA_OPTS)
    if task == 'Compound-Protein Binding Affinity'
    else gr.CheckboxGroup(choices=TARGET_IDENTIFY_CPI_OPTS),
    inputs=[target_identify_task, target_identify_opts],
    outputs=target_identify_opts,
    show_progress='hidden'
)
2162
 
2163
  def example_fill(input_type):
2164
  return {target_id: 'Q16539',
 
2459
  infer_df = pd.read_csv(drug_target_pair_upload)
2460
  validate_columns(infer_df, ['X1', 'X2'])
2461
 
2462
+ infer_df['X1_ERR'] = infer_df['X1'].swifter.apply(
2463
  validate_seq_str, regex=SMILES_PAT)
2464
  if not infer_df['X1_ERR'].isna().all():
2465
  raise ValueError(
2466
  f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
2467
 
2468
+ infer_df['X2_ERR'] = infer_df['X2'].swifter.apply(
2469
  validate_seq_str, regex=FASTA_PAT)
2470
  if not infer_df['X2_ERR'].isna().all():
2471
  raise ValueError(
 
2797
  db.update({'status': 'FAILED'}, Job.id == job['id'])
2798
 
2799
  scheduler = BackgroundScheduler()
2800
+ scheduler.add_job(check_expiry, 'interval', hours=1, timezone=pytz.utc)
2801
  scheduler.start()
2802
 
2803
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)