Update app.py
app.py CHANGED
@@ -45,7 +45,7 @@ import panel as pn
 from apscheduler.schedulers.background import BackgroundScheduler
 from tinydb import TinyDB, Query

-
+import swifter
 from tqdm.auto import tqdm

 from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
@@ -160,7 +160,7 @@ visibility: hidden


 class View3DmolCell(py3Dmol.view):
-def __init__(self, width=
+def __init__(self, width=320, height=200):
 divid = "3dmolviewer_UNIQUEID"
 self.uniqueid = None
 if isinstance(width, int):
@@ -861,12 +861,12 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
 orig_df['Target Family'] = None
 if orig_df['Target Family'].isna().any():
 orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
-orig_df.loc[orig_df['Target Family'].isna(), 'X2'].
+orig_df.loc[orig_df['Target Family'].isna(), 'X2'].swifter.apply(detect_family)
 )
 orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
 detect_family.cache_clear()

-orig_df['X1^'] = orig_df['X1'].
+orig_df['X1^'] = orig_df['X1'].swifter.apply(rdkit_canonicalize)

 orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
 annotated_df = orig_df[~orig_df['Y'].isna()].copy()
@@ -979,7 +979,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
 if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
 x2 = prediction_df['X2'].iloc[0]
 pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
-pos_compounds_df['FP'] = pos_compounds_df['X1'].
+pos_compounds_df['FP'] = pos_compounds_df['X1'].swifter.apply(smiles_to_ecfp)

 @cache
 def max_sim(smiles):
@@ -988,13 +988,13 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
 prediction_df[[
 'Max. Tanimoto Similarity to Known Ligands',
 'Max. Sim. Ligand'
-]] = prediction_df['X1'].
+]] = prediction_df['X1'].swifter.apply(max_sim).apply(pd.Series)

 max_sim.cache_clear()

 if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
 x2 = prediction_df['X2'].iloc[0]
-prediction_df['X1^'] = prediction_df['X1'].
+prediction_df['X1^'] = prediction_df['X1'].swifter.apply(rdkit_canonicalize)

 @cache
 def max_id(compound):
@@ -1003,7 +1003,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info

 prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
 'Max. Id. Target']] = (
-prediction_df['X1^'].
+prediction_df['X1^'].swifter.apply(max_id).apply(pd.Series)
 )
 prediction_df.drop(['X1^'], axis=1, inplace=True)

@@ -1012,7 +1012,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
 # Advanced options for Target Protein Identification
 if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
 x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
-prediction_df['FP'] = prediction_df['X1'].
+prediction_df['FP'] = prediction_df['X1'].swifter.apply(smiles_to_ecfp)

 prediction_df[[
 'Max. Tanimoto Similarity to Training Compounds',
@@ -1030,7 +1030,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
 prediction_df[[
 'Max. Sequence Identity to Known Targets of Input Compound',
 'Max. Id. Target'
-]] = prediction_df['X2'].
+]] = prediction_df['X2'].swifter.apply(max_id).apply(pd.Series)

 max_id.cache_clear()

@@ -1046,7 +1046,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
 prediction_df[[
 'Max. Tanimoto Similarity to Known Ligands of Identified Target',
 'Max. Sim. Ligand'
-]] = prediction_df['X2'].
+]] = prediction_df['X2'].swifter.apply(max_sim).apply(pd.Series)

 max_sim.cache_clear()

@@ -1100,10 +1100,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):

 if 'X1' in df.columns:
 if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
-df['Compound'] = df['X1'].
+df['Compound'] = df['X1'].swifter.apply(
 lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
-df['Scaffold'] = df['Compound'].
-df['Scaffold SMILES'] = df['Scaffold'].
+df['Scaffold'] = df['Compound'].swifter.apply(MurckoScaffold.GetScaffoldForMol)
+df['Scaffold SMILES'] = df['Scaffold'].swifter.apply(lambda x: Chem.MolToSmiles(x))
 df['Pharmacophore'] = None
 if task == 'Compound-Protein Binding Affinity':
 # Convert Y^ from pIC50 to IC50
@@ -1121,10 +1121,9 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):

 def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(track_tqdm=True)):
 df_html = df.copy(deep=True)
-df_html.dropna(how='all', axis=1, inplace=True)
 column_aliases = COLUMN_ALIASES.copy()
 cols_left = list(pd.Index([
-
+'ID1', 'ID2', 'Compound', 'Scaffold', 'Pharmacophore', 'X1', 'Scaffold SMILES', 'X2', 'Y^'
 ]).intersection(df_html.columns))
 # cols_right = list(pd.Index(['X1', 'X2']).intersection(df_html.columns))
 # df_html = df_html[cols_left + (df_html.columns.drop(cols_left + cols_right).tolist()) + cols_right]
@@ -1152,17 +1151,17 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
 columns_unique = None

 if 'Exclude Pharmacophore 3D' not in opts:
-df_html['Pharmacophore'] = df_html['Compound'].
+df_html['Pharmacophore'] = df_html['Compound'].swifter.apply(
 lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)

 if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
-df_html['Compound'] = df_html['Compound'].
+df_html['Compound'] = df_html['Compound'].swifter.apply(
 lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
 else:
 df_html.drop(['Compound'], axis=1, inplace=True)

 if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
-df_html['Scaffold'] = df_html['Scaffold'].
+df_html['Scaffold'] = df_html['Scaffold'].swifter.apply(
 lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
 else:
 df_html.drop(['Scaffold'], axis=1, inplace=True)
@@ -1197,7 +1196,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
 df_html.rename(columns=column_aliases, inplace=True)
 df_html.index.name = 'Index'
 if 'Target FASTA' in df_html.columns:
-df_html['Target FASTA'] = df_html['Target FASTA'].
+df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
 lambda x: wrap_text(x) if not pd.isna(x) else x)

 num_cols = df_html.select_dtypes('number').columns
@@ -1208,6 +1207,8 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
 if columns_unique is not None:
 unique_df = df_html.loc[:, columns_unique].iloc[[0]].copy()
 df_html = df_html.loc[:, ~columns_unique]
+df_html.dropna(how='all', axis=1, inplace=True)
+unique_df.dropna(how='all', axis=1, inplace=True)

 if not file:
 if 'Compound ID' in df_html.columns:
@@ -1215,11 +1216,17 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
 if 'Target ID' in df_html.columns:
 df_html.drop(['Target FASTA'], axis=1, inplace=True)
 if 'Target FASTA' in df_html.columns:
-df_html['Target FASTA'] = df_html['Target FASTA'].
+df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
 lambda x: wrap_text(x) if not pd.isna(x) else x)
 if 'Scaffold SMILES' in df_html.columns:
 df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)

+# FIXME: Temporarily drop pharmacophore column before an image solution is found
+if 'Pharmacophore' in df_html.columns:
+df_html.drop(['Pharmacophore'], axis=1, inplace=True)
+if unique_df is not None and 'Pharmacophore' in unique_df.columns:
+unique_df.drop(['Pharmacophore'], axis=1, inplace=True)
+
 styled_df = df_html.fillna('').style.format(precision=3)

 for i, col in enumerate(num_cols):
@@ -1293,9 +1300,10 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
 report_table = pn.widgets.Tabulator(
 df_html, formatters=formatters,
 frozen_columns=[
-'Index', 'Target ID', 'Compound ID', 'Compound'
+'Index', 'Target ID', 'Compound ID', 'Compound Name', 'Compound'
 ],
-disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30
+disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30
+)

 for i, col in enumerate(num_cols):
 cmap = sns.light_palette(num_col_colors[i], as_cmap=True)
@@ -1332,6 +1340,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t

 .tabulator-cell {
 overflow: visible !important;
+align-content: center !important;
 }

 .tabulator-cell:hover {
@@ -1375,7 +1384,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
 raw_css=[panel_css],
 js_files={'panel_custom': 'static/panel.js', '3Dmol': 'static/3Dmol-min.js'},
 # js_modules={'3Dmol': 'static/3Dmol-min.js'},
-inline=True
+inline=True,
 )

 template = pn.template.VanillaTemplate(
@@ -1484,11 +1493,11 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
 df_report = df.copy()
 try:
 for filter_name in filter_list:
-df_report[filter_name] = df_report['Compound'].
+df_report[filter_name] = df_report['Compound'].swifter.apply(
 lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)

 for score_name in score_list:
-df_report[score_name] = df_report['Compound'].
+df_report[score_name] = df_report['Compound'].swifter.apply(
 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)

 return (create_html_report(df_report, file=None, task=task), df_report,
@@ -2174,7 +2183,7 @@ higher similarities usually correspond to greater prediction confidence.<br>
 alignment = aligner.align(processed_fasta, query)
 return alignment.score / max(len(processed_fasta), len(query))

-alignment_df['score'] = alignment_df['X2'].
+alignment_df['score'] = alignment_df['X2'].swifter.apply(align_score)
 row = alignment_df.loc[alignment_df['score'].idxmax()]
 family = str(row['Target Family']).title()
 return gr.Dropdown(value=family,
@@ -2506,13 +2515,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
 infer_df = pd.read_csv(drug_target_pair_upload)
 validate_columns(infer_df, ['X1', 'X2'])

-infer_df['X1_ERR'] = infer_df['X1'].
+infer_df['X1_ERR'] = infer_df['X1'].swifter.apply(
 validate_seq_str, regex=SMILES_PAT)
 if not infer_df['X1_ERR'].isna().all():
 raise ValueError(
 f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")

-infer_df['X2_ERR'] = infer_df['X2'].
+infer_df['X2_ERR'] = infer_df['X2'].swifter.apply(
 validate_seq_str, regex=FASTA_PAT)
 if not infer_df['X2_ERR'].isna().all():
 raise ValueError(
@@ -2818,12 +2827,16 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES

 csv_generate.click(
 lambda: gr.File(visible=True), outputs=csv_download_file,
-).then(
-
+).then(
+fn=create_csv_report_file, inputs=[report_df, file_for_report, report_task, csv_sep],
+outputs=csv_download_file, show_progress='full'
+)
 html_generate.click(
 lambda: gr.File(visible=True), outputs=html_download_file,
-).then(
-
+).then(
+fn=create_html_report_file, inputs=[report_df, file_for_report, report_task, html_opts],
+outputs=html_download_file, show_progress='full'
+)

 if __name__ == "__main__":
 pandarallel.initialize()
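This commit routes the app's per-row pandas operations through swifter.apply and adds the matching import swifter. A minimal sketch of that pattern, on hypothetical toy data rather than the app's real columns: importing swifter registers a .swifter accessor on pandas Series and DataFrames, and .swifter.apply() keeps the usual pandas apply call shape (extra keyword arguments pass through, as in the validate_seq_str, regex=SMILES_PAT calls above), so existing call sites only need the accessor inserted.

import pandas as pd
import swifter  # noqa: F401 -- imported for its side effect of registering the .swifter accessor

# Hypothetical toy column; the app's real X1 values are SMILES strings.
df = pd.DataFrame({"X1": ["CCO", "c1ccccc1", "CC(=O)O"]})

# Same call shape as df["X1"].apply(len); swifter estimates the cost and
# picks between a plain pandas apply and a parallel/vectorized path.
df["len"] = df["X1"].swifter.apply(len)
print(df)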
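The final hunk fills in the second step of the chained Gradio events that produce the CSV and HTML downloads. A minimal sketch of that click/then pattern with a hypothetical stand-in callback (not the app's real create_csv_report_file or create_html_report_file): the first callback only reveals the download widget, and .then() runs the slower file-producing callback afterwards, so the UI responds immediately while the file is still being built.

import gradio as gr

def make_file():
    # Hypothetical output path for the demo; returns a filepath for gr.File.
    path = "demo.txt"
    with open(path, "w") as f:
        f.write("generated after the widget became visible\n")
    return path

with gr.Blocks() as demo:
    generate = gr.Button("Generate file")
    download = gr.File(visible=False)

    # Step 1: make the download widget visible; step 2 (.then) builds the file.
    generate.click(
        lambda: gr.File(visible=True), outputs=download,
    ).then(
        fn=make_file, outputs=download, show_progress='full'
    )

if __name__ == "__main__":
    demo.launch()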