DeepSEQreen_NAR_fb

Sleeping

App Files Files Community

libokj committed on May 19, 2024

Commit

47d08b9

1 Parent(s): 11c31b2

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -100

app.py CHANGED Viewed

@@ -824,6 +824,93 @@ using the job id. You will also receive an email notification once the job is do
             raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
 def submit_predict(predict_filepath, task, preset, target_family, opts, job_info):
     job_id = job_info['id']
     status = job_info['status']
@@ -968,88 +1055,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         df_list = [prediction_df, annotated_df]
         prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
-        # Advanced options for Drug Hit Screening
-        if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
-            x2 = prediction_df['X2'].iloc[0]
-            prediction_df[[
-                'Max. Sequence Identity to Training Targets',
-                'Max. Id. Training Target'
-            ]] = pd.Series(max_sequence_identity(x2, df_training))
-        if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
-            x2 = prediction_df['X2'].iloc[0]
-            pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
-            pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
-            @cache
-            def max_sim(smiles):
-                return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
-            prediction_df[[
-                'Max. Tanimoto Similarity to Known Ligands',
-                'Max. Sim. Ligand'
-            ]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
-            max_sim.cache_clear()
-        if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
-            x2 = prediction_df['X2'].iloc[0]
-            prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
-            @cache
-            def max_id(compound):
-                pos_targets_df = df_training.loc[df_training['X1'] == compound]
-                return max_sequence_identity(x2, seen_fastas=pos_targets_df)
-            prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
-                           'Max. Id. Target']] = (
-                prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
-            )
-            prediction_df.drop(['X1^'], axis=1, inplace=True)
-            max_id.cache_clear()
-        # Advanced options for Target Protein Identification
-        if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
-            x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
-            prediction_df['FP'] = prediction_df['X1'].parallel_apply(smiles_to_ecfp)
-            prediction_df[[
-                'Max. Tanimoto Similarity to Training Compounds',
-                'Max. Sim. Training Compound'
-            ]] = pd.Series(max_tanimoto_similarity(x1, df_training))
-        if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
-            x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
-            pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
-            @cache
-            def max_id(fasta):
-                return max_sequence_identity(fasta, seen_fastas=pos_targets_df)
-            prediction_df[[
-                'Max. Sequence Identity to Known Targets of Input Compound',
-                'Max. Id. Target'
-            ]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
-            max_id.cache_clear()
-        if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
-            x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
-            @cache
-            def max_sim(fasta):
-                pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
-                pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp)
-                return max_tanimoto_similarity(x1, seen_smiles_with_fp=pos_targets_df)
-            prediction_df[[
-                'Max. Tanimoto Similarity to Known Ligands of Identified Target',
-                'Max. Sim. Ligand'
-            ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
-            max_sim.cache_clear()
         prediction_df.drop(
             [col for col in ['N', 'FP'] if col in prediction_df.columns], axis=1
@@ -1087,6 +1093,8 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
 def update_df(file, progress=gr.Progress(track_tqdm=True)):
     if file and Path(file).is_file():
         task = None
         if "_CPI_" in str(file):
             task = 'Compound-Protein Interaction'
         elif "_CPA_" in str(file):
@@ -1113,11 +1121,33 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
             if 'Y^' in df.columns:
                 df['Y^'] = 10 ** (-df['Y^'])
-        return {html_report: create_html_report(df, file=None, task=task),
-                raw_df: df,
-                report_df: df.copy(),
-                analyze_btn: gr.Button(interactive=True),
-                report_task: task}  # pie_chart
     else:
         return {analyze_btn: gr.Button(interactive=False)}
@@ -1253,6 +1283,21 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         if unique_df is not None:
             if 'Target FASTA' in unique_df.columns:
                 unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '<br>')
             if any(unique_df.columns.isin(bool_cols)):
                 unique_df = unique_df.style.applymap(
                     lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols)
@@ -1268,11 +1313,11 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         uniprot_id_formatter = HTMLTemplateFormatter(
             template='<% if (value == value) { '  # Check if value is not NaN
                      'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
-            # Check if value is a valid UniProt ID
                      '{ %><a href="https://www.uniprot.org/uniprotkb/<%= value %>" target="_blank"><%= value %></a><% '
-            # Else treat it as a sequence or other plain-text string, line-warping every 60 characters
-                     '} else { %><div style="white-space: pre-wrap;"><%= value.match(/.{1,60}/g).join("<br>") %></div><% } %>'
-                     '<% } else { %><% } %>'  # Output empty string if value is NaN
         )
         pubchem_id_formatter = HTMLTemplateFormatter(
             template='<% if (value == value) { '  # Check if value is not NaN
@@ -1280,6 +1325,9 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
                      'target="_blank"><%= value %></a>'
                      '<% } else { %><% } %>'  # Output empty string if value is NaN
         )
         bool_formatters = {col: BooleanFormatter() for col in bool_cols}
         float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
         other_formatters = {
@@ -1294,6 +1342,8 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
             'Max. Id. Target': uniprot_id_formatter,
             'Max. Sim. Training Compound': pubchem_id_formatter,
             'Max. Id. Training Target': uniprot_id_formatter,
         }
         formatters = {**bool_formatters, **float_formatters, **other_formatters}
@@ -1492,7 +1542,7 @@ def create_pie_chart(df, category, value, top_k):
     return p
-def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_tqdm=True)):
     df_report = df.copy()
     try:
         for filter_name in filter_list:
@@ -1503,6 +1553,10 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
             df_report[score_name] = df_report['Compound'].parallel_apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         return (create_html_report(df_report, file=None, task=task), df_report,
                 gr.File(visible=False), gr.File(visible=False))
@@ -1726,10 +1780,12 @@ this serves as an additional indicator of the confidence level of the predicted
 higher identities usually lead to greater confidence in the predictions.<br>
 """)
                     drug_screen_opts = gr.CheckboxGroup(
-                        label="Step 6. Select Additional Options",
                         choices=DRUG_SCRENN_CPI_OPTS,
-                        info="Experimental features - may increase the job computation time. "
-                             "See the Help Tip on the right or the Documentation for detailed explanation."
                     )
             with gr.Row():
                 with gr.Column():
@@ -1845,8 +1901,9 @@ higher similarities usually correspond to greater prediction confidence.<br>
 """)
                         target_identify_opts = gr.CheckboxGroup(
                             choices=TARGET_IDENTIFY_CPI_OPTS,
-                            label='Step 6. Select Additional Options',
-                            info="Experimental features - may increase the job computation time. "
                                  "See the Help Tip on the right or the Documentation for detailed explanation."
                         )
                 with gr.Row():
@@ -2021,8 +2078,11 @@ higher similarities usually correspond to greater prediction confidence.<br>
                                               label='Specify the Task Labels in the Uploaded Dataset')
                 with gr.Column(scale=2):
                     with gr.Row():
-                        scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
-                        filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
                     with gr.Accordion('Report Generate Options', open=True):
                         with gr.Row():
                             csv_sep = gr.Radio(label='CSV Delimiter',
@@ -2784,7 +2844,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
     )
     analyze_btn.click(
-        fn=submit_report, inputs=[raw_df, scores, filters, report_task], outputs=[
             html_report, report_df, csv_download_file, html_download_file]
     ).success(
         fn=lambda: [gr.Button(interactive=True)] * 2,
@@ -2793,6 +2853,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
     )
     def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
         csv_sep_map = {
             'Comma': ',',

             raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
+def apply_advanced_opts(prediction_df, opts, df_training):
+    """Add the advanced-option columns selected in ``opts`` to ``prediction_df``.
+
+    Each option label found in ``opts`` triggers one computation against
+    ``df_training`` and writes two new columns into ``prediction_df``, which
+    is modified in place. Options that are not selected are skipped.
+
+    Args:
+        prediction_df: DataFrame providing the ``X1`` and ``X2`` columns read
+            below (presumably compound SMILES and target FASTA -- confirm
+            against the callers).
+        opts: Collection of option labels; membership is tested with ``in``.
+        df_training: DataFrame whose ``X1``, ``X2`` and ``Y`` columns are used
+            to select the reference rows for each comparison.
+
+    Returns:
+        The same ``prediction_df`` object, whether or not any option was
+        selected.
+    """
+    # Advanced options for Drug Hit Screening
+    if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
+        # Only the first row's target is used for this comparison.
+        x2 = prediction_df['X2'].iloc[0]
+        prediction_df[[
+            'Max. Sequence Identity to Training Targets',
+            'Max. Id. Training Target'
+        ]] = pd.Series(max_sequence_identity(x2, df_training))
+    if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
+        x2 = prediction_df['X2'].iloc[0]
+        # Training rows for the same target that are labelled Y == 1.
+        pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
+        pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
+        @cache
+        def max_sim(smiles):
+            return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
+        prediction_df[[
+            'Max. Tanimoto Similarity to Known Ligands',
+            'Max. Sim. Ligand'
+        ]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
+        max_sim.cache_clear()
+    if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
+        x2 = prediction_df['X2'].iloc[0]
+        # Temporary canonicalized column so lookups compare against df_training['X1'].
+        prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
+        @cache
+        def max_id(compound):
+            # NOTE(review): unlike the other branches, this selection does not
+            # filter on Y == 1 -- confirm that is intended.
+            pos_targets_df = df_training.loc[df_training['X1'] == compound]
+            return max_sequence_identity(x2, seen_fastas=pos_targets_df)
+        prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
+                       'Max. Id. Target']] = (
+            prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
+        )
+        prediction_df.drop(['X1^'], axis=1, inplace=True)
+        max_id.cache_clear()
+    # Advanced options for Target Protein Identification
+    if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
+        # Only the first row's compound is used for this comparison.
+        x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+        # NOTE(review): this 'FP' column is not passed to the call below;
+        # confirm whether it is still needed here.
+        prediction_df['FP'] = prediction_df['X1'].parallel_apply(smiles_to_ecfp)
+        prediction_df[[
+            'Max. Tanimoto Similarity to Training Compounds',
+            'Max. Sim. Training Compound'
+        ]] = pd.Series(max_tanimoto_similarity(x1, df_training))
+    if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
+        x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+        # Training rows for the same compound that are labelled Y == 1.
+        pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
+        @cache
+        def max_id(fasta):
+            return max_sequence_identity(fasta, seen_fastas=pos_targets_df)
+        prediction_df[[
+            'Max. Sequence Identity to Known Targets of Input Compound',
+            'Max. Id. Target'
+        ]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
+        max_id.cache_clear()
+    if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
+        x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+        @cache
+        def max_sim(fasta):
+            # Fingerprints are computed per target, inside the cached lookup.
+            pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
+            pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp)
+            return max_tanimoto_similarity(x1, seen_smiles_with_fp=pos_targets_df)
+        prediction_df[[
+            'Max. Tanimoto Similarity to Known Ligands of Identified Target',
+            'Max. Sim. Ligand'
+        ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
+        max_sim.cache_clear()
+    # Return unconditionally: callers rebind their frame to this result, so a
+    # return nested in the last branch would hand them None whenever that
+    # option is not selected.
+    return prediction_df
 def submit_predict(predict_filepath, task, preset, target_family, opts, job_info):
     job_id = job_info['id']
     status = job_info['status']
         df_list = [prediction_df, annotated_df]
         prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
+        prediction_df = apply_advanced_opts(prediction_df, opts, df_training)
         prediction_df.drop(
             [col for col in ['N', 'FP'] if col in prediction_df.columns], axis=1
 def update_df(file, progress=gr.Progress(track_tqdm=True)):
     if file and Path(file).is_file():
         task = None
+        job = None
         if "_CPI_" in str(file):
             task = 'Compound-Protein Interaction'
         elif "_CPA_" in str(file):
             if 'Y^' in df.columns:
                 df['Y^'] = 10 ** (-df['Y^'])
+        n_compound = df['X1'].nunique()
+        n_protein = df['X2'].nunique()
+        if n_compound == 1 and n_protein >= 2:
+            job = 'Target Protein Identification'
+            if task == 'Compound-Protein Interaction':
+                opts = TARGET_IDENTIFY_CPI_OPTS
+            elif task == 'Compound-Protein Binding Affinity':
+                opts = TARGET_IDENTIFY_CPA_OPTS
+        if n_compound >= 2 and n_protein == 1:
+            job = 'Drug Hit Screening'
+            if task == 'Compound-Protein Interaction':
+                opts = DRUG_SCRENN_CPI_OPTS
+            elif task == 'Compound-Protein Binding Affinity':
+                opts = DRUG_SCRENN_CPA_OPTS
+        return {
+            html_report: create_html_report(df, file=None, task=task),
+            raw_df: df,
+            report_df: df.copy(),
+            analyze_btn: gr.Button(interactive=True),
+            report_task: task,
+            job_opts: gr.CheckboxGroup(
+                label=f'{job} Advanced Options',
+                choices=opts,
+            ) if job else gr.CheckboxGroup(visible=False),
+        }
     else:
         return {analyze_btn: gr.Button(interactive=False)}
         if unique_df is not None:
             if 'Target FASTA' in unique_df.columns:
                 unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '<br>')
+            if 'Max. Sequence Identity to Training Targets' in unique_df.columns:
+                # Add alert emoji for sequence identity below 0.85
+                if unique_df['Max. Sequence Identity to Training Targets'].iloc[0] < 0.85:
+                    unique_df['Max. Sequence Identity to Training Targets'] = (
+                        f'{unique_df["Max. Sequence Identity to Training Targets"]:.3f} ⚠️'
+                    )
+            if 'Max. Tanimoto Similarity to Training Compounds' in unique_df.columns:
+                # Add alert emoji for Tanimoto similarity below 0.85
+                if unique_df['Max. Tanimoto Similarity to Training Compounds'].iloc[0] < 0.85:
+                    unique_df['Max. Tanimoto Similarity to Training Compounds'] = (
+                        f'{unique_df["Max. Tanimoto Similarity to Training Compounds"]:.3f} ⚠️'
+                    )
             if any(unique_df.columns.isin(bool_cols)):
                 unique_df = unique_df.style.applymap(
                     lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols)
         uniprot_id_formatter = HTMLTemplateFormatter(
             template='<% if (value == value) { '  # Check if value is not NaN
                      'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
+                     # Check if value is a valid UniProt ID
                      '{ %><a href="https://www.uniprot.org/uniprotkb/<%= value %>" target="_blank"><%= value %></a><% '
+                     # Else treat it as a sequence or other plain-text string, line-wrapping every 60 characters
+                     '} else { %><div style="white-space: pre-wrap;"><%= value.match(/.{1,60}/g).join("<br>") '
+                     '%></div><% } %><% } else { %><% } %>'  # Output empty string if value is NaN
         )
         pubchem_id_formatter = HTMLTemplateFormatter(
             template='<% if (value == value) { '  # Check if value is not NaN
                      'target="_blank"><%= value %></a>'
                      '<% } else { %><% } %>'  # Output empty string if value is NaN
         )
+        alert_emoji_formatter = HTMLTemplateFormatter(
+            template='<% if (value < 0.85) { %><%= value %> ⚠️<% } else { %><%= value %><% } %>'
+        )
         bool_formatters = {col: BooleanFormatter() for col in bool_cols}
         float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
         other_formatters = {
             'Max. Id. Target': uniprot_id_formatter,
             'Max. Sim. Training Compound': pubchem_id_formatter,
             'Max. Id. Training Target': uniprot_id_formatter,
+            'Max. Sequence Identity to Training Targets': alert_emoji_formatter,
+            'Max. Sequence Identity to Known Targets of Hit Compound': alert_emoji_formatter,
         }
         formatters = {**bool_formatters, **float_formatters, **other_formatters}
     return p
+def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progress(track_tqdm=True)):
     df_report = df.copy()
     try:
         for filter_name in filter_list:
             df_report[score_name] = df_report['Compound'].parallel_apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
+        if opt_list:
+            df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
+            df_report = apply_advanced_opts(df_report, opt_list, df_training)
         return (create_html_report(df_report, file=None, task=task), df_report,
                 gr.File(visible=False), gr.File(visible=False))
 higher identities usually lead to greater confidence in the predictions.<br>
 """)
                     drug_screen_opts = gr.CheckboxGroup(
+                        label="Step 6. Select Advanced Options",
+                        value=DRUG_SCRENN_CPI_OPTS[0],
                         choices=DRUG_SCRENN_CPI_OPTS,
+                        info="Advanced features - may increase the job computation time. "
+                             "See the Help Tip on the right or the Documentation for detailed explanation.",
                     )
             with gr.Row():
                 with gr.Column():
 """)
                         target_identify_opts = gr.CheckboxGroup(
                             choices=TARGET_IDENTIFY_CPI_OPTS,
+                            value=TARGET_IDENTIFY_CPI_OPTS[0],
+                            label='Step 6. Select Advanced Options',
+                            info="Advanced features - may increase the job computation time. "
                                  "See the Help Tip on the right or the Documentation for detailed explanation."
                         )
                 with gr.Row():
                                               label='Specify the Task Labels in the Uploaded Dataset')
                 with gr.Column(scale=2):
                     with gr.Row():
+                        with gr.Row():
+                            scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
+                            filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
+                        job_opts = gr.CheckboxGroup(visible=False)
                     with gr.Accordion('Report Generate Options', open=True):
                         with gr.Row():
                             csv_sep = gr.Radio(label='CSV Delimiter',
     )
     analyze_btn.click(
+        fn=submit_report, inputs=[raw_df, scores, filters, job_opts, report_task], outputs=[
             html_report, report_df, csv_download_file, html_download_file]
     ).success(
         fn=lambda: [gr.Button(interactive=True)] * 2,
     )
     def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
         csv_sep_map = {
             'Comma': ',',