DeepSEQreen_NAR_fb

Sleeping

App Files Files Community

libokj committed on Apr 22, 2024

Commit

51a0841

1 Parent(s): 41c7c53

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -76

app.py CHANGED Viewed

@@ -29,7 +29,7 @@ from rdkit.DataStructs import BulkTanimotoSimilarity
 from requests.adapters import HTTPAdapter, Retry
 from markdown import markdown
 from rdkit import Chem, DataStructs
-from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen, AllChem
 from rdkit.Chem.Scaffolds import MurckoScaffold
 import seaborn as sns
@@ -231,6 +231,16 @@ COLUMN_ALIASES = {
     'Y^': 'Predicted CPI/CPA',
 }
 pd.set_option('display.float_format', '{:.3f}'.format)
 PandasTools.molRepresentation = 'svg'
 PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
@@ -277,6 +287,15 @@ def check_expiry():
                 send_email(job)
 def max_tanimoto_similarity(smi, seen_smiles_with_fp):
     if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
         return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
@@ -294,7 +313,7 @@ def max_tanimoto_similarity(smi, seen_smiles_with_fp):
         return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
     mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
-    sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
     idx = sims.argmax()
     compound = seen_smiles_with_fp.iloc[idx]['X1']
     if 'ID1' in seen_smiles_with_fp.columns:
@@ -658,6 +677,7 @@ def lookup_job(job_id):
     retry = 0
     while not stop:
         try:
             Job = Query()
             jobs = db.search((Job.id == job_id))
             if jobs:
@@ -671,7 +691,6 @@ def lookup_job(job_id):
                 if job.get('expiry_time'):
                     expiry_time = ts_to_str(job['expiry_time'], get_timezone_by_ip(job['ip']))
                 if job_status == "RUNNING":
-                    sleep(5)
                     yield {
                         pred_lookup_status: f'''
 Your **{job_type}** job (ID: **{job_id}**) started at
@@ -713,10 +732,12 @@ using the job id. You will also receive an email notification once the job is do
                     }
             else:
                 stop = (retry > 3)
-                msg = f'Job ID {job_id} not found. Retrying... ({retry})'
                 gr.Info(msg)
                 retry += 1
-                sleep(5)
                 yield {
                     pred_lookup_status: msg,
                     pred_lookup_btn: gr.Button(visible=True),
@@ -864,10 +885,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_smiles_df = get_seen_smiles(family=family, task=task_value)
-                family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
-                    lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
-                        Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
-                )
                 @cache
                 def max_sim(smi):
@@ -881,12 +899,13 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
             x2 = prediction_df['X2'].iloc[0]
-            pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)]
-            pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(
-                lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
-                    Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
-            )
-            max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=pos_compounds_df))
             prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
                 prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
             )
@@ -904,6 +923,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
             prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
                 prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
             )
             calculate_max_sequence_identity.cache_clear()
@@ -946,7 +966,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
                    'error': error,
                    'input_file': predict_filepath,
                    'output_file': predictions_file},
-                   job_query)
         if job_info := db.search(job_query)[0]:
             if job_info.get('email'):
                 send_email(job_info)
@@ -981,21 +1001,6 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
             if 'Y^' in df.columns:
                 df['Y^'] = 10 ** (-df['Y^'])
-        # DF_FOR_REPORT = df.copy()
-        # pie_chart = None
-        # value = None
-        # if 'Y^' in DF_FOR_REPORT.columns:
-        #     value = 'Y^'
-        # elif 'Y' in DF_FOR_REPORT.columns:
-        #     value = 'Y'
-        # if value:
-        #     if DF_FOR_REPORT['X1'].nunique() > 1 >= DF_FOR_REPORT['X2'].nunique():
-        #         pie_chart = create_pie_chart(DF_FOR_REPORT, category='Scaffold SMILES', value=value, top_k=100)
-        #     elif DF_FOR_REPORT['X2'].nunique() > 1 >= DF_FOR_REPORT['X1'].nunique():
-        #         pie_chart = create_pie_chart(DF_FOR_REPORT, category='Target family', value=value, top_k=100)
         return {html_report: create_html_report(df, file=None, task=task),
                 raw_df: df,
                 report_df: df.copy(),
@@ -1121,23 +1126,30 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
                 f'<div style="overflow:auto; height: 300px; font-family: Courier !important;">{table_html}</div>')
     else:
         bool_formatters = {col: BooleanFormatter() for col in bool_cols}
         float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
         other_formatters = {
             'Predicted Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
             'Actual Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
-            'Compound': HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>'),
-            'Scaffold': HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>'),
             'Target FASTA': {'type': 'textarea', 'width': 60},
-            'Target ID': HTMLTemplateFormatter(
-                template='<a href="<% '
-                         'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
-                         '{ %>https://www.uniprot.org/uniprotkb/<%= value %><% } '
-                         'else { %>https://www.uniprot.org/uniprotkb?query=<%= value %><% } '
-                         '%>" target="_blank"><%= value %></a>'),
-            'Compound ID': HTMLTemplateFormatter(
-                template='<a href="https://pubchem.ncbi.nlm.nih.gov/compound/<%= value %>" '
-                         'target="_blank"><%= value %></a>')
         }
         formatters = {**bool_formatters, **float_formatters, **other_formatters}
@@ -1161,8 +1173,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
                     subset=df_html.columns == col, cmap=sns.light_palette(num_col_colors[i], as_cmap=True).reversed())
         pie_charts = {}
-        for y in df_html.columns.intersection(['Predicted Interaction Probability', 'Actual Interaction Probability',
-                                               'Predicted Binding Affinity', 'Actual Binding Affinity']):
             pie_charts[y] = []
             for k in [10, 30, 100]:
                 if k < len(df_html):
@@ -1348,19 +1359,6 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
             df_report[score_name] = df_report['Compound'].parallel_apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
-        # pie_chart = None
-        # value = None
-        # if 'Y^' in df.columns:
-        #     value = 'Y^'
-        # elif 'Y' in df.columns:
-        #     value = 'Y'
-        #
-        # if value:
-        #     if df['X1'].nunique() > 1 >= df['X2'].nunique():
-        #         pie_chart = create_pie_chart(df, category='Scaffold SMILES', value=value, top_k=100)
-        #     elif df['X2'].nunique() > 1 >= df['X1'].nunique():
-        #         pie_chart = create_pie_chart(df, category='Target family', value=value, top_k=100)
         return (create_html_report(df_report, file=None, task=task), df_report,
                 gr.File(visible=False), gr.File(visible=False))
@@ -1572,17 +1570,13 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                 drug_screen_opts = gr.CheckboxGroup(
                     label="Step 6. Select Additional Options",
-                    choices=[
-                        'Include Compound Max. Tanimoto Similarity to Training Compounds',
-                        'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
-                        'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
-                    ],
-                    info="These are experimental features and may increase the job computation time. "
                          "Compound Max. Tanimoto Similarity to Training Compounds and "
                          "Target Max. Sequence Identity to Known Interacting Targets of Compound "
-                         "are indicative of the predictive reliability of the model (the higher the more reliable), "
-                         "while Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
-                         "is indicative of the novelty of the compound (the lower the more novel)."
                 )
             with gr.Row():
                 with gr.Column():
@@ -1687,9 +1681,9 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                     target_identify_opts = gr.CheckboxGroup(
                         ['Include Target Max. Sequence Identity to Training Targets'],
                         label='Step 6. Select Additional Options',
-                        info="These are experimental features and may increase the job computation time. "
-                             "Target Max. Sequence Identity to Training Targets is indicative of the "
-                             "predictive reliability of the model (the higher the more reliable)."
                     )
                 with gr.Row():
                     with gr.Column():
@@ -2015,10 +2009,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
     ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
     drug_screen_task.select(
-        fn=lambda task, opts: [opt for opt in opts if opt not in [
-            'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
-            'Include Target Max. Sequence Identity to Known Interacting Targets of Compound'
-        ]] if task == 'Compound-Protein Binding Affinity' else opts,
         inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
         show_progress='hidden'
     )
@@ -2089,9 +2081,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
             scenario = "Unseen Target (<0.85 sequence identity)"
         return {drug_screen_preset:
-                gr.Dropdown(value=row['Model'],
-                            info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
-                                 f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
                 drug_screen_target_family:
                     gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}

 from requests.adapters import HTTPAdapter, Retry
 from markdown import markdown
 from rdkit import Chem, DataStructs
+from rdkit.Chem import AllChem, Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen
 from rdkit.Chem.Scaffolds import MurckoScaffold
 import seaborn as sns
     'Y^': 'Predicted CPI/CPA',
 }
+DRUG_SCRENN_CPI_OPTS = [  # choices for the drug_screen_opts CheckboxGroup (any task other than binding affinity)
+    'Include Compound Max. Tanimoto Similarity to Training Compounds',
+    'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
+    'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
+]  # NOTE(review): "SCRENN" is presumably a typo for "SCREEN"; rename together with every reference
+DRUG_SCRENN_CPA_OPTS = [  # choices swapped in when the task is 'Compound-Protein Binding Affinity'
+    'Include Compound Max. Tanimoto Similarity to Training Compounds',
+]
 pd.set_option('display.float_format', '{:.3f}'.format)
 PandasTools.molRepresentation = 'svg'
 PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
                 send_email(job)
+def smiles_to_ecfp(smiles):
+    """Convert a SMILES string into a 2048-bit Morgan (radius 2) bit-vector fingerprint.
+
+    Args:
+        smiles: SMILES string of the compound.
+
+    Returns:
+        The Morgan fingerprint of the parsed molecule. When ``smiles`` is not a
+        string or cannot be parsed, an all-zero 2048-bit ``ExplicitBitVect`` is
+        returned instead, so every value stored in an ``FP`` column is a bit
+        vector that ``BulkTanimotoSimilarity`` can consume.
+    """
+    # Guard against NaN/None cells: MolFromSmiles only accepts strings.
+    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
+    if mol is None:
+        # An empty list is not a fingerprint and makes the bulk similarity call
+        # fail; an empty bit vector of the same length as the real fingerprints
+        # keeps the column homogeneous.
+        # NOTE(review): confirm no caller relies on a falsy value for invalid SMILES.
+        return DataStructs.ExplicitBitVect(2048)
+    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
 def max_tanimoto_similarity(smi, seen_smiles_with_fp):
     if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
         return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
         return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
     mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
+    sims = pd.Series(BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'].values)).to_numpy()
     idx = sims.argmax()
     compound = seen_smiles_with_fp.iloc[idx]['X1']
     if 'ID1' in seen_smiles_with_fp.columns:
     retry = 0
     while not stop:
         try:
+            sleep(5)
             Job = Query()
             jobs = db.search((Job.id == job_id))
             if jobs:
                 if job.get('expiry_time'):
                     expiry_time = ts_to_str(job['expiry_time'], get_timezone_by_ip(job['ip']))
                 if job_status == "RUNNING":
                     yield {
                         pred_lookup_status: f'''
 Your **{job_type}** job (ID: **{job_id}**) started at
                     }
             else:
                 stop = (retry > 3)
+                if not stop:
+                    msg = f'Job ID {job_id} not found. Retrying... ({retry})'
+                else:
+                    msg = f'Job ID {job_id} not found after {retry} retries. Please check the job ID and try again.'
                 gr.Info(msg)
                 retry += 1
                 yield {
                     pred_lookup_status: msg,
                     pred_lookup_btn: gr.Button(visible=True),
         if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_smiles_df = get_seen_smiles(family=family, task=task_value)
+                family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(smiles_to_ecfp)
                 @cache
                 def max_sim(smi):
         if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
             x2 = prediction_df['X2'].iloc[0]
+            pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
+            pos_compounds_df['FP'] = pos_compounds_df['X1'].apply(smiles_to_ecfp)
+            @cache
+            def max_sim(smiles):
+                return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
             prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
                 prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
             )
             prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
                 prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
             )
+            prediction_df.drop(['N'], axis=1, inplace=True)
             calculate_max_sequence_identity.cache_clear()
                    'error': error,
                    'input_file': predict_filepath,
                    'output_file': predictions_file},
+                  job_query)
         if job_info := db.search(job_query)[0]:
             if job_info.get('email'):
                 send_email(job_info)
             if 'Y^' in df.columns:
                 df['Y^'] = 10 ** (-df['Y^'])
         return {html_report: create_html_report(df, file=None, task=task),
                 raw_df: df,
                 report_df: df.copy(),
                 f'<div style="overflow:auto; height: 300px; font-family: Courier !important;">{table_html}</div>')
     else:
+        image_zoom_formatter = HTMLTemplateFormatter(template='<div class="image-zoom-viewer"><%= value %></div>')
+        uniprot_id_formatter = HTMLTemplateFormatter(
+            template='<a href="<% '
+                     'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
+                     '{ %>https://www.uniprot.org/uniprotkb/<%= value %><% } '
+                     'else { %><textarea style="width: 60ch;"><%= value %></textarea><% } '
+                     '%>" target="_blank"><%= value %></a>'
+        )
+        pubchem_id_formatter = HTMLTemplateFormatter(
+            template='<a href="https://pubchem.ncbi.nlm.nih.gov/#query=<%= value %>" '
+                     'target="_blank"><%= value %></a>'
+        )
         bool_formatters = {col: BooleanFormatter() for col in bool_cols}
         float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
         other_formatters = {
             'Predicted Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
             'Actual Interaction Probability': {'type': 'progress', 'max': 1.0, 'legend': True},
+            'Compound': image_zoom_formatter,
+            'Scaffold': image_zoom_formatter,
             'Target FASTA': {'type': 'textarea', 'width': 60},
+            'Target ID': uniprot_id_formatter,
+            'Compound ID': pubchem_id_formatter,
+            'Max. Sequence Identity Target': uniprot_id_formatter,
+            'Max. Tanimoto Similarity Compound': pubchem_id_formatter,
         }
         formatters = {**bool_formatters, **float_formatters, **other_formatters}
                     subset=df_html.columns == col, cmap=sns.light_palette(num_col_colors[i], as_cmap=True).reversed())
         pie_charts = {}
+        for y in df_html.columns.intersection(['Interaction Probability', 'Binding Affinity']):
             pie_charts[y] = []
             for k in [10, 30, 100]:
                 if k < len(df_html):
             df_report[score_name] = df_report['Compound'].parallel_apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         return (create_html_report(df_report, file=None, task=task), df_report,
                 gr.File(visible=False), gr.File(visible=False))
                 drug_screen_opts = gr.CheckboxGroup(
                     label="Step 6. Select Additional Options",
+                    choices=DRUG_SCRENN_CPI_OPTS,
+                    info="Experimental features - may increase the job computation time. "
                          "Compound Max. Tanimoto Similarity to Training Compounds and "
                          "Target Max. Sequence Identity to Known Interacting Targets of Compound "
+                         "suggest predictive reliability (higher values - greater reliability). "
+                         "Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
+                         "suggests the novelty of the compound (lower values - higher novelty)."
                 )
             with gr.Row():
                 with gr.Column():
                     target_identify_opts = gr.CheckboxGroup(
                         ['Include Target Max. Sequence Identity to Training Targets'],
                         label='Step 6. Select Additional Options',
+                        info="Experimental features - may increase the job computation time. "
+                             "Target Max. Sequence Identity to Training Targets suggests "
+                             "predictive reliability (higher values - greater reliability)."
                     )
                 with gr.Row():
                     with gr.Column():
     ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
     drug_screen_task.select(
+        fn=lambda task, opts: gr.CheckboxGroup(choices=DRUG_SCRENN_CPA_OPTS)
+        if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup(choices=DRUG_SCRENN_CPI_OPTS),
         inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
         show_progress='hidden'
     )
             scenario = "Unseen Target (<0.85 sequence identity)"
         return {drug_screen_preset:
+                    gr.Dropdown(value=row['Model'],
+                                info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
+                                     f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
                 drug_screen_target_family:
                     gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}