DeepSEQreen_NAR_fb

Sleeping

App Files Community

libokj committed on May 14, 2024

Commit

f0342df

1 Parent(s): c359763

Update app.py

Browse files

Files changed (1) hide show

app.py +226 -186

app.py CHANGED Viewed

@@ -45,7 +45,7 @@ import panel as pn
 from apscheduler.schedulers.background import BackgroundScheduler
 from tinydb import TinyDB, Query
-# import swifter
 from tqdm.auto import tqdm
 from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
@@ -156,7 +156,6 @@ display: inline-block !important;
 footer {
 visibility: hidden
 }
 """
@@ -192,7 +191,11 @@ def rgb_to_hex(rgb):
 def mol_to_pharm3d(mol, mode='html'):
     if mol is None:
         return
-    AllChem.Compute2DCoords(mol)
     feats = FEAT_FACTORY.GetFeaturesForMol(mol)
@@ -291,13 +294,23 @@ COLUMN_ALIASES = {
 }
 DRUG_SCRENN_CPI_OPTS = [
-    'Include Compound Max. Tanimoto Similarity to Training Compounds',
-    'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
-    'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
 ]
 DRUG_SCRENN_CPA_OPTS = [
-    'Include Compound Max. Tanimoto Similarity to Training Compounds',
 ]
 pd.set_option('display.float_format', '{:.3f}'.format)
@@ -383,6 +396,13 @@ def max_tanimoto_similarity(smi, seen_smiles_with_fp):
     return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
 def max_sequence_identity(seq, seen_fastas):
     if seq is None or seen_fastas is None or seen_fastas.empty:
         return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}
@@ -395,20 +415,12 @@ def max_sequence_identity(seq, seen_fastas):
                 target = id2
         return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
-    aligner = PairwiseAligner()
-    aligner.mode = 'local'
     max_iden = 0
     target = None
     for fasta in seen_fastas['X2'].values:
-        alignment = aligner.align(seq, fasta)
-        identity = alignment.score / max(len(seq), len(fasta))
-        if identity == 1:
-            target = fasta
-            if 'ID2' in seen_fastas.columns:
-                id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
-                if pd.notnull(id2) and id2 != '':
-                    target = id2
-            return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
         if identity > max_iden:
             max_iden = identity
             target = fasta
@@ -416,7 +428,10 @@ def max_sequence_identity(seq, seen_fastas):
                 id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
                 if pd.notnull(id2) and id2 != '':
                     target = id2
     return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
@@ -846,12 +861,12 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         orig_df['Target Family'] = None
     if orig_df['Target Family'].isna().any():
         orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
-            orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
         )
     orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
     detect_family.cache_clear()
-    orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize)
     orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
     annotated_df = orig_df[~orig_df['Y'].isna()].copy()
@@ -952,66 +967,88 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         df_list = [prediction_df, annotated_df]
         prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
-        if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
-            for family in prediction_df['Target Family'].unique():
-                family_smiles_df = get_seen_smiles(family=family, task=task_value)
-                family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(smiles_to_ecfp)
-                @cache
-                def max_sim(smi):
-                    return max_tanimoto_similarity(smi, family_smiles_df)['Max. Tanimoto Similarity']
-                prediction_df.loc[
-                    prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
-                    prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
-                )
-                max_sim.cache_clear()
-        if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
             x2 = prediction_df['X2'].iloc[0]
             pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
-            pos_compounds_df['FP'] = pos_compounds_df['X1'].apply(smiles_to_ecfp)
             @cache
             def max_sim(smiles):
                 return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
-            prediction_df[['Max. Tanimoto Similarity to Known Target Ligands',
-                           'Max. Tanimoto Similarity Target Ligand']] = (
-                prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
-            )
             max_sim.cache_clear()
-        if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
             x2 = prediction_df['X2'].iloc[0]
-            prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
             @cache
-            def calculate_max_sequence_identity(compound):
-                compound_targets = df_training.loc[df_training['X1'] == compound]
-                return max_sequence_identity(x2, seen_fastas=compound_targets)
-            prediction_df[['Max. Sequence Identity to Known Ligand Targets',
-                           'Max. Sequence Identity Ligand Target']] = (
-                prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
             )
             prediction_df.drop(['X1^'], axis=1, inplace=True)
-            calculate_max_sequence_identity.cache_clear()
-        if "Include Target Max. Sequence Identity to Training Targets" in opts:
-            for family in prediction_df['Target Family'].unique():
-                family_fastas_df = get_seen_fastas(family=family, task=task_value)
-                @cache
-                def max_id(seq):
-                    return max_sequence_identity(seq, seen_fastas=family_fastas_df)['Max. Sequence Identity']
-                prediction_df.loc[
-                    prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
-                    prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
-                )
-                max_id.cache_clear()
         prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
         status = "COMPLETED"
@@ -1063,10 +1100,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
         if 'X1' in df.columns:
             if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
-                df['Compound'] = df['X1'].parallel_apply(
                     lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
-            df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
-            df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
         if task == 'Compound-Protein Binding Affinity':
             # Convert Y^ from pIC50 to IC50
@@ -1114,17 +1151,17 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
     columns_unique = None
     if 'Exclude Pharmacophore 3D' not in opts:
-        df_html['Pharmacophore'] = df_html['Compound'].parallel_apply(
             lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
     if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
-        df_html['Compound'] = df_html['Compound'].parallel_apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Compound'], axis=1, inplace=True)
     if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
-        df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Scaffold'], axis=1, inplace=True)
@@ -1138,15 +1175,20 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
             if any(col in df_html.columns for col in ['Y^', 'Y']):
                 job = 'Target Protein Identification'
                 category = 'Target Family'
-            columns_unique = df_html.columns.isin(['X1', 'ID1', 'Scaffold', 'Compound', 'Scaffold SMILES']
-                                                  + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys()))
         elif n_compound >= 2 and n_protein == 1:
             unique_entity = 'Target of Interest'
             if any(col in df_html.columns for col in ['Y^', 'Y']):
                 job = 'Drug Hit Screening'
                 category = 'Scaffold SMILES'
-            columns_unique = df_html.columns.isin(['X2', 'ID2'])
         elif 'Y^' in df_html.columns:
             job = 'Interaction Pair Inference'
@@ -1154,7 +1196,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
     df_html.rename(columns=column_aliases, inplace=True)
     df_html.index.name = 'Index'
     if 'Target FASTA' in df_html.columns:
-        df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
             lambda x: wrap_text(x) if not pd.isna(x) else x)
     num_cols = df_html.select_dtypes('number').columns
@@ -1172,7 +1214,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         if 'Target ID' in df_html.columns:
             df_html.drop(['Target FASTA'], axis=1, inplace=True)
         if 'Target FASTA' in df_html.columns:
-            df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
                 lambda x: wrap_text(x) if not pd.isna(x) else x)
         if 'Scaffold SMILES' in df_html.columns:
             df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
@@ -1248,9 +1290,9 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         report_table = pn.widgets.Tabulator(
             df_html, formatters=formatters,
-            frozen_columns=[col for col in df_html.columns if col in [
-                'Target ID', 'Compound ID', 'Compound'
-            ]],
             disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
         for i, col in enumerate(num_cols):
@@ -1279,71 +1321,15 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         # Remove keys with empty values
         pie_charts = {k: v for k, v in pie_charts.items() if any(v)}
-        pn_css = """
-        .tabulator {
-            font-family: Courier New !important;
-            font-weight: normal !important;
-            font-size: 12px !important;
-        }
-        .tabulator-cell {
-            overflow: visible !important;
-        }
-        .tabulator-cell:hover {
-            z-index: 1000 !important;
-        }
-        .tabulator-cell.tabulator-frozen:hover {
-            z-index: 1000 !important;
-        }
-        .image-zoom-viewer {
-            display: inline-block;
-            overflow: visible;
-            z-index: 1000;
-        }
-        .image-zoom-viewer::after {
-            content: "";
-            top: 0;
-            left: 0;
-            width: 100%;
-            height: 100%;
-            pointer-events: none;
-        }
-        .image-zoom-viewer:hover::after {
-            pointer-events: all;
-        }
-        /* When hovering over the container, scale its child (the SVG) */
-        .tabulator-cell:hover .image-zoom-viewer svg {
-            padding: 3px;
-            position: absolute;
-            background-color: rgba(250, 250, 250, 0.854);
-            box-shadow: 0 0 10px rgba(0, 0, 0, 0.618);
-            border-radius: 3px;
-            transform: scale(3); /* Scale up the SVG */
-            transition: transform 0.3s ease;
-            pointer-events: none; /* Prevents the SVG from blocking mouse interactions */
-            z-index: 1000;
-        }
-        .image-zoom-viewer svg {
-            display: block; /* SVG is a block-level element for proper scaling */
-            z-index: 1000;
-        }
-        .image-zoom-viewer:hover {
-            z-index: 1000;
-        }
-        """
-        pn.extension(raw_css=[pn_css], js_files={
-            '3Dmol': './3Dmol-min.js',
-            'panel_custom': './panel.js',
-        })
         template = pn.template.VanillaTemplate(
             title=f'DeepSEQreen {job} Report',
@@ -1359,7 +1345,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         if unique_df is not None:
             unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width',
                                                 show_index=False, disabled=True,
-                                                frozen_columns=['Compound ID', 'Compound', 'Scaffold'])
             # if pie_charts:
             #     unique_table.width = 640
             stats_pane.append(pn.Column(f'### {unique_entity}', unique_table))
@@ -1451,11 +1437,11 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
     df_report = df.copy()
     try:
         for filter_name in filter_list:
-            df_report[filter_name] = df_report['Compound'].parallel_apply(
                 lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
         for score_name in score_list:
-            df_report[score_name] = df_report['Compound'].parallel_apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         return (create_html_report(df_report, file=None, task=task), df_report,
@@ -1667,16 +1653,25 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                         label='OR Upload Your Own Library', variant='primary')
                     drug_library_upload = gr.File(label='Custom compound library file', visible=False)
-                drug_screen_opts = gr.CheckboxGroup(
-                    label="Step 6. Select Additional Options",
-                    choices=DRUG_SCRENN_CPI_OPTS,
-                    info="Experimental features - may increase the job computation time. "
-                         "Compound Max. Tanimoto Similarity to Training Compounds and "
-                         "Target Max. Sequence Identity to Known Interacting Targets of Compound "
-                         "suggest predictive reliability (higher values - greater reliability)."
-                         "Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
-                         "suggests the novelty of the compound (lower values - higher novelty)."
-                )
             with gr.Row():
                 with gr.Column():
                     drug_screen_email = gr.Textbox(
@@ -1777,14 +1772,24 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                         target_library_upload_btn = gr.UploadButton(
                             label='OR Upload Your Own Library', variant='primary')
                         target_library_upload = gr.File(label='Custom target library file', visible=False)
-                    target_identify_opts = gr.CheckboxGroup(
-                        ['Include Target Max. Sequence Identity to Training Targets'],
-                        label='Step 6. Select Additional Options',
-                        info="Experimental features - may increase the job computation time. "
-                             "Target Max. Sequence Identity to Training Targets suggest "
-                             "predictive reliability (higher values - greater reliability)."
-                    )
                 with gr.Row():
                     with gr.Column():
                         target_identify_email = gr.Textbox(
@@ -1823,9 +1828,11 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                 label='Step 1. Select Pair Input Type and Input',
                 value='Upload a CSV file containing paired compound-protein data')
             with gr.Column() as pair_upload:
-                gr.File(label="Example CSV dataset",
-                        value="data/examples/interaction_pair_inference.csv",
-                        interactive=False)
                 with gr.Row():
                     infer_csv_prompt = gr.Button(
                         value="Upload Your Own Dataset Below",
@@ -1833,27 +1840,50 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                 with gr.Column():
                     infer_pair = gr.File(
                         label='Upload CSV File Containing Paired Records',
-                        file_count="single", type='filepath', visible=True)
             with gr.Column(visible=False) as pair_generate:
                 with gr.Row():
-                    gr.File(label='Example SDF compound library',
-                            value='data/examples/compound_library.sdf', interactive=False)
-                    gr.File(label='Example FASTA target library',
-                            value='data/examples/target_library.fasta', interactive=False)
                 with gr.Row():
-                    gr.File(label='Example CSV compound library',
-                            value='data/examples/compound_library.csv', interactive=False)
-                    gr.File(label='Example CSV target library',
-                            value='data/examples/target_library.csv', interactive=False)
                 with gr.Row():
                     infer_library_prompt = gr.Button(
                         value="Upload Your Own Libraries Below",
-                        visible=False, variant='secondary')
                 with gr.Row():
-                    infer_drug = gr.File(label='Upload SDF/CSV File Containing Multiple Compounds',
-                                         file_count="single", type='filepath')
-                    infer_target = gr.File(label='Upload FASTA/CSV File Containing Multiple Targets',
-                                           file_count="single", type='filepath')
             with gr.Row():
                 with gr.Column(min_width=200):
@@ -1862,10 +1892,12 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                         "If the proteins in the target library of interest "
                         "all belong to the same protein family, manually selecting the family is supported."
                     )
                     pair_infer_target_family = gr.Dropdown(
                         choices=list(TARGET_FAMILY_MAP.keys()),
                         value='General',
-                        label='Step 2. Select Target Family (Optional)')
                 with gr.Column(min_width=200):
                     HelpTip(
@@ -1877,15 +1909,17 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                     pair_infer_task = gr.Dropdown(
                         list(TASK_MAP.keys()),
                         label='Step 3. Select a Prediction Task',
-                        value='Compound-Protein Interaction')
                 with gr.Column(min_width=200):
-                    HelpTip("Select your preferred model. "
-                            "Please refer to documentation for detailed benchmark results."
-                            )
                     pair_infer_preset = gr.Dropdown(
                         list(PRESET_MAP.keys()),
-                        label='Step 4. Select a Preset Model')
                     # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
                     #                                        variant='primary')
             pair_infer_opts = gr.CheckboxGroup(visible=False)
@@ -2093,7 +2127,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                 alignment = aligner.align(processed_fasta, query)
                 return alignment.score / max(len(processed_fasta), len(query))
-            alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
             row = alignment_df.loc[alignment_df['score'].idxmax()]
             family = str(row['Target Family']).title()
             return gr.Dropdown(value=family,
@@ -2119,6 +2153,12 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
         show_progress='hidden'
     )
     def example_fill(input_type):
         return {target_id: 'Q16539',
@@ -2419,13 +2459,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
             infer_df = pd.read_csv(drug_target_pair_upload)
             validate_columns(infer_df, ['X1', 'X2'])
-            infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
                 validate_seq_str, regex=SMILES_PAT)
             if not infer_df['X1_ERR'].isna().all():
                 raise ValueError(
                     f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
-            infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
                 validate_seq_str, regex=FASTA_PAT)
             if not infer_df['X2_ERR'].isna().all():
                 raise ValueError(
@@ -2757,7 +2797,7 @@ if __name__ == "__main__":
             db.update({'status': 'FAILED'}, Job.id == job['id'])
     scheduler = BackgroundScheduler()
-    scheduler.add_job(check_expiry, 'interval', hours=1)
     scheduler.start()
     demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)

 from apscheduler.schedulers.background import BackgroundScheduler
 from tinydb import TinyDB, Query
+import swifter
 from tqdm.auto import tqdm
 from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
 footer {
 visibility: hidden
 }
 """
 def mol_to_pharm3d(mol, mode='html'):
     if mol is None:
         return
+    # AllChem.Compute2DCoords(mol)
+    mol = Chem.AddHs(mol)
+    params = AllChem.ETKDGv3()
+    params.randomSeed = 0xf00d  # for reproducibility
+    AllChem.EmbedMolecule(mol, params)
     feats = FEAT_FACTORY.GetFeaturesForMol(mol)
 }
 DRUG_SCRENN_CPI_OPTS = [
+    'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
+    'Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target',
+    'Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound',
 ]
 DRUG_SCRENN_CPA_OPTS = [
+    'Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set',
+]
+TARGET_IDENTIFY_CPI_OPTS = [
+    'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
+    'Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound',
+    'Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target',
+]
+TARGET_IDENTIFY_CPA_OPTS = [
+    'Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set',
 ]
 pd.set_option('display.float_format', '{:.3f}'.format)
     return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
+def alignment_score(query, target):
+    aligner = PairwiseAligner()
+    aligner.mode = 'local'
+    alignment = aligner.align(query, target)
+    return alignment.score / max(len(query), len(target))
 def max_sequence_identity(seq, seen_fastas):
     if seq is None or seen_fastas is None or seen_fastas.empty:
         return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}
                 target = id2
         return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
+    cached_alignment_score = cache(alignment_score)
     max_iden = 0
     target = None
     for fasta in seen_fastas['X2'].values:
+        identity = cached_alignment_score(seq, fasta)
         if identity > max_iden:
             max_iden = identity
             target = fasta
                 id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
                 if pd.notnull(id2) and id2 != '':
                     target = id2
+            if max_iden == 1:
+                break
+    cached_alignment_score.cache_clear()
     return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
         orig_df['Target Family'] = None
     if orig_df['Target Family'].isna().any():
         orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
+            orig_df.loc[orig_df['Target Family'].isna(), 'X2'].swifter.apply(detect_family)
         )
     orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
     detect_family.cache_clear()
+    orig_df['X1^'] = orig_df['X1'].swifter.apply(rdkit_canonicalize)
     orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
     annotated_df = orig_df[~orig_df['Y'].isna()].copy()
         df_list = [prediction_df, annotated_df]
         prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
+        # Advanced options for Drug Hit Screening
+        if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
+            x2 = prediction_df['X2'].iloc[0]
+            prediction_df[[
+                'Max. Sequence Identity to Training Targets',
+                'Max. Id. Training Target'
+            ]] = pd.Series(max_sequence_identity(x2, df_training))
+        if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
             x2 = prediction_df['X2'].iloc[0]
             pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
+            pos_compounds_df['FP'] = pos_compounds_df['X1'].swifter.apply(smiles_to_ecfp)
             @cache
             def max_sim(smiles):
                 return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
+            prediction_df[[
+                'Max. Tanimoto Similarity to Known Ligands',
+                'Max. Sim. Ligand'
+            ]] = prediction_df['X1'].swifter.apply(max_sim).apply(pd.Series)
             max_sim.cache_clear()
+        if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
             x2 = prediction_df['X2'].iloc[0]
+            prediction_df['X1^'] = prediction_df['X1'].swifter.apply(rdkit_canonicalize)
             @cache
+            def max_id(compound):
+                pos_targets_df = df_training.loc[df_training['X1'] == compound]
+                return max_sequence_identity(x2, seen_fastas=pos_targets_df)
+            prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
+                           'Max. Id. Target']] = (
+                prediction_df['X1^'].swifter.apply(max_id).apply(pd.Series)
             )
             prediction_df.drop(['X1^'], axis=1, inplace=True)
+            max_id.cache_clear()
+        # Advanced options for Target Protein Identification
+        if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
+            x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+            prediction_df['FP'] = prediction_df['X1'].swifter.apply(smiles_to_ecfp)
+            prediction_df[[
+                'Max. Tanimoto Similarity to Training Compounds',
+                'Max. Sim. Training Compound'
+            ]] = pd.Series(max_tanimoto_similarity(x1, df_training))
+        if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
+            x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+            pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
+            @cache
+            def max_id(fasta):
+                return max_sequence_identity(fasta, seen_fastas=pos_targets_df)
+            prediction_df[[
+                'Max. Sequence Identity to Known Targets of Input Compound',
+                'Max. Id. Target'
+            ]] = prediction_df['X2'].swifter.apply(max_id).apply(pd.Series)
+            max_id.cache_clear()
+        if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
+            x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
+            @cache
+            def max_sim(fasta):
+                pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
+                pos_targets_df['FP'] = pos_targets_df['X1'].swifter.apply(smiles_to_ecfp)
+                return max_tanimoto_similarity(x1, seen_smiles_with_fp=pos_targets_df)
+            prediction_df[[
+                'Max. Tanimoto Similarity to Known Ligands of Identified Target',
+                'Max. Sim. Ligand'
+            ]] = prediction_df['X2'].swifter.apply(max_sim).apply(pd.Series)
+            max_sim.cache_clear()
         prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
         status = "COMPLETED"
         if 'X1' in df.columns:
             if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
+                df['Compound'] = df['X1'].swifter.apply(
                     lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
+            df['Scaffold'] = df['Compound'].swifter.apply(MurckoScaffold.GetScaffoldForMol)
+            df['Scaffold SMILES'] = df['Scaffold'].swifter.apply(lambda x: Chem.MolToSmiles(x))
         if task == 'Compound-Protein Binding Affinity':
             # Convert Y^ from pIC50 to IC50
     columns_unique = None
     if 'Exclude Pharmacophore 3D' not in opts:
+        df_html['Pharmacophore'] = df_html['Compound'].swifter.apply(
             lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
     if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
+        df_html['Compound'] = df_html['Compound'].swifter.apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Compound'], axis=1, inplace=True)
     if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
+        df_html['Scaffold'] = df_html['Scaffold'].swifter.apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Scaffold'], axis=1, inplace=True)
             if any(col in df_html.columns for col in ['Y^', 'Y']):
                 job = 'Target Protein Identification'
                 category = 'Target Family'
+            columns_unique = df_html.columns.isin(
+                ['ID1', 'Pharmacophore', 'Compound', 'Scaffold', 'X1', 'Scaffold SMILES',
+                 'Max. Tanimoto Similarity to Training Compounds', 'Max. Sim. Training Compound']
+                + list(FILTER_MAP.keys()) + list(SCORE_MAP.keys())
+            )
         elif n_compound >= 2 and n_protein == 1:
             unique_entity = 'Target of Interest'
             if any(col in df_html.columns for col in ['Y^', 'Y']):
                 job = 'Drug Hit Screening'
                 category = 'Scaffold SMILES'
+            columns_unique = df_html.columns.isin(
+                ['X2', 'ID2', 'Max. Sequence Identity to Training Targets', 'Max. Id. Training Target']
+            )
         elif 'Y^' in df_html.columns:
             job = 'Interaction Pair Inference'
     df_html.rename(columns=column_aliases, inplace=True)
     df_html.index.name = 'Index'
     if 'Target FASTA' in df_html.columns:
+        df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
             lambda x: wrap_text(x) if not pd.isna(x) else x)
     num_cols = df_html.select_dtypes('number').columns
         if 'Target ID' in df_html.columns:
             df_html.drop(['Target FASTA'], axis=1, inplace=True)
         if 'Target FASTA' in df_html.columns:
+            df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
                 lambda x: wrap_text(x) if not pd.isna(x) else x)
         if 'Scaffold SMILES' in df_html.columns:
             df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
         report_table = pn.widgets.Tabulator(
             df_html, formatters=formatters,
+            frozen_columns=[
+                'Index', 'Target ID', 'Compound ID', 'Compound'
+            ],
             disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
         for i, col in enumerate(num_cols):
         # Remove keys with empty values
         pie_charts = {k: v for k, v in pie_charts.items() if any(v)}
+        pn.extension(
+            css_files=[
+                './static/panel.css',
+            ],
+            js_files={
+                '3Dmol': './static/3Dmol-min.js',
+                'panel_custom': './static/panel.js'
+            }
+        )
         template = pn.template.VanillaTemplate(
             title=f'DeepSEQreen {job} Report',
         if unique_df is not None:
             unique_table = pn.widgets.Tabulator(unique_df, formatters=formatters, sizing_mode='stretch_width',
                                                 show_index=False, disabled=True,
+                                                frozen_columns=['Compound ID', 'Compound', 'Target ID'])
             # if pie_charts:
             #     unique_table.width = 640
             stats_pane.append(pn.Column(f'### {unique_entity}', unique_table))
     df_report = df.copy()
     try:
         for filter_name in filter_list:
+            df_report[filter_name] = df_report['Compound'].swifter.apply(
                 lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
         for score_name in score_list:
+            df_report[score_name] = df_report['Compound'].swifter.apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         return (create_html_report(df_report, file=None, task=task), df_report,
                         label='OR Upload Your Own Library', variant='primary')
                     drug_library_upload = gr.File(label='Custom compound library file', visible=False)
+                with gr.Column():
+                    HelpTip("""
+<b>Max. Sequence Identity between the Input Target and Targets in the Training Set</b>:
+this serves as an indicator of the predictioon applicability/reliability –
+higher similarities indicate more reliable predictions (preferably > 0.85).<br>
+<b>Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target</b>:
+this serves as an indicator of both the confidence level and novelty of the predicted hit compounds –
+higher similarities suggest greater confidence, while lower Tanimoto similarities may indicate the novelty
+of the identified hit compounds compared to known drugs or true interacting compounds of the input target.<br>
+<b>Max. Sequence Identity between the Input Target and Known Targets of Hit Compound</b>:
+this serves as an additional indicator of the confidence level of the predicted hit compounds –
+higher identities usually lead to greater confidence in the predictions.<br>
+""")
+                    drug_screen_opts = gr.CheckboxGroup(
+                        label="Step 6. Select Additional Options",
+                        choices=DRUG_SCRENN_CPI_OPTS,
+                        info="Experimental features - may increase the job computation time."
+                             "See the Help Tip on the right or the Documentation for detailed explanation."
+                    )
             with gr.Row():
                 with gr.Column():
                     drug_screen_email = gr.Textbox(
                         target_library_upload_btn = gr.UploadButton(
                             label='OR Upload Your Own Library', variant='primary')
                         target_library_upload = gr.File(label='Custom target library file', visible=False)
+                    with gr.Column():
+                        HelpTip("""
+<b>Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set</b>:
+this serves as an indicator of prediction applicability and reliability –
+higher similarities indicates more reliable predictions (ideally > 0.85).<br>
+<b>Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound</b>:
+this serves as an indicator of prediction confidence for the potential targets –
+higher similarities typically imply higher confidence levels.<br>
+<b>Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target</b>:
+this serves as an additional indicator of the confidence level in the predicted potential targets –
+higher similarities usually correspond to greater prediction confidence.<br>
+""")
+                        target_identify_opts = gr.CheckboxGroup(
+                            choices=TARGET_IDENTIFY_CPI_OPTS,
+                            label='Step 6. Select Additional Options',
+                            info="Experimental features - may increase the job computation time. "
+                                 "See the Help Tip on the right or the Documentation for detailed explanation."
+                        )
                 with gr.Row():
                     with gr.Column():
                         target_identify_email = gr.Textbox(
                 label='Step 1. Select Pair Input Type and Input',
                 value='Upload a CSV file containing paired compound-protein data')
             with gr.Column() as pair_upload:
+                gr.File(
+                    label="Example CSV dataset",
+                    value="data/examples/interaction_pair_inference.csv",
+                    interactive=False
+                )
                 with gr.Row():
                     infer_csv_prompt = gr.Button(
                         value="Upload Your Own Dataset Below",
                 with gr.Column():
                     infer_pair = gr.File(
                         label='Upload CSV File Containing Paired Records',
+                        file_count="single",
+                        type='filepath',
+                        visible=True
+                    )
             with gr.Column(visible=False) as pair_generate:
                 with gr.Row():
+                    gr.File(
+                        label='Example SDF compound library',
+                        value='data/examples/compound_library.sdf',
+                        interactive=False
+                    )
+                    gr.File(
+                        label='Example FASTA target library',
+                        value='data/examples/target_library.fasta',
+                        interactive=False
+                    )
                 with gr.Row():
+                    gr.File(
+                        label='Example CSV compound library',
+                        value='data/examples/compound_library.csv',
+                        interactive=False
+                    )
+                    gr.File(
+                        label='Example CSV target library',
+                        value='data/examples/target_library.csv',
+                        interactive=False
+                    )
                 with gr.Row():
                     infer_library_prompt = gr.Button(
                         value="Upload Your Own Libraries Below",
+                        visible=False,
+                        variant='secondary'
+                    )
                 with gr.Row():
+                    infer_drug = gr.File(
+                        label='Upload SDF/CSV File Containing Multiple Compounds',
+                        file_count="single",
+                        type='filepath'
+                    )
+                    infer_target = gr.File(
+                        label='Upload FASTA/CSV File Containing Multiple Targets',
+                        file_count="single",
+                        type='filepath'
+                    )
             with gr.Row():
                 with gr.Column(min_width=200):
                         "If the proteins in the target library of interest "
                         "all belong to the same protein family, manually selecting the family is supported."
                     )
                     pair_infer_target_family = gr.Dropdown(
                         choices=list(TARGET_FAMILY_MAP.keys()),
                         value='General',
+                        label='Step 2. Select Target Family (Optional)'
+                    )
                 with gr.Column(min_width=200):
                     HelpTip(
                     pair_infer_task = gr.Dropdown(
                         list(TASK_MAP.keys()),
                         label='Step 3. Select a Prediction Task',
+                        value='Compound-Protein Interaction'
+                    )
                 with gr.Column(min_width=200):
+                    HelpTip(
+                        "Select your preferred model. Please refer to documentation for detailed benchmark results."
+                    )
                     pair_infer_preset = gr.Dropdown(
                         list(PRESET_MAP.keys()),
+                        label='Step 4. Select a Preset Model'
+                    )
                     # infer_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
                     #                                        variant='primary')
             pair_infer_opts = gr.CheckboxGroup(visible=False)
                 alignment = aligner.align(processed_fasta, query)
                 return alignment.score / max(len(processed_fasta), len(query))
+            alignment_df['score'] = alignment_df['X2'].swifter.apply(align_score)
             row = alignment_df.loc[alignment_df['score'].idxmax()]
             family = str(row['Target Family']).title()
             return gr.Dropdown(value=family,
         show_progress='hidden'
     )
+    target_identify_task.select(
+        fn=lambda task, opts: gr.CheckboxGroup(choices=TARGET_IDENTIFY_CPA_OPTS)
+        if task == 'Compound-Protein Binding Affinity' else gr.CheckboxGroup(choices=DRUG_SCRENN_CPI_OPTS),
+        inputs=[target_identify_task, target_identify_opts], outputs=target_identify_opts,
+        show_progress='hidden'
+    )
     def example_fill(input_type):
         return {target_id: 'Q16539',
             infer_df = pd.read_csv(drug_target_pair_upload)
             validate_columns(infer_df, ['X1', 'X2'])
+            infer_df['X1_ERR'] = infer_df['X1'].swifter.apply(
                 validate_seq_str, regex=SMILES_PAT)
             if not infer_df['X1_ERR'].isna().all():
                 raise ValueError(
                     f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+            infer_df['X2_ERR'] = infer_df['X2'].swifter.apply(
                 validate_seq_str, regex=FASTA_PAT)
             if not infer_df['X2_ERR'].isna().all():
                 raise ValueError(
             db.update({'status': 'FAILED'}, Job.id == job['id'])
     scheduler = BackgroundScheduler()
+    scheduler.add_job(check_expiry, 'interval', hours=1, timezone=pytz.utc)
     scheduler.start()
     demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)