DeepSEQreen_NAR_fb

Sleeping

App Files Files Community

libokj commited on Dec 24, 2023

Commit

2a4780f

1 Parent(s): 5cb3cd9

Update app.py

Browse files

Files changed (1) hide show

app.py +262 -86

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import hashlib
 import json
 import textwrap
 import threading
@@ -20,7 +21,7 @@ import hydra
 import pandas as pd
 import plotly.express as px
 import requests
-from rdkit.Chem.rdMolDescriptors import CalcNumRotatableBonds, CalcNumHeavyAtoms, CalcNumAtoms
 from requests.adapters import HTTPAdapter, Retry
 from rdkit import Chem
 from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools, AllChem
@@ -59,11 +60,13 @@ SESSION.mount('https://', ADAPTER)
 UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
 CSS = """
 .help-tip {
   position: absolute;
   display: inline-block;
-  top: 24px;
   right: 0px;
   text-align: center;
   border-radius: 40%;
@@ -204,6 +207,10 @@ def rotatable_bond(row):
     return CalcNumRotatableBonds((row['Compound']))
 def lipinski(row):
     """
     Lipinski's rules:
@@ -271,24 +278,72 @@ def ghose(row):
         return True
 SCORE_MAP = {
     'SAscore': sa_score,
     'LogP': logp,
     'Molecular weight': mw,
     'Molar refractivity': mr,
     'H-bond donor count': hbd,
     'H-Bond acceptor count': hba,
     'Rotatable bond count': rotatable_bond,
-    # 'TopoPSA': None,
 }
 FILTER_MAP = {
     'REOS': reos,
-    "Lipinski's rule of 5": lipinski,
     'Ghose': ghose,
-    # 'Rule of 3': rule_of_3,
-    # 'Veber': veber,
-    # 'PAINS': pains,
 }
 TASK_MAP = {
@@ -348,10 +403,12 @@ def validate_columns(df, mandatory_cols):
 def process_target_fasta(sequence):
-    lines = sequence.strip().split("\n")
-    if lines[0].startswith(">"):
-        lines = lines[1:]
-    return ''.join(lines).split(">")[0]
 def send_email(receiver, msg):
@@ -480,10 +537,11 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
             df['Scaffold SMILES'] = df['X1'].swifter.progress_bar(
                 desc=f"Calculating scaffold...").apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
             # Add a new column with RDKit molecule objects
-            PandasTools.AddMoleculeColumnToFrame(df, smilesCol='X1', molCol='Compound',
-                                                 includeFingerprints=False)
             PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Scaffold SMILES', molCol='Scaffold',
-                                                 includeFingerprints=False)
         DF_FOR_REPORT = df.copy()
         # pie_chart = None
@@ -665,6 +723,43 @@ def smiles_from_sdf(sdf_path):
         return Chem.MolToSmiles(suppl[0])
 theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
     background_fill_primary='#dfe6f0',
     background_fill_secondary='#dfe6f0',
@@ -697,10 +792,10 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
     with gr.Tabs() as tabs:
         with gr.TabItem(label='Drug hit screening', id=0):
             gr.Markdown('''
-                    # <center>DeepSEQreen Drug Hit Screening</center>
-                    <center>
-                    To predict interactions/binding affinities of a single target against a library of drugs.
-                    </center>
                     ''')
             with gr.Blocks() as screen_block:
                 with gr.Column() as screen_page:
@@ -751,13 +846,18 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
                                                          visible=False)
                         target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
-                    target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5)
                     example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
                     with gr.Row():
                         with gr.Column():
                             drug_library = gr.Dropdown(label='Select a Compound Library',
                                                        choices=list(DRUG_LIBRARY_MAP.keys()))
                             drug_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             drug_library_upload = gr.File(label='Custom drug library file', visible=False)
@@ -771,7 +871,6 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
                             drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Select a Preset Model')
                             screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
                     # drug_screen_email = gr.Textbox(
                     #     label='Email (optional)',
                     #     info="Your email will be used to send you notifications when your job finishes."
@@ -791,11 +890,24 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
         with gr.TabItem(label='Target protein identification', id=1):
             gr.Markdown('''
-                # <center>DeepSEQreen Target Protein Identification</center>
-                <center>
-                To predict interactions/binding affinities of a single drug against a library of targets.
-                </center>
                 ''')
             with gr.Blocks() as identify_block:
                 with gr.Column() as identify_page:
@@ -816,9 +928,8 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
                                 interactive=True)
                             compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
-                        with gr.Column():
-                            target_identify_target_family = gr.Dropdown(choices=['General'], value='General',
-                                                                        label='Target Protein Family')
                     compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
                     example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
@@ -827,6 +938,11 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
                         with gr.Column():
                             target_library = gr.Dropdown(label='Select a Target Library',
                                                          choices=list(TARGET_LIBRARY_MAP.keys()))
                             target_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             target_library_upload = gr.File(label='Custom target library file', visible=False)
@@ -841,7 +957,6 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
                             target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
                             identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
                     # with gr.Row():
                     #     target_identify_email = gr.Textbox(
                     #         label='Email (optional)',
@@ -861,41 +976,51 @@ with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
             gr.Markdown('''
 # <center>DeepSEQreen Interaction Pair Inference</center>
 <center>To predict interactions/binding affinities between any drug-target pairs.</center>
-''')
-            with gr.Blocks() as infer_block:
-                with gr.Column() as infer_page:
-                    with gr.Column() as custom_upload:
-                        gr.Markdown("""
-Please upload a custom dataset CSV file with 2 required string columns and optionally 2 ID columns:
 <b>X1</b>: the SMILES string of a compound\n
 <b>X2</b>: the FASTA sequence of a target\n
-<b>ID1</b>: the ID (PubChem or any arbitrary unique identifier) of a compound\n
-<b>ID22</b>: the ID (UniProt or any arbitrary unique identifier) of a target
-Example:
 | X1                                      | X2            | ID1          | ID2    |
 |---------------------------------------- |---------------|--------------|--------|
 | CCOC(=O)Nc1ccc(NCc2ccc(F)cc2)cc1N       | MVQKSRNGGV... | CHEMBL41355  | O88943 |
 | CCCCCc1cc(O)c(C/C=C(\C)CCC=C(C)C)c(O)c1 | MTSPSSSPVF... | CHEMBL497318 | Q9Y5S1 |
-                        """)
                         gr.File(label="Example custom dataset",
                                 value="data/examples/interaction_pair_inference.csv",
                                 interactive=False)
                         with gr.Column():
                             infer_data_for_predict = gr.File(
-                                label='Custom dataset file', file_count="single", type='filepath', visible=True)
                     with gr.Column() as pair_generate:
-                        gr.Markdown("""
-Upload a SDF file which contains multiple compounds of interest and a FASTA file which contains multiple targets of
-interest. All combinations of drug-target pairs from these two files will be automatically generated and submitted to
-a prediction job.
-                        """)
-                        pair_sdf = gr.File(label='SDF file containing multiple compounds')
-                        pair_fasta = gr.File(label='FASTA file containing multiple targets')
                     with gr.Row(visible=True):
                         pair_infer_task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
@@ -925,7 +1050,9 @@ a prediction job.
                 # <center>DeepSEQreen Chemical Property Report</center>
                 <center>
                 To compute chemical properties for the predictions of drug hit screening,
-                target protein identification, and interaction pair inference. You may also upload
                 your own dataset. The page shows only a preview report displaying at most 30 records
                 (with top predicted DTI/DTA if reporting results from a prediction job). For a full report, please
                 generate and download a raw data CSV or interactive table HTML file below.
@@ -1002,6 +1129,7 @@ a prediction job.
             case 'UniProt ID':
                 query = f"{uid.strip()}.fasta"
             case 'Gene symbol':
                 query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
         try:
@@ -1031,7 +1159,8 @@ a prediction job.
             desc="Detecting protein family of the target...").apply(align_score)
         row = alignment_df.loc[alignment_df['score'].idxmax()]
         return gr.Dropdown(value=row['protein_family'].capitalize(),
-                           info=f"Reason: Best BLASTP score ({row['score']}) with {row['ID2']} from family {row['protein_family']}")
     target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)
@@ -1132,6 +1261,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
         x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
     ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
     def identify_recommend_model(smiles, task):
         if task == 'Drug-target interaction':
             train = pd.read_csv('data/benchmarks/all_families_reduced_dti_train.csv')
@@ -1161,6 +1291,31 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
                                         outputs=target_identify_preset)
     def drug_screen_validate(fasta, library, library_upload, state, progress=gr.Progress(track_tqdm=True)):
         if not state:
             try:
@@ -1171,14 +1326,10 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
                 if library in DRUG_LIBRARY_MAP.keys():
                     screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
                 else:
-                    if library_upload.endswith('.csv'):
-                        screen_df = pd.read_csv(library_upload)
-                    elif library_upload.endswith('.sdf'):
-                        screen_df = PandasTools.LoadSDF(library_upload,
-                        smilesName='X1', molColName='Compound', includeFingerprints=True)
-                    else:
-                        raise gr.Error('Currently only CSV and SDF files are supported.')
-                    validate_columns(screen_df, ['X1'])
                 screen_df['X2'] = fasta
@@ -1214,17 +1365,10 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
                 if library in TARGET_LIBRARY_MAP.keys():
                     identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
                 else:
-                    if library_upload.endswith('.csv'):
-                        identify_df = pd.read_csv(library_upload)
-                    elif library_upload.endswith('.fasta'):
-                        records = list(SeqIO.parse(library_upload, "fasta"))
-                        id2 = [record.id for record in records]
-                        seq = [str(record.seq) for record in records]
-                        identify_df = pd.DataFrame({'ID2': id2, 'X2': seq})
-                    else:
-                        raise 'Currently only csv and fasta files are supported.'
-                    validate_columns(identify_df, ['X2'])
                 identify_df['X1'] = smiles
                 job_id = uuid4()
@@ -1250,25 +1394,57 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
             # return {identify_flag: False}
-    def pair_infer_validate(drug_target_pair_upload, state, progress=gr.Progress(track_tqdm=True)):
         if not state:
             try:
-                df = pd.read_csv(drug_target_pair_upload)
-                validate_columns(df, ['X1', 'X2'])
-                df['X1_ERR'] = df['X1'].swifter.progress_bar(desc="Validating SMILES...").apply(
-                    validate_seq_str, regex=SMILES_PAT)
-                if not df['X1_ERR'].isna().all():
-                    raise ValueError(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
-                df['X2_ERR'] = df['X2'].swifter.progress_bar(desc="Validating FASTA...").apply(
-                    validate_seq_str, regex=FASTA_PAT)
-                if not df['X2_ERR'].isna().all():
-                    raise ValueError(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
-                job_id = uuid4()
-                return {infer_flag: job_id,
-                        run_state: job_id}
             except Exception as e:
                 gr.Warning(f'Failed to submit the job due to error: {str(e)}')
                 return {infer_flag: False,
@@ -1319,8 +1495,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
     pair_infer_btn.click(
         fn=pair_infer_validate,
-        inputs=[infer_data_for_predict, run_state],  # , drug_screen_email],
-        outputs=[infer_flag, run_state]
     ).then(
         fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
         outputs=[infer_page, infer_waiting]

 import hashlib
+import itertools
 import json
 import textwrap
 import threading
 import pandas as pd
 import plotly.express as px
 import requests
+from rdkit.Chem.rdMolDescriptors import CalcNumRotatableBonds, CalcNumHeavyAtoms, CalcNumAtoms, CalcTPSA
 from requests.adapters import HTTPAdapter, Retry
 from rdkit import Chem
 from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools, AllChem
 UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
+CUSTOM_DATASET_MAX_LEN = 10_000
 CSS = """
 .help-tip {
   position: absolute;
   display: inline-block;
+  top: 16px;
   right: 0px;
   text-align: center;
   border-radius: 40%;
     return CalcNumRotatableBonds((row['Compound']))
+def tpsa(row):
+    """Return the topological polar surface area of the molecule in ``row['Compound']``."""
+    compound = row['Compound']
+    return CalcTPSA(compound)
 def lipinski(row):
     """
     Lipinski's rules:
         return True
+def veber(row):
+    """
+    Veber rule-of-thumb filter for orally active drugs
+    (Veber et al., J Med Chem. 2002; 45(12): 2615-23).
+
+    A row passes only when both conditions hold:
+    Rotatable bonds <= 10
+    Topological polar surface area <= 140
+    """
+    # Checked in the same order as the rules above; the TPSA check is
+    # skipped once the rotatable-bond check has already failed.
+    return bool(rotatable_bond(row) <= 10 and tpsa(row) <= 140)
+def rule_of_three(row):
+    """
+    Rule of Three filter (Congreve et al., Drug Discov. Today. 8 (19): 876–7, (2003)).
+
+    A row passes only when every descriptor is within its limit:
+    Molecular weight <= 300
+    LogP <= 3
+    H-bond donor count <= 3
+    H-bond acceptor count <= 3
+    Rotatable bond count <= 3
+    """
+    limits = (
+        (mw, 300),
+        (logp, 3),
+        (hbd, 3),
+        (hba, 3),
+        (rotatable_bond, 3),
+    )
+    # all() stops at the first descriptor that exceeds its limit.
+    return all(descriptor(row) <= limit for descriptor, limit in limits)
+# def smarts_filter():
+#     alerts = Chem.MolFromSmarts("enter one smart here")
+#     detected_alerts = []
+#     for smiles in data['X1']:
+#         mol = Chem.MolFromSmiles(smiles)
+#         detected_alerts.append(mol.HasSubstructMatch(alerts))
+# Display label -> descriptor function. The functions visible in this file
+# (e.g. tpsa, rotatable_bond) take a row and read row['Compound'];
+# presumably applied row-wise to the report DataFrame -- confirm at the call site.
 SCORE_MAP = {
     'SAscore': sa_score,
     'LogP': logp,
     'Molecular weight': mw,
+    'Number of heavy atoms': heavy_atom,
     'Molar refractivity': mr,
     'H-bond donor count': hbd,
     'H-Bond acceptor count': hba,
     'Rotatable bond count': rotatable_bond,
+    'Topological polar surface area': tpsa,
 }
+# Display label -> rule-based filter function. The filters visible in this
+# file (veber, rule_of_three) take a row and return True when it passes.
 FILTER_MAP = {
+    # TODO support number_of_violations
     'REOS': reos,
+    "Lipinski's Rule of Five": lipinski,
     'Ghose': ghose,
+    'Rule of Three': rule_of_three,
+    'Veber': veber,
+    'PAINS': pains,
 }
 TASK_MAP = {
 def process_target_fasta(sequence):
+    """
+    Extract the residue string of the first record from FASTA-formatted text.
+
+    Input without a leading ">" header line is treated as a bare sequence and
+    returned with all whitespace removed.
+
+    Raises ValueError if the text has a header but no record can be parsed.
+    """
+    from io import StringIO
+
+    text = sequence.strip()
+    if not text.startswith('>'):
+        return ''.join(text.split())
+    # SeqIO.parse returns an iterator, so it cannot be indexed with [0];
+    # pull the first record with next() instead.
+    record = next(SeqIO.parse(StringIO(text), "fasta"), None)
+    if record is None:
+        raise ValueError('No FASTA record found in the target sequence.')
+    return str(record.seq)
 def send_email(receiver, msg):
             df['Scaffold SMILES'] = df['X1'].swifter.progress_bar(
                 desc=f"Calculating scaffold...").apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
             # Add a new column with RDKit molecule objects
+            if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
+                PandasTools.AddMoleculeColumnToFrame(df, smilesCol='X1', molCol='Compound',
+                                                     includeFingerprints=True)
             PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Scaffold SMILES', molCol='Scaffold',
+                                                 includeFingerprints=True)
         DF_FOR_REPORT = df.copy()
         # pie_chart = None
         return Chem.MolToSmiles(suppl[0])
+def drug_library_from_sdf(sdf_path):
+    """
+    Read the SDF file at ``sdf_path`` with ``PandasTools.LoadSDF``.
+
+    SMILES strings are requested in column 'X1' and molecule objects in column
+    'Compound', with fingerprints included.
+    """
+    load_options = {'smilesName': 'X1', 'molColName': 'Compound', 'includeFingerprints': True}
+    return PandasTools.LoadSDF(sdf_path, **load_options)
+def process_target_library_upload(library_upload):
+    """
+    Load and validate a user-uploaded target library.
+
+    library_upload: path of the uploaded file; its extension (matched
+        case-insensitively) selects the parser, '.csv' or '.fasta'.
+    Returns the library as a DataFrame that has passed
+    ``validate_columns(..., ['X2'])``.
+    Raises gr.Error for any other file extension.
+    """
+    filename = library_upload.lower()
+    if filename.endswith('.csv'):
+        identify_df = pd.read_csv(library_upload)
+    elif filename.endswith('.fasta'):
+        identify_df = target_library_from_fasta(library_upload)
+    else:
+        raise gr.Error('Currently only CSV and FASTA files are supported as target libraries.')
+    validate_columns(identify_df, ['X2'])
+    # Hand back the parsed table rather than the file path: the validation
+    # code that calls this takes len() of the result and assigns columns to it.
+    return identify_df
+def process_drug_library_upload(library_upload):
+    """
+    Load and validate a user-uploaded drug library.
+
+    library_upload: path of the uploaded file; its extension (matched
+        case-insensitively) selects the parser, '.csv' or '.sdf'.
+    Returns the library as a DataFrame that has passed
+    ``validate_columns(..., ['X1'])``.
+    Raises gr.Error for any other file extension.
+    """
+    filename = library_upload.lower()
+    if filename.endswith('.csv'):
+        screen_df = pd.read_csv(library_upload)
+    elif filename.endswith('.sdf'):
+        screen_df = drug_library_from_sdf(library_upload)
+    else:
+        raise gr.Error('Currently only CSV and SDF files are supported as drug libraries.')
+    validate_columns(screen_df, ['X1'])
+    # Hand back the parsed table rather than the file path: the validation
+    # code that calls this takes len() of the result and assigns columns to it.
+    return screen_df
+def target_library_from_fasta(fasta_path):
+    """
+    Parse the FASTA file at ``fasta_path`` into a two-column DataFrame.
+
+    Each record contributes one row: its record id in 'ID2' and its sequence,
+    converted to a plain string, in 'X2'.
+    """
+    rows = [(entry.id, str(entry.seq)) for entry in SeqIO.parse(fasta_path, "fasta")]
+    return pd.DataFrame(rows, columns=['ID2', 'X2'])
 theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
     background_fill_primary='#dfe6f0',
     background_fill_secondary='#dfe6f0',
     with gr.Tabs() as tabs:
         with gr.TabItem(label='Drug hit screening', id=0):
             gr.Markdown('''
+# <center>DeepSEQreen Drug Hit Screening</center>
+<center>
+To predict interactions/binding affinities of a single target against a library of drugs.
+</center>
                     ''')
             with gr.Blocks() as screen_block:
                 with gr.Column() as screen_page:
                                                          visible=False)
                         target_family_detect_btn = gr.Button(value='Auto-detect', variant='primary')
+                    target_fasta = gr.Code(label='Input or Display FASTA', interactive=True, lines=5, max_lines=5)
                     example_fasta = gr.Button(value='Example: Human MAPK14', elem_id='example')
                     with gr.Row():
                         with gr.Column():
                             drug_library = gr.Dropdown(label='Select a Compound Library',
                                                        choices=list(DRUG_LIBRARY_MAP.keys()))
+                            with gr.Row():
+                                gr.File(label='Example SDF drug library',
+                                        value='data/examples/drug_library.sdf', interactive=False)
+                                gr.File(label='Example CSV drug library',
+                                        value='data/examples/drug_library.csv', interactive=False)
                             drug_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             drug_library_upload = gr.File(label='Custom drug library file', visible=False)
                             drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Select a Preset Model')
                             screen_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
                     # drug_screen_email = gr.Textbox(
                     #     label='Email (optional)',
                     #     info="Your email will be used to send you notifications when your job finishes."
         with gr.TabItem(label='Target protein identification', id=1):
             gr.Markdown('''
+# <center>DeepSEQreen Target Protein Identification</center>
+<center>
+To predict interactions/binding affinities of a single drug against a library of protein targets.
+</center>
+ℹ️ A custom target library can be a FASTA file with one or more amino acid sequences,
+or a CSV file that has a required FASTA string column and optionally an ID column:
+<b>X2</b>: the FASTA sequence of a target\n
+<b>ID2</b> (optional): the ID (UniProt or any arbitrary unique identifier) of a target\n
+Example CSV target library:
+| X2            | ID2    |
+|---------------|--------|
+| MVQKSRNGGV... | O88943 |
+| MTSPSSSPVF... | Q9Y5S1 |
                 ''')
             with gr.Blocks() as identify_block:
                 with gr.Column() as identify_page:
                                 interactive=True)
                             compound_upload_btn = gr.UploadButton(label='Upload', variant='primary', type='binary')
+                        target_identify_target_family = gr.Dropdown(choices=['General'], value='General',
+                                                                    label='Target Protein Family')
                     compound_smiles = gr.Code(label='Input or Display Compound SMILES', interactive=True, lines=5)
                     example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
                         with gr.Column():
                             target_library = gr.Dropdown(label='Select a Target Library',
                                                          choices=list(TARGET_LIBRARY_MAP.keys()))
+                            with gr.Row():
+                                gr.File(label='Example FASTA target library',
+                                        value='data/examples/target_library.fasta', interactive=False)
+                                gr.File(label='Example CSV target library',
+                                        value='data/examples/target_library.csv', interactive=False)
                             target_library_upload_btn = gr.UploadButton(
                                 label='Upload a custom library', variant='primary')
                             target_library_upload = gr.File(label='Custom target library file', visible=False)
                             target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
                             identify_preset_recommend_btn = gr.Button(value='Recommend a model', variant='primary')
                     # with gr.Row():
                     #     target_identify_email = gr.Textbox(
                     #         label='Email (optional)',
             gr.Markdown('''
 # <center>DeepSEQreen Interaction Pair Inference</center>
 <center>To predict interactions/binding affinities between any drug-target pairs.</center>
+ℹ️ A custom interaction pair dataset can be generated from a FASTA file containing multiple sequences
+and an SDF file containing multiple compounds (for predicting DTI/DTA of all possible combinations of
+drug-target pairs), or a CSV file with 2 required string columns and optionally 2 ID columns:
 <b>X1</b>: the SMILES string of a compound\n
 <b>X2</b>: the FASTA sequence of a target\n
+<b>ID1</b> (optional): the ID (PubChem or any arbitrary unique identifier) of a compound\n
+<b>ID2</b> (optional): the ID (UniProt or any arbitrary unique identifier) of a target
+Example CSV interaction pair dataset:
 | X1                                      | X2            | ID1          | ID2    |
 |---------------------------------------- |---------------|--------------|--------|
 | CCOC(=O)Nc1ccc(NCc2ccc(F)cc2)cc1N       | MVQKSRNGGV... | CHEMBL41355  | O88943 |
 | CCCCCc1cc(O)c(C/C=C(\C)CCC=C(C)C)c(O)c1 | MTSPSSSPVF... | CHEMBL497318 | Q9Y5S1 |
+''')
+            with gr.Blocks() as infer_block:
+                with gr.Column() as infer_page:
+                    infer_type = gr.Dropdown(choices=['Upload a drug library and a target library',
+                                                      'Upload a CSV interaction pair dataset'],
+                                             value='Upload a drug library and a target library')
+                    with gr.Column() as pair_upload:
                         gr.File(label="Example custom dataset",
                                 value="data/examples/interaction_pair_inference.csv",
                                 interactive=False)
                         with gr.Column():
                             infer_data_for_predict = gr.File(
+                                label='Upload a custom dataset', file_count="single", type='filepath', visible=True)
                     with gr.Column() as pair_generate:
+                        with gr.Row():
+                            gr.File(label='Example SDF drug library',
+                                    value='data/examples/drug_library.sdf', interactive=False)
+                            gr.File(label='Example FASTA target library',
+                                    value='data/examples/target_library.fasta', interactive=False)
+                        with gr.Row():
+                            gr.File(label='Example CSV drug library',
+                                    value='data/examples/drug_library.csv', interactive=False)
+                            gr.File(label='Example CSV target library',
+                                    value='data/examples/target_library.csv', interactive=False)
+                        with gr.Row():
+                            infer_drug = gr.File(label='SDF/CSV file containing multiple compounds',
+                                                 file_count="single", type='filepath')
+                            infer_target = gr.File(label='FASTA/CSV file containing multiple targets',
+                                                   file_count="single", type='filepath')
                     with gr.Row(visible=True):
                         pair_infer_task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
                 # <center>DeepSEQreen Chemical Property Report</center>
                 <center>
                 To compute chemical properties for the predictions of drug hit screening,
+                target protein identification, and interaction pair inference.
+                You may also upload
                 your own dataset. The page shows only a preview report displaying at most 30 records
                 (with top predicted DTI/DTA if reporting results from a prediction job). For a full report, please
                 generate and download a raw data CSV or interactive table HTML file below.
             case 'UniProt ID':
                 query = f"{uid.strip()}.fasta"
             case 'Gene symbol':
+                organism = organism if organism else 'Human'
                 query = f'search?query=organism_name:{organism.strip()}+AND+gene:{gene.strip()}&format=fasta'
         try:
             desc="Detecting protein family of the target...").apply(align_score)
         row = alignment_df.loc[alignment_df['score'].idxmax()]
         return gr.Dropdown(value=row['protein_family'].capitalize(),
+                           info=f"Reason: Best BLASTP score ({row['score']}) "
+                                f"with {row['ID2']} from family {row['protein_family']}")
     target_family_detect_btn.click(fn=target_family_detect, inputs=target_fasta, outputs=drug_screen_target_family)
         x.name, gr.Dropdown(value=Path(x.name).name, choices=list(TARGET_LIBRARY_MAP.keys()) + [Path(x.name).name])
     ], inputs=target_library_upload_btn, outputs=[target_library_upload, target_library])
     def identify_recommend_model(smiles, task):
         if task == 'Drug-target interaction':
             train = pd.read_csv('data/benchmarks/all_families_reduced_dti_train.csv')
                                         outputs=target_identify_preset)
+    def infer_type_change(upload_type):
+        """
+        Switch between the two pair-inference input layouts and reset their file inputs.
+
+        Returns a component-update dict for the selected mode, or None when
+        ``upload_type`` is not one of the two known choices.
+        """
+        # (pair_upload visible, pair_generate visible) for each dropdown choice.
+        layout = {
+            "Upload a drug library and a target library": (False, True),
+            "Upload a CSV interaction pair dataset": (True, False),
+        }.get(upload_type)
+        if layout is None:
+            return None
+        show_upload, show_generate = layout
+        return {
+            pair_upload: gr.Column(visible=show_upload),
+            pair_generate: gr.Column(visible=show_generate),
+            # All three file inputs are cleared whenever the mode changes.
+            infer_data_for_predict: None,
+            infer_drug: None,
+            infer_target: None
+        }
+    infer_type.select(fn=infer_type_change, inputs=infer_type,
+                      outputs=[pair_upload, pair_generate, infer_data_for_predict, infer_drug, infer_target])
     def drug_screen_validate(fasta, library, library_upload, state, progress=gr.Progress(track_tqdm=True)):
         if not state:
             try:
                 if library in DRUG_LIBRARY_MAP.keys():
                     screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
                 else:
+                    screen_df = process_drug_library_upload(library_upload)
+                    if len(screen_df) >= CUSTOM_DATASET_MAX_LEN:
+                        raise gr.Error(f'The uploaded drug library has more records '
+                                       f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
                 screen_df['X2'] = fasta
                 if library in TARGET_LIBRARY_MAP.keys():
                     identify_df = pd.read_csv(Path('data/target_libraries', TARGET_LIBRARY_MAP[library]))
                 else:
+                    identify_df = process_target_library_upload(library_upload)
+                    if len(identify_df) >= CUSTOM_DATASET_MAX_LEN:
+                        raise gr.Error(f'The uploaded target library has more records '
+                                       f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
                 identify_df['X1'] = smiles
                 job_id = uuid4()
             # return {identify_flag: False}
+    def pair_infer_validate(drug_target_pair_upload, drug_upload, target_upload, state,
+                            progress=gr.Progress(track_tqdm=True)):
         if not state:
             try:
+                job_id = uuid4()
+                if drug_target_pair_upload:
+                    infer_df = pd.read_csv(drug_target_pair_upload)
+                    validate_columns(infer_df, ['X1', 'X2'])
+                    infer_df['X1_ERR'] = infer_df['X1'].swifter.progress_bar(desc="Validating SMILES...").apply(
+                        validate_seq_str, regex=SMILES_PAT)
+                    if not infer_df['X1_ERR'].isna().all():
+                        raise ValueError(
+                            f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+                    infer_df['X2_ERR'] = infer_df['X2'].swifter.progress_bar(desc="Validating FASTA...").apply(
+                        validate_seq_str, regex=FASTA_PAT)
+                    if not infer_df['X2_ERR'].isna().all():
+                        raise ValueError(
+                            f"Encountered invalid FASTA:\n{infer_df[~infer_df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
+                    return {infer_data_for_predict: str(drug_target_pair_upload),
+                            infer_flag: job_id,
+                            run_state: job_id}
+                elif drug_upload and target_upload:
+                    drug_df = process_drug_library_upload(drug_upload)
+                    target_df = process_target_library_upload(target_upload)
+                    drug_df.drop_duplicates(subset=['X1'], inplace=True)
+                    target_df.drop_duplicates(subset=['X2'], inplace=True)
+                    infer_df = pd.DataFrame(list(itertools.product(drug_df['X1'], target_df['X2'])),
+                                            columns=['X1', 'X2'])
+                    infer_df = infer_df.merge(drug_df, on='X1').merge(target_df, on='X2')
+                    temp_file = Path(f'temp/{job_id}_input.csv').resolve()
+                    infer_df.to_csv(temp_file, index=False)
+                    if temp_file.is_file():
+                        return {infer_data_for_predict: str(temp_file),
+                                infer_flag: job_id,
+                                run_state: job_id}
+                else:
+                    raise gr.Error('Should upload a drug-target pair dataset,or '
+                                   'upload both a drug library and a target library.')
+                if len(infer_df) >= CUSTOM_DATASET_MAX_LEN:
+                    raise gr.Error(f'The uploaded/generated drug-target pair dataset has more records '
+                                   f'than the allowed maximum (CUSTOM_DATASET_MAX_LEN).')
             except Exception as e:
                 gr.Warning(f'Failed to submit the job due to error: {str(e)}')
                 return {infer_flag: False,
     pair_infer_btn.click(
         fn=pair_infer_validate,
+        inputs=[infer_data_for_predict, infer_drug, infer_target, run_state],  # , drug_screen_email],
+        outputs=[infer_data_for_predict, infer_flag, run_state]
     ).then(
         fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
         outputs=[infer_page, infer_waiting]