Spaces:

HUBioDataLab
/

DrugGEN

Running

App Files Files Community

mgyigit commited on Mar 29

Commit

969a6ef

verified ·

1 Parent(s): fa6133d

Update app.py

Browse files

Files changed (1) hide show

app.py +234 -185

app.py CHANGED Viewed

@@ -13,50 +13,50 @@ import time
 class DrugGENConfig:
     # Inference configuration
-    submodel='DrugGEN'
-    inference_model="/home/user/app/experiments/models/DrugGEN/"
-    sample_num=100
     # Data configuration
-    inf_smiles='/home/user/app/data/chembl_test.smi'
-    train_smiles='/home/user/app/data/chembl_train.smi'
-    inf_batch_size=1
-    mol_data_dir='/home/user/app/data'
-    features=False
     # Model configuration
-    act='relu'
-    max_atom=45
-    dim=128
-    depth=1
-    heads=8
-    mlp_ratio=3
-    dropout=0.
     # Seed configuration
-    set_seed=True
-    seed=10
-    disable_correction=False
 class DrugGENAKT1Config(DrugGENConfig):
-    submodel='DrugGEN'
-    inference_model="/home/user/app/experiments/models/DrugGEN-akt1/"
-    train_drug_smiles='/home/user/app/data/akt_train.smi'
-    max_atom=45
 class DrugGENCDK2Config(DrugGENConfig):
-    submodel='DrugGEN'
-    inference_model="/home/user/app/experiments/models/DrugGEN-cdk2/"
-    train_drug_smiles='/home/user/app//data/cdk2_train.smi'
-    max_atom=38
 class NoTargetConfig(DrugGENConfig):
-    submodel="NoTarget"
-    inference_model="/home/user/app/experiments/models/NoTarget/"
 model_configs = {
@@ -66,60 +66,60 @@ model_configs = {
 }
-def function(model_name: str, num_molecules: int, seed_num: int):
-    '''
-    Returns:
-    image, metrics_df, file_path, basic_metrics, advanced_metrics
-    '''
-    if model_name == "DrugGEN-NoTarget":
-        model_name = "NoTarget"
     config = model_configs[model_name]
-    config.sample_num = num_molecules
-    if config.sample_num > 250:
-        raise gr.Error("You have requested to generate more than the allowed limit of 250 molecules. Please reduce your request to 250 or fewer.")
-    if seed_num is None or seed_num.strip() == "":
         config.seed = random.randint(0, 10000)
     else:
-        try:
-            config.seed = int(seed_num)
-        except ValueError:
-            raise gr.Error("The seed must be an integer value!")
-    if model_name != "NoTarget":
-        model_name = "DrugGEN"
     inferer = Inference(config)
     start_time = time.time()
     scores = inferer.inference()  # This returns a DataFrame with specific columns
     et = time.time() - start_time
-    score_df = pd.DataFrame({
-        "Runtime (seconds)": [et],
-        "Validity": [scores["validity"].iloc[0]],
-        "Uniqueness": [scores["uniqueness"].iloc[0]],
-        "Novelty (Train)": [scores["novelty"].iloc[0]],
-        "Novelty (Test)": [scores["novelty_test"].iloc[0]],
-        "Drug Novelty": [scores["drug_novelty"].iloc[0]],
-        "Max Length": [scores["max_len"].iloc[0]],
-        "Mean Atom Type": [scores["mean_atom_type"].iloc[0]],
-        "SNN ChEMBL": [scores["snn_chembl"].iloc[0]],
-        "SNN Drug": [scores["snn_drug"].iloc[0]],
-        "Internal Diversity": [scores["IntDiv"].iloc[0]],
-        "QED": [scores["qed"].iloc[0]],
-        "SA Score": [scores["sa"].iloc[0]]
-    })
     # Create basic metrics dataframe
     basic_metrics = pd.DataFrame({
         "Validity": [scores["validity"].iloc[0]],
         "Uniqueness": [scores["uniqueness"].iloc[0]],
         "Novelty (Train)": [scores["novelty"].iloc[0]],
-        "Novelty (Test)": [scores["novelty_test"].iloc[0]],
-        "Drug Novelty": [scores["drug_novelty"].iloc[0]],
         "Runtime (s)": [round(et, 2)]
     })
@@ -129,13 +129,13 @@ def function(model_name: str, num_molecules: int, seed_num: int):
         "SA Score": [scores["sa"].iloc[0]],
         "Internal Diversity": [scores["IntDiv"].iloc[0]],
         "SNN ChEMBL": [scores["snn_chembl"].iloc[0]],
-        "SNN Drug": [scores["snn_drug"].iloc[0]],
-        "Max Length": [scores["max_len"].iloc[0]]
     })
-    output_file_path = f'/home/user/app/experiments/inference/{model_name}/inference_drugs.txt'
-    new_path = f'{model_name}_denovo_mols.smi'
     os.rename(output_file_path, new_path)
     with open(new_path) as f:
@@ -143,13 +143,14 @@ def function(model_name: str, num_molecules: int, seed_num: int):
     generated_molecule_list = inference_drugs.split("\n")[:-1]
     rng = random.Random(config.seed)
-    if num_molecules > 12:
-        selected_molecules = rng.choices(generated_molecule_list, k=12)
     else:
-        selected_molecules = generated_molecule_list
-    selected_molecules = [Chem.MolFromSmiles(mol) for mol in selected_molecules if Chem.MolFromSmiles(mol) is not None]
     drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
     drawOptions.prepareMolsBeforeDrawing = False
@@ -160,7 +161,6 @@ def function(model_name: str, num_molecules: int, seed_num: int):
         molsPerRow=3,
         subImgSize=(400, 400),
         maxMols=len(selected_molecules),
-        # legends=None,
         returnPNG=False,
         drawOptions=drawOptions,
         highlightAtomLists=None,
@@ -170,7 +170,6 @@ def function(model_name: str, num_molecules: int, seed_num: int):
     return molecule_image, new_path, basic_metrics, advanced_metrics
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     # Add custom CSS for styling
     gr.HTML("""
@@ -186,44 +185,40 @@ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     </style>
     """)
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("# DrugGEN: Target Centric De Novo Design of Drug Candidate Molecules with Graph Generative Deep Adversarial Networks")
-            gr.HTML("""
-            <div style="display: flex; gap: 10px; margin-bottom: 15px;">
-                <!-- arXiv badge -->
-                <a href="https://arxiv.org/abs/2302.07868" target="_blank" style="text-decoration: none;">
-                    <div style="
-                        display: inline-block;
-                        background-color: #b31b1b;
-                        color: #ffffff !important;  /* Force white text */
-                        padding: 5px 10px;
-                        border-radius: 5px;
-                        font-size: 14px;"
-                    >
-                        <span style="font-weight: bold;">arXiv</span> 2302.07868
-                    </div>
-                </a>
-                <!-- GitHub badge -->
-                <a href="https://github.com/HUBioDataLab/DrugGEN" target="_blank" style="text-decoration: none;">
-                    <div style="
-                        display: inline-block;
-                        background-color: #24292e;
-                        color: #ffffff !important;  /* Force white text */
-                        padding: 5px 10px;
-                        border-radius: 5px;
-                        font-size: 14px;"
-                    >
-                        <span style="font-weight: bold;">GitHub</span> Repository
-                    </div>
-                </a>
             </div>
-            """)
-            with gr.Accordion("About DrugGEN Models", open=False):
-                gr.Markdown("""
 ## Model Variations
 ### DrugGEN-AKT1
@@ -233,104 +228,158 @@ This model is designed to generate molecules targeting the human AKT1 protein (U
 This model is designed to generate molecules targeting the human CDK2 protein (UniProt ID: P24941).
 ### DrugGEN-NoTarget
-This is a general-purpose model that generates diverse drug-like molecules without targeting a specific protein. It's useful for:
-- Exploring chemical space
-- Generating diverse scaffolds
-- Creating molecules with drug-like properties
 For more details, see our [paper on arXiv](https://arxiv.org/abs/2302.07868).
-                """)
-            with gr.Accordion("Understanding the Metrics", open=False):
-                gr.Markdown("""
 ## Evaluation Metrics
 ### Basic Metrics
 - **Validity**: Percentage of generated molecules that are chemically valid
 - **Uniqueness**: Percentage of unique molecules among valid ones
-- **Runtime**: Time taken to generate the requested molecules
 ### Novelty Metrics
 - **Novelty (Train)**: Percentage of molecules not found in the training set
-- **Novelty (Test)**: Percentage of molecules not found in the test set
-- **Drug Novelty**: Percentage of molecules not found in known inhibitors of the target protein
 ### Structural Metrics
-- **Max Length**: Maximum component length in the generated molecules
 - **Mean Atom Type**: Average distribution of atom types
 - **Internal Diversity**: Diversity within the generated set (higher is more diverse)
 ### Drug-likeness Metrics
 - **QED (Quantitative Estimate of Drug-likeness)**: Score from 0-1 measuring how drug-like a molecule is (higher is better)
-- **SA Score (Synthetic Accessibility)**: Score from 1-10 indicating ease of synthesis (lower is easier)
 ### Similarity Metrics
 - **SNN ChEMBL**: Similarity to ChEMBL molecules (higher means more similar to known drug-like compounds)
-- **SNN Drug**: Similarity to known drugs (higher means more similar to approved drugs)
-                """)
-            model_name = gr.Radio(
-                choices=("DrugGEN-AKT1", "DrugGEN-CDK2", "DrugGEN-NoTarget"),
-                value="DrugGEN-AKT1",
-                label="Select Target Model",
-                info="Choose which protein target or general model to use for molecule generation"
-            )
-            num_molecules = gr.Slider(
-                minimum=10,
-                maximum=250,
-                value=100,
-                step=10,
-                label="Number of Molecules to Generate",
-                info="This space runs on a CPU, which may result in slower performance. Generating 200 molecules takes approximately 6 minutes. Therefore, We set a 250-molecule cap. On a GPU, the model can generate 10,000 molecules in the same amount of time. Please check our GitHub repo for running our models on GPU."
-            )
-            seed_num = gr.Textbox(
-                label="Random Seed (Optional)",
-                value="",
-                info="Set a specific seed for reproducible results, or leave empty for random generation"
-            )
-            submit_button = gr.Button(
-                value="Generate Molecules",
-                variant="primary",
-                size="lg"
-            )
-        with gr.Column(scale=2):
-            basic_metrics_df = gr.Dataframe(
-                headers=["Validity", "Uniqueness", "Novelty (Train)", "Novelty (Test)", "Novelty (Drug)", "Runtime (s)"],
-                elem_id="basic-metrics"
-            )
-            advanced_metrics_df = gr.Dataframe(
-                headers=["QED", "SA Score", "Internal Diversity", "SNN (ChEMBL)", "SNN (Drug)", "Max Length"],
-                elem_id="advanced-metrics"
-                )
-            file_download = gr.File(
-                label="Download All Generated Molecules (SMILES format)",
-            )
-            image_output = gr.Image(
-                label="Structures of Randomly Selected Generated Molecules",
-                elem_id="molecule_display"
-            )
     gr.Markdown("### Created by the HUBioDataLab | [GitHub](https://github.com/HUBioDataLab/DrugGEN) | [Paper](https://arxiv.org/abs/2302.07868)")
-    submit_button.click(
-        function,
-        inputs=[model_name, num_molecules, seed_num],
         outputs=[
             image_output,
             file_download,
             basic_metrics_df,
             advanced_metrics_df
-        ],
-        api_name="inference"
     )
-#demo.queue(concurrency_count=1)
 demo.queue()
 demo.launch()

 class DrugGENConfig:
     # Inference configuration
+    submodel = 'DrugGEN'
+    inference_model = "/home/user/app/experiments/models/DrugGEN/"
+    sample_num = 100
     # Data configuration
+    inf_smiles = '/home/user/app/data/chembl_test.smi'
+    train_smiles = '/home/user/app/data/chembl_train.smi'
+    inf_batch_size = 1
+    mol_data_dir = '/home/user/app/data'
+    features = False
     # Model configuration
+    act = 'relu'
+    max_atom = 45
+    dim = 128
+    depth = 1
+    heads = 8
+    mlp_ratio = 3
+    dropout = 0.
     # Seed configuration
+    set_seed = True
+    seed = 10
+    disable_correction = False
 class DrugGENAKT1Config(DrugGENConfig):
+    submodel = 'DrugGEN'
+    inference_model = "/home/user/app/experiments/models/DrugGEN-akt1/"
+    train_drug_smiles = '/home/user/app/data/akt_train.smi'
+    max_atom = 45
 class DrugGENCDK2Config(DrugGENConfig):
+    submodel = 'DrugGEN'
+    inference_model = "/home/user/app/experiments/models/DrugGEN-cdk2/"
+    train_drug_smiles = '/home/user/app/data/cdk2_train.smi'
+    max_atom = 38
 class NoTargetConfig(DrugGENConfig):
+    submodel = "NoTarget"
+    inference_model = "/home/user/app/experiments/models/NoTarget/"
 model_configs = {
 }
+def run_inference(mode: str, model_name: str, num_molecules: int, seed_num: str, custom_smiles: str):
+    """
+    Depending on the selected mode, either generate new molecules or evaluate provided SMILES.
+    Returns:
+        image, file_path, basic_metrics, advanced_metrics
+    """
     config = model_configs[model_name]
+    if mode == "Custom Input SMILES":
+        # Process the custom input SMILES
+        smiles_list = [s.strip() for s in custom_smiles.strip().splitlines() if s.strip() != ""]
+        if len(smiles_list) > 100:
+            raise gr.Error("You have provided more than the allowed limit of 100 molecules. Please provide 100 or fewer.")
+        # Write the custom SMILES to a temporary file and update config
+        temp_input_file = "custom_input.smi"
+        with open(temp_input_file, "w") as f:
+            for s in smiles_list:
+                f.write(s + "\n")
+        config.inf_smiles = temp_input_file
+        config.sample_num = len(smiles_list)
+        # Always use a random seed for custom mode
         config.seed = random.randint(0, 10000)
     else:
+        # Classical Generation mode
+        config.sample_num = num_molecules
+        if config.sample_num > 250:
+            raise gr.Error("You have requested to generate more than the allowed limit of 250 molecules. Please reduce your request to 250 or fewer.")
+        if seed_num is None or seed_num.strip() == "":
+            config.seed = random.randint(0, 10000)
+        else:
+            try:
+                config.seed = int(seed_num)
+            except ValueError:
+                raise gr.Error("The seed must be an integer value!")
+    # Adjust model name for the inference if not using NoTarget
+    if model_name != "DrugGEN-NoTarget":
+        target_model_name = "DrugGEN"
+    else:
+        target_model_name = "NoTarget"
     inferer = Inference(config)
     start_time = time.time()
     scores = inferer.inference()  # This returns a DataFrame with specific columns
     et = time.time() - start_time
     # Create basic metrics dataframe
     basic_metrics = pd.DataFrame({
         "Validity": [scores["validity"].iloc[0]],
         "Uniqueness": [scores["uniqueness"].iloc[0]],
         "Novelty (Train)": [scores["novelty"].iloc[0]],
+        "Novelty (Inference)": [scores["novelty_test"].iloc[0]],
+        "Novelty (Real Inhibitors)": [scores["drug_novelty"].iloc[0]],
         "Runtime (s)": [round(et, 2)]
     })
         "SA Score": [scores["sa"].iloc[0]],
         "Internal Diversity": [scores["IntDiv"].iloc[0]],
         "SNN ChEMBL": [scores["snn_chembl"].iloc[0]],
+        "SNN Real Inhibitors": [scores["snn_drug"].iloc[0]],
+        "Average Length": [scores["max_len"].iloc[0]]
     })
+    # Process the output file from inference
+    output_file_path = f'/home/user/app/experiments/inference/{target_model_name}/inference_drugs.txt'
+    new_path = f'{target_model_name}_denovo_mols.smi'
     os.rename(output_file_path, new_path)
     with open(new_path) as f:
     generated_molecule_list = inference_drugs.split("\n")[:-1]
+    # Randomly select up to 12 molecules for display
     rng = random.Random(config.seed)
+    if len(generated_molecule_list) > 12:
+        selected_smiles = rng.choices(generated_molecule_list, k=12)
     else:
+        selected_smiles = generated_molecule_list
+    selected_molecules = [Chem.MolFromSmiles(mol) for mol in selected_smiles if Chem.MolFromSmiles(mol) is not None]
     drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
     drawOptions.prepareMolsBeforeDrawing = False
         molsPerRow=3,
         subImgSize=(400, 400),
         maxMols=len(selected_molecules),
         returnPNG=False,
         drawOptions=drawOptions,
         highlightAtomLists=None,
     return molecule_image, new_path, basic_metrics, advanced_metrics
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     # Add custom CSS for styling
     gr.HTML("""
     </style>
     """)
+    gr.Markdown("# DrugGEN: Target Centric De Novo Design of Drug Candidate Molecules with Graph Generative Deep Adversarial Networks")
+    gr.HTML("""
+    <div style="display: flex; gap: 10px; margin-bottom: 15px;">
+        <!-- arXiv badge -->
+        <a href="https://arxiv.org/abs/2302.07868" target="_blank" style="text-decoration: none;">
+            <div style="
+                display: inline-block;
+                background-color: #b31b1b;
+                color: #ffffff !important;
+                padding: 5px 10px;
+                border-radius: 5px;
+                font-size: 14px;">
+                <span style="font-weight: bold;">arXiv</span> 2302.07868
+            </div>
+        </a>
+        <!-- GitHub badge -->
+        <a href="https://github.com/HUBioDataLab/DrugGEN" target="_blank" style="text-decoration: none;">
+            <div style="
+                display: inline-block;
+                background-color: #24292e;
+                color: #ffffff !important;
+                padding: 5px 10px;
+                border-radius: 5px;
+                font-size: 14px;">
+                <span style="font-weight: bold;">GitHub</span> Repository
             </div>
+        </a>
+    </div>
+    """)
+    with gr.Accordion("About DrugGEN Models", open=False):
+        gr.Markdown("""
 ## Model Variations
 ### DrugGEN-AKT1
 This model is designed to generate molecules targeting the human CDK2 protein (UniProt ID: P24941).
 ### DrugGEN-NoTarget
+This is a general-purpose model that generates diverse drug-like molecules without targeting a specific protein.
+- Useful for exploring chemical space, generating diverse scaffolds, and creating molecules with drug-like properties.
 For more details, see our [paper on arXiv](https://arxiv.org/abs/2302.07868).
+        """)
+    with gr.Accordion("Understanding the Metrics", open=False):
+        gr.Markdown("""
 ## Evaluation Metrics
 ### Basic Metrics
 - **Validity**: Percentage of generated molecules that are chemically valid
 - **Uniqueness**: Percentage of unique molecules among valid ones
+- **Runtime**: Time taken to generate or evaluate the molecules
 ### Novelty Metrics
 - **Novelty (Train)**: Percentage of molecules not found in the training set
+- **Novelty (Inference)**: Percentage of molecules not found in the test set
+- **Novelty (Real Inhibitors)**: Percentage of molecules not found in known inhibitors of the target protein
 ### Structural Metrics
+- **Average Length**: Average component length in the generated molecules
 - **Mean Atom Type**: Average distribution of atom types
 - **Internal Diversity**: Diversity within the generated set (higher is more diverse)
 ### Drug-likeness Metrics
 - **QED (Quantitative Estimate of Drug-likeness)**: Score from 0-1 measuring how drug-like a molecule is (higher is better)
+- **SA Score (Synthetic Accessibility)**: Score from 1-10 indicating ease of synthesis (lower is better)
 ### Similarity Metrics
 - **SNN ChEMBL**: Similarity to ChEMBL molecules (higher means more similar to known drug-like compounds)
+- **SNN Real Inhibitors**: Similarity to known drugs (higher means more similar to approved drugs)
+        """)
+    # Use Gradio Tabs to separate the two modes.
+    with gr.Tabs():
+        with gr.TabItem("Classical Generation"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    model_name = gr.Radio(
+                        choices=("DrugGEN-AKT1", "DrugGEN-CDK2", "DrugGEN-NoTarget"),
+                        value="DrugGEN-AKT1",
+                        label="Select Target Model",
+                        info="Choose which protein target or general model to use for molecule generation"
+                    )
+                    num_molecules = gr.Slider(
+                        minimum=10,
+                        maximum=250,
+                        value=100,
+                        step=10,
+                        label="Number of Molecules to Generate",
+                        info="This space runs on a CPU, which may result in slower performance. Generating 200 molecules takes approximately 6 minutes. Therefore, we set a 250-molecule cap."
+                    )
+                    seed_num = gr.Textbox(
+                        label="Random Seed (Optional)",
+                        value="",
+                        info="Set a specific seed for reproducible results, or leave empty for random generation"
+                    )
+                    classical_submit = gr.Button(
+                        value="Generate Molecules",
+                        variant="primary",
+                        size="lg"
+                    )
+                with gr.Column(scale=2):
+                    basic_metrics_df = gr.Dataframe(
+                        headers=["Validity", "Uniqueness", "Novelty (Train)", "Novelty (Inference)", "Novelty (Real Inhibitors)", "Runtime (s)"],
+                        elem_id="basic-metrics"
+                    )
+                    advanced_metrics_df = gr.Dataframe(
+                        headers=["QED", "SA Score", "Internal Diversity", "SNN (ChEMBL)", "SNN (Real Inhibitors)", "Average Length"],
+                        elem_id="advanced-metrics"
+                    )
+                    file_download = gr.File(
+                        label="Download All Generated Molecules (SMILES format)"
+                    )
+                    image_output = gr.Image(
+                        label="Structures of Randomly Selected Generated Molecules",
+                        elem_id="molecule_display"
+                    )
+        with gr.TabItem("Custom Input SMILES"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Reuse model selection for custom input
+                    model_name_custom = gr.Radio(
+                        choices=("DrugGEN-AKT1", "DrugGEN-CDK2", "DrugGEN-NoTarget"),
+                        value="DrugGEN-AKT1",
+                        label="Select Target Model",
+                        info="Choose which protein target or general model to use for evaluation"
+                    )
+                    custom_smiles = gr.Textbox(
+                        label="Input SMILES (one per line, maximum 100 molecules)",
+                        placeholder="C(C(=O)O)N\nCCO\n...",
+                        lines=10
+                    )
+                    custom_submit = gr.Button(
+                        value="Evaluate Custom SMILES",
+                        variant="primary",
+                        size="lg"
+                    )
+                with gr.Column(scale=2):
+                    basic_metrics_df_custom = gr.Dataframe(
+                        headers=["Validity", "Uniqueness", "Novelty (Train)", "Novelty (Inference)", "Novelty (Real Inhibitors)", "Runtime (s)"],
+                        elem_id="basic-metrics-custom"
+                    )
+                    advanced_metrics_df_custom = gr.Dataframe(
+                        headers=["QED", "SA Score", "Internal Diversity", "SNN (ChEMBL)", "SNN (Real Inhibitors)", "Average Length"],
+                        elem_id="advanced-metrics-custom"
+                    )
+                    file_download_custom = gr.File(
+                        label="Download All Molecules (SMILES format)"
+                    )
+                    image_output_custom = gr.Image(
+                        label="Structures of Randomly Selected Molecules",
+                        elem_id="molecule_display_custom"
+                    )
     gr.Markdown("### Created by the HUBioDataLab | [GitHub](https://github.com/HUBioDataLab/DrugGEN) | [Paper](https://arxiv.org/abs/2302.07868)")
+    # Set up the click actions for each tab.
+    classical_submit.click(
+        run_inference,
+        inputs=[gr.Variable("Generate Molecules"), model_name, num_molecules, seed_num, gr.Textbox.update(value="")],
         outputs=[
             image_output,
             file_download,
             basic_metrics_df,
             advanced_metrics_df
+        ],
+        api_name="inference_classical"
     )
+    custom_submit.click(
+        run_inference,
+        inputs=[gr.Variable("Custom Input SMILES"), model_name_custom, 0, gr.Textbox.update(value=""), custom_smiles],
+        outputs=[
+            image_output_custom,
+            file_download_custom,
+            basic_metrics_df_custom,
+            advanced_metrics_df_custom
+        ],
+        api_name="inference_custom"
+    )
 demo.queue()
 demo.launch()