getdemo

Running

App Files Community

Xi "Alexander" Fu committed on Sep 19, 2023

Commit

40af086

unverified ·

2 Parent(s): 9a25e9f b3e8ff2

Merge pull request #1 from fuxialexander/buendia/read-from-s3

Browse files

Files changed (2) hide show

Dockerfile +2 -2
app/main.py +73 -38

Dockerfile CHANGED Viewed

@@ -9,7 +9,7 @@ USER $MAMBA_USER
 # Set the working directory in the container to /app
 WORKDIR /app
 # Create a new environment using mamba with specified packages
-RUN micromamba install -n base -c conda-forge -c bioconda -y python=3.10 pip biopython
 RUN micromamba install -n base -c conda-forge -c bioconda -y nglview tqdm matplotlib pandas
 RUN micromamba install -n base -c conda-forge -c bioconda -y openpyxl pyarrow python-box xmlschema seaborn numpy py3Dmol pyranges scipy pyyaml zarr numcodecs
 RUN micromamba install -n base -c conda-forge -c bioconda -y pybigwig networkx plotly pysam requests seqlogo MOODS urllib3 pyliftover gprofiler-official pyfaidx
@@ -57,4 +57,4 @@ EXPOSE 7681
 # Set the working directory where your app resides
 # Command to run the Gradio app automatically
-CMD ["python", "app/main.py", "-p", "7681", "-s", "-d", "/data"]

 # Set the working directory in the container to /app
 WORKDIR /app
 # Create a new environment using mamba with specified packages
+RUN micromamba install -n base -c conda-forge -c bioconda -y python=3.10 pip biopython s3fs
 RUN micromamba install -n base -c conda-forge -c bioconda -y nglview tqdm matplotlib pandas
 RUN micromamba install -n base -c conda-forge -c bioconda -y openpyxl pyarrow python-box xmlschema seaborn numpy py3Dmol pyranges scipy pyyaml zarr numcodecs
 RUN micromamba install -n base -c conda-forge -c bioconda -y pybigwig networkx plotly pysam requests seqlogo MOODS urllib3 pyliftover gprofiler-official pyfaidx
 # Set the working directory where your app resides
 # Command to run the Gradio app automatically
+CMD ["python", "app/main.py", "-p", "7681", "-s", "-u", "s3://2023-get-xf2217/get_demo_test_data", "-d", "/data"]

app/main.py CHANGED Viewed

@@ -6,67 +6,102 @@ import matplotlib.pyplot as plt
 import pandas as pd
 import pkg_resources
 from dash_bio import Clustergram
-from proscope.data import get_genename_to_uniprot, get_lddt, get_seq
-seq = get_seq()
-genename_to_uniprot = get_genename_to_uniprot()
-lddt = get_lddt()
 import sys
 from glob import glob
 import numpy as np
 from atac_rna_data_processing.config.load_config import load_config
 from atac_rna_data_processing.io.celltype import GETCellType
 from atac_rna_data_processing.io.nr_motif_v1 import NrMotifV1
 from proscope.af2 import AFPairseg
 from proscope.protein import Protein
 from proscope.viewer import view_pdb_html
 args = argparse.ArgumentParser()
 args.add_argument("-p", "--port", type=int, default=7860, help="Port number")
 args.add_argument("-s", "--share", action="store_true", help="Share on network")
-args.add_argument("-d", "--data", type=str, default="/data", help="Data directory")
 args = args.parse_args()
-# set pseudo args
-# args = args.parse_args(['-p', '7869', '-s', '-d', '/manitou/pmg/users/xf2217/demo_data'])
-gene_pairs = glob(f"{args.data}/structures/causal/*")
-gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
 GET_CONFIG = load_config(
-    "/manitou/pmg/users/xf2217/atac_rna_data_processing/atac_rna_data_processing/config/GET"
 )
 GET_CONFIG.celltype.jacob = True
 GET_CONFIG.celltype.num_cls = 2
 GET_CONFIG.celltype.input = True
 GET_CONFIG.celltype.embed = True
-GET_CONFIG.celltype.data_dir = (
-    "/manitou/pmg/users/xf2217/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
-)
-GET_CONFIG.celltype.interpret_dir = (
-    "/manitou/pmg/users/xf2217/Interpretation_all_hg38_allembed_v4_natac/"
-)
-GET_CONFIG.motif_dir = "/manitou/pmg/users/xf2217/interpret_natac/motif-clustering"
-motif = NrMotifV1.load_from_pickle(
-    pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
-    GET_CONFIG.motif_dir,
-)
-cell_type_annot = pd.read_csv(
-    GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
-    + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
-)
-cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
-cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
-avaliable_celltypes = sorted(
-    [
-        cell_type_id_to_name[f.split("/")[-1]]
-        for f in glob(GET_CONFIG.celltype.interpret_dir + "*")
-    ]
-)
 plt.rcParams["figure.dpi"] = 100
 def visualize_AF2(tf_pair, a):
-    strcture_dir = f"{args.data}/structures/causal/{tf_pair}"
-    fasta_dir = f"{args.data}/sequences/causal/{tf_pair}"
     if not os.path.exists(strcture_dir):
         gr.ErrorText("No such gene pair")
@@ -185,7 +220,7 @@ This section enables you to select different cell types and generates a plot tha
 """
                 )
                 celltype_name = gr.Dropdown(
-                    label="Cell Type", choices=avaliable_celltypes, value='Fetal Astrocyte 1'
                 )
                 celltype_btn = gr.Button(value="Load & plot gene expression")
                 gene_exp_plot = gr.Plot(label="Gene expression prediction vs observation")

 import pandas as pd
 import pkg_resources
 from dash_bio import Clustergram
 import sys
+import s3fs
 from glob import glob
 import numpy as np
 from atac_rna_data_processing.config.load_config import load_config
 from atac_rna_data_processing.io.celltype import GETCellType
 from atac_rna_data_processing.io.nr_motif_v1 import NrMotifV1
 from proscope.af2 import AFPairseg
+from proscope.data import get_genename_to_uniprot, get_lddt, get_seq
 from proscope.protein import Protein
 from proscope.viewer import view_pdb_html
+seq = get_seq()
+genename_to_uniprot = get_genename_to_uniprot()
+lddt = get_lddt()
 args = argparse.ArgumentParser()
 args.add_argument("-p", "--port", type=int, default=7860, help="Port number")
 args.add_argument("-s", "--share", action="store_true", help="Share on network")
+args.add_argument("-u", "--s3_uri", type=str, default=None, help="Path to demo S3 bucket")
+args.add_argument("-d", "--data", type=str, default=None, help="Data directory")
 args = args.parse_args()
 GET_CONFIG = load_config(
+    "/app/modules/atac_rna_data_processing/atac_rna_data_processing/config/GET"
 )
 GET_CONFIG.celltype.jacob = True
 GET_CONFIG.celltype.num_cls = 2
 GET_CONFIG.celltype.input = True
 GET_CONFIG.celltype.embed = True
 plt.rcParams["figure.dpi"] = 100
+if args.s3_uri: # Use S3 path if exists
+    GET_CONFIG.s3_uri = args.s3_uri
+    s3 = s3fs.S3FileSystem()
+    GET_CONFIG.celltype.data_dir = (
+        f"{args.s3_uri}/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
+    )
+    GET_CONFIG.celltype.interpret_dir = (
+        f"{args.s3_uri}/Interpretation_all_hg38_allembed_v4_natac/"
+    )
+    GET_CONFIG.motif_dir = f"{args.s3_uri}/interpret_natac/motif-clustering"
+    cell_type_annot = pd.read_csv(
+        GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
+            + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
+    )
+    cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
+    cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
+    available_celltypes = sorted(
+        [
+            cell_type_id_to_name[f.split("/")[-1]]
+            for f in s3.glob(GET_CONFIG.celltype.interpret_dir + "*")
+        ]
+    )
+    gene_pairs = s3.glob(f"{args.s3_uri}/structures/causal/*")
+    gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
+    motif = NrMotifV1.load_from_pickle(
+        pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
+        GET_CONFIG.motif_dir,
+    )
+else: # Run with local data
+    GET_CONFIG.celltype.data_dir = (
+        f"{args.data}/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
+    )
+    GET_CONFIG.celltype.interpret_dir = (
+        f"{args.data}/Interpretation_all_hg38_allembed_v4_natac/"
+    )
+    GET_CONFIG.motif_dir = f"{args.data}/interpret_natac/motif-clustering"
+    cell_type_annot = pd.read_csv(
+        GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
+            + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
+    )
+    cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
+    cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
+    available_celltypes = sorted(
+        [
+            cell_type_id_to_name[f.split("/")[-1]]
+            for f in glob(GET_CONFIG.celltype.interpret_dir + "*")
+        ]
+    )
+    gene_pairs = glob(f"{args.data}/structures/causal/*")
+    gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
+    motif = NrMotifV1.load_from_pickle(
+        pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
+        GET_CONFIG.motif_dir,
+    )
 def visualize_AF2(tf_pair, a):
+    if args.s3_uri:
+        strcture_dir = f"{args.s3_uri}/structures/causal/{tf_pair}"
+        fasta_dir = f"{args.s3_uri}/sequences/causal/{tf_pair}"
+    else:
+        strcture_dir = f"{args.data}/structures/causal/{tf_pair}"
+        fasta_dir = f"{args.data}/sequences/causal/{tf_pair}"
     if not os.path.exists(strcture_dir):
         gr.ErrorText("No such gene pair")
 """
                 )
                 celltype_name = gr.Dropdown(
+                    label="Cell Type", choices=available_celltypes, value='Fetal Astrocyte 1'
                 )
                 celltype_btn = gr.Button(value="Load & plot gene expression")
                 gene_exp_plot = gr.Plot(label="Gene expression prediction vs observation")