Spaces:

AI4PD
/

hexviz

Sleeping

App Files Community

aksell committed on Apr 21, 2023

Commit

a2cfd88

1 Parent(s): 8cef26d

Format with Black

Browse files

Files changed (3) hide show

hexviz/models.py +14 -8
hexviz/view.py +50 -21
hexviz/🧬Attention_Visualization.py +92 -30

hexviz/models.py CHANGED Viewed

@@ -4,12 +4,17 @@ import streamlit as st
 import torch
 from tape import ProteinBertModel, TAPETokenizer
 from tokenizers import Tokenizer
-from transformers import (AutoTokenizer, BertModel, BertTokenizer,
-                          GPT2LMHeadModel, GPT2TokenizerFast)
 class ModelType(str, Enum):
-    TAPE_BERT = "TAPE-BERT"
     ZymCTRL = "ZymCTRL"
     PROT_BERT = "ProtBert"
@@ -24,22 +29,23 @@ class Model:
 @st.cache
 def get_tape_bert() -> tuple[TAPETokenizer, ProteinBertModel]:
     tokenizer = TAPETokenizer()
-    model = ProteinBertModel.from_pretrained('bert-base', output_attentions=True)
     return tokenizer, model
 # Streamlit is not able to hash the tokenizer for ZymCTRL
 # With streamlit 1.19 cache_object should work without this
 @st.cache(hash_funcs={Tokenizer: lambda _: None})
 def get_zymctrl() -> tuple[GPT2TokenizerFast, GPT2LMHeadModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    tokenizer = AutoTokenizer.from_pretrained('nferruz/ZymCTRL')
-    model = GPT2LMHeadModel.from_pretrained('nferruz/ZymCTRL').to(device)
     return tokenizer, model
 @st.cache(hash_funcs={BertTokenizer: lambda _: None})
 def get_prot_bert() -> tuple[BertTokenizer, BertModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
     model = BertModel.from_pretrained("Rostlab/prot_bert").to(device)
-    return tokenizer, model

 import torch
 from tape import ProteinBertModel, TAPETokenizer
 from tokenizers import Tokenizer
+from transformers import (
+    AutoTokenizer,
+    BertModel,
+    BertTokenizer,
+    GPT2LMHeadModel,
+    GPT2TokenizerFast,
+)
 class ModelType(str, Enum):
+    TAPE_BERT = "TapeBert"
     ZymCTRL = "ZymCTRL"
     PROT_BERT = "ProtBert"
 @st.cache
 def get_tape_bert() -> tuple[TAPETokenizer, ProteinBertModel]:
     tokenizer = TAPETokenizer()
+    model = ProteinBertModel.from_pretrained("bert-base", output_attentions=True)
     return tokenizer, model
 # Streamlit is not able to hash the tokenizer for ZymCTRL
 # With streamlit 1.19 cache_object should work without this
 @st.cache(hash_funcs={Tokenizer: lambda _: None})
 def get_zymctrl() -> tuple[GPT2TokenizerFast, GPT2LMHeadModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained("nferruz/ZymCTRL")
+    model = GPT2LMHeadModel.from_pretrained("nferruz/ZymCTRL").to(device)
     return tokenizer, model
 @st.cache(hash_funcs={BertTokenizer: lambda _: None})
 def get_prot_bert() -> tuple[BertTokenizer, BertModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
     model = BertModel.from_pretrained("Rostlab/prot_bert").to(device)
+    return tokenizer, model

hexviz/view.py CHANGED Viewed

@@ -6,17 +6,26 @@ from Bio.PDB import PDBParser
 from hexviz.attention import get_pdb_file, get_pdb_from_seq
 menu_items = {
-    "Get Help": "https://huggingface.co/spaces/aksell/hexviz/discussions/new",
-    "Report a bug": "https://huggingface.co/spaces/aksell/hexviz/discussions/new",
-    "About": "Created by [Aksel Lenes](https://github.com/aksell/) from Noelia Ferruz's group at the Institute of Molecular Biology of Barcelona. Read more at https://www.aiproteindesign.com/"
-    }
 def get_selecte_model_index(models):
     selected_model_name = st.session_state.get("selected_model_name", None)
     if selected_model_name is None:
         return 0
     else:
-        return next((i for i, model in enumerate(models) if model.name.value == selected_model_name), None)
 def clear_model_state():
     if "plot_heads" in st.session_state:
@@ -32,13 +41,22 @@ def clear_model_state():
     if "plot_heads" in st.session_state:
         del st.session_state.plot_heads
 def select_model(models):
     if "selected_model_name" not in st.session_state:
         st.session_state.selected_model_name = models[0].name.value
-    selected_model_name = st.selectbox("Select model", [model.name.value for model in models], key="selected_model_name", on_change=clear_model_state)
-    select_model = next((model for model in models if model.name.value == selected_model_name), None)
     return select_model
 def clear_pdb_state():
     if "selected_chains" in st.session_state:
         del st.session_state.selected_chains
@@ -49,16 +67,14 @@ def clear_pdb_state():
     if "uploaded_pdb_str" in st.session_state:
         del st.session_state.uploaded_pdb_str
 def select_pdb():
     if "pdb_id" not in st.session_state:
         st.session_state.pdb_id = "2FZ5"
-    pdb_id = st.text_input(
-            label = "1.PDB ID",
-            key = "pdb_id",
-            on_change=clear_pdb_state
-            )
     return pdb_id
 def select_protein(pdb_code, uploaded_file, input_sequence):
     # We get the pdb from 1 of 3 places:
     # 1. Cached pdb from session storage
@@ -85,6 +101,7 @@ def select_protein(pdb_code, uploaded_file, input_sequence):
     structure = parser.get_structure(pdb_code, StringIO(pdb_str))
     return pdb_str, structure, source
 def select_heads_and_layers(sidebar, model):
     sidebar.markdown(
         """
@@ -93,23 +110,35 @@ def select_heads_and_layers(sidebar, model):
         """
     )
     if "plot_heads" not in st.session_state:
-        st.session_state.plot_heads = (1, model.heads//2)
-    head_range = sidebar.slider("Heads to plot", min_value=1, max_value=model.heads, key="plot_heads", step=1)
     if "plot_layers" not in st.session_state:
-        st.session_state.plot_layers = (1, model.layers//2)
-    layer_range = sidebar.slider("Layers to plot", min_value=1, max_value=model.layers, key="plot_layers", step=1)
     if "plot_step_size" not in st.session_state:
         st.session_state.plot_step_size = 1
-    step_size = sidebar.number_input("Optional step size to skip heads and layers", key="plot_step_size", min_value=1, max_value=model.layers)
-    layer_sequence = list(range(layer_range[0]-1, layer_range[1], step_size))
-    head_sequence = list(range(head_range[0]-1, head_range[1], step_size))
     return layer_sequence, head_sequence
 def select_sequence_slice(sequence_length):
     st.sidebar.markdown("Sequence segment to plot")
     if "sequence_slice" not in st.session_state:
         st.session_state.sequence_slice = (1, min(50, sequence_length))
-    slice = st.sidebar.slider("Sequence", key="sequence_slice", min_value=1, max_value=sequence_length, step=1)
-    return slice

 from hexviz.attention import get_pdb_file, get_pdb_from_seq
 menu_items = {
+    "Get Help": "https://huggingface.co/spaces/aksell/hexviz/discussions/new",
+    "Report a bug": "https://huggingface.co/spaces/aksell/hexviz/discussions/new",
+    "About": "Created by [Aksel Lenes](https://github.com/aksell/) from Noelia Ferruz's group at the Institute of Molecular Biology of Barcelona. Read more at https://www.aiproteindesign.com/",
+}
 def get_selecte_model_index(models):
     selected_model_name = st.session_state.get("selected_model_name", None)
     if selected_model_name is None:
         return 0
     else:
+        return next(
+            (
+                i
+                for i, model in enumerate(models)
+                if model.name.value == selected_model_name
+            ),
+            None,
+        )
 def clear_model_state():
     if "plot_heads" in st.session_state:
     if "plot_heads" in st.session_state:
         del st.session_state.plot_heads
 def select_model(models):
     if "selected_model_name" not in st.session_state:
         st.session_state.selected_model_name = models[0].name.value
+    selected_model_name = st.selectbox(
+        "Select model",
+        [model.name.value for model in models],
+        key="selected_model_name",
+        on_change=clear_model_state,
+    )
+    select_model = next(
+        (model for model in models if model.name.value == selected_model_name), None
+    )
     return select_model
 def clear_pdb_state():
     if "selected_chains" in st.session_state:
         del st.session_state.selected_chains
     if "uploaded_pdb_str" in st.session_state:
         del st.session_state.uploaded_pdb_str
 def select_pdb():
     if "pdb_id" not in st.session_state:
         st.session_state.pdb_id = "2FZ5"
+    pdb_id = st.text_input(label="1.PDB ID", key="pdb_id", on_change=clear_pdb_state)
     return pdb_id
 def select_protein(pdb_code, uploaded_file, input_sequence):
     # We get the pdb from 1 of 3 places:
     # 1. Cached pdb from session storage
     structure = parser.get_structure(pdb_code, StringIO(pdb_str))
     return pdb_str, structure, source
 def select_heads_and_layers(sidebar, model):
     sidebar.markdown(
         """
         """
     )
     if "plot_heads" not in st.session_state:
+        st.session_state.plot_heads = (1, model.heads // 2)
+    head_range = sidebar.slider(
+        "Heads to plot", min_value=1, max_value=model.heads, key="plot_heads", step=1
+    )
     if "plot_layers" not in st.session_state:
+        st.session_state.plot_layers = (1, model.layers // 2)
+    layer_range = sidebar.slider(
+        "Layers to plot", min_value=1, max_value=model.layers, key="plot_layers", step=1
+    )
     if "plot_step_size" not in st.session_state:
         st.session_state.plot_step_size = 1
+    step_size = sidebar.number_input(
+        "Optional step size to skip heads and layers",
+        key="plot_step_size",
+        min_value=1,
+        max_value=model.layers,
+    )
+    layer_sequence = list(range(layer_range[0] - 1, layer_range[1], step_size))
+    head_sequence = list(range(head_range[0] - 1, head_range[1], step_size))
     return layer_sequence, head_sequence
 def select_sequence_slice(sequence_length):
     st.sidebar.markdown("Sequence segment to plot")
     if "sequence_slice" not in st.session_state:
         st.session_state.sequence_slice = (1, min(50, sequence_length))
+    slice = st.sidebar.slider(
+        "Sequence", key="sequence_slice", min_value=1, max_value=sequence_length, step=1
+    )
+    return slice

hexviz/🧬Attention_Visualization.py CHANGED Viewed

@@ -4,8 +4,11 @@ import stmol
 import streamlit as st
 from stmol import showmol
-from hexviz.attention import (clean_and_validate_sequence, get_attention_pairs,
-                              get_chains)
 from hexviz.models import Model, ModelType
 from hexviz.view import menu_items, select_model, select_pdb, select_protein
@@ -21,10 +24,14 @@ models = [
     Model(name=ModelType.PROT_BERT, layers=30, heads=16),
 ]
-with st.expander("Input a PDB id, upload a PDB file or input a sequence", expanded=True):
     pdb_id = select_pdb()
     uploaded_file = st.file_uploader("2.Upload PDB", type=["pdb"])
-    input_sequence = st.text_area("3.Input sequence", "", key="input_sequence", max_chars=400)
     sequence, error = clean_and_validate_sequence(input_sequence)
     if error:
         st.error(error)
@@ -35,14 +42,19 @@ st.sidebar.markdown(
     """
     Configure visualization
     ---
-    """)
 chains = get_chains(structure)
 if "selected_chains" not in st.session_state:
     st.session_state.selected_chains = chains
-selected_chains = st.sidebar.multiselect(label="Select Chain(s)", options=chains, key="selected_chains")
-show_ligands = st.sidebar.checkbox("Show ligands", value=st.session_state.get("show_ligands", True))
 st.session_state.show_ligands = show_ligands
@@ -50,9 +62,14 @@ st.sidebar.markdown(
     """
     Attention parameters
     ---
-    """)
-min_attn = st.sidebar.slider("Minimum attention", min_value=0.0, max_value=0.4, value=0.1)
-n_highest_resis = st.sidebar.number_input("Num highest attention resis to label", value=2, min_value=1, max_value=100)
 label_highest = st.sidebar.checkbox("Label highest attention residues", value=True)
 sidechain_highest = st.sidebar.checkbox("Show sidechains", value=True)
 # TODO add avg or max attention as params
@@ -60,7 +77,9 @@ sidechain_highest = st.sidebar.checkbox("Show sidechains", value=True)
 with st.sidebar.expander("Label residues manually"):
     hl_chain = st.selectbox(label="Chain to label", options=selected_chains, index=0)
-    hl_resi_list = st.multiselect(label="Selected Residues",options=list(range(1,5000)))
     label_resi = st.checkbox(label="Label Residues", value=True)
@@ -71,26 +90,46 @@ with left:
 with mid:
     if "selected_layer" not in st.session_state:
         st.session_state["selected_layer"] = 5
-    layer_one = st.selectbox("Layer", options=[i for i in range(1, selected_model.layers+1)], key="selected_layer")
     layer = layer_one - 1
 with right:
     if "selected_head" not in st.session_state:
         st.session_state["selected_head"] = 1
-    head_one = st.selectbox("Head", options=[i for i in range(1, selected_model.heads+1)], key="selected_head")
     head = head_one - 1
 if selected_model.name == ModelType.ZymCTRL:
     try:
         ec_class = structure.header["compound"]["1"]["ec"]
     except KeyError:
-        ec_class = None
-    if ec_class and selected_model.name == ModelType.ZymCTRL:
-        ec_class = st.sidebar.text_input("Enzyme classification number fetched from PDB", ec_class)
-attention_pairs, top_residues = get_attention_pairs(pdb_str=pdb_str, chain_ids=selected_chains, layer=layer, head=head, threshold=min_attn, model_type=selected_model.name, top_n=n_highest_resis)
-sorted_by_attention = sorted(attention_pairs, key=lambda x: x[0], reverse=True)
 def get_3dview(pdb):
     xyzview = py3Dmol.view()
@@ -100,38 +139,61 @@ def get_3dview(pdb):
     # Show all ligands as stick (heteroatoms)
     if show_ligands:
-        xyzview.addStyle({"hetflag": True},
-                            {"stick": {"radius": 0.2}})
     # If no chains are selected, show all chains
     if selected_chains:
         hidden_chains = [x for x in chains if x not in selected_chains]
         for chain in hidden_chains:
-            xyzview.setStyle({"chain": chain},{"cross":{"hidden":"true"}})
             # Hide ligands for chain too
-            xyzview.addStyle({"chain": chain, "hetflag": True},{"cross": {"hidden": "true"}})
     if len(selected_chains) == 1:
-        xyzview.zoomTo({'chain': f'{selected_chains[0]}'})
     else:
         xyzview.zoomTo()
     for att_weight, first, second, _, _, _ in attention_pairs:
-        stmol.add_cylinder(xyzview, start=first, end=second, cylradius=att_weight, cylColor='red', dashed=False)
     if label_resi:
         for hl_resi in hl_resi_list:
-            xyzview.addResLabels({"chain": hl_chain,"resi": hl_resi},
-            {"backgroundColor": "lightgray","fontColor": "black","backgroundOpacity": 0.5})
     if label_highest:
         for _, _, chain, res in top_residues:
-            xyzview.addResLabels({"chain": chain, "resi": res},
-            {"backgroundColor": "lightgray", "fontColor": "black", "backgroundOpacity": 0.5})
             if sidechain_highest:
-                xyzview.addStyle({"chain": chain, "resi": res},{"stick": {"radius": 0.2}})
     return xyzview
 xyzview = get_3dview(pdb_id)
 showmol(xyzview, height=500, width=800)

 import streamlit as st
 from stmol import showmol
+from hexviz.attention import (
+    clean_and_validate_sequence,
+    get_attention_pairs,
+    get_chains,
+)
 from hexviz.models import Model, ModelType
 from hexviz.view import menu_items, select_model, select_pdb, select_protein
     Model(name=ModelType.PROT_BERT, layers=30, heads=16),
 ]
+with st.expander(
+    "Input a PDB id, upload a PDB file or input a sequence", expanded=True
+):
     pdb_id = select_pdb()
     uploaded_file = st.file_uploader("2.Upload PDB", type=["pdb"])
+    input_sequence = st.text_area(
+        "3.Input sequence", "", key="input_sequence", max_chars=400
+    )
     sequence, error = clean_and_validate_sequence(input_sequence)
     if error:
         st.error(error)
     """
     Configure visualization
     ---
+    """
+)
 chains = get_chains(structure)
 if "selected_chains" not in st.session_state:
     st.session_state.selected_chains = chains
+selected_chains = st.sidebar.multiselect(
+    label="Select Chain(s)", options=chains, key="selected_chains"
+)
+show_ligands = st.sidebar.checkbox(
+    "Show ligands", value=st.session_state.get("show_ligands", True)
+)
 st.session_state.show_ligands = show_ligands
     """
     Attention parameters
     ---
+    """
+)
+min_attn = st.sidebar.slider(
+    "Minimum attention", min_value=0.0, max_value=0.4, value=0.1
+)
+n_highest_resis = st.sidebar.number_input(
+    "Num highest attention resis to label", value=2, min_value=1, max_value=100
+)
 label_highest = st.sidebar.checkbox("Label highest attention residues", value=True)
 sidechain_highest = st.sidebar.checkbox("Show sidechains", value=True)
 # TODO add avg or max attention as params
 with st.sidebar.expander("Label residues manually"):
     hl_chain = st.selectbox(label="Chain to label", options=selected_chains, index=0)
+    hl_resi_list = st.multiselect(
+        label="Selected Residues", options=list(range(1, 5000))
+    )
     label_resi = st.checkbox(label="Label Residues", value=True)
 with mid:
     if "selected_layer" not in st.session_state:
         st.session_state["selected_layer"] = 5
+    layer_one = st.selectbox(
+        "Layer",
+        options=[i for i in range(1, selected_model.layers + 1)],
+        key="selected_layer",
+    )
     layer = layer_one - 1
 with right:
     if "selected_head" not in st.session_state:
         st.session_state["selected_head"] = 1
+    head_one = st.selectbox(
+        "Head",
+        options=[i for i in range(1, selected_model.heads + 1)],
+        key="selected_head",
+    )
     head = head_one - 1
+ec_class = ""
 if selected_model.name == ModelType.ZymCTRL:
     try:
         ec_class = structure.header["compound"]["1"]["ec"]
     except KeyError:
+        pass
+    ec_class = st.sidebar.text_input(
+        "Enzyme classification number fetched from PDB", ec_class
+    )
+attention_pairs, top_residues = get_attention_pairs(
+    pdb_str=pdb_str,
+    chain_ids=selected_chains,
+    layer=layer,
+    head=head,
+    threshold=min_attn,
+    model_type=selected_model.name,
+    ec_class=ec_class,
+    top_n=n_highest_resis,
+)
+sorted_by_attention = sorted(attention_pairs, key=lambda x: x[0], reverse=True)
 def get_3dview(pdb):
     xyzview = py3Dmol.view()
     # Show all ligands as stick (heteroatoms)
     if show_ligands:
+        xyzview.addStyle({"hetflag": True}, {"stick": {"radius": 0.2}})
     # If no chains are selected, show all chains
     if selected_chains:
         hidden_chains = [x for x in chains if x not in selected_chains]
         for chain in hidden_chains:
+            xyzview.setStyle({"chain": chain}, {"cross": {"hidden": "true"}})
             # Hide ligands for chain too
+            xyzview.addStyle(
+                {"chain": chain, "hetflag": True}, {"cross": {"hidden": "true"}}
+            )
     if len(selected_chains) == 1:
+        xyzview.zoomTo({"chain": f"{selected_chains[0]}"})
     else:
         xyzview.zoomTo()
     for att_weight, first, second, _, _, _ in attention_pairs:
+        stmol.add_cylinder(
+            xyzview,
+            start=first,
+            end=second,
+            cylradius=att_weight,
+            cylColor="red",
+            dashed=False,
+        )
     if label_resi:
         for hl_resi in hl_resi_list:
+            xyzview.addResLabels(
+                {"chain": hl_chain, "resi": hl_resi},
+                {
+                    "backgroundColor": "lightgray",
+                    "fontColor": "black",
+                    "backgroundOpacity": 0.5,
+                },
+            )
     if label_highest:
         for _, _, chain, res in top_residues:
+            xyzview.addResLabels(
+                {"chain": chain, "resi": res},
+                {
+                    "backgroundColor": "lightgray",
+                    "fontColor": "black",
+                    "backgroundOpacity": 0.5,
+                },
+            )
             if sidechain_highest:
+                xyzview.addStyle(
+                    {"chain": chain, "resi": res}, {"stick": {"radius": 0.2}}
+                )
     return xyzview
 xyzview = get_3dview(pdb_id)
 showmol(xyzview, height=500, width=800)