Commit 852aa1f by aksell · Parent: c663b1c

Add ProtT5

hexviz/attention.py CHANGED

@@ -6,7 +6,13 @@ import streamlit as st
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
 
-from hexviz.models import ModelType, get_prot_bert, get_tape_bert, get_zymctrl
+from hexviz.models import (
+    ModelType,
+    get_prot_bert,
+    get_prot_t5,
+    get_tape_bert,
+    get_zymctrl,
+)
 
 
 def get_structure(pdb_code: str) -> Structure:
@@ -20,6 +26,7 @@ def get_structure(pdb_code: str) -> Structure:
     structure = parser.get_structure(pdb_code, file)
     return structure
 
+
 def get_pdb_file(pdb_code: str) -> Structure:
     """
     Get structure from PDB
@@ -29,6 +36,7 @@ def get_pdb_file(pdb_code: str) -> Structure:
     file = StringIO(pdb_data)
     return file
 
+
 @st.cache
 def get_pdb_from_seq(sequence: str) -> str:
     """
@@ -39,6 +47,7 @@ def get_pdb_from_seq(sequence: str) -> str:
     pdb_str = res.text
     return pdb_str
 
+
 def get_chains(structure: Structure) -> list[str]:
     """
     Get list of chains in a structure
@@ -49,6 +58,7 @@ def get_chains(structure: Structure) -> list[str]:
         chains.append(chain.id)
     return chains
 
+
 def get_sequence(chain) -> str:
     """
     Get sequence from a chain
@@ -57,13 +67,18 @@ def get_sequence(chain) -> str:
     """
     residues = [residue.get_resname() for residue in chain.get_residues()]
     # TODO ask if using protein_letters_3to1_extended makes sense
-    residues_single_letter = map(lambda x: Polypeptide.protein_letters_3to1.get(x, "X"), residues)
+    residues_single_letter = map(
+        lambda x: Polypeptide.protein_letters_3to1.get(x, "X"), residues
+    )
 
     return "".join(list(residues_single_letter))
 
+
 def clean_and_validate_sequence(sequence: str) -> tuple[str, str | None]:
     lines = sequence.split("\n")
-    cleaned_sequence = "".join(line.upper() for line in lines if not line.startswith(">"))
+    cleaned_sequence = "".join(
+        line.upper() for line in lines if not line.startswith(">")
+    )
     cleaned_sequence = cleaned_sequence.replace(" ", "")
     valid_residues = set(Polypeptide.protein_letters_3to1.values())
     residues_in_sequence = set(cleaned_sequence)
@@ -84,9 +99,7 @@ def clean_and_validate_sequence(sequence: str) -> tuple[str, str | None]:
 
 
 @st.cache
-def get_attention(
-    sequence: str, model_type: ModelType = ModelType.TAPE_BERT
-):
+def get_attention(sequence: str, model_type: ModelType = ModelType.TAPE_BERT):
     """
     Returns a tensor of shape [n_layers, n_heads, n_res, n_res] with attention weights
     """
@@ -104,11 +117,15 @@ def get_attention(
 
     elif model_type == ModelType.ZymCTRL:
         tokenizer, model = get_zymctrl()
-        inputs = tokenizer(sequence, return_tensors='pt').input_ids.to(device)
-        attention_mask = tokenizer(sequence, return_tensors='pt').attention_mask.to(device)
+        inputs = tokenizer(sequence, return_tensors="pt").input_ids.to(device)
+        attention_mask = tokenizer(sequence, return_tensors="pt").attention_mask.to(
+            device
+        )
 
         with torch.no_grad():
-            outputs = model(inputs, attention_mask=attention_mask, output_attentions=True)
+            outputs = model(
+                inputs, attention_mask=attention_mask, output_attentions=True
+            )
         attentions = outputs.attentions
 
         # torch.Size([1, n_heads, n_res, n_res]) -> torch.Size([n_heads, n_res, n_res])
@@ -128,12 +145,27 @@ def get_attention(
         attentions = [attention[:, :, 1:-1, 1:-1] for attention in attentions]
         attentions = torch.stack([attention.squeeze(0) for attention in attentions])
 
+    elif model_type == ModelType.PROT_T5:
+        tokenizer, model = get_prot_t5()
+        sequence_separated = " ".join(sequence)
+        token_idxs = tokenizer.encode(sequence_separated)
+        inputs = torch.tensor(token_idxs).unsqueeze(0).to(device)
+        with torch.no_grad():
+            attentions = model(inputs, output_attentions=True)[
+                -1
+            ]  # Do you need an attention mask?
+
+        # Remove attention to <pad> (first) and <extra_id_1>, <extra_id_2> (last) tokens
+        attentions = [attention[:, :, 3:-3, 3:-3] for attention in attentions]
+        attentions = torch.stack([attention.squeeze(0) for attention in attentions])
+
     else:
         raise ValueError(f"Model {model_type} not supported")
 
     # Transfer to CPU to avoid issues with streamlit caching
     return attentions.cpu()
 
+
 def unidirectional_avg_filtered(attention, layer, head, threshold):
     num_layers, num_heads, seq_len, _ = attention.shape
     attention_head = attention[layer, head]
@@ -147,7 +179,7 @@ def unidirectional_avg_filtered(attention, layer, head, threshold):
             if avg >= threshold:
                 unidirectional_avg_for_head.append((avg, i, j))
     return unidirectional_avg_for_head
-
+
 
 # Passing the pdb_str here is a workaround for streamlit caching
 # where I need the input to be hashable and not changing
@@ -155,7 +187,15 @@ def unidirectional_avg_filtered(attention, layer, head, threshold):
 # Thist twice. If streamlit is upgaded to past 0.17 this can be
 # fixed.
 @st.cache
-def get_attention_pairs(pdb_str: str, layer: int, head: int, chain_ids: list[str] | None ,threshold: int = 0.2, model_type: ModelType = ModelType.TAPE_BERT, top_n: int = 2):
+def get_attention_pairs(
+    pdb_str: str,
+    layer: int,
+    head: int,
+    chain_ids: list[str] | None,
+    threshold: int = 0.2,
+    model_type: ModelType = ModelType.TAPE_BERT,
+    top_n: int = 2,
+):
     structure = PDBParser().get_structure("pdb", StringIO(pdb_str))
     if chain_ids:
         chains = [ch for ch in structure.get_chains() if ch.id in chain_ids]
@@ -167,9 +207,11 @@ def get_attention_pairs(pdb_str: str, layer: int, head: int, chain_ids: list[str
     for chain in chains:
        sequence = get_sequence(chain)
        attention = get_attention(sequence=sequence, model_type=model_type)
-        attention_unidirectional = unidirectional_avg_filtered(attention, layer, head, threshold)
+        attention_unidirectional = unidirectional_avg_filtered(
+            attention, layer, head, threshold
+        )
 
-        # Store sum of attention in to a resiue (from the unidirectional attention)
+        # Store sum of attention in to a resiue (from the unidirectional attention)
        residue_attention = {}
        for attn_value, res_1, res_2 in attention_unidirectional:
            try:
@@ -178,15 +220,18 @@ def get_attention_pairs(pdb_str: str, layer: int, head: int, chain_ids: list[str
            except KeyError:
                continue
 
-            attention_pairs.append((attn_value, coord_1, coord_2, chain.id, res_1, res_2))
+            attention_pairs.append(
+                (attn_value, coord_1, coord_2, chain.id, res_1, res_2)
+            )
            residue_attention[res_1] = residue_attention.get(res_1, 0) + attn_value
            residue_attention[res_2] = residue_attention.get(res_2, 0) + attn_value
-
-        top_n_residues = sorted(residue_attention.items(), key=lambda x: x[1], reverse=True)[:top_n]
-
+
+        top_n_residues = sorted(
+            residue_attention.items(), key=lambda x: x[1], reverse=True
+        )[:top_n]
+
        for res, attn_sum in top_n_residues:
            coord = chain[res]["CA"].coord.tolist()
            top_residues.append((attn_sum, coord, chain.id, res))
-
-    return attention_pairs, top_residues
 
+    return attention_pairs, top_residues
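
For reference, here is a minimal standalone sketch of what the new `ModelType.PROT_T5` branch of `get_attention` does, with the Streamlit caching stripped away. It assumes `torch`, `transformers`, and `sentencepiece` are installed and that the Rostlab/prot_t5_xl_half_uniref50-enc weights can be downloaded; the example sequence is purely illustrative, and the special-token trimming simply mirrors the slice chosen in this commit.

```python
import torch
from transformers import T5EncoderModel, T5Tokenizer

# Illustrative input; any single-letter amino acid string works here.
sequence = "MKTAYIAKQR"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained(
    "Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)

# ProtT5 expects residues separated by spaces ("M K T A ...").
token_idxs = tokenizer.encode(" ".join(sequence))
inputs = torch.tensor(token_idxs).unsqueeze(0).to(device)

with torch.no_grad():
    # Tuple of per-layer tensors, each [1, n_heads, n_tokens, n_tokens]
    attentions = model(inputs, output_attentions=True).attentions

# Trim special-token positions the same way the commit does, then stack
# into a single [n_layers, n_heads, n_res, n_res] tensor.
attentions = [attention[:, :, 3:-3, 3:-3] for attention in attentions]
attentions = torch.stack([attention.squeeze(0) for attention in attentions]).cpu()
print(attentions.shape)
```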
hexviz/models.py CHANGED

@@ -10,6 +10,8 @@ from transformers import (
     BertTokenizer,
     GPT2LMHeadModel,
     GPT2TokenizerFast,
+    T5EncoderModel,
+    T5Tokenizer,
 )
 
 
@@ -17,6 +19,7 @@ class ModelType(str, Enum):
     TAPE_BERT = "TapeBert"
     ZymCTRL = "ZymCTRL"
     PROT_BERT = "ProtBert"
+    PROT_T5 = "ProtT5"
 
 
 class Model:
@@ -49,3 +52,15 @@ def get_prot_bert() -> tuple[BertTokenizer, BertModel]:
     tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
     model = BertModel.from_pretrained("Rostlab/prot_bert").to(device)
     return tokenizer, model
+
+
+@st.cache
+def get_prot_t5():
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    tokenizer = T5Tokenizer.from_pretrained(
+        "Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
+    )
+    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(
+        device
+    )
+    return tokenizer, model
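
A hedged end-to-end usage sketch of the newly registered model type through hexviz's own helpers; the function and enum names come from this commit, while the input sequence and the shape check are illustrative assumptions (and calling the `st.cache`-decorated helper outside a running Streamlit app may only emit a caching warning).

```python
# Assumes hexviz is importable and the ProtT5 weights can be downloaded.
from hexviz.attention import get_attention
from hexviz.models import ModelType

sequence = "MKTAYIAKQR"  # illustrative input, not from the commit

# Exercises the new ModelType.PROT_T5 branch added in hexviz/attention.py above.
attention = get_attention(sequence=sequence, model_type=ModelType.PROT_T5)

# Per get_attention's docstring the result is [n_layers, n_heads, n_res, n_res];
# the pages below register ProtT5 with 24 layers and 32 heads.
print(attention.shape)
```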
hexviz/pages/1_🗺️Identify_Interesting_Heads.py CHANGED

@@ -23,6 +23,7 @@ models = [
     Model(name=ModelType.TAPE_BERT, layers=12, heads=12),
     Model(name=ModelType.ZymCTRL, layers=36, heads=16),
     Model(name=ModelType.PROT_BERT, layers=30, heads=16),
+    Model(name=ModelType.PROT_T5, layers=24, heads=32),
 ]
 
 with st.expander(
hexviz/pages/2_📄Documentation.py CHANGED

@@ -45,6 +45,7 @@ Hexviz currently supports the following models:
 1. [ProtBERT](https://huggingface.co/Rostlab/prot_bert_bfd)
 2. [ZymCTRL](https://huggingface.co/nferruz/ZymCTRL)
 3. [TapeBert](https://github.com/songlab-cal/tape/blob/master/tape/models/modeling_bert.py) - a nickname coined in BERTOLOGY meets biology for the Bert Base model pre-trained on Pfam in [TAPE](https://www.biorxiv.org/content/10.1101/676825v1). TapeBert is used extensively in BERTOlogy meets biology.
+4. [ProtT5 half](https://huggingface.co/Rostlab/prot_t5_xl_half_uniref50-enc)
 
 ## FAQ
 1. I can't see any attention- "bars" in the visualization, what is wrong? -> Lower the `minimum attention`.
hexviz/🧬Attention_Visualization.py CHANGED

@@ -22,6 +22,7 @@ models = [
     Model(name=ModelType.TAPE_BERT, layers=12, heads=12),
     Model(name=ModelType.ZymCTRL, layers=36, heads=16),
     Model(name=ModelType.PROT_BERT, layers=30, heads=16),
+    Model(name=ModelType.PROT_T5, layers=24, heads=32),
 ]
 
 with st.expander(
@@ -219,8 +220,8 @@ st.table(df)
 st.markdown(
     """
 ### Check out the other pages
-[🗺️Identify Interesting heads](Identify_Interesting_Heads) give a birds-eye view of attention patterns for a model,
-this can help you pick what specific attention heads to look at for your protein.
+[🗺️Identify Interesting heads](Identify_Interesting_Heads) gives a bird's eye view of attention patterns for a model.
+This can help you pick what specific attention heads to look at for your protein.
 
 [📄Documentation](Documentation) has information on protein language models, attention analysis and hexviz."""
 )