Spaces:
Sleeping
Sleeping
Commit
·
c2a02c6
0
Parent(s):
Duplicate from fatmacankara/ASCARIS
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +38 -0
- README.md +14 -0
- app.py +129 -0
- code/__pycache__/add_3Dalignment.cpython-37.pyc +0 -0
- code/__pycache__/add_alignment.cpython-37.pyc +0 -0
- code/__pycache__/add_annotations.cpython-37.pyc +0 -0
- code/__pycache__/add_domains.cpython-37.pyc +0 -0
- code/__pycache__/add_interface_pos.cpython-37.pyc +0 -0
- code/__pycache__/add_sasa.cpython-37.pyc +0 -0
- code/__pycache__/add_sequence.cpython-37.pyc +0 -0
- code/__pycache__/add_structure.cpython-37.pyc +0 -0
- code/__pycache__/alphafold_featureVector.cpython-37.pyc +0 -0
- code/__pycache__/alphafold_model.cpython-37.pyc +0 -0
- code/__pycache__/calc_pc_property.cpython-37.pyc +0 -0
- code/__pycache__/manage_files.cpython-37.pyc +0 -0
- code/__pycache__/pdb_featureVector.cpython-37.pyc +0 -0
- code/__pycache__/process_input.cpython-37.pyc +0 -0
- code/__pycache__/standard.cpython-37.pyc +0 -0
- code/__pycache__/uniprotSequenceMatch.cpython-37.pyc +0 -0
- code/add_3Dalignment.py +261 -0
- code/add_alignment.py +423 -0
- code/add_annotations.py +95 -0
- code/add_domains.py +57 -0
- code/add_interface_pos.py +35 -0
- code/add_sasa.py +131 -0
- code/add_sequence.py +44 -0
- code/add_structure.py +168 -0
- code/alphafold_featureVector.py +579 -0
- code/alphafold_model.py +33 -0
- code/calc_pc_property.py +441 -0
- code/create_swissmodelSummary.py +1 -0
- code/get_alphafoldStructures.py +97 -0
- code/main.py +35 -0
- code/manage_files.py +42 -0
- code/pdb_featureVector.py +0 -0
- code/process_input.py +40 -0
- code/standard.py +13 -0
- code/uniprotSequenceMatch.py +40 -0
- input_files/H_sapiens_interfacesHQ.txt +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
input_files/alphafold_summary.txt filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
input_files/H_sapiens_interfacesHQ.txt filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
input_files/swissmodel_structures.txt filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ASCARIS
|
| 3 |
+
emoji: 🦀
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
python_version: '3.7'
|
| 8 |
+
sdk_version: 1.21.0
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
duplicated_from: fatmacankara/ASCARIS
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from os import path
|
| 4 |
+
import sys
|
| 5 |
+
import streamlit.components.v1 as components
|
| 6 |
+
sys.path.append('code/')
|
| 7 |
+
#sys.path.append('ASCARIS/code/')
|
| 8 |
+
import pdb_featureVector
|
| 9 |
+
import alphafold_featureVector
|
| 10 |
+
import argparse
|
| 11 |
+
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode,GridUpdateMode
|
| 12 |
+
showWarningOnDirectExecution = False
|
| 13 |
+
def download_button(object_to_download, download_filename):
    """Build an HTML snippet that auto-downloads *object_to_download*.

    Parameters
    ----------
    object_to_download : pandas.DataFrame or any JSON-serializable object
        A DataFrame is serialized to CSV (without the index); anything
        else is JSON-encoded.
    download_filename : str
        File name offered to the browser for the download.

    Returns
    -------
    str
        A self-contained HTML document whose embedded script clicks a
        data-URI anchor, triggering the download when rendered
        (e.g. via streamlit.components.v1.html).
    """
    # BUG FIX: the original referenced json/base64 without importing them,
    # raising NameError at runtime. Function-scope imports keep the fix local.
    import base64
    import json

    if isinstance(object_to_download, pd.DataFrame):
        object_to_download = object_to_download.to_csv(index=False)
    # Try JSON encode for everything else
    else:
        object_to_download = json.dumps(object_to_download)

    try:
        # some strings <-> bytes conversions necessary here
        b64 = base64.b64encode(object_to_download.encode()).decode()
    except AttributeError:
        # Already a bytes-like object (no .encode attribute).
        b64 = base64.b64encode(object_to_download).decode()

    dl_link = f"""<html><head><title>Start Auto Download file</title><script src="http://code.jquery.com/jquery-3.2.1.min.js"></script><script>$('<a href="data:text/csv;base64,{b64}" download="{download_filename}">')[0].click()</script></head></html>"""
    return dl_link
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def download_df():
    """Form callback: render the auto-download HTML for the selected grid.

    Relies on the module-level ``selected_df`` produced by the feature-vector
    pipeline and the ``filename`` key stored in Streamlit session state.
    """
    link_html = download_button(selected_df, st.session_state.filename)
    # height=0 keeps the injected document invisible; only its script runs.
    components.html(link_html, height=0)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# --- Page header -----------------------------------------------------------
original_title = '<p style="font-family:Trebuchet MS; color:#FD7456; font-size: 35px; font-weight:bold; text-align:center">Welcome to ASCARIS</p>'
st.markdown(original_title, unsafe_allow_html=True)
# Vertical spacing below the title.
st.write('')
st.write('')
st.write('')
st.write('')


# --- User inputs -----------------------------------------------------------
# source: 1 = PDB/SwissModel/Modbase structures, 2 = AlphaFold structures.
source = st.selectbox('Select Protein Structure Database (1: PDB, SwissModel, Modbase 2: AlphaFold)',[1,2])
impute = st.selectbox('Select Imputation',[True, False])
input_data = st.text_input('Enter Input Variation')


#sys.path.append(path.abspath('../code/'))
# NOTE(review): the argparse block below appears to be carried over from the
# CLI version of ASCARIS; the values actually consumed further down come from
# the Streamlit widgets above, and parse_args() on a Streamlit server may see
# unrelated argv — confirm whether this block can be removed.
parser = argparse.ArgumentParser(description='ASCARIS')

parser.add_argument('-s', '--source_option',
                    help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
                    default=1)
parser.add_argument('-i', '--input_datapoint',
                    help='Input file or query datapoint\n Option 1: Comma-separated list of idenfiers (UniProt ID-wt residue-position-mutated residue (e.g. Q9Y4W6-N-432-T or Q9Y4W6-N-432-T, Q9Y4W6-N-432-T)) \n Option 2: Enter comma-separated file path')

parser.add_argument('-impute', '--imputation_state', default='True',
                    help='Whether resulting feature vector should be imputed or not. Default True.')

args = parser.parse_args()

# Widget values feed the feature-vector generators below.
input_set = input_data
mode = source
impute = impute  # no-op self-assignment, kept from the original

print('*****************************************')
print('Feature vector generation is in progress. \nPlease check log file for updates..')
print('*****************************************')
mode = int(mode)

with st.spinner('In progress...This may take a while...'):
    try:
        if mode == 1:
            # PDB/SwissModel/Modbase pipeline.
            selected_df = pdb_featureVector.pdb(input_set, mode, impute)
            # Interactive, paginated, checkbox-selectable grid of the result.
            int_builder = GridOptionsBuilder.from_dataframe(selected_df)
            int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
            int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
            int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
            gridoptions = int_builder.build()
            int_return = AgGrid(selected_df,
                                width='100%',
                                height=(len(selected_df) + 4) * 35.2 + 3,
                                theme='light',
                                enable_enterprise_modules=False,
                                gridOptions=gridoptions,
                                fit_columns_on_grid_load=False,
                                update_mode=GridUpdateMode.SELECTION_CHANGED,  # or MODEL_CHANGED
                                custom_css={".ag-header-cell-label": {"justify-content": "center"}})
            st.success('Feature vector successfully created.')


        elif mode == 2:
            # AlphaFold pipeline; grid construction mirrors mode 1.
            selected_df = alphafold_featureVector.alphafold(input_set, mode, impute)
            int_builder = GridOptionsBuilder.from_dataframe(selected_df)
            int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
            int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
            int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
            gridoptions = int_builder.build()
            int_return = AgGrid(selected_df,
                                width='100%',
                                height=(len(selected_df) + 4) * 35.2 + 3,
                                theme='light',
                                enable_enterprise_modules=False,
                                gridOptions=gridoptions,
                                fit_columns_on_grid_load=False,
                                update_mode=GridUpdateMode.SELECTION_CHANGED,  # or MODEL_CHANGED
                                custom_css={".ag-header-cell-label": {"justify-content": "center"}})
            st.success('Feature vector successfully created.')


    except:
        # NOTE(review): bare except silently hides any pipeline error, and the
        # assignment below rebinds the module-level name `download_df`
        # (previously the callback function defined above) to a DataFrame,
        # which breaks on_click=download_df in the form below whenever this
        # path is taken — confirm intent.
        pass
        download_df = pd.DataFrame()

# --- Download form ---------------------------------------------------------
# Prompts for a filename and triggers the auto-download callback on submit.
with st.form("my_form", clear_on_submit=False):
    st.text_input("Enter filename", key="filename")
    submit = st.form_submit_button("Download feature vector", on_click=download_df)
code/__pycache__/add_3Dalignment.cpython-37.pyc
ADDED
|
Binary file (5.67 kB). View file
|
|
|
code/__pycache__/add_alignment.cpython-37.pyc
ADDED
|
Binary file (7.99 kB). View file
|
|
|
code/__pycache__/add_annotations.cpython-37.pyc
ADDED
|
Binary file (3.78 kB). View file
|
|
|
code/__pycache__/add_domains.cpython-37.pyc
ADDED
|
Binary file (1.44 kB). View file
|
|
|
code/__pycache__/add_interface_pos.cpython-37.pyc
ADDED
|
Binary file (1.12 kB). View file
|
|
|
code/__pycache__/add_sasa.cpython-37.pyc
ADDED
|
Binary file (3.17 kB). View file
|
|
|
code/__pycache__/add_sequence.cpython-37.pyc
ADDED
|
Binary file (1.27 kB). View file
|
|
|
code/__pycache__/add_structure.cpython-37.pyc
ADDED
|
Binary file (5.93 kB). View file
|
|
|
code/__pycache__/alphafold_featureVector.cpython-37.pyc
ADDED
|
Binary file (15.4 kB). View file
|
|
|
code/__pycache__/alphafold_model.cpython-37.pyc
ADDED
|
Binary file (1.35 kB). View file
|
|
|
code/__pycache__/calc_pc_property.cpython-37.pyc
ADDED
|
Binary file (8.84 kB). View file
|
|
|
code/__pycache__/manage_files.cpython-37.pyc
ADDED
|
Binary file (1.43 kB). View file
|
|
|
code/__pycache__/pdb_featureVector.cpython-37.pyc
ADDED
|
Binary file (33.7 kB). View file
|
|
|
code/__pycache__/process_input.cpython-37.pyc
ADDED
|
Binary file (1.69 kB). View file
|
|
|
code/__pycache__/standard.cpython-37.pyc
ADDED
|
Binary file (749 Bytes). View file
|
|
|
code/__pycache__/uniprotSequenceMatch.cpython-37.pyc
ADDED
|
Binary file (1.28 kB). View file
|
|
|
code/add_3Dalignment.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This code file produces alignments between the structure and the sequence for a given protein.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
import glob
|
| 8 |
+
import numpy as np
|
| 9 |
+
from Bio import Align
|
| 10 |
+
import gzip
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from Bio.Align import substitution_matrices
|
| 13 |
+
aligner = Align.PairwiseAligner()
|
| 14 |
+
|
| 15 |
+
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between (x1, y1, z1) and (x2, y2, z2)."""
    dx = x2 - x1
    dy = y2 - y1
    dz = z2 - z1
    return math.sqrt(dx * dx + dy * dy + dz * dz)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def find_distance(coordMut, coordAnnot):
    """Distance between a mutation coordinate and an annotation coordinate.

    Parameters
    ----------
    coordMut, coordAnnot : sequence of 3 str/float, or NaN
        [x, y, z] coordinate triples as produced by the structure parsers.

    Returns
    -------
    str or float
        The distance formatted to two decimals ("%.2f"), 'nan' if either
        triple cannot be parsed, or np.NaN when coordMut is missing.
    """
    # BUG FIX: the original guard was `coordMut != np.NaN`, which is always
    # True (NaN never compares equal to anything), so the NaN branch was
    # dead. An explicit float-NaN check restores the intended behavior.
    if isinstance(coordMut, float) and math.isnan(coordMut):
        return np.NaN
    try:
        dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
                        float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]))
        return "%.2f" % dist
    except (ValueError, TypeError, IndexError, KeyError):
        # Original used a bare `except:` followed by a no-op `ValueError`
        # expression; narrowed to the parse/indexing failures it meant.
        return 'nan'
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Three-letter -> one-letter amino-acid codes.
# NOTE(review): 'ASX' maps to 'O' here (the standard one-letter code is 'B');
# preserved as-is because downstream code may rely on it.
_THREE_TO_ONE = {
    'ALA': 'A', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E', 'PRO': 'P',
    'LEU': 'L', 'GLY': 'G', 'ASN': 'N', 'SER': 'S', 'GLN': 'Q',
    'THR': 'T', 'MET': 'M', 'LYS': 'K', 'ASP': 'D', 'ILE': 'I',
    'PHE': 'F', 'TRP': 'W', 'TYR': 'Y', 'HIS': 'H', 'CYS': 'C',
    'UNK': 'X', 'ASX': 'O',
}


def threeToOne(variant):
    """Convert a three-letter residue code to its one-letter code.

    Unrecognized codes are returned unchanged, matching the original
    40-branch if/elif chain, which fell through without modifying
    ``variant`` for unknown inputs.
    """
    return _THREE_TO_ONE.get(variant, variant)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
    """Map a 1-based sequence position onto structure coordinates.

    Walks the sequence/structure alignment to translate an annotation
    position (``annot``, counted on the ungapped sequence line) into the
    index of the corresponding CA atom, then returns that atom's
    coordinates and residue number.

    Parameters
    ----------
    annot : position on the sequence (numeric or numeric string; may be 'nan'
        in mode 2).
    alignments : mode 1: a 3-tuple-like of (alignment iterable, coordinates,
        residue numbers); mode 2: a single alignment object.
    coords, resnums_for_sasa : mode 2 only — CA coordinates and residue
        numbers parallel to the structure sequence (unused in mode 1).
    mode : 1 = PDB/SwissModel/Modbase path, 2 = AlphaFold path.

    Returns
    -------
    (coordWeWant, posAtom, residue_number_we_want), or
    (np.NaN, np.NaN, np.NaN) in mode 2 when annot is 'nan' or > 1400.
    """
    if mode == 1:
        for alignment in alignments[0]:
            # Alignment rendered as 3 text lines: [0]=target seq, [1]=match
            # line, [2]=structure seq; gaps are '.' or '-'.
            alignment = (str(alignment).strip().split('\n'))
            startGap = 0
            # Count leading gaps on the sequence line.
            if alignment[0].startswith('.'):
                for k in alignment[0]:
                    if k == '.' or k == '-':
                        startGap += 1
                    else:
                        break
            countGap = startGap
            countResidue = 0
            # Advance until the annot-th residue of the sequence is reached;
            # countGap accumulates internal gaps passed along the way.
            for j in alignment[0][startGap:]:
                if j == '.' or j == '-':
                    countGap += 1
                else:
                    countResidue += 1
                if countResidue == float(annot):
                    break
            countGap_pdb = 0
            countResidue_pdb = 0
            # Gaps on the structure line up to the alignment column found above.
            for m in alignment[2][0:countResidue + countGap - 1]:
                if m == '.' or m == '-':
                    countGap_pdb += 1
            # 1-based index of the matching CA atom in the structure arrays.
            posAtom = countResidue + countGap - countGap_pdb

            # Leading gaps on the structure line (first aligned structure pos).
            realpdbStart = 0
            for j in alignment[2]:
                if j == '.' or j == '-':
                    realpdbStart += 1
                else:
                    break

            # Position must land on a structure residue and lie past the
            # structure's first aligned position.
            if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
                try:
                    coordinates = alignments[1]
                    residue_numbers = alignments[2]
                    coordWeWant = coordinates[posAtom - 1]
                    residue_number_we_want = residue_numbers[posAtom - 1]

                except:
                    # NOTE(review): bare except; the bare `IndexError` below is
                    # a no-op expression, not a filter. Also,
                    # residue_number_we_want is never assigned on this path or
                    # the else path, so the return below can raise NameError —
                    # confirm against callers.
                    IndexError
                    coordWeWant = 'nan'
            else:
                coordWeWant = 'nan'
            return coordWeWant, posAtom, residue_number_we_want
    if mode == 2:
        if annot != 'nan':
            # AlphaFold models are capped; positions beyond 1400 are skipped.
            if int(annot) <= 1400:
                alignment = (str(alignments).strip().split('\n'))
                startGap = 0
                # Count leading gaps on the sequence line.
                if alignment[0].startswith('.'):
                    for k in alignment[0]:
                        if k == '.' or k == '-':
                            startGap += 1
                        else:
                            break
                countGap = startGap
                countResidue = 0
                # Advance to the annot-th sequence residue (as in mode 1).
                for j in alignment[0][startGap:]:
                    if j == '.' or j == '-':
                        countGap += 1
                    else:
                        countResidue += 1
                    if countResidue == float(annot):
                        break
                countGap_pdb = 0
                countResidue_pdb = 0
                for m in alignment[2][0:countResidue + countGap - 1]:
                    if m == '.' or m == '-':
                        countGap_pdb += 1
                posAtom = countResidue + countGap - countGap_pdb
                realpdbStart = 0
                for j in alignment[2]:
                    if j == '.' or j == '-':
                        realpdbStart += 1
                    else:
                        break
                # Guard against columns past the end of the structure line.
                if len(alignment[2]) > (countResidue + countGap - 1):
                    if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
                        try:
                            coordinates = coords
                            residue_numbers = resnums_for_sasa
                            coordWeWant = coordinates[posAtom - 1]
                            residue_number_we_want = residue_numbers[posAtom - 1]
                        except:
                            # NOTE(review): bare except; `IndexError` below is a
                            # no-op expression, not a filter.
                            IndexError
                            coordWeWant = 'nan'
                            residue_number_we_want = 'nan'
                    else:
                        coordWeWant = 'nan'
                        residue_number_we_want = 'nan'
                    return coordWeWant, posAtom, residue_number_we_want
                else:
                    coordWeWant = 'nan'
                    residue_number_we_want = 'nan'
                    return coordWeWant, posAtom, residue_number_we_want
            else:
                return np.NaN, np.NaN, np.NaN
        else:
            return np.NaN, np.NaN, np.NaN
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode, path_3D_alignment, file_format='gzip'):
    """Align a UniProt-derived sequence against a structure's CA-atom sequence.

    Parses CA ATOM records from the structure file, performs a local
    BLOSUM62 alignment (open gap -11, extend -1) against ``pdbSequence``,
    and writes all alignments to a text file under ``path_3D_alignment``.

    Parameters
    ----------
    identifier : datapoint identifier used in the output file name.
    model_num : model number (used in the mode-2 output file name).
    pdb_path : path to the structure file (plain text or gzip).
    pdbSequence : sequence to align the structure against.
    source : structure provider; 'MODBASE' files use a wider record field.
    chain : chain identifier to filter on (mode 1).
    pdbID : structure identifier (used in the mode-1 output file name).
    mode : 1 = PDB/SwissModel/Modbase, 2 = AlphaFold.
    path_3D_alignment : directory for the alignment output file.
    file_format : 'gzip' (default) or 'txt' — mode 2 only.

    Returns
    -------
    (alignments, coords, resnums_for_sasa) for mode 1 and mode 2;
    None for any other mode (as in the original).
    """
    atomSequence = ''
    coords = []
    resnums_for_sasa = []

    def _is_ca(line):
        # True for a CA ATOM record (standard column layout).
        return line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA'

    def _record(line):
        # Accumulate one-letter residue, xyz coordinates, and residue number.
        nonlocal atomSequence
        atomSequence += threeToOne(line[17:20].strip())
        coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
        resnums_for_sasa.append(line[22:26].strip())

    def _align_and_write(out_path):
        # Local BLOSUM62 alignment; write every alignment to out_path.
        aligner.mode = 'local'
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
        aligner.open_gap_score = -11
        aligner.extend_gap_score = -1
        alignments = list(aligner.align(pdbSequence, atomSequence))
        # BUG FIX: the output handle was never closed in the original.
        with open(out_path, "w") as fout:
            for alignment in alignments:
                fout.write(str(alignment))
                fout.write('\n')
                fout.write('\n')
        return alignments

    if mode == 1:
        with open(pdb_path, encoding="utf8") as fin:
            for line in fin:
                if source != 'MODBASE':
                    # Match the requested chain, or a blank chain column.
                    if _is_ca(line) and (line[21].upper() == chain.upper() or line[21] == ' '):
                        _record(line)
                else:
                    # MODBASE records use a wider record-name field.
                    if line[0:7].strip() == 'ATOM' and line[13:15].strip() == 'CA':
                        _record(line)
        out_path = Path(path_3D_alignment / f'{identifier}_{pdbID}_{str(chain)}_alignment.txt')
        alignments = _align_and_write(out_path)
        return alignments, coords, resnums_for_sasa
    elif mode == 2:
        if file_format == 'txt':
            # BUG FIX: the original opened the undefined name `name` here
            # (NameError); the path argument is pdb_path.
            with open(pdb_path, encoding="utf8") as fin:
                for line in fin:
                    if _is_ca(line):
                        _record(line)
        elif file_format == 'gzip':
            with gzip.open(pdb_path, mode='rb') as fin:
                for raw in fin:
                    line = raw.decode()
                    if _is_ca(line):
                        _record(line)
        out_path = Path(path_3D_alignment / f'{identifier}_{str(model_num)}_3Dalignment.txt')
        alignments = _align_and_write(out_path)
        return alignments, coords, resnums_for_sasa
|
code/add_alignment.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Bio import Align
|
| 2 |
+
from Bio.Align import substitution_matrices
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import streamlit as st
|
| 5 |
+
from Bio.pairwise2 import format_alignment
|
| 6 |
+
from Bio import pairwise2
|
| 7 |
+
from Bio import pairwise2
|
| 8 |
+
from Bio.SubsMat import MatrixInfo as matlist
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
"""
|
| 13 |
+
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
|
| 14 |
+
aligner = Align.PairwiseAligner()
|
| 15 |
+
#print(f'Aligning Datapoint: {identifier}')
|
| 16 |
+
if len(pdbSequence) >= 1:
|
| 17 |
+
f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
|
| 18 |
+
aligner.mode = 'local'
|
| 19 |
+
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
|
| 20 |
+
aligner.open_gap_score = -11
|
| 21 |
+
aligner.extend_gap_score = -1
|
| 22 |
+
alignments = aligner.align(uniprotSequence, pdbSequence)
|
| 23 |
+
alignments = (list(alignments))
|
| 24 |
+
|
| 25 |
+
merge_in_threes = str(alignments[0]).split('\n')
|
| 26 |
+
K = 3
|
| 27 |
+
res = ["".join(str(alignments[0]).split('\n')[idx: idx + K]) for idx in range(len(str(alignments[0]).split('\n')) - K + 1)]
|
| 28 |
+
slice_val = slice(0,len(res),4)
|
| 29 |
+
writtenlist = res[slice_val]
|
| 30 |
+
|
| 31 |
+
new_alignment = []
|
| 32 |
+
for i in writtenlist:
|
| 33 |
+
cont1 = list(filter(None, i.split('target')))
|
| 34 |
+
cont2 = cont1[0].split('query')
|
| 35 |
+
target_pos = (list(filter(None,cont2[0].split(' '))))[0]
|
| 36 |
+
target = (list(filter(None,cont2[0].split(' '))))[1]
|
| 37 |
+
alg_pos = (list(filter(None,cont2[0].split(' '))))[2]
|
| 38 |
+
alg = (list(filter(None,cont2[0].split(' '))))[3]
|
| 39 |
+
query_pos = (list(filter(None,cont2[1].split(' '))))[0]
|
| 40 |
+
query = (list(filter(None,cont2[1].split(' '))))[1]
|
| 41 |
+
if int(target_pos)>0:
|
| 42 |
+
new_target = int(target_pos) * 'X' + target
|
| 43 |
+
else:
|
| 44 |
+
new_target = int(target_pos) * ' ' + target
|
| 45 |
+
|
| 46 |
+
if int(alg_pos)>0:
|
| 47 |
+
new_alg = int(target_pos) * 'X' + target
|
| 48 |
+
else:
|
| 49 |
+
new_alg = int(target_pos) * ' ' + alg
|
| 50 |
+
|
| 51 |
+
if int(query_pos)>0:
|
| 52 |
+
new_query = int(target_pos) * 'X' + target
|
| 53 |
+
else:
|
| 54 |
+
new_query = int(target_pos) * ' ' + target
|
| 55 |
+
|
| 56 |
+
new_alignment.append(new_target+'\n' +new_alg +'\n' +new_query)
|
| 57 |
+
alignment_list = []
|
| 58 |
+
k = 0
|
| 59 |
+
for alignment in new_alignment:
|
| 60 |
+
k += 1
|
| 61 |
+
st.write('COUNT', k)
|
| 62 |
+
st.write('alignment')
|
| 63 |
+
st.write(alignment)
|
| 64 |
+
f.write(str(alignment))
|
| 65 |
+
f.write('\n')
|
| 66 |
+
f.write('\n')
|
| 67 |
+
alignment = (str(alignment).strip().split('\n'))
|
| 68 |
+
alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
|
| 69 |
+
st.write('alignment_updated')
|
| 70 |
+
st.write(alignment)
|
| 71 |
+
alignment_list.append(alignment)
|
| 72 |
+
return alignment_list
|
| 73 |
+
|
| 74 |
+
"""
|
| 75 |
+
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
    """Locally align a UniProt sequence with a structure sequence.

    Runs a local pairwise alignment (BLOSUM62, open gap -11, extend -1),
    writes every alignment to ``{identifier}_alignment.txt`` under
    ``alignment_path``, and returns each alignment as a list of its text
    lines with spaces replaced by '.'.

    Returns None (implicitly) when ``pdbSequence`` is empty, matching the
    original behavior.
    """
    aligner = Align.PairwiseAligner()
    #print(f'Aligning Datapoint: {identifier}')
    if len(pdbSequence) >= 1:
        aligner.mode = 'local'
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
        aligner.open_gap_score = -11
        aligner.extend_gap_score = -1
        alignments = aligner.align(uniprotSequence, pdbSequence)

        # BUG FIX: removed an unused second alignment pass
        # (pairwise2.align.localds with MatrixInfo.blosum62) whose result
        # `alignments2` was never read — it doubled the work and depended on
        # Bio.SubsMat, which is removed in modern Biopython.

        alignment_list = []
        # BUG FIX: the output handle was never closed in the original.
        with open(Path(alignment_path / f'{identifier}_alignment.txt'), "w") as f:
            for alignment in alignments:
                f.write(str(alignment))
                f.write('\n')
                f.write('\n')
                alignment = (str(alignment).strip().split('\n'))
                # Normalize padding: '.' marks every space so downstream
                # column arithmetic treats padding and gaps uniformly.
                alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
                alignment_list.append(alignment)
        return alignment_list
|
| 101 |
+
|
| 102 |
+
def mutation_position_on_pdb(alignment_list, pos):
    """Map a mutation position on the UniProt sequence onto the PDB sequence.

    Each element of ``alignment_list`` is ``[uniprot_line, match_line, pdb_line]``
    where gaps are '.' or '-' and matches are '|' (or 'X' for mismatch columns).

    Args:
        alignment_list: alignments as produced by ``do_alignment``.
        pos: 1-based residue position on the UniProt sequence.

    Returns:
        Tuple ``(pdb_alignStatus, mutationPositionOnPDB, startGap, alignment)``
        where status is 'aligned' / 'aligned*' / 'not_aligned', the PDB
        position is a string (or 'nan'), startGap is the number of leading
        gaps on the UniProt line, and alignment is the one that was used.

    NOTE(review): if no alignment ever reaches a branch that sets
    ``pdb_alignStatus`` this still raises UnboundLocalError at the return,
    exactly as the original did — behavior deliberately preserved.
    """
    which_alignment_to_go = 0
    for alignment in alignment_list:
        which_alignment_to_go += 1
        alignment_uniprot = alignment[0]
        alignment_pdb = alignment[2]

        # Count leading gaps on the UniProt line.
        startGap = 0
        if alignment_uniprot.startswith('.') or alignment_uniprot.startswith('-'):
            for k in alignment_uniprot:
                if k == '.' or k == '-':
                    startGap += 1
                else:
                    break

        # Walk the UniProt line until the pos-th residue is reached, counting gaps.
        countGap = startGap
        countResidue = 0
        canonicalRes = ' '
        pdbRes = ' '
        for j in alignment_uniprot[startGap:]:
            if j == '.' or j == '-':
                countGap += 1
            else:
                countResidue += 1
            if int(countResidue) == int(pos):
                canonicalRes = alignment_uniprot[countResidue + countGap - 1]
                try:
                    pdbRes = alignment_pdb[countResidue + countGap - 1]
                except IndexError:
                    # fix: was a bare `except:` followed by a no-op `IndexError`
                    # expression; narrowed to the exception actually expected.
                    pdbRes = 'nan'
                break

        idx = countResidue + countGap - 1  # column index of the mutation in the alignment
        if (alignment[1][idx] == '|') or (alignment[1][idx] == 'X'):
            # Mutation column is aligned; '*' marks a residue mismatch.
            if canonicalRes == pdbRes:
                pdb_alignStatus = 'aligned'
            elif canonicalRes != pdbRes:
                pdb_alignStatus = 'aligned*'
            # Convert the alignment column into a 1-based PDB residue number
            # by subtracting the gaps on the PDB line before that column.
            countGap_pdb = 0
            pdbRes = ' '
            for j in alignment_pdb[0:idx]:
                if j == '.' or j == '-':
                    countGap_pdb += 1
            if alignment_pdb[idx] == '.' or alignment_pdb[idx] == '-':
                mutationPositionOnPDB = 'nan'
                posPDB = 'nan'
            else:
                posPDB = countResidue + countGap - countGap_pdb
                mutationPositionOnPDB = str(posPDB)
            break
        elif (canonicalRes == pdbRes) and ((alignment[1][idx] == '.') or
                                           (alignment[1][idx] == '-')):
            # fix: original read `alignment[1][poscountResidue + countGap - 1]`
            # (undefined name `poscountResidue`) — raised NameError whenever the
            # residues matched but the match-line character was '-'.
            pdb_alignStatus = 'not_aligned'
            mutationPositionOnPDB = 'nan'
        elif (canonicalRes != pdbRes) and ((alignment[1][idx] == '.') or
                                           (alignment[1][idx] == '-')):
            pdb_alignStatus = 'not_aligned'
            mutationPositionOnPDB = 'nan'
        elif alignment_pdb[idx] == '.' or alignment_pdb[idx] == '-':
            mutationPositionOnPDB = 'nan'
            posPDB = 'nan'
    return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def find_position_on_pdb_for_range_annotations(posAnnotation, startGap, alignment_to_use):
    """Map a range annotation 'start-end' (UniProt numbering) onto PDB numbering.

    ``posAnnotation`` is a string like '12-45'. ``alignment_to_use`` is
    ``[uniprot_line, match_line, pdb_line]`` with '.'/'-' as gap characters.

    Returns:
        ``(annotation_on_up_start, annotation_on_up_end,
           annotation_on_pdb_start, annotation_on_pdb_end)`` — the first two
        are alignment-column positions, the last two are PDB residue numbers
        or the string 'nan' when the position falls in a gap / out of range.

    NOTE(review): the two `except:` clauses below are bare excepts whose body
    is a no-op `IndexError` expression — they silently swallow *any* error
    (the trailing comment says this covers isoform/out-of-range lookups).
    Preserved as-is; several variables (e.g. ``count_gap_pdb``) can stay
    unbound if an exception fires early — TODO confirm callers never hit that.
    """
    annotation_on_pdb_start = 'nan'
    annotation_on_pdb_end = 'nan'
    # Locate the start position: walk the UniProt line counting gaps until
    # the pos1-th residue is reached.
    pos1 = int(posAnnotation.split('-')[0])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos1):  # count gaps until the first position
            break
    annotation_on_up_start = int(pos1) + int(count_gap)

    # Same walk for the end position of the range.
    pos2 = int(posAnnotation.split('-')[1])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos2):  # count gaps until the first position
            break

    annotation_on_up_end = int(pos2) + int(count_gap)
    try:
        pdb_residue_start = alignment_to_use[2][annotation_on_up_start - 1].strip()
        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
            # Start column is a gap on the PDB line: slide the start forward to
            # the first aligned ('|' or 'X') non-gap column inside the range.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '.') and \
                        (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '-') and \
                        ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                         (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                ((alignment_to_use[1][annotation_on_up_start - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_start - 1] == '-')):
            # PDB has a residue there but the column is not aligned: slide the
            # start forward to the first aligned column inside the range.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        # Convert the (possibly shifted) start column to a PDB residue number
        # by subtracting gaps on the PDB line before it.
        count_gap_pdb = 0
        if annotation_on_up_start != 'nan':
            for q in alignment_to_use[2][0:annotation_on_up_start - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            if alignment_to_use[1][annotation_on_up_start] == '-' or alignment_to_use[1][annotation_on_up_start] == '.':
                annotation_on_pdb_start = 'nan'
            else:
                annotation_on_pdb_start = int(annotation_on_up_start) - count_gap_pdb
        else:
            annotation_on_pdb_start = 'nan'
    except:
        IndexError
    try:
        # Mirror of the block above, for the end of the range.
        pdb_residue_end = alignment_to_use[2][annotation_on_up_end - 1].strip()
        if pdb_residue_end == '.' or pdb_residue_end == '-':
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        elif (pdb_residue_end != '.') and (pdb_residue_end != '-') and \
                ((alignment_to_use[1][annotation_on_up_end - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_end - 1] == '-')):
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        count_gap_pdb = 0
        if annotation_on_up_end != 'nan':
            for q in alignment_to_use[2][0:annotation_on_up_end - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            # NOTE(review): `or` binds looser than `and`, so the 'nan' check
            # below only guards the '.' comparison — preserved as written.
            if alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                    annotation_on_up_end - 1] == '.' and annotation_on_pdb_start == 'nan':
                annotation_on_pdb_end = 'nan'
            elif alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                    annotation_on_up_end - 1] == '.' and annotation_on_pdb_start != 'nan':
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
            else:
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
        else:
            annotation_on_pdb_end = 'nan'
    except:
        IndexError  # Say isoform 2 is matched with the length 100, but canonical is 150 aa long. If there is an annotation at 105. position, for the isoform it throws an index error.

    # If only the end mapped, derive the start from the last gap count; a
    # degenerate one-column range is discarded ('nan', 'nan').
    if annotation_on_pdb_start == 'nan' and annotation_on_pdb_end != 'nan':
        annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
        if annotation_on_pdb_start == annotation_on_pdb_end:
            annotation_on_pdb_start = 'nan'
            annotation_on_pdb_end = 'nan'
    return annotation_on_up_start, annotation_on_up_end, annotation_on_pdb_start, annotation_on_pdb_end
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifier):
    """Map a list of annotation positions (UniProt numbering) onto PDB numbering.

    ``annot_positions`` is a stringified list such as "['12', '30-45']";
    single positions and 'start-end' ranges are handled separately. Range
    entries are delegated to ``find_position_on_pdb_for_range_annotations``.

    Args:
        annot_positions: stringified position list, or 'nan'.
        startGap: leading-gap count from ``mutation_position_on_pdb``.
        alignment_to_use: ``[uniprot_line, match_line, pdb_line]``.
        identifier: unused in this function body; kept for interface
            compatibility with callers.

    Returns:
        List of mapped positions ('nan' entries filtered out); empty list
        when ``annot_positions`` is 'nan'.
    """
    newpos = []
    if annot_positions != 'nan':
        # Strip the list syntax: quotes and brackets, then split on commas.
        annot_positions = (str(annot_positions).replace("'", ''))
        annot_positions = (str(annot_positions).replace('[', ''))
        annot_positions = (str(annot_positions).replace("]", ''))
        positionList_perAnnotation = annot_positions.split(',')
        positionList_perAnnotation = [h.strip() for h in positionList_perAnnotation]

        position_start_on_pdb = 'nan'
        position_end_on_pdb = 'nan'
        try:
            positionList_perAnnotation = [i for i in positionList_perAnnotation if i != 'nan']
        except:
            # NOTE(review): bare except whose body is a no-op TypeError
            # expression — swallows everything. Preserved as-is.
            TypeError
        for position in range(len(positionList_perAnnotation)):
            if ('-' not in str(positionList_perAnnotation[position])) and (str(positionList_perAnnotation[position]) != '?') and (str(positionList_perAnnotation[position]) != '') and (len(str(positionList_perAnnotation[position])) != 0):
                # Single position: walk the UniProt line counting gaps until
                # the annotated residue is reached.
                count_gap = startGap
                count_residue = 0
                for j in alignment_to_use[0][startGap:]:
                    if j == '.' or j == '-':
                        count_gap += 1
                    else:
                        count_residue += 1
                    try:
                        if int(count_residue) == int(positionList_perAnnotation[position]):
                            break
                    except:
                        ValueError

                annotation_on_up = int(positionList_perAnnotation[position]) + int(count_gap)
                try:
                    pdb_residue_start = alignment_to_use[2][annotation_on_up - 1].strip()
                except:
                    IndexError
                    pdb_residue_start = 'nan'
                if pdb_residue_start != 'nan':
                    try:
                        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
                            # Gap on the PDB line: accept only if the column is
                            # non-gap and aligned ('|' or 'X').
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][ran] != '.') and \
                                        (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][
                                             ran] != '-') and \
                                        ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][
                                              ran] == '|') or
                                         (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][
                                              ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                                ((alignment_to_use[1][annotation_on_up - 1] == '.') or (
                                        alignment_to_use[1][annotation_on_up - 1] == '-')):
                            # Residue present but column unaligned: shift to the
                            # first aligned column (one-column window here).
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == '|') or
                                        (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        # Column index -> PDB residue number: subtract PDB-line
                        # gaps before the column.
                        count_gap_pdb = 0
                        for q in alignment_to_use[2][0:annotation_on_up - 1]:
                            if q == '.' or q == '-':
                                count_gap_pdb += 1
                        if alignment_to_use[1][annotation_on_up] == '-' or alignment_to_use[1][
                                annotation_on_up] == '.':
                            annotation_on_pdb = 'nan'
                        else:
                            annotation_on_pdb = int(annotation_on_up) - count_gap_pdb

                        if count_gap_pdb == annotation_on_up:
                            annotation_on_pdb = 'nan'
                        try:
                            if alignment_to_use[2][count_gap_pdb + annotation_on_pdb - 1] == '.' or alignment_to_use[2][
                                    count_gap_pdb + annotation_on_pdb - 1] == '-':
                                annotation_on_pdb = 'nan'
                        except:
                            IndexError
                            annotation_on_pdb = 'nan'
                    except:
                        IndexError
                        annotation_on_pdb = 'nan'

                    newpos.append(annotation_on_pdb)

            elif ('-' in str(positionList_perAnnotation[position])) and (
                    str(positionList_perAnnotation[position]) != '?') and (
                    str(positionList_perAnnotation[position]) != ' ') and (
                    len(str(positionList_perAnnotation[position])) != 0):
                # Range entry ('start-end'): delegate both endpoints to the
                # range mapper (called twice; indices 2 and 3 are the PDB ends).
                try:
                    position_start_on_pdb = \
                        find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
                                                                   startGap, alignment_to_use)[2]
                    position_end_on_pdb = \
                        find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
                                                                   startGap, alignment_to_use)[3]
                except:
                    ValueError
                newpositions = str(position_start_on_pdb) + '-' + str(position_end_on_pdb)
                newpos.append(newpositions)
            else:
                pass
        try:
            newpos = [i for i in newpos if i != 'nan']
        except:
            TypeError
    return newpos
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def final_stage(df, annotation_list, alignment_path):
    """Align each row's UniProt/PDB sequence pair and remap positions.

    For every row: runs ``do_alignment``, maps the mutation position onto the
    PDB sequence, remaps every annotation column in ``annotation_list`` and
    the domain boundaries (``domStart``/``domEnd``) onto PDB numbering.

    Args:
        df: dataframe with uniprotID/pdbID/chain/sequences/pos/dom* columns.
        annotation_list: names of annotation columns to remap in place.
        alignment_path: directory passed through to ``do_alignment``.

    Returns:
        The same dataframe, mutated in place and cast to str.
    """
    for i in df.index:
        identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
        alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)
        # fix: mutation_position_on_pdb was invoked four times per row (once
        # per tuple element) — each call re-walks the alignment. Call once and
        # unpack. Also removed a stray debug `print()`.
        align_status, mut_pos_on_pdb, startGap, alignment_to_use = \
            mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])
        df.at[i, 'pdb_alignStatus'] = align_status
        df.at[i, 'mutationPositionOnPDB'] = mut_pos_on_pdb

        for annot in annotation_list:
            df.at[i, annot] = annotation_pos_on_pdb(df.at[i, annot], startGap, alignment_to_use, identifier)

        # Domain boundaries: only remap real domains (-1/-1.0 mark "no domain").
        if str(df.at[i, 'domStart']) != 'nan' and str(df.at[i, 'domEnd']) != 'nan' and \
                ((str(df.at[i, 'domStart']) != '-1' and str(df.at[i, 'domEnd']) != '-1' and
                  str(df.at[i, 'domStart']) != '-1.0' and str(df.at[i, 'domEnd']) != '-1.0')):
            # Drop any '.0' float suffix before building the 'start-end' string.
            domainLoc = str(df.at[i, 'domStart']).split('.')[0] + '-' + str(df.at[i, 'domEnd']).split('.')[0]
            domain_pos = find_position_on_pdb_for_range_annotations(domainLoc, startGap, alignment_to_use)
            df.at[i, 'domainStartonPDB'] = domain_pos[2]
            df.at[i, 'domainEndonPDB'] = domain_pos[3]
        elif str(df.at[i, 'domStart']) != '-1' or str(df.at[i, 'domEnd']) != '-1' or \
                str(df.at[i, 'domStart']) != '-1.0' or str(df.at[i, 'domEnd']) != '-1.0':
            df.at[i, 'domainStartonPDB'] = 'nan'
            df.at[i, 'domainEndonPDB'] = 'nan'

    df = df.astype(str)
    return df
|
| 418 |
+
|
| 419 |
+
def alignment(dataframe_to_align, annotation_list, alignment_path):
    """Public entry point: remap annotations/domains onto PDB numbering.

    Thin wrapper around ``final_stage`` kept for interface stability.
    fix: removed the unused local ``domainList = ['domStart', 'domEnd']``.
    """
    result = final_stage(dataframe_to_align, annotation_list, alignment_path)
    return result
|
| 423 |
+
#
|
code/add_annotations.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ssl
|
| 2 |
+
import requests as r
|
| 3 |
+
from decimal import *
|
| 4 |
+
import numpy as np
|
| 5 |
+
def add_annotations(dataframe):
    """Download UniProt feature (FT) annotations and attach them per row.

    For every distinct ``uniprotID`` the flat-text entry is fetched from
    uniprot.org and FT lines for 30 feature types are parsed into position
    strings; positions are then normalized and per-type binary columns
    ('<name>Binary' = '1'/'0'/NaN) are derived against the row's ``pos``.

    NOTE(review): fetches over plain http with SSL verification globally
    disabled (``_create_unverified_context``), and the legacy
    ``/uniprot/<id>.txt`` endpoint is used — both worth revisiting.

    Returns:
        The dataframe (cast to str) with annotation and binary columns added.
    """
    print('Downloading UniProt sequence annotations...\n')
    # Disables certificate verification process-wide for the downloads below.
    ssl._create_default_https_context = ssl._create_unverified_context

    # UniProt FT keys, and the friendlier column names they are renamed to
    # (the two lists are index-aligned).
    original_annot_name = ['DISULFID', 'INIT_MET', 'INTRAMEM', 'VARIANT', 'DNA_BIND', 'ACT_SITE', 'NP_BIND', 'LIPID',
                           'SITE',
                           'TRANSMEM', 'CROSSLNK', 'MUTAGEN', 'STRAND', 'HELIX', 'TURN', 'METAL', 'REPEAT', 'TOPO_DOM',
                           'CA_BIND', 'BINDING', 'REGION', 'SIGNAL', 'MOD_RES', 'ZN_FING', 'MOTIF', 'COILED', 'PEPTIDE',
                           'TRANSIT', 'CARBOHYD', 'PROPEP']
    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
                       'region',
                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                       'transitPeptide', 'glycosylation', 'propeptide']

    dataframe = dataframe.reset_index().drop(['index'], axis=1)

    # Pre-create one (empty) column per feature type.
    for annot in original_annot_name:
        dataframe[annot] = ''

    for protein in list(set(dataframe.uniprotID.to_list())):
        print('Downloading annotations for ' + protein)
        uniprot_entry = r.get("http://www.uniprot.org/uniprot/" + protein + ".txt")
        uniprot_entry = uniprot_entry.text.split('\n')

        # Keep only primary FT lines (skip evidence/ECO/note continuation lines);
        # each entry becomes [feature_key, position_spec, ...].
        annot_for_protein = []
        for annotation in original_annot_name:
            for line in uniprot_entry:
                if annotation.strip() in line and line.startswith(
                        'FT') and 'evidence' not in line and 'ECO' not in line and 'note' not in line:
                    annot_for_protein.append(list(filter(None, line.split(' ')))[1:])
        # Accumulate '; '-separated position specs per feature column for all
        # rows of this protein.
        for select in annot_for_protein:
            if select[0] not in dataframe.columns:
                dataframe.loc[dataframe.uniprotID == protein, select[0]] = str((select[1] + '; '))
            else:
                dataframe.loc[dataframe.uniprotID == protein, select[0]] += str((select[1] + '; '))
    for i in range(len(original_annot_name)):
        dataframe = dataframe.rename(columns={original_annot_name[i]: annotation_list[i]})

    # Fix annotation positions
    print('Processing positions...\n')
    for i in dataframe.index:
        # The 30 annotation columns were appended last, hence columns[-30:].
        for annot in dataframe.columns[-30:]:
            if annot != 'disulfide':
                if dataframe.at[i, annot] != 'nan':
                    # Split the accumulated '; ' string into a clean list and
                    # normalize UniProt's '..' range separator to '-'.
                    dataframe.at[i, annot] = ([x for x in [k.strip() for k in dataframe.at[i, annot].split(';')] if x])
                    if '..' not in str(dataframe.at[i, annot]):
                        pass
                    elif '..' in str(dataframe.at[i, annot]):
                        dataframe.at[i, annot] = str(dataframe.at[i, annot]).replace('..', '-')
            else:
                # Disulfide bonds: flatten 'a..b' pairs into individual positions.
                disulfide_annot = []  # NOTE(review): never used below.
                if dataframe.at[i, annot] != 'nan':
                    dataframe.at[i, annot] = dataframe.at[i, annot].split(';')
                    # NOTE(review): the comprehension variable `i` shadows the
                    # row index here; harmless in Py3 comprehension scope.
                    dataframe.at[i, annot] = [i.split('..') for i in dataframe.at[i, annot]]
                    dataframe.at[i, annot] = [e for v in dataframe.at[i, annot] for e in v]
                    dataframe.at[i, annot] = [i for i in dataframe.at[i, annot] if i != ' ']

    # Add binary annotations
    print('Adding binary annotations...\n')
    dataframe = dataframe.astype('str')
    for i in dataframe.index:
        for k in annotation_list:  # get the positions of each attribute as a list
            txt = k + 'Binary'
            dataframe.at[i, txt] = Decimal('nan')
            try:
                for positions in dataframe.at[i, k].split(','):
                    position = positions.strip('[').strip(']').replace("'", "")
                    # '1' if the mutation sits on an annotated position, '0' if
                    # annotations exist elsewhere, NaN if none parseable.
                    if position != 'nan' and position != '' and '-' not in position and int(
                            dataframe.at[i, 'pos']) == int(position):
                        dataframe.at[i, txt] = '1'
                        break
                    elif position != 'nan' and position != '' and '-' not in position and int(
                            dataframe.at[i, 'pos']) != int(position):
                        dataframe.at[i, txt] = '0'
                    elif position != 'nan' and position != '' and '-' in position:
                        # Strict '<' on both sides: range boundaries count as '0'.
                        if int(position.split('-')[0]) < int(dataframe.at[i, 'pos']) < int(position.split('-')[1]):
                            dataframe.at[i, txt] = '1'
                            break
                        else:
                            dataframe.at[i, txt] = '0'
            except:
                # NOTE(review): bare except with a no-op ValueError expression;
                # unparseable specs leave the binary column at NaN.
                ValueError

    # Final corrections

    # Cells that are literally "['?']" or "[]" become 'nan'.
    dataframe = dataframe.replace({'[\'?\']': 'nan'})
    dataframe = dataframe.replace({'[]': 'nan'})
    return dataframe
|
| 95 |
+
|
code/add_domains.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
def add_domains(data, path_to_domains):
    """Attach the best-matching protein domain to each data point.

    Merges a space-delimited domain table (proteinID, domain, domStart,
    domEnd, ...) onto ``data`` by uniprotID, computes each row's distance
    from the mutation position to the domain, then keeps exactly one domain
    per datapoint: a domain containing the mutation (distance 0) wins, ties
    broken by how often that domain occurs; otherwise the closest domain.

    Args:
        data: dataframe with 'uniprotID', 'pos' and 'datapoint' columns.
        path_to_domains: path to the space-delimited domain file.

    Returns:
        Deduplicated dataframe (one row per datapoint), cast to str.

    NOTE(review): rows without a domain get the *string* 'nan' in 'distance'
    while others get ints — the later sort_values on that mixed column is
    assumed to work for the data seen in practice; TODO confirm.
    """
    domains = pd.read_csv(path_to_domains, delimiter=' ')
    data = data.merge(domains, right_on='proteinID', left_on='uniprotID', how='left')
    data = data.drop(['proteinID'], axis=1)
    # Label each data point as range or notRange based on the relative distance of mutation and domain boundaries.
    data = data.astype('str')
    data.domStart = data.domStart.astype('float')
    data.domEnd = data.domEnd.astype('float')

    for i in data.index:
        if data.at[i, 'domain'] != 'nan':
            # distance 0 == mutation lies inside the domain boundaries.
            if int(data.at[i, 'domStart']) <= int(data.at[i, 'pos']) <= int(data.at[i, 'domEnd']):
                data.at[i, 'distance'] = 0
            else:
                # Otherwise: distance to the nearer boundary.
                distance = min(abs(int(data.at[i, 'domStart']) - int(data.at[i, 'pos'])),
                               abs(int(data.at[i, 'domEnd']) - int(data.at[i, 'pos'])))
                data.at[i, 'distance'] = int(distance)
        else:
            data.at[i, 'distance'] = 'nan'

    data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)  # Distances will be sorted.

    # Keep the one with the least distance. But we may have more than one range domains for a datapoint if distance = 0.
    # For this reason first we need to separate range ones so that when we take the first occurance to get the closest one
    # for non range ones, other distance=0 ones wont disappear.

    data_range = data[data.distance == 0]
    data_out_range = data[data.distance != 0]

    # For the range ones, find the most occurance

    dom = []
    for i in data_range.index:
        dom.append(data_range.at[i, 'domain'])

    domainCount = Counter(dom)  # Occurance of domains.

    # For out of range ones, take the closest distance.
    data_out_range = data_out_range.drop_duplicates(['datapoint'], keep='first')  # Already sorted above.
    # Rank in-range candidates by how common their domain is overall and keep
    # the most frequent one per datapoint.
    domain_counts = pd.DataFrame(domainCount.items(), columns=['domain', 'count'])
    data_range_counts = data_range.merge(domain_counts, on='domain')
    data_range_counts = data_range_counts.sort_values(['datapoint', 'count'])
    data_range_counts = data_range_counts.drop_duplicates(['datapoint'], keep='last')  # Take with the higher count.
    data_range_counts = data_range_counts.drop(['count'], axis=1)

    # Merge them back together

    frames = [data_range_counts, data_out_range]
    data = pd.concat(frames, sort=False)  # Here when you concat two data frames, we might have range and not range with
    # min distance for the same data point. Delete the one coming from notRange one.
    data = data.sort_values(['datapoint', 'distance']).reset_index(drop=True)
    data = data.drop_duplicates(['datapoint'], keep='first')
    data = data.astype(str)
    return data
|
code/add_interface_pos.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def get_interface_positions(dataframe, column1, column2):
    """Collect interface residue positions per protein from an IRES table.

    For each row, the proteins named in ``column1`` and ``column2`` have
    bracketed interface-residue strings in the matching ``*_IRES`` columns,
    e.g. "[1,2,10-14]". Strings for the same protein are concatenated, then
    parsed: single numbers become ints and 'a-b' spans are expanded to every
    position in the inclusive range.

    Returns:
        Dict mapping protein identifier -> list of interface positions
        (single positions first, then the expanded ranges).
    """
    interface_positions = {}

    # Accumulate the raw bracketed strings, merging repeats of a protein by
    # splicing the lists together ("...]"+",..." -> one bracketed list).
    for row in dataframe.index:
        for col in (column1, column2):
            protein = dataframe.at[row, col]
            ires = dataframe.at[row, str(col + '_IRES')]
            if ires == '[]':
                continue  # empty interface list: nothing to record
            if protein not in interface_positions:
                interface_positions[protein] = ires
            else:
                interface_positions[protein] = interface_positions[protein].strip(']') + ',' + ires.strip('[')

    # Parse each accumulated string into concrete integer positions.
    try:
        for key, value in interface_positions.items():
            singles = []
            expanded = []
            if value != '[]':
                tokens = value.split(',')
                tokens[0] = str(tokens[0]).strip('[')
                tokens[-1] = str(tokens[-1]).strip(']')
                for token in tokens:
                    if '-' in token:
                        low, high = token.split('-')[0], token.split('-')[1]
                        for position in range(int(low), int(high) + 1):
                            expanded.append(position)
                    else:
                        singles.append(int(token))
                interface_positions[key] = singles + expanded
    except:
        # Preserved from the original: a bare except whose body merely names
        # ValueError (no-op) — any parse failure is silently ignored.
        ValueError

    return interface_positions
|
code/add_sasa.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import glob
|
| 2 |
+
import ssbio.utils
|
| 3 |
+
import subprocess
|
| 4 |
+
import ssbio
|
| 5 |
+
import os.path as op
|
| 6 |
+
from add_3Dalignment import *
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import gzip
|
| 10 |
+
import shutil
|
| 11 |
+
import streamlit as st
|
| 12 |
+
|
| 13 |
+
def run_freesasa(infile, outfile, include_hetatms=True, outdir=None, force_rerun=False, file_type='gzip'):
    """Run the external ``freesasa`` tool on a PDB (or gzipped PDB) file.

    Gzipped input is decompressed to a temporary ``file_temp.pdb`` first,
    since freesasa reads plain PDB. The RSA-format result is written to
    ``outfile`` (joined onto ``outdir`` when given); existing results are
    kept unless ``force_rerun`` is set (via ``ssbio.utils.force_rerun``).

    Args:
        infile: input structure path (.pdb or .pdb.gz, per ``file_type``).
        outfile: output file name.
        include_hetatms: pass --hetatm to freesasa.
        outdir: optional directory prefix for ``outfile``.
        force_rerun: recompute even if ``outfile`` exists.
        file_type: 'pdb' or 'gzip'; any other value is a silent no-op
            (original behavior kept).

    Returns:
        The (joined) output file path.
    """
    if not outdir:
        outdir = ''
    outfile = op.join(outdir, outfile)

    used_temp = False
    if file_type == 'gzip':
        # Decompress to a scratch copy freesasa can read.
        with gzip.open(infile, 'rb') as f_in:
            with open('file_temp.pdb', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        infile = 'file_temp.pdb'
        used_temp = True

    # fix: the pdb and gzip branches duplicated the exact same invocation
    # code — unified here.
    if file_type in ('pdb', 'gzip'):
        if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
            if include_hetatms:
                shell_command = 'freesasa --format=rsa --hetatm {} -o {}'.format(infile, outfile)
            else:
                shell_command = 'freesasa --format=rsa {} -o {}'.format(infile, outfile)
            # NOTE(review): shell=True with interpolated paths is
            # injection-prone if paths can contain shell metacharacters;
            # kept to preserve behavior, but a list argv would be safer.
            command = subprocess.Popen(shell_command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       shell=True)
            out, err = command.communicate()

    # fix: the temporary decompressed copy was never cleaned up and leaked
    # into the working directory on every gzip call.
    if used_temp and op.exists('file_temp.pdb'):
        os.remove('file_temp.pdb')

    return outfile
|
| 49 |
+
|
| 50 |
+
def calculate_freesasa(ID, model_num, existing_free_sasa, path_to_input, path_to_output_files, file_type='gzip'):
    """Compute the FreeSASA surface area for one AlphaFold model.

    Builds the AlphaFold file name ``AF-<ID>-F<model_num>-...`` and hands it
    to ``run_freesasa``, writing the result under
    ``<path_to_output_files>/freesasa_files/``. IDs already present in
    ``existing_free_sasa`` are skipped.
    """
    print('Calculating surface area...\n')
    # Derive the model-version tag (e.g. 'model_v1') from the first structure
    # file found in the input directory; raises IndexError if none exist,
    # matching the original behavior.
    first_structure = glob.glob(str(Path(path_to_input / '*')))[0]
    version_tag = first_structure.split('-')[-1].split('.')[0]

    if ID in existing_free_sasa:
        return  # already computed — nothing to do

    if file_type == 'gzip':
        structure_name = f'AF-{ID}-F{model_num}-{version_tag}.pdb.gz'
    elif file_type == 'pdb':
        structure_name = f'AF-{ID}-F{model_num}-model_v1.pdb'
    else:
        return  # unknown file_type: silent no-op, as before

    run_freesasa(Path(path_to_input / structure_name),
                 Path(path_to_output_files / f'freesasa_files/{structure_name}.txt'),
                 include_hetatms=True, outdir=None, force_rerun=False)
|
| 66 |
+
|
| 67 |
+
def sasa(source, pdbID, uniprotID, sasa_pos, wt, mode, path_to_output_files, file_type='gzip'):
    """Look up the solvent-accessible surface area for one residue position.

    Scans the FreeSASA ``.rsa``-style output files under
    ``path_to_output_files/freesasa_files`` for the file matching the given
    PDB/model identifier (``mode == 1``) or UniProt accession (``mode == 2``),
    then reads the SASA value at ``sasa_pos`` from the fixed-width columns.

    Returns the SASA value as a string; the value is suffixed with ``'*'``
    when the residue found at that position does not match the expected
    wild-type amino acid ``wt``, and ``'nan'`` when nothing could be matched.
    """
    sasa = 'nan'
    if mode == 1:
        for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
            if source == 'PDB':
                fname = str(filename).split('.')[0].split('/')[-1].upper()
            elif source == 'MODBASE':
                fname = str(filename).split('.')[0].split('/')[-1]
            elif source == 'SWISSSMODEL':
                fname = str(filename).split('_')[2]
            if pdbID == fname:
                # Fixed-width parse: columns 10-13 hold the residue number,
                # 4-7 the three-letter residue name, 22-28 the SASA value.
                with open(filename, 'r') as fh:
                    for k in fh.readlines():
                        if k.strip()[10:13] == sasa_pos:
                            residue = str(k[4:7].strip())
                            if wt == threeToOne(residue):
                                return str(k[22:28]).strip('\n')
                            # Residue mismatch: flag the value with '*'.
                            return str(k[22:28]).strip('\n') + '*'
            else:
                # NOTE(review): this bails out on the FIRST file whose name
                # does not match, so later files are never inspected.
                # Preserved as-is from the original (marked '#######' there);
                # confirm whether 'continue' was intended.
                return 'nan'
        return sasa

    if mode == 2:
        # NaN is the only value unequal to itself; the original compared
        # `sasa_pos != np.NaN`, which is always True even for NaN.
        if sasa_pos == sasa_pos:
            if file_type == 'pdb':
                for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
                    # Fix: the original called .split('/') on a *list*
                    # (AttributeError); mirror the working gzip branch.
                    fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].upper()
                    if uniprotID == fname:
                        with open(filename, 'r') as fh:
                            for k in fh.readlines():
                                if k.strip()[10:13] == sasa_pos:
                                    residue = str(k[4:7].strip())
                                    if wt == threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n')
                                    else:
                                        sasa = str(k[22:28]).strip('\n') + '*'
                        return sasa
            elif file_type == 'gzip':
                for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
                    # AlphaFold names look like AF-<accession>-F<n>-...; field 1
                    # after splitting on '-' is the UniProt accession.
                    fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].split('-')[1].upper()
                    if uniprotID == fname:
                        with open(filename, 'r') as fh:
                            for k in fh.readlines():
                                if str(k.strip()[10:13]) == str(sasa_pos):
                                    residue = str(k[4:7].strip())
                                    if wt == threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n')
                                    else:
                                        sasa = str(k[22:28]).strip('\n') + '*'
                                # Fix: the original reset sasa to 'nan' on every
                                # non-matching line, clobbering a value already
                                # found on an earlier line of the file.
                        return sasa
        else:
            sasa = 'nan'
            return sasa
    # Fallback (e.g. no file matched, or an unrecognised mode/file_type):
    # return 'nan' instead of implicitly returning None.
    return sasa
|
code/add_sequence.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests as r
|
| 2 |
+
from io import StringIO
|
| 3 |
+
from Bio import SeqIO
|
| 4 |
+
import xml.etree.ElementTree as ET
|
| 5 |
+
|
| 6 |
+
def get_uniprot_seq(protein_id):
    """Fetch the canonical FASTA sequence for a UniProt accession.

    Downloads ``<accession>.fasta`` from uniprot.org and parses it with
    Biopython.

    Parameters
    ----------
    protein_id : str
        UniProt accession.

    Returns
    -------
    str
        The protein sequence, or an empty string when no FASTA record was
        returned (unknown or obsolete accession).
    """
    print('Fetching UniProt Sequences for ID: ', protein_id)
    baseUrl = "http://www.uniprot.org/uniprot/"
    currentUrl = baseUrl + protein_id + ".fasta"
    response = r.post(currentUrl)
    cData = ''.join(response.text)
    Seq = StringIO(cData)
    pSeq = list(SeqIO.parse(Seq, 'fasta'))
    try:
        return str(pSeq[0].seq)
    except IndexError:
        # Fix: the original used a bare `except:` with `IndexError` as a
        # no-op expression, silently swallowing *every* exception. Only the
        # empty-record case is expected here.
        return str('')
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_isoforms(protein_id):
    """Fetch all isoform sequences for a UniProt accession from the EBI API.

    Parameters
    ----------
    protein_id : str
        UniProt accession.

    Returns
    -------
    dict
        Mapping of isoform accession -> sequence; empty when the entry has
        no isoforms or the service response could not be parsed.
    """
    print('Fetching UniProt Isoforms for ID: ', protein_id)
    try:
        # isoform accession -> sequence
        isoforms = dict()
        req = r.get('https://www.ebi.ac.uk/proteins/api/proteins/{}/isoforms'.format(protein_id))
        # Parse the returned UniProt XML.
        uniprot = ET.fromstring(req.text)
        for isoform in uniprot:
            seq = isoform.find('{http://uniprot.org/uniprot}sequence')
            iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
            if seq.text and iso_accession.text:
                isoforms[iso_accession.text] = seq.text
        return isoforms
    except (AttributeError, ET.ParseError):
        # Fix: the original used a bare `except:` with `AttributeError` as a
        # no-op expression. AttributeError covers entries without isoform
        # elements; ParseError covers a non-XML (e.g. 404/error) response.
        return {}
|
code/add_structure.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import time
|
| 3 |
+
import json
|
| 4 |
+
import zlib
|
| 5 |
+
from xml.etree import ElementTree
|
| 6 |
+
from urllib.parse import urlparse, parse_qs, urlencode
|
| 7 |
+
import requests
|
| 8 |
+
from requests.adapters import HTTPAdapter, Retry
|
| 9 |
+
from unipressed import IdMappingClient
|
| 10 |
+
|
| 11 |
+
## Code adapted from UniProt documentation.
|
| 12 |
+
def get_pdb_ids_2(protein_id):
    """Map a UniProt accession to PDB identifiers via the UniProt REST API.

    Adapted from the official UniProt ID-mapping documentation: submits an
    asynchronous mapping job (UniProtKB -> PDB), polls until it completes,
    then pages through and merges the results.

    Parameters
    ----------
    protein_id : str
        UniProt accession to map.

    Returns
    -------
    list of str
        PDB identifiers mapped from the accession; empty list when the job
        finished without producing any results.
    """
    POLLING_INTERVAL = 5
    API_URL = "https://rest.uniprot.org"

    # Retry transient server-side errors instead of failing immediately.
    retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retries))

    def check_response(response):
        # Print the server's JSON error payload before re-raising.
        try:
            response.raise_for_status()
        except requests.HTTPError:
            print(response.json())
            raise

    def submit_id_mapping(from_db, to_db, ids):
        # Start the mapping job and return its job id.
        request = requests.post(
            f"{API_URL}/idmapping/run",
            data={"from": from_db, "to": to_db, "ids": ids},
        )
        check_response(request)
        return request.json()["jobId"]

    def get_next_link(headers):
        # UniProt paginates via an RFC 5988 Link header with rel="next".
        re_next_link = re.compile(r'<(.+)>; rel="next"')
        if "Link" in headers:
            match = re_next_link.match(headers["Link"])
            if match:
                return match.group(1)

    def check_id_mapping_results_ready(job_id):
        # Poll until the job leaves the RUNNING state; any other explicit
        # status is treated as a failure.
        while True:
            request = session.get(f"{API_URL}/idmapping/status/{job_id}")
            check_response(request)
            j = request.json()
            if "jobStatus" in j:
                if j["jobStatus"] == "RUNNING":
                    print(f"Retrying in {POLLING_INTERVAL}s")
                    time.sleep(POLLING_INTERVAL)
                else:
                    raise Exception(j["jobStatus"])
            else:
                return bool(j["results"] or j["failedIds"])

    def get_batch(batch_response, file_format, compressed):
        # Yield each subsequent page of results.
        batch_url = get_next_link(batch_response.headers)
        while batch_url:
            batch_response = session.get(batch_url)
            batch_response.raise_for_status()
            yield decode_results(batch_response, file_format, compressed)
            batch_url = get_next_link(batch_response.headers)

    def combine_batches(all_results, batch_results, file_format):
        # Merge a page into the accumulated results, per output format.
        if file_format == "json":
            for key in ("results", "failedIds"):
                if key in batch_results and batch_results[key]:
                    all_results[key] += batch_results[key]
        elif file_format == "tsv":
            # Drop the repeated header row of each page.
            return all_results + batch_results[1:]
        else:
            return all_results + batch_results
        return all_results

    def get_id_mapping_results_link(job_id):
        url = f"{API_URL}/idmapping/details/{job_id}"
        request = session.get(url)
        check_response(request)
        return request.json()["redirectURL"]

    def decode_results(response, file_format, compressed):
        # Decode a result payload, optionally gzip-compressed, into the
        # representation appropriate for the requested format.
        if compressed:
            decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
            if file_format == "json":
                return json.loads(decompressed.decode("utf-8"))
            elif file_format == "tsv":
                return [line for line in decompressed.decode("utf-8").split("\n") if line]
            elif file_format == "xlsx":
                return [decompressed]
            elif file_format == "xml":
                return [decompressed.decode("utf-8")]
            else:
                return decompressed.decode("utf-8")
        elif file_format == "json":
            return response.json()
        elif file_format == "tsv":
            return [line for line in response.text.split("\n") if line]
        elif file_format == "xlsx":
            return [response.content]
        elif file_format == "xml":
            return [response.text]
        return response.text

    def get_xml_namespace(element):
        m = re.match(r"\{(.*)\}", element.tag)
        return m.groups()[0] if m else ""

    def merge_xml_results(xml_results):
        # Fold every page's <entry> elements into the first page's document.
        merged_root = ElementTree.fromstring(xml_results[0])
        for result in xml_results[1:]:
            root = ElementTree.fromstring(result)
            for child in root.findall("{http://uniprot.org/uniprot}entry"):
                merged_root.insert(-1, child)
        ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
        return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)

    def get_id_mapping_results_search(url):
        # Fetch the first page, then follow pagination links and merge.
        parsed = urlparse(url)
        query = parse_qs(parsed.query)
        file_format = query["format"][0] if "format" in query else "json"
        if "size" in query:
            size = int(query["size"][0])
        else:
            size = 500
        query["size"] = size
        compressed = (
            query["compressed"][0].lower() == "true" if "compressed" in query else False
        )
        parsed = parsed._replace(query=urlencode(query, doseq=True))
        url = parsed.geturl()
        request = session.get(url)
        check_response(request)
        results = decode_results(request, file_format, compressed)
        for batch in get_batch(request, file_format, compressed):
            results = combine_batches(results, batch, file_format)
        if file_format == "xml":
            return merge_xml_results(results)
        return results

    job_id = submit_id_mapping(
        from_db="UniProtKB_AC-ID", to_db="PDB", ids=protein_id
    )
    if check_id_mapping_results_ready(job_id):
        link = get_id_mapping_results_link(job_id)
        results = get_id_mapping_results_search(link)
        # Equivalently using the stream endpoint which is more demanding
        # on the API and so is less stable:
        # results = get_id_mapping_results_stream(link)
        return [i['to'] for i in results['results']]
    # Fix: the original implicitly returned None when the job completed with
    # no results; callers iterate the return value, so hand back a list.
    return []
|
| 155 |
+
def get_pdb_ids(protein_id):
    """Map a UniProt accession to PDB IDs, with a REST fallback.

    Tries the ``unipressed`` ID-mapping client first; on an HTTP error or a
    malformed response, falls back to :func:`get_pdb_ids_2`, which talks to
    the UniProt REST API directly.

    Parameters
    ----------
    protein_id : str
        UniProt accession.

    Returns
    -------
    list of str
        PDB identifiers mapped from the accession.
    """
    try:
        request = IdMappingClient.submit(
            source="UniProtKB_AC-ID", dest="PDB", ids={protein_id})
        # Give the asynchronous mapping job a moment to finish.
        time.sleep(2.0)
        pdb_list = list(request.each_result())
        return [i['to'] for i in pdb_list]
    except requests.exceptions.HTTPError:
        # Fix: the original called the fallback but discarded its result,
        # so the function returned None on this path.
        return get_pdb_ids_2(protein_id)
    except KeyError:
        return get_pdb_ids_2(protein_id)
|
| 167 |
+
|
| 168 |
+
|
code/alphafold_featureVector.py
ADDED
|
@@ -0,0 +1,579 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IMPORT NECESSARY MODULES AND LIBRARIES
|
| 2 |
+
from timeit import default_timer as timer
|
| 3 |
+
import xml.etree.ElementTree as ET
|
| 4 |
+
from collections import Counter
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from io import StringIO
|
| 7 |
+
from decimal import *
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import requests as r
|
| 10 |
+
import os.path as op
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import subprocess
|
| 13 |
+
import argparse
|
| 14 |
+
import ssbio.utils
|
| 15 |
+
import warnings
|
| 16 |
+
import sys
|
| 17 |
+
import pathlib
|
| 18 |
+
import os, glob
|
| 19 |
+
import math
|
| 20 |
+
import ssbio
|
| 21 |
+
import ssl
|
| 22 |
+
import gzip
|
| 23 |
+
import ast
|
| 24 |
+
import itertools
|
| 25 |
+
|
| 26 |
+
from Bio.Align import substitution_matrices
|
| 27 |
+
from Bio.PDB.Polypeptide import *
|
| 28 |
+
from Bio.PDB import PDBList
|
| 29 |
+
from Bio import Align
|
| 30 |
+
from Bio import SeqIO
|
| 31 |
+
from Bio.PDB import *
|
| 32 |
+
import numpy as np
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# FUNCTIONS
|
| 38 |
+
from calc_pc_property import *
|
| 39 |
+
from add_domains import *
|
| 40 |
+
from add_annotations import *
|
| 41 |
+
from add_structure import *
|
| 42 |
+
from add_alignment import *
|
| 43 |
+
from manage_files import *
|
| 44 |
+
from add_3Dalignment import *
|
| 45 |
+
from add_sasa import *
|
| 46 |
+
from standard import *
|
| 47 |
+
from add_interface_pos import *
|
| 48 |
+
from standard import *
|
| 49 |
+
from uniprotSequenceMatch import uniprotSequenceMatch
|
| 50 |
+
from process_input import clean_data
|
| 51 |
+
from alphafold_model import *
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def alphafold(input_set, mode, impute):
|
| 55 |
+
start = timer()
|
| 56 |
+
# Necessary lists
|
| 57 |
+
annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 58 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
|
| 59 |
+
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
|
| 60 |
+
'region',
|
| 61 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
|
| 62 |
+
'transitPeptide', 'glycosylation', 'propeptide']
|
| 63 |
+
|
| 64 |
+
change_names = {'Disulfide bond': 'disulfide', 'Initiator methionine': 'intMet',
|
| 65 |
+
'Natural variant': 'naturalVariant',
|
| 66 |
+
'DNA binding': 'dnaBinding',
|
| 67 |
+
'Active site': 'activeSite', 'Nucleotide binding': 'nucleotideBinding', 'Lipidation': 'lipidation',
|
| 68 |
+
'Site': 'site', 'Transmembrane': 'transmembrane', 'Cross-link': 'crosslink',
|
| 69 |
+
'Mutagenesis': 'mutagenesis', 'Beta strand': 'strand', 'Helix': 'helix', 'Turn': 'turn',
|
| 70 |
+
'Metal binding': 'metalBinding', 'Repeat': 'repeat',
|
| 71 |
+
'Topological domain': 'topologicalDomain', 'Calcium binding': 'caBinding',
|
| 72 |
+
'Binding site': 'bindingSite',
|
| 73 |
+
'Region': 'region', 'Signal peptide': 'signalPeptide', 'Modified residue': 'modifiedResidue',
|
| 74 |
+
'Zinc finger': 'zincFinger', 'Motif': 'motif', 'Coiled coil': 'coiledCoil', 'Peptide': 'peptide',
|
| 75 |
+
'Transit peptide': 'transitPeptide', 'Glycosylation': 'glycosylation', 'Propeptide': 'propeptide',
|
| 76 |
+
'Intramembrane': 'intramembrane'}
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
## Standardizing input
|
| 80 |
+
data = clean_data(input_set)
|
| 81 |
+
|
| 82 |
+
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, alphafold_path, alphafold_summary= manage_files(mode)
|
| 83 |
+
out_path = path_to_output_files / 'log.txt'
|
| 84 |
+
sys.stdout = open(out_path, 'w')
|
| 85 |
+
print('Creating directories...')
|
| 86 |
+
file_base = str(Path(alphafold_path / '*'))
|
| 87 |
+
file_str = glob.glob(file_base)[0].split('-')[-1].split('.')[0]
|
| 88 |
+
## Physicochemical properties
|
| 89 |
+
print('Adding physicochemical properties...\n')
|
| 90 |
+
data = add_physicochemical(data)
|
| 91 |
+
|
| 92 |
+
## Domains
|
| 93 |
+
print('Adding domains\n')
|
| 94 |
+
data = add_domains(data, path_to_domains)
|
| 95 |
+
|
| 96 |
+
## Processing data frame
|
| 97 |
+
data = data.astype(str)
|
| 98 |
+
data = data.replace({'NaN': np.NaN, 'nan': np.NaN})
|
| 99 |
+
data.domain = data.domain.replace({np.NaN: '-1'}) # Fill -1 if NaN - standardization.
|
| 100 |
+
data.domStart = data.domStart.replace({np.NaN: '-1'})
|
| 101 |
+
data.domEnd = data.domEnd.replace({np.NaN: '-1'})
|
| 102 |
+
data.distance = data.distance.replace({np.NaN: '-1'})
|
| 103 |
+
fisherResult = pd.read_csv(fisher_path, sep='\t')
|
| 104 |
+
significant_domains = fisherResult.domain.to_list()
|
| 105 |
+
|
| 106 |
+
data = data.reset_index()
|
| 107 |
+
data = data.drop(columns=['index'])
|
| 108 |
+
|
| 109 |
+
## not_match_in_uniprot : Data points not matched to UniProt sequence
|
| 110 |
+
## uniprot_matched: Data points matched to UniProt sequence. Proceed with this data frame
|
| 111 |
+
## canonical_fasta : Dataframe including canonical sequence for the protein of interest. Obtained from UniProt.
|
| 112 |
+
## isoform_fasta: Dataframe including isoform sequences for the protein of interest. Obtained from UniProt.
|
| 113 |
+
not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta = uniprotSequenceMatch(data)
|
| 114 |
+
|
| 115 |
+
not_match_in_uniprot = not_match_in_uniprot.reset_index().drop(['index'], axis=1)
|
| 116 |
+
|
| 117 |
+
for key in change_names.keys():
|
| 118 |
+
not_match_in_uniprot[key] = ''
|
| 119 |
+
not_match_in_uniprot = not_match_in_uniprot.rename(columns=change_names)
|
| 120 |
+
uniprot_matched = add_annotations(uniprot_matched)
|
| 121 |
+
|
| 122 |
+
for w in uniprot_matched.index:
|
| 123 |
+
for q in annotation_list:
|
| 124 |
+
per_protein = []
|
| 125 |
+
if uniprot_matched.at[w, q] != 'nan':
|
| 126 |
+
fix = ast.literal_eval(uniprot_matched.at[w, q])
|
| 127 |
+
for z in fix:
|
| 128 |
+
if '-' in z:
|
| 129 |
+
per_protein += np.arange(int(z.split('-')[0]), int(z.split('-')[1])+1,1).tolist()
|
| 130 |
+
else:
|
| 131 |
+
try:
|
| 132 |
+
per_protein.append(int(z))
|
| 133 |
+
except:
|
| 134 |
+
ValueError
|
| 135 |
+
uniprot_matched.at[w, q] = per_protein
|
| 136 |
+
else:
|
| 137 |
+
uniprot_matched.at[w, q] = 'nan'
|
| 138 |
+
uniprot_matched = uniprot_matched.rename(columns=change_names)
|
| 139 |
+
uniprot_matched['wt_sequence_match'] = uniprot_matched['wt_sequence_match'].astype(str)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Avoiding downloading files for SASA calculation if already downloaded.
|
| 143 |
+
|
| 144 |
+
existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
| 145 |
+
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
| 146 |
+
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
| 147 |
+
## Decide if the wild type amino acid is on canonical or isoform sequence. Selected sequence will be used for the
|
| 148 |
+
## sequence alignment.
|
| 149 |
+
for i in uniprot_matched.index:
|
| 150 |
+
if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
|
| 151 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 152 |
+
can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
|
| 153 |
+
if wt == can:
|
| 154 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
|
| 155 |
+
elif wt != can:
|
| 156 |
+
isoList = isoform_fasta[
|
| 157 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 158 |
+
for k in isoList:
|
| 159 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 160 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 161 |
+
if wt == resInIso:
|
| 162 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 163 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 164 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 165 |
+
break
|
| 166 |
+
|
| 167 |
+
elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
|
| 168 |
+
isoList = isoform_fasta[
|
| 169 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 170 |
+
for k in isoList:
|
| 171 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 172 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 173 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 174 |
+
if wt == resInIso:
|
| 175 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 176 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 177 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 178 |
+
break
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
|
| 183 |
+
for annot in ['Domain', 'Alternative sequence', 'Chain', 'Sequence conflict', 'Compositional bias']:
|
| 184 |
+
try:
|
| 185 |
+
uniprot_matched = uniprot_matched.drop(columns=annot)
|
| 186 |
+
except:
|
| 187 |
+
KeyError
|
| 188 |
+
|
| 189 |
+
print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
|
| 190 |
+
% (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
|
| 191 |
+
len(uniprot_matched.drop_duplicates(['datapoint']))))
|
| 192 |
+
|
| 193 |
+
## Adding interface residue information.
|
| 194 |
+
|
| 195 |
+
data_interface = pd.read_csv(path_to_interfaces, sep='\t')
|
| 196 |
+
interface_positions = get_interface_positions(data_interface, 'P1', 'P2')
|
| 197 |
+
|
| 198 |
+
interface_dataframe = pd.DataFrame()
|
| 199 |
+
for key, val in interface_positions.items():
|
| 200 |
+
k = pd.Series((key, str(list(set(val)))))
|
| 201 |
+
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
| 202 |
+
interface_dataframe.columns = ['uniprotID', 'interface_positions']
|
| 203 |
+
|
| 204 |
+
uniprot_matched = uniprot_matched.merge(interface_dataframe, on='uniprotID', how='left')
|
| 205 |
+
uniprot_matched.interface_positions = uniprot_matched.interface_positions.astype('str')
|
| 206 |
+
|
| 207 |
+
## PDB info file is pre-generated for time concerns. Includes summary data of AlphaFold structures.
|
| 208 |
+
## With new updates, can be updated separately.
|
| 209 |
+
|
| 210 |
+
pdb_info = pd.read_csv(alphafold_summary, sep='\t')
|
| 211 |
+
|
| 212 |
+
## Keeping how many models each AlphaFold structure has.
|
| 213 |
+
model_count = modelCount(alphafold_path)
|
| 214 |
+
for k, v in model_count.items():
|
| 215 |
+
model_count[k] = int(v / 2) # two types of files for each file.
|
| 216 |
+
uniprot_matched = uniprot_matched.astype(str)
|
| 217 |
+
uniprot_matched.domStart = uniprot_matched.domStart.astype(float)
|
| 218 |
+
uniprot_matched.domEnd = uniprot_matched.domEnd.astype(float)
|
| 219 |
+
uniprot_matched.domStart = uniprot_matched.domStart.astype(int)
|
| 220 |
+
uniprot_matched.domEnd = uniprot_matched.domEnd.astype(int)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
## Main part to add annotation information, align sequences, finding distances
|
| 225 |
+
|
| 226 |
+
for i in uniprot_matched.index:
|
| 227 |
+
print('Processing', i, 'of', len(uniprot_matched))
|
| 228 |
+
if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
|
| 229 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 230 |
+
can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
|
| 231 |
+
## Information about whether the mutation is found on the canonical or isoform sequence.
|
| 232 |
+
|
| 233 |
+
if wt == can:
|
| 234 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
|
| 235 |
+
elif wt != can:
|
| 236 |
+
isoList = isoform_fasta[
|
| 237 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 238 |
+
for k in isoList:
|
| 239 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 240 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 241 |
+
if wt == resInIso:
|
| 242 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 243 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 244 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 245 |
+
break
|
| 246 |
+
elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
|
| 247 |
+
isoList = isoform_fasta[
|
| 248 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 249 |
+
for k in isoList:
|
| 250 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 251 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 252 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 253 |
+
if wt == resInIso:
|
| 254 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 255 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 256 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 257 |
+
break
|
| 258 |
+
uniprotID = uniprot_matched.at[i, 'uniprotID']
|
| 259 |
+
datapoint = uniprot_matched.at[i, 'datapoint']
|
| 260 |
+
|
| 261 |
+
for k in annotation_list:
|
| 262 |
+
txt = k + 'Binary'
|
| 263 |
+
|
| 264 |
+
if (str(uniprot_matched.at[i, txt]) == '0') or (str(uniprot_matched.at[i, txt]) == '0.0'):
|
| 265 |
+
uniprot_matched.at[i, txt] = '1'
|
| 266 |
+
elif (str(uniprot_matched.at[i, txt]).lower() == 'nan') | (str(uniprot_matched.at[i, txt]) == np.NaN) :
|
| 267 |
+
uniprot_matched.at[i, txt] = '0'
|
| 268 |
+
elif (str(uniprot_matched.at[i, txt]) == '1') or (str(uniprot_matched.at[i, txt]) == '1.0'):
|
| 269 |
+
uniprot_matched.at[i, txt] = '2'
|
| 270 |
+
## Search in all models.
|
| 271 |
+
models_for_protein = [val for key, val in model_count.items() if
|
| 272 |
+
uniprotID in key.split(';')] # We have this many models for the protein.
|
| 273 |
+
which_model_mutation = which_model(
|
| 274 |
+
int(uniprot_matched.at[i, 'pos'])) # List of models in which the mutation can be found.
|
| 275 |
+
models_for_all_annotations = {}
|
| 276 |
+
for annot in annotation_list:
|
| 277 |
+
if len(uniprot_matched.at[i, annot]) != 0 and type(uniprot_matched.at[i, annot]) != list:
|
| 278 |
+
uniprot_matched.at[i, annot] = list(
|
| 279 |
+
map(str.strip, uniprot_matched.at[i, annot].strip('][').replace('"', '').split(',')))
|
| 280 |
+
models_for_annotations = {} # Recording which position is found in which model file.
|
| 281 |
+
for annot_position in uniprot_matched.at[i, annot]:
|
| 282 |
+
if annot_position != 'nan' and annot_position != '':
|
| 283 |
+
models_for_that_position = which_model(int(annot_position))
|
| 284 |
+
else:
|
| 285 |
+
models_for_that_position = {}
|
| 286 |
+
for key, val in models_for_that_position.items():
|
| 287 |
+
if key not in models_for_annotations.keys():
|
| 288 |
+
models_for_annotations[key] = [val]
|
| 289 |
+
else:
|
| 290 |
+
models_for_annotations[key] += [val]
|
| 291 |
+
models_for_all_annotations[annot] = models_for_annotations
|
| 292 |
+
new_dict = {}
|
| 293 |
+
for key, val in models_for_all_annotations.items():
|
| 294 |
+
subdict = {k: v for k, v in val.items() if k in which_model_mutation}
|
| 295 |
+
subdict = dict(sorted(subdict.items()))
|
| 296 |
+
new_dict[key] = subdict
|
| 297 |
+
new_dict = reduce_model_dict(new_dict)
|
| 298 |
+
models_we_need = list(set(itertools.chain.from_iterable(
|
| 299 |
+
[list(ov.keys()) for ok, ov in new_dict.items()]))) # Read models with these numbers
|
| 300 |
+
info_per_model = {} # her bir datapoint için baştan yazılıyor.
|
| 301 |
+
dist_of_annots = {}
|
| 302 |
+
all_domain_distances = []
|
| 303 |
+
|
| 304 |
+
for mod in models_we_need:
|
| 305 |
+
print('---------PRINTING FOR MODEL--------', mod)
|
| 306 |
+
dist_of_annots[str(mod)] = {}
|
| 307 |
+
info_per_model[mod] = {}
|
| 308 |
+
info_per_model[mod]['datapoint'] = datapoint
|
| 309 |
+
identifier = uniprot_matched.at[i, 'uniprotSequence']
|
| 310 |
+
try:
|
| 311 |
+
pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) & (
|
| 312 |
+
pdb_info.model_num == mod)].sequence.item()
|
| 313 |
+
except:
|
| 314 |
+
ValueError
|
| 315 |
+
pdbSequence = 'nan'
|
| 316 |
+
if pdbSequence != 'nan': # The number in models we need might not be present for that protein. Preventng error.
|
| 317 |
+
pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) & (pdb_info.model_num == mod)].sequence.item()
|
| 318 |
+
alignment_list = do_alignment(uniprot_matched.at[i, 'datapoint'], uniprot_matched.at[i, 'uniprotSequence'],
|
| 319 |
+
pdbSequence, Path(path_to_output_files / 'alignment_files'))
|
| 320 |
+
pdb_alignStatus = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[0]
|
| 321 |
+
info_per_model[mod]['pdb_alignStatus'] = pdb_alignStatus
|
| 322 |
+
mutationPositionOnPDB = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[1]
|
| 323 |
+
info_per_model[mod]['mutationPositionOnPDB'] = mutationPositionOnPDB
|
| 324 |
+
startGap = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[2]
|
| 325 |
+
info_per_model[mod]['startGap'] = startGap
|
| 326 |
+
alignment_to_use = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[3]
|
| 327 |
+
for annot in annotation_list:
|
| 328 |
+
if new_dict[annot] == {}:
|
| 329 |
+
annotation_pos_on_pdb_ = []
|
| 330 |
+
else:
|
| 331 |
+
try:
|
| 332 |
+
annotation_pos_on_pdb_ = annotation_pos_on_pdb(new_dict[annot][mod], startGap, alignment_to_use,
|
| 333 |
+
identifier)
|
| 334 |
+
except:
|
| 335 |
+
KeyError
|
| 336 |
+
info_per_model[mod][annot] = annotation_pos_on_pdb_
|
| 337 |
+
|
| 338 |
+
pdb_path = Path(f'{alphafold_path}/AF-{uniprotID}-F{mod}-{file_str}.pdb.gz')
|
| 339 |
+
|
| 340 |
+
if get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
|
| 341 |
+
'gzip') != None:
|
| 342 |
+
|
| 343 |
+
alignments, coords, resnums_for_sasa = get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan',
|
| 344 |
+
'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
|
| 345 |
+
'gzip')
|
| 346 |
+
alignments = alignments[0]
|
| 347 |
+
|
| 348 |
+
calculate_freesasa(uniprotID, mod, existing_free_sasa, alphafold_path, path_to_output_files)
|
| 349 |
+
if (mutationPositionOnPDB != 'nan'):
|
| 350 |
+
if (int(mutationPositionOnPDB) <= 1400):
|
| 351 |
+
try:
|
| 352 |
+
coordMut = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[0]
|
| 353 |
+
except:
|
| 354 |
+
ValueError
|
| 355 |
+
coordMut = 'nan'
|
| 356 |
+
else:
|
| 357 |
+
coordMut = np.NaN
|
| 358 |
+
|
| 359 |
+
sasa_pos = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[2]
|
| 360 |
+
sasa_val = sasa('alphafold', 'nan', uniprotID, sasa_pos, uniprot_matched.at[i, 'wt'], mode,
|
| 361 |
+
path_to_output_files, file_type='gzip')
|
| 362 |
+
|
| 363 |
+
if sasa_val != None:
|
| 364 |
+
uniprot_matched.at[i, 'sasa'] = sasa_val
|
| 365 |
+
else:
|
| 366 |
+
coordMut = 'nan'
|
| 367 |
+
sasa_val = 'nan'
|
| 368 |
+
uniprot_matched.at[i, 'sasa'] = sasa_val
|
| 369 |
+
|
| 370 |
+
domainPositionOnPDB_list = list(
|
| 371 |
+
range(int(uniprot_matched.at[i, 'domStart']), int(uniprot_matched.at[i, 'domEnd'])))
|
| 372 |
+
domain_distances = []
|
| 373 |
+
if len(domainPositionOnPDB_list) != 0:
|
| 374 |
+
for domain_ in domainPositionOnPDB_list:
|
| 375 |
+
coordDomain = get_coords(domain_, alignments, coords, resnums_for_sasa, mode)[0]
|
| 376 |
+
distance_dom = float(find_distance(coordMut,
|
| 377 |
+
coordDomain)) # bu bir anotasyonun bir modeldeki bir tane pozisyonu için.
|
| 378 |
+
domain_distances.append(distance_dom)
|
| 379 |
+
minimum_domain = min(domain_distances) # minimum for one model.
|
| 380 |
+
else:
|
| 381 |
+
minimum_domain = np.NaN
|
| 382 |
+
all_domain_distances.append(minimum_domain)
|
| 383 |
+
list_dist_of_annots = []
|
| 384 |
+
for key, val in info_per_model.items():
|
| 385 |
+
modNum = key
|
| 386 |
+
min_annots = {} # Write from scratch for each annotation.
|
| 387 |
+
|
| 388 |
+
if modNum == mod:
|
| 389 |
+
for label, annotPos in val.items(): # For each annotation type, calculate all distances of the annot positions.
|
| 390 |
+
if label in annotation_list:
|
| 391 |
+
all_annot_distance_per_model = [] # All distances of an annoation in hat model
|
| 392 |
+
for annot_position in annotPos:
|
| 393 |
+
if (annot_position != 'nan'):
|
| 394 |
+
if (int(annot_position) <= 1400):
|
| 395 |
+
coordAnnot = \
|
| 396 |
+
get_coords(annot_position, alignments, coords, resnums_for_sasa, mode)[
|
| 397 |
+
0]
|
| 398 |
+
distance = float(find_distance(coordMut,
|
| 399 |
+
coordAnnot)) # bu bir anotasyonun bir modeldeki bir tane pozisyonu için.
|
| 400 |
+
all_annot_distance_per_model.append(distance)
|
| 401 |
+
if all_annot_distance_per_model != []:
|
| 402 |
+
all_annot_distance_per_model = [float(i) for i in all_annot_distance_per_model]
|
| 403 |
+
try:
|
| 404 |
+
minimum_position = float(min(all_annot_distance_per_model))
|
| 405 |
+
except:
|
| 406 |
+
ValueError
|
| 407 |
+
minimum_position = 'nan'
|
| 408 |
+
min_annots[label] = float(
|
| 409 |
+
minimum_position) # Minimum of the annotation in this model.
|
| 410 |
+
if min_annots != {}:
|
| 411 |
+
list_dist_of_annots.append(min_annots)
|
| 412 |
+
dist_of_annots[str(
|
| 413 |
+
mod)] = list_dist_of_annots # Getting minimum of all possible models
|
| 414 |
+
# uniprot_matched.at[i, annotation_type] = minimum_position
|
| 415 |
+
else:
|
| 416 |
+
print('Model File Not Found')
|
| 417 |
+
|
| 418 |
+
uniprot_matched.at[i, 'sasa'] = np.NaN
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
if len(all_domain_distances) != 0:
|
| 422 |
+
uniprot_matched.at[i, 'domaindistance3D'] = min(all_domain_distances)
|
| 423 |
+
else:
|
| 424 |
+
uniprot_matched.at[i, 'domaindistance3D'] = np.NaN
|
| 425 |
+
dist_of_annots_min_of_all = {}
|
| 426 |
+
flat = [item for sublist in list(dist_of_annots.values()) for item in sublist]
|
| 427 |
+
for f in flat:
|
| 428 |
+
for key, val in f.items():
|
| 429 |
+
if key not in dist_of_annots_min_of_all.keys():
|
| 430 |
+
dist_of_annots_min_of_all[key] = val
|
| 431 |
+
elif (key in dist_of_annots_min_of_all.keys()) & (float(dist_of_annots_min_of_all[key]) > float(val)):
|
| 432 |
+
dist_of_annots_min_of_all[key] = val
|
| 433 |
+
key_list = []
|
| 434 |
+
for key, val in dist_of_annots_min_of_all.items():
|
| 435 |
+
uniprot_matched.at[i, key] = val
|
| 436 |
+
key_list.append(key)
|
| 437 |
+
remaining = list(set(annotation_list) - set(key_list))
|
| 438 |
+
|
| 439 |
+
for rem in remaining:
|
| 440 |
+
uniprot_matched.at[i, rem] = ''
|
| 441 |
+
uniprot_matched.at[i, 'distances'] = [dist_of_annots]
|
| 442 |
+
|
| 443 |
+
if (uniprot_matched.at[i, 'sasa'] != None) & (uniprot_matched.at[i, 'sasa'] != np.NaN) & (
|
| 444 |
+
str(uniprot_matched.at[i, 'sasa']) != 'nan'):
|
| 445 |
+
if '*' in uniprot_matched.at[i, 'sasa']:
|
| 446 |
+
uniprot_matched.at[i, 'sasa'] = uniprot_matched.at[i, 'sasa'].split('*')[0]
|
| 447 |
+
try:
|
| 448 |
+
uniprot_matched.at[i, 'sasa'] = float(uniprot_matched.at[i, 'sasa'].strip())
|
| 449 |
+
except:
|
| 450 |
+
TypeError
|
| 451 |
+
|
| 452 |
+
if float(uniprot_matched.at[i, 'sasa']) < 5:
|
| 453 |
+
uniprot_matched.at[i, 'trsh4'] = 'core'
|
| 454 |
+
elif float(uniprot_matched.at[i, 'sasa']) >= 5:
|
| 455 |
+
uniprot_matched.at[i, 'trsh4'] = 'surface'
|
| 456 |
+
elif str(uniprot_matched.at[i, 'sasa']) == 'nan':
|
| 457 |
+
uniprot_matched.at[i, 'trsh4'] = 'nan'
|
| 458 |
+
else:
|
| 459 |
+
uniprot_matched.at[i, 'trsh4'] = 'nan'
|
| 460 |
+
if (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 461 |
+
i, 'trsh4'] == 'surface':
|
| 462 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'interface'
|
| 463 |
+
elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 464 |
+
i, 'trsh4'] == 'surface':
|
| 465 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'surface'
|
| 466 |
+
elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 467 |
+
i, 'trsh4'] == 'core':
|
| 468 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'core'
|
| 469 |
+
elif (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 470 |
+
i, 'trsh4'] == 'core':
|
| 471 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'conflict'
|
| 472 |
+
elif uniprot_matched.at[i, 'trsh4'] == 'nan':
|
| 473 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'nan'
|
| 474 |
+
if uniprot_matched.at[i, 'domain'] in significant_domains:
|
| 475 |
+
uniprot_matched.at[i, 'domain_fisher'] = uniprot_matched.at[i, 'domain']
|
| 476 |
+
else:
|
| 477 |
+
uniprot_matched.at[i, 'domain_fisher'] = 'NULL'
|
| 478 |
+
uniprot_matched = uniprot_matched.round(2)
|
| 479 |
+
uniprot_matched = uniprot_matched.astype(str)
|
| 480 |
+
|
| 481 |
+
uniprot_matched[ 'domain'] = uniprot_matched['domain'].replace({'-1': 'NULL'})
|
| 482 |
+
uniprot_matched = uniprot_matched.drop_duplicates()
|
| 483 |
+
uniprot_matched.rename(
|
| 484 |
+
columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
|
| 485 |
+
'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
|
| 486 |
+
'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
|
| 487 |
+
'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
|
| 488 |
+
'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
|
| 489 |
+
'intramembraneBinary': 'intramembrane_bin',
|
| 490 |
+
'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
|
| 491 |
+
'activeSiteBinary': 'activeSite_bin',
|
| 492 |
+
'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
|
| 493 |
+
'siteBinary': 'site_bin',
|
| 494 |
+
'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
|
| 495 |
+
'mutagenesisBinary': 'mutagenesis_bin',
|
| 496 |
+
'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
|
| 497 |
+
'metalBindingBinary': 'metalBinding_bin',
|
| 498 |
+
'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
|
| 499 |
+
'caBindingBinary': 'caBinding_bin',
|
| 500 |
+
'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
|
| 501 |
+
'signalPeptideBinary': 'signalPeptide_bin',
|
| 502 |
+
'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
|
| 503 |
+
'motifBinary': 'motif_bin',
|
| 504 |
+
'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
|
| 505 |
+
'transitPeptideBinary': 'transitPeptide_bin',
|
| 506 |
+
'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
|
| 507 |
+
'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
|
| 508 |
+
'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
|
| 509 |
+
'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
|
| 510 |
+
'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', 'site': 'site_dist',
|
| 511 |
+
'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
|
| 512 |
+
'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', 'turn': 'turn_dist',
|
| 513 |
+
'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
|
| 514 |
+
'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
|
| 515 |
+
'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
|
| 516 |
+
'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
|
| 517 |
+
'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
|
| 518 |
+
'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
|
| 519 |
+
'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
|
| 520 |
+
|
| 521 |
+
uniprot_matched = uniprot_matched[
|
| 522 |
+
['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', 'volume',
|
| 523 |
+
'granthamScore', 'domains_all',
|
| 524 |
+
'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
|
| 525 |
+
'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
|
| 526 |
+
'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
|
| 527 |
+
'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
|
| 528 |
+
'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
|
| 529 |
+
'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
|
| 530 |
+
'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
|
| 531 |
+
'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
|
| 532 |
+
'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
|
| 533 |
+
'intramembrane_dist',
|
| 534 |
+
'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
|
| 535 |
+
'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
|
| 536 |
+
'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
|
| 537 |
+
'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
|
| 538 |
+
'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
|
| 539 |
+
'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
|
| 540 |
+
'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
|
| 541 |
+
'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
|
| 542 |
+
'glycosylation_dist', 'propeptide_dist']]
|
| 543 |
+
uniprot_matched = uniprot_matched.reset_index()
|
| 544 |
+
uniprot_matched = uniprot_matched.drop(columns = {'index'})
|
| 545 |
+
# Imputation
|
| 546 |
+
if (impute == 'True') or (impute == 'true'):
|
| 547 |
+
filler = [20.71, 46.67, 28.13,15.5, 35.94, 21.84, 25.15, 45.15, 29.81, 29.91, 34.67, 24.72, 10.66,11.55, 13.02,
|
| 548 |
+
21.54,27.42, 38.39, 30.44, 20.9, 25.82, 46.12, 32.1, 35.96, 35.86, 37.88, 19.09, 35.2, 26.95, 37.48]
|
| 549 |
+
col_index = 0
|
| 550 |
+
|
| 551 |
+
for col_ in uniprot_matched.columns[-30:]:
|
| 552 |
+
uniprot_matched[col_] = uniprot_matched[col_].fillna(filler[col_index])
|
| 553 |
+
uniprot_matched[col_] = uniprot_matched[col_].replace({'nan': filler[col_index]})
|
| 554 |
+
uniprot_matched[col_] = uniprot_matched[col_].replace({'': filler[col_index]})
|
| 555 |
+
"""
|
| 556 |
+
if uniprot_matched[col_].values == '':
|
| 557 |
+
uniprot_matched[col_] = filler[col_index]
|
| 558 |
+
"""
|
| 559 |
+
col_index += 1
|
| 560 |
+
|
| 561 |
+
uniprot_matched['domains_3Ddist'] = uniprot_matched['domains_3Ddist'].fillna(29.78)
|
| 562 |
+
uniprot_matched['sasa'] = uniprot_matched['sasa'].fillna(35.6)
|
| 563 |
+
uniprot_matched['location_3state'] = uniprot_matched['location_3state'].fillna('unknown')
|
| 564 |
+
elif (impute == 'False') or (impute == 'false'):
|
| 565 |
+
pass
|
| 566 |
+
uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
|
| 567 |
+
uniprot_matched = uniprot_matched.replace({'['']': np.NaN})
|
| 568 |
+
uniprot_matched.to_csv(path_to_output_files / 'featurevector_alphafold.txt', index=False, sep='\t')
|
| 569 |
+
if len(uniprot_matched) == 0:
|
| 570 |
+
print(
|
| 571 |
+
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
| 572 |
+
|
| 573 |
+
print('Feature vector successfully created...')
|
| 574 |
+
end = timer()
|
| 575 |
+
hours, rem = divmod(end - start, 3600)
|
| 576 |
+
minutes, seconds = divmod(rem, 60)
|
| 577 |
+
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 578 |
+
sys.stdout.close()
|
| 579 |
+
return uniprot_matched
|
code/alphafold_model.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
import glob
|
| 3 |
+
def reduce_model_dict(model_dict):
    """Assign every annotation position to exactly one model number.

    ``model_dict`` maps annotation name -> {model_number: [positions]}.
    AlphaFold fragment models overlap, so the same position can appear under
    several model numbers; within each annotation this keeps a position only
    in the first (lowest-key-order) model it appears in and drops it from all
    later ones.

    Mutates ``model_dict`` in place and also returns it.

    Note: the parameter was renamed from ``dict`` (which shadowed the
    builtin); the only caller in this project passes it positionally.
    """
    for models in model_dict.values():
        seen = []  # positions already claimed by an earlier model of this annotation
        for model_num, positions in models.items():
            kept = []
            for pos in positions:
                if pos not in seen:
                    kept.append(pos)
                    seen.append(pos)
            models[model_num] = kept
    return model_dict
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def which_model(position):
    """Return ``{model_number: position}`` for every AlphaFold fragment model
    whose residue window contains *position*.

    Model ``n`` covers residues ``1 + 200*(n-1)`` through ``1400 + 200*(n-1)``
    (1400-residue windows sliding by 200 residues), up to the last full window
    below 27000.  A position falling in the overlap of several windows is
    reported once per matching model.
    """
    matches = {}
    windows = zip(range(1400, 27000, 200), range(1, 27000, 200))
    for model_num, (end, start) in enumerate(windows, start=1):
        if start <= position <= end:
            matches[model_num] = position
    return matches
|
| 24 |
+
|
| 25 |
+
def modelCount(path_to_models):
    """Count how many model files each protein has under *path_to_models*.

    File names follow the AlphaFold pattern ``AF-<uniprotID>-F<n>-...``, so the
    UniProt accession is the second dash-separated token of the full path
    string (assumes the directory path itself contains no ``-``).

    Returns a dict keyed by the ';'-joined, sorted accessions that share the
    same model count, mapped to that count, e.g. ``{'P1;P2': 3, 'P9': 1}``.
    """
    counts = Counter(str(model_file).split('-')[1]
                     for model_file in path_to_models.glob("*"))
    grouped = {}
    for model_count in set(counts.values()):
        ids = sorted(acc for acc, c in counts.items() if c == model_count)
        grouped[';'.join(ids)] = model_count
    return grouped
|
code/calc_pc_property.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
def compositionValues(aa1, aa2):
    """Return the difference (rounded to 2 dp) between the Grantham
    atomic-composition values of residues *aa1* and *aa2* (one-letter codes).

    Raises KeyError for non-standard residue codes.
    """
    scale = {'S': 1.42, 'R': 0.65, 'L': 0, 'P': 0.39, 'T': 0.71, 'A': 0,
             'V': 0, 'G': 0.74, 'I': 0, 'F': 0, 'Y': 0.20, 'C': 2.75,
             'H': 0.58, 'Q': 0.89, 'N': 1.33, 'K': 0.33, 'D': 1.38,
             'E': 0.92, 'M': 0, 'W': 0.13}
    return round(scale[aa1] - scale[aa2], 2)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def polarityValues(aa1, aa2):
    """Return the difference (rounded to 2 dp) between the Grantham polarity
    values of residues *aa1* and *aa2* (one-letter codes).

    Raises KeyError for non-standard residue codes.
    """
    scale = {'S': 9.2, 'R': 10.5, 'L': 4.9, 'P': 8.0, 'T': 8.6, 'A': 8.1,
             'V': 5.9, 'G': 9.0, 'I': 5.2, 'F': 5.2, 'Y': 6.2, 'C': 5.5,
             'H': 10.4, 'Q': 10.5, 'N': 11.6, 'K': 11.3, 'D': 13.0,
             'E': 12.3, 'M': 5.7, 'W': 5.4}
    return round(scale[aa1] - scale[aa2], 2)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def volumeValues(aa1, aa2):
    """Return the difference (rounded to 2 dp) between the Grantham molecular
    volume values of residues *aa1* and *aa2* (one-letter codes).

    Raises KeyError for non-standard residue codes.
    """
    scale = {'S': 32, 'R': 124, 'L': 111, 'P': 32.5, 'T': 61, 'A': 31,
             'V': 84, 'G': 3, 'I': 111, 'F': 132, 'Y': 136, 'C': 55,
             'H': 96, 'Q': 85, 'N': 56, 'K': 119, 'D': 54, 'E': 83,
             'M': 105, 'W': 170}
    return round(scale[aa1] - scale[aa2], 2)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def add_physicochemical(df):
    """Add per-variant physicochemical difference features to *df*.

    For each row, computes the composition, polarity and volume differences
    between the wild-type residue (column ``wt``) and the mutant residue
    (column ``mut``), plus the Grantham distance between them.  Rows whose
    residue codes are not one of the 20 standard amino acids (or are
    missing/NaN) get the string ``'nan'`` in all four feature columns.

    Modifies *df* in place and returns it.
    """
    # Upper triangle of the symmetric Grantham distance matrix
    # (Grantham, Science 1974).  The full 400-pair lookup table is built
    # below by mirroring; distances are stored as strings because the
    # feature columns are string-typed downstream.
    upper = {
        'A': {'C': 195, 'D': 126, 'E': 107, 'F': 113, 'G': 60, 'H': 86,
              'I': 94, 'K': 106, 'L': 96, 'M': 84, 'N': 111, 'P': 27,
              'Q': 91, 'R': 112, 'S': 99, 'T': 58, 'V': 64, 'W': 148,
              'Y': 112},
        'C': {'D': 154, 'E': 170, 'F': 205, 'G': 159, 'H': 174, 'I': 198,
              'K': 202, 'L': 198, 'M': 196, 'N': 139, 'P': 169, 'Q': 154,
              'R': 180, 'S': 112, 'T': 149, 'V': 192, 'W': 215, 'Y': 194},
        'D': {'E': 45, 'F': 177, 'G': 94, 'H': 81, 'I': 168, 'K': 101,
              'L': 172, 'M': 160, 'N': 23, 'P': 108, 'Q': 61, 'R': 96,
              'S': 65, 'T': 85, 'V': 152, 'W': 181, 'Y': 160},
        'E': {'F': 140, 'G': 98, 'H': 40, 'I': 134, 'K': 56, 'L': 138,
              'M': 126, 'N': 42, 'P': 93, 'Q': 29, 'R': 54, 'S': 80,
              'T': 65, 'V': 121, 'W': 152, 'Y': 122},
        'F': {'G': 153, 'H': 100, 'I': 21, 'K': 102, 'L': 22, 'M': 28,
              'N': 158, 'P': 114, 'Q': 116, 'R': 97, 'S': 155, 'T': 103,
              'V': 50, 'W': 40, 'Y': 22},
        'G': {'H': 98, 'I': 135, 'K': 127, 'L': 138, 'M': 127, 'N': 80,
              'P': 42, 'Q': 87, 'R': 125, 'S': 56, 'T': 59, 'V': 109,
              'W': 184, 'Y': 147},
        'H': {'I': 94, 'K': 32, 'L': 99, 'M': 87, 'N': 68, 'P': 77,
              'Q': 24, 'R': 29, 'S': 89, 'T': 47, 'V': 84, 'W': 115,
              'Y': 83},
        'I': {'K': 102, 'L': 5, 'M': 10, 'N': 149, 'P': 95, 'Q': 109,
              'R': 97, 'S': 142, 'T': 89, 'V': 29, 'W': 61, 'Y': 33},
        'K': {'L': 107, 'M': 95, 'N': 94, 'P': 103, 'Q': 53, 'R': 26,
              'S': 121, 'T': 78, 'V': 97, 'W': 110, 'Y': 85},
        'L': {'M': 15, 'N': 153, 'P': 98, 'Q': 113, 'R': 102, 'S': 145,
              'T': 92, 'V': 32, 'W': 61, 'Y': 36},
        'M': {'N': 142, 'P': 87, 'Q': 101, 'R': 91, 'S': 135, 'T': 81,
              'V': 21, 'W': 67, 'Y': 36},
        'N': {'P': 91, 'Q': 46, 'R': 86, 'S': 46, 'T': 65, 'V': 133,
              'W': 174, 'Y': 143},
        'P': {'Q': 76, 'R': 103, 'S': 74, 'T': 38, 'V': 68, 'W': 147,
              'Y': 110},
        'Q': {'R': 43, 'S': 68, 'T': 42, 'V': 96, 'W': 130, 'Y': 99},
        'R': {'S': 110, 'T': 71, 'V': 96, 'W': 101, 'Y': 77},
        'S': {'T': 58, 'V': 124, 'W': 177, 'Y': 144},
        'T': {'V': 69, 'W': 128, 'Y': 92},
        'V': {'W': 88, 'Y': 55},
        'W': {'Y': 37},
    }
    grantham_dict = {}
    for aa, row in upper.items():
        grantham_dict[(aa, aa)] = '0'
        for other, dist in row.items():
            grantham_dict[(aa, other)] = str(dist)
            grantham_dict[(other, aa)] = str(dist)
    # 'Y' has no upper-triangle row of its own; add its diagonal entry.
    grantham_dict[('Y', 'Y')] = '0'

    for i in df.index:
        try:
            wt = df.at[i, 'wt']
            mut = df.at[i, 'mut']
            df.at[i, 'composition'] = compositionValues(wt, mut)
            df.at[i, 'polarity'] = polarityValues(wt, mut)
            df.at[i, 'volume'] = volumeValues(wt, mut)
            df.at[i, 'granthamScore'] = grantham_dict[wt, mut]
        except (KeyError, TypeError):
            # Previously a bare ``except`` with a vestigial ``KeyError``
            # expression swallowed *every* error; narrow it to the lookup
            # failures the 'nan' fallback is meant to handle (unknown
            # residue codes, missing columns, NaN/unhashable values).
            df.at[i, 'composition'] = 'nan'
            df.at[i, 'polarity'] = 'nan'
            df.at[i, 'volume'] = 'nan'
            df.at[i, 'granthamScore'] = 'nan'
    return df
|
code/create_swissmodelSummary.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
| 0 |
help='Enter the directory where meta-data is found.',
|
| 1 |
default=1)
|
| 2 |
os.makedirs('input_files/extract_swissmodel_structures/', exist_ok=True)
|
| 3 |
all_swissmodel = open('input_files/swissmodel_structures.txt', 'w')
|
| 4 |
all_swissmodel.write('UniProtKB_ac iso_id uniprot_seq_length uniprot_seq_md5 coordinate_id provider from to template qmeandisco_global seqid url')
|
| 5 |
all_swissmodel.write('\n')
|
| 6 |
for f in glob.glob(f'{meta_data}/*.tar.gz'):
|
| 7 |
name = f.split('/')[-1].split('.')[0]
|
| 8 |
with tarfile.open(f) as tar:
|
| 9 |
tar.extractall(f'input_files/extract_swissmodel_structures/{name}')
|
| 10 |
with open(f'input_files/extract_swissmodel_structures/{name}/SWISS-MODEL_Repository/INDEX') as x:
|
| 11 |
lines = (x.readlines())[7:]
|
| 12 |
for line in lines:
|
| 13 |
all_swissmodel.write(line)
|
| 14 |
shutil.rmtree('input_files/extract_swissmodel_structures/')
|
| 15 |
swissmodel_file()
|
|
|
|
| 1 |
+
'''
|
| 2 |
help='Enter the directory where meta-data is found.',
|
| 3 |
default=1)
|
| 4 |
os.makedirs('input_files/extract_swissmodel_structures/', exist_ok=True)
|
| 5 |
all_swissmodel = open('input_files/swissmodel_structures.txt', 'w')
|
| 6 |
all_swissmodel.write('UniProtKB_ac iso_id uniprot_seq_length uniprot_seq_md5 coordinate_id provider from to template qmeandisco_global seqid url')
|
| 7 |
all_swissmodel.write('\n')
|
| 8 |
for f in glob.glob(f'{meta_data}/*.tar.gz'):
|
| 9 |
name = f.split('/')[-1].split('.')[0]
|
| 10 |
with tarfile.open(f) as tar:
|
| 11 |
tar.extractall(f'input_files/extract_swissmodel_structures/{name}')
|
| 12 |
with open(f'input_files/extract_swissmodel_structures/{name}/SWISS-MODEL_Repository/INDEX') as x:
|
| 13 |
lines = (x.readlines())[7:]
|
| 14 |
for line in lines:
|
| 15 |
all_swissmodel.write(line)
|
| 16 |
shutil.rmtree('input_files/extract_swissmodel_structures/')
|
| 17 |
swissmodel_file()
|
code/get_alphafoldStructures.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tarfile, glob, os
|
| 2 |
+
from biopandas.pdb import PandasPdb
|
| 3 |
+
import argparse
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
# CLI setup: which tar archive under input_files/ holds the AlphaFold models.
parser = argparse.ArgumentParser(description='ASCARIS')

parser.add_argument('-file_name', '--file_name',
                    help='Enter the file tar file name to untar',
                    default=1)

args = parser.parse_args()

# Glob pattern (relative to input_files/) naming the AlphaFold tarball(s);
# consumed by create_file() below.
alphafold = args.file_name
|
| 15 |
+
|
| 16 |
+
def threeToOne(variant):
    """Convert a three-letter amino-acid code to its one-letter code.

    Unknown codes are returned unchanged, matching the original
    if/elif behaviour (no exception is raised).

    NOTE(review): 'ASX' is mapped to 'O' to preserve existing behaviour,
    although the conventional one-letter code for ASX (Asn/Asp
    ambiguity) is 'B' — confirm with downstream consumers.
    """
    # Table lookup replaces the original 22-branch if/elif chain.
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E', 'PRO': 'P',
        'LEU': 'L', 'GLY': 'G', 'ASN': 'N', 'SER': 'S', 'GLN': 'Q',
        'THR': 'T', 'MET': 'M', 'LYS': 'K', 'ASP': 'D', 'ILE': 'I',
        'PHE': 'F', 'TRP': 'W', 'TYR': 'Y', 'HIS': 'H', 'CYS': 'C',
        'UNK': 'X', 'ASX': 'O',
    }
    return three_to_one.get(variant, variant)
|
| 62 |
+
# Unzip AlphaFold structures

def create_file():
    """Extract AlphaFold tar archives and write a per-structure summary.

    Extracts every archive matching ``input_files/<alphafold>`` (the
    ``--file_name`` CLI argument) into ``input_files/alphafold_structures/``,
    then writes ``input_files/alphafold_summary.txt`` with one tab-separated
    row per PDB model: uniprotID, chain, one-letter CA sequence, model number.
    """
    os.makedirs('input_files/alphafold_structures/', exist_ok=True)
    for f in glob.glob(f'input_files/{alphafold}'):
        with tarfile.open(f) as tar:
            tar.extractall(f'input_files/alphafold_structures/')

    # Create summary file. A context manager guarantees the handle is
    # flushed and closed (the original left the file open).
    with open('input_files/alphafold_summary.txt', 'w') as alphafold_summary_file:
        alphafold_summary_file.write('uniprotID\tchain\tsequence\tmodel_num')
        alphafold_summary_file.write('\n')
        for f in glob.glob('input_files/alphafold_structures/*pdb*'):
            str1 = PandasPdb().read_pdb(f)
            str1 = str1.df['ATOM']
            str1 = str1[['alt_loc', 'residue_name', 'residue_number', 'atom_name', 'insertion', 'chain_id']]
            # One CA atom per residue gives the residue-level sequence.
            str1 = str1[str1.atom_name == 'CA']
            str1['residue_name'] = str1['residue_name'].apply(lambda x: threeToOne(x))
            # np.nan replaces np.NaN, which was removed in NumPy 2.0.
            str1['alt_loc'] = str1['alt_loc'].replace({'': np.nan})
            str1 = str1.drop_duplicates(['residue_name', 'residue_number'])
            structure_residues_pdb = ''.join(str1.residue_name.to_list())
            # File names look like AF-<uniprot>-F<model>-model_vN.pdb.gz.
            model_no = f.split('-')[2].strip()[1:]
            up_name = f.split('-')[1].strip()
            # AlphaFold models are single-chain; take the one chain id present.
            chain_id = list(set(str1.chain_id.to_list()))[0]
            alphafold_summary_file.write(up_name)
            alphafold_summary_file.write('\t')
            alphafold_summary_file.write(chain_id)
            alphafold_summary_file.write('\t')
            alphafold_summary_file.write(structure_residues_pdb)
            alphafold_summary_file.write('\t')
            alphafold_summary_file.write(model_no)
            alphafold_summary_file.write('\n')
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
if __name__ == '__main__':
|
| 97 |
+
create_file()
|
code/main.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pdb_featureVector
|
| 2 |
+
import alphafold_featureVector
|
| 3 |
+
import argparse
|
| 4 |
+
|
| 5 |
+
# Command-line interface for ASCARIS feature-vector generation.
parser = argparse.ArgumentParser(description='ASCARIS')

# Structure source: 1 = PDB (default), 2 = AlphaFold.
parser.add_argument('-s', '--source_option',
                    help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
                    default=1)
# Query datapoint(s) or a path to a tab-separated input file.
parser.add_argument('-i', '--input_datapoint',
                    help='Input file or query datapoint\n Option 1: Comma-separated list of idenfiers (UniProt ID-wt residue-position-mutated residue (e.g. Q9Y4W6-N-432-T or Q9Y4W6-N-432-T, Q9Y4W6-N-432-T)) \n Option 2: Enter comma-separated file path')

# Note: stored as the string 'True'/'False', not a bool.
parser.add_argument('-impute', '--imputation_state', default='True',
                    help='Whether resulting feature vector should be imputed or not. Default True.')

args = parser.parse_args()

# Unpack parsed CLI options into module-level names used by run_featureVector.
input_set = args.input_datapoint
mode = args.source_option
impute = args.imputation_state
|
| 21 |
+
|
| 22 |
+
def run_featureVector(input_set, mode, impute):
    """Dispatch feature-vector generation to the selected pipeline.

    mode 1 -> pdb_featureVector.pdb, mode 2 -> alphafold_featureVector.alphafold.
    Any other mode is silently ignored (unchanged from the original behaviour).
    """
    banner = '*****************************************'
    print(banner)
    print('Feature vector generation is in progress. \nPlease check log file for updates..')
    print(banner)
    pipelines = {
        1: pdb_featureVector.pdb,
        2: alphafold_featureVector.alphafold,
    }
    mode = int(mode)
    pipeline = pipelines.get(mode)
    if pipeline is not None:
        pipeline(input_set, mode, impute)
|
| 31 |
+
|
| 32 |
+
if __name__ == '__main__':
|
| 33 |
+
run_featureVector(input_set, mode, impute)
|
| 34 |
+
|
| 35 |
+
|
code/manage_files.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
def manage_files(mode):
    """Create the output directory tree and resolve input/output paths.

    Parameters
    ----------
    mode : int
        1 -> PDB pipeline, 2 -> AlphaFold pipeline.

    Returns
    -------
    tuple of Path
        mode 1: (path_to_input_files, path_to_output_files, path_to_domains,
                 fisher_path, path_to_interfaces, buffer)
        mode 2: (path_to_input_files, path_to_output_files, path_to_domains,
                 fisher_path, path_to_interfaces, alphafold_path,
                 alphafold_summary)

    Side effect: creates the ``out_files/...`` directory tree if missing.
    """
    if mode == 1:
        path_to_input_files = Path('input_files')
        path_to_domains = path_to_input_files / 'domains.txt'
        fisher_path = path_to_input_files / 'significant_domains.txt'
        path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'

        path_to_output_files = Path('out_files/pdb')
        os.makedirs(path_to_output_files / 'pdb_structures/', exist_ok=True)
        os.makedirs(path_to_output_files / 'alignment_files/', exist_ok=True)
        os.makedirs(path_to_output_files / 'swissmodel_structures/', exist_ok=True)
        os.makedirs(path_to_output_files / 'modbase_structures/', exist_ok=True)
        os.makedirs(path_to_output_files / 'modbase_structures_individual/', exist_ok=True)
        os.makedirs(path_to_output_files / 'freesasa_files/', exist_ok=True)
        os.makedirs(path_to_output_files / '3D_alignment/', exist_ok=True)
        buffer = path_to_output_files / 'file_buffer.txt'
        # Dead locals from the original (swiss_model_path,
        # path_to_alignment_files, path_3D_alignment, path_to_freesasa,
        # outpath) were dropped: they were never returned or used.
        return (path_to_input_files, path_to_output_files, path_to_domains,
                fisher_path, path_to_interfaces, buffer)

    elif mode == 2:
        path_to_input_files = Path('input_files')
        path_to_domains = path_to_input_files / 'domains.txt'
        fisher_path = path_to_input_files / 'significant_domains.txt'
        alphafold_summary = path_to_input_files / 'alphafold_summary.txt'
        path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'
        # Structures must be unzipped before use.
        alphafold_path = Path(path_to_input_files / 'alphafold_structures')

        path_to_output_files = Path('out_files/alphafold')
        os.makedirs(path_to_output_files, exist_ok=True)
        os.makedirs(path_to_output_files / 'freesasa_files', exist_ok=True)
        os.makedirs(path_to_output_files / 'alignment_files', exist_ok=True)
        os.makedirs(path_to_output_files / '3D_alignment', exist_ok=True)

        return (path_to_input_files, path_to_output_files, path_to_domains,
                fisher_path, path_to_interfaces, alphafold_path, alphafold_summary)
|
code/pdb_featureVector.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
code/process_input.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
def clean_data(input_set):
    """Parse variant identifiers into a standardized DataFrame.

    Accepted formats for *input_set*:
      * comma-separated identifiers: 'P1-A-1-G, P2-C-2-T'
      * tab-separated identifiers
      * a single identifier: 'P1-A-1-G'
      * a path to a tab-separated .txt file with columns
        uniprotID, wt, pos, mut

    Returns a DataFrame with columns
    ['uniprotID', 'wt', 'pos', 'mut', 'datapoint'] (all values str),
    restricted to the 20 standard amino acids, or None when parsing fails.
    """
    data = pd.DataFrame()
    try:
        # DataFrame.append was removed in pandas 2.0; build rows in plain
        # lists and construct the frame once instead.
        if ',' in input_set:
            rows = [[j.strip() for j in i.strip().split('-')]
                    for i in input_set.split(',')]
            data = pd.DataFrame(rows, columns=['uniprotID', 'wt', 'pos', 'mut'])
        elif '\t' in input_set:
            rows = [[j.strip() for j in i.strip().split('-')]
                    for i in input_set.split('\t')]
            data = pd.DataFrame(rows, columns=['uniprotID', 'wt', 'pos', 'mut'])
        elif '-' in input_set:
            data = pd.DataFrame([[j.strip() for j in input_set.split('-')]],
                                columns=['uniprotID', 'wt', 'pos', 'mut'])
        elif '.txt' in input_set:
            data = pd.read_csv(input_set, sep='\t',
                               names=['uniprotID', 'wt', 'pos', 'mut'])
            data = data[['uniprotID', 'wt', 'pos', 'mut']]

        # Exclude termination codons, synonymous mutations and any
        # non-standard residues such as Sec, 4 or 6.
        aa_list = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L',
                   'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
        data.wt = data.wt.str.strip()
        data.mut = data.mut.str.strip()
        data = data[data.wt.isin(aa_list)]
        data = data[data.mut.isin(aa_list)]

        # Unique identifier per row, e.g. 'Q9Y4W6N432T'.
        for i in data.index:
            data.at[i, 'datapoint'] = (data.at[i, 'uniprotID'] + data.at[i, 'wt']
                                       + str(data.at[i, 'pos']) + data.at[i, 'mut'])

        data = data.astype(str)
        return data
    except Exception:
        # The original had a bare `except:` whose body evaluated the bare
        # name ValueError (a no-op) — every failure was swallowed. Keep the
        # best-effort contract (print + implicit None) but catch explicitly.
        print('Please check the input format.')
|
| 40 |
+
|
code/standard.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def standardize(df, get_columns):
    """Blank out structure-derived feature columns and align column order.

    Each feature listed below is overwritten with the string 'nan'
    (no 3D distance is calculated here, unlike the structure-based
    pipelines), and the frame is re-ordered to match
    ``get_columns.columns``.
    """
    blanked_features = (
        'sasa', 'domaindistance3D', 'disulfide', 'intMet', 'intramembrane',
        'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
        'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
        'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
        'topologicalDomain', 'bindingSite', 'region', 'signalPeptide',
        'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
        'transitPeptide', 'glycosylation', 'propeptide')
    for feature in blanked_features:
        df[feature] = 'nan'
    return df[get_columns.columns]
|
code/uniprotSequenceMatch.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from add_sequence import *
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def uniprotSequenceMatch(data):
    """Fetch UniProt canonical and isoform sequences for all query proteins.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'uniprotID' column.

    Returns
    -------
    (not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta)
        Rows whose accession retrieved no sequence, rows with a sequence,
        plus the canonical- and isoform-sequence lookup tables.

    Relies on ``get_uniprot_seq`` / ``get_isoforms`` from add_sequence
    (network access to UniProt).
    """
    print('Retrieving UniProt sequences...\n')

    canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
    up_list = list(set(data['uniprotID'].to_list()))
    for row, accession in enumerate(up_list):
        canonical_fasta.at[row, 'uniprotSequence'] = get_uniprot_seq(accession)
        canonical_fasta.at[row, 'uniprotID'] = accession

    canonical_fasta = canonical_fasta.drop_duplicates()
    isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
    iso_dict = [get_isoforms(accession) for accession in up_list]

    index = 0
    for isoforms in iso_dict:
        for key, val in isoforms.items():
            isoform_fasta.at[index, 'uniprotID'] = key
            isoform_fasta.at[index, 'isoformSequence'] = val
            index += 1
    isoform_fasta = isoform_fasta.drop_duplicates()

    # Isoform ids look like 'P12345-2'; split the accession from the
    # isoform suffix (characters 7-9) — TODO confirm for 10-char accessions.
    for i in isoform_fasta.index:
        isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip()
        isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6]
    print('Sequence files created...\n')

    data = data.merge(canonical_fasta, on='uniprotID', how='left')
    # np.nan replaces np.NaN, which was removed in NumPy 2.0.
    data = data.replace({'': np.nan, 'nan': np.nan})
    data['whichIsoform'] = np.nan
    data['wt_sequence_match'] = np.nan
    not_match_in_uniprot = data[data.uniprotSequence.isna()]
    uniprot_matched = data[~data.uniprotSequence.isna()]

    return not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta
|
input_files/H_sapiens_interfacesHQ.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90fb5f5fe31e20921290e0da588d50d2939feedac80767cdd3b46225ce849b8d
|
| 3 |
+
size 19252152
|
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5a22037a2ae883cc095f647170271d6a69f38de045206e99c4ac5586658ccb3
|
| 3 |
+
size 26598
|
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93e034885f400396df77e65944c65e8d22000f011343a98d8f7727b97b378860
|
| 3 |
+
size 18469
|
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:367a7e9d82ad6a452f643eed923237ed149cc3cf1dabef23304d4e4f5711a191
|
| 3 |
+
size 25647
|
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:449fa624948266313cdf18a365e11036b6eaa5502395ed88b58f1841ebf70e60
|
| 3 |
+
size 17763
|
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35de071f52a5644df10d8181b5c6034b04734895e155b68d3e3f5133e98f3ef6
|
| 3 |
+
size 27026
|
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a509714d54bdf9b9ad7a9bcdccc4122e256cec371fb04e251f68e2e67ade17a
|
| 3 |
+
size 18748
|
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6b9e658af67a6b4ca14f5c960c4629140eb78588c46cfe1fab3bbe2c1c7d17e
|
| 3 |
+
size 25157
|
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b840d9a1c9de25dd6484ad2675f26e578e883c277d4e332247cb1f45a7706ffb
|
| 3 |
+
size 17329
|
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9077d070c0fea099e5afdc10d4c599367064518be2412088e8f7f2213156f91
|
| 3 |
+
size 26786
|
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1abd18dc11f67b8b3a3dd8b30c4a74fec7fefec62c601153401ca5c550c96dbd
|
| 3 |
+
size 18678
|
input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db309cbaaf7d073230b4ab1a98ecc8213c6cfebfe87cc4f6f3990944feef7059
|
| 3 |
+
size 26727
|