Spaces:
Sleeping
Sleeping
Commit
·
c2a02c6
0
Parent(s):
Duplicate from fatmacankara/ASCARIS
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +38 -0
- README.md +14 -0
- app.py +129 -0
- code/__pycache__/add_3Dalignment.cpython-37.pyc +0 -0
- code/__pycache__/add_alignment.cpython-37.pyc +0 -0
- code/__pycache__/add_annotations.cpython-37.pyc +0 -0
- code/__pycache__/add_domains.cpython-37.pyc +0 -0
- code/__pycache__/add_interface_pos.cpython-37.pyc +0 -0
- code/__pycache__/add_sasa.cpython-37.pyc +0 -0
- code/__pycache__/add_sequence.cpython-37.pyc +0 -0
- code/__pycache__/add_structure.cpython-37.pyc +0 -0
- code/__pycache__/alphafold_featureVector.cpython-37.pyc +0 -0
- code/__pycache__/alphafold_model.cpython-37.pyc +0 -0
- code/__pycache__/calc_pc_property.cpython-37.pyc +0 -0
- code/__pycache__/manage_files.cpython-37.pyc +0 -0
- code/__pycache__/pdb_featureVector.cpython-37.pyc +0 -0
- code/__pycache__/process_input.cpython-37.pyc +0 -0
- code/__pycache__/standard.cpython-37.pyc +0 -0
- code/__pycache__/uniprotSequenceMatch.cpython-37.pyc +0 -0
- code/add_3Dalignment.py +261 -0
- code/add_alignment.py +423 -0
- code/add_annotations.py +95 -0
- code/add_domains.py +57 -0
- code/add_interface_pos.py +35 -0
- code/add_sasa.py +131 -0
- code/add_sequence.py +44 -0
- code/add_structure.py +168 -0
- code/alphafold_featureVector.py +579 -0
- code/alphafold_model.py +33 -0
- code/calc_pc_property.py +441 -0
- code/create_swissmodelSummary.py +1 -0
- code/get_alphafoldStructures.py +97 -0
- code/main.py +35 -0
- code/manage_files.py +42 -0
- code/pdb_featureVector.py +0 -0
- code/process_input.py +40 -0
- code/standard.py +13 -0
- code/uniprotSequenceMatch.py +40 -0
- input_files/H_sapiens_interfacesHQ.txt +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
input_files/alphafold_summary.txt filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
input_files/H_sapiens_interfacesHQ.txt filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
input_files/swissmodel_structures.txt filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ASCARIS
|
| 3 |
+
emoji: 🦀
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
python_version: '3.7'
|
| 8 |
+
sdk_version: 1.21.0
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
duplicated_from: fatmacankara/ASCARIS
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from os import path
|
| 4 |
+
import sys
|
| 5 |
+
import streamlit.components.v1 as components
|
| 6 |
+
sys.path.append('code/')
|
| 7 |
+
#sys.path.append('ASCARIS/code/')
|
| 8 |
+
import pdb_featureVector
|
| 9 |
+
import alphafold_featureVector
|
| 10 |
+
import argparse
|
| 11 |
+
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode,GridUpdateMode
|
| 12 |
+
showWarningOnDirectExecution = False
|
| 13 |
+
def download_button(object_to_download, download_filename):
    """Build an HTML snippet that auto-downloads *object_to_download*.

    Parameters
    ----------
    object_to_download : pandas.DataFrame or any JSON-serializable object
        A DataFrame is serialized to CSV (without the index); anything
        else is JSON-encoded.
    download_filename : str
        File name offered to the browser for the download.

    Returns
    -------
    str
        A self-contained HTML document whose embedded script clicks a
        data-URI anchor, triggering the download when rendered
        (e.g. via streamlit.components.v1.html).
    """
    # BUG FIX: the original referenced json/base64 without importing them,
    # raising NameError at runtime. Function-scope imports keep the fix local.
    import base64
    import json

    if isinstance(object_to_download, pd.DataFrame):
        object_to_download = object_to_download.to_csv(index=False)
    # Try JSON encode for everything else
    else:
        object_to_download = json.dumps(object_to_download)

    try:
        # some strings <-> bytes conversions necessary here
        b64 = base64.b64encode(object_to_download.encode()).decode()
    except AttributeError:
        # Already a bytes-like object (no .encode attribute).
        b64 = base64.b64encode(object_to_download).decode()

    dl_link = f"""<html><head><title>Start Auto Download file</title><script src="http://code.jquery.com/jquery-3.2.1.min.js"></script><script>$('<a href="data:text/csv;base64,{b64}" download="{download_filename}">')[0].click()</script></head></html>"""
    return dl_link
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def download_df():
    """Form callback: render the auto-download HTML for the selected grid.

    Relies on the module-level ``selected_df`` produced by the feature-vector
    pipeline and the ``filename`` key stored in Streamlit session state.
    """
    link_html = download_button(selected_df, st.session_state.filename)
    # height=0 keeps the injected document invisible; only its script runs.
    components.html(link_html, height=0)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# --- Page header -----------------------------------------------------------
original_title = '<p style="font-family:Trebuchet MS; color:#FD7456; font-size: 35px; font-weight:bold; text-align:center">Welcome to ASCARIS</p>'
st.markdown(original_title, unsafe_allow_html=True)
# Vertical spacing below the title.
st.write('')
st.write('')
st.write('')
st.write('')


# --- User inputs -----------------------------------------------------------
# source: 1 = PDB/SwissModel/Modbase structures, 2 = AlphaFold structures.
source = st.selectbox('Select Protein Structure Database (1: PDB, SwissModel, Modbase 2: AlphaFold)',[1,2])
impute = st.selectbox('Select Imputation',[True, False])
input_data = st.text_input('Enter Input Variation')


#sys.path.append(path.abspath('../code/'))
# NOTE(review): the argparse block below appears to be carried over from the
# CLI version of ASCARIS; the values actually consumed further down come from
# the Streamlit widgets above, and parse_args() on a Streamlit server may see
# unrelated argv — confirm whether this block can be removed.
parser = argparse.ArgumentParser(description='ASCARIS')

parser.add_argument('-s', '--source_option',
                    help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
                    default=1)
parser.add_argument('-i', '--input_datapoint',
                    help='Input file or query datapoint\n Option 1: Comma-separated list of idenfiers (UniProt ID-wt residue-position-mutated residue (e.g. Q9Y4W6-N-432-T or Q9Y4W6-N-432-T, Q9Y4W6-N-432-T)) \n Option 2: Enter comma-separated file path')

parser.add_argument('-impute', '--imputation_state', default='True',
                    help='Whether resulting feature vector should be imputed or not. Default True.')

args = parser.parse_args()

# Widget values feed the feature-vector generators below.
input_set = input_data
mode = source
impute = impute  # no-op self-assignment, kept from the original

print('*****************************************')
print('Feature vector generation is in progress. \nPlease check log file for updates..')
print('*****************************************')
mode = int(mode)

with st.spinner('In progress...This may take a while...'):
    try:
        if mode == 1:
            # PDB/SwissModel/Modbase pipeline.
            selected_df = pdb_featureVector.pdb(input_set, mode, impute)
            # Interactive, paginated, checkbox-selectable grid of the result.
            int_builder = GridOptionsBuilder.from_dataframe(selected_df)
            int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
            int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
            int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
            gridoptions = int_builder.build()
            int_return = AgGrid(selected_df,
                                width='100%',
                                height=(len(selected_df) + 4) * 35.2 + 3,
                                theme='light',
                                enable_enterprise_modules=False,
                                gridOptions=gridoptions,
                                fit_columns_on_grid_load=False,
                                update_mode=GridUpdateMode.SELECTION_CHANGED,  # or MODEL_CHANGED
                                custom_css={".ag-header-cell-label": {"justify-content": "center"}})
            st.success('Feature vector successfully created.')


        elif mode == 2:
            # AlphaFold pipeline; grid construction mirrors mode 1.
            selected_df = alphafold_featureVector.alphafold(input_set, mode, impute)
            int_builder = GridOptionsBuilder.from_dataframe(selected_df)
            int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
            int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
            int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
            gridoptions = int_builder.build()
            int_return = AgGrid(selected_df,
                                width='100%',
                                height=(len(selected_df) + 4) * 35.2 + 3,
                                theme='light',
                                enable_enterprise_modules=False,
                                gridOptions=gridoptions,
                                fit_columns_on_grid_load=False,
                                update_mode=GridUpdateMode.SELECTION_CHANGED,  # or MODEL_CHANGED
                                custom_css={".ag-header-cell-label": {"justify-content": "center"}})
            st.success('Feature vector successfully created.')


    except:
        # NOTE(review): bare except silently hides any pipeline error, and the
        # assignment below rebinds the module-level name `download_df`
        # (previously the callback function defined above) to a DataFrame,
        # which breaks on_click=download_df in the form below whenever this
        # path is taken — confirm intent.
        pass
        download_df = pd.DataFrame()

# --- Download form ---------------------------------------------------------
# Prompts for a filename and triggers the auto-download callback on submit.
with st.form("my_form", clear_on_submit=False):
    st.text_input("Enter filename", key="filename")
    submit = st.form_submit_button("Download feature vector", on_click=download_df)
code/__pycache__/add_3Dalignment.cpython-37.pyc
ADDED
|
Binary file (5.67 kB). View file
|
|
|
code/__pycache__/add_alignment.cpython-37.pyc
ADDED
|
Binary file (7.99 kB). View file
|
|
|
code/__pycache__/add_annotations.cpython-37.pyc
ADDED
|
Binary file (3.78 kB). View file
|
|
|
code/__pycache__/add_domains.cpython-37.pyc
ADDED
|
Binary file (1.44 kB). View file
|
|
|
code/__pycache__/add_interface_pos.cpython-37.pyc
ADDED
|
Binary file (1.12 kB). View file
|
|
|
code/__pycache__/add_sasa.cpython-37.pyc
ADDED
|
Binary file (3.17 kB). View file
|
|
|
code/__pycache__/add_sequence.cpython-37.pyc
ADDED
|
Binary file (1.27 kB). View file
|
|
|
code/__pycache__/add_structure.cpython-37.pyc
ADDED
|
Binary file (5.93 kB). View file
|
|
|
code/__pycache__/alphafold_featureVector.cpython-37.pyc
ADDED
|
Binary file (15.4 kB). View file
|
|
|
code/__pycache__/alphafold_model.cpython-37.pyc
ADDED
|
Binary file (1.35 kB). View file
|
|
|
code/__pycache__/calc_pc_property.cpython-37.pyc
ADDED
|
Binary file (8.84 kB). View file
|
|
|
code/__pycache__/manage_files.cpython-37.pyc
ADDED
|
Binary file (1.43 kB). View file
|
|
|
code/__pycache__/pdb_featureVector.cpython-37.pyc
ADDED
|
Binary file (33.7 kB). View file
|
|
|
code/__pycache__/process_input.cpython-37.pyc
ADDED
|
Binary file (1.69 kB). View file
|
|
|
code/__pycache__/standard.cpython-37.pyc
ADDED
|
Binary file (749 Bytes). View file
|
|
|
code/__pycache__/uniprotSequenceMatch.cpython-37.pyc
ADDED
|
Binary file (1.28 kB). View file
|
|
|
code/add_3Dalignment.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This code file produces alignments between the structure and the sequence for a given protein.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
import glob
|
| 8 |
+
import numpy as np
|
| 9 |
+
from Bio import Align
|
| 10 |
+
import gzip
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from Bio.Align import substitution_matrices
|
| 13 |
+
aligner = Align.PairwiseAligner()
|
| 14 |
+
|
| 15 |
+
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between (x1, y1, z1) and (x2, y2, z2)."""
    dx = x2 - x1
    dy = y2 - y1
    dz = z2 - z1
    return math.sqrt(dx * dx + dy * dy + dz * dz)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def find_distance(coordMut, coordAnnot):
    """Distance between a mutation coordinate and an annotation coordinate.

    Parameters
    ----------
    coordMut, coordAnnot : sequence of 3 str/float, or NaN
        [x, y, z] coordinate triples as produced by the structure parsers.

    Returns
    -------
    str or float
        The distance formatted to two decimals ("%.2f"), 'nan' if either
        triple cannot be parsed, or np.NaN when coordMut is missing.
    """
    # BUG FIX: the original guard was `coordMut != np.NaN`, which is always
    # True (NaN never compares equal to anything), so the NaN branch was
    # dead. An explicit float-NaN check restores the intended behavior.
    if isinstance(coordMut, float) and math.isnan(coordMut):
        return np.NaN
    try:
        dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
                        float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]))
        return "%.2f" % dist
    except (ValueError, TypeError, IndexError, KeyError):
        # Original used a bare `except:` followed by a no-op `ValueError`
        # expression; narrowed to the parse/indexing failures it meant.
        return 'nan'
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Three-letter -> one-letter amino-acid codes.
# NOTE(review): 'ASX' maps to 'O' here (the standard one-letter code is 'B');
# preserved as-is because downstream code may rely on it.
_THREE_TO_ONE = {
    'ALA': 'A', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E', 'PRO': 'P',
    'LEU': 'L', 'GLY': 'G', 'ASN': 'N', 'SER': 'S', 'GLN': 'Q',
    'THR': 'T', 'MET': 'M', 'LYS': 'K', 'ASP': 'D', 'ILE': 'I',
    'PHE': 'F', 'TRP': 'W', 'TYR': 'Y', 'HIS': 'H', 'CYS': 'C',
    'UNK': 'X', 'ASX': 'O',
}


def threeToOne(variant):
    """Convert a three-letter residue code to its one-letter code.

    Unrecognized codes are returned unchanged, matching the original
    40-branch if/elif chain, which fell through without modifying
    ``variant`` for unknown inputs.
    """
    return _THREE_TO_ONE.get(variant, variant)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
    """Map a 1-based sequence position onto structure coordinates.

    Walks the sequence/structure alignment to translate an annotation
    position (``annot``, counted on the ungapped sequence line) into the
    index of the corresponding CA atom, then returns that atom's
    coordinates and residue number.

    Parameters
    ----------
    annot : position on the sequence (numeric or numeric string; may be 'nan'
        in mode 2).
    alignments : mode 1: a 3-tuple-like of (alignment iterable, coordinates,
        residue numbers); mode 2: a single alignment object.
    coords, resnums_for_sasa : mode 2 only — CA coordinates and residue
        numbers parallel to the structure sequence (unused in mode 1).
    mode : 1 = PDB/SwissModel/Modbase path, 2 = AlphaFold path.

    Returns
    -------
    (coordWeWant, posAtom, residue_number_we_want), or
    (np.NaN, np.NaN, np.NaN) in mode 2 when annot is 'nan' or > 1400.
    """
    if mode == 1:
        for alignment in alignments[0]:
            # Alignment rendered as 3 text lines: [0]=target seq, [1]=match
            # line, [2]=structure seq; gaps are '.' or '-'.
            alignment = (str(alignment).strip().split('\n'))
            startGap = 0
            # Count leading gaps on the sequence line.
            if alignment[0].startswith('.'):
                for k in alignment[0]:
                    if k == '.' or k == '-':
                        startGap += 1
                    else:
                        break
            countGap = startGap
            countResidue = 0
            # Advance until the annot-th residue of the sequence is reached;
            # countGap accumulates internal gaps passed along the way.
            for j in alignment[0][startGap:]:
                if j == '.' or j == '-':
                    countGap += 1
                else:
                    countResidue += 1
                if countResidue == float(annot):
                    break
            countGap_pdb = 0
            countResidue_pdb = 0
            # Gaps on the structure line up to the alignment column found above.
            for m in alignment[2][0:countResidue + countGap - 1]:
                if m == '.' or m == '-':
                    countGap_pdb += 1
            # 1-based index of the matching CA atom in the structure arrays.
            posAtom = countResidue + countGap - countGap_pdb

            # Leading gaps on the structure line (first aligned structure pos).
            realpdbStart = 0
            for j in alignment[2]:
                if j == '.' or j == '-':
                    realpdbStart += 1
                else:
                    break

            # Position must land on a structure residue and lie past the
            # structure's first aligned position.
            if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
                try:
                    coordinates = alignments[1]
                    residue_numbers = alignments[2]
                    coordWeWant = coordinates[posAtom - 1]
                    residue_number_we_want = residue_numbers[posAtom - 1]

                except:
                    # NOTE(review): bare except; the bare `IndexError` below is
                    # a no-op expression, not a filter. Also,
                    # residue_number_we_want is never assigned on this path or
                    # the else path, so the return below can raise NameError —
                    # confirm against callers.
                    IndexError
                    coordWeWant = 'nan'
            else:
                coordWeWant = 'nan'
            return coordWeWant, posAtom, residue_number_we_want
    if mode == 2:
        if annot != 'nan':
            # AlphaFold models are capped; positions beyond 1400 are skipped.
            if int(annot) <= 1400:
                alignment = (str(alignments).strip().split('\n'))
                startGap = 0
                # Count leading gaps on the sequence line.
                if alignment[0].startswith('.'):
                    for k in alignment[0]:
                        if k == '.' or k == '-':
                            startGap += 1
                        else:
                            break
                countGap = startGap
                countResidue = 0
                # Advance to the annot-th sequence residue (as in mode 1).
                for j in alignment[0][startGap:]:
                    if j == '.' or j == '-':
                        countGap += 1
                    else:
                        countResidue += 1
                    if countResidue == float(annot):
                        break
                countGap_pdb = 0
                countResidue_pdb = 0
                for m in alignment[2][0:countResidue + countGap - 1]:
                    if m == '.' or m == '-':
                        countGap_pdb += 1
                posAtom = countResidue + countGap - countGap_pdb
                realpdbStart = 0
                for j in alignment[2]:
                    if j == '.' or j == '-':
                        realpdbStart += 1
                    else:
                        break
                # Guard against columns past the end of the structure line.
                if len(alignment[2]) > (countResidue + countGap - 1):
                    if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
                        try:
                            coordinates = coords
                            residue_numbers = resnums_for_sasa
                            coordWeWant = coordinates[posAtom - 1]
                            residue_number_we_want = residue_numbers[posAtom - 1]
                        except:
                            # NOTE(review): bare except; `IndexError` below is a
                            # no-op expression, not a filter.
                            IndexError
                            coordWeWant = 'nan'
                            residue_number_we_want = 'nan'
                    else:
                        coordWeWant = 'nan'
                        residue_number_we_want = 'nan'
                    return coordWeWant, posAtom, residue_number_we_want
                else:
                    coordWeWant = 'nan'
                    residue_number_we_want = 'nan'
                    return coordWeWant, posAtom, residue_number_we_want
            else:
                return np.NaN, np.NaN, np.NaN
        else:
            return np.NaN, np.NaN, np.NaN
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode, path_3D_alignment, file_format='gzip'):
    """Align a UniProt-derived sequence against a structure's CA-atom sequence.

    Parses CA ATOM records from the structure file, performs a local
    BLOSUM62 alignment (open gap -11, extend -1) against ``pdbSequence``,
    and writes all alignments to a text file under ``path_3D_alignment``.

    Parameters
    ----------
    identifier : datapoint identifier used in the output file name.
    model_num : model number (used in the mode-2 output file name).
    pdb_path : path to the structure file (plain text or gzip).
    pdbSequence : sequence to align the structure against.
    source : structure provider; 'MODBASE' files use a wider record field.
    chain : chain identifier to filter on (mode 1).
    pdbID : structure identifier (used in the mode-1 output file name).
    mode : 1 = PDB/SwissModel/Modbase, 2 = AlphaFold.
    path_3D_alignment : directory for the alignment output file.
    file_format : 'gzip' (default) or 'txt' — mode 2 only.

    Returns
    -------
    (alignments, coords, resnums_for_sasa) for mode 1 and mode 2;
    None for any other mode (as in the original).
    """
    atomSequence = ''
    coords = []
    resnums_for_sasa = []

    def _is_ca(line):
        # True for a CA ATOM record (standard column layout).
        return line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA'

    def _record(line):
        # Accumulate one-letter residue, xyz coordinates, and residue number.
        nonlocal atomSequence
        atomSequence += threeToOne(line[17:20].strip())
        coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
        resnums_for_sasa.append(line[22:26].strip())

    def _align_and_write(out_path):
        # Local BLOSUM62 alignment; write every alignment to out_path.
        aligner.mode = 'local'
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
        aligner.open_gap_score = -11
        aligner.extend_gap_score = -1
        alignments = list(aligner.align(pdbSequence, atomSequence))
        # BUG FIX: the output handle was never closed in the original.
        with open(out_path, "w") as fout:
            for alignment in alignments:
                fout.write(str(alignment))
                fout.write('\n')
                fout.write('\n')
        return alignments

    if mode == 1:
        with open(pdb_path, encoding="utf8") as fin:
            for line in fin:
                if source != 'MODBASE':
                    # Match the requested chain, or a blank chain column.
                    if _is_ca(line) and (line[21].upper() == chain.upper() or line[21] == ' '):
                        _record(line)
                else:
                    # MODBASE records use a wider record-name field.
                    if line[0:7].strip() == 'ATOM' and line[13:15].strip() == 'CA':
                        _record(line)
        out_path = Path(path_3D_alignment / f'{identifier}_{pdbID}_{str(chain)}_alignment.txt')
        alignments = _align_and_write(out_path)
        return alignments, coords, resnums_for_sasa
    elif mode == 2:
        if file_format == 'txt':
            # BUG FIX: the original opened the undefined name `name` here
            # (NameError); the path argument is pdb_path.
            with open(pdb_path, encoding="utf8") as fin:
                for line in fin:
                    if _is_ca(line):
                        _record(line)
        elif file_format == 'gzip':
            with gzip.open(pdb_path, mode='rb') as fin:
                for raw in fin:
                    line = raw.decode()
                    if _is_ca(line):
                        _record(line)
        out_path = Path(path_3D_alignment / f'{identifier}_{str(model_num)}_3Dalignment.txt')
        alignments = _align_and_write(out_path)
        return alignments, coords, resnums_for_sasa
|
code/add_alignment.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Bio import Align
|
| 2 |
+
from Bio.Align import substitution_matrices
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import streamlit as st
|
| 5 |
+
from Bio.pairwise2 import format_alignment
|
| 6 |
+
from Bio import pairwise2
|
| 7 |
+
from Bio import pairwise2
|
| 8 |
+
from Bio.SubsMat import MatrixInfo as matlist
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
"""
|
| 13 |
+
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
|
| 14 |
+
aligner = Align.PairwiseAligner()
|
| 15 |
+
#print(f'Aligning Datapoint: {identifier}')
|
| 16 |
+
if len(pdbSequence) >= 1:
|
| 17 |
+
f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
|
| 18 |
+
aligner.mode = 'local'
|
| 19 |
+
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
|
| 20 |
+
aligner.open_gap_score = -11
|
| 21 |
+
aligner.extend_gap_score = -1
|
| 22 |
+
alignments = aligner.align(uniprotSequence, pdbSequence)
|
| 23 |
+
alignments = (list(alignments))
|
| 24 |
+
|
| 25 |
+
merge_in_threes = str(alignments[0]).split('\n')
|
| 26 |
+
K = 3
|
| 27 |
+
res = ["".join(str(alignments[0]).split('\n')[idx: idx + K]) for idx in range(len(str(alignments[0]).split('\n')) - K + 1)]
|
| 28 |
+
slice_val = slice(0,len(res),4)
|
| 29 |
+
writtenlist = res[slice_val]
|
| 30 |
+
|
| 31 |
+
new_alignment = []
|
| 32 |
+
for i in writtenlist:
|
| 33 |
+
cont1 = list(filter(None, i.split('target')))
|
| 34 |
+
cont2 = cont1[0].split('query')
|
| 35 |
+
target_pos = (list(filter(None,cont2[0].split(' '))))[0]
|
| 36 |
+
target = (list(filter(None,cont2[0].split(' '))))[1]
|
| 37 |
+
alg_pos = (list(filter(None,cont2[0].split(' '))))[2]
|
| 38 |
+
alg = (list(filter(None,cont2[0].split(' '))))[3]
|
| 39 |
+
query_pos = (list(filter(None,cont2[1].split(' '))))[0]
|
| 40 |
+
query = (list(filter(None,cont2[1].split(' '))))[1]
|
| 41 |
+
if int(target_pos)>0:
|
| 42 |
+
new_target = int(target_pos) * 'X' + target
|
| 43 |
+
else:
|
| 44 |
+
new_target = int(target_pos) * ' ' + target
|
| 45 |
+
|
| 46 |
+
if int(alg_pos)>0:
|
| 47 |
+
new_alg = int(target_pos) * 'X' + target
|
| 48 |
+
else:
|
| 49 |
+
new_alg = int(target_pos) * ' ' + alg
|
| 50 |
+
|
| 51 |
+
if int(query_pos)>0:
|
| 52 |
+
new_query = int(target_pos) * 'X' + target
|
| 53 |
+
else:
|
| 54 |
+
new_query = int(target_pos) * ' ' + target
|
| 55 |
+
|
| 56 |
+
new_alignment.append(new_target+'\n' +new_alg +'\n' +new_query)
|
| 57 |
+
alignment_list = []
|
| 58 |
+
k = 0
|
| 59 |
+
for alignment in new_alignment:
|
| 60 |
+
k += 1
|
| 61 |
+
st.write('COUNT', k)
|
| 62 |
+
st.write('alignment')
|
| 63 |
+
st.write(alignment)
|
| 64 |
+
f.write(str(alignment))
|
| 65 |
+
f.write('\n')
|
| 66 |
+
f.write('\n')
|
| 67 |
+
alignment = (str(alignment).strip().split('\n'))
|
| 68 |
+
alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
|
| 69 |
+
st.write('alignment_updated')
|
| 70 |
+
st.write(alignment)
|
| 71 |
+
alignment_list.append(alignment)
|
| 72 |
+
return alignment_list
|
| 73 |
+
|
| 74 |
+
"""
|
| 75 |
+
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
    """Locally align a UniProt sequence with a structure sequence.

    Runs a local pairwise alignment (BLOSUM62, open gap -11, extend -1),
    writes every alignment to ``{identifier}_alignment.txt`` under
    ``alignment_path``, and returns each alignment as a list of its text
    lines with spaces replaced by '.'.

    Returns None (implicitly) when ``pdbSequence`` is empty, matching the
    original behavior.
    """
    aligner = Align.PairwiseAligner()
    #print(f'Aligning Datapoint: {identifier}')
    if len(pdbSequence) >= 1:
        aligner.mode = 'local'
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
        aligner.open_gap_score = -11
        aligner.extend_gap_score = -1
        alignments = aligner.align(uniprotSequence, pdbSequence)

        # BUG FIX: removed an unused second alignment pass
        # (pairwise2.align.localds with MatrixInfo.blosum62) whose result
        # `alignments2` was never read — it doubled the work and depended on
        # Bio.SubsMat, which is removed in modern Biopython.

        alignment_list = []
        # BUG FIX: the output handle was never closed in the original.
        with open(Path(alignment_path / f'{identifier}_alignment.txt'), "w") as f:
            for alignment in alignments:
                f.write(str(alignment))
                f.write('\n')
                f.write('\n')
                alignment = (str(alignment).strip().split('\n'))
                # Normalize padding: '.' marks every space so downstream
                # column arithmetic treats padding and gaps uniformly.
                alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
                alignment_list.append(alignment)
        return alignment_list
|
| 101 |
+
|
| 102 |
+
def mutation_position_on_pdb(alignment_list, pos):
    """Map a mutation position on the UniProt sequence onto the PDB sequence.

    Each element of ``alignment_list`` is ``[uniprot_line, match_line, pdb_line]``
    where gaps are '.' or '-' and matches are '|' (or 'X' for mismatch columns).

    Args:
        alignment_list: alignments as produced by ``do_alignment``.
        pos: 1-based residue position on the UniProt sequence.

    Returns:
        Tuple ``(pdb_alignStatus, mutationPositionOnPDB, startGap, alignment)``
        where status is 'aligned' / 'aligned*' / 'not_aligned', the PDB
        position is a string (or 'nan'), startGap is the number of leading
        gaps on the UniProt line, and alignment is the one that was used.

    NOTE(review): if no alignment ever reaches a branch that sets
    ``pdb_alignStatus`` this still raises UnboundLocalError at the return,
    exactly as the original did — behavior deliberately preserved.
    """
    which_alignment_to_go = 0
    for alignment in alignment_list:
        which_alignment_to_go += 1
        alignment_uniprot = alignment[0]
        alignment_pdb = alignment[2]

        # Count leading gaps on the UniProt line.
        startGap = 0
        if alignment_uniprot.startswith('.') or alignment_uniprot.startswith('-'):
            for k in alignment_uniprot:
                if k == '.' or k == '-':
                    startGap += 1
                else:
                    break

        # Walk the UniProt line until the pos-th residue is reached, counting gaps.
        countGap = startGap
        countResidue = 0
        canonicalRes = ' '
        pdbRes = ' '
        for j in alignment_uniprot[startGap:]:
            if j == '.' or j == '-':
                countGap += 1
            else:
                countResidue += 1
            if int(countResidue) == int(pos):
                canonicalRes = alignment_uniprot[countResidue + countGap - 1]
                try:
                    pdbRes = alignment_pdb[countResidue + countGap - 1]
                except IndexError:
                    # fix: was a bare `except:` followed by a no-op `IndexError`
                    # expression; narrowed to the exception actually expected.
                    pdbRes = 'nan'
                break

        idx = countResidue + countGap - 1  # column index of the mutation in the alignment
        if (alignment[1][idx] == '|') or (alignment[1][idx] == 'X'):
            # Mutation column is aligned; '*' marks a residue mismatch.
            if canonicalRes == pdbRes:
                pdb_alignStatus = 'aligned'
            elif canonicalRes != pdbRes:
                pdb_alignStatus = 'aligned*'
            # Convert the alignment column into a 1-based PDB residue number
            # by subtracting the gaps on the PDB line before that column.
            countGap_pdb = 0
            pdbRes = ' '
            for j in alignment_pdb[0:idx]:
                if j == '.' or j == '-':
                    countGap_pdb += 1
            if alignment_pdb[idx] == '.' or alignment_pdb[idx] == '-':
                mutationPositionOnPDB = 'nan'
                posPDB = 'nan'
            else:
                posPDB = countResidue + countGap - countGap_pdb
                mutationPositionOnPDB = str(posPDB)
            break
        elif (canonicalRes == pdbRes) and ((alignment[1][idx] == '.') or
                                           (alignment[1][idx] == '-')):
            # fix: original read `alignment[1][poscountResidue + countGap - 1]`
            # (undefined name `poscountResidue`) — raised NameError whenever the
            # residues matched but the match-line character was '-'.
            pdb_alignStatus = 'not_aligned'
            mutationPositionOnPDB = 'nan'
        elif (canonicalRes != pdbRes) and ((alignment[1][idx] == '.') or
                                           (alignment[1][idx] == '-')):
            pdb_alignStatus = 'not_aligned'
            mutationPositionOnPDB = 'nan'
        elif alignment_pdb[idx] == '.' or alignment_pdb[idx] == '-':
            mutationPositionOnPDB = 'nan'
            posPDB = 'nan'
    return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def find_position_on_pdb_for_range_annotations(posAnnotation, startGap, alignment_to_use):
    """Map a range annotation 'start-end' (UniProt numbering) onto PDB numbering.

    ``posAnnotation`` is a string like '12-45'. ``alignment_to_use`` is
    ``[uniprot_line, match_line, pdb_line]`` with '.'/'-' as gap characters.

    Returns:
        ``(annotation_on_up_start, annotation_on_up_end,
           annotation_on_pdb_start, annotation_on_pdb_end)`` — the first two
        are alignment-column positions, the last two are PDB residue numbers
        or the string 'nan' when the position falls in a gap / out of range.

    NOTE(review): the two `except:` clauses below are bare excepts whose body
    is a no-op `IndexError` expression — they silently swallow *any* error
    (the trailing comment says this covers isoform/out-of-range lookups).
    Preserved as-is; several variables (e.g. ``count_gap_pdb``) can stay
    unbound if an exception fires early — TODO confirm callers never hit that.
    """
    annotation_on_pdb_start = 'nan'
    annotation_on_pdb_end = 'nan'
    # Locate the start position: walk the UniProt line counting gaps until
    # the pos1-th residue is reached.
    pos1 = int(posAnnotation.split('-')[0])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos1):  # count gaps until the first position
            break
    annotation_on_up_start = int(pos1) + int(count_gap)

    # Same walk for the end position of the range.
    pos2 = int(posAnnotation.split('-')[1])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos2):  # count gaps until the first position
            break

    annotation_on_up_end = int(pos2) + int(count_gap)
    try:
        pdb_residue_start = alignment_to_use[2][annotation_on_up_start - 1].strip()
        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
            # Start column is a gap on the PDB line: slide the start forward to
            # the first aligned ('|' or 'X') non-gap column inside the range.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '.') and \
                        (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '-') and \
                        ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                         (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                ((alignment_to_use[1][annotation_on_up_start - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_start - 1] == '-')):
            # PDB has a residue there but the column is not aligned: slide the
            # start forward to the first aligned column inside the range.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        # Convert the (possibly shifted) start column to a PDB residue number
        # by subtracting gaps on the PDB line before it.
        count_gap_pdb = 0
        if annotation_on_up_start != 'nan':
            for q in alignment_to_use[2][0:annotation_on_up_start - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            if alignment_to_use[1][annotation_on_up_start] == '-' or alignment_to_use[1][annotation_on_up_start] == '.':
                annotation_on_pdb_start = 'nan'
            else:
                annotation_on_pdb_start = int(annotation_on_up_start) - count_gap_pdb
        else:
            annotation_on_pdb_start = 'nan'
    except:
        IndexError
    try:
        # Mirror of the block above, for the end of the range.
        pdb_residue_end = alignment_to_use[2][annotation_on_up_end - 1].strip()
        if pdb_residue_end == '.' or pdb_residue_end == '-':
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        elif (pdb_residue_end != '.') and (pdb_residue_end != '-') and \
                ((alignment_to_use[1][annotation_on_up_end - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_end - 1] == '-')):
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        count_gap_pdb = 0
        if annotation_on_up_end != 'nan':
            for q in alignment_to_use[2][0:annotation_on_up_end - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            # NOTE(review): `or` binds looser than `and`, so the 'nan' check
            # below only guards the '.' comparison — preserved as written.
            if alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                    annotation_on_up_end - 1] == '.' and annotation_on_pdb_start == 'nan':
                annotation_on_pdb_end = 'nan'
            elif alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                    annotation_on_up_end - 1] == '.' and annotation_on_pdb_start != 'nan':
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
            else:
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
        else:
            annotation_on_pdb_end = 'nan'
    except:
        IndexError  # Say isoform 2 is matched with the length 100, but canonical is 150 aa long. If there is an annotation at 105. position, for the isoform it throws an index error.

    # If only the end mapped, derive the start from the last gap count; a
    # degenerate one-column range is discarded ('nan', 'nan').
    if annotation_on_pdb_start == 'nan' and annotation_on_pdb_end != 'nan':
        annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
        if annotation_on_pdb_start == annotation_on_pdb_end:
            annotation_on_pdb_start = 'nan'
            annotation_on_pdb_end = 'nan'
    return annotation_on_up_start, annotation_on_up_end, annotation_on_pdb_start, annotation_on_pdb_end
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifier):
    """Map a list of annotation positions (UniProt numbering) onto PDB numbering.

    ``annot_positions`` is a stringified list such as "['12', '30-45']";
    single positions and 'start-end' ranges are handled separately. Range
    entries are delegated to ``find_position_on_pdb_for_range_annotations``.

    Args:
        annot_positions: stringified position list, or 'nan'.
        startGap: leading-gap count from ``mutation_position_on_pdb``.
        alignment_to_use: ``[uniprot_line, match_line, pdb_line]``.
        identifier: unused in this function body; kept for interface
            compatibility with callers.

    Returns:
        List of mapped positions ('nan' entries filtered out); empty list
        when ``annot_positions`` is 'nan'.
    """
    newpos = []
    if annot_positions != 'nan':
        # Strip the list syntax: quotes and brackets, then split on commas.
        annot_positions = (str(annot_positions).replace("'", ''))
        annot_positions = (str(annot_positions).replace('[', ''))
        annot_positions = (str(annot_positions).replace("]", ''))
        positionList_perAnnotation = annot_positions.split(',')
        positionList_perAnnotation = [h.strip() for h in positionList_perAnnotation]

        position_start_on_pdb = 'nan'
        position_end_on_pdb = 'nan'
        try:
            positionList_perAnnotation = [i for i in positionList_perAnnotation if i != 'nan']
        except:
            # NOTE(review): bare except whose body is a no-op TypeError
            # expression — swallows everything. Preserved as-is.
            TypeError
        for position in range(len(positionList_perAnnotation)):
            if ('-' not in str(positionList_perAnnotation[position])) and (str(positionList_perAnnotation[position]) != '?') and (str(positionList_perAnnotation[position]) != '') and (len(str(positionList_perAnnotation[position])) != 0):
                # Single position: walk the UniProt line counting gaps until
                # the annotated residue is reached.
                count_gap = startGap
                count_residue = 0
                for j in alignment_to_use[0][startGap:]:
                    if j == '.' or j == '-':
                        count_gap += 1
                    else:
                        count_residue += 1
                    try:
                        if int(count_residue) == int(positionList_perAnnotation[position]):
                            break
                    except:
                        ValueError

                annotation_on_up = int(positionList_perAnnotation[position]) + int(count_gap)
                try:
                    pdb_residue_start = alignment_to_use[2][annotation_on_up - 1].strip()
                except:
                    IndexError
                    pdb_residue_start = 'nan'
                if pdb_residue_start != 'nan':
                    try:
                        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
                            # Gap on the PDB line: accept only if the column is
                            # non-gap and aligned ('|' or 'X').
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][ran] != '.') and \
                                        (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][
                                             ran] != '-') and \
                                        ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][
                                              ran] == '|') or
                                         (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][
                                              ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                                ((alignment_to_use[1][annotation_on_up - 1] == '.') or (
                                        alignment_to_use[1][annotation_on_up - 1] == '-')):
                            # Residue present but column unaligned: shift to the
                            # first aligned column (one-column window here).
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == '|') or
                                        (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        # Column index -> PDB residue number: subtract PDB-line
                        # gaps before the column.
                        count_gap_pdb = 0
                        for q in alignment_to_use[2][0:annotation_on_up - 1]:
                            if q == '.' or q == '-':
                                count_gap_pdb += 1
                        if alignment_to_use[1][annotation_on_up] == '-' or alignment_to_use[1][
                                annotation_on_up] == '.':
                            annotation_on_pdb = 'nan'
                        else:
                            annotation_on_pdb = int(annotation_on_up) - count_gap_pdb

                        if count_gap_pdb == annotation_on_up:
                            annotation_on_pdb = 'nan'
                        try:
                            if alignment_to_use[2][count_gap_pdb + annotation_on_pdb - 1] == '.' or alignment_to_use[2][
                                    count_gap_pdb + annotation_on_pdb - 1] == '-':
                                annotation_on_pdb = 'nan'
                        except:
                            IndexError
                            annotation_on_pdb = 'nan'
                    except:
                        IndexError
                        annotation_on_pdb = 'nan'

                    newpos.append(annotation_on_pdb)

            elif ('-' in str(positionList_perAnnotation[position])) and (
                    str(positionList_perAnnotation[position]) != '?') and (
                    str(positionList_perAnnotation[position]) != ' ') and (
                    len(str(positionList_perAnnotation[position])) != 0):
                # Range entry ('start-end'): delegate both endpoints to the
                # range mapper (called twice; indices 2 and 3 are the PDB ends).
                try:
                    position_start_on_pdb = \
                        find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
                                                                   startGap, alignment_to_use)[2]
                    position_end_on_pdb = \
                        find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
                                                                   startGap, alignment_to_use)[3]
                except:
                    ValueError
                newpositions = str(position_start_on_pdb) + '-' + str(position_end_on_pdb)
                newpos.append(newpositions)
            else:
                pass
        try:
            newpos = [i for i in newpos if i != 'nan']
        except:
            TypeError
    return newpos
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def final_stage(df, annotation_list, alignment_path):
    """Align each row's UniProt/PDB sequence pair and remap positions.

    For every row: runs ``do_alignment``, maps the mutation position onto the
    PDB sequence, remaps every annotation column in ``annotation_list`` and
    the domain boundaries (``domStart``/``domEnd``) onto PDB numbering.

    Args:
        df: dataframe with uniprotID/pdbID/chain/sequences/pos/dom* columns.
        annotation_list: names of annotation columns to remap in place.
        alignment_path: directory passed through to ``do_alignment``.

    Returns:
        The same dataframe, mutated in place and cast to str.
    """
    for i in df.index:
        identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
        alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)
        # fix: mutation_position_on_pdb was invoked four times per row (once
        # per tuple element) — each call re-walks the alignment. Call once and
        # unpack. Also removed a stray debug `print()`.
        align_status, mut_pos_on_pdb, startGap, alignment_to_use = \
            mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])
        df.at[i, 'pdb_alignStatus'] = align_status
        df.at[i, 'mutationPositionOnPDB'] = mut_pos_on_pdb

        for annot in annotation_list:
            df.at[i, annot] = annotation_pos_on_pdb(df.at[i, annot], startGap, alignment_to_use, identifier)

        # Domain boundaries: only remap real domains (-1/-1.0 mark "no domain").
        if str(df.at[i, 'domStart']) != 'nan' and str(df.at[i, 'domEnd']) != 'nan' and \
                ((str(df.at[i, 'domStart']) != '-1' and str(df.at[i, 'domEnd']) != '-1' and
                  str(df.at[i, 'domStart']) != '-1.0' and str(df.at[i, 'domEnd']) != '-1.0')):
            # Drop any '.0' float suffix before building the 'start-end' string.
            domainLoc = str(df.at[i, 'domStart']).split('.')[0] + '-' + str(df.at[i, 'domEnd']).split('.')[0]
            domain_pos = find_position_on_pdb_for_range_annotations(domainLoc, startGap, alignment_to_use)
            df.at[i, 'domainStartonPDB'] = domain_pos[2]
            df.at[i, 'domainEndonPDB'] = domain_pos[3]
        elif str(df.at[i, 'domStart']) != '-1' or str(df.at[i, 'domEnd']) != '-1' or \
                str(df.at[i, 'domStart']) != '-1.0' or str(df.at[i, 'domEnd']) != '-1.0':
            df.at[i, 'domainStartonPDB'] = 'nan'
            df.at[i, 'domainEndonPDB'] = 'nan'

    df = df.astype(str)
    return df
|
| 418 |
+
|
| 419 |
+
def alignment(dataframe_to_align, annotation_list, alignment_path):
    """Public entry point: remap annotations/domains onto PDB numbering.

    Thin wrapper around ``final_stage`` kept for interface stability.
    fix: removed the unused local ``domainList = ['domStart', 'domEnd']``.
    """
    result = final_stage(dataframe_to_align, annotation_list, alignment_path)
    return result
|
| 423 |
+
#
|
code/add_annotations.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ssl
|
| 2 |
+
import requests as r
|
| 3 |
+
from decimal import *
|
| 4 |
+
import numpy as np
|
| 5 |
+
def add_annotations(dataframe):
    """Download UniProt feature (FT) annotations and attach them per row.

    For every distinct ``uniprotID`` the flat-text entry is fetched from
    uniprot.org and FT lines for 30 feature types are parsed into position
    strings; positions are then normalized and per-type binary columns
    ('<name>Binary' = '1'/'0'/NaN) are derived against the row's ``pos``.

    NOTE(review): fetches over plain http with SSL verification globally
    disabled (``_create_unverified_context``), and the legacy
    ``/uniprot/<id>.txt`` endpoint is used — both worth revisiting.

    Returns:
        The dataframe (cast to str) with annotation and binary columns added.
    """
    print('Downloading UniProt sequence annotations...\n')
    # Disables certificate verification process-wide for the downloads below.
    ssl._create_default_https_context = ssl._create_unverified_context

    # UniProt FT keys, and the friendlier column names they are renamed to
    # (the two lists are index-aligned).
    original_annot_name = ['DISULFID', 'INIT_MET', 'INTRAMEM', 'VARIANT', 'DNA_BIND', 'ACT_SITE', 'NP_BIND', 'LIPID',
                           'SITE',
                           'TRANSMEM', 'CROSSLNK', 'MUTAGEN', 'STRAND', 'HELIX', 'TURN', 'METAL', 'REPEAT', 'TOPO_DOM',
                           'CA_BIND', 'BINDING', 'REGION', 'SIGNAL', 'MOD_RES', 'ZN_FING', 'MOTIF', 'COILED', 'PEPTIDE',
                           'TRANSIT', 'CARBOHYD', 'PROPEP']
    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
                       'region',
                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                       'transitPeptide', 'glycosylation', 'propeptide']

    dataframe = dataframe.reset_index().drop(['index'], axis=1)

    # Pre-create one (empty) column per feature type.
    for annot in original_annot_name:
        dataframe[annot] = ''

    for protein in list(set(dataframe.uniprotID.to_list())):
        print('Downloading annotations for ' + protein)
        uniprot_entry = r.get("http://www.uniprot.org/uniprot/" + protein + ".txt")
        uniprot_entry = uniprot_entry.text.split('\n')

        # Keep only primary FT lines (skip evidence/ECO/note continuation lines);
        # each entry becomes [feature_key, position_spec, ...].
        annot_for_protein = []
        for annotation in original_annot_name:
            for line in uniprot_entry:
                if annotation.strip() in line and line.startswith(
                        'FT') and 'evidence' not in line and 'ECO' not in line and 'note' not in line:
                    annot_for_protein.append(list(filter(None, line.split(' ')))[1:])
        # Accumulate '; '-separated position specs per feature column for all
        # rows of this protein.
        for select in annot_for_protein:
            if select[0] not in dataframe.columns:
                dataframe.loc[dataframe.uniprotID == protein, select[0]] = str((select[1] + '; '))
            else:
                dataframe.loc[dataframe.uniprotID == protein, select[0]] += str((select[1] + '; '))
    for i in range(len(original_annot_name)):
        dataframe = dataframe.rename(columns={original_annot_name[i]: annotation_list[i]})

    # Fix annotation positions
    print('Processing positions...\n')
    for i in dataframe.index:
        # The 30 annotation columns were appended last, hence columns[-30:].
        for annot in dataframe.columns[-30:]:
            if annot != 'disulfide':
                if dataframe.at[i, annot] != 'nan':
                    # Split the accumulated '; ' string into a clean list and
                    # normalize UniProt's '..' range separator to '-'.
                    dataframe.at[i, annot] = ([x for x in [k.strip() for k in dataframe.at[i, annot].split(';')] if x])
                    if '..' not in str(dataframe.at[i, annot]):
                        pass
                    elif '..' in str(dataframe.at[i, annot]):
                        dataframe.at[i, annot] = str(dataframe.at[i, annot]).replace('..', '-')
            else:
                # Disulfide bonds: flatten 'a..b' pairs into individual positions.
                disulfide_annot = []  # NOTE(review): never used below.
                if dataframe.at[i, annot] != 'nan':
                    dataframe.at[i, annot] = dataframe.at[i, annot].split(';')
                    # NOTE(review): the comprehension variable `i` shadows the
                    # row index here; harmless in Py3 comprehension scope.
                    dataframe.at[i, annot] = [i.split('..') for i in dataframe.at[i, annot]]
                    dataframe.at[i, annot] = [e for v in dataframe.at[i, annot] for e in v]
                    dataframe.at[i, annot] = [i for i in dataframe.at[i, annot] if i != ' ']

    # Add binary annotations
    print('Adding binary annotations...\n')
    dataframe = dataframe.astype('str')
    for i in dataframe.index:
        for k in annotation_list:  # get the positions of each attribute as a list
            txt = k + 'Binary'
            dataframe.at[i, txt] = Decimal('nan')
            try:
                for positions in dataframe.at[i, k].split(','):
                    position = positions.strip('[').strip(']').replace("'", "")
                    # '1' if the mutation sits on an annotated position, '0' if
                    # annotations exist elsewhere, NaN if none parseable.
                    if position != 'nan' and position != '' and '-' not in position and int(
                            dataframe.at[i, 'pos']) == int(position):
                        dataframe.at[i, txt] = '1'
                        break
                    elif position != 'nan' and position != '' and '-' not in position and int(
                            dataframe.at[i, 'pos']) != int(position):
                        dataframe.at[i, txt] = '0'
                    elif position != 'nan' and position != '' and '-' in position:
                        # Strict '<' on both sides: range boundaries count as '0'.
                        if int(position.split('-')[0]) < int(dataframe.at[i, 'pos']) < int(position.split('-')[1]):
                            dataframe.at[i, txt] = '1'
                            break
                        else:
                            dataframe.at[i, txt] = '0'
            except:
                # NOTE(review): bare except with a no-op ValueError expression;
                # unparseable specs leave the binary column at NaN.
                ValueError

    # Final corrections

    # Cells that are literally "['?']" or "[]" become 'nan'.
    dataframe = dataframe.replace({'[\'?\']': 'nan'})
    dataframe = dataframe.replace({'[]': 'nan'})
    return dataframe
|
| 95 |
+
|
code/add_domains.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
def add_domains(data, path_to_domains):
    """Attach the best-matching protein domain to each data point.

    Merges a space-delimited domain table (proteinID, domain, domStart,
    domEnd, ...) onto ``data`` by uniprotID, computes each row's distance
    from the mutation position to the domain, then keeps exactly one domain
    per datapoint: a domain containing the mutation (distance 0) wins, ties
    broken by how often that domain occurs; otherwise the closest domain.

    Args:
        data: dataframe with 'uniprotID', 'pos' and 'datapoint' columns.
        path_to_domains: path to the space-delimited domain file.

    Returns:
        Deduplicated dataframe (one row per datapoint), cast to str.

    NOTE(review): rows without a domain get the *string* 'nan' in 'distance'
    while others get ints — the later sort_values on that mixed column is
    assumed to work for the data seen in practice; TODO confirm.
    """
    domains = pd.read_csv(path_to_domains, delimiter=' ')
    data = data.merge(domains, right_on='proteinID', left_on='uniprotID', how='left')
    data = data.drop(['proteinID'], axis=1)
    # Label each data point as range or notRange based on the relative distance of mutation and domain boundaries.
    data = data.astype('str')
    data.domStart = data.domStart.astype('float')
    data.domEnd = data.domEnd.astype('float')

    for i in data.index:
        if data.at[i, 'domain'] != 'nan':
            # distance 0 == mutation lies inside the domain boundaries.
            if int(data.at[i, 'domStart']) <= int(data.at[i, 'pos']) <= int(data.at[i, 'domEnd']):
                data.at[i, 'distance'] = 0
            else:
                # Otherwise: distance to the nearer boundary.
                distance = min(abs(int(data.at[i, 'domStart']) - int(data.at[i, 'pos'])),
                               abs(int(data.at[i, 'domEnd']) - int(data.at[i, 'pos'])))
                data.at[i, 'distance'] = int(distance)
        else:
            data.at[i, 'distance'] = 'nan'

    data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)  # Distances will be sorted.

    # Keep the one with the least distance. But we may have more than one range domains for a datapoint if distance = 0.
    # For this reason first we need to separate range ones so that when we take the first occurance to get the closest one
    # for non range ones, other distance=0 ones wont disappear.

    data_range = data[data.distance == 0]
    data_out_range = data[data.distance != 0]

    # For the range ones, find the most occurance

    dom = []
    for i in data_range.index:
        dom.append(data_range.at[i, 'domain'])

    domainCount = Counter(dom)  # Occurance of domains.

    # For out of range ones, take the closest distance.
    data_out_range = data_out_range.drop_duplicates(['datapoint'], keep='first')  # Already sorted above.
    # Rank in-range candidates by how common their domain is overall and keep
    # the most frequent one per datapoint.
    domain_counts = pd.DataFrame(domainCount.items(), columns=['domain', 'count'])
    data_range_counts = data_range.merge(domain_counts, on='domain')
    data_range_counts = data_range_counts.sort_values(['datapoint', 'count'])
    data_range_counts = data_range_counts.drop_duplicates(['datapoint'], keep='last')  # Take with the higher count.
    data_range_counts = data_range_counts.drop(['count'], axis=1)

    # Merge them back together

    frames = [data_range_counts, data_out_range]
    data = pd.concat(frames, sort=False)  # Here when you concat two data frames, we might have range and not range with
    # min distance for the same data point. Delete the one coming from notRange one.
    data = data.sort_values(['datapoint', 'distance']).reset_index(drop=True)
    data = data.drop_duplicates(['datapoint'], keep='first')
    data = data.astype(str)
    return data
|
code/add_interface_pos.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def get_interface_positions(dataframe, column1, column2):
    """Collect interface residue positions per protein from an IRES table.

    For each row, the proteins named in ``column1`` and ``column2`` have
    bracketed interface-residue strings in the matching ``*_IRES`` columns,
    e.g. "[1,2,10-14]". Strings for the same protein are concatenated, then
    parsed: single numbers become ints and 'a-b' spans are expanded to every
    position in the inclusive range.

    Returns:
        Dict mapping protein identifier -> list of interface positions
        (single positions first, then the expanded ranges).
    """
    interface_positions = {}

    # Accumulate the raw bracketed strings, merging repeats of a protein by
    # splicing the lists together ("...]"+",..." -> one bracketed list).
    for row in dataframe.index:
        for col in (column1, column2):
            protein = dataframe.at[row, col]
            ires = dataframe.at[row, str(col + '_IRES')]
            if ires == '[]':
                continue  # empty interface list: nothing to record
            if protein not in interface_positions:
                interface_positions[protein] = ires
            else:
                interface_positions[protein] = interface_positions[protein].strip(']') + ',' + ires.strip('[')

    # Parse each accumulated string into concrete integer positions.
    try:
        for key, value in interface_positions.items():
            singles = []
            expanded = []
            if value != '[]':
                tokens = value.split(',')
                tokens[0] = str(tokens[0]).strip('[')
                tokens[-1] = str(tokens[-1]).strip(']')
                for token in tokens:
                    if '-' in token:
                        low, high = token.split('-')[0], token.split('-')[1]
                        for position in range(int(low), int(high) + 1):
                            expanded.append(position)
                    else:
                        singles.append(int(token))
                interface_positions[key] = singles + expanded
    except:
        # Preserved from the original: a bare except whose body merely names
        # ValueError (no-op) — any parse failure is silently ignored.
        ValueError

    return interface_positions
|
code/add_sasa.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import glob
|
| 2 |
+
import ssbio.utils
|
| 3 |
+
import subprocess
|
| 4 |
+
import ssbio
|
| 5 |
+
import os.path as op
|
| 6 |
+
from add_3Dalignment import *
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import gzip
|
| 10 |
+
import shutil
|
| 11 |
+
import streamlit as st
|
| 12 |
+
|
| 13 |
+
def run_freesasa(infile, outfile, include_hetatms=True, outdir=None, force_rerun=False, file_type='gzip'):
    """Run the external ``freesasa`` tool on a PDB (or gzipped PDB) file.

    Gzipped input is decompressed to a temporary ``file_temp.pdb`` first,
    since freesasa reads plain PDB. The RSA-format result is written to
    ``outfile`` (joined onto ``outdir`` when given); existing results are
    kept unless ``force_rerun`` is set (via ``ssbio.utils.force_rerun``).

    Args:
        infile: input structure path (.pdb or .pdb.gz, per ``file_type``).
        outfile: output file name.
        include_hetatms: pass --hetatm to freesasa.
        outdir: optional directory prefix for ``outfile``.
        force_rerun: recompute even if ``outfile`` exists.
        file_type: 'pdb' or 'gzip'; any other value is a silent no-op
            (original behavior kept).

    Returns:
        The (joined) output file path.
    """
    if not outdir:
        outdir = ''
    outfile = op.join(outdir, outfile)

    used_temp = False
    if file_type == 'gzip':
        # Decompress to a scratch copy freesasa can read.
        with gzip.open(infile, 'rb') as f_in:
            with open('file_temp.pdb', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        infile = 'file_temp.pdb'
        used_temp = True

    # fix: the pdb and gzip branches duplicated the exact same invocation
    # code — unified here.
    if file_type in ('pdb', 'gzip'):
        if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
            if include_hetatms:
                shell_command = 'freesasa --format=rsa --hetatm {} -o {}'.format(infile, outfile)
            else:
                shell_command = 'freesasa --format=rsa {} -o {}'.format(infile, outfile)
            # NOTE(review): shell=True with interpolated paths is
            # injection-prone if paths can contain shell metacharacters;
            # kept to preserve behavior, but a list argv would be safer.
            command = subprocess.Popen(shell_command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       shell=True)
            out, err = command.communicate()

    # fix: the temporary decompressed copy was never cleaned up and leaked
    # into the working directory on every gzip call.
    if used_temp and op.exists('file_temp.pdb'):
        os.remove('file_temp.pdb')

    return outfile
|
| 49 |
+
|
| 50 |
+
def calculate_freesasa(ID, model_num, existing_free_sasa, path_to_input, path_to_output_files, file_type='gzip'):
    """Compute the FreeSASA surface area for one AlphaFold model.

    Builds the AlphaFold file name ``AF-<ID>-F<model_num>-...`` and hands it
    to ``run_freesasa``, writing the result under
    ``<path_to_output_files>/freesasa_files/``. IDs already present in
    ``existing_free_sasa`` are skipped.
    """
    print('Calculating surface area...\n')
    # Derive the model-version tag (e.g. 'model_v1') from the first structure
    # file found in the input directory; raises IndexError if none exist,
    # matching the original behavior.
    first_structure = glob.glob(str(Path(path_to_input / '*')))[0]
    version_tag = first_structure.split('-')[-1].split('.')[0]

    if ID in existing_free_sasa:
        return  # already computed — nothing to do

    if file_type == 'gzip':
        structure_name = f'AF-{ID}-F{model_num}-{version_tag}.pdb.gz'
    elif file_type == 'pdb':
        structure_name = f'AF-{ID}-F{model_num}-model_v1.pdb'
    else:
        return  # unknown file_type: silent no-op, as before

    run_freesasa(Path(path_to_input / structure_name),
                 Path(path_to_output_files / f'freesasa_files/{structure_name}.txt'),
                 include_hetatms=True, outdir=None, force_rerun=False)
|
| 66 |
+
|
| 67 |
+
def sasa(source, pdbID, uniprotID, sasa_pos, wt, mode, path_to_output_files, file_type='gzip'):
    """Look up the solvent-accessible surface area for one residue position.

    Scans the FreeSASA ``.rsa``-style output files under
    ``path_to_output_files/freesasa_files`` for the file matching the given
    PDB/model identifier (``mode == 1``) or UniProt accession (``mode == 2``),
    then reads the SASA value at ``sasa_pos`` from the fixed-width columns.

    Returns the SASA value as a string; the value is suffixed with ``'*'``
    when the residue found at that position does not match the expected
    wild-type amino acid ``wt``, and ``'nan'`` when nothing could be matched.
    """
    sasa = 'nan'
    if mode == 1:
        for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
            if source == 'PDB':
                fname = str(filename).split('.')[0].split('/')[-1].upper()
            elif source == 'MODBASE':
                fname = str(filename).split('.')[0].split('/')[-1]
            elif source == 'SWISSSMODEL':
                fname = str(filename).split('_')[2]
            if pdbID == fname:
                # Fixed-width parse: columns 10-13 hold the residue number,
                # 4-7 the three-letter residue name, 22-28 the SASA value.
                with open(filename, 'r') as fh:
                    for k in fh.readlines():
                        if k.strip()[10:13] == sasa_pos:
                            residue = str(k[4:7].strip())
                            if wt == threeToOne(residue):
                                return str(k[22:28]).strip('\n')
                            # Residue mismatch: flag the value with '*'.
                            return str(k[22:28]).strip('\n') + '*'
            else:
                # NOTE(review): this bails out on the FIRST file whose name
                # does not match, so later files are never inspected.
                # Preserved as-is from the original (marked '#######' there);
                # confirm whether 'continue' was intended.
                return 'nan'
        return sasa

    if mode == 2:
        # NaN is the only value unequal to itself; the original compared
        # `sasa_pos != np.NaN`, which is always True even for NaN.
        if sasa_pos == sasa_pos:
            if file_type == 'pdb':
                for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
                    # Fix: the original called .split('/') on a *list*
                    # (AttributeError); mirror the working gzip branch.
                    fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].upper()
                    if uniprotID == fname:
                        with open(filename, 'r') as fh:
                            for k in fh.readlines():
                                if k.strip()[10:13] == sasa_pos:
                                    residue = str(k[4:7].strip())
                                    if wt == threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n')
                                    else:
                                        sasa = str(k[22:28]).strip('\n') + '*'
                        return sasa
            elif file_type == 'gzip':
                for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
                    # AlphaFold names look like AF-<accession>-F<n>-...; field 1
                    # after splitting on '-' is the UniProt accession.
                    fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].split('-')[1].upper()
                    if uniprotID == fname:
                        with open(filename, 'r') as fh:
                            for k in fh.readlines():
                                if str(k.strip()[10:13]) == str(sasa_pos):
                                    residue = str(k[4:7].strip())
                                    if wt == threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n')
                                    else:
                                        sasa = str(k[22:28]).strip('\n') + '*'
                                # Fix: the original reset sasa to 'nan' on every
                                # non-matching line, clobbering a value already
                                # found on an earlier line of the file.
                        return sasa
        else:
            sasa = 'nan'
            return sasa
    # Fallback (e.g. no file matched, or an unrecognised mode/file_type):
    # return 'nan' instead of implicitly returning None.
    return sasa
|
code/add_sequence.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests as r
|
| 2 |
+
from io import StringIO
|
| 3 |
+
from Bio import SeqIO
|
| 4 |
+
import xml.etree.ElementTree as ET
|
| 5 |
+
|
| 6 |
+
def get_uniprot_seq(protein_id):
    """Fetch the canonical FASTA sequence for a UniProt accession.

    Downloads ``<accession>.fasta`` from uniprot.org and parses it with
    Biopython.

    Parameters
    ----------
    protein_id : str
        UniProt accession.

    Returns
    -------
    str
        The protein sequence, or an empty string when no FASTA record was
        returned (unknown or obsolete accession).
    """
    print('Fetching UniProt Sequences for ID: ', protein_id)
    baseUrl = "http://www.uniprot.org/uniprot/"
    currentUrl = baseUrl + protein_id + ".fasta"
    response = r.post(currentUrl)
    cData = ''.join(response.text)
    Seq = StringIO(cData)
    pSeq = list(SeqIO.parse(Seq, 'fasta'))
    try:
        return str(pSeq[0].seq)
    except IndexError:
        # Fix: the original used a bare `except:` with `IndexError` as a
        # no-op expression, silently swallowing *every* exception. Only the
        # empty-record case is expected here.
        return str('')
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_isoforms(protein_id):
    """Fetch all isoform sequences for a UniProt accession from the EBI API.

    Parameters
    ----------
    protein_id : str
        UniProt accession.

    Returns
    -------
    dict
        Mapping of isoform accession -> sequence; empty when the entry has
        no isoforms or the service response could not be parsed.
    """
    print('Fetching UniProt Isoforms for ID: ', protein_id)
    try:
        # isoform accession -> sequence
        isoforms = dict()
        req = r.get('https://www.ebi.ac.uk/proteins/api/proteins/{}/isoforms'.format(protein_id))
        # Parse the returned UniProt XML.
        uniprot = ET.fromstring(req.text)
        for isoform in uniprot:
            seq = isoform.find('{http://uniprot.org/uniprot}sequence')
            iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
            if seq.text and iso_accession.text:
                isoforms[iso_accession.text] = seq.text
        return isoforms
    except (AttributeError, ET.ParseError):
        # Fix: the original used a bare `except:` with `AttributeError` as a
        # no-op expression. AttributeError covers entries without isoform
        # elements; ParseError covers a non-XML (e.g. 404/error) response.
        return {}
|
code/add_structure.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import time
|
| 3 |
+
import json
|
| 4 |
+
import zlib
|
| 5 |
+
from xml.etree import ElementTree
|
| 6 |
+
from urllib.parse import urlparse, parse_qs, urlencode
|
| 7 |
+
import requests
|
| 8 |
+
from requests.adapters import HTTPAdapter, Retry
|
| 9 |
+
from unipressed import IdMappingClient
|
| 10 |
+
|
| 11 |
+
## Code adapted from UniProt documentation.
|
| 12 |
+
def get_pdb_ids_2(protein_id):
    """Map a UniProt accession to PDB identifiers via the UniProt REST API.

    Adapted from the official UniProt ID-mapping documentation: submits an
    asynchronous mapping job (UniProtKB -> PDB), polls until it completes,
    then pages through and merges the results.

    Parameters
    ----------
    protein_id : str
        UniProt accession to map.

    Returns
    -------
    list of str
        PDB identifiers mapped from the accession; empty list when the job
        finished without producing any results.
    """
    POLLING_INTERVAL = 5
    API_URL = "https://rest.uniprot.org"

    # Retry transient server-side errors instead of failing immediately.
    retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retries))

    def check_response(response):
        # Print the server's JSON error payload before re-raising.
        try:
            response.raise_for_status()
        except requests.HTTPError:
            print(response.json())
            raise

    def submit_id_mapping(from_db, to_db, ids):
        # Start the mapping job and return its job id.
        request = requests.post(
            f"{API_URL}/idmapping/run",
            data={"from": from_db, "to": to_db, "ids": ids},
        )
        check_response(request)
        return request.json()["jobId"]

    def get_next_link(headers):
        # UniProt paginates via an RFC 5988 Link header with rel="next".
        re_next_link = re.compile(r'<(.+)>; rel="next"')
        if "Link" in headers:
            match = re_next_link.match(headers["Link"])
            if match:
                return match.group(1)

    def check_id_mapping_results_ready(job_id):
        # Poll until the job leaves the RUNNING state; any other explicit
        # status is treated as a failure.
        while True:
            request = session.get(f"{API_URL}/idmapping/status/{job_id}")
            check_response(request)
            j = request.json()
            if "jobStatus" in j:
                if j["jobStatus"] == "RUNNING":
                    print(f"Retrying in {POLLING_INTERVAL}s")
                    time.sleep(POLLING_INTERVAL)
                else:
                    raise Exception(j["jobStatus"])
            else:
                return bool(j["results"] or j["failedIds"])

    def get_batch(batch_response, file_format, compressed):
        # Yield each subsequent page of results.
        batch_url = get_next_link(batch_response.headers)
        while batch_url:
            batch_response = session.get(batch_url)
            batch_response.raise_for_status()
            yield decode_results(batch_response, file_format, compressed)
            batch_url = get_next_link(batch_response.headers)

    def combine_batches(all_results, batch_results, file_format):
        # Merge a page into the accumulated results, per output format.
        if file_format == "json":
            for key in ("results", "failedIds"):
                if key in batch_results and batch_results[key]:
                    all_results[key] += batch_results[key]
        elif file_format == "tsv":
            # Drop the repeated header row of each page.
            return all_results + batch_results[1:]
        else:
            return all_results + batch_results
        return all_results

    def get_id_mapping_results_link(job_id):
        url = f"{API_URL}/idmapping/details/{job_id}"
        request = session.get(url)
        check_response(request)
        return request.json()["redirectURL"]

    def decode_results(response, file_format, compressed):
        # Decode a result payload, optionally gzip-compressed, into the
        # representation appropriate for the requested format.
        if compressed:
            decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
            if file_format == "json":
                return json.loads(decompressed.decode("utf-8"))
            elif file_format == "tsv":
                return [line for line in decompressed.decode("utf-8").split("\n") if line]
            elif file_format == "xlsx":
                return [decompressed]
            elif file_format == "xml":
                return [decompressed.decode("utf-8")]
            else:
                return decompressed.decode("utf-8")
        elif file_format == "json":
            return response.json()
        elif file_format == "tsv":
            return [line for line in response.text.split("\n") if line]
        elif file_format == "xlsx":
            return [response.content]
        elif file_format == "xml":
            return [response.text]
        return response.text

    def get_xml_namespace(element):
        m = re.match(r"\{(.*)\}", element.tag)
        return m.groups()[0] if m else ""

    def merge_xml_results(xml_results):
        # Fold every page's <entry> elements into the first page's document.
        merged_root = ElementTree.fromstring(xml_results[0])
        for result in xml_results[1:]:
            root = ElementTree.fromstring(result)
            for child in root.findall("{http://uniprot.org/uniprot}entry"):
                merged_root.insert(-1, child)
        ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
        return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)

    def get_id_mapping_results_search(url):
        # Fetch the first page, then follow pagination links and merge.
        parsed = urlparse(url)
        query = parse_qs(parsed.query)
        file_format = query["format"][0] if "format" in query else "json"
        if "size" in query:
            size = int(query["size"][0])
        else:
            size = 500
        query["size"] = size
        compressed = (
            query["compressed"][0].lower() == "true" if "compressed" in query else False
        )
        parsed = parsed._replace(query=urlencode(query, doseq=True))
        url = parsed.geturl()
        request = session.get(url)
        check_response(request)
        results = decode_results(request, file_format, compressed)
        for batch in get_batch(request, file_format, compressed):
            results = combine_batches(results, batch, file_format)
        if file_format == "xml":
            return merge_xml_results(results)
        return results

    job_id = submit_id_mapping(
        from_db="UniProtKB_AC-ID", to_db="PDB", ids=protein_id
    )
    if check_id_mapping_results_ready(job_id):
        link = get_id_mapping_results_link(job_id)
        results = get_id_mapping_results_search(link)
        # Equivalently using the stream endpoint which is more demanding
        # on the API and so is less stable:
        # results = get_id_mapping_results_stream(link)
        return [i['to'] for i in results['results']]
    # Fix: the original implicitly returned None when the job completed with
    # no results; callers iterate the return value, so hand back a list.
    return []
|
| 155 |
+
def get_pdb_ids(protein_id):
    """Map a UniProt accession to PDB IDs, with a REST fallback.

    Tries the ``unipressed`` ID-mapping client first; on an HTTP error or a
    malformed response, falls back to :func:`get_pdb_ids_2`, which talks to
    the UniProt REST API directly.

    Parameters
    ----------
    protein_id : str
        UniProt accession.

    Returns
    -------
    list of str
        PDB identifiers mapped from the accession.
    """
    try:
        request = IdMappingClient.submit(
            source="UniProtKB_AC-ID", dest="PDB", ids={protein_id})
        # Give the asynchronous mapping job a moment to finish.
        time.sleep(2.0)
        pdb_list = list(request.each_result())
        return [i['to'] for i in pdb_list]
    except requests.exceptions.HTTPError:
        # Fix: the original called the fallback but discarded its result,
        # so the function returned None on this path.
        return get_pdb_ids_2(protein_id)
    except KeyError:
        return get_pdb_ids_2(protein_id)
|
| 167 |
+
|
| 168 |
+
|
code/alphafold_featureVector.py
ADDED
|
@@ -0,0 +1,579 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IMPORT NECESSARY MODULES AND LIBRARIES
|
| 2 |
+
from timeit import default_timer as timer
|
| 3 |
+
import xml.etree.ElementTree as ET
|
| 4 |
+
from collections import Counter
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from io import StringIO
|
| 7 |
+
from decimal import *
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import requests as r
|
| 10 |
+
import os.path as op
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import subprocess
|
| 13 |
+
import argparse
|
| 14 |
+
import ssbio.utils
|
| 15 |
+
import warnings
|
| 16 |
+
import sys
|
| 17 |
+
import pathlib
|
| 18 |
+
import os, glob
|
| 19 |
+
import math
|
| 20 |
+
import ssbio
|
| 21 |
+
import ssl
|
| 22 |
+
import gzip
|
| 23 |
+
import ast
|
| 24 |
+
import itertools
|
| 25 |
+
|
| 26 |
+
from Bio.Align import substitution_matrices
|
| 27 |
+
from Bio.PDB.Polypeptide import *
|
| 28 |
+
from Bio.PDB import PDBList
|
| 29 |
+
from Bio import Align
|
| 30 |
+
from Bio import SeqIO
|
| 31 |
+
from Bio.PDB import *
|
| 32 |
+
import numpy as np
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# FUNCTIONS
|
| 38 |
+
from calc_pc_property import *
|
| 39 |
+
from add_domains import *
|
| 40 |
+
from add_annotations import *
|
| 41 |
+
from add_structure import *
|
| 42 |
+
from add_alignment import *
|
| 43 |
+
from manage_files import *
|
| 44 |
+
from add_3Dalignment import *
|
| 45 |
+
from add_sasa import *
|
| 46 |
+
from standard import *
|
| 47 |
+
from add_interface_pos import *
|
| 48 |
+
from standard import *
|
| 49 |
+
from uniprotSequenceMatch import uniprotSequenceMatch
|
| 50 |
+
from process_input import clean_data
|
| 51 |
+
from alphafold_model import *
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def alphafold(input_set, mode, impute):
|
| 55 |
+
start = timer()
|
| 56 |
+
# Necessary lists
|
| 57 |
+
annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
| 58 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
|
| 59 |
+
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
|
| 60 |
+
'region',
|
| 61 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
|
| 62 |
+
'transitPeptide', 'glycosylation', 'propeptide']
|
| 63 |
+
|
| 64 |
+
change_names = {'Disulfide bond': 'disulfide', 'Initiator methionine': 'intMet',
|
| 65 |
+
'Natural variant': 'naturalVariant',
|
| 66 |
+
'DNA binding': 'dnaBinding',
|
| 67 |
+
'Active site': 'activeSite', 'Nucleotide binding': 'nucleotideBinding', 'Lipidation': 'lipidation',
|
| 68 |
+
'Site': 'site', 'Transmembrane': 'transmembrane', 'Cross-link': 'crosslink',
|
| 69 |
+
'Mutagenesis': 'mutagenesis', 'Beta strand': 'strand', 'Helix': 'helix', 'Turn': 'turn',
|
| 70 |
+
'Metal binding': 'metalBinding', 'Repeat': 'repeat',
|
| 71 |
+
'Topological domain': 'topologicalDomain', 'Calcium binding': 'caBinding',
|
| 72 |
+
'Binding site': 'bindingSite',
|
| 73 |
+
'Region': 'region', 'Signal peptide': 'signalPeptide', 'Modified residue': 'modifiedResidue',
|
| 74 |
+
'Zinc finger': 'zincFinger', 'Motif': 'motif', 'Coiled coil': 'coiledCoil', 'Peptide': 'peptide',
|
| 75 |
+
'Transit peptide': 'transitPeptide', 'Glycosylation': 'glycosylation', 'Propeptide': 'propeptide',
|
| 76 |
+
'Intramembrane': 'intramembrane'}
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
## Standardizing input
|
| 80 |
+
data = clean_data(input_set)
|
| 81 |
+
|
| 82 |
+
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, alphafold_path, alphafold_summary= manage_files(mode)
|
| 83 |
+
out_path = path_to_output_files / 'log.txt'
|
| 84 |
+
sys.stdout = open(out_path, 'w')
|
| 85 |
+
print('Creating directories...')
|
| 86 |
+
file_base = str(Path(alphafold_path / '*'))
|
| 87 |
+
file_str = glob.glob(file_base)[0].split('-')[-1].split('.')[0]
|
| 88 |
+
## Physicochemical properties
|
| 89 |
+
print('Adding physicochemical properties...\n')
|
| 90 |
+
data = add_physicochemical(data)
|
| 91 |
+
|
| 92 |
+
## Domains
|
| 93 |
+
print('Adding domains\n')
|
| 94 |
+
data = add_domains(data, path_to_domains)
|
| 95 |
+
|
| 96 |
+
## Processing data frame
|
| 97 |
+
data = data.astype(str)
|
| 98 |
+
data = data.replace({'NaN': np.NaN, 'nan': np.NaN})
|
| 99 |
+
data.domain = data.domain.replace({np.NaN: '-1'}) # Fill -1 if NaN - standardization.
|
| 100 |
+
data.domStart = data.domStart.replace({np.NaN: '-1'})
|
| 101 |
+
data.domEnd = data.domEnd.replace({np.NaN: '-1'})
|
| 102 |
+
data.distance = data.distance.replace({np.NaN: '-1'})
|
| 103 |
+
fisherResult = pd.read_csv(fisher_path, sep='\t')
|
| 104 |
+
significant_domains = fisherResult.domain.to_list()
|
| 105 |
+
|
| 106 |
+
data = data.reset_index()
|
| 107 |
+
data = data.drop(columns=['index'])
|
| 108 |
+
|
| 109 |
+
## not_match_in_uniprot : Data points not matched to UniProt sequence
|
| 110 |
+
## uniprot_matched: Data points matched to UniProt sequence. Proceed with this data frame
|
| 111 |
+
## canonical_fasta : Dataframe including canonical sequence for the protein of interest. Obtained from UniProt.
|
| 112 |
+
## isoform_fasta: Dataframe including isoform sequences for the protein of interest. Obtained from UniProt.
|
| 113 |
+
not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta = uniprotSequenceMatch(data)
|
| 114 |
+
|
| 115 |
+
not_match_in_uniprot = not_match_in_uniprot.reset_index().drop(['index'], axis=1)
|
| 116 |
+
|
| 117 |
+
for key in change_names.keys():
|
| 118 |
+
not_match_in_uniprot[key] = ''
|
| 119 |
+
not_match_in_uniprot = not_match_in_uniprot.rename(columns=change_names)
|
| 120 |
+
uniprot_matched = add_annotations(uniprot_matched)
|
| 121 |
+
|
| 122 |
+
for w in uniprot_matched.index:
|
| 123 |
+
for q in annotation_list:
|
| 124 |
+
per_protein = []
|
| 125 |
+
if uniprot_matched.at[w, q] != 'nan':
|
| 126 |
+
fix = ast.literal_eval(uniprot_matched.at[w, q])
|
| 127 |
+
for z in fix:
|
| 128 |
+
if '-' in z:
|
| 129 |
+
per_protein += np.arange(int(z.split('-')[0]), int(z.split('-')[1])+1,1).tolist()
|
| 130 |
+
else:
|
| 131 |
+
try:
|
| 132 |
+
per_protein.append(int(z))
|
| 133 |
+
except:
|
| 134 |
+
ValueError
|
| 135 |
+
uniprot_matched.at[w, q] = per_protein
|
| 136 |
+
else:
|
| 137 |
+
uniprot_matched.at[w, q] = 'nan'
|
| 138 |
+
uniprot_matched = uniprot_matched.rename(columns=change_names)
|
| 139 |
+
uniprot_matched['wt_sequence_match'] = uniprot_matched['wt_sequence_match'].astype(str)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Avoiding downloading files for SASA calculation if already downloaded.
|
| 143 |
+
|
| 144 |
+
existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
| 145 |
+
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
| 146 |
+
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
| 147 |
+
## Decide if the wild type amino acid is on canonical or isoform sequence. Selected sequence will be used for the
|
| 148 |
+
## sequence alignment.
|
| 149 |
+
for i in uniprot_matched.index:
|
| 150 |
+
if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
|
| 151 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 152 |
+
can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
|
| 153 |
+
if wt == can:
|
| 154 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
|
| 155 |
+
elif wt != can:
|
| 156 |
+
isoList = isoform_fasta[
|
| 157 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 158 |
+
for k in isoList:
|
| 159 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 160 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 161 |
+
if wt == resInIso:
|
| 162 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 163 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 164 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 165 |
+
break
|
| 166 |
+
|
| 167 |
+
elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
|
| 168 |
+
isoList = isoform_fasta[
|
| 169 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 170 |
+
for k in isoList:
|
| 171 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 172 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 173 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 174 |
+
if wt == resInIso:
|
| 175 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 176 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 177 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 178 |
+
break
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
|
| 183 |
+
for annot in ['Domain', 'Alternative sequence', 'Chain', 'Sequence conflict', 'Compositional bias']:
|
| 184 |
+
try:
|
| 185 |
+
uniprot_matched = uniprot_matched.drop(columns=annot)
|
| 186 |
+
except:
|
| 187 |
+
KeyError
|
| 188 |
+
|
| 189 |
+
print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
|
| 190 |
+
% (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
|
| 191 |
+
len(uniprot_matched.drop_duplicates(['datapoint']))))
|
| 192 |
+
|
| 193 |
+
## Adding interface residue information.
|
| 194 |
+
|
| 195 |
+
data_interface = pd.read_csv(path_to_interfaces, sep='\t')
|
| 196 |
+
interface_positions = get_interface_positions(data_interface, 'P1', 'P2')
|
| 197 |
+
|
| 198 |
+
interface_dataframe = pd.DataFrame()
|
| 199 |
+
for key, val in interface_positions.items():
|
| 200 |
+
k = pd.Series((key, str(list(set(val)))))
|
| 201 |
+
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
| 202 |
+
interface_dataframe.columns = ['uniprotID', 'interface_positions']
|
| 203 |
+
|
| 204 |
+
uniprot_matched = uniprot_matched.merge(interface_dataframe, on='uniprotID', how='left')
|
| 205 |
+
uniprot_matched.interface_positions = uniprot_matched.interface_positions.astype('str')
|
| 206 |
+
|
| 207 |
+
## PDB info file is pre-generated for time concerns. Includes summary data of AlphaFold structures.
|
| 208 |
+
## With new updates, can be updated separately.
|
| 209 |
+
|
| 210 |
+
pdb_info = pd.read_csv(alphafold_summary, sep='\t')
|
| 211 |
+
|
| 212 |
+
## Keeping how many models each AlphaFold structure has.
|
| 213 |
+
model_count = modelCount(alphafold_path)
|
| 214 |
+
for k, v in model_count.items():
|
| 215 |
+
model_count[k] = int(v / 2) # two types of files for each file.
|
| 216 |
+
uniprot_matched = uniprot_matched.astype(str)
|
| 217 |
+
uniprot_matched.domStart = uniprot_matched.domStart.astype(float)
|
| 218 |
+
uniprot_matched.domEnd = uniprot_matched.domEnd.astype(float)
|
| 219 |
+
uniprot_matched.domStart = uniprot_matched.domStart.astype(int)
|
| 220 |
+
uniprot_matched.domEnd = uniprot_matched.domEnd.astype(int)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
## Main part to add annotation information, align sequences, finding distances
|
| 225 |
+
|
| 226 |
+
for i in uniprot_matched.index:
|
| 227 |
+
print('Processing', i, 'of', len(uniprot_matched))
|
| 228 |
+
if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
|
| 229 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 230 |
+
can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
|
| 231 |
+
## Information about whether the mutation is found on the canonical or isoform sequence.
|
| 232 |
+
|
| 233 |
+
if wt == can:
|
| 234 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
|
| 235 |
+
elif wt != can:
|
| 236 |
+
isoList = isoform_fasta[
|
| 237 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 238 |
+
for k in isoList:
|
| 239 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 240 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 241 |
+
if wt == resInIso:
|
| 242 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 243 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 244 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 245 |
+
break
|
| 246 |
+
elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
|
| 247 |
+
isoList = isoform_fasta[
|
| 248 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
| 249 |
+
for k in isoList:
|
| 250 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
| 251 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
| 252 |
+
wt = uniprot_matched.at[i, 'wt']
|
| 253 |
+
if wt == resInIso:
|
| 254 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
| 255 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
| 256 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
| 257 |
+
break
|
| 258 |
+
uniprotID = uniprot_matched.at[i, 'uniprotID']
|
| 259 |
+
datapoint = uniprot_matched.at[i, 'datapoint']
|
| 260 |
+
|
| 261 |
+
for k in annotation_list:
|
| 262 |
+
txt = k + 'Binary'
|
| 263 |
+
|
| 264 |
+
if (str(uniprot_matched.at[i, txt]) == '0') or (str(uniprot_matched.at[i, txt]) == '0.0'):
|
| 265 |
+
uniprot_matched.at[i, txt] = '1'
|
| 266 |
+
elif (str(uniprot_matched.at[i, txt]).lower() == 'nan') | (str(uniprot_matched.at[i, txt]) == np.NaN) :
|
| 267 |
+
uniprot_matched.at[i, txt] = '0'
|
| 268 |
+
elif (str(uniprot_matched.at[i, txt]) == '1') or (str(uniprot_matched.at[i, txt]) == '1.0'):
|
| 269 |
+
uniprot_matched.at[i, txt] = '2'
|
| 270 |
+
## Search in all models.
|
| 271 |
+
models_for_protein = [val for key, val in model_count.items() if
|
| 272 |
+
uniprotID in key.split(';')] # We have this many models for the protein.
|
| 273 |
+
which_model_mutation = which_model(
|
| 274 |
+
int(uniprot_matched.at[i, 'pos'])) # List of models in which the mutation can be found.
|
| 275 |
+
models_for_all_annotations = {}
|
| 276 |
+
for annot in annotation_list:
|
| 277 |
+
if len(uniprot_matched.at[i, annot]) != 0 and type(uniprot_matched.at[i, annot]) != list:
|
| 278 |
+
uniprot_matched.at[i, annot] = list(
|
| 279 |
+
map(str.strip, uniprot_matched.at[i, annot].strip('][').replace('"', '').split(',')))
|
| 280 |
+
models_for_annotations = {} # Recording which position is found in which model file.
|
| 281 |
+
for annot_position in uniprot_matched.at[i, annot]:
|
| 282 |
+
if annot_position != 'nan' and annot_position != '':
|
| 283 |
+
models_for_that_position = which_model(int(annot_position))
|
| 284 |
+
else:
|
| 285 |
+
models_for_that_position = {}
|
| 286 |
+
for key, val in models_for_that_position.items():
|
| 287 |
+
if key not in models_for_annotations.keys():
|
| 288 |
+
models_for_annotations[key] = [val]
|
| 289 |
+
else:
|
| 290 |
+
models_for_annotations[key] += [val]
|
| 291 |
+
models_for_all_annotations[annot] = models_for_annotations
|
| 292 |
+
new_dict = {}
|
| 293 |
+
for key, val in models_for_all_annotations.items():
|
| 294 |
+
subdict = {k: v for k, v in val.items() if k in which_model_mutation}
|
| 295 |
+
subdict = dict(sorted(subdict.items()))
|
| 296 |
+
new_dict[key] = subdict
|
| 297 |
+
new_dict = reduce_model_dict(new_dict)
|
| 298 |
+
models_we_need = list(set(itertools.chain.from_iterable(
|
| 299 |
+
[list(ov.keys()) for ok, ov in new_dict.items()]))) # Read models with these numbers
|
| 300 |
+
info_per_model = {} # her bir datapoint için baştan yazılıyor.
|
| 301 |
+
dist_of_annots = {}
|
| 302 |
+
all_domain_distances = []
|
| 303 |
+
|
| 304 |
+
for mod in models_we_need:
|
| 305 |
+
print('---------PRINTING FOR MODEL--------', mod)
|
| 306 |
+
dist_of_annots[str(mod)] = {}
|
| 307 |
+
info_per_model[mod] = {}
|
| 308 |
+
info_per_model[mod]['datapoint'] = datapoint
|
| 309 |
+
identifier = uniprot_matched.at[i, 'uniprotSequence']
|
| 310 |
+
try:
|
| 311 |
+
pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) & (
|
| 312 |
+
pdb_info.model_num == mod)].sequence.item()
|
| 313 |
+
except:
|
| 314 |
+
ValueError
|
| 315 |
+
pdbSequence = 'nan'
|
| 316 |
+
if pdbSequence != 'nan': # The number in models we need might not be present for that protein. Preventng error.
|
| 317 |
+
pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) & (pdb_info.model_num == mod)].sequence.item()
|
| 318 |
+
alignment_list = do_alignment(uniprot_matched.at[i, 'datapoint'], uniprot_matched.at[i, 'uniprotSequence'],
|
| 319 |
+
pdbSequence, Path(path_to_output_files / 'alignment_files'))
|
| 320 |
+
pdb_alignStatus = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[0]
|
| 321 |
+
info_per_model[mod]['pdb_alignStatus'] = pdb_alignStatus
|
| 322 |
+
mutationPositionOnPDB = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[1]
|
| 323 |
+
info_per_model[mod]['mutationPositionOnPDB'] = mutationPositionOnPDB
|
| 324 |
+
startGap = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[2]
|
| 325 |
+
info_per_model[mod]['startGap'] = startGap
|
| 326 |
+
alignment_to_use = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[3]
|
| 327 |
+
for annot in annotation_list:
|
| 328 |
+
if new_dict[annot] == {}:
|
| 329 |
+
annotation_pos_on_pdb_ = []
|
| 330 |
+
else:
|
| 331 |
+
try:
|
| 332 |
+
annotation_pos_on_pdb_ = annotation_pos_on_pdb(new_dict[annot][mod], startGap, alignment_to_use,
|
| 333 |
+
identifier)
|
| 334 |
+
except:
|
| 335 |
+
KeyError
|
| 336 |
+
info_per_model[mod][annot] = annotation_pos_on_pdb_
|
| 337 |
+
|
| 338 |
+
pdb_path = Path(f'{alphafold_path}/AF-{uniprotID}-F{mod}-{file_str}.pdb.gz')
|
| 339 |
+
|
| 340 |
+
if get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
|
| 341 |
+
'gzip') != None:
|
| 342 |
+
|
| 343 |
+
alignments, coords, resnums_for_sasa = get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan',
|
| 344 |
+
'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
|
| 345 |
+
'gzip')
|
| 346 |
+
alignments = alignments[0]
|
| 347 |
+
|
| 348 |
+
calculate_freesasa(uniprotID, mod, existing_free_sasa, alphafold_path, path_to_output_files)
|
| 349 |
+
if (mutationPositionOnPDB != 'nan'):
|
| 350 |
+
if (int(mutationPositionOnPDB) <= 1400):
|
| 351 |
+
try:
|
| 352 |
+
coordMut = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[0]
|
| 353 |
+
except:
|
| 354 |
+
ValueError
|
| 355 |
+
coordMut = 'nan'
|
| 356 |
+
else:
|
| 357 |
+
coordMut = np.NaN
|
| 358 |
+
|
| 359 |
+
sasa_pos = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[2]
|
| 360 |
+
sasa_val = sasa('alphafold', 'nan', uniprotID, sasa_pos, uniprot_matched.at[i, 'wt'], mode,
|
| 361 |
+
path_to_output_files, file_type='gzip')
|
| 362 |
+
|
| 363 |
+
if sasa_val != None:
|
| 364 |
+
uniprot_matched.at[i, 'sasa'] = sasa_val
|
| 365 |
+
else:
|
| 366 |
+
coordMut = 'nan'
|
| 367 |
+
sasa_val = 'nan'
|
| 368 |
+
uniprot_matched.at[i, 'sasa'] = sasa_val
|
| 369 |
+
|
| 370 |
+
domainPositionOnPDB_list = list(
|
| 371 |
+
range(int(uniprot_matched.at[i, 'domStart']), int(uniprot_matched.at[i, 'domEnd'])))
|
| 372 |
+
domain_distances = []
|
| 373 |
+
if len(domainPositionOnPDB_list) != 0:
|
| 374 |
+
for domain_ in domainPositionOnPDB_list:
|
| 375 |
+
coordDomain = get_coords(domain_, alignments, coords, resnums_for_sasa, mode)[0]
|
| 376 |
+
distance_dom = float(find_distance(coordMut,
|
| 377 |
+
coordDomain)) # bu bir anotasyonun bir modeldeki bir tane pozisyonu için.
|
| 378 |
+
domain_distances.append(distance_dom)
|
| 379 |
+
minimum_domain = min(domain_distances) # minimum for one model.
|
| 380 |
+
else:
|
| 381 |
+
minimum_domain = np.NaN
|
| 382 |
+
all_domain_distances.append(minimum_domain)
|
| 383 |
+
list_dist_of_annots = []
|
| 384 |
+
for key, val in info_per_model.items():
|
| 385 |
+
modNum = key
|
| 386 |
+
min_annots = {} # Write from scratch for each annotation.
|
| 387 |
+
|
| 388 |
+
if modNum == mod:
|
| 389 |
+
for label, annotPos in val.items(): # For each annotation type, calculate all distances of the annot positions.
|
| 390 |
+
if label in annotation_list:
|
| 391 |
+
all_annot_distance_per_model = [] # All distances of an annoation in hat model
|
| 392 |
+
for annot_position in annotPos:
|
| 393 |
+
if (annot_position != 'nan'):
|
| 394 |
+
if (int(annot_position) <= 1400):
|
| 395 |
+
coordAnnot = \
|
| 396 |
+
get_coords(annot_position, alignments, coords, resnums_for_sasa, mode)[
|
| 397 |
+
0]
|
| 398 |
+
distance = float(find_distance(coordMut,
|
| 399 |
+
coordAnnot)) # bu bir anotasyonun bir modeldeki bir tane pozisyonu için.
|
| 400 |
+
all_annot_distance_per_model.append(distance)
|
| 401 |
+
if all_annot_distance_per_model != []:
|
| 402 |
+
all_annot_distance_per_model = [float(i) for i in all_annot_distance_per_model]
|
| 403 |
+
try:
|
| 404 |
+
minimum_position = float(min(all_annot_distance_per_model))
|
| 405 |
+
except:
|
| 406 |
+
ValueError
|
| 407 |
+
minimum_position = 'nan'
|
| 408 |
+
min_annots[label] = float(
|
| 409 |
+
minimum_position) # Minimum of the annotation in this model.
|
| 410 |
+
if min_annots != {}:
|
| 411 |
+
list_dist_of_annots.append(min_annots)
|
| 412 |
+
dist_of_annots[str(
|
| 413 |
+
mod)] = list_dist_of_annots # Getting minimum of all possible models
|
| 414 |
+
# uniprot_matched.at[i, annotation_type] = minimum_position
|
| 415 |
+
else:
|
| 416 |
+
print('Model File Not Found')
|
| 417 |
+
|
| 418 |
+
uniprot_matched.at[i, 'sasa'] = np.NaN
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
if len(all_domain_distances) != 0:
|
| 422 |
+
uniprot_matched.at[i, 'domaindistance3D'] = min(all_domain_distances)
|
| 423 |
+
else:
|
| 424 |
+
uniprot_matched.at[i, 'domaindistance3D'] = np.NaN
|
| 425 |
+
dist_of_annots_min_of_all = {}
|
| 426 |
+
flat = [item for sublist in list(dist_of_annots.values()) for item in sublist]
|
| 427 |
+
for f in flat:
|
| 428 |
+
for key, val in f.items():
|
| 429 |
+
if key not in dist_of_annots_min_of_all.keys():
|
| 430 |
+
dist_of_annots_min_of_all[key] = val
|
| 431 |
+
elif (key in dist_of_annots_min_of_all.keys()) & (float(dist_of_annots_min_of_all[key]) > float(val)):
|
| 432 |
+
dist_of_annots_min_of_all[key] = val
|
| 433 |
+
key_list = []
|
| 434 |
+
for key, val in dist_of_annots_min_of_all.items():
|
| 435 |
+
uniprot_matched.at[i, key] = val
|
| 436 |
+
key_list.append(key)
|
| 437 |
+
remaining = list(set(annotation_list) - set(key_list))
|
| 438 |
+
|
| 439 |
+
for rem in remaining:
|
| 440 |
+
uniprot_matched.at[i, rem] = ''
|
| 441 |
+
uniprot_matched.at[i, 'distances'] = [dist_of_annots]
|
| 442 |
+
|
| 443 |
+
if (uniprot_matched.at[i, 'sasa'] != None) & (uniprot_matched.at[i, 'sasa'] != np.NaN) & (
|
| 444 |
+
str(uniprot_matched.at[i, 'sasa']) != 'nan'):
|
| 445 |
+
if '*' in uniprot_matched.at[i, 'sasa']:
|
| 446 |
+
uniprot_matched.at[i, 'sasa'] = uniprot_matched.at[i, 'sasa'].split('*')[0]
|
| 447 |
+
try:
|
| 448 |
+
uniprot_matched.at[i, 'sasa'] = float(uniprot_matched.at[i, 'sasa'].strip())
|
| 449 |
+
except:
|
| 450 |
+
TypeError
|
| 451 |
+
|
| 452 |
+
if float(uniprot_matched.at[i, 'sasa']) < 5:
|
| 453 |
+
uniprot_matched.at[i, 'trsh4'] = 'core'
|
| 454 |
+
elif float(uniprot_matched.at[i, 'sasa']) >= 5:
|
| 455 |
+
uniprot_matched.at[i, 'trsh4'] = 'surface'
|
| 456 |
+
elif str(uniprot_matched.at[i, 'sasa']) == 'nan':
|
| 457 |
+
uniprot_matched.at[i, 'trsh4'] = 'nan'
|
| 458 |
+
else:
|
| 459 |
+
uniprot_matched.at[i, 'trsh4'] = 'nan'
|
| 460 |
+
if (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 461 |
+
i, 'trsh4'] == 'surface':
|
| 462 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'interface'
|
| 463 |
+
elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 464 |
+
i, 'trsh4'] == 'surface':
|
| 465 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'surface'
|
| 466 |
+
elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 467 |
+
i, 'trsh4'] == 'core':
|
| 468 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'core'
|
| 469 |
+
elif (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
| 470 |
+
i, 'trsh4'] == 'core':
|
| 471 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'conflict'
|
| 472 |
+
elif uniprot_matched.at[i, 'trsh4'] == 'nan':
|
| 473 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'nan'
|
| 474 |
+
if uniprot_matched.at[i, 'domain'] in significant_domains:
|
| 475 |
+
uniprot_matched.at[i, 'domain_fisher'] = uniprot_matched.at[i, 'domain']
|
| 476 |
+
else:
|
| 477 |
+
uniprot_matched.at[i, 'domain_fisher'] = 'NULL'
|
| 478 |
+
uniprot_matched = uniprot_matched.round(2)
|
| 479 |
+
uniprot_matched = uniprot_matched.astype(str)
|
| 480 |
+
|
| 481 |
+
uniprot_matched[ 'domain'] = uniprot_matched['domain'].replace({'-1': 'NULL'})
|
| 482 |
+
uniprot_matched = uniprot_matched.drop_duplicates()
|
| 483 |
+
uniprot_matched.rename(
|
| 484 |
+
columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
|
| 485 |
+
'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
|
| 486 |
+
'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
|
| 487 |
+
'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
|
| 488 |
+
'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
|
| 489 |
+
'intramembraneBinary': 'intramembrane_bin',
|
| 490 |
+
'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
|
| 491 |
+
'activeSiteBinary': 'activeSite_bin',
|
| 492 |
+
'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
|
| 493 |
+
'siteBinary': 'site_bin',
|
| 494 |
+
'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
|
| 495 |
+
'mutagenesisBinary': 'mutagenesis_bin',
|
| 496 |
+
'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
|
| 497 |
+
'metalBindingBinary': 'metalBinding_bin',
|
| 498 |
+
'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
|
| 499 |
+
'caBindingBinary': 'caBinding_bin',
|
| 500 |
+
'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
|
| 501 |
+
'signalPeptideBinary': 'signalPeptide_bin',
|
| 502 |
+
'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
|
| 503 |
+
'motifBinary': 'motif_bin',
|
| 504 |
+
'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
|
| 505 |
+
'transitPeptideBinary': 'transitPeptide_bin',
|
| 506 |
+
'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
|
| 507 |
+
'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
|
| 508 |
+
'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
|
| 509 |
+
'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
|
| 510 |
+
'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', 'site': 'site_dist',
|
| 511 |
+
'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
|
| 512 |
+
'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', 'turn': 'turn_dist',
|
| 513 |
+
'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
|
| 514 |
+
'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
|
| 515 |
+
'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
|
| 516 |
+
'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
|
| 517 |
+
'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
|
| 518 |
+
'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
|
| 519 |
+
'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
|
| 520 |
+
|
| 521 |
+
uniprot_matched = uniprot_matched[
|
| 522 |
+
['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', 'volume',
|
| 523 |
+
'granthamScore', 'domains_all',
|
| 524 |
+
'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
|
| 525 |
+
'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
|
| 526 |
+
'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
|
| 527 |
+
'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
|
| 528 |
+
'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
|
| 529 |
+
'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
|
| 530 |
+
'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
|
| 531 |
+
'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
|
| 532 |
+
'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
|
| 533 |
+
'intramembrane_dist',
|
| 534 |
+
'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
|
| 535 |
+
'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
|
| 536 |
+
'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
|
| 537 |
+
'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
|
| 538 |
+
'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
|
| 539 |
+
'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
|
| 540 |
+
'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
|
| 541 |
+
'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
|
| 542 |
+
'glycosylation_dist', 'propeptide_dist']]
|
| 543 |
+
uniprot_matched = uniprot_matched.reset_index()
|
| 544 |
+
uniprot_matched = uniprot_matched.drop(columns = {'index'})
|
| 545 |
+
# Imputation
|
| 546 |
+
if (impute == 'True') or (impute == 'true'):
|
| 547 |
+
filler = [20.71, 46.67, 28.13,15.5, 35.94, 21.84, 25.15, 45.15, 29.81, 29.91, 34.67, 24.72, 10.66,11.55, 13.02,
|
| 548 |
+
21.54,27.42, 38.39, 30.44, 20.9, 25.82, 46.12, 32.1, 35.96, 35.86, 37.88, 19.09, 35.2, 26.95, 37.48]
|
| 549 |
+
col_index = 0
|
| 550 |
+
|
| 551 |
+
for col_ in uniprot_matched.columns[-30:]:
|
| 552 |
+
uniprot_matched[col_] = uniprot_matched[col_].fillna(filler[col_index])
|
| 553 |
+
uniprot_matched[col_] = uniprot_matched[col_].replace({'nan': filler[col_index]})
|
| 554 |
+
uniprot_matched[col_] = uniprot_matched[col_].replace({'': filler[col_index]})
|
| 555 |
+
"""
|
| 556 |
+
if uniprot_matched[col_].values == '':
|
| 557 |
+
uniprot_matched[col_] = filler[col_index]
|
| 558 |
+
"""
|
| 559 |
+
col_index += 1
|
| 560 |
+
|
| 561 |
+
uniprot_matched['domains_3Ddist'] = uniprot_matched['domains_3Ddist'].fillna(29.78)
|
| 562 |
+
uniprot_matched['sasa'] = uniprot_matched['sasa'].fillna(35.6)
|
| 563 |
+
uniprot_matched['location_3state'] = uniprot_matched['location_3state'].fillna('unknown')
|
| 564 |
+
elif (impute == 'False') or (impute == 'false'):
|
| 565 |
+
pass
|
| 566 |
+
uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
|
| 567 |
+
uniprot_matched = uniprot_matched.replace({'['']': np.NaN})
|
| 568 |
+
uniprot_matched.to_csv(path_to_output_files / 'featurevector_alphafold.txt', index=False, sep='\t')
|
| 569 |
+
if len(uniprot_matched) == 0:
|
| 570 |
+
print(
|
| 571 |
+
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
| 572 |
+
|
| 573 |
+
print('Feature vector successfully created...')
|
| 574 |
+
end = timer()
|
| 575 |
+
hours, rem = divmod(end - start, 3600)
|
| 576 |
+
minutes, seconds = divmod(rem, 60)
|
| 577 |
+
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 578 |
+
sys.stdout.close()
|
| 579 |
+
return uniprot_matched
|
code/alphafold_model.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
import glob
|
| 3 |
+
def reduce_model_dict(model_dict):
    """Assign every annotation position to exactly one model number.

    ``model_dict`` maps annotation name -> {model_number: [positions]}.
    AlphaFold fragment models overlap, so the same position can appear under
    several model numbers; within each annotation this keeps a position only
    in the first (lowest-key-order) model it appears in and drops it from all
    later ones.

    Mutates ``model_dict`` in place and also returns it.

    Note: the parameter was renamed from ``dict`` (which shadowed the
    builtin); the only caller in this project passes it positionally.
    """
    for models in model_dict.values():
        seen = []  # positions already claimed by an earlier model of this annotation
        for model_num, positions in models.items():
            kept = []
            for pos in positions:
                if pos not in seen:
                    kept.append(pos)
                    seen.append(pos)
            models[model_num] = kept
    return model_dict
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def which_model(position):
    """Return ``{model_number: position}`` for every AlphaFold fragment model
    whose residue window contains *position*.

    Model ``n`` covers residues ``1 + 200*(n-1)`` through ``1400 + 200*(n-1)``
    (1400-residue windows sliding by 200 residues), up to the last full window
    below 27000.  A position falling in the overlap of several windows is
    reported once per matching model.
    """
    matches = {}
    windows = zip(range(1400, 27000, 200), range(1, 27000, 200))
    for model_num, (end, start) in enumerate(windows, start=1):
        if start <= position <= end:
            matches[model_num] = position
    return matches
|
| 24 |
+
|
| 25 |
+
def modelCount(path_to_models):
    """Count how many model files each protein has under *path_to_models*.

    File names follow the AlphaFold pattern ``AF-<uniprotID>-F<n>-...``, so the
    UniProt accession is the second dash-separated token of the full path
    string (assumes the directory path itself contains no ``-``).

    Returns a dict keyed by the ';'-joined, sorted accessions that share the
    same model count, mapped to that count, e.g. ``{'P1;P2': 3, 'P9': 1}``.
    """
    counts = Counter(str(model_file).split('-')[1]
                     for model_file in path_to_models.glob("*"))
    grouped = {}
    for model_count in set(counts.values()):
        ids = sorted(acc for acc, c in counts.items() if c == model_count)
        grouped[';'.join(ids)] = model_count
    return grouped
|
code/calc_pc_property.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
def compositionValues(aa1, aa2):
    """Return the difference (rounded to 2 dp) between the Grantham
    atomic-composition values of residues *aa1* and *aa2* (one-letter codes).

    Raises KeyError for non-standard residue codes.
    """
    scale = {'S': 1.42, 'R': 0.65, 'L': 0, 'P': 0.39, 'T': 0.71, 'A': 0,
             'V': 0, 'G': 0.74, 'I': 0, 'F': 0, 'Y': 0.20, 'C': 2.75,
             'H': 0.58, 'Q': 0.89, 'N': 1.33, 'K': 0.33, 'D': 1.38,
             'E': 0.92, 'M': 0, 'W': 0.13}
    return round(scale[aa1] - scale[aa2], 2)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def polarityValues(aa1, aa2):
    """Return the difference (rounded to 2 dp) between the Grantham polarity
    values of residues *aa1* and *aa2* (one-letter codes).

    Raises KeyError for non-standard residue codes.
    """
    scale = {'S': 9.2, 'R': 10.5, 'L': 4.9, 'P': 8.0, 'T': 8.6, 'A': 8.1,
             'V': 5.9, 'G': 9.0, 'I': 5.2, 'F': 5.2, 'Y': 6.2, 'C': 5.5,
             'H': 10.4, 'Q': 10.5, 'N': 11.6, 'K': 11.3, 'D': 13.0,
             'E': 12.3, 'M': 5.7, 'W': 5.4}
    return round(scale[aa1] - scale[aa2], 2)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def volumeValues(aa1, aa2):
    """Return the difference (rounded to 2 dp) between the Grantham molecular
    volume values of residues *aa1* and *aa2* (one-letter codes).

    Raises KeyError for non-standard residue codes.
    """
    scale = {'S': 32, 'R': 124, 'L': 111, 'P': 32.5, 'T': 61, 'A': 31,
             'V': 84, 'G': 3, 'I': 111, 'F': 132, 'Y': 136, 'C': 55,
             'H': 96, 'Q': 85, 'N': 56, 'K': 119, 'D': 54, 'E': 83,
             'M': 105, 'W': 170}
    return round(scale[aa1] - scale[aa2], 2)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def add_physicochemical(df):
    """Add per-variant physicochemical difference features to *df*.

    For each row, computes the composition, polarity and volume differences
    between the wild-type residue (column ``wt``) and the mutant residue
    (column ``mut``), plus the Grantham distance between them.  Rows whose
    residue codes are not one of the 20 standard amino acids (or are
    missing/NaN) get the string ``'nan'`` in all four feature columns.

    Modifies *df* in place and returns it.
    """
    # Upper triangle of the symmetric Grantham distance matrix
    # (Grantham, Science 1974).  The full 400-pair lookup table is built
    # below by mirroring; distances are stored as strings because the
    # feature columns are string-typed downstream.
    upper = {
        'A': {'C': 195, 'D': 126, 'E': 107, 'F': 113, 'G': 60, 'H': 86,
              'I': 94, 'K': 106, 'L': 96, 'M': 84, 'N': 111, 'P': 27,
              'Q': 91, 'R': 112, 'S': 99, 'T': 58, 'V': 64, 'W': 148,
              'Y': 112},
        'C': {'D': 154, 'E': 170, 'F': 205, 'G': 159, 'H': 174, 'I': 198,
              'K': 202, 'L': 198, 'M': 196, 'N': 139, 'P': 169, 'Q': 154,
              'R': 180, 'S': 112, 'T': 149, 'V': 192, 'W': 215, 'Y': 194},
        'D': {'E': 45, 'F': 177, 'G': 94, 'H': 81, 'I': 168, 'K': 101,
              'L': 172, 'M': 160, 'N': 23, 'P': 108, 'Q': 61, 'R': 96,
              'S': 65, 'T': 85, 'V': 152, 'W': 181, 'Y': 160},
        'E': {'F': 140, 'G': 98, 'H': 40, 'I': 134, 'K': 56, 'L': 138,
              'M': 126, 'N': 42, 'P': 93, 'Q': 29, 'R': 54, 'S': 80,
              'T': 65, 'V': 121, 'W': 152, 'Y': 122},
        'F': {'G': 153, 'H': 100, 'I': 21, 'K': 102, 'L': 22, 'M': 28,
              'N': 158, 'P': 114, 'Q': 116, 'R': 97, 'S': 155, 'T': 103,
              'V': 50, 'W': 40, 'Y': 22},
        'G': {'H': 98, 'I': 135, 'K': 127, 'L': 138, 'M': 127, 'N': 80,
              'P': 42, 'Q': 87, 'R': 125, 'S': 56, 'T': 59, 'V': 109,
              'W': 184, 'Y': 147},
        'H': {'I': 94, 'K': 32, 'L': 99, 'M': 87, 'N': 68, 'P': 77,
              'Q': 24, 'R': 29, 'S': 89, 'T': 47, 'V': 84, 'W': 115,
              'Y': 83},
        'I': {'K': 102, 'L': 5, 'M': 10, 'N': 149, 'P': 95, 'Q': 109,
              'R': 97, 'S': 142, 'T': 89, 'V': 29, 'W': 61, 'Y': 33},
        'K': {'L': 107, 'M': 95, 'N': 94, 'P': 103, 'Q': 53, 'R': 26,
              'S': 121, 'T': 78, 'V': 97, 'W': 110, 'Y': 85},
        'L': {'M': 15, 'N': 153, 'P': 98, 'Q': 113, 'R': 102, 'S': 145,
              'T': 92, 'V': 32, 'W': 61, 'Y': 36},
        'M': {'N': 142, 'P': 87, 'Q': 101, 'R': 91, 'S': 135, 'T': 81,
              'V': 21, 'W': 67, 'Y': 36},
        'N': {'P': 91, 'Q': 46, 'R': 86, 'S': 46, 'T': 65, 'V': 133,
              'W': 174, 'Y': 143},
        'P': {'Q': 76, 'R': 103, 'S': 74, 'T': 38, 'V': 68, 'W': 147,
              'Y': 110},
        'Q': {'R': 43, 'S': 68, 'T': 42, 'V': 96, 'W': 130, 'Y': 99},
        'R': {'S': 110, 'T': 71, 'V': 96, 'W': 101, 'Y': 77},
        'S': {'T': 58, 'V': 124, 'W': 177, 'Y': 144},
        'T': {'V': 69, 'W': 128, 'Y': 92},
        'V': {'W': 88, 'Y': 55},
        'W': {'Y': 37},
    }
    grantham_dict = {}
    for aa, row in upper.items():
        grantham_dict[(aa, aa)] = '0'
        for other, dist in row.items():
            grantham_dict[(aa, other)] = str(dist)
            grantham_dict[(other, aa)] = str(dist)
    # 'Y' has no upper-triangle row of its own; add its diagonal entry.
    grantham_dict[('Y', 'Y')] = '0'

    for i in df.index:
        try:
            wt = df.at[i, 'wt']
            mut = df.at[i, 'mut']
            df.at[i, 'composition'] = compositionValues(wt, mut)
            df.at[i, 'polarity'] = polarityValues(wt, mut)
            df.at[i, 'volume'] = volumeValues(wt, mut)
            df.at[i, 'granthamScore'] = grantham_dict[wt, mut]
        except (KeyError, TypeError):
            # Previously a bare ``except`` with a vestigial ``KeyError``
            # expression swallowed *every* error; narrow it to the lookup
            # failures the 'nan' fallback is meant to handle (unknown
            # residue codes, missing columns, NaN/unhashable values).
            df.at[i, 'composition'] = 'nan'
            df.at[i, 'polarity'] = 'nan'
            df.at[i, 'volume'] = 'nan'
            df.at[i, 'granthamScore'] = 'nan'
    return df
|
code/create_swissmodelSummary.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
| 0 |
help='Enter the directory where meta-data is found.',
|
| 1 |
default=1)
|
| 2 |
os.makedirs('input_files/extract_swissmodel_structures/', exist_ok=True)
|
| 3 |
all_swissmodel = open('input_files/swissmodel_structures.txt', 'w')
|
| 4 |
all_swissmodel.write('UniProtKB_ac iso_id uniprot_seq_length uniprot_seq_md5 coordinate_id provider from to template qmeandisco_global seqid url')
|
| 5 |
all_swissmodel.write('\n')
|
| 6 |
for f in glob.glob(f'{meta_data}/*.tar.gz'):
|
| 7 |
name = f.split('/')[-1].split('.')[0]
|
| 8 |
with tarfile.open(f) as tar:
|
| 9 |
tar.extractall(f'input_files/extract_swissmodel_structures/{name}')
|
| 10 |
with open(f'input_files/extract_swissmodel_structures/{name}/SWISS-MODEL_Repository/INDEX') as x:
|
| 11 |
lines = (x.readlines())[7:]
|
| 12 |
for line in lines:
|
| 13 |
all_swissmodel.write(line)
|
| 14 |
shutil.rmtree('input_files/extract_swissmodel_structures/')
|
| 15 |
swissmodel_file()
|
|
|
|
| 1 |
+
'''
|
| 2 |
help='Enter the directory where meta-data is found.',
|
| 3 |
default=1)
|
| 4 |
os.makedirs('input_files/extract_swissmodel_structures/', exist_ok=True)
|
| 5 |
all_swissmodel = open('input_files/swissmodel_structures.txt', 'w')
|
| 6 |
all_swissmodel.write('UniProtKB_ac iso_id uniprot_seq_length uniprot_seq_md5 coordinate_id provider from to template qmeandisco_global seqid url')
|
| 7 |
all_swissmodel.write('\n')
|
| 8 |
for f in glob.glob(f'{meta_data}/*.tar.gz'):
|
| 9 |
name = f.split('/')[-1].split('.')[0]
|
| 10 |
with tarfile.open(f) as tar:
|
| 11 |
tar.extractall(f'input_files/extract_swissmodel_structures/{name}')
|
| 12 |
with open(f'input_files/extract_swissmodel_structures/{name}/SWISS-MODEL_Repository/INDEX') as x:
|
| 13 |
lines = (x.readlines())[7:]
|
| 14 |
for line in lines:
|
| 15 |
all_swissmodel.write(line)
|
| 16 |
shutil.rmtree('input_files/extract_swissmodel_structures/')
|
| 17 |
swissmodel_file()
|
code/get_alphafoldStructures.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tarfile, glob, os
|
| 2 |
+
from biopandas.pdb import PandasPdb
|
| 3 |
+
import argparse
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
# CLI setup: which tar archive under input_files/ holds the AlphaFold models.
parser = argparse.ArgumentParser(description='ASCARIS')

parser.add_argument('-file_name', '--file_name',
                    help='Enter the file tar file name to untar',
                    default=1)

args = parser.parse_args()

# Glob pattern (relative to input_files/) naming the AlphaFold tarball(s);
# consumed by create_file() below.
alphafold = args.file_name
|
| 15 |
+
|
| 16 |
+
def threeToOne(variant):
    """Convert a three-letter amino-acid code to its one-letter code.

    Unknown codes are returned unchanged, matching the original
    if/elif behaviour (no exception is raised).

    NOTE(review): 'ASX' is mapped to 'O' to preserve existing behaviour,
    although the conventional one-letter code for ASX (Asn/Asp
    ambiguity) is 'B' — confirm with downstream consumers.
    """
    # Table lookup replaces the original 22-branch if/elif chain.
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E', 'PRO': 'P',
        'LEU': 'L', 'GLY': 'G', 'ASN': 'N', 'SER': 'S', 'GLN': 'Q',
        'THR': 'T', 'MET': 'M', 'LYS': 'K', 'ASP': 'D', 'ILE': 'I',
        'PHE': 'F', 'TRP': 'W', 'TYR': 'Y', 'HIS': 'H', 'CYS': 'C',
        'UNK': 'X', 'ASX': 'O',
    }
    return three_to_one.get(variant, variant)
|
| 62 |
+
# Unzip AlphaFold structures

def create_file():
    """Extract AlphaFold tar archives and write a per-structure summary.

    Extracts every archive matching ``input_files/<alphafold>`` (the
    ``--file_name`` CLI argument) into ``input_files/alphafold_structures/``,
    then writes ``input_files/alphafold_summary.txt`` with one tab-separated
    row per PDB model: uniprotID, chain, one-letter CA sequence, model number.
    """
    os.makedirs('input_files/alphafold_structures/', exist_ok=True)
    for f in glob.glob(f'input_files/{alphafold}'):
        with tarfile.open(f) as tar:
            tar.extractall(f'input_files/alphafold_structures/')

    # Create summary file. A context manager guarantees the handle is
    # flushed and closed (the original left the file open).
    with open('input_files/alphafold_summary.txt', 'w') as alphafold_summary_file:
        alphafold_summary_file.write('uniprotID\tchain\tsequence\tmodel_num')
        alphafold_summary_file.write('\n')
        for f in glob.glob('input_files/alphafold_structures/*pdb*'):
            str1 = PandasPdb().read_pdb(f)
            str1 = str1.df['ATOM']
            str1 = str1[['alt_loc', 'residue_name', 'residue_number', 'atom_name', 'insertion', 'chain_id']]
            # One CA atom per residue gives the residue-level sequence.
            str1 = str1[str1.atom_name == 'CA']
            str1['residue_name'] = str1['residue_name'].apply(lambda x: threeToOne(x))
            # np.nan replaces np.NaN, which was removed in NumPy 2.0.
            str1['alt_loc'] = str1['alt_loc'].replace({'': np.nan})
            str1 = str1.drop_duplicates(['residue_name', 'residue_number'])
            structure_residues_pdb = ''.join(str1.residue_name.to_list())
            # File names look like AF-<uniprot>-F<model>-model_vN.pdb.gz.
            model_no = f.split('-')[2].strip()[1:]
            up_name = f.split('-')[1].strip()
            # AlphaFold models are single-chain; take the one chain id present.
            chain_id = list(set(str1.chain_id.to_list()))[0]
            alphafold_summary_file.write(up_name)
            alphafold_summary_file.write('\t')
            alphafold_summary_file.write(chain_id)
            alphafold_summary_file.write('\t')
            alphafold_summary_file.write(structure_residues_pdb)
            alphafold_summary_file.write('\t')
            alphafold_summary_file.write(model_no)
            alphafold_summary_file.write('\n')
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
if __name__ == '__main__':
|
| 97 |
+
create_file()
|
code/main.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pdb_featureVector
|
| 2 |
+
import alphafold_featureVector
|
| 3 |
+
import argparse
|
| 4 |
+
|
| 5 |
+
# Command-line interface for ASCARIS feature-vector generation.
parser = argparse.ArgumentParser(description='ASCARIS')

# Structure source: 1 = PDB (default), 2 = AlphaFold.
parser.add_argument('-s', '--source_option',
                    help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
                    default=1)
# Query datapoint(s) or a path to a tab-separated input file.
parser.add_argument('-i', '--input_datapoint',
                    help='Input file or query datapoint\n Option 1: Comma-separated list of idenfiers (UniProt ID-wt residue-position-mutated residue (e.g. Q9Y4W6-N-432-T or Q9Y4W6-N-432-T, Q9Y4W6-N-432-T)) \n Option 2: Enter comma-separated file path')

# Note: stored as the string 'True'/'False', not a bool.
parser.add_argument('-impute', '--imputation_state', default='True',
                    help='Whether resulting feature vector should be imputed or not. Default True.')

args = parser.parse_args()

# Unpack parsed CLI options into module-level names used by run_featureVector.
input_set = args.input_datapoint
mode = args.source_option
impute = args.imputation_state
|
| 21 |
+
|
| 22 |
+
def run_featureVector(input_set, mode, impute):
    """Dispatch feature-vector generation to the selected pipeline.

    mode 1 -> pdb_featureVector.pdb, mode 2 -> alphafold_featureVector.alphafold.
    Any other mode is silently ignored (unchanged from the original behaviour).
    """
    banner = '*****************************************'
    print(banner)
    print('Feature vector generation is in progress. \nPlease check log file for updates..')
    print(banner)
    pipelines = {
        1: pdb_featureVector.pdb,
        2: alphafold_featureVector.alphafold,
    }
    mode = int(mode)
    pipeline = pipelines.get(mode)
    if pipeline is not None:
        pipeline(input_set, mode, impute)
|
| 31 |
+
|
| 32 |
+
if __name__ == '__main__':
|
| 33 |
+
run_featureVector(input_set, mode, impute)
|
| 34 |
+
|
| 35 |
+
|
code/manage_files.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
def manage_files(mode):
    """Create the output directory tree and resolve input/output paths.

    Parameters
    ----------
    mode : int
        1 -> PDB pipeline, 2 -> AlphaFold pipeline.

    Returns
    -------
    tuple of Path
        mode 1: (path_to_input_files, path_to_output_files, path_to_domains,
                 fisher_path, path_to_interfaces, buffer)
        mode 2: (path_to_input_files, path_to_output_files, path_to_domains,
                 fisher_path, path_to_interfaces, alphafold_path,
                 alphafold_summary)

    Side effect: creates the ``out_files/...`` directory tree if missing.
    """
    if mode == 1:
        path_to_input_files = Path('input_files')
        path_to_domains = path_to_input_files / 'domains.txt'
        fisher_path = path_to_input_files / 'significant_domains.txt'
        path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'

        path_to_output_files = Path('out_files/pdb')
        os.makedirs(path_to_output_files / 'pdb_structures/', exist_ok=True)
        os.makedirs(path_to_output_files / 'alignment_files/', exist_ok=True)
        os.makedirs(path_to_output_files / 'swissmodel_structures/', exist_ok=True)
        os.makedirs(path_to_output_files / 'modbase_structures/', exist_ok=True)
        os.makedirs(path_to_output_files / 'modbase_structures_individual/', exist_ok=True)
        os.makedirs(path_to_output_files / 'freesasa_files/', exist_ok=True)
        os.makedirs(path_to_output_files / '3D_alignment/', exist_ok=True)
        buffer = path_to_output_files / 'file_buffer.txt'
        # Dead locals from the original (swiss_model_path,
        # path_to_alignment_files, path_3D_alignment, path_to_freesasa,
        # outpath) were dropped: they were never returned or used.
        return (path_to_input_files, path_to_output_files, path_to_domains,
                fisher_path, path_to_interfaces, buffer)

    elif mode == 2:
        path_to_input_files = Path('input_files')
        path_to_domains = path_to_input_files / 'domains.txt'
        fisher_path = path_to_input_files / 'significant_domains.txt'
        alphafold_summary = path_to_input_files / 'alphafold_summary.txt'
        path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'
        # Structures must be unzipped before use.
        alphafold_path = Path(path_to_input_files / 'alphafold_structures')

        path_to_output_files = Path('out_files/alphafold')
        os.makedirs(path_to_output_files, exist_ok=True)
        os.makedirs(path_to_output_files / 'freesasa_files', exist_ok=True)
        os.makedirs(path_to_output_files / 'alignment_files', exist_ok=True)
        os.makedirs(path_to_output_files / '3D_alignment', exist_ok=True)

        return (path_to_input_files, path_to_output_files, path_to_domains,
                fisher_path, path_to_interfaces, alphafold_path, alphafold_summary)
|
code/pdb_featureVector.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
code/process_input.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
def clean_data(input_set):
    """Parse variant identifiers into a standardized DataFrame.

    Accepted formats for *input_set*:
      * comma-separated identifiers: 'P1-A-1-G, P2-C-2-T'
      * tab-separated identifiers
      * a single identifier: 'P1-A-1-G'
      * a path to a tab-separated .txt file with columns
        uniprotID, wt, pos, mut

    Returns a DataFrame with columns
    ['uniprotID', 'wt', 'pos', 'mut', 'datapoint'] (all values str),
    restricted to the 20 standard amino acids, or None when parsing fails.
    """
    data = pd.DataFrame()
    try:
        # DataFrame.append was removed in pandas 2.0; build rows in plain
        # lists and construct the frame once instead.
        if ',' in input_set:
            rows = [[j.strip() for j in i.strip().split('-')]
                    for i in input_set.split(',')]
            data = pd.DataFrame(rows, columns=['uniprotID', 'wt', 'pos', 'mut'])
        elif '\t' in input_set:
            rows = [[j.strip() for j in i.strip().split('-')]
                    for i in input_set.split('\t')]
            data = pd.DataFrame(rows, columns=['uniprotID', 'wt', 'pos', 'mut'])
        elif '-' in input_set:
            data = pd.DataFrame([[j.strip() for j in input_set.split('-')]],
                                columns=['uniprotID', 'wt', 'pos', 'mut'])
        elif '.txt' in input_set:
            data = pd.read_csv(input_set, sep='\t',
                               names=['uniprotID', 'wt', 'pos', 'mut'])
            data = data[['uniprotID', 'wt', 'pos', 'mut']]

        # Exclude termination codons, synonymous mutations and any
        # non-standard residues such as Sec, 4 or 6.
        aa_list = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L',
                   'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
        data.wt = data.wt.str.strip()
        data.mut = data.mut.str.strip()
        data = data[data.wt.isin(aa_list)]
        data = data[data.mut.isin(aa_list)]

        # Unique identifier per row, e.g. 'Q9Y4W6N432T'.
        for i in data.index:
            data.at[i, 'datapoint'] = (data.at[i, 'uniprotID'] + data.at[i, 'wt']
                                       + str(data.at[i, 'pos']) + data.at[i, 'mut'])

        data = data.astype(str)
        return data
    except Exception:
        # The original had a bare `except:` whose body evaluated the bare
        # name ValueError (a no-op) — every failure was swallowed. Keep the
        # best-effort contract (print + implicit None) but catch explicitly.
        print('Please check the input format.')
|
| 40 |
+
|
code/standard.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def standardize(df, get_columns):
    """Blank out structure-derived feature columns and align column order.

    Each feature listed below is overwritten with the string 'nan'
    (no 3D distance is calculated here, unlike the structure-based
    pipelines), and the frame is re-ordered to match
    ``get_columns.columns``.
    """
    blanked_features = (
        'sasa', 'domaindistance3D', 'disulfide', 'intMet', 'intramembrane',
        'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
        'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
        'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
        'topologicalDomain', 'bindingSite', 'region', 'signalPeptide',
        'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
        'transitPeptide', 'glycosylation', 'propeptide')
    for feature in blanked_features:
        df[feature] = 'nan'
    return df[get_columns.columns]
|
code/uniprotSequenceMatch.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from add_sequence import *
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def uniprotSequenceMatch(data):
    """Fetch UniProt canonical and isoform sequences for all query proteins.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'uniprotID' column.

    Returns
    -------
    (not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta)
        Rows whose accession retrieved no sequence, rows with a sequence,
        plus the canonical- and isoform-sequence lookup tables.

    Relies on ``get_uniprot_seq`` / ``get_isoforms`` from add_sequence
    (network access to UniProt).
    """
    print('Retrieving UniProt sequences...\n')

    canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
    up_list = list(set(data['uniprotID'].to_list()))
    for row, accession in enumerate(up_list):
        canonical_fasta.at[row, 'uniprotSequence'] = get_uniprot_seq(accession)
        canonical_fasta.at[row, 'uniprotID'] = accession

    canonical_fasta = canonical_fasta.drop_duplicates()
    isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
    iso_dict = [get_isoforms(accession) for accession in up_list]

    index = 0
    for isoforms in iso_dict:
        for key, val in isoforms.items():
            isoform_fasta.at[index, 'uniprotID'] = key
            isoform_fasta.at[index, 'isoformSequence'] = val
            index += 1
    isoform_fasta = isoform_fasta.drop_duplicates()

    # Isoform ids look like 'P12345-2'; split the accession from the
    # isoform suffix (characters 7-9) — TODO confirm for 10-char accessions.
    for i in isoform_fasta.index:
        isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip()
        isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6]
    print('Sequence files created...\n')

    data = data.merge(canonical_fasta, on='uniprotID', how='left')
    # np.nan replaces np.NaN, which was removed in NumPy 2.0.
    data = data.replace({'': np.nan, 'nan': np.nan})
    data['whichIsoform'] = np.nan
    data['wt_sequence_match'] = np.nan
    not_match_in_uniprot = data[data.uniprotSequence.isna()]
    uniprot_matched = data[~data.uniprotSequence.isna()]

    return not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta
|
input_files/H_sapiens_interfacesHQ.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90fb5f5fe31e20921290e0da588d50d2939feedac80767cdd3b46225ce849b8d
|
| 3 |
+
size 19252152
|
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5a22037a2ae883cc095f647170271d6a69f38de045206e99c4ac5586658ccb3
|
| 3 |
+
size 26598
|
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93e034885f400396df77e65944c65e8d22000f011343a98d8f7727b97b378860
|
| 3 |
+
size 18469
|
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:367a7e9d82ad6a452f643eed923237ed149cc3cf1dabef23304d4e4f5711a191
|
| 3 |
+
size 25647
|
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:449fa624948266313cdf18a365e11036b6eaa5502395ed88b58f1841ebf70e60
|
| 3 |
+
size 17763
|
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35de071f52a5644df10d8181b5c6034b04734895e155b68d3e3f5133e98f3ef6
|
| 3 |
+
size 27026
|
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a509714d54bdf9b9ad7a9bcdccc4122e256cec371fb04e251f68e2e67ade17a
|
| 3 |
+
size 18748
|
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6b9e658af67a6b4ca14f5c960c4629140eb78588c46cfe1fab3bbe2c1c7d17e
|
| 3 |
+
size 25157
|
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b840d9a1c9de25dd6484ad2675f26e578e883c277d4e332247cb1f45a7706ffb
|
| 3 |
+
size 17329
|
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9077d070c0fea099e5afdc10d4c599367064518be2412088e8f7f2213156f91
|
| 3 |
+
size 26786
|
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1abd18dc11f67b8b3a3dd8b30c4a74fec7fefec62c601153401ca5c550c96dbd
|
| 3 |
+
size 18678
|
input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db309cbaaf7d073230b4ab1a98ecc8213c6cfebfe87cc4f6f3990944feef7059
|
| 3 |
+
size 26727
|