Spaces:

HUBioDataLab
/

ASCARIS

Sleeping

App Files Files Community

fatmacankara committed on Jul 23, 2023

Commit

83a7ed8

1 Parent(s): 001c319

Update code/add_alignment.py

Browse files

Files changed (1) hide show

code/add_alignment.py +10 -100

code/add_alignment.py CHANGED Viewed

@@ -1,114 +1,36 @@
 from Bio import Align
 from Bio.Align import substitution_matrices
 from pathlib import Path
-import streamlit as st
 from Bio.pairwise2 import format_alignment
-from Bio import pairwise2
-from Bio import pairwise2
-from Bio.SubsMat import MatrixInfo as matlist
-"""
 def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
-    aligner = Align.PairwiseAligner()
     #print(f'Aligning Datapoint: {identifier}')
     if len(pdbSequence) >= 1:
-        f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
         aligner.mode = 'local'
         aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
         aligner.open_gap_score = -11
         aligner.extend_gap_score = -1
         alignments = aligner.align(uniprotSequence, pdbSequence)
         alignments = (list(alignments))
-        merge_in_threes = str(alignments[0]).split('\n')
-        K = 3
-        res = ["".join(str(alignments[0]).split('\n')[idx: idx + K]) for idx in range(len(str(alignments[0]).split('\n')) - K + 1)]
-        slice_val = slice(0,len(res),4)
-        writtenlist = res[slice_val]
-        new_alignment = []
-        for i in writtenlist:
-            cont1 = list(filter(None, i.split('target')))
-            cont2 = cont1[0].split('query')
-            target_pos = (list(filter(None,cont2[0].split(' '))))[0]
-            target = (list(filter(None,cont2[0].split(' '))))[1]
-            alg_pos = (list(filter(None,cont2[0].split(' '))))[2]
-            alg = (list(filter(None,cont2[0].split(' '))))[3]
-            query_pos = (list(filter(None,cont2[1].split(' '))))[0]
-            query = (list(filter(None,cont2[1].split(' '))))[1]
-            if int(target_pos)>0:
-                new_target = int(target_pos) * 'X' + target
-            else:
-                new_target = int(target_pos) * ' ' + target
-            if int(alg_pos)>0:
-                new_alg = int(target_pos) * 'X' + target
-            else:
-                new_alg = int(target_pos) * ' ' + alg
-            if int(query_pos)>0:
-                new_query = int(target_pos) * 'X' + target
-            else:
-                new_query = int(target_pos) * ' ' + target
-            new_alignment.append(new_target+'\n' +new_alg +'\n' +new_query)
         alignment_list = []
-        k = 0
-        for alignment in new_alignment:
-            k += 1
-            st.write('COUNT', k)
-            st.write('alignment')
-            st.write(alignment)
-            f.write(str(alignment))
-            f.write('\n')
-            f.write('\n')
-            alignment = (str(alignment).strip().split('\n'))
-            alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
-            st.write('alignment_updated')
-            st.write(alignment)
-            alignment_list.append(alignment)
-    return alignment_list
-"""
-def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
-    aligner = Align.PairwiseAligner()
-    #print(f'Aligning Datapoint: {identifier}')
-    if len(pdbSequence) >= 1:
-        f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
-        aligner.mode = 'local'
-        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
-        aligner.open_gap_score = -11
-        aligner.extend_gap_score = -1
-        alignments = aligner.align(uniprotSequence, pdbSequence)
-        sub_matrix = matlist.blosum62
-        alignments2 = pairwise2.align.localds(uniprotSequence, pdbSequence, sub_matrix, -11, -1)
-        alignment_list = []
-        k = 0
         for alignment in alignments:
             f.write(str(alignment))
             f.write('\n')
             f.write('\n')
             alignment = (str(alignment).strip().split('\n'))
             alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
             alignment_list.append(alignment)
     return alignment_list
 def mutation_position_on_pdb(alignment_list, pos):
     which_alignment_to_go = 0
     for alignment in alignment_list:
-        #char_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
-        #for char in alignment[1]:
-          #  if char in char_list:
-            #    alignment[1] = alignment[1].replace(char, '.')
         which_alignment_to_go += 1
         alignment_uniprot = alignment[0]
         alignment_pdb = alignment[2]
@@ -119,7 +41,6 @@ def mutation_position_on_pdb(alignment_list, pos):
                     startGap += 1
                 else:
                     break
         countGap = startGap
         countResidue = 0
         canonicalRes = ' '
@@ -129,7 +50,6 @@ def mutation_position_on_pdb(alignment_list, pos):
                 countGap += 1
             else:
                 countResidue += 1
             if int(countResidue) == int(pos):
                 canonicalRes = alignment_uniprot[countResidue + countGap - 1]
                 try:
@@ -138,7 +58,6 @@ def mutation_position_on_pdb(alignment_list, pos):
                     IndexError
                     pdbRes = 'nan'
                 break
         if (alignment[1][countResidue + countGap - 1] == '|') or (alignment[1][countResidue + countGap - 1] == 'X'):
             if canonicalRes == pdbRes:
                 pdb_alignStatus = 'aligned'
@@ -154,16 +73,12 @@ def mutation_position_on_pdb(alignment_list, pos):
                 countResidue + countGap - 1] == '-':
                 mutationPositionOnPDB = 'nan'
                 posPDB = 'nan'
             else:
                 posPDB = countResidue + countGap - countGap_pdb
                 mutationPositionOnPDB = str(posPDB)
             break
         elif (canonicalRes == pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
-                alignment[1][poscountResidue+ countGap - 1] == '-')):
             pdb_alignStatus = 'not_aligned'
             mutationPositionOnPDB = 'nan'
         elif (canonicalRes != pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
@@ -174,7 +89,10 @@ def mutation_position_on_pdb(alignment_list, pos):
             countResidue + countGap - 1] == '-':
             mutationPositionOnPDB = 'nan'
             posPDB = 'nan'
     return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
@@ -388,13 +306,9 @@ def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifie
 def final_stage(df, annotation_list, alignment_path):
     for i in df.index:
         identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
         alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)
         df.at[i, 'pdb_alignStatus'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[0]
-        print()
         df.at[i, 'mutationPositionOnPDB'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[1]
         startGap = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[2]
         alignment_to_use = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[3]
@@ -411,13 +325,9 @@ def final_stage(df, annotation_list, alignment_path):
                 str(df.at[i, 'domStart']) != '-1.0' or str(df.at[i, 'domEnd']) != '-1.0':
             df.at[i, 'domainStartonPDB'] = 'nan'
             df.at[i, 'domainEndonPDB'] = 'nan'
-    df = df.astype(str)
     return df
 def alignment(dataframe_to_align, annotation_list, alignment_path):
     """Entry point for the alignment step; delegates to ``final_stage``.

     All three arguments are forwarded unchanged to ``final_stage`` and its
     return value is returned as-is.

     Args:
         dataframe_to_align: Passed to ``final_stage`` as ``df``.
         annotation_list: Passed to ``final_stage`` as ``annotation_list``.
         alignment_path: Passed to ``final_stage`` as ``alignment_path``.

     Returns:
         Whatever ``final_stage`` returns for these arguments.
     """
     # The former local ``domainList`` was assigned but never read, so it has
     # been dropped; the call itself is unchanged.
     return final_stage(dataframe_to_align, annotation_list, alignment_path)
-#

 from Bio import Align
 from Bio.Align import substitution_matrices
 from pathlib import Path
+aligner = Align.PairwiseAligner()
 from Bio.pairwise2 import format_alignment
 def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
     """Locally align a UniProt sequence against a PDB sequence and log the result.

     The module-level ``aligner`` is configured for local alignment with the
     BLOSUM62 substitution matrix (gap open -11, gap extend -1). Every
     alignment it yields is written, followed by a blank line, to
     ``<alignment_path>/<identifier>_alignment.txt``.

     Args:
         identifier: Prefix used for the name of the output text file.
         uniprotSequence: Target sequence handed to ``aligner.align``.
         pdbSequence: Query sequence handed to ``aligner.align``.
         alignment_path: Directory-like object supporting ``/`` (e.g. a
             ``pathlib.Path``) in which the alignment file is created.

     Returns:
         A list with one entry per alignment. Each entry is the list of text
         lines of ``str(alignment)`` (stripped), with every space replaced by
         ``'.'``. An empty list is returned when ``pdbSequence`` is empty; in
         that case no file is written.
     """
     # Bound before the length check so an empty pdbSequence yields [] rather
     # than raising UnboundLocalError at the return statement.
     alignment_list = []
     if len(pdbSequence) >= 1:
         out_file = Path(alignment_path / f'{identifier}_alignment.txt')
         # The context manager guarantees the handle is closed (and buffered
         # output flushed) even if the aligner raises part-way through.
         with open(out_file, "w") as f:
             aligner.mode = 'local'
             aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
             aligner.open_gap_score = -11
             aligner.extend_gap_score = -1
             alignments = list(aligner.align(uniprotSequence, pdbSequence))
             for alignment in alignments:
                 f.write(str(alignment))
                 f.write('\n')
                 f.write('\n')
                 rows = str(alignment).strip().split('\n')
                 # Spaces become '.' so downstream code can index the rows
                 # character by character without blank cells.
                 alignment_list.append([row.replace(' ', '.') for row in rows])
     return alignment_list
 def mutation_position_on_pdb(alignment_list, pos):
     which_alignment_to_go = 0
     for alignment in alignment_list:
         which_alignment_to_go += 1
         alignment_uniprot = alignment[0]
         alignment_pdb = alignment[2]
                     startGap += 1
                 else:
                     break
         countGap = startGap
         countResidue = 0
         canonicalRes = ' '
                 countGap += 1
             else:
                 countResidue += 1
             if int(countResidue) == int(pos):
                 canonicalRes = alignment_uniprot[countResidue + countGap - 1]
                 try:
                     IndexError
                     pdbRes = 'nan'
                 break
         if (alignment[1][countResidue + countGap - 1] == '|') or (alignment[1][countResidue + countGap - 1] == 'X'):
             if canonicalRes == pdbRes:
                 pdb_alignStatus = 'aligned'
                 countResidue + countGap - 1] == '-':
                 mutationPositionOnPDB = 'nan'
                 posPDB = 'nan'
             else:
                 posPDB = countResidue + countGap - countGap_pdb
                 mutationPositionOnPDB = str(posPDB)
             break
         elif (canonicalRes == pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
+                alignment[1][countResidue + countGap - 1] == '-')):
             pdb_alignStatus = 'not_aligned'
             mutationPositionOnPDB = 'nan'
         elif (canonicalRes != pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
             countResidue + countGap - 1] == '-':
             mutationPositionOnPDB = 'nan'
             posPDB = 'nan'
+        else:
+            pdb_alignStatus = 'not_aligned'
+            mutationPositionOnPDB = 'nan'
+    print(pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
     return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
 def final_stage(df, annotation_list, alignment_path):
     for i in df.index:
         identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
         alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)
         df.at[i, 'pdb_alignStatus'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[0]
         df.at[i, 'mutationPositionOnPDB'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[1]
         startGap = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[2]
         alignment_to_use = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[3]
                 str(df.at[i, 'domStart']) != '-1.0' or str(df.at[i, 'domEnd']) != '-1.0':
             df.at[i, 'domainStartonPDB'] = 'nan'
             df.at[i, 'domainEndonPDB'] = 'nan'
     return df
 def alignment(dataframe_to_align, annotation_list, alignment_path):
     """Entry point for the alignment step; delegates to ``final_stage``.

     All three arguments are forwarded unchanged to ``final_stage`` and its
     return value is returned as-is.

     Args:
         dataframe_to_align: Passed to ``final_stage`` as ``df``.
         annotation_list: Passed to ``final_stage`` as ``annotation_list``.
         alignment_path: Passed to ``final_stage`` as ``alignment_path``.

     Returns:
         Whatever ``final_stage`` returns for these arguments.
     """
     # The former local ``domainList`` was assigned but never read, so it has
     # been dropped; the call itself is unchanged.
     return final_stage(dataframe_to_align, annotation_list, alignment_path)