Spaces:

HUBioDataLab
/

ASCARIS

Sleeping

App Files Community

fatmacankara committed on Aug 25, 2023

Commit

af56dfe

1 Parent(s): bb5a846

Update code/pdb_featureVector.py

Browse files

Files changed (1) hide show

code/pdb_featureVector.py +8 -33

code/pdb_featureVector.py CHANGED Viewed

@@ -163,8 +163,7 @@ def pdb(input_set, mode, impute):
                             data.at[i, 'wt_sequence_match'] = 'i'
                             data.at[i, 'whichIsoform'] = whichIsoform
                             break
-        print('MATCHING UNIPTOR')
-        print(data.to_string())
         data.wt_sequence_match = data.wt_sequence_match.astype('str')
         data.replace({'': 'nan'}, inplace=True)
         data_size = len(data.drop_duplicates(['datapoint']))
@@ -196,7 +195,6 @@ def pdb(input_set, mode, impute):
             pdbs = [item for sublist in pdbs for item in sublist]
         else:
-            print('PDB List Empty')
             pdbs = []
         print('Processing PDB structures...\n')
         if pdbs == []:
@@ -274,12 +272,7 @@ def pdb(input_set, mode, impute):
                     if chain_id in pdb_data_list:
                     # Print UniProt IDs, chain ID, and resolution for the current model
                         chain_id = chain.get_id()
-                        #st.write(f"---- Information for Chain {chain_id} in Model {i} ----")
-                        #st.write(f"UniProt IDs: {', '.join(uniprot_ids)}")
-                        #st.write(f"Chain ID: {chain_id}")
-                        #st.write(f"PDB ID: {search.upper()}")
-                        #st.write(f"Resolution: {resolution}")
-                        #st.write(f"Sequence: {sequence}")
                         pdb_fasta.at[index, 'pdbID'] = search
                         pdb_fasta.at[index, 'chain'] = chain_id
                         pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id])
@@ -289,10 +282,6 @@ def pdb(input_set, mode, impute):
                         pdb_info.at[index, 'resolution'] = resolution
                         index += 1
-        st.write('PDB INFO')
-        st.write(pdb_info)
-        st.write('PDB FASTA')
-        st.write(pdb_fasta)
         print('PDB file processing finished..')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
             try:
@@ -308,15 +297,13 @@ def pdb(input_set, mode, impute):
                     filename.rename(filename_replace_ext.with_suffix('.pdb'))
             except:
                 FileNotFoundError
-        st.write('uniprot_matched before')
-        st.write(uniprot_matched)
         uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
         uniprot_matched = uniprot_matched.astype(str)
         uniprot_matched = uniprot_matched.drop_duplicates()
         uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
         uniprot_matched = uniprot_matched.astype(str)
-        st.write('uniprot_matched after')
-        st.write(uniprot_matched)
         with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
                 (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
                 uniprot_matched.resolution != 'None'))].drop_duplicates()
@@ -434,18 +421,12 @@ def pdb(input_set, mode, impute):
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
-        print('dfM')
-        print(dfM.to_string())
-        print('dfNM')
-        print(dfNM)
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
-        print('aligned_m')
-        print(aligned_m.to_string())
-        print('aligned_nm')
-        print(aligned_nm.to_string())
         # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
@@ -488,9 +469,7 @@ def pdb(input_set, mode, impute):
         yes_pdb_no_match = after_up_pdb_alignment[
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
-        print('-----PDB ALIGNED-----')
-        print(pdb_aligned.to_string())
         print('PDB matching is completed...\n')
         print('SUMMARY')
@@ -892,7 +871,6 @@ def pdb(input_set, mode, impute):
                 if protein not in existing_modbase_models:
                     print('Downloading Modbase models for ', protein)
                     url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
-                    print(url)
                     req = requests.get(url)
                     name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
                     with open(name, 'wb') as f:
@@ -1389,7 +1367,6 @@ def pdb(input_set, mode, impute):
         aligner = Align.PairwiseAligner()
         print('Proceeding to 3D distance calculation...\n')
-        print(data.to_string())
         data.domainEndonPDB = data.domainEndonPDB.astype(str)
         data.domainStartonPDB = data.domainStartonPDB.astype(str)
@@ -1419,8 +1396,7 @@ def pdb(input_set, mode, impute):
             pdbID = data.at[i, 'pdbID']
             alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
-            print('main_alignments')
-            print(list(alignments))
             mutPos = data.at[i, 'mutationPositionOnPDB']
             try:
                 coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
@@ -1549,7 +1525,6 @@ def pdb(input_set, mode, impute):
         data.positions = data.positions.astype('str')
         for i in data.index:
             if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
-                print((str(data.at[i, 'pos']) in data.at[i, 'positions']))
                 data.at[i, 'threeState_trsh4_HQ'] = 'interface'
             elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
                 data.at[i, 'threeState_trsh4_HQ'] = 'surface'

                             data.at[i, 'wt_sequence_match'] = 'i'
                             data.at[i, 'whichIsoform'] = whichIsoform
                             break
         data.wt_sequence_match = data.wt_sequence_match.astype('str')
         data.replace({'': 'nan'}, inplace=True)
         data_size = len(data.drop_duplicates(['datapoint']))
             pdbs = [item for sublist in pdbs for item in sublist]
         else:
             pdbs = []
         print('Processing PDB structures...\n')
         if pdbs == []:
                     if chain_id in pdb_data_list:
                     # Print UniProt IDs, chain ID, and resolution for the current model
                         chain_id = chain.get_id()
                         pdb_fasta.at[index, 'pdbID'] = search
                         pdb_fasta.at[index, 'chain'] = chain_id
                         pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id])
                         pdb_info.at[index, 'resolution'] = resolution
                         index += 1
         print('PDB file processing finished..')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
             try:
                     filename.rename(filename_replace_ext.with_suffix('.pdb'))
             except:
                 FileNotFoundError
         uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
         uniprot_matched = uniprot_matched.astype(str)
         uniprot_matched = uniprot_matched.drop_duplicates()
         uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
         uniprot_matched = uniprot_matched.astype(str)
         with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
                 (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
                 uniprot_matched.resolution != 'None'))].drop_duplicates()
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
         yes_pdb_no_match = after_up_pdb_alignment[
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
         print('PDB matching is completed...\n')
         print('SUMMARY')
                 if protein not in existing_modbase_models:
                     print('Downloading Modbase models for ', protein)
                     url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
                     req = requests.get(url)
                     name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
                     with open(name, 'wb') as f:
         aligner = Align.PairwiseAligner()
         print('Proceeding to 3D distance calculation...\n')
         data.domainEndonPDB = data.domainEndonPDB.astype(str)
         data.domainStartonPDB = data.domainStartonPDB.astype(str)
             pdbID = data.at[i, 'pdbID']
             alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
             mutPos = data.at[i, 'mutationPositionOnPDB']
             try:
                 coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
         data.positions = data.positions.astype('str')
         for i in data.index:
             if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
                 data.at[i, 'threeState_trsh4_HQ'] = 'interface'
             elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
                 data.at[i, 'threeState_trsh4_HQ'] = 'surface'