Spaces:
Sleeping
Sleeping
Commit
·
af56dfe
1
Parent(s):
bb5a846
Update code/pdb_featureVector.py
Browse files- code/pdb_featureVector.py +8 -33
code/pdb_featureVector.py
CHANGED
|
@@ -163,8 +163,7 @@ def pdb(input_set, mode, impute):
|
|
| 163 |
data.at[i, 'wt_sequence_match'] = 'i'
|
| 164 |
data.at[i, 'whichIsoform'] = whichIsoform
|
| 165 |
break
|
| 166 |
-
|
| 167 |
-
print(data.to_string())
|
| 168 |
data.wt_sequence_match = data.wt_sequence_match.astype('str')
|
| 169 |
data.replace({'': 'nan'}, inplace=True)
|
| 170 |
data_size = len(data.drop_duplicates(['datapoint']))
|
|
@@ -196,7 +195,6 @@ def pdb(input_set, mode, impute):
|
|
| 196 |
pdbs = [item for sublist in pdbs for item in sublist]
|
| 197 |
|
| 198 |
else:
|
| 199 |
-
print('PDB List Empty')
|
| 200 |
pdbs = []
|
| 201 |
print('Processing PDB structures...\n')
|
| 202 |
if pdbs == []:
|
|
@@ -274,12 +272,7 @@ def pdb(input_set, mode, impute):
|
|
| 274 |
if chain_id in pdb_data_list:
|
| 275 |
# Print UniProt IDs, chain ID, and resolution for the current model
|
| 276 |
chain_id = chain.get_id()
|
| 277 |
-
|
| 278 |
-
#st.write(f"UniProt IDs: {', '.join(uniprot_ids)}")
|
| 279 |
-
#st.write(f"Chain ID: {chain_id}")
|
| 280 |
-
#st.write(f"PDB ID: {search.upper()}")
|
| 281 |
-
#st.write(f"Resolution: {resolution}")
|
| 282 |
-
#st.write(f"Sequence: {sequence}")
|
| 283 |
pdb_fasta.at[index, 'pdbID'] = search
|
| 284 |
pdb_fasta.at[index, 'chain'] = chain_id
|
| 285 |
pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id])
|
|
@@ -289,10 +282,6 @@ def pdb(input_set, mode, impute):
|
|
| 289 |
pdb_info.at[index, 'resolution'] = resolution
|
| 290 |
index += 1
|
| 291 |
|
| 292 |
-
st.write('PDB INFO')
|
| 293 |
-
st.write(pdb_info)
|
| 294 |
-
st.write('PDB FASTA')
|
| 295 |
-
st.write(pdb_fasta)
|
| 296 |
print('PDB file processing finished..')
|
| 297 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
| 298 |
try:
|
|
@@ -308,15 +297,13 @@ def pdb(input_set, mode, impute):
|
|
| 308 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
| 309 |
except:
|
| 310 |
FileNotFoundError
|
| 311 |
-
|
| 312 |
-
st.write(uniprot_matched)
|
| 313 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
| 314 |
uniprot_matched = uniprot_matched.astype(str)
|
| 315 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
| 316 |
uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
|
| 317 |
uniprot_matched = uniprot_matched.astype(str)
|
| 318 |
-
|
| 319 |
-
st.write(uniprot_matched)
|
| 320 |
with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
|
| 321 |
(uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
|
| 322 |
uniprot_matched.resolution != 'None'))].drop_duplicates()
|
|
@@ -434,18 +421,12 @@ def pdb(input_set, mode, impute):
|
|
| 434 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
| 435 |
with_pdb = None
|
| 436 |
|
| 437 |
-
|
| 438 |
-
print(dfM.to_string())
|
| 439 |
-
print('dfNM')
|
| 440 |
-
print(dfNM)
|
| 441 |
print('Aligning sequences...\n')
|
| 442 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 443 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 444 |
|
| 445 |
-
|
| 446 |
-
print(aligned_m.to_string())
|
| 447 |
-
print('aligned_nm')
|
| 448 |
-
print(aligned_nm.to_string())
|
| 449 |
|
| 450 |
|
| 451 |
# When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
|
|
@@ -488,9 +469,7 @@ def pdb(input_set, mode, impute):
|
|
| 488 |
yes_pdb_no_match = after_up_pdb_alignment[
|
| 489 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 490 |
no_pdb = no_pdb.copy()
|
| 491 |
-
|
| 492 |
-
print('-----PDB ALIGNED-----')
|
| 493 |
-
print(pdb_aligned.to_string())
|
| 494 |
|
| 495 |
print('PDB matching is completed...\n')
|
| 496 |
print('SUMMARY')
|
|
@@ -892,7 +871,6 @@ def pdb(input_set, mode, impute):
|
|
| 892 |
if protein not in existing_modbase_models:
|
| 893 |
print('Downloading Modbase models for ', protein)
|
| 894 |
url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
| 895 |
-
print(url)
|
| 896 |
req = requests.get(url)
|
| 897 |
name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
|
| 898 |
with open(name, 'wb') as f:
|
|
@@ -1389,7 +1367,6 @@ def pdb(input_set, mode, impute):
|
|
| 1389 |
|
| 1390 |
aligner = Align.PairwiseAligner()
|
| 1391 |
print('Proceeding to 3D distance calculation...\n')
|
| 1392 |
-
print(data.to_string())
|
| 1393 |
data.domainEndonPDB = data.domainEndonPDB.astype(str)
|
| 1394 |
data.domainStartonPDB = data.domainStartonPDB.astype(str)
|
| 1395 |
|
|
@@ -1419,8 +1396,7 @@ def pdb(input_set, mode, impute):
|
|
| 1419 |
pdbID = data.at[i, 'pdbID']
|
| 1420 |
|
| 1421 |
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
|
| 1422 |
-
|
| 1423 |
-
print(list(alignments))
|
| 1424 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
| 1425 |
try:
|
| 1426 |
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
|
@@ -1549,7 +1525,6 @@ def pdb(input_set, mode, impute):
|
|
| 1549 |
data.positions = data.positions.astype('str')
|
| 1550 |
for i in data.index:
|
| 1551 |
if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
|
| 1552 |
-
print((str(data.at[i, 'pos']) in data.at[i, 'positions']))
|
| 1553 |
data.at[i, 'threeState_trsh4_HQ'] = 'interface'
|
| 1554 |
elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
|
| 1555 |
data.at[i, 'threeState_trsh4_HQ'] = 'surface'
|
|
|
|
| 163 |
data.at[i, 'wt_sequence_match'] = 'i'
|
| 164 |
data.at[i, 'whichIsoform'] = whichIsoform
|
| 165 |
break
|
| 166 |
+
|
|
|
|
| 167 |
data.wt_sequence_match = data.wt_sequence_match.astype('str')
|
| 168 |
data.replace({'': 'nan'}, inplace=True)
|
| 169 |
data_size = len(data.drop_duplicates(['datapoint']))
|
|
|
|
| 195 |
pdbs = [item for sublist in pdbs for item in sublist]
|
| 196 |
|
| 197 |
else:
|
|
|
|
| 198 |
pdbs = []
|
| 199 |
print('Processing PDB structures...\n')
|
| 200 |
if pdbs == []:
|
|
|
|
| 272 |
if chain_id in pdb_data_list:
|
| 273 |
# Print UniProt IDs, chain ID, and resolution for the current model
|
| 274 |
chain_id = chain.get_id()
|
| 275 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
pdb_fasta.at[index, 'pdbID'] = search
|
| 277 |
pdb_fasta.at[index, 'chain'] = chain_id
|
| 278 |
pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id])
|
|
|
|
| 282 |
pdb_info.at[index, 'resolution'] = resolution
|
| 283 |
index += 1
|
| 284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
print('PDB file processing finished..')
|
| 286 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
| 287 |
try:
|
|
|
|
| 297 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
| 298 |
except:
|
| 299 |
FileNotFoundError
|
| 300 |
+
|
|
|
|
| 301 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
| 302 |
uniprot_matched = uniprot_matched.astype(str)
|
| 303 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
| 304 |
uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
|
| 305 |
uniprot_matched = uniprot_matched.astype(str)
|
| 306 |
+
|
|
|
|
| 307 |
with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
|
| 308 |
(uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
|
| 309 |
uniprot_matched.resolution != 'None'))].drop_duplicates()
|
|
|
|
| 421 |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
|
| 422 |
with_pdb = None
|
| 423 |
|
| 424 |
+
|
|
|
|
|
|
|
|
|
|
| 425 |
print('Aligning sequences...\n')
|
| 426 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 427 |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 428 |
|
| 429 |
+
|
|
|
|
|
|
|
|
|
|
| 430 |
|
| 431 |
|
| 432 |
# When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
|
|
|
|
| 469 |
yes_pdb_no_match = after_up_pdb_alignment[
|
| 470 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 471 |
no_pdb = no_pdb.copy()
|
| 472 |
+
|
|
|
|
|
|
|
| 473 |
|
| 474 |
print('PDB matching is completed...\n')
|
| 475 |
print('SUMMARY')
|
|
|
|
| 871 |
if protein not in existing_modbase_models:
|
| 872 |
print('Downloading Modbase models for ', protein)
|
| 873 |
url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
|
|
|
| 874 |
req = requests.get(url)
|
| 875 |
name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
|
| 876 |
with open(name, 'wb') as f:
|
|
|
|
| 1367 |
|
| 1368 |
aligner = Align.PairwiseAligner()
|
| 1369 |
print('Proceeding to 3D distance calculation...\n')
|
|
|
|
| 1370 |
data.domainEndonPDB = data.domainEndonPDB.astype(str)
|
| 1371 |
data.domainStartonPDB = data.domainStartonPDB.astype(str)
|
| 1372 |
|
|
|
|
| 1396 |
pdbID = data.at[i, 'pdbID']
|
| 1397 |
|
| 1398 |
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
|
| 1399 |
+
|
|
|
|
| 1400 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
| 1401 |
try:
|
| 1402 |
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
|
|
|
| 1525 |
data.positions = data.positions.astype('str')
|
| 1526 |
for i in data.index:
|
| 1527 |
if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
|
|
|
|
| 1528 |
data.at[i, 'threeState_trsh4_HQ'] = 'interface'
|
| 1529 |
elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
|
| 1530 |
data.at[i, 'threeState_trsh4_HQ'] = 'surface'
|