Spaces:
Sleeping
Sleeping
Commit
·
1744db1
1
Parent(s):
108cc29
Update code/pdb_featureVector.py
Browse files
- code/pdb_featureVector.py +5 -28
code/pdb_featureVector.py
CHANGED
|
@@ -95,8 +95,6 @@ def pdb(input_set, mode, impute):
|
|
| 95 |
data.domStart = data.domStart.replace({'nan': '-1'})
|
| 96 |
data.domEnd = data.domEnd.replace({'nan': '-1'})
|
| 97 |
data.distance = data.distance.replace({'nan': '-1'})
|
| 98 |
-
st.write('1')
|
| 99 |
-
st.write(data)
|
| 100 |
"""
|
| 101 |
STEP 4
|
| 102 |
Retrieve canonical and isoform UniProt sequences.
|
|
@@ -202,7 +200,6 @@ def pdb(input_set, mode, impute):
|
|
| 202 |
else:
|
| 203 |
pdbs = []
|
| 204 |
print('Processing PDB structures...\n')
|
| 205 |
-
st.write('2')
|
| 206 |
if pdbs == []:
|
| 207 |
print('No PDB structure found for the query. ')
|
| 208 |
print('Starting PDB structures download...\n')
|
|
@@ -303,7 +300,6 @@ def pdb(input_set, mode, impute):
|
|
| 303 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
| 304 |
except:
|
| 305 |
FileNotFoundError
|
| 306 |
-
st.write('3')
|
| 307 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
| 308 |
uniprot_matched = uniprot_matched.astype(str)
|
| 309 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
|
@@ -408,9 +404,7 @@ def pdb(input_set, mode, impute):
|
|
| 408 |
dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
| 409 |
dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
| 410 |
dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
|
| 411 |
-
|
| 412 |
-
st.write(dfM)
|
| 413 |
-
st.write(dfNM)
|
| 414 |
dfM = dfM.astype(str)
|
| 415 |
dfNM = dfNM.astype(str)
|
| 416 |
|
|
@@ -432,12 +426,8 @@ def pdb(input_set, mode, impute):
|
|
| 432 |
|
| 433 |
print('Aligning sequences...\n')
|
| 434 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 435 |
-
st.write('aligned_m')
|
| 436 |
-
st.write(aligned_m)
|
| 437 |
-
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 438 |
-
st.write(aligned_nm)
|
| 439 |
|
| 440 |
-
|
| 441 |
|
| 442 |
|
| 443 |
|
|
@@ -508,7 +498,6 @@ def pdb(input_set, mode, impute):
|
|
| 508 |
|
| 509 |
print('Proceeding to SwissModel search...')
|
| 510 |
print('------------------------------------\n')
|
| 511 |
-
st.write('5')
|
| 512 |
# At this point we have 4 dataframes
|
| 513 |
# 1. after_up_pdb_alignment --- This is the data after PDB sequence alignment. Some mutations may not have been matched by the alignment; those will be searched in other databases as well.
|
| 514 |
# 1a. aligned --- we are done with this.
|
|
@@ -607,7 +596,6 @@ def pdb(input_set, mode, impute):
|
|
| 607 |
|
| 608 |
with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
|
| 609 |
with_swiss_models = with_swiss_models[to_swiss.columns]
|
| 610 |
-
st.write('6')
|
| 611 |
# Add model info.
|
| 612 |
|
| 613 |
with_swiss_models = with_swiss_models.astype(str)
|
|
@@ -713,7 +701,6 @@ def pdb(input_set, mode, impute):
|
|
| 713 |
swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
|
| 714 |
else:
|
| 715 |
swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
|
| 716 |
-
st.write('7')
|
| 717 |
swissmodels_fasta = swissmodels_fasta.astype(str)
|
| 718 |
|
| 719 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
|
|
@@ -828,7 +815,6 @@ def pdb(input_set, mode, impute):
|
|
| 828 |
to_swiss_columns = to_swiss.columns
|
| 829 |
to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
|
| 830 |
to_swiss = None
|
| 831 |
-
st.write('8')
|
| 832 |
# CONTROL
|
| 833 |
|
| 834 |
"""
|
|
@@ -1325,7 +1311,6 @@ def pdb(input_set, mode, impute):
|
|
| 1325 |
swiss['source'] = 'SWISSMODEL'
|
| 1326 |
modbase['source'] = 'MODBASE'
|
| 1327 |
data = pd.concat([swiss, modbase, pdb])
|
| 1328 |
-
st.write(data)
|
| 1329 |
data.reset_index(inplace=True)
|
| 1330 |
data.drop(['index'], axis=1, inplace=True)
|
| 1331 |
data = data.astype('str')
|
|
@@ -1344,7 +1329,6 @@ def pdb(input_set, mode, impute):
|
|
| 1344 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
| 1345 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
| 1346 |
print('Calculation RSA for PDB Structure Files...\n')
|
| 1347 |
-
st.write(existing_free_sasa)
|
| 1348 |
pdb_only = data[data.source == 'PDB']
|
| 1349 |
|
| 1350 |
|
|
@@ -1381,7 +1365,6 @@ def pdb(input_set, mode, impute):
|
|
| 1381 |
existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
| 1382 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
| 1383 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
| 1384 |
-
st.write(existing_free_sasa)
|
| 1385 |
annotation_list += ['domainStartonPDB', 'domainEndonPDB']
|
| 1386 |
|
| 1387 |
folder_path = path_to_output_files / 'freesasa_files'
|
|
@@ -1397,8 +1380,6 @@ def pdb(input_set, mode, impute):
|
|
| 1397 |
modbase_only = None
|
| 1398 |
data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
|
| 1399 |
data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
|
| 1400 |
-
st.write('after')
|
| 1401 |
-
st.write(data)
|
| 1402 |
for i in data.index:
|
| 1403 |
id_ = data.at[i, 'pdbID'].lower()
|
| 1404 |
up_id_ = data.at[i, 'uniprotID']
|
|
@@ -1420,13 +1401,11 @@ def pdb(input_set, mode, impute):
|
|
| 1420 |
|
| 1421 |
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
|
| 1422 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
| 1423 |
-
st.write('mutpos', mutPos)
|
| 1424 |
try:
|
| 1425 |
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
| 1426 |
except:
|
| 1427 |
ValueError
|
| 1428 |
coordMut = 'nan'
|
| 1429 |
-
st.write('coordMut', coordMut)
|
| 1430 |
try:
|
| 1431 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
| 1432 |
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
|
|
@@ -1434,8 +1413,7 @@ def pdb(input_set, mode, impute):
|
|
| 1434 |
except:
|
| 1435 |
ValueError
|
| 1436 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
| 1437 |
-
|
| 1438 |
-
st.write(data)
|
| 1439 |
for annot in annotation_list:
|
| 1440 |
annotx = []
|
| 1441 |
try:
|
|
@@ -1501,8 +1479,7 @@ def pdb(input_set, mode, impute):
|
|
| 1501 |
k = pd.Series((key, str(list(set(val)))))
|
| 1502 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
| 1503 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
| 1504 |
-
|
| 1505 |
-
st.write(data)
|
| 1506 |
if len(data) == 0:
|
| 1507 |
data = pd.DataFrame(
|
| 1508 |
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
|
@@ -1711,4 +1688,4 @@ def pdb(input_set, mode, impute):
|
|
| 1711 |
hours, rem = divmod(end - start, 3600)
|
| 1712 |
minutes, seconds = divmod(rem, 60)
|
| 1713 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 1714 |
-
return ready
|
|
|
|
| 95 |
data.domStart = data.domStart.replace({'nan': '-1'})
|
| 96 |
data.domEnd = data.domEnd.replace({'nan': '-1'})
|
| 97 |
data.distance = data.distance.replace({'nan': '-1'})
|
|
|
|
|
|
|
| 98 |
"""
|
| 99 |
STEP 4
|
| 100 |
Retrieve canonical and isoform UniProt sequences.
|
|
|
|
| 200 |
else:
|
| 201 |
pdbs = []
|
| 202 |
print('Processing PDB structures...\n')
|
|
|
|
| 203 |
if pdbs == []:
|
| 204 |
print('No PDB structure found for the query. ')
|
| 205 |
print('Starting PDB structures download...\n')
|
|
|
|
| 300 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
| 301 |
except:
|
| 302 |
FileNotFoundError
|
|
|
|
| 303 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
| 304 |
uniprot_matched = uniprot_matched.astype(str)
|
| 305 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
|
|
|
| 404 |
dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
| 405 |
dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
| 406 |
dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
|
| 407 |
+
|
|
|
|
|
|
|
| 408 |
dfM = dfM.astype(str)
|
| 409 |
dfNM = dfNM.astype(str)
|
| 410 |
|
|
|
|
| 426 |
|
| 427 |
print('Aligning sequences...\n')
|
| 428 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
+
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
| 431 |
|
| 432 |
|
| 433 |
|
|
|
|
| 498 |
|
| 499 |
print('Proceeding to SwissModel search...')
|
| 500 |
print('------------------------------------\n')
|
|
|
|
| 501 |
# At this point we have 4 dataframes
|
| 502 |
# 1. after_up_pdb_alignment --- This is the data after PDB sequence alignment. Some mutations may not have been matched by the alignment; those will be searched in other databases as well.
|
| 503 |
# 1a. aligned --- we are done with this.
|
|
|
|
| 596 |
|
| 597 |
with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
|
| 598 |
with_swiss_models = with_swiss_models[to_swiss.columns]
|
|
|
|
| 599 |
# Add model info.
|
| 600 |
|
| 601 |
with_swiss_models = with_swiss_models.astype(str)
|
|
|
|
| 701 |
swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
|
| 702 |
else:
|
| 703 |
swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
|
|
|
|
| 704 |
swissmodels_fasta = swissmodels_fasta.astype(str)
|
| 705 |
|
| 706 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
|
|
|
|
| 815 |
to_swiss_columns = to_swiss.columns
|
| 816 |
to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
|
| 817 |
to_swiss = None
|
|
|
|
| 818 |
# CONTROL
|
| 819 |
|
| 820 |
"""
|
|
|
|
| 1311 |
swiss['source'] = 'SWISSMODEL'
|
| 1312 |
modbase['source'] = 'MODBASE'
|
| 1313 |
data = pd.concat([swiss, modbase, pdb])
|
|
|
|
| 1314 |
data.reset_index(inplace=True)
|
| 1315 |
data.drop(['index'], axis=1, inplace=True)
|
| 1316 |
data = data.astype('str')
|
|
|
|
| 1329 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
| 1330 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
| 1331 |
print('Calculation RSA for PDB Structure Files...\n')
|
|
|
|
| 1332 |
pdb_only = data[data.source == 'PDB']
|
| 1333 |
|
| 1334 |
|
|
|
|
| 1365 |
existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
| 1366 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
| 1367 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
|
|
|
| 1368 |
annotation_list += ['domainStartonPDB', 'domainEndonPDB']
|
| 1369 |
|
| 1370 |
folder_path = path_to_output_files / 'freesasa_files'
|
|
|
|
| 1380 |
modbase_only = None
|
| 1381 |
data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
|
| 1382 |
data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
|
|
|
|
|
|
|
| 1383 |
for i in data.index:
|
| 1384 |
id_ = data.at[i, 'pdbID'].lower()
|
| 1385 |
up_id_ = data.at[i, 'uniprotID']
|
|
|
|
| 1401 |
|
| 1402 |
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
|
| 1403 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
|
|
|
| 1404 |
try:
|
| 1405 |
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
| 1406 |
except:
|
| 1407 |
ValueError
|
| 1408 |
coordMut = 'nan'
|
|
|
|
| 1409 |
try:
|
| 1410 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
| 1411 |
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
|
|
|
|
| 1413 |
except:
|
| 1414 |
ValueError
|
| 1415 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
| 1416 |
+
|
|
|
|
| 1417 |
for annot in annotation_list:
|
| 1418 |
annotx = []
|
| 1419 |
try:
|
|
|
|
| 1479 |
k = pd.Series((key, str(list(set(val)))))
|
| 1480 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
| 1481 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
| 1482 |
+
|
|
|
|
| 1483 |
if len(data) == 0:
|
| 1484 |
data = pd.DataFrame(
|
| 1485 |
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
|
|
|
| 1688 |
hours, rem = divmod(end - start, 3600)
|
| 1689 |
minutes, seconds = divmod(rem, 60)
|
| 1690 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 1691 |
+
return ready
|