Spaces:
Running
Running
Commit
·
c8c0720
1
Parent(s):
d9ba5a4
Create add_domains.py
Browse files- code/add_domains.py +32 -0
code/add_domains.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def add_domains(data, path_to_domains):
    """Attach domain annotations to *data* and flag in-domain positions.

    The domain table is read from ``path_to_domains`` as a space-delimited
    file and left-joined onto ``data`` (``data.uniprotID`` against the
    table's ``proteinID``). The code relies on the table providing the
    columns ``proteinID``, ``domain``, ``domStart`` and ``domEnd``, and on
    ``data`` providing ``uniprotID``, ``pos`` and ``datapoint``.

    A ``distance`` column is added:

    * ``0`` when the row has a domain and ``domStart <= pos <= domEnd``
      (all three compared after ``int()`` conversion);
    * ``-1000`` for every other row that is kept.

    Rows that are not at distance 0 are dropped when their domain name
    occurs in at least one distance-0 row.
    NOTE(review): this filter is applied across the whole frame, not per
    ``datapoint`` -- confirm that is intended.

    Finally every remaining missing value is replaced with ``-1``.

    Args:
        data: DataFrame with at least ``uniprotID``, ``pos`` and
            ``datapoint`` columns. It is not modified.
        path_to_domains: Path or file-like object accepted by
            ``pandas.read_csv``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: distance-0 rows first,
        then the remaining rows, each group ordered by ``datapoint``.
    """
    domains = pd.read_csv(path_to_domains, delimiter=' ')
    data = data.merge(domains, right_on='proteinID', left_on='uniprotID', how='left')
    # Nullable integers keep the coordinates integral even though rows
    # without a matching protein have no start/end.
    data['domStart'] = data['domStart'].astype('Int64')
    data['domEnd'] = data['domEnd'].astype('Int64')
    data = data.drop(['proteinID'], axis=1)

    # One pass over the rows; ``and`` short-circuits so the int()
    # conversions are only attempted for rows that actually have a domain.
    inside = pd.Series(
        [
            pd.notna(domain) and int(start) <= int(pos) <= int(end)
            for domain, start, pos, end in zip(
                data['domain'], data['domStart'], data['pos'], data['domEnd']
            )
        ],
        index=data.index,
        dtype=bool,
    )
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    data['distance'] = np.where(inside, 0.0, np.nan)
    zero_distance_domains = set(data.loc[inside, 'domain'])

    # Distances will be sorted (missing distances go last within a datapoint).
    data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)

    is_zero = data['distance'] == 0.0
    zero_distance = data[is_zero]
    # Keep only non-zero rows whose domain never produced a distance-0 hit.
    # ``assign`` returns a new frame, so no write happens on a slice of
    # ``data`` (avoids SettingWithCopyWarning).
    keep = ~is_zero & ~data['domain'].isin(zero_distance_domains)
    not_zero_distance = data[keep].assign(distance=-1000)

    data = pd.concat([zero_distance, not_zero_distance], sort=False)
    data = data.reset_index(drop=True)
    data = data.fillna(-1)
    return data
|