|
|
|
""" |
|
Created on Sat Jul 31 21:54:08 2021 |
|
|
|
@author: Osama |
|
""" |
|
|
|
from torch.utils.data import Dataset |
|
from Bio.PDB import Polypeptide |
|
import numpy as np |
|
import torch |
|
import pandas as pd |
|
import os |
|
|
|
import ast |
|
import pdb |
|
|
|
|
|
class InterpepComplexes(Dataset):
    """Dataset over the Interpep peptide-protein examples stored as ``.npz`` files.

    Indexing yields ``(pep_sequence, prot_sequence, target)``; see
    ``__getitem__`` for how each element is built.
    """

    # Split names accepted by the constructor.
    _MODES = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/interpep_data/"):
        """Load the example-name lists and select the split to serve.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories that
                ``__getitem__`` reads from.

        Raises:
            ValueError: If ``mode`` is not a known split name.
        """
        # Fail fast: an unknown mode used to surface only later, as an
        # AttributeError in __len__ or an UnboundLocalError in __getitem__.
        if mode not in self._MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self._MODES, mode))

        self.mode = mode
        self.encoded_data_directory = encoded_data_directory

        # The example-list locations are fixed; they do not follow
        # ``encoded_data_directory``.
        self.train_dir = "../../datasets/interpep_data/train_examples.npy"
        self.test_dir = "../../datasets/interpep_data/test_examples.npy"
        self.val_dir = "../../datasets/interpep_data/val_examples.npy"

        # All three lists are loaded regardless of mode so the attributes
        # are always available.
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)

        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example-name list that matches ``self.mode``."""
        by_mode = {
            "train": self.train_list,
            "val": self.val_list,
            "test": self.test_list,
        }
        return by_mode[self.mode]

    def __getitem__(self, index):
        """Build one example from its fragment and receptor ``.npz`` files.

        Args:
            index: Position of the example in the list of the active split.

        Returns:
            Tuple ``(pep_sequence, prot_sequence, target)``:

            * ``pep_sequence`` - tensor holding the argmax over the last axis
              of the stored ``target_sequence`` array.
            * ``prot_sequence`` - string of one-letter codes joined by single
              spaces, one per row of ``nodes``.
            * ``target`` - ``torch.LongTensor`` with one entry per row of
              ``nodes``, set to 1 at the stored ``binding_sites`` indices.
        """
        item = self._examples()[index]
        file_dir = self.encoded_data_directory

        # os.path.join keeps the paths valid whether or not the directory
        # was given with a trailing separator.
        fragment_path = os.path.join(file_dir, "fragment_data", item + ".npz")
        with np.load(fragment_path) as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated
        # fields of the example name.
        name_parts = item.split("_")
        receptor_path = os.path.join(
            file_dir, "receptor_data",
            name_parts[0] + "_" + name_parts[1] + ".npz")
        with np.load(receptor_path) as data:
            temp_nodes = data["nodes"]

        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)

        # Only the first 20 node features are used to recover the sequence
        # (presumably a one-hot amino-acid encoding -- TODO confirm).
        residue_indices = np.argmax(temp_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the active split."""
        return self.num_data
|
|
|
class PPI(Dataset):
    """Protein-protein interaction pairs read from per-split CSV files.

    Indexing yields ``(pep_sequence, prot_sequence, target)``; see
    ``__getitem__`` for how the two chains are assigned to each role.
    """

    # Split names accepted by the constructor.
    _MODES = ("train", "val", "test")

    def __init__(self, mode, csv_dir_path = "/home/u21307130002/PepNN/pepnn/datasets/ppi/"):
        """Read the split CSV files and select the split to serve.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            csv_dir_path: Directory containing ``train.csv`` and ``val.csv``
                (and ``test.csv`` when ``mode`` is ``"test"``).

        Raises:
            ValueError: If ``mode`` is not a known split name.
        """
        if mode not in self._MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self._MODES, mode))

        self.mode = mode
        self.train_data = pd.read_csv(os.path.join(csv_dir_path, 'train.csv'))
        self.val_data = pd.read_csv(os.path.join(csv_dir_path, 'val.csv'))

        if mode == "test":
            # __getitem__ reads self.test_data, which was never loaded before.
            # NOTE(review): the file name is assumed by analogy with
            # train.csv / val.csv -- confirm against the dataset layout.
            self.test_data = pd.read_csv(os.path.join(csv_dir_path, 'test.csv'))

        # Previously only set for "train", so len() failed for other modes.
        self.num_data = len(self._split())

    def _split(self):
        """Return the DataFrame that matches ``self.mode``."""
        if self.mode == "train":
            return self.train_data
        if self.mode == "val":
            return self.val_data
        if self.mode == "test":
            return self.test_data
        raise ValueError("unknown mode {!r}".format(self.mode))

    def __len__(self):
        """Return the number of rows in the active split."""
        return self.num_data

    def __getitem__(self, index):
        """Build one example from a CSV row.

        Args:
            index: Row position in the active split; a tensor index is
                converted with ``tolist()`` first.

        Returns:
            Tuple ``(pep_sequence, prot_sequence, target)`` where the two
            sequences are the row's ``Sequence1`` / ``Sequence2`` strings and
            ``target`` is a float tensor with one entry per character of
            ``prot_sequence``, set to 1 at the parsed motif indices.

        Raises:
            IndexError: If a motif index lies beyond the end of
                ``prot_sequence``.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        item = self._split().iloc[index]

        # The motif columns hold Python-literal lists of "<label>_<index>"
        # strings.
        motif1 = ast.literal_eval(item['Chain_1_motifs'])
        motif2 = ast.literal_eval(item['Chain_2_motifs'])

        # NOTE(review): this compares the lengths of the FIRST motif of each
        # chain, not the number of motifs -- kept as in the original; confirm
        # this is the intended way to pick the "protein" chain.
        if len(motif1[0]) > len(motif2[0]):
            target = motif1
            prot_sequence = item['Sequence1']
            pep_sequence = item['Sequence2']
        else:
            target = motif2
            pep_sequence = item['Sequence1']
            prot_sequence = item['Sequence2']

        target = [int(motif.split('_')[1]) for motif in target]

        # Raise instead of dropping into pdb (which would hang a DataLoader
        # worker); check every index, not only the last one.
        if target and max(target) >= len(prot_sequence):
            raise IndexError(
                "motif index {} is out of range for a sequence of length {} "
                "(row {})".format(max(target), len(prot_sequence), index))

        binding = np.zeros(len(prot_sequence))
        if target:
            binding[target] = 1
        target = torch.LongTensor(binding).float()

        return pep_sequence, prot_sequence, target
|
|
|
|
|
|
|
|
|
class PepBindComplexes(Dataset):
    """Dataset over the PepBind peptide-protein examples stored as ``.npz`` files.

    Indexing yields ``(pep_sequence, prot_sequence, target)``; see
    ``__getitem__`` for how each element is built.
    """

    # Split names accepted by the constructor.
    _MODES = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepbind_data/"):
        """Load the example-name lists and select the split to serve.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories that
                ``__getitem__`` reads from.

        Raises:
            ValueError: If ``mode`` is not a known split name.
        """
        # Fail fast: an unknown mode used to surface only later, as an
        # AttributeError in __len__ or an UnboundLocalError in __getitem__.
        if mode not in self._MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self._MODES, mode))

        self.mode = mode
        self.encoded_data_directory = encoded_data_directory

        # The example-list locations are fixed; they do not follow
        # ``encoded_data_directory``.
        self.train_dir = "../../datasets/pepbind_data/train_examples.npy"
        self.test_dir = "../../datasets/pepbind_data/test_examples.npy"
        self.val_dir = "../../datasets/pepbind_data/val_examples.npy"

        # All three lists are loaded regardless of mode so the attributes
        # are always available.
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)

        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example-name list that matches ``self.mode``."""
        by_mode = {
            "train": self.train_list,
            "val": self.val_list,
            "test": self.test_list,
        }
        return by_mode[self.mode]

    def __getitem__(self, index):
        """Build one example from its fragment and receptor ``.npz`` files.

        Args:
            index: Position of the example in the list of the active split.

        Returns:
            Tuple ``(pep_sequence, prot_sequence, target)``:

            * ``pep_sequence`` - tensor holding the argmax over the last axis
              of the stored ``target_sequence`` array.
            * ``prot_sequence`` - string of one-letter codes joined by single
              spaces, one per row of ``nodes``.
            * ``target`` - ``torch.LongTensor`` with one entry per row of
              ``nodes``, set to 1 at the stored ``binding_sites`` indices.
        """
        item = self._examples()[index]
        file_dir = self.encoded_data_directory

        # os.path.join keeps the paths valid whether or not the directory
        # was given with a trailing separator.
        fragment_path = os.path.join(file_dir, "fragment_data", item + ".npz")
        with np.load(fragment_path) as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated
        # fields of the example name.
        name_parts = item.split("_")
        receptor_path = os.path.join(
            file_dir, "receptor_data",
            name_parts[0] + "_" + name_parts[1] + ".npz")
        with np.load(receptor_path) as data:
            temp_nodes = data["nodes"]

        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)

        # Only the first 20 node features are used to recover the sequence
        # (presumably a one-hot amino-acid encoding -- TODO confirm).
        residue_indices = np.argmax(temp_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the active split."""
        return self.num_data
|
|
|
class PeptideComplexes(Dataset):
    """Dataset over the PepNN peptide-protein examples stored as ``.npz`` files.

    Indexing yields ``(pep_sequence, prot_sequence, target, weight)``; see
    ``__getitem__`` for how each element is built.
    """

    # Split names accepted by the constructor.
    _MODES = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepnn_data/all_data/"):
        """Load the example-name lists and weights, and select the split.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories used
                for the train and val splits. The test split always reads
                from a fixed test directory instead.

        Raises:
            ValueError: If ``mode`` is not a known split name.
        """
        # Fail fast: an unknown mode used to surface only later, as an
        # AttributeError in __len__ or an UnboundLocalError in __getitem__.
        if mode not in self._MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self._MODES, mode))

        self.mode = mode
        self.encoded_data_directory = encoded_data_directory

        # The example-list locations are fixed; they do not follow
        # ``encoded_data_directory``.
        self.train_dir = "../../datasets/pepnn_data/train_examples.npy"
        self.test_dir = "../../datasets/pepnn_test_data/test_examples.npy"
        self.val_dir = "../../datasets/pepnn_data/val_examples.npy"

        # Per-example weights, looked up by example name in __getitem__ for
        # the train and val splits.
        self.example_weights = np.load("../../datasets/pepnn_data/example_weights.npy")

        # All three lists are loaded regardless of mode so the attributes
        # are always available.
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)

        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example-name list that matches ``self.mode``."""
        by_mode = {
            "train": self.train_list,
            "val": self.val_list,
            "test": self.test_list,
        }
        return by_mode[self.mode]

    def __getitem__(self, index):
        """Build one example from its fragment and receptor ``.npz`` files.

        Args:
            index: Position of the example in the list of the active split.

        Returns:
            Tuple ``(pep_sequence, prot_sequence, target, weight)``:

            * ``pep_sequence`` - tensor holding the argmax over the last axis
              of the stored ``target_sequence`` array.
            * ``prot_sequence`` - string of one-letter codes joined by single
              spaces, one per row of ``nodes``.
            * ``target`` - ``torch.LongTensor`` with one entry per row of
              ``nodes``, set to 1 at the stored ``binding_sites`` indices.
            * ``weight`` - ``self.example_weights[item]`` for the train and
              val splits, the constant ``1`` for the test split.
        """
        item = self._examples()[index]

        if self.mode == "test":
            # Test examples carry no weight and live in their own directory.
            weight = 1
            file_dir = "../../datasets/pepnn_test_data/all_data/"
        else:
            weight = self.example_weights[item]
            file_dir = self.encoded_data_directory

        # os.path.join keeps the paths valid whether or not the directory
        # was given with a trailing separator.
        fragment_path = os.path.join(file_dir, "fragment_data", item + ".npz")
        with np.load(fragment_path) as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated
        # fields of the example name.
        name_parts = item.split("_")
        receptor_path = os.path.join(
            file_dir, "receptor_data",
            name_parts[0] + "_" + name_parts[1] + ".npz")
        with np.load(receptor_path) as data:
            temp_nodes = data["nodes"]

        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)

        # Only the first 20 node features are used to recover the sequence
        # (presumably a one-hot amino-acid encoding -- TODO confirm).
        residue_indices = np.argmax(temp_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)

        return pep_sequence, prot_sequence, target, weight

    def __len__(self):
        """Return the number of examples in the active split."""
        return self.num_data
|
|
|
|
|
class BitenetComplexes(Dataset):
    """Dataset over the Bitenet peptide-protein examples stored as ``.npz`` files.

    There is a single example list (no train/val/test split). Indexing
    yields ``(pep_sequence, prot_sequence, target)``; see ``__getitem__``.
    """

    def __init__(self, encoded_data_directory = "../bitenet_data/all_data/"):
        """Load the list of example names.

        Args:
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories that
                ``__getitem__`` reads from.
        """
        self.encoded_data_directory = encoded_data_directory

        # The example-list location is fixed; it does not follow
        # ``encoded_data_directory``.
        self.train_dir = "../../datasets/bitenet_data/examples.npy"

        self.full_list = np.load(self.train_dir)
        self.num_data = len(self.full_list)

    def __getitem__(self, index):
        """Build one example from its fragment and receptor ``.npz`` files.

        Args:
            index: Position of the example in ``self.full_list``.

        Returns:
            Tuple ``(pep_sequence, prot_sequence, target)``:

            * ``pep_sequence`` - tensor holding the argmax over the last axis
              of the stored ``target_sequence`` array.
            * ``prot_sequence`` - string of one-letter codes joined by single
              spaces, one per row of ``nodes``.
            * ``target`` - ``torch.LongTensor`` that is 1 wherever the stored
              ``binding_matrix``, summed over its first axis, is at least 1.
        """
        item = self.full_list[index]
        file_dir = self.encoded_data_directory

        # The fragment file name is the example name with "_" inserted
        # before its last character. os.path.join keeps the path valid
        # whether or not the directory has a trailing separator.
        fragment_name = item[:-1] + "_" + item[-1] + ".npz"
        with np.load(os.path.join(file_dir, "fragment_data", fragment_name)) as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_matrix = data["binding_matrix"]

        # The receptor file name keeps the first "_"-separated field and
        # only the first character of the second one.
        name_parts = item.split("_")
        receptor_name = name_parts[0] + "_" + name_parts[1][0] + ".npz"
        with np.load(os.path.join(file_dir, "receptor_data", receptor_name)) as data:
            temp_nodes = data["nodes"]

        binding_sum = np.sum(temp_binding_matrix, axis=0).T
        target = torch.LongTensor(binding_sum >= 1)

        # Only the first 20 node features are used to recover the sequence
        # (presumably a one-hot amino-acid encoding -- TODO confirm).
        residue_indices = np.argmax(temp_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the list."""
        return self.num_data