# -*- coding: utf-8 -*-
"""
Created on Sat Jul 31 21:54:08 2021
@author: Osama
"""
from torch.utils.data import Dataset
from Bio.PDB import Polypeptide
import numpy as np
import torch
import pandas as pd
import os
# import esm
import ast
import pdb

class InterpepComplexes(Dataset):
    """Peptide-protein complexes read from the pre-encoded interpep_data .npz files.

    Each example is (pep_sequence, prot_sequence, target): peptide amino acid
    indices, the receptor sequence as a space-separated one-letter string, and a
    binary per-residue binding-site label over the receptor.
    """

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/interpep_data/"):
        self.mode = mode
        self.encoded_data_directory = encoded_data_directory
        self.train_dir = "../../datasets/interpep_data/train_examples.npy"
        self.test_dir = "../../datasets/interpep_data/test_examples.npy"
        self.val_dir = "../../datasets/interpep_data/val_examples.npy"
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)
        if mode == "train":
            self.num_data = len(self.train_list)
        elif mode == "val":
            self.num_data = len(self.val_list)
        elif mode == "test":
            self.num_data = len(self.test_list)

    def __getitem__(self, index):
        if self.mode == "train":
            item = self.train_list[index]
        elif self.mode == "val":
            item = self.val_list[index]
        elif self.mode == "test":
            item = self.test_list[index]
        file_dir = self.encoded_data_directory
        # Peptide fragment: one-hot target sequence and indices of receptor binding sites
        with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]
        # Receptor: per-residue node features (first 20 columns are the one-hot amino acid)
        with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +
                     item.split("_")[1] + ".npz") as data:
            temp_nodes = data["nodes"]
        # Binary per-residue binding label over the receptor sequence
        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)
        # Decode one-hot receptor amino acids into a space-separated one-letter sequence
        nodes = temp_nodes[:, 0:20]
        prot_sequence = np.argmax(nodes, axis=-1)
        prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
        # Peptide stays as amino acid class indices
        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)
        return pep_sequence, prot_sequence, target

    def __len__(self):
        return self.num_data
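
def _example_interpep_loader():
    # A minimal usage sketch, not part of the original module: it assumes the
    # interpep_data .npy/.npz files referenced above exist on disk. batch_size=1 is
    # used because prot_sequence is a plain string and sequence lengths vary per
    # complex, so the default collate function cannot stack larger batches.
    from torch.utils.data import DataLoader
    train_set = InterpepComplexes(mode="train")
    loader = DataLoader(train_set, batch_size=1, shuffle=True)
    pep_sequence, prot_sequence, target = next(iter(loader))
    return pep_sequence, prot_sequence, target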

class PPI(Dataset):
    """Protein-protein interaction pairs read from train/val CSV splits.

    One chain is treated as the receptor (prot_sequence) and the other as the
    peptide (pep_sequence); the receptor's motif residues give binary binding labels.
    """

    def __init__(self, mode, csv_dir_path="/home/u21307130002/PepNN/pepnn/datasets/ppi/"):
        self.mode = mode
        self.train_data = pd.read_csv(os.path.join(csv_dir_path, 'train.csv'))
        self.val_data = pd.read_csv(os.path.join(csv_dir_path, 'val.csv'))
        # self.test_data = pd.read_csv(os.path.join(csv_dir_path, 'test.csv'))
        if self.mode == 'train':
            self.num_data = len(self.train_data)
        elif self.mode == 'val':
            # Without this branch, __len__ fails in validation mode
            self.num_data = len(self.val_data)

    def __len__(self):
        return self.num_data

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
        if self.mode == "train":
            item = self.train_data.iloc[index]
        elif self.mode == "val":
            item = self.val_data.iloc[index]
        elif self.mode == "test":
            item = self.test_data.iloc[index]
        else:
            item = None
        # Motif columns are string-encoded lists, e.g. "['A_12', 'A_13']"
        motif1 = ast.literal_eval(item['Chain_1_motifs'])
        motif2 = ast.literal_eval(item['Chain_2_motifs'])
        # Decide which chain is the receptor by comparing the first motif string of each chain
        if len(motif1[0]) > len(motif2[0]):
            target = motif1
            prot_sequence = item['Sequence1']
            pep_sequence = item['Sequence2']
        else:
            target = motif2
            pep_sequence = item['Sequence1']
            prot_sequence = item['Sequence2']
        # Each motif entry is "<chain>_<residue index>"; keep only the residue indices
        target = [int(motif.split('_')[1]) for motif in target]
        if target[-1] >= len(prot_sequence):
            # Debugging guard: stop if a motif index falls outside the receptor sequence
            pdb.set_trace()
        # Binary per-residue binding label over the receptor sequence
        binding = np.zeros(len(prot_sequence))
        if len(target) != 0:
            binding[target] = 1
        target = torch.LongTensor(binding).float()
        return pep_sequence, prot_sequence, target
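
def _example_ppi_row():
    # A minimal sketch of the row format PPI.__getitem__ expects, not part of the
    # original module. The column names and the "<chain>_<residue index>" motif
    # encoding are taken from the parsing code above; the concrete sequences and
    # indices here are made up for illustration.
    row = pd.Series({"Sequence1": "MKTAYIAKQR",
                     "Sequence2": "GSHM",
                     "Chain_1_motifs": "['A_2', 'A_3', 'A_7']",
                     "Chain_2_motifs": "['B_0']"})
    target = [int(m.split("_")[1]) for m in ast.literal_eval(row["Chain_1_motifs"])]
    binding = np.zeros(len(row["Sequence1"]))
    binding[target] = 1  # -> array([0., 0., 1., 1., 0., 0., 0., 1., 0., 0.])
    return row, binding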

class PepBindComplexes(Dataset):
    """Peptide-protein complexes read from the pre-encoded pepbind_data .npz files;
    same file layout and output format as InterpepComplexes."""

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepbind_data/"):
        self.mode = mode
        self.encoded_data_directory = encoded_data_directory
        self.train_dir = "../../datasets/pepbind_data/train_examples.npy"
        self.test_dir = "../../datasets/pepbind_data/test_examples.npy"
        self.val_dir = "../../datasets/pepbind_data/val_examples.npy"
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)
        if mode == "train":
            self.num_data = len(self.train_list)
        elif mode == "val":
            self.num_data = len(self.val_list)
        elif mode == "test":
            self.num_data = len(self.test_list)

    def __getitem__(self, index):
        if self.mode == "train":
            item = self.train_list[index]
        elif self.mode == "val":
            item = self.val_list[index]
        elif self.mode == "test":
            item = self.test_list[index]
        file_dir = self.encoded_data_directory
        with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]
        with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +
                     item.split("_")[1] + ".npz") as data:
            temp_nodes = data["nodes"]
        # Binary per-residue binding label over the receptor sequence
        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)
        # Decode one-hot receptor amino acids into a space-separated one-letter sequence
        nodes = temp_nodes[:, 0:20]
        prot_sequence = np.argmax(nodes, axis=-1)
        prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
        # Peptide stays as amino acid class indices
        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)
        return pep_sequence, prot_sequence, target

    def __len__(self):
        return self.num_data

class PeptideComplexes(Dataset):
    """Peptide-protein complexes read from the pre-encoded pepnn_data .npz files,
    with an additional per-example training weight in each returned tuple."""

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepnn_data/all_data/"):
        self.mode = mode
        self.encoded_data_directory = encoded_data_directory
        self.train_dir = "../../datasets/pepnn_data/train_examples.npy"
        self.test_dir = "../../datasets/pepnn_test_data/test_examples.npy"
        self.val_dir = "../../datasets/pepnn_data/val_examples.npy"
        self.example_weights = np.load("../../datasets/pepnn_data/example_weights.npy")
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)
        if mode == "train":
            self.num_data = len(self.train_list)
        elif mode == "val":
            self.num_data = len(self.val_list)
        elif mode == "test":
            self.num_data = len(self.test_list)

    def __getitem__(self, index):
        if self.mode == "train":
            item = self.train_list[index]
            weight = self.example_weights[item]
        elif self.mode == "val":
            item = self.val_list[index]
            weight = self.example_weights[item]
        elif self.mode == "test":
            item = self.test_list[index]
            # Test examples are unweighted
            weight = 1
        # Test examples live in a separate data directory
        if self.mode != "test":
            file_dir = self.encoded_data_directory
        else:
            file_dir = "../../datasets/pepnn_test_data/all_data/"
        with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]
        with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +
                     item.split("_")[1] + ".npz") as data:
            temp_nodes = data["nodes"]
        # Binary per-residue binding label over the receptor sequence
        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)
        # Decode one-hot receptor amino acids into a space-separated one-letter sequence
        nodes = temp_nodes[:, 0:20]
        prot_sequence = np.argmax(nodes, axis=-1)
        prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
        # Peptide stays as amino acid class indices
        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)
        return pep_sequence, prot_sequence, target, weight

    def __len__(self):
        return self.num_data
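
def _example_weighted_loss(model):
    # A minimal sketch of consuming the per-example weight returned by
    # PeptideComplexes, not part of the original module: the model, its call
    # signature, and the use of cross entropy are assumptions for illustration.
    import torch.nn.functional as F
    dataset = PeptideComplexes(mode="train")
    pep_sequence, prot_sequence, target, weight = dataset[0]
    logits = model(pep_sequence.unsqueeze(0), prot_sequence)  # hypothetical (1, L, 2) scores
    loss = F.cross_entropy(logits.squeeze(0), target)         # per-residue binding classification
    return loss * float(weight)                               # scale by the example weight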

class BitenetComplexes(Dataset):
    """Complexes read from the pre-encoded bitenet_data .npz files (a single example
    list, no train/val/test split); binding labels come from a binding matrix."""

    def __init__(self, encoded_data_directory="../bitenet_data/all_data/"):
        self.encoded_data_directory = encoded_data_directory
        self.train_dir = "../../datasets/bitenet_data/examples.npy"
        self.full_list = np.load(self.train_dir)
        self.num_data = len(self.full_list)

    def __getitem__(self, index):
        item = self.full_list[index]
        file_dir = self.encoded_data_directory
        # The last character of the example ID is split off to build the fragment file name
        with np.load(file_dir + "fragment_data/" + item[:-1] + "_" + item[-1] + ".npz") as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_matrix = data["binding_matrix"]
        with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +
                     item.split("_")[1][0] + ".npz") as data:
            temp_nodes = data["nodes"]
        # A receptor residue is labeled as binding if it has at least one contact
        # in the binding matrix
        binding_sum = np.sum(temp_binding_matrix, axis=0).T
        target = torch.LongTensor(binding_sum >= 1)
        # Decode one-hot receptor amino acids into a space-separated one-letter sequence
        nodes = temp_nodes[:, 0:20]
        prot_sequence = np.argmax(nodes, axis=-1)
        prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
        # Peptide stays as amino acid class indices
        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)
        return pep_sequence, prot_sequence, target

    def __len__(self):
        return self.num_data
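
if __name__ == "__main__":
    # Smoke test, a minimal sketch: it only works if the dataset directories referenced
    # above exist relative to this file. Loads one validation example per structural dataset.
    for name, dataset in [("interpep", InterpepComplexes(mode="val")),
                          ("pepbind", PepBindComplexes(mode="val")),
                          ("pepnn", PeptideComplexes(mode="val"))]:
        pep_sequence, prot_sequence, target = dataset[0][:3]
        print(name, len(dataset), tuple(pep_sequence.shape), tuple(target.shape))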