Delete dataset
- dataset/PPI_README.md +0 -8
- dataset/PPI_contamination.py +0 -169
- dataset/PPI_extract_full_sequence.py +0 -176
- dataset/PPI_final_contamination.py +0 -165
- dataset/compute_class_weights.py +0 -47
- dataset/peptide_static_batching.py +0 -83
- dataset/prebatching.py +0 -197
- dataset/static_prebatching.py +0 -162
dataset/PPI_README.md
DELETED
@@ -1,8 +0,0 @@
# 1. Run contamination.py to get initial results and error sequences.

`python -u contamination.py -i 1,2,3,4`

# 2. Run extract_full_sequence.py to get full sequences for the error sequences. extract_full_sequence.py can only be run for one id at a time.

`python -u extract_full_sequence.py -id 1`

# 3. After getting full sequences for the error sequences, run final_contamination.py to get the final results.

`python -u final_contamination.py -i 1,2,3,4`
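For reference, the three steps above can also be chained from a single driver script. The sketch below is hypothetical (it is not part of the deleted files): it assumes the three scripts sit in the current working directory and that batches 1-4 exist, and it simply reproduces the order of the commands listed above.

```python
# Hypothetical driver that runs the three contamination steps in order.
import subprocess

batches = "1,2,3,4"

# Step 1: initial contamination pass, which also emits error sequences.
subprocess.run(["python", "-u", "contamination.py", "-i", batches], check=True)

# Step 2: pull full sequences for the error sequences, one batch id at a time.
for batch_id in batches.split(","):
    subprocess.run(["python", "-u", "extract_full_sequence.py", "-id", batch_id], check=True)

# Step 3: final contamination pass over the corrected sequences.
subprocess.run(["python", "-u", "final_contamination.py", "-i", batches], check=True)
```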
dataset/PPI_contamination.py
DELETED
@@ -1,169 +0,0 @@
"""BLOSUM-guided motif contamination"""
import pandas as pd
import blosum as bl
import ast
import pickle
from Bio import SeqIO
from math import ceil
from sklearn.model_selection import train_test_split
import random
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import argparse


def main(i):
    random.seed(42)

    blosum = bl.BLOSUM(62)

    def get_least_likely_substitution(residue):
        if residue not in blosum:
            return residue  # If the residue is not in the BLOSUM matrix, return it as is
        matrix_keys = list(blosum.keys())
        min_score = min(blosum[residue][r] for r in matrix_keys if r != '*' and r != 'J')
        least_likely_residues = [r for r in matrix_keys if r != '*' and r != 'J' and blosum[residue][r] == min_score]
        least_likely_residue = random.choice(least_likely_residues)
        return least_likely_residue

    # Convert the raw JSON batch to CSV, then reload it
    json_file = f"raw_data/processed_6A_results_batch_{i}.json"
    df = pd.read_json(json_file)
    df.to_csv(f"raw_data/processed_6A_results_batch_{i}.csv", index=False)
    df = pd.read_csv(f"raw_data/processed_6A_results_batch_{i}.csv")

    output_csv = f"contaminated_data/processed_6A_results_batch_{i}.csv"
    error_csv = f"contaminated_data/error_6A_results_batch_{i}.csv"

    new_rows = []
    error_rows = []

    for idx, row in df.iterrows():
        flag1 = False  # whether there are errors when mutating Sequence1
        flag2 = False  # whether there are errors when mutating Sequence2

        chain1 = row['Chain1'].upper()
        chain2 = row['Chain2'].upper()
        sequence1 = row['Sequence1']
        sequence2 = row['Sequence2']
        chain_1_motifs = ast.literal_eval(row['Chain_1_motifs'])
        chain_2_motifs = ast.literal_eval(row['Chain_2_motifs'])
        chain_1_offset = row['Chain_1_offset']
        chain_2_offset = row['Chain_2_offset']

        # Create a new entry by mutating sequence1
        sequence1_list = list(sequence1)
        modified_chain_1_motifs = []
        if len(chain_1_motifs) > 0:

            # Ignore entries where the motif length equals the sequence length, because they would be too hard for models to learn
            if len(chain_1_motifs) == len(sequence1):
                flag1 = True

            for motif in chain_1_motifs:
                res, pos = motif.split('_')
                # Errors in the motifs, or misalignments between sequence and motif
                if int(pos) >= len(sequence1) or int(pos) < 0 or res != sequence1[int(pos)]:
                    error_rows.append({
                        'PDB_ID': row['PDB_ID'] + '_' + chain1 + '_' + chain2,
                        'Chain': chain1,
                        'Sequence': sequence1,
                        'Error_motif': motif,
                        'Chain_offset': row['Chain_1_offset']
                    })
                    flag1 = True
                    break

                least_likely_residue = get_least_likely_substitution(res)
                sequence1_list[int(pos)] = least_likely_residue
                modified_chain_1_motifs.append(res + '_' + pos + '_' + least_likely_residue)

            # Only save the entries without errors that do not need to be ignored
            if flag1 is False:
                modified_sequence1 = ''.join(sequence1_list)
                new_rows.append({
                    'PDB_ID': row['PDB_ID'] + '_' + chain1 + '_' + chain2,
                    'Chain1': chain1,
                    'Sequence1': modified_sequence1,
                    'Chain2': chain2,
                    'Sequence2': sequence2,
                    'Chain_1_motifs': str(modified_chain_1_motifs),
                    'Chain_2_motifs': row['Chain_2_motifs'],
                    'Chain_1_offset': row['Chain_1_offset'],
                    'Chain_2_offset': row['Chain_2_offset'],
                    'Modified_chain': chain1,
                    'Original_sequence': sequence1,
                })

        # If sequence2 is identical to sequence1 and the motifs match, there is no need to mutate sequence2
        if sequence1 == sequence2 and chain_1_motifs == chain_2_motifs:
            continue

        # Create a new entry by mutating sequence2, using the same logic as for sequence1
        if len(chain_2_motifs) > 0:
            if len(chain_2_motifs) == len(sequence2):
                flag2 = True
            sequence2_list = list(sequence2)
            modified_chain_2_motifs = []
            for motif in chain_2_motifs:
                res, pos = motif.split('_')
                if int(pos) >= len(sequence2) or int(pos) < 0 or res != sequence2[int(pos)]:
                    error_rows.append({
                        'PDB_ID': row['PDB_ID'] + '_' + chain1 + '_' + chain2,
                        'Chain': chain2,
                        'Sequence': sequence2,
                        'Error_motif': motif,
                        'Chain_offset': row['Chain_2_offset']
                    })
                    flag2 = True
                    break

                least_likely_residue = get_least_likely_substitution(res)
                sequence2_list[int(pos)] = least_likely_residue
                modified_chain_2_motifs.append(res + '_' + pos + '_' + least_likely_residue)

            if flag2 is False:
                modified_sequence2 = ''.join(sequence2_list)
                new_rows.append({
                    'PDB_ID': row['PDB_ID'] + '_' + chain2 + '_' + chain1,
                    'Chain1': chain1,
                    'Sequence1': sequence1,
                    'Chain2': chain2,
                    'Sequence2': modified_sequence2,
                    'Chain_1_motifs': row['Chain_1_motifs'],
                    'Chain_2_motifs': str(modified_chain_2_motifs),
                    'Chain_1_offset': row['Chain_1_offset'],
                    'Chain_2_offset': row['Chain_2_offset'],
                    'Modified_chain': chain2,
                    'Original_sequence': sequence2,
                })

    # Finished mutation
    new_df = pd.DataFrame(new_rows)

    # Deduplicate
    columns_to_check = ['Sequence1', 'Sequence2', 'Chain_1_motifs', 'Chain_2_motifs', 'Chain_1_offset', 'Chain_2_offset']
    deduplicated_new_df = new_df.drop_duplicates(subset=columns_to_check)
    print(f"Number of rows before deduplication: {len(new_df)}")
    print(f"Number of rows after deduplication: {len(deduplicated_new_df)}")

    deduplicated_new_df.to_csv(output_csv, index=False)

    # Save error sequences to another file
    error_df = pd.DataFrame(error_rows)
    error_df.to_csv(error_csv, index=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-i')
    args = parser.parse_args()

    i_s = args.i  # e.g. 2,3,4,5,6,7,8,9,10
    for i in i_s.split(','):
        print(int(i))
        main(int(i))
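The core of this script is the BLOSUM62-guided substitution: for each interface residue it picks one of the lowest-scoring (least likely) replacements. Below is a minimal standalone sketch of that lookup using the same `blosum` package as the script above; the example residue is arbitrary and the sketch is only illustrative.

```python
import random
import blosum as bl

blosum = bl.BLOSUM(62)

def least_likely(residue):
    # Residues absent from the matrix are returned unchanged.
    if residue not in blosum:
        return residue
    keys = [r for r in blosum.keys() if r not in ('*', 'J')]
    min_score = min(blosum[residue][r] for r in keys)
    return random.choice([r for r in keys if blosum[residue][r] == min_score])

random.seed(42)
print(least_likely('W'))  # one of the lowest-scoring substitutions for tryptophan
```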
dataset/PPI_extract_full_sequence.py
DELETED
@@ -1,176 +0,0 @@
"""Pull full sequences from PDB files for error sequences"""
import json
import os
import logging
from Bio import PDB
import warnings
import requests
import pickle
import pandas as pd
import argparse

warnings.filterwarnings("ignore", category=PDB.PDBExceptions.PDBConstructionWarning)
logging.basicConfig(filename='pdb.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

AA_CODE_MAP = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
    'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
    'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
    'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
    'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}


# Missing residues are recorded in REMARK 465 fields of PDB files
def extract_remark_465(pdb_file_path):
    remark_465_lines = []
    with open(pdb_file_path, 'r') as file:
        for line in file:
            if line.startswith("REMARK 465 "):
                remark_465_lines.append(line.strip())
    return remark_465_lines[2:]


def parse_remark_465(remark_465_lines):
    missing_residues = {}
    for line in remark_465_lines:
        parts = line.split()
        if len(parts) < 5:
            continue
        chain_id = parts[3]
        resseq = int(parts[4])
        resname = parts[2]
        if chain_id not in missing_residues:
            missing_residues[chain_id] = []
        missing_residues[chain_id].append((resseq, resname))
    return missing_residues


def extract_sequences(structure, target_chain_id, missing_residues):
    for chain in structure.get_chains():
        chain_id = chain.get_id()
        residues = list(chain.get_residues())
        if chain_id == target_chain_id:
            seq_list = []
            resseq_set = set(res.get_id()[1] for res in residues)
            min_resseq_struct = min(resseq_set, default=1)
            max_resseq_struct = max(resseq_set, default=0)
            max_resseq_missing = max((x[0] for x in missing_residues.get(chain_id, [])), default=0)
            resseq_max = max(max_resseq_struct, max_resseq_missing)

            # Walk over the full residue range, filling observed and missing positions
            for i in range(min_resseq_struct, resseq_max + 1):
                if i in resseq_set:
                    resname = next(res.get_resname() for res in residues if res.get_id()[1] == i)
                    seq_list.append(AA_CODE_MAP.get(resname, 'X'))
                elif chain_id in missing_residues and i in [x[0] for x in missing_residues[chain_id]]:
                    resname = next(x[1] for x in missing_residues[chain_id] if x[0] == i)
                    seq_list.append(AA_CODE_MAP.get(resname, 'X'))

            chain_seq = ''.join(seq_list).strip('X')

            return chain_seq


def download_pdb(pdb_id, id):
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    file_path = f"pdb{id}/{pdb_id}.pdb"

    while True:
        try:
            # Download the PDB file, retrying on network errors
            response = requests.get(url)
            response.raise_for_status()
            with open(file_path, "wb") as file:
                file.write(response.content)
            return file_path
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {pdb_id}.pdb: {e}")
            continue


def delete_pdb(pdb_id, chain_id, id):
    file_path = f"pdb{id}/{pdb_id}.pdb"
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Deleted {pdb_id}.pdb for {chain_id}")
    else:
        print(f"File {pdb_id}.pdb does not exist")


def process_entry(entry, chain_id, results, id):
    pdb_id = entry[0:4]

    pdb_file_path = download_pdb(pdb_id, id)

    if os.path.exists(pdb_file_path):
        try:
            parser = PDB.PDBParser()
            structure = parser.get_structure(pdb_id, pdb_file_path)

            # Extract and parse REMARK 465
            remark_465 = extract_remark_465(pdb_file_path)
            missing_residues = parse_remark_465(remark_465)

            # Get the full sequence for the target chain
            chain_seq = extract_sequences(structure, chain_id, missing_residues)

            for index, row in results.iterrows():
                if row['PDB_ID'] == pdb_id:
                    if row['Chain1'] == chain_id:
                        results.at[index, 'Sequence1'] = chain_seq
                    elif row['Chain2'] == chain_id:
                        results.at[index, 'Sequence2'] = chain_seq
                    else:
                        raise NotImplementedError

            delete_pdb(pdb_id, chain_id, id)

        except Exception as e:
            logging.error(f'Failed to process {pdb_id}: {str(e)}')
    else:
        logging.error(f'PDB file {pdb_id}.pdb not found')


def main(id):
    # Load the PDB_ID list and the corresponding chain IDs
    df = pd.read_csv(f'contaminated_data/error_6A_results_batch_{id}.csv')
    pdb_id_list = df['PDB_ID'].tolist()
    chain_id_list = df['Chain'].tolist()
    processed = []

    rs = pd.read_csv(f'raw_data/processed_6A_results_batch_{id}.csv')

    for i in range(len(pdb_id_list)):
        entry, chain_id = pdb_id_list[i], chain_id_list[i].upper()  # e.g. 6x85_D_F, D
        if {entry: chain_id} not in processed:
            processed.append({entry: chain_id})
            process_entry(entry, chain_id, rs, id)

        if i % 100 == 0:
            rs.to_csv(f'raw_data/corrected_processed_6A_results_batch_{id}.csv', index=False)
            print(f"Saving for i={i}")

    rs.to_csv(f'raw_data/corrected_processed_6A_results_batch_{id}.csv', index=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-id')
    args = parser.parse_args()

    print(int(args.id))

    if not os.path.exists(f"pdb{int(args.id)}"):
        os.makedirs(f"pdb{int(args.id)}")
    main(int(args.id))
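As a quick illustration of the REMARK 465 handling above: the parser splits each record on whitespace and reads the residue name, chain id, and residue number from fixed positions. The sketch below uses a hypothetical record (not taken from any specific PDB entry) and only mirrors the parsing already shown in the script.

```python
# A hypothetical REMARK 465 record: residue name, chain id, residue number.
line = "REMARK 465     MET A     1"
parts = line.split()               # ['REMARK', '465', 'MET', 'A', '1']
resname, chain_id, resseq = parts[2], parts[3], int(parts[4])
print(chain_id, resseq, resname)   # A 1 MET
```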
dataset/PPI_final_contamination.py
DELETED
@@ -1,165 +0,0 @@
"""Final motif contamination after pulling full sequences"""
import pandas as pd
import blosum as bl
import ast
import pickle
from Bio import SeqIO
from math import ceil
from sklearn.model_selection import train_test_split
import random
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import argparse


def main(i):
    random.seed(42)

    blosum = bl.BLOSUM(62)

    def get_least_likely_substitution(residue):
        if residue not in blosum:
            return residue  # If the residue is not in the BLOSUM matrix, return it as is
        matrix_keys = list(blosum.keys())
        min_score = min(blosum[residue][r] for r in matrix_keys if r != '*' and r != 'J')
        least_likely_residues = [r for r in matrix_keys if r != '*' and r != 'J' and blosum[residue][r] == min_score]
        least_likely_residue = random.choice(least_likely_residues)
        return least_likely_residue

    df = pd.read_csv(f"raw_data/corrected_processed_6A_results_batch_{i}.csv")

    output_csv = f"contaminated_data/processed_6A_results_batch_{i}.csv"
    error_csv = f"contaminated_data/error_6A_results_batch_{i}.csv"

    new_rows = []
    error_rows = []

    for idx, row in df.iterrows():
        flag1 = False  # whether there are errors when mutating Sequence1
        flag2 = False  # whether there are errors when mutating Sequence2

        chain1 = row['Chain1'].upper()
        chain2 = row['Chain2'].upper()
        sequence1 = row['Sequence1']
        sequence2 = row['Sequence2']
        chain_1_motifs = ast.literal_eval(row['Chain_1_motifs'])
        chain_2_motifs = ast.literal_eval(row['Chain_2_motifs'])
        chain_1_offset = row['Chain_1_offset']
        chain_2_offset = row['Chain_2_offset']

        # Create a new entry by mutating sequence1
        sequence1_list = list(sequence1)
        modified_chain_1_motifs = []
        if len(chain_1_motifs) > 0:

            # Ignore entries where the motif length equals the sequence length, because they would be too hard for models to learn
            if len(chain_1_motifs) == len(sequence1):
                flag1 = True

            for motif in chain_1_motifs:
                res, pos = motif.split('_')
                # Errors in the motifs, or misalignments between sequence and motif
                if int(pos) >= len(sequence1) or int(pos) < 0 or res != sequence1[int(pos)]:
                    error_rows.append({
                        'PDB_ID': row['PDB_ID'] + '_' + chain1 + '_' + chain2,
                        'Chain': chain1,
                        'Sequence': sequence1,
                        'Error_motif': motif,
                        'Chain_offset': row['Chain_1_offset']
                    })
                    flag1 = True
                    break

                least_likely_residue = get_least_likely_substitution(res)
                sequence1_list[int(pos)] = least_likely_residue
                modified_chain_1_motifs.append(res + '_' + pos + '_' + least_likely_residue)

            # Only save the entries without errors that do not need to be ignored
            if flag1 is False:
                modified_sequence1 = ''.join(sequence1_list)
                new_rows.append({
                    'PDB_ID': row['PDB_ID'] + '_' + chain1 + '_' + chain2,
                    'Chain1': chain1,
                    'Sequence1': modified_sequence1,
                    'Chain2': chain2,
                    'Sequence2': sequence2,
                    'Chain_1_motifs': str(modified_chain_1_motifs),
                    'Chain_2_motifs': row['Chain_2_motifs'],
                    'Chain_1_offset': row['Chain_1_offset'],
                    'Chain_2_offset': row['Chain_2_offset'],
                    'Modified_chain': chain1,
                    'Original_sequence': sequence1,
                })

        # If sequence2 is identical to sequence1 and the motifs match, there is no need to mutate sequence2
        if sequence1 == sequence2 and chain_1_motifs == chain_2_motifs:
            continue

        # Create a new entry by mutating sequence2, using the same logic as for sequence1
        if len(chain_2_motifs) > 0:
            if len(chain_2_motifs) == len(sequence2):
                flag2 = True
            sequence2_list = list(sequence2)
            modified_chain_2_motifs = []
            for motif in chain_2_motifs:
                res, pos = motif.split('_')
                if int(pos) >= len(sequence2) or int(pos) < 0 or res != sequence2[int(pos)]:
                    error_rows.append({
                        'PDB_ID': row['PDB_ID'] + '_' + chain1 + '_' + chain2,
                        'Chain': chain2,
                        'Sequence': sequence2,
                        'Error_motif': motif,
                        'Chain_offset': row['Chain_2_offset']
                    })
                    flag2 = True
                    break

                least_likely_residue = get_least_likely_substitution(res)
                sequence2_list[int(pos)] = least_likely_residue
                modified_chain_2_motifs.append(res + '_' + pos + '_' + least_likely_residue)

            if flag2 is False:
                modified_sequence2 = ''.join(sequence2_list)
                new_rows.append({
                    'PDB_ID': row['PDB_ID'] + '_' + chain2 + '_' + chain1,
                    'Chain1': chain1,
                    'Sequence1': sequence1,
                    'Chain2': chain2,
                    'Sequence2': modified_sequence2,
                    'Chain_1_motifs': row['Chain_1_motifs'],
                    'Chain_2_motifs': str(modified_chain_2_motifs),
                    'Chain_1_offset': row['Chain_1_offset'],
                    'Chain_2_offset': row['Chain_2_offset'],
                    'Modified_chain': chain2,
                    'Original_sequence': sequence2,
                })

    # Finished mutation
    new_df = pd.DataFrame(new_rows)

    # Deduplicate
    columns_to_check = ['Sequence1', 'Sequence2', 'Chain_1_motifs', 'Chain_2_motifs', 'Chain_1_offset', 'Chain_2_offset']
    deduplicated_new_df = new_df.drop_duplicates(subset=columns_to_check)
    print(f"Number of rows before deduplication: {len(new_df)}")
    print(f"Number of rows after deduplication: {len(deduplicated_new_df)}")

    deduplicated_new_df.to_csv(output_csv, index=False)

    error_df = pd.DataFrame(error_rows)
    error_df.to_csv(error_csv, index=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-i')
    args = parser.parse_args()

    i_s = args.i  # e.g. 2,3,4,5,6,7,8,9,10
    for i in i_s.split(','):
        print(int(i))
        main(int(i))
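Both contamination scripts share a simple string convention for motif bookkeeping: an input motif `RES_POS` is rewritten as `RES_POS_NEWRES` once the position has been mutated. A tiny worked example, with made-up residue and position values, just to make the convention explicit:

```python
motif = "A_12"                 # original residue 'A' at 0-based position 12
res, pos = motif.split('_')
least_likely_residue = "W"     # stand-in for get_least_likely_substitution(res)
modified = res + '_' + pos + '_' + least_likely_residue
print(modified)                # A_12_W
```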
dataset/compute_class_weights.py
DELETED
@@ -1,47 +0,0 @@
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset, DatasetDict
from transformers import AutoTokenizer


def main():

    data = pd.read_csv('dataset_drop_500.csv')

    print(len(data))

    binding_sites = data['mutTarget_motifs'].tolist()
    targets = data['Target'].tolist()

    # No need to pad the first position of the binding sites for class-weight calculations
    binding_sites = [ast.literal_eval(binding_site) for binding_site in binding_sites]
    binding_sites = [len(binding_site) for binding_site in binding_sites]
    targets = [len(seq) for seq in targets]

    train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

    train_index = train_data.index.to_numpy()
    print(len(train_index))

    train_binding_dataset = [binding_sites[i] for i in train_index]
    train_targets = [targets[i] for i in train_index]

    # Balanced class weights: weight_c = N_total / (2 * N_c)
    num_binding_sites = sum(train_binding_dataset)
    num_total = sum(train_targets)
    num_non_binding_sites = num_total - num_binding_sites
    weight_for_binding = num_total / (2 * num_binding_sites)
    weight_for_non_binding = num_total / (2 * num_non_binding_sites)

    print(weight_for_binding, weight_for_non_binding)


if __name__ == "__main__":
    main()
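The weighting above is the standard balanced-class formula, weight_c = N_total / (2 * N_c), applied to binding versus non-binding residues counted over the training split. A small numerical example with made-up counts, purely to illustrate the formula:

```python
# Made-up counts, only to show how the weights behave.
num_binding_sites = 10_000
num_total = 250_000
num_non_binding_sites = num_total - num_binding_sites

weight_for_binding = num_total / (2 * num_binding_sites)          # 12.5
weight_for_non_binding = num_total / (2 * num_non_binding_sites)  # ~0.52
print(weight_for_binding, weight_for_non_binding)
```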
dataset/peptide_static_batching.py
DELETED
@@ -1,83 +0,0 @@
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset, DatasetDict
from transformers import AutoTokenizer


class TripletDataset(Dataset):
    def __init__(self, anchors, positives, binding_sites, tokenizer, max_sequence_length=40000):
        self.anchors = anchors
        self.positives = positives
        self.binding_sites = binding_sites
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length
        self.triplets = []
        self.precompute_triplets()

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, index):
        return self.triplets[index]

    def precompute_triplets(self):
        self.triplets = []
        for anchor, positive, binding_site in zip(self.anchors, self.positives, self.binding_sites):
            anchor_tokens = self.tokenizer(anchor, return_tensors='pt', padding=True, truncation=True,
                                           max_length=self.max_sequence_length)
            positive_tokens = self.tokenizer(positive, return_tensors='pt', padding=True, truncation=True,
                                             max_length=self.max_sequence_length)

            # Mask out the first and last tokens because they are <bos> and <eos>
            anchor_tokens['attention_mask'][0][0] = 0
            anchor_tokens['attention_mask'][0][-1] = 0
            positive_tokens['attention_mask'][0][0] = 0
            positive_tokens['attention_mask'][0][-1] = 0

            self.triplets.append((anchor_tokens, positive_tokens, binding_site))
        return self.triplets


def main():

    data = pd.read_csv('/home/tc415/muPPIt/dataset/pep_prot/pep_prot_test.csv')

    print(len(data))

    positives = data['Binder'].tolist()
    anchors = data['Target'].tolist()
    binding_sites = data['Motif'].tolist()

    # Add 1 to each site index because ESM-2 prepends a start token to the sequence
    binding_sites = [binding_site.split(',') for binding_site in binding_sites]
    binding_sites = [[int(site) + 1 for site in binding_site] for binding_site in binding_sites]

    train_anchor_dataset = np.array(anchors)
    train_positive_dataset = np.array(positives)
    train_binding_dataset = binding_sites

    # Create an instance of the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

    # Initialize the TripletDataset
    train_dataset = TripletDataset(train_anchor_dataset, train_positive_dataset, train_binding_dataset, tokenizer=tokenizer, max_sequence_length=50000)
    train_prebatched_data_dict = {
        'anchors': [batch[0] for batch in train_dataset.triplets],
        'positives': [batch[1] for batch in train_dataset.triplets],
        'binding_site': [batch[2] for batch in train_dataset.triplets]
    }

    # Convert the dictionary to a HuggingFace Dataset
    train_hf_dataset = HFDataset.from_dict(train_prebatched_data_dict)
    train_hf_dataset.save_to_disk('/home/tc415/muPPIt/dataset/pep_prot_test')


if __name__ == "__main__":
    main()
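Once written with `save_to_disk`, the pre-tokenized dataset can be read back with `datasets.load_from_disk`. A minimal sketch; the path is the one used in the script above, so adjust it for your own environment:

```python
from datasets import load_from_disk

dataset = load_from_disk('/home/tc415/muPPIt/dataset/pep_prot_test')
print(dataset)                      # columns: 'anchors', 'positives', 'binding_site'
print(dataset[0]['binding_site'])   # 1-shifted binding-site indices for the first pair
```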
dataset/prebatching.py
DELETED
@@ -1,197 +0,0 @@
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset, DatasetDict
from transformers import AutoTokenizer
from lightning.pytorch import seed_everything


class TripletDataset(Dataset):
    def __init__(self, anchors, positives, negatives, binding_sites, tokenizer, max_sequence_length=40000):
        self.anchors = anchors
        self.positives = positives
        self.negatives = negatives
        self.binding_sites = binding_sites
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length
        self.triplets = self.precompute_triplets()
        self.batch_indices = self.get_batch_indices()
        self.prebatched_data = self.create_prebatched_data()

    def __len__(self):
        return len(self.batch_indices)

    def __getitem__(self, index):
        batch = self.prebatched_data[index]
        return batch

    def precompute_triplets(self):
        triplets = []
        for anchor, positive, negative, binding_site in zip(self.anchors, self.positives, self.negatives,
                                                            self.binding_sites):
            triplets.append((anchor, positive, negative, binding_site))
        return triplets

    def get_batch_indices(self):
        # Sort triplets by combined length and greedily pack them under the token budget
        sizes = [(len(anchor) + len(positive) + len(negative), i) for i, (anchor, positive, negative, _) in
                 enumerate(self.triplets)]
        sizes.sort()
        batches = []
        buf = []
        current_buf_len = 0

        def _flush_current_buf():
            nonlocal current_buf_len, buf
            if len(buf) == 0:
                return
            batches.append(buf)
            buf = []
            current_buf_len = 0

        for sz, i in sizes:
            if current_buf_len + sz > self.max_sequence_length:
                _flush_current_buf()
            buf.append(i)
            current_buf_len += sz

        _flush_current_buf()
        return batches

    def create_prebatched_data(self):
        prebatched_data = []
        for batch_indices in self.batch_indices:
            anchor_batch = []
            positive_batch = []
            negative_batch = []
            binding_site_batch = []

            for index in batch_indices:
                anchor, positive, negative, binding_site = self.triplets[index]
                anchor_batch.append(anchor)
                positive_batch.append(positive)
                negative_batch.append(negative)
                binding_site_batch.append(binding_site)

            anchor_tokens = self.tokenizer(anchor_batch, return_tensors='pt', padding=True, truncation=True,
                                           max_length=self.max_sequence_length)
            positive_tokens = self.tokenizer(positive_batch, return_tensors='pt', padding=True, truncation=True,
                                             max_length=self.max_sequence_length)
            negative_tokens = self.tokenizer(negative_batch, return_tensors='pt', padding=True, truncation=True,
                                             max_length=self.max_sequence_length)

            # Binary per-position targets marking the binding-site residues
            n, max_length = negative_tokens['input_ids'].shape[0], negative_tokens['input_ids'].shape[1]
            target = torch.zeros(n, max_length)
            for i in range(len(binding_site_batch)):
                binding_site = binding_site_batch[i]
                target[i, binding_site] = 1

            # Mask out the first column because it corresponds to the start token
            anchor_tokens['attention_mask'][:, 0] = 0
            positive_tokens['attention_mask'][:, 0] = 0
            negative_tokens['attention_mask'][:, 0] = 0

            prebatched_data.append({
                'anchor_input_ids': anchor_tokens['input_ids'],
                'anchor_attention_mask': anchor_tokens['attention_mask'],
                'positive_input_ids': positive_tokens['input_ids'],
                'positive_attention_mask': positive_tokens['attention_mask'],
                'negative_input_ids': negative_tokens['input_ids'],
                'negative_attention_mask': negative_tokens['attention_mask'],
                'binding_site': target
            })

        return prebatched_data


def main():
    seed_everything(42)

    data = pd.read_csv('dataset/dataset.csv')

    negatives = data['mutTarget'].tolist()
    positives = data['Binder'].tolist()
    anchors = data['Target'].tolist()
    binding_sites = data['mutTarget_motifs'].tolist()

    # Add 1 to each site index because ESM-2 prepends a start token to the sequence
    binding_sites = [ast.literal_eval(binding_site) for binding_site in binding_sites]
    binding_sites = [[int(site.split('_')[1]) + 1 for site in binding_site] for binding_site in binding_sites]

    train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

    train_index = train_data.index.to_numpy()
    val_index = val_data.index.to_numpy()
    test_index = test_data.index.to_numpy()

    train_anchor_dataset = np.array(anchors)[train_index]
    train_negative_dataset = np.array(negatives)[train_index]
    train_positive_dataset = np.array(positives)[train_index]
    train_binding_dataset = [binding_sites[i] for i in train_index]

    val_anchor_dataset = np.array(anchors)[val_index]
    val_negative_dataset = np.array(negatives)[val_index]
    val_positive_dataset = np.array(positives)[val_index]
    val_binding_dataset = [binding_sites[i] for i in val_index]

    test_anchor_dataset = np.array(anchors)[test_index]
    test_negative_dataset = np.array(negatives)[test_index]
    test_positive_dataset = np.array(positives)[test_index]
    test_binding_dataset = [binding_sites[i] for i in test_index]

    # Create an instance of the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

    # Initialize the TripletDatasets
    train_dataset = TripletDataset(train_anchor_dataset, train_positive_dataset, train_negative_dataset, train_binding_dataset, tokenizer=tokenizer, max_sequence_length=40000)
    val_dataset = TripletDataset(val_anchor_dataset, val_positive_dataset, val_negative_dataset, val_binding_dataset, tokenizer=tokenizer, max_sequence_length=40000)
    test_dataset = TripletDataset(test_anchor_dataset, test_positive_dataset, test_negative_dataset, test_binding_dataset, tokenizer=tokenizer, max_sequence_length=40000)

    # Convert the prebatched data to a dictionary with each batch as an entry
    train_prebatched_data_dict = {
        'anchor_input_ids': [batch['anchor_input_ids'].numpy() for batch in train_dataset.prebatched_data],
        'anchor_attention_mask': [batch['anchor_attention_mask'].numpy() for batch in train_dataset.prebatched_data],
        'positive_input_ids': [batch['positive_input_ids'].numpy() for batch in train_dataset.prebatched_data],
        'positive_attention_mask': [batch['positive_attention_mask'].numpy() for batch in train_dataset.prebatched_data],
        'negative_input_ids': [batch['negative_input_ids'].numpy() for batch in train_dataset.prebatched_data],
        'negative_attention_mask': [batch['negative_attention_mask'].numpy() for batch in train_dataset.prebatched_data],
        'binding_site': [batch['binding_site'].numpy() for batch in train_dataset.prebatched_data]
    }

    val_prebatched_data_dict = {
        'anchor_input_ids': [batch['anchor_input_ids'].numpy() for batch in val_dataset.prebatched_data],
        'anchor_attention_mask': [batch['anchor_attention_mask'].numpy() for batch in val_dataset.prebatched_data],
        'positive_input_ids': [batch['positive_input_ids'].numpy() for batch in val_dataset.prebatched_data],
        'positive_attention_mask': [batch['positive_attention_mask'].numpy() for batch in val_dataset.prebatched_data],
        'negative_input_ids': [batch['negative_input_ids'].numpy() for batch in val_dataset.prebatched_data],
        'negative_attention_mask': [batch['negative_attention_mask'].numpy() for batch in val_dataset.prebatched_data],
        'binding_site': [batch['binding_site'].numpy() for batch in val_dataset.prebatched_data]
    }

    test_prebatched_data_dict = {
        'anchor_input_ids': [batch['anchor_input_ids'].numpy() for batch in test_dataset.prebatched_data],
        'anchor_attention_mask': [batch['anchor_attention_mask'].numpy() for batch in test_dataset.prebatched_data],
        'positive_input_ids': [batch['positive_input_ids'].numpy() for batch in test_dataset.prebatched_data],
        'positive_attention_mask': [batch['positive_attention_mask'].numpy() for batch in test_dataset.prebatched_data],
        'negative_input_ids': [batch['negative_input_ids'].numpy() for batch in test_dataset.prebatched_data],
        'negative_attention_mask': [batch['negative_attention_mask'].numpy() for batch in test_dataset.prebatched_data],
        'binding_site': [batch['binding_site'].numpy() for batch in test_dataset.prebatched_data]
    }

    # Convert the dictionaries to HuggingFace Datasets
    train_hf_dataset = HFDataset.from_dict(train_prebatched_data_dict)
    train_hf_dataset.save_to_disk('train_mut')

    val_hf_dataset = HFDataset.from_dict(val_prebatched_data_dict)
    val_hf_dataset.save_to_disk('val_mut')

    test_hf_dataset = HFDataset.from_dict(test_prebatched_data_dict)
    test_hf_dataset.save_to_disk('test_mut')


if __name__ == "__main__":
    main()
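The batching strategy in `get_batch_indices` sorts triplets by combined length and starts a new batch whenever adding the next example would push the running token count past `max_sequence_length`. A toy illustration of the same greedy flush logic with made-up lengths and a small budget:

```python
# Toy combined lengths (anchor + positive + negative) for six hypothetical triplets.
sizes = sorted([(1200, 0), (300, 1), (900, 2), (450, 3), (2500, 4), (700, 5)])
max_sequence_length = 2000

batches, buf, current = [], [], 0
for sz, i in sizes:
    if current + sz > max_sequence_length:  # flush before overflowing the budget
        if buf:
            batches.append(buf)
        buf, current = [], 0
    buf.append(i)
    current += sz
if buf:
    batches.append(buf)

print(batches)  # [[1, 3, 5], [2], [0], [4]] -- an oversized example still gets its own batch
```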
dataset/static_prebatching.py
DELETED
@@ -1,162 +0,0 @@
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset, DatasetDict
from transformers import AutoTokenizer


class TripletDataset(Dataset):
    def __init__(self, anchors, positives, negatives, binding_sites, tokenizer, max_sequence_length=40000):
        self.anchors = anchors
        self.positives = positives
        self.negatives = negatives
        self.binding_sites = binding_sites
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length
        self.triplets = []
        self.precompute_triplets()

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, index):
        return self.triplets[index]

    def precompute_triplets(self):
        self.triplets = []
        for anchor, positive, negative, binding_site in zip(self.anchors, self.positives, self.negatives,
                                                            self.binding_sites):
            anchor_tokens = self.tokenizer(anchor, return_tensors='pt', padding=True, truncation=True,
                                           max_length=self.max_sequence_length)
            positive_tokens = self.tokenizer(positive, return_tensors='pt', padding=True, truncation=True,
                                             max_length=self.max_sequence_length)
            negative_tokens = self.tokenizer(negative, return_tensors='pt', padding=True, truncation=True,
                                             max_length=self.max_sequence_length)

            # Mask out the first and last tokens because they are <bos> and <eos>
            anchor_tokens['attention_mask'][0][0] = 0
            anchor_tokens['attention_mask'][0][-1] = 0
            positive_tokens['attention_mask'][0][0] = 0
            positive_tokens['attention_mask'][0][-1] = 0
            negative_tokens['attention_mask'][0][0] = 0
            negative_tokens['attention_mask'][0][-1] = 0

            self.triplets.append((anchor_tokens, positive_tokens, negative_tokens, binding_site))
        return self.triplets


def main():

    data = pd.read_csv('dataset_drop_500.csv')

    print(len(data))

    negatives = data['mutTarget'].tolist()
    positives = data['Binder'].tolist()
    anchors = data['Target'].tolist()
    binding_sites = data['mutTarget_motifs'].tolist()

    # Add 1 to each site index because ESM-2 prepends a start token to the sequence
    binding_sites = [ast.literal_eval(binding_site) for binding_site in binding_sites]
    binding_sites = [[int(site.split('_')[1]) + 1 for site in binding_site] for binding_site in binding_sites]

    train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

    train_index = train_data.index.to_numpy()
    val_index = val_data.index.to_numpy()
    test_index = test_data.index.to_numpy()

    train_anchor_dataset = np.array(anchors)[train_index]
    train_negative_dataset = np.array(negatives)[train_index]
    train_positive_dataset = np.array(positives)[train_index]
    train_binding_dataset = [binding_sites[i] for i in train_index]

    val_anchor_dataset = np.array(anchors)[val_index]
    val_negative_dataset = np.array(negatives)[val_index]
    val_positive_dataset = np.array(positives)[val_index]
    val_binding_dataset = [binding_sites[i] for i in val_index]

    test_anchor_dataset = np.array(anchors)[test_index]
    test_negative_dataset = np.array(negatives)[test_index]
    test_positive_dataset = np.array(positives)[test_index]
    test_binding_dataset = [binding_sites[i] for i in test_index]

    # Create an instance of the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

    # Initialize the TripletDatasets
    train_dataset = TripletDataset(train_anchor_dataset, train_positive_dataset, train_negative_dataset, train_binding_dataset, tokenizer=tokenizer, max_sequence_length=50000)
    val_dataset = TripletDataset(val_anchor_dataset, val_positive_dataset, val_negative_dataset, val_binding_dataset, tokenizer=tokenizer, max_sequence_length=50000)
    test_dataset = TripletDataset(test_anchor_dataset, test_positive_dataset, test_negative_dataset, test_binding_dataset, tokenizer=tokenizer, max_sequence_length=50000)

    train_prebatched_data_dict = {
        'anchors': [batch[0] for batch in train_dataset.triplets],
        'positives': [batch[1] for batch in train_dataset.triplets],
        # 'negatives': [batch[2] for batch in train_dataset.triplets],
        'binding_site': [batch[3] for batch in train_dataset.triplets]
    }

    val_prebatched_data_dict = {
        'anchors': [batch[0] for batch in val_dataset.triplets],
        'positives': [batch[1] for batch in val_dataset.triplets],
        # 'negatives': [batch[2] for batch in val_dataset.triplets],
        'binding_site': [batch[3] for batch in val_dataset.triplets]
    }

    test_prebatched_data_dict = {
        'anchors': [batch[0] for batch in test_dataset.triplets],
        'positives': [batch[1] for batch in test_dataset.triplets],
        # 'negatives': [batch[2] for batch in test_dataset.triplets],
        'binding_site': [batch[3] for batch in test_dataset.triplets]
    }

    # Convert the dictionaries to HuggingFace Datasets
    train_hf_dataset = HFDataset.from_dict(train_prebatched_data_dict)
    train_hf_dataset.save_to_disk('/home/tc415/muPPIt/dataset/train_dataset_drop_500')

    val_hf_dataset = HFDataset.from_dict(val_prebatched_data_dict)
    val_hf_dataset.save_to_disk('/home/tc415/muPPIt/dataset/val_dataset_drop_500')

    test_hf_dataset = HFDataset.from_dict(test_prebatched_data_dict)
    test_hf_dataset.save_to_disk('/home/tc415/muPPIt/dataset/test_dataset_drop_500')


if __name__ == "__main__":
    main()
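The `+ 1` shift applied to the binding-site indices in both prebatching scripts accounts for the start token that the ESM-2 tokenizer prepends. A quick sanity-check sketch using the same tokenizer name as above; the peptide sequence is arbitrary:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
tokens = tokenizer("MKTAYIAK", return_tensors='pt')

# One token per residue plus start and end special tokens,
# so residue i of the raw sequence sits at position i + 1 of input_ids.
print(tokens['input_ids'].shape)  # torch.Size([1, 10]) for an 8-residue peptide
```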