AlienChen committed on
Commit dbeb56d · verified · 1 Parent(s): f900c86

Delete muppit

muppit/.gitkeep DELETED
File without changes
muppit/__init__.py DELETED
File without changes
muppit/calculate_steps.py DELETED
@@ -1,72 +0,0 @@
- import math
-
-
- # def calculate_steps_per_epoch(total_samples, batch_size_per_gpu, num_gpus, scheduling):
- #     # Calculate total batch size across all GPUs
- #     total_batch_size = batch_size_per_gpu * num_gpus
- #
- #     # Calculate total batches per epoch
- #     batches_per_epoch = math.ceil(total_samples / total_batch_size)
- #
- #     steps_per_epoch = []
- #     current_accumulation_factor = 1  # Default accumulation factor
- #
- #     for epoch in range(max(scheduling.keys()) + 1):
- #         # Update accumulation factor if it's defined for the current epoch
- #         if epoch in scheduling:
- #             current_accumulation_factor = scheduling[epoch]
- #
- #         effective_steps = math.ceil(batches_per_epoch / current_accumulation_factor)
- #         steps_per_epoch.append(effective_steps)
- #
- #     return steps_per_epoch
-
- def calculate_total_steps(total_samples, batch_size, num_gpus, accumulation_schedule, max_epochs):
-     total_steps = 0
-
-     for epoch in range(max_epochs):
-         # Determine the accumulation steps for the current epoch
-         for start_epoch, steps in accumulation_schedule.items():
-             if start_epoch > epoch:
-                 break
-             accumulation_steps = steps
-
-         effective_batch_size = batch_size * num_gpus * accumulation_steps
-         steps_per_epoch = (total_samples + effective_batch_size - 1) // effective_batch_size
-
-         total_steps += steps_per_epoch
-         print(f'Epoch {epoch}: {steps_per_epoch} steps (accumulation_steps={accumulation_steps})')
-
-     return total_steps
-
-
- total_samples = 4804  # Replace with the actual number of samples in your dataset
- batch_size = 32
- num_gpus = 1
- accumulation_schedule = {0: 4, 3: 3, 10: 2}
- max_epochs = 20
-
- total_steps = calculate_total_steps(total_samples, batch_size, num_gpus, accumulation_schedule, max_epochs)
- print(f"Total Steps: {total_steps}")
-
- # total_samples = 309503  # Replace with the actual number of samples in your dataset
- # batch_size = 32
- # num_gpus = 7
- # accumulation_schedule = {0: 4, 2: 2, 7: 1}
- # max_epochs = 10
- #
- # total_steps = calculate_total_steps(total_samples, batch_size, num_gpus, accumulation_schedule, max_epochs)
- # print(f"Total Steps: {total_steps}")
-
- #
- # # Example usage
- # total_samples = 309503
- # batch_size_per_gpu = 16
- # num_gpus = 7
- # scheduling = {0: 4, 5: 3, 10: 2, 13: 1}
- #
- # steps_per_epoch = calculate_steps_per_epoch(total_samples, batch_size_per_gpu, num_gpus, scheduling)
- # for epoch, steps in enumerate(steps_per_epoch):
- #     print(f"Epoch {epoch}: {steps} steps")
- #
- # print(f"Total steps: {sum(steps_per_epoch)}")
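Editorial note (hedged): the example settings above appear to explain the hard-coded scheduler budget in finetune.py, whose configure_optimizers uses total_steps=1231 together with the same {0: 4, 3: 3, 10: 2} accumulation schedule passed to GradientAccumulationScheduler. Walking that schedule epoch by epoch with 4804 samples, batch size 32 and one GPU reproduces the number exactly; a minimal check, assuming the function defined in the deleted file above:

    # Sketch only, reusing calculate_total_steps from the deleted file above.
    # Epochs 0-2:   ceil(4804 / (32 * 1 * 4)) = 38 steps -> 3 * 38  = 114
    # Epochs 3-9:   ceil(4804 / (32 * 1 * 3)) = 51 steps -> 7 * 51  = 357
    # Epochs 10-19: ceil(4804 / (32 * 1 * 2)) = 76 steps -> 10 * 76 = 760
    assert calculate_total_steps(4804, 32, 1, {0: 4, 3: 3, 10: 2}, 20) == 114 + 357 + 760 == 1231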
 
muppit/finetune.py DELETED
@@ -1,386 +0,0 @@
1
- import pdb
2
- from pytorch_lightning.strategies import DDPStrategy
3
- import torch
4
- import torch.nn.functional as F
5
- from torch.utils.data import DataLoader, DistributedSampler
6
- from datasets import load_from_disk
7
- import pytorch_lightning as pl
8
- from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, \
9
- Timer, TQDMProgressBar, LearningRateMonitor, StochasticWeightAveraging, GradientAccumulationScheduler
10
- from pytorch_lightning.loggers import WandbLogger
11
- from torch.optim.lr_scheduler import _LRScheduler
12
- from transformers.optimization import get_cosine_schedule_with_warmup
13
- from argparse import ArgumentParser
14
- import os
15
- import uuid
16
- import numpy as np
17
- import torch.distributed as dist
18
- from models import *
19
- from torch.nn.utils.rnn import pad_sequence
20
- from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
21
- from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
22
- from torch.optim import Adam, AdamW
23
- from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
24
- import gc
25
-
26
-
27
- os.environ["TORCH_CPP_LOG_LEVEL"]="INFO"
28
- os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
29
- os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
30
-
31
-
32
- def collate_fn(batch):
33
- # Unpack the batch
34
- anchors = []
35
- positives = []
36
- # negatives = []
37
- binding_sites = []
38
-
39
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
40
-
41
- for b in batch:
42
- anchors.append(b['anchors'])
43
- positives.append(b['positives'])
44
- binding_sites.append(b['binding_site'])
45
-
46
- # Collate the tensors using torch's pad_sequence
47
- anchor_input_ids = torch.nn.utils.rnn.pad_sequence(
48
- [torch.Tensor(item['input_ids']).squeeze(0) for item in anchors], batch_first=True, padding_value=tokenizer.pad_token_id)
49
- anchor_attention_mask = torch.nn.utils.rnn.pad_sequence(
50
- [torch.Tensor(item['attention_mask']).squeeze(0) for item in anchors], batch_first=True, padding_value=0)
51
-
52
- positive_input_ids = torch.nn.utils.rnn.pad_sequence(
53
- [torch.Tensor(item['input_ids']).squeeze(0) for item in positives], batch_first=True, padding_value=tokenizer.pad_token_id)
54
- positive_attention_mask = torch.nn.utils.rnn.pad_sequence(
55
- [torch.Tensor(item['attention_mask']).squeeze(0) for item in positives], batch_first=True, padding_value=0)
56
-
57
- n, max_length = anchor_input_ids.shape[0], anchor_input_ids.shape[1]
58
- site = torch.zeros(n, max_length)
59
- for i in range(len(binding_sites)):
60
- binding_site = binding_sites[i]
61
- site[i, binding_site] = 1
62
-
63
- # Return the collated batch
64
- return {
65
- 'anchor_input_ids': anchor_input_ids.int(),
66
- 'anchor_attention_mask': anchor_attention_mask.int(),
67
- 'positive_input_ids': positive_input_ids.int(),
68
- 'positive_attention_mask': positive_attention_mask.int(),
69
- 'binding_site': site
70
- }
71
-
72
-
73
- class CustomDataModule(pl.LightningDataModule):
74
- def __init__(self, train_dataset, val_dataset, tokenizer, batch_size: int = 128):
75
- super().__init__()
76
- self.train_dataset = train_dataset
77
- self.val_dataset = val_dataset
78
- self.batch_size = batch_size
79
- self.tokenizer = tokenizer
80
-
81
- def train_dataloader(self):
82
- return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn,
83
- num_workers=8, pin_memory=True)
84
-
85
- def val_dataloader(self):
86
- return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=collate_fn, num_workers=8,
87
- pin_memory=True)
88
-
89
- def setup(self, stage=None):
90
- if stage == 'test' or stage is None:
91
- self.test_dataset = load_from_disk('/home/tc415/muPPIt/dataset/pep_prot_test')
92
- self.test_dataloader = DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=collate_fn,
93
- num_workers=8, pin_memory=True)
94
-
95
-
96
- class CosineAnnealingWithWarmup(_LRScheduler):
97
- def __init__(self, optimizer, warmup_steps, total_steps, base_lr, max_lr, min_lr, last_epoch=-1):
98
- self.warmup_steps = warmup_steps
99
- self.total_steps = total_steps
100
- self.base_lr = base_lr
101
- self.max_lr = max_lr
102
- self.min_lr = min_lr
103
- super(CosineAnnealingWithWarmup, self).__init__(optimizer, last_epoch)
104
- print(f"SELF BASE LRS = {self.base_lrs}")
105
-
106
- def get_lr(self):
107
- if self.last_epoch < self.warmup_steps:
108
- # Linear warmup phase from base_lr to max_lr
109
- return [self.base_lr + (self.max_lr - self.base_lr) * (self.last_epoch / self.warmup_steps) for base_lr in self.base_lrs]
110
-
111
- # Cosine annealing phase from max_lr to min_lr
112
- progress = (self.last_epoch - self.warmup_steps) / (self.total_steps - self.warmup_steps)
113
- cosine_decay = 0.5 * (1 + np.cos(np.pi * progress))
114
- decayed_lr = self.min_lr + (self.max_lr - self.min_lr) * cosine_decay
115
-
116
- return [decayed_lr for base_lr in self.base_lrs]
117
-
118
- class PeptideModel(pl.LightningModule):
119
- def __init__(self, n_layers, d_model, n_head,
120
- d_k, d_v, d_inner, dropout=0.2,
121
- learning_rate=0.00001, max_epochs=15):
122
- super(PeptideModel, self).__init__()
123
-
124
- self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
125
- # freeze all the esm_model parameters
126
- for param in self.esm_model.parameters():
127
- param.requires_grad = False
128
-
129
- self.repeated_module = RepeatedModule2(n_layers, d_model,
130
- n_head, d_k, d_v, d_inner, dropout=dropout)
131
-
132
- self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
133
- d_k, d_v, dropout=dropout)
134
-
135
- self.final_ffn = FFN(d_model, d_inner, dropout=dropout)
136
-
137
- self.output_projection_prot = nn.Linear(d_model, 1)
138
-
139
- self.learning_rate = learning_rate
140
- self.max_epochs = max_epochs
141
-
142
- self.classification_threshold = nn.Parameter(torch.tensor(0.5)) # Initial threshold
143
- self.historical_memory = 0.9
144
- self.class_weights = torch.tensor([3.6625710315221727, 0.5790496079007189]) # binding-site weights, non-binding-site weights
145
-
146
- def forward(self, binder_tokens, target_tokens):
147
- peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
148
- protein_sequence = self.esm_model(**target_tokens).last_hidden_state
149
-
150
- prot_enc, sequence_enc, sequence_attention_list, prot_attention_list, \
151
- prot_seq_attention_list, seq_prot_attention_list = self.repeated_module(peptide_sequence,
152
- protein_sequence)
153
-
154
- prot_enc, final_prot_seq_attention = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)
155
-
156
- prot_enc = self.final_ffn(prot_enc)
157
-
158
- prot_enc = self.output_projection_prot(prot_enc)
159
-
160
- return prot_enc
161
-
162
- def training_step(self, batch, batch_idx):
163
- opt = self.optimizers()
164
- lr = opt.param_groups[0]['lr']
165
- self.log('learning_rate', lr, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
166
-
167
- target_tokens = {'input_ids': batch['anchor_input_ids'].to(self.device),
168
- 'attention_mask': batch['anchor_attention_mask'].to(self.device)}
169
- binder_tokens = {'input_ids': batch['positive_input_ids'].to(self.device),
170
- 'attention_mask': batch['positive_attention_mask'].to(self.device)}
171
- binding_site = batch['binding_site'].to(self.device)
172
- mask = target_tokens['attention_mask']
173
-
174
- outputs_nodes = self.forward(binder_tokens, target_tokens).squeeze(-1)
175
-
176
- weight = self.class_weights[0] * binding_site + self.class_weights[1] * (1 - binding_site)
177
- loss = F.binary_cross_entropy_with_logits(outputs_nodes, binding_site, weight=weight, reduction='none')
178
-
179
- masked_loss = loss * mask
180
- mean_loss = masked_loss.sum() / mask.sum()
181
-
182
- # print('logging')
183
- self.log('train_loss', mean_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
184
- return mean_loss
185
-
186
- def validation_step(self, batch, batch_idx):
187
- target_tokens = {'input_ids': batch['anchor_input_ids'].to(self.device),
188
- 'attention_mask': batch['anchor_attention_mask'].to(self.device)}
189
- binder_tokens = {'input_ids': batch['positive_input_ids'].to(self.device),
190
- 'attention_mask': batch['positive_attention_mask'].to(self.device)}
191
- binding_site = batch['binding_site'].to(self.device)
192
- mask = target_tokens['attention_mask']
193
-
194
- outputs_nodes = self.forward(binder_tokens, target_tokens).squeeze(-1)
195
-
196
- weight = self.class_weights[0] * binding_site + self.class_weights[1] * (1 - binding_site)
197
- loss = F.binary_cross_entropy_with_logits(outputs_nodes, binding_site, weight=weight, reduction='none')
198
-
199
- # Apply the mask to the loss
200
- masked_loss = loss * mask
201
-
202
- # Compute the mean loss only over the valid positions
203
- mean_loss = masked_loss.sum() / mask.sum()
204
-
205
- # Calculate predictions and apply mask
206
- sigmoid_outputs = torch.sigmoid(outputs_nodes)
207
- total = mask.sum()
208
-
209
- self.update_class_thresholds(sigmoid_outputs, binding_site, mask)
210
- self.log('threshold', self.classification_threshold, on_epoch=True)
211
-
212
- predict = (sigmoid_outputs >= self.classification_threshold).float()
213
- correct = ((predict == binding_site) * mask).sum()
214
- accuracy = correct / total
215
-
216
- # Compute AUC
217
- outputs_nodes_flat = sigmoid_outputs[mask.bool()].float().cpu().detach().numpy().flatten()
218
- binding_site_flat = binding_site[mask.bool()].float().cpu().detach().numpy().flatten()
219
- predictions_flat = predict[mask.bool()].float().cpu().detach().numpy().flatten()
220
-
221
- auc = roc_auc_score(binding_site_flat, outputs_nodes_flat)
222
- f1 = f1_score(binding_site_flat, predictions_flat)
223
- mcc = matthews_corrcoef(binding_site_flat, predictions_flat)
224
-
225
- self.log('val_loss', mean_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
226
- self.log('val_accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
227
- self.log('val_auc', auc, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
228
- self.log('val_f1', f1, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
229
- self.log('val_mcc', mcc, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
230
-
231
- def configure_optimizers(self):
232
- print(f"MAX EPOCHS = {self.max_epochs}")
233
- optimizer = AdamW(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.95))
234
- # schedulers = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=0.1*self.max_epochs,
235
- # max_epochs=self.max_epochs,
236
- # warmup_start_lr=5e-4,
237
- # eta_min=0.1 * self.learning_rate)
238
-
239
- base_lr = 0
240
- max_lr = self.learning_rate
241
- min_lr = 0.1 * self.learning_rate
242
-
243
- schedulers = CosineAnnealingWithWarmup(optimizer, warmup_steps=76, total_steps=1231,
244
- base_lr=base_lr, max_lr=max_lr, min_lr=min_lr)
245
-
246
- lr_schedulers = {
247
- "scheduler": schedulers,
248
- "name": 'learning_rate_logs',
249
- "interval": 'step', # The scheduler updates the learning rate at every step (not epoch)
250
- 'frequency': 1 # The scheduler updates the learning rate after every batch
251
- }
252
- return [optimizer], [lr_schedulers]
253
-
254
- def update_class_thresholds(self, inputs, targets, mask):
255
- with torch.no_grad():
256
- min_threshold_value = 0.001
257
- thresholds = torch.arange(0.1, 1.0, 0.05, device=inputs.device)
258
-
259
- best_f1_score = 0
260
- best_threshold = min_threshold_value
261
-
262
- for threshold in thresholds:
263
- binary_predictions = (inputs >= threshold).float()
264
-
265
- tp = ((binary_predictions * targets) * mask).sum().item()
266
- fp = ((binary_predictions * (1 - targets)) * mask).sum().item()
267
- fn = (((1 - binary_predictions) * targets) * mask).sum().item()
268
-
269
- precision = tp / (tp + fp + 1e-7)
270
- recall = tp / (tp + fn + 1e-7)
271
- f1_score = 2 * precision * recall / (precision + recall + 1e-7)
272
-
273
- if f1_score > best_f1_score:
274
- best_f1_score = f1_score
275
- best_threshold = threshold
276
-
277
- updated_threshold = self.historical_memory * self.classification_threshold + (
278
- 1 - self.historical_memory) * best_threshold
279
- self.classification_threshold = nn.Parameter(torch.clamp(updated_threshold, min=min_threshold_value))
280
- gc.collect()
281
- torch.cuda.empty_cache()
282
-
283
- def training_epoch_end(self, outputs):
284
- gc.collect()
285
- torch.cuda.empty_cache()
286
- super().training_epoch_end(outputs)
287
-
288
- def validation_epoch_end(self, outputs):
289
- gc.collect()
290
- torch.cuda.empty_cache()
291
- super().validation_epoch_end(outputs)
292
-
293
-
294
-
295
-
296
- def main():
297
- parser = ArgumentParser()
298
-
299
- parser.add_argument("-o", dest="output_file", help="File for output of model parameters", required=True, type=str)
300
- parser.add_argument("-d", dest="dataset", required=False, type=str, default="pepnn",
301
- help="Which dataset to train on, pepnn, pepbind, or interpep")
302
- parser.add_argument("-lr", type=float, default=1e-3)
303
- parser.add_argument("-batch_size", type=int, default=2, help="Batch size")
304
- parser.add_argument("-n_layers", type=int, default=6, help="Number of layers")
305
- parser.add_argument("-d_model", type=int, default=64, help="Dimension of model")
306
- parser.add_argument("-n_head", type=int, default=6, help="Number of heads")
307
- parser.add_argument("-d_inner", type=int, default=64)
308
- # parser.add_argument("-sm", dest="saved_model", help="File containing initial params", required=False, type=str,
309
- # default=None)
310
- parser.add_argument("-sm", default=None, help="File containing initial params", type=str)
311
- parser.add_argument("--max_epochs", type=int, default=15, help="Max number of epochs to train")
312
- args = parser.parse_args()
313
-
314
- print(args.max_epochs)
315
-
316
- # Initialize the process group for distributed training
317
- dist.init_process_group(backend="nccl")
318
-
319
- train_dataset = load_from_disk('/home/tc415/muPPIt/dataset/pep_prot_train')
320
- val_dataset = load_from_disk('/home/tc415/muPPIt/dataset/pep_prot_val')
321
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
322
-
323
- data_module = CustomDataModule(train_dataset, val_dataset, tokenizer=tokenizer, batch_size=args.batch_size)
324
-
325
- model = PeptideModel(6, 64, 6, 64, 128, 64, dropout=0.2,
326
- learning_rate=args.lr, max_epochs=args.max_epochs)
327
- if args.sm:
328
- model = PeptideModel.load_from_checkpoint(args.sm,
329
- n_layers=args.n_layers,
330
- d_model=args.d_model,
331
- n_head=args.n_head,
332
- d_k=64,
333
- d_v=128,
334
- d_inner=64,
335
- dropout=0.2,
336
- learning_rate=args.lr,
337
- max_epochs=args.max_epochs)
338
-
339
- run_id = str(uuid.uuid4())
340
-
341
- print("Classification Thresholds:")
342
- print(model.classification_threshold)
343
-
344
- logger = WandbLogger(project=f"bind_evaluator",
345
- name=f"finetune_lr={args.lr}_nlayers={args.n_layers}_dmodel={args.d_model}_nhead={args.n_head}_dinner={args.d_inner}",
346
- # display on the web
347
- # save_dir=f'./pl_logs/',
348
- job_type='model-training',
349
- id=run_id)
350
-
351
- checkpoint_callback = ModelCheckpoint(
352
- monitor='val_mcc',
353
- dirpath=args.output_file,
354
- filename='model-{epoch:02d}-{val_mcc:.2f}',
355
- save_top_k=1,
356
- mode='max',
357
- )
358
-
359
- early_stopping_callback = EarlyStopping(
360
- monitor='val_mcc',
361
- patience=5,
362
- verbose=True,
363
- mode='max'
364
- )
365
-
366
- accumulator = GradientAccumulationScheduler(scheduling={0: 4, 3: 3, 10: 2})
367
-
368
- trainer = pl.Trainer(
369
- max_epochs=args.max_epochs,
370
- accelerator='gpu',
371
- strategy='ddp',
372
- precision='bf16',
373
- logger=logger,
374
- devices=[0],
375
- callbacks=[checkpoint_callback, accumulator, early_stopping_callback],
376
- gradient_clip_val=1.0
377
- )
378
-
379
- trainer.fit(model, datamodule=data_module)
380
-
381
- best_model_path = checkpoint_callback.best_model_path
382
- print(best_model_path)
383
-
384
-
385
- if __name__ == "__main__":
386
- main()
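Editorial note (hedged): the loss used in training_step and validation_step above is a class-weighted binary cross-entropy averaged only over unmasked target residues. A minimal standalone sketch of that computation with small illustrative tensors (the weights mirror self.class_weights in the deleted code; shapes are illustrative only):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 7)                     # (batch, target_len) per-residue logits
    binding_site = torch.randint(0, 2, (2, 7)).float()
    mask = torch.ones(2, 7)                        # attention mask: 1 = real residue, 0 = padding
    class_weights = torch.tensor([3.6625710315221727, 0.5790496079007189])

    weight = class_weights[0] * binding_site + class_weights[1] * (1 - binding_site)
    loss = F.binary_cross_entropy_with_logits(logits, binding_site, weight=weight, reduction='none')
    mean_loss = (loss * mask).sum() / mask.sum()   # average over valid positions only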
 
muppit/models/.gitattributes DELETED
@@ -1 +0,0 @@
- ProtBert-BFD/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 
 
muppit/models/.gitkeep DELETED
File without changes
muppit/models/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .models import *
- from .score_domain import *
- from .dataloaders import *
 
 
 
 
muppit/models/dataloaders.py DELETED
@@ -1,426 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Sat Jul 31 21:54:08 2021
4
-
5
- @author: Osama
6
- """
7
-
8
- from torch.utils.data import Dataset
9
- from Bio.PDB import Polypeptide
10
- import numpy as np
11
- import torch
12
- import pandas as pd
13
- import os
14
- import esm
15
- import ast
16
- import pdb
17
-
18
-
19
- class InterpepComplexes(Dataset):
20
-
21
- def __init__(self, mode,
22
- encoded_data_directory = "../../datasets/interpep_data/"):
23
-
24
- self.mode = mode
25
-
26
- self.encoded_data_directory = encoded_data_directory
27
-
28
- self.train_dir = "../../datasets/interpep_data/train_examples.npy"
29
-
30
- self.test_dir = "../../datasets/interpep_data/test_examples.npy"
31
-
32
- self.val_dir = "../../datasets/interpep_data/val_examples.npy"
33
-
34
-
35
- self.test_list = np.load(self.test_dir)
36
-
37
- self.train_list = np.load(self.train_dir)
38
-
39
- self.val_list = np.load(self.val_dir)
40
-
41
-
42
-
43
- if mode == "train":
44
- self.num_data = len(self.train_list)
45
- elif mode == "val":
46
- self.num_data = len(self.val_list)
47
- elif mode == "test":
48
- self.num_data = len(self.test_list)
49
-
50
-
51
-
52
- def __getitem__(self, index):
53
-
54
- if self.mode == "train":
55
- item = self.train_list[index]
56
- elif self.mode == "val":
57
- item = self.val_list[index]
58
- elif self.mode == "test":
59
- item = self.test_list[index]
60
-
61
- file_dir = self.encoded_data_directory
62
-
63
- with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
64
- temp_pep_sequence = data["target_sequence"]
65
- temp_binding_sites = data["binding_sites"]
66
-
67
-
68
- with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +\
69
- item.split("_")[1] + ".npz") as data:
70
- temp_nodes = data["nodes"]
71
-
72
-
73
- binding = np.zeros(len(temp_nodes))
74
- if len(temp_binding_sites) != 0:
75
- binding[temp_binding_sites] = 1
76
- target = torch.LongTensor(binding)
77
-
78
-
79
-
80
-
81
-
82
-
83
-
84
- nodes = temp_nodes[:, 0:20]
85
-
86
- prot_sequence = np.argmax(nodes, axis=-1)
87
-
88
-
89
-
90
- prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
91
-
92
-
93
-
94
- pep_sequence = temp_pep_sequence
95
-
96
- pep_sequence = torch.argmax(torch.FloatTensor(pep_sequence), dim=-1)
97
-
98
-
99
-
100
-
101
-
102
- return pep_sequence, prot_sequence, target
103
-
104
- def __len__(self):
105
- return self.num_data
106
-
107
- class PPI(Dataset):
108
-
109
- def __init__(self, mode, csv_dir_path = "/home/u21307130002/PepNN/pepnn/datasets/ppi/"):
110
-
111
- self.mode = mode
112
- self.train_data = pd.read_csv(os.path.join(csv_dir_path, 'train.csv'))
113
- self.val_data = pd.read_csv(os.path.join(csv_dir_path, 'val.csv'))
114
- # self.test_data = pd.read_csv(os.path.join(csv_dir_path, 'test.csv'))
115
-
116
- if self.mode == 'train':
117
- self.num_data = len(self.train_data)
118
-
119
- def __len__(self):
120
- return self.num_data
121
-
122
- def __getitem__(self, index):
123
- # pdb.set_trace()
124
- if torch.is_tensor(index):
125
- index = index.tolist()
126
-
127
- if self.mode == "train":
128
- item = self.train_data.iloc[index]
129
- elif self.mode == "val":
130
- item = self.val_data.iloc[index]
131
- elif self.mode == "test":
132
- item = self.test_data.iloc[index]
133
- else:
134
- item = None
135
-
136
- # print(item)
137
-
138
- motif1 = ast.literal_eval(item['Chain_1_motifs'])
139
- motif2 = ast.literal_eval(item['Chain_2_motifs'])
140
-
141
- if len(motif1[0]) > len(motif2[0]):
142
- target = motif1
143
- prot_sequence = item['Sequence1']
144
- pep_sequence = item['Sequence2']
145
- else:
146
- target = motif2
147
- pep_sequence = item['Sequence1']
148
- prot_sequence = item['Sequence2']
149
-
150
- target = [int(motif.split('_')[1]) for motif in target]
151
-
152
- if target[-1] >= len(prot_sequence):
153
- pdb.set_trace()
154
-
155
- binding = np.zeros(len(prot_sequence))
156
- if len(target) != 0:
157
- binding[target] = 1
158
- target = torch.LongTensor(binding).float()
159
-
160
- # print(f"peptide length: {len(pep_sequence)}")
161
- # print(f"protein length: {len(prot_sequence)}")
162
- # print(f"target length: {len(target)}")
163
- # pdb.set_trace()
164
-
165
- return pep_sequence, prot_sequence, target
166
-
167
-
168
-
169
-
170
- class PepBindComplexes(Dataset):
171
-
172
- def __init__(self, mode,
173
- encoded_data_directory = "../../datasets/pepbind_data/"):
174
-
175
- self.mode = mode
176
-
177
- self.encoded_data_directory = encoded_data_directory
178
-
179
- self.train_dir = "../../datasets/pepbind_data/train_examples.npy"
180
-
181
- self.test_dir = "../../datasets/pepbind_data/test_examples.npy"
182
-
183
- self.val_dir = "../../datasets/pepbind_data/val_examples.npy"
184
-
185
-
186
- self.test_list = np.load(self.test_dir)
187
-
188
- self.train_list = np.load(self.train_dir)
189
-
190
- self.val_list = np.load(self.val_dir)
191
-
192
-
193
- if mode == "train":
194
- self.num_data = len(self.train_list)
195
- elif mode == "val":
196
- self.num_data = len(self.val_list)
197
- elif mode == "test":
198
- self.num_data = len(self.test_list)
199
-
200
-
201
-
202
- def __getitem__(self, index):
203
-
204
- if self.mode == "train":
205
- item = self.train_list[index]
206
-
207
-
208
- elif self.mode == "val":
209
- item = self.val_list[index]
210
-
211
-
212
- elif self.mode == "test":
213
- item = self.test_list[index]
214
-
215
-
216
-
217
- file_dir = self.encoded_data_directory
218
-
219
-
220
- with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
221
- temp_pep_sequence = data["target_sequence"]
222
- temp_binding_sites = data["binding_sites"]
223
-
224
-
225
- with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +\
226
- item.split("_")[1] + ".npz") as data:
227
- temp_nodes = data["nodes"]
228
-
229
-
230
- binding = np.zeros(len(temp_nodes))
231
- if len(temp_binding_sites) != 0:
232
- binding[temp_binding_sites] = 1
233
- target = torch.LongTensor(binding)
234
-
235
- nodes = temp_nodes[:, 0:20]
236
-
237
- prot_sequence = np.argmax(nodes, axis=-1)
238
-
239
-
240
- prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
241
-
242
-
243
- pep_sequence = temp_pep_sequence
244
-
245
- pep_sequence = torch.argmax(torch.FloatTensor(pep_sequence), dim=-1)
246
-
247
-
248
- return pep_sequence, prot_sequence, target
249
-
250
-
251
- def __len__(self):
252
- return self.num_data
253
-
254
- class PeptideComplexes(Dataset):
255
-
256
- def __init__(self, mode,
257
- encoded_data_directory = "../../datasets/pepnn_data/all_data/"):
258
-
259
- self.mode = mode
260
-
261
- self.encoded_data_directory = encoded_data_directory
262
-
263
- self.train_dir = "../../datasets/pepnn_data/train_examples.npy"
264
-
265
- self.test_dir = "../../datasets/pepnn_test_data/test_examples.npy"
266
-
267
- self.val_dir = "../../datasets/pepnn_data/val_examples.npy"
268
-
269
-
270
- self.example_weights = np.load("../../datasets/pepnn_data/example_weights.npy")
271
-
272
- self.test_list = np.load(self.test_dir)
273
-
274
- self.train_list = np.load(self.train_dir)
275
-
276
- self.val_list = np.load(self.val_dir)
277
-
278
-
279
-
280
- if mode == "train":
281
- self.num_data = len(self.train_list)
282
- elif mode == "val":
283
- self.num_data = len(self.val_list)
284
- elif mode == "test":
285
- self.num_data = len(self.test_list)
286
-
287
-
288
-
289
- def __getitem__(self, index):
290
-
291
-
292
- if self.mode == "train":
293
- item = self.train_list[index]
294
-
295
- weight = self.example_weights[item]
296
-
297
- elif self.mode == "val":
298
- item = self.val_list[index]
299
-
300
- weight = self.example_weights[item]
301
-
302
- elif self.mode == "test":
303
- item = self.test_list[index]
304
-
305
- weight = 1
306
-
307
- if self.mode != "test":
308
- file_dir = self.encoded_data_directory
309
- else:
310
- file_dir = "../../datasets/pepnn_test_data/all_data/"
311
-
312
-
313
- with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
314
- temp_pep_sequence = data["target_sequence"]
315
- temp_binding_sites = data["binding_sites"]
316
-
317
-
318
- with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +\
319
- item.split("_")[1] + ".npz") as data:
320
- temp_nodes = data["nodes"]
321
-
322
-
323
- binding = np.zeros(len(temp_nodes))
324
- if len(temp_binding_sites) != 0:
325
- binding[temp_binding_sites] = 1
326
- target = torch.LongTensor(binding)
327
-
328
-
329
-
330
-
331
-
332
-
333
-
334
- nodes = temp_nodes[:, 0:20]
335
-
336
- prot_sequence = np.argmax(nodes, axis=-1)
337
-
338
-
339
-
340
- prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
341
-
342
-
343
-
344
- pep_sequence = temp_pep_sequence
345
-
346
- pep_sequence = torch.argmax(torch.FloatTensor(pep_sequence), dim=-1)
347
-
348
-
349
-
350
-
351
-
352
- return pep_sequence, prot_sequence, target, weight
353
-
354
-
355
- def __len__(self):
356
- return self.num_data
357
-
358
-
359
- class BitenetComplexes(Dataset):
360
-
361
- def __init__(self, encoded_data_directory = "../bitenet_data/all_data/"):
362
-
363
-
364
- self.encoded_data_directory = encoded_data_directory
365
-
366
-
367
-
368
-
369
- self.train_dir = "../../datasets/bitenet_data/examples.npy"
370
-
371
-
372
-
373
-
374
- self.full_list = np.load(self.train_dir)
375
-
376
-
377
-
378
-
379
- self.num_data = len(self.full_list)
380
-
381
-
382
-
383
-
384
- def __getitem__(self, index):
385
-
386
- item = self.full_list[index]
387
-
388
- file_dir = self.encoded_data_directory
389
-
390
- with np.load(file_dir + "fragment_data/" + item[:-1] + "_" + item[-1] + ".npz") as data:
391
- temp_pep_sequence = data["target_sequence"]
392
- temp_binding_matrix = data["binding_matrix"]
393
-
394
-
395
- with np.load(file_dir + "receptor_data/" + item.split("_")[0] + "_" +\
396
- item.split("_")[1][0] + ".npz") as data:
397
- temp_nodes = data["nodes"]
398
-
399
-
400
- binding_sum = np.sum(temp_binding_matrix, axis=0).T
401
-
402
- target = torch.LongTensor(binding_sum >= 1)
403
-
404
-
405
-
406
- nodes = temp_nodes[:, 0:20]
407
-
408
- prot_sequence = np.argmax(nodes, axis=-1)
409
-
410
-
411
-
412
- prot_sequence = " ".join([Polypeptide.index_to_one(i) for i in prot_sequence])
413
-
414
-
415
-
416
- pep_sequence = temp_pep_sequence
417
-
418
- pep_sequence = torch.argmax(torch.FloatTensor(pep_sequence), dim=-1)
419
-
420
-
421
-
422
-
423
- return pep_sequence, prot_sequence, target
424
-
425
- def __len__(self):
426
- return self.num_data
 
muppit/models/layers.py DELETED
@@ -1,44 +0,0 @@
- from torch import nn
- from .modules import *
-
- class ReciprocalLayer(nn.Module):
-
-     def __init__(self, d_model, d_inner, n_head, d_k, d_v):
-
-         super().__init__()
-
-         self.sequence_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
-                                                                    d_k, d_v)
-
-         self.protein_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
-                                                                   d_k, d_v)
-
-         self.reciprocal_attention_layer = MultiHeadAttentionReciprocal(n_head, d_model,
-                                                                        d_k, d_v)
-
-
-
-         self.ffn_seq = FFN(d_model, d_inner)
-
-         self.ffn_protein = FFN(d_model, d_inner)
-
-     def forward(self, sequence_enc, protein_seq_enc):
-         prot_enc, prot_attention = self.protein_attention_layer(protein_seq_enc, protein_seq_enc, protein_seq_enc)
-
-         seq_enc, sequence_attention = self.sequence_attention_layer(sequence_enc, sequence_enc, sequence_enc)
-
-
-         prot_enc, seq_enc, prot_seq_attention, seq_prot_attention = self.reciprocal_attention_layer(prot_enc,
-                                                                                                     seq_enc,
-                                                                                                     seq_enc,
-                                                                                                     prot_enc)
-         prot_enc = self.ffn_protein(prot_enc)
-
-         seq_enc = self.ffn_seq(seq_enc)
-
-
-
-         return prot_enc, seq_enc, prot_attention, sequence_attention, prot_seq_attention, seq_prot_attention
-
-
-
 
muppit/models/models.py DELETED
@@ -1,238 +0,0 @@
1
- import pdb
2
-
3
- import numpy as np
4
- import torch
5
- import torch.nn as nn
6
- from .layers import *
7
- from .modules import *
8
- import pdb
9
- from transformers import EsmModel, EsmTokenizer
10
-
11
- def to_var(x):
12
- if torch.cuda.is_available():
13
- x = x.cuda()
14
- return x
15
-
16
-
17
- class RepeatedModule2(nn.Module):
18
- def __init__(self, n_layers, d_model,
19
- n_head, d_k, d_v, d_inner, dropout=0.1):
20
- super().__init__()
21
-
22
- self.linear1 = nn.Linear(1280, d_model)
23
- self.linear2 = nn.Linear(1280, d_model)
24
- self.sequence_embedding = nn.Embedding(20, d_model)
25
- self.d_model = d_model
26
-
27
- self.reciprocal_layer_stack = nn.ModuleList([
28
- ReciprocalLayer(d_model, d_inner, n_head, d_k, d_v)
29
- for _ in range(n_layers)])
30
-
31
- self.dropout = nn.Dropout(dropout)
32
- self.dropout_2 = nn.Dropout(dropout)
33
-
34
- def forward(self, peptide_sequence, protein_sequence):
35
- sequence_attention_list = []
36
-
37
- prot_attention_list = []
38
-
39
- prot_seq_attention_list = []
40
-
41
- seq_prot_attention_list = []
42
-
43
- sequence_enc = self.dropout(self.linear1(peptide_sequence))
44
-
45
- prot_enc = self.dropout_2(self.linear2(protein_sequence))
46
-
47
- for reciprocal_layer in self.reciprocal_layer_stack:
48
- prot_enc, sequence_enc, prot_attention, sequence_attention, prot_seq_attention, seq_prot_attention = \
49
- reciprocal_layer(sequence_enc, prot_enc)
50
-
51
- sequence_attention_list.append(sequence_attention)
52
-
53
- prot_attention_list.append(prot_attention)
54
-
55
- prot_seq_attention_list.append(prot_seq_attention)
56
-
57
- seq_prot_attention_list.append(seq_prot_attention)
58
-
59
- return prot_enc, sequence_enc, sequence_attention_list, prot_attention_list, \
60
- prot_seq_attention_list, seq_prot_attention_list
61
-
62
-
63
- class RepeatedModule(nn.Module):
64
-
65
- def __init__(self, n_layers, d_model,
66
- n_head, d_k, d_v, d_inner, dropout=0.1):
67
-
68
- super().__init__()
69
-
70
- self.linear = nn.Linear(1024, d_model)
71
- self.sequence_embedding = nn.Embedding(20, d_model)
72
- self.d_model = d_model
73
-
74
- self.reciprocal_layer_stack = nn.ModuleList([
75
- ReciprocalLayer(d_model, d_inner, n_head, d_k, d_v)
76
- for _ in range(n_layers)])
77
-
78
- self.dropout = nn.Dropout(dropout)
79
- self.dropout_2 = nn.Dropout(dropout)
80
-
81
-
82
-
83
- def _positional_embedding(self, batches, number):
84
-
85
- result = torch.exp(torch.arange(0, self.d_model,2,dtype=torch.float32)*-1*(np.log(10000)/self.d_model))
86
-
87
- numbers = torch.arange(0, number, dtype=torch.float32)
88
-
89
- numbers = numbers.unsqueeze(0)
90
-
91
- numbers = numbers.unsqueeze(2)
92
-
93
- result = numbers*result
94
-
95
- result = torch.cat((torch.sin(result), torch.cos(result)),2)
96
-
97
- return result
98
-
99
- def forward(self, peptide_sequence, protein_sequence):
100
-
101
-
102
- sequence_attention_list = []
103
-
104
- prot_attention_list = []
105
-
106
- prot_seq_attention_list = []
107
-
108
- seq_prot_attention_list = []
109
-
110
- sequence_enc = self.sequence_embedding(peptide_sequence)
111
-
112
- sequence_enc += to_var(self._positional_embedding(peptide_sequence.shape[0],
113
- peptide_sequence.shape[1]))
114
- sequence_enc = self.dropout(sequence_enc)
115
-
116
-
117
-
118
-
119
-
120
- prot_enc = self.dropout_2(self.linear(protein_sequence))
121
-
122
-
123
-
124
-
125
- for reciprocal_layer in self.reciprocal_layer_stack:
126
-
127
- prot_enc, sequence_enc, prot_attention, sequence_attention, prot_seq_attention, seq_prot_attention =\
128
- reciprocal_layer(sequence_enc, prot_enc)
129
-
130
- sequence_attention_list.append(sequence_attention)
131
-
132
- prot_attention_list.append(prot_attention)
133
-
134
- prot_seq_attention_list.append(prot_seq_attention)
135
-
136
- seq_prot_attention_list.append(seq_prot_attention)
137
-
138
-
139
-
140
- return prot_enc, sequence_enc, sequence_attention_list, prot_attention_list,\
141
- prot_seq_attention_list, seq_prot_attention_list
142
-
143
-
144
- class FullModel(nn.Module):
145
-
146
- def __init__(self, n_layers, d_model, n_head,
147
- d_k, d_v, d_inner, return_attention=False, dropout=0.2):
148
- super().__init__()
149
-
150
- self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
151
-
152
- # freeze all the esm_model parameters
153
- for param in self.esm_model.parameters():
154
- param.requires_grad = False
155
-
156
- self.repeated_module = RepeatedModule2(n_layers, d_model,
157
- n_head, d_k, d_v, d_inner, dropout=dropout)
158
-
159
- self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
160
- d_k, d_v, dropout=dropout)
161
-
162
- self.final_ffn = FFN(d_model, d_inner, dropout=dropout)
163
-
164
- self.output_projection_prot = nn.Linear(d_model, 1)
165
- self.sigmoid = nn.Sigmoid()
166
-
167
- self.return_attention = return_attention
168
-
169
- def forward(self, binder_tokens, target_tokens):
170
-
171
- with torch.no_grad():
172
- peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
173
- protein_sequence = self.esm_model(**target_tokens).last_hidden_state
174
-
175
- # pdb.set_trace()
176
-
177
- prot_enc, sequence_enc, sequence_attention_list, prot_attention_list, \
178
- prot_seq_attention_list, seq_prot_attention_list = self.repeated_module(peptide_sequence,
179
- protein_sequence)
180
-
181
- prot_enc, final_prot_seq_attention = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)
182
-
183
- # pdb.set_trace()
184
-
185
- prot_enc = self.final_ffn(prot_enc)
186
-
187
- prot_enc = self.sigmoid(self.output_projection_prot(prot_enc))
188
-
189
- return prot_enc
190
-
191
-
192
-
193
- class Original_FullModel(nn.Module):
194
-
195
- def __init__(self, n_layers, d_model, n_head,
196
- d_k, d_v, d_inner, return_attention=False, dropout=0.2):
197
-
198
- super().__init__()
199
- self.repeated_module = RepeatedModule(n_layers, d_model,
200
- n_head, d_k, d_v, d_inner, dropout=dropout)
201
-
202
- self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
203
- d_k, d_v, dropout=dropout)
204
-
205
- self.final_ffn = FFN(d_model, d_inner, dropout=dropout)
206
- self.output_projection_prot = nn.Linear(d_model, 2)
207
-
208
-
209
-
210
- self.softmax_prot =nn.LogSoftmax(dim=-1)
211
-
212
-
213
- self.return_attention = return_attention
214
-
215
- def forward(self, peptide_sequence, protein_sequence):
216
-
217
- prot_enc, sequence_enc, sequence_attention_list, prot_attention_list,\
218
- prot_seq_attention_list, seq_prot_attention_list = self.repeated_module(peptide_sequence,
219
- protein_sequence)
220
-
221
-
222
-
223
- prot_enc, final_prot_seq_attention = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)
224
-
225
- prot_enc = self.final_ffn(prot_enc)
226
-
227
- prot_enc = self.softmax_prot(self.output_projection_prot(prot_enc))
228
-
229
-
230
-
231
-
232
-
233
- if not self.return_attention:
234
- return prot_enc
235
- else:
236
- return prot_enc, sequence_attention_list, prot_attention_list,\
237
- prot_seq_attention_list, seq_prot_attention_list
238
-
 
muppit/models/modules.py DELETED
@@ -1,213 +0,0 @@
1
- from torch import nn
2
- import numpy as np
3
- import torch
4
- import torch.nn.functional as F
5
-
6
-
7
- def to_var(x):
8
- if torch.cuda.is_available():
9
- x = x.cuda()
10
- return x
11
-
12
-
13
-
14
-
15
-
16
- class MultiHeadAttentionSequence(nn.Module):
17
-
18
-
19
- def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
20
-
21
- super().__init__()
22
-
23
- self.n_head = n_head
24
- self.d_model = d_model
25
- self.d_k = d_k
26
- self.d_v = d_v
27
-
28
-
29
- self.W_Q = nn.Linear(d_model, n_head*d_k)
30
- self.W_K = nn.Linear(d_model, n_head*d_k)
31
- self.W_V = nn.Linear(d_model, n_head*d_v)
32
- self.W_O = nn.Linear(n_head*d_v, d_model)
33
-
34
-
35
- self.layer_norm = nn.LayerNorm(d_model)
36
-
37
- self.dropout = nn.Dropout(dropout)
38
-
39
-
40
- def forward(self, q, k, v):
41
-
42
- batch, len_q, _ = q.size()
43
- batch, len_k, _ = k.size()
44
- batch, len_v, _ = v.size()
45
-
46
-
47
-
48
- Q = self.W_Q(q).view([batch, len_q, self.n_head, self.d_k])
49
- K = self.W_K(k).view([batch, len_k, self.n_head, self.d_k])
50
- V = self.W_V(v).view([batch, len_v, self.n_head, self.d_v])
51
-
52
-
53
-
54
-
55
- Q = Q.transpose(1, 2)
56
- K = K.transpose(1, 2).transpose(2, 3)
57
- V = V.transpose(1, 2)
58
-
59
-
60
- attention = torch.matmul(Q, K)
61
-
62
-
63
-
64
-
65
- attention = attention /np.sqrt(self.d_k)
66
-
67
-
68
-
69
- attention = F.softmax(attention, dim=-1)
70
-
71
-
72
-
73
-
74
- output = torch.matmul(attention, V)
75
-
76
-
77
-
78
- output = output.transpose(1, 2).reshape([batch, len_q, self.d_v*self.n_head])
79
-
80
-
81
-
82
- output = self.W_O(output)
83
-
84
-
85
- output = self.dropout(output)
86
-
87
- output = self.layer_norm(output + q)
88
-
89
-
90
-
91
-
92
-
93
- return output, attention
94
-
95
- class MultiHeadAttentionReciprocal(nn.Module):
96
-
97
-
98
- def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
99
-
100
- super().__init__()
101
-
102
- self.n_head = n_head
103
- self.d_model = d_model
104
- self.d_k = d_k
105
- self.d_v = d_v
106
-
107
-
108
- self.W_Q = nn.Linear(d_model, n_head*d_k)
109
- self.W_K = nn.Linear(d_model, n_head*d_k)
110
- self.W_V = nn.Linear(d_model, n_head*d_v)
111
- self.W_O = nn.Linear(n_head*d_v, d_model)
112
- self.W_V_2 = nn.Linear(d_model, n_head*d_v)
113
- self.W_O_2 = nn.Linear(n_head*d_v, d_model)
114
-
115
- self.layer_norm = nn.LayerNorm(d_model)
116
-
117
- self.dropout = nn.Dropout(dropout)
118
-
119
- self.layer_norm_2 = nn.LayerNorm(d_model)
120
-
121
- self.dropout_2 = nn.Dropout(dropout)
122
-
123
-
124
-
125
-
126
- def forward(self, q, k, v, v_2):
127
-
128
- batch, len_q, _ = q.size()
129
- batch, len_k, _ = k.size()
130
- batch, len_v, _ = v.size()
131
- batch, len_v_2, _ = v_2.size()
132
-
133
-
134
- Q = self.W_Q(q).view([batch, len_q, self.n_head, self.d_k])
135
- K = self.W_K(k).view([batch, len_k, self.n_head, self.d_k])
136
- V = self.W_V(v).view([batch, len_v, self.n_head, self.d_v])
137
- V_2 = self.W_V_2(v_2).view([batch, len_v_2, self.n_head, self.d_v])
138
-
139
-
140
-
141
- Q = Q.transpose(1, 2)
142
- K = K.transpose(1, 2).transpose(2, 3)
143
- V = V.transpose(1, 2)
144
- V_2 = V_2.transpose(1,2)
145
-
146
- attention = torch.matmul(Q, K)
147
-
148
-
149
- attention = attention /np.sqrt(self.d_k)
150
-
151
- attention_2 = attention.transpose(-2, -1)
152
-
153
-
154
-
155
- attention = F.softmax(attention, dim=-1)
156
-
157
- attention_2 = F.softmax(attention_2, dim=-1)
158
-
159
-
160
- output = torch.matmul(attention, V)
161
-
162
- output_2 = torch.matmul(attention_2, V_2)
163
-
164
- output = output.transpose(1, 2).reshape([batch, len_q, self.d_v*self.n_head])
165
-
166
- output_2 = output_2.transpose(1, 2).reshape([batch, len_k, self.d_v*self.n_head])
167
-
168
- output = self.W_O(output)
169
-
170
- output_2 = self.W_O_2(output_2)
171
-
172
- output = self.dropout(output)
173
-
174
- output = self.layer_norm(output + q)
175
-
176
- output_2 = self.dropout_2(output_2)
177
-
178
- output_2 = self.layer_norm_2(output_2 + k)
179
-
180
-
181
-
182
-
183
-
184
- return output, output_2, attention, attention_2
185
-
186
-
187
- class FFN(nn.Module):
188
-
189
- def __init__(self, d_in, d_hid, dropout=0.1):
190
- super().__init__()
191
-
192
- self.layer_1 = nn.Conv1d(d_in, d_hid,1)
193
- self.layer_2 = nn.Conv1d(d_hid, d_in,1)
194
- self.relu = nn.ReLU()
195
- self.layer_norm = nn.LayerNorm(d_in)
196
-
197
- self.dropout = nn.Dropout(dropout)
198
-
199
- def forward(self, x):
200
-
201
- residual = x
202
- output = self.layer_1(x.transpose(1, 2))
203
-
204
- output = self.relu(output)
205
-
206
- output = self.layer_2(output)
207
-
208
- output = self.dropout(output)
209
-
210
- output = self.layer_norm(output.transpose(1, 2)+residual)
211
-
212
- return output
213
-
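Editorial note (hedged): a small shape smoke test for the modules above, using the same dimension settings that finetune.py passes to PeptideModel (d_model=64, n_head=6, d_k=64, d_v=128, d_inner=64). Sketch only, assuming the classes defined in this deleted file are importable:

    import torch

    mha = MultiHeadAttentionSequence(n_head=6, d_model=64, d_k=64, d_v=128)
    ffn = FFN(d_in=64, d_hid=64)

    x = torch.randn(2, 10, 64)        # (batch, seq_len, d_model)
    out, attn = mha(x, x, x)          # self-attention over the sequence
    print(out.shape)                  # torch.Size([2, 10, 64])
    print(attn.shape)                 # torch.Size([2, 6, 10, 10]), one map per head
    print(ffn(out).shape)             # torch.Size([2, 10, 64])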
 
muppit/models/score_domain.py DELETED
@@ -1,40 +0,0 @@
- from scipy.stats import norm
- import numpy as np
- import os
-
-
- def score(outputs):
-
-     weight = 0.03
-     binding_size_dist = np.load(os.path.join(os.path.dirname(__file__), "../params/binding_size_train_dist.npy"))
-
-
-     mean = np.mean(binding_size_dist)
-
-     std = np.std(binding_size_dist)
-
-     dist = norm(mean, std)
-
-
-     max_score = 0
-
-
-
-     scores = np.exp(outputs[0])[:, 1]
-
-     indices = np.argsort(-1*scores)
-
-     for j in range(1, len(indices)):
-
-
-
-         score = (1-weight)*np.mean(scores[indices[:j]]) + weight*(dist.pdf(j/len(indices)))
-
-
-         if score > max_score:
-
-             max_score = score
-
-
-     return max_score
-
 
muppit/predict.py DELETED
@@ -1,118 +0,0 @@
1
- import torch
2
- import pytorch_lightning as pl
3
- from torch.utils.data import DataLoader
4
- from datasets import load_from_disk
5
- from transformers import AutoTokenizer
6
- from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
7
- from argparse import ArgumentParser
8
- import os
9
- import torch.distributed as dist
10
-
11
- from models import * # Import your model and other necessary classes/functions here
12
-
13
-
14
- class PeptideModel(pl.LightningModule):
15
- def __init__(self, n_layers, d_model, n_head,
16
- d_k, d_v, d_inner, dropout=0.2,
17
- learning_rate=0.00001, max_epochs=15):
18
- super(PeptideModel, self).__init__()
19
-
20
- self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
21
- # freeze all the esm_model parameters
22
- for param in self.esm_model.parameters():
23
- param.requires_grad = False
24
-
25
- self.repeated_module = RepeatedModule2(n_layers, d_model,
26
- n_head, d_k, d_v, d_inner, dropout=dropout)
27
-
28
- self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
29
- d_k, d_v, dropout=dropout)
30
-
31
- self.final_ffn = FFN(d_model, d_inner, dropout=dropout)
32
-
33
- self.output_projection_prot = nn.Linear(d_model, 1)
34
-
35
- self.learning_rate = learning_rate
36
- self.max_epochs = max_epochs
37
-
38
- self.classification_threshold = nn.Parameter(torch.tensor(0.5)) # Initial threshold
39
- self.historical_memory = 0.9
40
- self.class_weights = torch.tensor(
41
- [3.000471363174231, 0.5999811490272925]) # binding-site weights, non-binding-site weights
42
-
43
- def forward(self, binder_tokens, target_tokens):
44
- peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
45
- protein_sequence = self.esm_model(**target_tokens).last_hidden_state
46
-
47
- prot_enc, sequence_enc, sequence_attention_list, prot_attention_list, \
48
- seq_prot_attention_list, seq_prot_attention_list = self.repeated_module(peptide_sequence,
49
- protein_sequence)
50
-
51
- prot_enc, final_prot_seq_attention = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)
52
-
53
- prot_enc = self.final_ffn(prot_enc)
54
-
55
- prot_enc = self.output_projection_prot(prot_enc)
56
-
57
- return torch.sigmoid(prot_enc)
58
-
59
-
60
- def main():
61
- parser = ArgumentParser()
62
- parser.add_argument("-sm", default='/home/tc415/muPPIt/muppit/train_base_1/model-epoch=14-val_loss=0.40.ckpt',
63
- help="File containing initial params", type=str)
64
- parser.add_argument("-batch_size", type=int, default=32, help="Batch size")
65
- parser.add_argument("-lr", type=float, default=1e-3)
66
- parser.add_argument("-n_layers", type=int, default=6, help="Number of layers")
67
- parser.add_argument("-d_model", type=int, default=64, help="Dimension of model")
68
- parser.add_argument("-n_head", type=int, default=6, help="Number of heads")
69
- parser.add_argument("-d_inner", type=int, default=64)
70
- parser.add_argument("-target", type=str)
71
- parser.add_argument("-binder", type=str)
72
- args = parser.parse_args()
73
- # print(args)
74
- device = 'cuda:0'
75
-
76
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
77
-
78
- anchor_tokens = tokenizer(args.target, return_tensors='pt', padding=True, truncation=True, max_length=40000)
79
-
80
- positive_tokens = tokenizer(args.binder, return_tensors='pt', padding=True, truncation=True, max_length=40000)
81
-
82
- anchor_tokens['attention_mask'][0][0] = 0
83
- anchor_tokens['attention_mask'][0][-1] = 0
84
- positive_tokens['attention_mask'][0][0] = 0
85
- positive_tokens['attention_mask'][0][-1] = 0
86
-
87
- target_tokens = {'input_ids': anchor_tokens["input_ids"].to(device),
88
- 'attention_mask': anchor_tokens["attention_mask"].to(device)}
89
- binder_tokens = {'input_ids': positive_tokens['input_ids'].to(device),
90
- 'attention_mask': positive_tokens['attention_mask'].to(device)}
91
-
92
- print(binder_tokens['input_ids'].shape)
93
-
94
- model = PeptideModel.load_from_checkpoint(args.sm,
95
- n_layers=args.n_layers,
96
- d_model=args.d_model,
97
- n_head=args.n_head,
98
- d_k=64,
99
- d_v=128,
100
- d_inner=64).to(device)
101
-
102
- model.eval()
103
-
104
- prediction = model(binder_tokens, target_tokens).squeeze(-1)[0][1:-1]
105
- print(prediction.shape)
106
- print(model.classification_threshold)
107
-
108
- binding_site = []
109
- for i in range(len(prediction)):
110
- if prediction[i] >= model.classification_threshold:
111
- binding_site.append(i)
112
-
113
- print(binding_site)
114
- print(len(binding_site))
115
-
116
-
117
- if __name__ == "__main__":
118
- main()
 
muppit/scripts/.gitkeep DELETED
File without changes
muppit/scripts/predict_binding_site.py DELETED
@@ -1,149 +0,0 @@
1
- from Bio import SeqIO
2
- from Bio.PDB import Polypeptide
3
- from transformers import BertModel, BertTokenizer, pipeline
4
- from pepnn_seq.models import FullModel
5
- from pepnn_seq.models import score
6
- import pandas as pd
7
- import numpy as np
8
- import torch
9
- import argparse
10
- import os
11
-
12
-
13
- def to_var(x):
14
- if torch.cuda.is_available():
15
- x = x.cuda()
16
- return x
17
-
18
-
19
- if __name__ == "__main__":
20
-
21
- parser = argparse.ArgumentParser()
22
-
23
-
24
- parser.add_argument("-prot", dest="input_protein_file", required=False, type=str,
25
- help="Fasta file with protein sequence")
26
-
27
- parser.add_argument("-pep", dest="input_peptide_file", required=False, type=str, default=None,
28
- help="Fasta file with peptide sequence")
29
-
30
-
31
-
32
- parser.add_argument("-o", dest="output_directory", required=False, type=str, default=None,
33
- help="Output directory")
34
-
35
-
36
- parser.add_argument("-p", dest="params", required=False, type=str, default="../params/params.pth",
37
- help="Model parameters")
38
-
39
- args = parser.parse_args()
40
-
41
- if args.output_directory == None:
42
- output_directory = os.path.split(args.input_protein_file)[-1].split(".")[0] + "_seq"
43
- else:
44
- output_directory = args.output_directory
45
-
46
- if not os.path.exists(output_directory):
47
- os.mkdir(output_directory)
48
-
49
-
50
- records = SeqIO.parse(args.input_protein_file, format="fasta")
51
-
52
- prot_sequence = ' '.join(list(records)[0].seq)
53
-
54
-
55
- protbert_dir = os.path.join(os.path.dirname(__file__), '../models/ProtBert-BFD/')
56
-
57
- vocabFilePath = os.path.join(protbert_dir, 'vocab.txt')
58
- tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False )
59
- seq_embedding = BertModel.from_pretrained(protbert_dir)
60
-
61
- if torch.cuda.is_available():
62
- seq_embedding = pipeline('feature-extraction', model=seq_embedding, tokenizer=tokenizer, device=0)
63
- else:
64
- seq_embedding = pipeline('feature-extraction', model=seq_embedding, tokenizer=tokenizer, device=-1)
65
-
66
- embedding = seq_embedding(prot_sequence)
67
-
68
- embedding = np.array(embedding)
69
-
70
- seq_len = len(prot_sequence.replace(" ", ""))
71
- start_Idx = 1
72
- end_Idx = seq_len+1
73
- seq_emd = embedding[0][start_Idx:end_Idx]
74
-
75
-
76
- prot_seq = to_var(torch.FloatTensor(seq_emd).unsqueeze(0))
77
-
78
-
79
-
80
- if args.input_peptide_file != None:
81
-
82
- records = SeqIO.parse(args.input_peptide_file, format="fasta")
83
-
84
- pep_sequence = str(list(records)[0].seq).replace("X", "")
85
-
86
- pep_sequence = [Polypeptide.d1_to_index[i] for i in pep_sequence]
87
-
88
- else:
89
-
90
- pep_sequence = [5 for i in range(10)]
91
-
92
- pep_seq = to_var(torch.LongTensor(pep_sequence).unsqueeze(0))
93
-
94
-
95
- model = FullModel(6, 64, 6,
96
-
97
- 64, 128, 64, dropout=0.2)
98
- if torch.cuda.is_available():
99
- model.load_state_dict(torch.load(os.path.join(os.path.dirname(__file__), args.params)))
100
- else:
101
- model.load_state_dict(torch.load(os.path.join(os.path.dirname(__file__), args.params),
102
- map_location='cpu'))
103
-
104
- if torch.cuda.is_available():
105
- torch.cuda.empty_cache()
106
- model.cuda()
107
-
108
-
109
- model.eval()
110
-
111
- if torch.cuda.is_available():
112
- outputs = model(pep_seq, prot_seq).cpu().detach().numpy()
113
- else:
114
- outputs = model(pep_seq, prot_seq).detach().numpy()
115
-
116
- # compute score for the domain and output file
117
-
118
- score_prm = score(outputs)
119
-
120
- with open(output_directory + "/prm_score.txt", 'w') as output_file:
121
-
122
- output_file.writelines("The input protein's score is {0:.2f}".format(score_prm))
123
-
124
-
125
- # output prediction as csv
126
-
127
- outputs = np.exp(outputs[0])
128
-
129
- amino_acids = []
130
-
131
- probabilities = []
132
-
133
- position = []
134
- for index, aa in enumerate(prot_sequence.split(" ")):
135
- probabilities.append(outputs[index, 1])
136
- amino_acids.append(aa)
137
- position.append(index+1)
138
-
139
-
140
- output = pd.DataFrame()
141
-
142
- output["Position"] = position
143
- output["Amino acid"] = amino_acids
144
- output["Probabilities"] = probabilities
145
-
146
-
147
-
148
- output.to_csv(output_directory + "/binding_site_prediction.csv", index=False)
149
-
 
muppit/test_evaluator.py DELETED
@@ -1,197 +0,0 @@
1
- import torch
2
- import pytorch_lightning as pl
3
- from torch.utils.data import DataLoader
4
- from datasets import load_from_disk
5
- from transformers import AutoTokenizer, EsmModel
6
- from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
7
- from argparse import ArgumentParser
8
- import os
9
- import torch.distributed as dist
10
-
11
- from models import *  # project-local module; assumed to provide RepeatedModule2, MultiHeadAttentionSequence, FFN, nn, and F used below
12
-
13
-
14
- def collate_fn(batch):
15
- # Unpack the batch
16
- anchors = []
17
- positives = []
18
- # negatives = []
19
- binding_sites = []
20
-
21
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
22
-
23
- for b in batch:
24
- anchors.append(b['anchors'])
25
- positives.append(b['positives'])
26
- # negatives.append(b['negatives'])
27
- binding_sites.append(b['binding_site'])
28
-
29
- # Collate the tensors using torch's pad_sequence
30
- anchor_input_ids = torch.nn.utils.rnn.pad_sequence(
31
- [torch.Tensor(item['input_ids']).squeeze(0) for item in anchors], batch_first=True, padding_value=tokenizer.pad_token_id)
32
- anchor_attention_mask = torch.nn.utils.rnn.pad_sequence(
33
- [torch.Tensor(item['attention_mask']).squeeze(0) for item in anchors], batch_first=True, padding_value=0)
34
-
35
- positive_input_ids = torch.nn.utils.rnn.pad_sequence(
36
- [torch.Tensor(item['input_ids']).squeeze(0) for item in positives], batch_first=True, padding_value=tokenizer.pad_token_id)
37
- positive_attention_mask = torch.nn.utils.rnn.pad_sequence(
38
- [torch.Tensor(item['attention_mask']).squeeze(0) for item in positives], batch_first=True, padding_value=0)
39
-
40
- n, max_length = anchor_input_ids.shape[0], anchor_input_ids.shape[1]
41
- site = torch.zeros(n, max_length)
42
- for i in range(len(binding_sites)):
43
- binding_site = binding_sites[i]
44
- site[i, binding_site] = 1
45
-
46
- # Return the collated batch
47
- return {
48
- 'anchor_input_ids': anchor_input_ids.int(),
49
- 'anchor_attention_mask': anchor_attention_mask.int(),
50
- 'positive_input_ids': positive_input_ids.int(),
51
- 'positive_attention_mask': positive_attention_mask.int(),
52
- # 'negative_input_ids': negative_input_ids.int(),
53
- # 'negative_attention_mask': negative_attention_mask.int(),
54
- 'binding_site': site
55
- }
56
-
57
-
58
- class CustomDataModule(pl.LightningDataModule):
59
- def __init__(self, tokenizer, batch_size: int = 128):
60
- super().__init__()
61
- self.batch_size = batch_size
62
- self.tokenizer = tokenizer
63
-
64
- def test_dataloader(self):
65
- test_dataset = load_from_disk('/home/tc415/muPPIt/dataset/test_dataset_drop_500')
66
- return DataLoader(test_dataset, batch_size=self.batch_size, collate_fn=collate_fn, num_workers=8, pin_memory=True)
67
-
68
-
69
- class PeptideModel(pl.LightningModule):
70
- def __init__(self, n_layers, d_model, n_head,
71
- d_k, d_v, d_inner, dropout=0.2,
72
- learning_rate=0.00001, max_epochs=15):
73
- super(PeptideModel, self).__init__()
74
-
75
- self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
76
- # freeze all the esm_model parameters
77
- for param in self.esm_model.parameters():
78
- param.requires_grad = False
79
-
80
- self.repeated_module = RepeatedModule2(n_layers, d_model,
81
- n_head, d_k, d_v, d_inner, dropout=dropout)
82
-
83
- self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
84
- d_k, d_v, dropout=dropout)
85
-
86
- self.final_ffn = FFN(d_model, d_inner, dropout=dropout)
87
-
88
- self.output_projection_prot = nn.Linear(d_model, 1)
89
-
90
- self.learning_rate = learning_rate
91
- self.max_epochs = max_epochs
92
-
93
- self.classification_threshold = nn.Parameter(torch.tensor(0.5)) # Initial threshold
94
- self.historical_memory = 0.9
95
- self.class_weights = torch.tensor([3.000471363174231, 0.5999811490272925])  # binding-site weight, non-binding-site weight
96
-
97
- def forward(self, binder_tokens, target_tokens):
98
- peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
99
- protein_sequence = self.esm_model(**target_tokens).last_hidden_state
100
-
101
- prot_enc, sequence_enc, sequence_attention_list, prot_attention_list, \
102
- seq_prot_attention_list, prot_seq_attention_list = self.repeated_module(peptide_sequence,
103
- protein_sequence)
104
-
105
- prot_enc, final_prot_seq_attention = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)
106
-
107
- prot_enc = self.final_ffn(prot_enc)
108
-
109
- prot_enc = self.output_projection_prot(prot_enc)
110
-
111
- return prot_enc
112
-
113
- def test_step(self, batch, batch_idx):
114
- target_tokens = {'input_ids': batch['anchor_input_ids'].to(self.device),
115
- 'attention_mask': batch['anchor_attention_mask'].to(self.device)}
116
- binder_tokens = {'input_ids': batch['positive_input_ids'].to(self.device),
117
- 'attention_mask': batch['positive_attention_mask'].to(self.device)}
118
- binding_site = batch['binding_site'].to(self.device)
119
- mask = target_tokens['attention_mask']
120
-
121
- outputs_nodes = self.forward(binder_tokens, target_tokens).squeeze(-1)
122
-
123
- weight = self.class_weights[0] * binding_site + self.class_weights[1] * (1 - binding_site)
124
- loss = F.binary_cross_entropy_with_logits(outputs_nodes, binding_site, weight=weight, reduction='none')
125
-
126
- masked_loss = loss * mask
127
- mean_loss = masked_loss.sum() / mask.sum()
128
-
129
- sigmoid_outputs = torch.sigmoid(outputs_nodes)
130
- total = mask.sum()
131
-
132
- # self.update_class_thresholds(sigmoid_outputs, binding_site, mask)
133
- # self.log('threshold', self.classification_threshold, on_epoch=True)
134
-
135
- predict = (sigmoid_outputs >= self.classification_threshold).float()
136
- correct = ((predict == binding_site) * mask).sum()
137
- accuracy = correct / total
138
-
139
- outputs_nodes_flat = sigmoid_outputs[mask.bool()].float().cpu().detach().numpy().flatten()
140
- binding_site_flat = binding_site[mask.bool()].float().cpu().detach().numpy().flatten()
141
- predictions_flat = predict[mask.bool()].float().cpu().detach().numpy().flatten()
142
-
143
- auc = roc_auc_score(binding_site_flat, outputs_nodes_flat)
144
- f1 = f1_score(binding_site_flat, predictions_flat)
145
- mcc = matthews_corrcoef(binding_site_flat, predictions_flat)
146
-
147
- self.log('test_loss', mean_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
148
- self.log('test_accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
149
- self.log('test_auc', auc, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
150
- self.log('test_f1', f1, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
151
- self.log('test_mcc', mcc, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
152
-
153
-
154
- def main():
155
- parser = ArgumentParser()
156
- parser.add_argument("-sm", default='/home/tc415/muPPIt/muppit/train_base_1/model-epoch=14-val_loss=0.40.ckpt', help="File containing initial params", type=str)
157
- parser.add_argument("-batch_size", type=int, default=32, help="Batch size")
158
- parser.add_argument("-lr", type=float, default=1e-3)
159
- parser.add_argument("-n_layers", type=int, default=6, help="Number of layers")
160
- parser.add_argument("-d_model", type=int, default=64, help="Dimension of model")
161
- parser.add_argument("-n_head", type=int, default=6, help="Number of heads")
162
- parser.add_argument("-d_inner", type=int, default=64)
163
- args = parser.parse_args()
164
- print(args.sm)
165
-
166
- # Initialize the process group for distributed training
167
- dist.init_process_group(backend='nccl')
168
-
169
- test_dataset = load_from_disk('/home/tc415/muPPIt/dataset/pep_prot_test')  # loaded for inspection only; CustomDataModule reads its own test split
170
- # print(len(test_dataset))
171
-
172
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
173
-
174
- data_module = CustomDataModule(tokenizer, args.batch_size)
175
-
176
- model = PeptideModel.load_from_checkpoint(args.sm,
177
- n_layers=args.n_layers,
178
- d_model=args.d_model,
179
- n_head=args.n_head,
180
- d_k=64,
181
- d_v=128,
182
- d_inner=64)
183
-
184
- print(f"Class threshold = {model.classification_threshold}")
185
-
186
- trainer = pl.Trainer(accelerator='gpu',
187
- devices=[0,1,2,3,4,5,6,7],
188
- strategy='ddp',
189
- precision='bf16')
190
-
191
- results = trainer.test(model, datamodule=data_module)
192
-
193
- print(results)
194
-
195
-
196
- if __name__ == "__main__":
197
- main()
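Note on the deleted test_evaluator.py above: the heart of collate_fn is padding variable-length tokenized sequences and expanding per-sequence binding-site index lists into a dense 0/1 matrix aligned with the padded targets. A minimal standalone sketch of that logic with toy tensors (pad id 1 is assumed here, matching ESM-2's pad token):

import torch
from torch.nn.utils.rnn import pad_sequence

# Two "tokenized" targets of different lengths (toy values; 1 stands in for the pad id).
anchor_ids = [torch.tensor([0, 5, 6, 7, 2]), torch.tensor([0, 8, 9, 2])]
padded = pad_sequence(anchor_ids, batch_first=True, padding_value=1)  # shape (2, 5)

# Binding sites arrive as index lists per sequence and are expanded to a dense 0/1 matrix.
binding_sites = [[1, 3], [2]]
site = torch.zeros(padded.shape[0], padded.shape[1])
for i, idx in enumerate(binding_sites):
    site[i, idx] = 1

print(padded)
print(site)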
 
muppit/train_evaluator.py DELETED
@@ -1,408 +0,0 @@
1
- import pdb
2
- from pytorch_lightning.strategies import DDPStrategy
3
- import torch
4
- import torch.nn.functional as F
5
- from torch.utils.data import DataLoader, DistributedSampler
6
- from datasets import load_from_disk
7
- import pytorch_lightning as pl
8
- from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, \
9
- Timer, TQDMProgressBar, LearningRateMonitor, StochasticWeightAveraging, GradientAccumulationScheduler
10
- from pytorch_lightning.loggers import WandbLogger
11
- from torch.optim.lr_scheduler import _LRScheduler
12
- from transformers.optimization import get_cosine_schedule_with_warmup
13
- from argparse import ArgumentParser
14
- import os
15
- import uuid
16
- import numpy as np
17
- import torch.distributed as dist
18
- from models import *  # project-local module; assumed to provide RepeatedModule2, MultiHeadAttentionSequence, FFN, and nn used below
19
- from torch.nn.utils.rnn import pad_sequence
20
- from transformers import AutoTokenizer, EsmModel
21
- from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
22
- from torch.optim import Adam, AdamW
23
- from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
24
- import gc
25
-
26
-
27
- os.environ["TORCH_CPP_LOG_LEVEL"]="INFO"
28
- os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
29
- os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
30
-
31
-
32
- def compute_class_weights(targets):
33
- num_binding_sites = targets.sum()
34
- num_non_binding_sites = targets.numel() - num_binding_sites
35
- total = num_binding_sites + num_non_binding_sites
36
- weight_for_binding = total / (2 * num_binding_sites)
37
- weight_for_non_binding = total / (2 * num_non_binding_sites)
38
- return torch.tensor([weight_for_non_binding, weight_for_binding])
39
-
40
-
41
- def collate_fn(batch):
42
- # Unpack the batch
43
- anchors = []
44
- positives = []
45
- # negatives = []
46
- binding_sites = []
47
-
48
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
49
-
50
- for b in batch:
51
- anchors.append(b['anchors'])
52
- positives.append(b['positives'])
53
- # negatives.append(b['negatives'])
54
- binding_sites.append(b['binding_site'])
55
-
56
- # Collate the tensors using torch's pad_sequence
57
- anchor_input_ids = torch.nn.utils.rnn.pad_sequence(
58
- [torch.Tensor(item['input_ids']).squeeze(0) for item in anchors], batch_first=True, padding_value=tokenizer.pad_token_id)
59
- anchor_attention_mask = torch.nn.utils.rnn.pad_sequence(
60
- [torch.Tensor(item['attention_mask']).squeeze(0) for item in anchors], batch_first=True, padding_value=0)
61
-
62
- positive_input_ids = torch.nn.utils.rnn.pad_sequence(
63
- [torch.Tensor(item['input_ids']).squeeze(0) for item in positives], batch_first=True, padding_value=tokenizer.pad_token_id)
64
- positive_attention_mask = torch.nn.utils.rnn.pad_sequence(
65
- [torch.Tensor(item['attention_mask']).squeeze(0) for item in positives], batch_first=True, padding_value=0)
66
-
67
- # negative_input_ids = torch.nn.utils.rnn.pad_sequence(
68
- # [torch.Tensor(item['input_ids']).squeeze(0) for item in negatives], batch_first=True, padding_value=tokenizer.pad_token_id)
69
- # negative_attention_mask = torch.nn.utils.rnn.pad_sequence(
70
- # [torch.Tensor(item['attention_mask']).squeeze(0) for item in negatives], batch_first=True, padding_value=0)
71
-
72
- # assert anchor_input_ids.shape == negative_input_ids.shape
73
-
74
- n, max_length = anchor_input_ids.shape[0], anchor_input_ids.shape[1]
75
- site = torch.zeros(n, max_length)
76
- for i in range(len(binding_sites)):
77
- binding_site = binding_sites[i]
78
- site[i, binding_site] = 1
79
-
80
- # Return the collated batch
81
- return {
82
- 'anchor_input_ids': anchor_input_ids.int(),
83
- 'anchor_attention_mask': anchor_attention_mask.int(),
84
- 'positive_input_ids': positive_input_ids.int(),
85
- 'positive_attention_mask': positive_attention_mask.int(),
86
- # 'negative_input_ids': negative_input_ids.int(),
87
- # 'negative_attention_mask': negative_attention_mask.int(),
88
- 'binding_site': site
89
- }
90
-
91
-
92
- class CustomDataModule(pl.LightningDataModule):
93
- def __init__(self, train_dataset, val_dataset, tokenizer, batch_size: int = 128):
94
- super().__init__()
95
- self.train_dataset = train_dataset
96
- self.val_dataset = val_dataset
97
- self.batch_size = batch_size
98
- self.tokenizer = tokenizer
99
-
100
- def train_dataloader(self):
101
- return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn,
102
- num_workers=8, pin_memory=True)
103
-
104
- def val_dataloader(self):
105
- return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=collate_fn, num_workers=8,
106
- pin_memory=True)
107
-
108
- def setup(self, stage=None):
109
- if stage == 'test' or stage is None:
110
- self.test_dataset = load_from_disk('/home/tc415/muPPIt/dataset/test_dataset_static')
111
- self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=collate_fn,
112
- num_workers=8, pin_memory=True)
113
-
114
-
115
- class CosineAnnealingWithWarmup(_LRScheduler):
116
- def __init__(self, optimizer, warmup_steps, total_steps, base_lr, max_lr, min_lr, last_epoch=-1):
117
- self.warmup_steps = warmup_steps
118
- self.total_steps = total_steps
119
- self.base_lr = base_lr
120
- self.max_lr = max_lr
121
- self.min_lr = min_lr
122
- super(CosineAnnealingWithWarmup, self).__init__(optimizer, last_epoch)
123
- print(f"SELF BASE LRS = {self.base_lrs}")
124
-
125
- def get_lr(self):
126
- if self.last_epoch < self.warmup_steps:
127
- # Linear warmup phase from base_lr to max_lr
128
- return [self.base_lr + (self.max_lr - self.base_lr) * (self.last_epoch / self.warmup_steps) for _ in self.base_lrs]
129
-
130
- # Cosine annealing phase from max_lr to min_lr
131
- progress = (self.last_epoch - self.warmup_steps) / (self.total_steps - self.warmup_steps)
132
- cosine_decay = 0.5 * (1 + np.cos(np.pi * progress))
133
- decayed_lr = self.min_lr + (self.max_lr - self.min_lr) * cosine_decay
134
-
135
- return [decayed_lr for _ in self.base_lrs]
136
-
137
- class PeptideModel(pl.LightningModule):
138
- def __init__(self, n_layers, d_model, n_head,
139
- d_k, d_v, d_inner, dropout=0.2,
140
- learning_rate=0.00001, max_epochs=15):
141
- super(PeptideModel, self).__init__()
142
-
143
- self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
144
- # freeze all the esm_model parameters
145
- for param in self.esm_model.parameters():
146
- param.requires_grad = False
147
-
148
- self.repeated_module = RepeatedModule2(n_layers, d_model,
149
- n_head, d_k, d_v, d_inner, dropout=dropout)
150
-
151
- self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
152
- d_k, d_v, dropout=dropout)
153
-
154
- self.final_ffn = FFN(d_model, d_inner, dropout=dropout)
155
-
156
- self.output_projection_prot = nn.Linear(d_model, 1)
157
-
158
- self.learning_rate = learning_rate
159
- self.max_epochs = max_epochs
160
-
161
- self.classification_threshold = nn.Parameter(torch.tensor(0.5)) # Initial threshold
162
- self.historical_memory = 0.9
163
- self.class_weights = torch.tensor([3.000471363174231, 0.5999811490272925])  # binding-site weight, non-binding-site weight
164
-
165
- def forward(self, binder_tokens, target_tokens):
166
- peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
167
- protein_sequence = self.esm_model(**target_tokens).last_hidden_state
168
-
169
- prot_enc, sequence_enc, sequence_attention_list, prot_attention_list, \
170
- seq_prot_attention_list, prot_seq_attention_list = self.repeated_module(peptide_sequence,
171
- protein_sequence)
172
-
173
- prot_enc, final_prot_seq_attention = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)
174
-
175
- prot_enc = self.final_ffn(prot_enc)
176
-
177
- prot_enc = self.output_projection_prot(prot_enc)
178
-
179
- return prot_enc
180
-
181
- def training_step(self, batch, batch_idx):
182
- opt = self.optimizers()
183
- lr = opt.param_groups[0]['lr']
184
- self.log('learning_rate', lr, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
185
-
186
- target_tokens = {'input_ids': batch['anchor_input_ids'].to(self.device),
187
- 'attention_mask': batch['anchor_attention_mask'].to(self.device)}
188
- binder_tokens = {'input_ids': batch['positive_input_ids'].to(self.device),
189
- 'attention_mask': batch['positive_attention_mask'].to(self.device)}
190
- binding_site = batch['binding_site'].to(self.device)
191
- mask = target_tokens['attention_mask']
192
-
193
- outputs_nodes = self.forward(binder_tokens, target_tokens).squeeze(-1)
194
-
195
- weight = self.class_weights[0] * binding_site + self.class_weights[1] * (1 - binding_site)
196
- loss = F.binary_cross_entropy_with_logits(outputs_nodes, binding_site, weight=weight, reduction='none')
197
-
198
- masked_loss = loss * mask
199
- mean_loss = masked_loss.sum() / mask.sum()
200
-
201
- # print('logging')
202
- self.log('train_loss', mean_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
203
- return mean_loss
204
-
205
- def validation_step(self, batch, batch_idx):
206
- target_tokens = {'input_ids': batch['anchor_input_ids'].to(self.device),
207
- 'attention_mask': batch['anchor_attention_mask'].to(self.device)}
208
- binder_tokens = {'input_ids': batch['positive_input_ids'].to(self.device),
209
- 'attention_mask': batch['positive_attention_mask'].to(self.device)}
210
- binding_site = batch['binding_site'].to(self.device)
211
- mask = target_tokens['attention_mask']
212
-
213
- outputs_nodes = self.forward(binder_tokens, target_tokens).squeeze(-1)
214
-
215
- weight = self.class_weights[0] * binding_site + self.class_weights[1] * (1 - binding_site)
216
- loss = F.binary_cross_entropy_with_logits(outputs_nodes, binding_site, weight=weight, reduction='none')
217
-
218
- # Apply the mask to the loss
219
- masked_loss = loss * mask
220
-
221
- # Compute the mean loss only over the valid positions
222
- mean_loss = masked_loss.sum() / mask.sum()
223
-
224
- # Calculate predictions and apply mask
225
- sigmoid_outputs = torch.sigmoid(outputs_nodes)
226
- total = mask.sum()
227
-
228
- self.update_class_thresholds(sigmoid_outputs, binding_site, mask)
229
- self.log('threshold', self.classification_threshold, on_epoch=True)
230
-
231
- predict = (sigmoid_outputs >= self.classification_threshold).float()
232
- correct = ((predict == binding_site) * mask).sum()
233
- accuracy = correct / total
234
-
235
- # Compute AUC
236
- outputs_nodes_flat = sigmoid_outputs[mask.bool()].float().cpu().detach().numpy().flatten()
237
- binding_site_flat = binding_site[mask.bool()].float().cpu().detach().numpy().flatten()
238
- predictions_flat = predict[mask.bool()].float().cpu().detach().numpy().flatten()
239
-
240
- auc = roc_auc_score(binding_site_flat, outputs_nodes_flat)
241
- f1 = f1_score(binding_site_flat, predictions_flat)
242
- mcc = matthews_corrcoef(binding_site_flat, predictions_flat)
243
-
244
- self.log('val_loss', mean_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
245
- self.log('val_accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
246
- self.log('val_auc', auc, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
247
- self.log('val_f1', f1, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
248
- self.log('val_mcc', mcc, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
249
-
250
- def configure_optimizers(self):
251
- print(f"MAX STEPS = {self.max_epochs}")
252
- optimizer = AdamW(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.95))
253
- # schedulers = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=0.1*self.max_epochs,
254
- # max_epochs=self.max_epochs,
255
- # warmup_start_lr=5e-4,
256
- # eta_min=0.1 * self.learning_rate)
257
-
258
- base_lr = 1e-4
259
- max_lr = self.learning_rate
260
- min_lr = 0.1 * self.learning_rate
261
-
262
- schedulers = CosineAnnealingWithWarmup(optimizer, warmup_steps=692, total_steps=8293,
263
- base_lr=base_lr, max_lr=max_lr, min_lr=min_lr)
264
-
265
- lr_schedulers = {
266
- "scheduler": schedulers,
267
- "name": 'learning_rate_logs',
268
- "interval": 'step', # The scheduler updates the learning rate at every step (not epoch)
269
- 'frequency': 1 # The scheduler updates the learning rate after every batch
270
- }
271
- return [optimizer], [lr_schedulers]
272
-
273
- def update_class_thresholds(self, inputs, targets, mask):
274
- with torch.no_grad():
275
- min_threshold_value = 0.001
276
- thresholds = torch.arange(0.1, 1.0, 0.05, device=inputs.device)
277
-
278
- best_f1_score = 0
279
- best_threshold = min_threshold_value
280
-
281
- for threshold in thresholds:
282
- binary_predictions = (inputs >= threshold).float()
283
-
284
- tp = ((binary_predictions * targets) * mask).sum().item()
285
- fp = ((binary_predictions * (1 - targets)) * mask).sum().item()
286
- fn = (((1 - binary_predictions) * targets) * mask).sum().item()
287
-
288
- precision = tp / (tp + fp + 1e-7)
289
- recall = tp / (tp + fn + 1e-7)
290
- f1_score = 2 * precision * recall / (precision + recall + 1e-7)
291
-
292
- if f1_score > best_f1_score:
293
- best_f1_score = f1_score
294
- best_threshold = threshold
295
-
296
- updated_threshold = self.historical_memory * self.classification_threshold + (
297
- 1 - self.historical_memory) * best_threshold
298
- self.classification_threshold = nn.Parameter(torch.clamp(updated_threshold, min=min_threshold_value))
299
- gc.collect()
300
- torch.cuda.empty_cache()
301
-
302
- def training_epoch_end(self, outputs):
303
- gc.collect()
304
- torch.cuda.empty_cache()
305
- super().training_epoch_end(outputs)
306
-
307
- def validation_epoch_end(self, outputs):
308
- gc.collect()
309
- torch.cuda.empty_cache()
310
- super().validation_epoch_end(outputs)
311
-
312
-
313
-
314
-
315
- def main():
316
- parser = ArgumentParser()
317
-
318
- parser.add_argument("-o", dest="output_file", help="File for output of model parameters", required=True, type=str)
319
- parser.add_argument("-d", dest="dataset", required=False, type=str, default="pepnn",
320
- help="Which dataset to train on, pepnn, pepbind, or interpep")
321
- parser.add_argument("-lr", type=float, default=1e-3)
322
- parser.add_argument("-batch_size", type=int, default=2, help="Batch size")
323
- parser.add_argument("-n_layers", type=int, default=6, help="Number of layers")
324
- parser.add_argument("-d_model", type=int, default=64, help="Dimension of model")
325
- parser.add_argument("-n_head", type=int, default=6, help="Number of heads")
326
- parser.add_argument("-d_inner", type=int, default=64)
327
- # parser.add_argument("-sm", dest="saved_model", help="File containing initial params", required=False, type=str,
328
- # default=None)
329
- parser.add_argument("-sm", default=None, help="File containing initial params", type=str)
330
- parser.add_argument("--max_epochs", type=int, default=15, help="Max number of epochs to train")
331
- args = parser.parse_args()
332
-
333
- # Initialize the process group for distributed training
334
- dist.init_process_group(backend='nccl')
335
-
336
- train_dataset = load_from_disk('/home/tc415/muPPIt/dataset/train_dataset_drop_500')
337
- val_dataset = load_from_disk('/home/tc415/muPPIt/dataset/val_dataset_drop_500')
338
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
339
-
340
- data_module = CustomDataModule(train_dataset, val_dataset, tokenizer=tokenizer, batch_size=args.batch_size)
341
-
342
- # Calculate the number of training steps and warm-up steps
343
- train_dataloader = data_module.train_dataloader()
344
- num_training_steps = len(train_dataloader) * args.max_epochs
345
- num_warmup_steps = int(0.1 * num_training_steps) # Warm-up for 10% of training steps
346
-
347
- model = PeptideModel(6, 64, 6, 64, 128, 64, dropout=0.2,
348
- learning_rate=args.lr, max_epochs=num_training_steps)
349
- if args.sm:
350
- model = PeptideModel.load_from_checkpoint(args.sm,
351
- n_layers=args.n_layers,
352
- d_model=args.d_model,
353
- n_head=args.n_head,
354
- d_k=64,
355
- d_v=128,
356
- d_inner=64,
357
- dropout=0.3,
358
- learning_rate=args.lr,
359
- max_epochs=args.max_epochs)
360
-
361
- run_id = str(uuid.uuid4())
362
-
363
- print("Classification Thresholds:")
364
- print(model.classification_threshold)
365
-
366
- logger = WandbLogger(project=f"bind_evaluator",
367
- name=f"continue_lr={args.lr}_nlayers={args.n_layers}_dmodel={args.d_model}_nhead={args.n_head}_dinner={args.d_inner}",
368
- # display on the web
369
- # save_dir=f'./pl_logs/',
370
- job_type='model-training',
371
- id=run_id)
372
-
373
- checkpoint_callback = ModelCheckpoint(
374
- monitor='val_mcc',
375
- dirpath=args.output_file,
376
- filename='model-{epoch:02d}-{val_loss:.2f}',
377
- save_top_k=1,
378
- mode='max',
379
- )
380
-
381
- early_stopping_callback = EarlyStopping(
382
- monitor='val_mcc',
383
- patience=5,
384
- verbose=True,
385
- mode='max'
386
- )
387
-
388
- accumulator = GradientAccumulationScheduler(scheduling={0: 4, 2: 2, 7: 1})
389
-
390
- trainer = pl.Trainer(
391
- max_epochs=args.max_epochs,
392
- accelerator='gpu',
393
- strategy='ddp',
394
- precision='bf16',
395
- logger=logger,
396
- devices=[0,1,2,3,4,5,6],
397
- callbacks=[checkpoint_callback, accumulator, early_stopping_callback],
398
- gradient_clip_val=1.0
399
- )
400
-
401
- trainer.fit(model, datamodule=data_module)
402
-
403
- best_model_path = checkpoint_callback.best_model_path
404
- print(best_model_path)
405
-
406
-
407
- if __name__ == "__main__":
408
- main()
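Note on the deleted train_evaluator.py above: CosineAnnealingWithWarmup ramps linearly from base_lr to max_lr over warmup_steps, then decays along a cosine from max_lr to min_lr by total_steps. A minimal function-only sketch of the same curve (no optimizer attached), using the values wired into configure_optimizers above with max_lr taken as the -lr default of 1e-3:

import numpy as np

def lr_at_step(step, warmup_steps, total_steps, base_lr, max_lr, min_lr):
    """Learning rate at a global step, mirroring the deleted scheduler's get_lr()."""
    if step < warmup_steps:
        # Linear warmup from base_lr to max_lr.
        return base_lr + (max_lr - base_lr) * (step / warmup_steps)
    # Cosine annealing from max_lr down to min_lr.
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return min_lr + (max_lr - min_lr) * 0.5 * (1 + np.cos(np.pi * progress))

# warmup_steps=692, total_steps=8293, base_lr=1e-4, max_lr=1e-3, min_lr=1e-4
for step in (0, 346, 692, 4000, 8293):
    print(step, round(lr_at_step(step, 692, 8293, 1e-4, 1e-3, 1e-4), 6))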