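# Compute the average Hamming distance between generated peptides and
# length-matched held-out test peptides, comparing ESM-2 token ids
# position by position.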
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm


def compute_average_hamming_distance(gen_peptides, train_peptides, tokenizer):
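    """Mean pairwise Hamming distance between two sets of peptides.

    Assumes all sequences in both lists share one length, so the tokenized
    id tensors can be compared position by position. Returns the distance
    averaged over every (generated, reference) pair.
    """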
    # Tokenize both sets into (num_seqs, seq_len) id tensors. The ESM-2
    # tokenizer prepends <cls> and appends <eos>; those positions match
    # across all sequences, so they add zero to the Hamming counts.
    gen_tensor = tokenizer(gen_peptides, return_tensors='pt')['input_ids']
    train_tensor = tokenizer(train_peptides, return_tensors='pt')['input_ids']
    # Broadcast-compare every generated sequence against every reference
    # sequence: shape (num_gen, num_train, seq_len). The full comparison
    # tensor is materialized, so memory scales with num_gen * num_train.
    same_positions = (gen_tensor[:, None, :] == train_tensor[None, :, :])
    # Count mismatched positions per pair -> (num_gen, num_train) matrix.
    hamming_matrix = (~same_positions).sum(dim=-1)
    # Average over all (generated, reference) pairs.
    avg_hamming_distance = hamming_matrix.float().mean().item()
    return avg_hamming_distance
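# Quick sanity check (hypothetical toy input, not part of the pipeline):
# identical lists should yield an average distance of 0.0.
#   tok = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
#   assert compute_average_hamming_distance(["PEPTIDE"], ["PEPTIDE"], tok) == 0.0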
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

# Load the held-out test set once; it is filtered by length inside the loop.
df_test = pd.read_csv('/home/tc415/discrete-diffusion-guidance/dataset/peptide/test.csv')

average_hds = {}

# Evaluate each length bucket separately so generated and test peptides
# always share a common sequence length.
for length in tqdm(range(6, 50)):
    # Generated samples for this peptide length.
    df_samples = pd.read_csv(f'/home/tc415/discrete-diffusion-guidance/samples/step_32/{length}.csv')
    generated_peptides = df_samples['sequence'].tolist()

    # Test peptides of the matching length.
    test_peptides = df_test[df_test['Length'] == length]['Sequence'].tolist()
    average_hd = compute_average_hamming_distance(generated_peptides, test_peptides, tokenizer)
    average_hds[length] = average_hd

print(average_hds)
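# Optionally persist the per-length averages; the output filename below is
# an assumption, not part of the original pipeline:
# pd.DataFrame(sorted(average_hds.items()), columns=['Length', 'AvgHamming']).to_csv('average_hamming.csv', index=False)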