# muPPIt / muppit / evaluation / identify_outliers.py
# AlienChen's picture
# Upload 139 files
# 65bd8af verified
import pandas as pd
from transformers import AutoTokenizer
# Hugging Face checkpoint whose tokenizer is used to measure sequence length.
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"

# Directory holding one `<length>.csv` file per target sequence length.
SAMPLES_DIR = '/home/tc415/discrete-diffusion-guidance/samples/step_128'

# Number of extra tokens subtracted from the tokenized length.
# NOTE(review): presumably the start/end special tokens added by the ESM-2
# tokenizer -- confirm against the tokenizer configuration.
NUM_SPECIAL_TOKENS = 2


def find_outliers(seqs, expected_len, tokenizer,
                  num_special_tokens=NUM_SPECIAL_TOKENS):
    """Return the entries of *seqs* whose tokenized length is unexpected.

    Args:
        seqs: Iterable of sequences (normally strings) to check.
        expected_len: Token count each sequence should have once the
            special tokens are discounted.
        tokenizer: Callable that maps a string to a mapping containing an
            ``'input_ids'`` entry (e.g. a Hugging Face tokenizer).
        num_special_tokens: Number of tokens in ``'input_ids'`` that do not
            correspond to sequence content.

    Returns:
        A list, in input order, of every entry whose length differs from
        *expected_len*. Entries that are not strings (e.g. NaN produced by
        an empty CSV cell) are reported as outliers without being tokenized.
    """
    outliers = []
    for seq in seqs:
        if not isinstance(seq, str):
            # A missing/non-text cell can never have the expected length and
            # would make the tokenizer raise, so flag it directly.
            outliers.append(seq)
            continue
        token_ids = tokenizer(seq)['input_ids']
        if len(token_ids) - num_special_tokens != expected_len:
            outliers.append(seq)
    return outliers


def main(samples_dir=SAMPLES_DIR, lengths=range(6, 50)):
    """Print, for each `<length>.csv` in *samples_dir*, its outlier sequences.

    Args:
        samples_dir: Directory containing the per-length CSV files; each file
            must have a ``sequence`` column.
        lengths: Iterable of integer lengths; for each value ``i`` the file
            ``{i}.csv`` is read and its sequences are expected to tokenize to
            ``i`` tokens.
    """
    # Load the tokenizer once; it is reused for every file.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    for i in lengths:
        df = pd.read_csv(f'{samples_dir}/{i}.csv')
        outliers = find_outliers(df['sequence'].tolist(), i, tokenizer)
        print(f"{i}.csv: {outliers}")


if __name__ == "__main__":
    main()