File size: 467 Bytes
65bd8af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
import pandas as pd
from transformers import AutoTokenizer
# Load the ESM-2 tokenizer once; every per-file check below reuses it.
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

# Each sample file is named after an integer in [6, 50). For every file,
# collect the sequences whose tokenized length does not match that integer
# and print them next to the file name.
for target_len in range(6, 50):
    csv_path = f'/home/tc415/discrete-diffusion-guidance/samples/step_128/{target_len}.csv'
    sequences = pd.read_csv(csv_path)['sequence'].tolist()

    # The token count is reduced by 2 before the comparison -- presumably to
    # discount special tokens added by the tokenizer (TODO confirm).
    mismatched = [
        sequence
        for sequence in sequences
        if len(tokenizer(sequence)['input_ids']) - 2 != target_len
    ]

    print(f"{target_len}.csv: {mismatched}")
|