"""
Downloads and evaluates HellaSwag in Python.
https://github.com/rowanz/hellaswag
"""
import os
import json
import requests
import tiktoken
from tqdm import tqdm
import torch
from torch.nn import functional as F
DATA_DOWNLOADED_PATH = "data/hellaswag"
def download_file(url: str, fname: str, chunk_size=1024):
    """Helper to stream-download a file from `url` to `fname` with a progress bar."""
resp = requests.get(url, stream=True)
    total = int(resp.headers.get("content-length", 0))
    with open(fname, "wb") as file, tqdm(
        desc=fname,
        total=total,
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
for data in resp.iter_content(chunk_size=chunk_size):
size = file.write(data)
bar.update(size)
hellaswags = {
"train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl",
"val": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl",
"test": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_test.jsonl",
}
enc = tiktoken.get_encoding("gpt2")
def download(split):
"""Downloads HellaSwag DATA_DOWNLOADED_PATH"""
os.makedirs(DATA_DOWNLOADED_PATH, exist_ok=True)
data_url = hellaswags[split]
data_filename = os.path.join(DATA_DOWNLOADED_PATH, f"hellaswag_{split}.jsonl")
if not os.path.exists(data_filename):
print(f"Downloading {data_url} to {data_filename}...")
download_file(data_url, data_filename)
def render_example(example):
"""
Given the example as a dictionary, render it as three torch tensors:
- tokens (the tokens of context + completion, of size 4xN, as there are always 4 candidates)
- mask (is 1 in the region of the candidate completion, where we evaluate likelihoods)
- label (the index of the correct completion, which we hope has the highest likelihood)
"""
ctx = example["ctx"]
label = example["label"]
endings = example["endings"]
    # data needed to reproduce this eval on the C side
data = {
"label": label,
"ctx_tokens": None,
"ending_tokens": [],
}
# gather up all the tokens
ctx_tokens = enc.encode(ctx)
data["ctx_tokens"] = ctx_tokens
tok_rows = []
mask_rows = []
for end in endings:
end_tokens = enc.encode(" " + end) # note: prepending " " because GPT-2 tokenizer
tok_rows.append(ctx_tokens + end_tokens)
mask_rows.append([0]*len(ctx_tokens) + [1]*len(end_tokens))
data["ending_tokens"].append(end_tokens)
# have to be careful during the collation because the number of tokens in each row can differ
max_len = max(len(row) for row in tok_rows)
tokens = torch.zeros((4, max_len), dtype=torch.long)
mask = torch.zeros((4, max_len), dtype=torch.long)
for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)):
tokens[i, :len(tok_row)] = torch.tensor(tok_row)
mask[i, :len(mask_row)] = torch.tensor(mask_row)
return data, tokens, mask, label
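# Illustrative note (an assumption about typical data, not taken from this file):
# for an example with a ~60-token context and ~20-token endings, render_example
# returns tokens and mask of shape (4, max_len), where max_len is the length of
# the longest context+ending row; shorter rows are zero-padded, and the mask is
# 1 only over the candidate ending tokens that get scored.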
def iterate_examples(split):
# there are 10,042 examples in total in val
download(split)
with open(os.path.join(DATA_DOWNLOADED_PATH, f"hellaswag_{split}.jsonl"), "r") as f:
for line in f:
example = json.loads(line)
yield example
def get_most_likely_row(tokens, mask, logits):
    """Returns the index of the candidate completion with the lowest average masked loss."""
    shift_logits = (logits[..., :-1, :]).contiguous()  # predictions at positions 0..T-2 (the "x" for the loss)
    shift_tokens = (tokens[..., 1:]).contiguous()  # targets are the tokens shifted left by one (the "y")
    shift_mask = (mask[..., 1:]).contiguous()  # shift the mask the same way as the targets
flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1))
flat_shift_tokens = shift_tokens.view(-1)
shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none')
shift_losses = shift_losses.view(tokens.size(0), -1)
masked_shift_losses = shift_losses * shift_mask
sum_loss = masked_shift_losses.sum(dim=1)
avg_loss = sum_loss / shift_mask.sum(dim=1)
    pred_norm = avg_loss.argmin().item()  # take the index of the minimum average loss
return pred_norm
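# -----------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original script): it ties
# the helpers above together with an off-the-shelf HuggingFace GPT-2 model to
# score the validation split. Requires the optional `transformers` package.
if __name__ == "__main__":
    from transformers import GPT2LMHeadModel  # assumed dependency for this sketch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    model.eval()

    num_correct, num_total = 0, 0
    with torch.no_grad():
        for example in iterate_examples("val"):
            _, tokens, mask, label = render_example(example)
            tokens, mask = tokens.to(device), mask.to(device)
            logits = model(tokens).logits  # (4, max_len, vocab_size)
            pred = get_most_likely_row(tokens, mask, logits)
            num_correct += int(pred == label)
            num_total += 1
    print(f"val accuracy: {num_correct}/{num_total} = {num_correct/num_total:.4f}")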