Commit 8cca3d0 · Jon Gauthier committed
1 parent: 8fe0b5d

refactor metric to support evaluating `all-2020` split

Files changed:
- syntaxgym.py +37 -19
- test/test_syntaxgym.py +18 -1
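For orientation, the refactored metric is exercised like the new test below. A minimal usage sketch, assuming the metric is published as the `cpllab/syntaxgym` Space and loadable with `evaluate.load` (the `model_id` and the result indexing are illustrative):

    import datasets
    import evaluate

    # Assumed load path for this Space's metric.
    syntaxgym = evaluate.load("cpllab/syntaxgym")

    # The composite dataset bundles all 2020 test suites; after this refactor,
    # compute() scores them in one pass and groups results per suite.
    dataset = datasets.load_dataset("cpllab/syntaxgym")
    results = syntaxgym.compute(dataset=dataset["test"], model_id="gpt2")

    # One result per suite, carrying per-item prediction_results and region_totals.
    print(results["number_prep"].prediction_results[:3])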
syntaxgym.py CHANGED

@@ -187,14 +187,25 @@ class SyntaxGym(evaluate.EvaluationModule):
 
         tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
-        # Flatten sentences, enforcing that sentences are always ordered by the same condition
-        condition_order = dataset[0]["conditions"]["condition_name"]
+        # Flatten sentences, enforcing that sentences are always ordered by the same condition
+        # within-suite.
+        condition_orders = {}
+        for item in dataset:
+            condition_orders[item["suite_name"]] = item["conditions"]["condition_name"]
+        # Flattened batch of sentences
         all_sentences = []
+        # Mapping from sentence back to originating suite
+        all_sentence_suites = []
+        # Mapping from item back to originating suite
+        all_item_suites = []
         for item in dataset:
-            for condition_name in condition_order:
+            for condition_name in condition_orders[item["suite_name"]]:
                 # Get idx of condition for this item.
                 condition_idx = item["conditions"]["condition_name"].index(condition_name)
+
                 all_sentences.append(item["conditions"]["content"][condition_idx])
+                all_sentence_suites.append(item["suite_name"])
+                all_item_suites.append(item["suite_name"])
 
         # Tokenize sentences and split into batches.
         all_tokenized_sentences = tokenizer(all_sentences, return_tensors="pt",
@@ -205,7 +216,7 @@ class SyntaxGym(evaluate.EvaluationModule):
         # Compute surprisal per-batch and combine into a single surprisal tensor.
         n_sentences, n_timesteps = all_tokenized_sentences["input_ids"].shape
         surprisals = torch.zeros(n_sentences, n_timesteps - 1).float().to(device)
-        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches)):
+        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches, desc="Computing surprisals", unit="batch")):
             batch = batch.to(device)
             with torch.no_grad():
                 # logits are B * T * V
@@ -219,22 +230,29 @@ class SyntaxGym(evaluate.EvaluationModule):
 
                 surprisals[i * batch_size : (i + 1) * batch_size] = b_surprisals_gt
 
-        # Reshape to intuitive axes n_items * n_conditions * ...
-        surprisals = surprisals.reshape((len(dataset), len(condition_order), -1))
-        offset_mapping = all_tokenized_sentences["offset_mapping"] \
-            .reshape((len(dataset), len(condition_order), -1, 2))
-
-        # Now evaluate per-item.
+        # Aggregate results within-suite
         results = {}
-        [... nine further deleted lines not recoverable from this view ...]
+        all_sentence_suites = np.array(all_sentence_suites)
+        all_item_suites = np.array(all_item_suites)
+        for suite, condition_order in datasets.logging.tqdm(condition_orders.items(), unit="suite"):
+            suite_sentence_idxs = np.where(all_sentence_suites == suite)[0]
+            suite_item_idxs = np.where(all_item_suites == suite)[0]
+            suite_surprisals = surprisals[suite_sentence_idxs]
+
+            # Reshape to intuitive axes n_items * n_conditions * ...
+            suite_surprisals = suite_surprisals.reshape((len(suite_item_idxs), len(condition_order), -1))
+            suite_offset_mapping = all_tokenized_sentences["offset_mapping"][suite_sentence_idxs] \
+                .reshape((len(suite_item_idxs), len(condition_order), -1, 2))
+
+            # Evaluate per-item
+            suite_result = SyntaxGymMetricSuiteResult(suite, [], [])
+            suite_items = datasets.logging.tqdm([dataset[idx] for idx in suite_item_idxs], unit="item")
+            for item, item_surprisals, item_offset_mapping in zip(suite_items, suite_surprisals, suite_offset_mapping):
+                result_i = self._compute_item(item, item_surprisals, item_offset_mapping, condition_order)
+                suite_result.prediction_results.append(result_i["prediction_results"])
+                suite_result.region_totals.append(result_i["region_totals"])
+
+            results[suite] = suite_result
 
         return results
 
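The core indexing trick above: all suites' sentences are scored in one flattened batch, and parallel suite-label arrays plus `np.where` recover each suite's rows for a per-suite reshape to (n_items, n_conditions, ...). A self-contained sketch of the pattern with toy shapes (variable names are illustrative, not the module's API):

    import numpy as np

    # Stand-in for the flattened surprisal matrix: 6 sentences x 4 timesteps.
    surprisals = np.arange(24.0).reshape(6, 4)

    # One suite label per flattened sentence, appended item-major, condition-minor:
    # suite "a" has 2 items x 2 conditions, suite "b" has 1 item x 2 conditions.
    sentence_suites = np.array(["a", "a", "a", "a", "b", "b"])
    condition_orders = {"a": ["match", "mismatch"], "b": ["match", "mismatch"]}

    for suite, condition_order in condition_orders.items():
        rows = np.where(sentence_suites == suite)[0]
        # Recover (n_items, n_conditions, n_timesteps) for this suite.
        per_suite = surprisals[rows].reshape(-1, len(condition_order), surprisals.shape[1])
        print(suite, per_suite.shape)  # a (2, 2, 4), then b (1, 2, 4)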
test/test_syntaxgym.py CHANGED

@@ -513,4 +513,21 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
                                  for region_totals_i in GPT2_SUBORDINATION_SRC_REFERENCE])
     pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
                   key=lambda x: -x[1]))
-    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+
+
+def test_evaluation_all_vs_single(syntaxgym_metric):
+    """
+    Check that a suite's performance is the same when evaluated in the composite
+    benchmark vs. evaluated independently.
+    """
+
+    suite_name = "number_prep"
+    full_dataset = datasets.load_dataset("cpllab/syntaxgym")
+    sub_dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
+    model_id = "hf-internal-testing/tiny-xlm-roberta"
+
+    full_result = syntaxgym_metric.compute(dataset=full_dataset["test"], model_id=model_id)
+    sub_result = syntaxgym_metric.compute(dataset=sub_dataset["test"], model_id=model_id)
+
+    assert full_result[suite_name].prediction_results == sub_result[suite_name].prediction_results
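Assuming the repository's usual pytest setup (and network access to fetch the two dataset configurations and the tiny test model), the new check can be run in isolation with `pytest test/test_syntaxgym.py -k test_evaluation_all_vs_single`.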
|