Upload metrics.py with huggingface_hub

metrics.py CHANGED (+35 -5)
@@ -4,7 +4,6 @@ import uuid
 from abc import ABC, abstractmethod
 from collections import Counter
 from dataclasses import field
-from statistics import mean
 from typing import Any, Dict, Generator, List, Optional, Tuple
 
 import evaluate
@@ -1360,9 +1359,12 @@ class Perplexity(BulkInstanceMetric):
                 instance_scores_list.append(scores[index])
                 index += 1
             instance_scores["reference_scores"] = instance_scores_list
-            instance_scores[self.main_score] = mean(instance_scores_list)
 
-
+            # max seems more useful than mean for common use cases like
+            # context relevance, where what we want to know is if there
+            # is at least one good result in the context. Using mean will
+            # bring the score down due to bad contexts at the tail.
+            instance_scores[self.main_score] = max(instance_scores_list)
             all_instances_scores.append(instance_scores)
 
         return all_instances_scores
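The mean-to-max switch above is easiest to see on a toy example. A minimal sketch, using a hypothetical list of per-reference scores rather than the surrounding class's real data:

    from statistics import mean

    # hypothetical per-reference scores for one instance: three weak
    # contexts and one strong one
    reference_scores = [0.02, 0.05, 0.03, 0.91]

    print(mean(reference_scores))  # 0.2525 -- old behavior, dragged down by the tail
    print(max(reference_scores))   # 0.91   -- new behavior: the best context wins
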
@@ -1405,11 +1407,18 @@ class Perplexity(BulkInstanceMetric):
                 tokens_source, tokens_target
             )
 
+            # logits is a tensor of size: batch_size * len(target) * vocab_size
+            # because for each example in the batch, the model predicted the
+            # logit at every position in the target, for every vocab item.
+
             # the model returns mean over all batch. We run the CE again without reduction
-            # and
+            # and extract the mean for each document
             loss_fct = torch.nn.CrossEntropyLoss(
                 ignore_index=-100, reduction="none"
             )
+
+            # logits.size(-1) = the dimension of the vocabulary
+            # labels.view(-1) = flattens the labels tensor to 1d
             loss = loss_fct(
                 logits.view(-1, logits.size(-1)), labels.view(-1)
             )
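A self-contained sketch of the reduction="none" pattern these comments describe; the tensor sizes and the padding convention are illustrative assumptions, and only the loss-function call mirrors the diff:

    import torch

    batch_size, target_len, vocab_size = 2, 5, 11  # made-up sizes
    logits = torch.randn(batch_size, target_len, vocab_size)
    labels = torch.randint(1, vocab_size, (batch_size, target_len))
    labels[0, 3:] = -100  # padded positions, ignored by the loss

    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
    # flatten to (batch * target_len, vocab_size) vs (batch * target_len,),
    # then restore the batch dimension: one loss per token per document
    loss = loss_fct(
        logits.view(-1, logits.size(-1)), labels.view(-1)
    ).view(batch_size, target_len)

    # per-document mean over real tokens only; ignored positions contribute 0,
    # and labels > 0 counts the non-padding tokens, as in the hunk below
    token_counts = torch.sum(labels > 0, dim=1)
    batch_loss = torch.sum(loss, dim=1) / token_counts
    print(batch_loss.shape)  # torch.Size([2]) -- one mean CE per document
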
@@ -1420,8 +1429,29 @@ class Perplexity(BulkInstanceMetric):
                 labels > 0, dim=1
             )
 
+            # e^(-average(cross-entropy-loss(logits))) == geometric mean of the probabilities
+            # proof:
+            # * CE-loss of logits is computed by transforming the logits to
+            #   probabilities by softmax, and then -log(p) is returned, where
+            #   p is the probability of the gold label.
+            # * Averaging the CE loss is computed by summing over -log(p) and
+            #   then dividing by the length of the gold labels.
+            # * Thus, pr_score = (-log(p_1) + ... + -log(p_n)) / n
+            #                  = -log(p_1 * ... * p_n) * 1/n
+            # * Therefore,
+            #   e^(-pr_score) = e^(log(p_1 * ... * p_n) * 1/n)
+            #                 = (e^(log(p_1 * ... * p_n))) ^ (1/n)
+            #                 = (p_1 * ... * p_n) ^ (1/n)
+            #                 = geometric mean of [p_1, ..., p_n]
+            #
+            # in principle we could have computed the geometric mean directly over the
+            # probabilities instead of e^(average cross entropy loss of the logits),
+            # but the current approach is more stable numerically. See for example:
+            # https://stackoverflow.com/questions/59722983/how-to-calculate-geometric-mean-in-a-differentiable-way
+            geometric_mean = (-batch_loss).exp()
+
             # append the batch scores to the list of all scores
-            scores.append((-batch_loss).exp())
+            scores.append(geometric_mean)
 
             return torch.cat(scores, dim=0).tolist()
 
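The identity in the comment block is easy to verify numerically. A quick check with random logits, purely illustrative and not part of the commit:

    import torch

    torch.manual_seed(0)
    n, vocab_size = 6, 10
    logits = torch.randn(n, vocab_size)
    gold = torch.randint(0, vocab_size, (n,))

    # route 1: e^(-mean cross-entropy), as in the code above
    mean_ce = torch.nn.functional.cross_entropy(logits, gold)  # mean reduction
    via_ce = torch.exp(-mean_ce)

    # route 2: geometric mean of the gold-label probabilities
    p = torch.softmax(logits, dim=-1)[torch.arange(n), gold]
    geometric = p.prod() ** (1.0 / n)

    assert torch.allclose(via_ce, geometric, atol=1e-5)

Route 2 underflows once the product of many small probabilities leaves floating-point range, which is why working in log space through the loss is the numerically stable choice the comment recommends.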