Upload metrics.py with huggingface_hub

metrics.py  CHANGED  (+356, -49 lines)

@@ -1,4 +1,3 @@
-import logging
 import re
 import string
 import uuid
@@ -14,6 +13,7 @@ from scipy.stats import bootstrap
 
 from .artifact import Artifact
 from .dataclass import InternalField, OptionalField
+from .logging_utils import get_logger
 from .operator import (
     MultiStreamOperator,
     SingleStreamOperator,
@@ -23,7 +23,9 @@ from .operator import (
 from .operators import CopyFields
 from .random_utils import get_seed
 from .stream import MultiStream, Stream
+from .type_utils import isoftype
 
+logger = get_logger()
 # The default number of resamples used to estimate the confidence intervals
 # global and instances metrics. Use None to disable confidence interval computation by default.
 _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS = 1000
@@ -61,6 +63,7 @@ class MetricWithConfidenceInterval(Metric):
     # Use None to disable confidence interval computation.
     n_resamples: int = None
     confidence_level: float = 0.95
+    ci_scores: List[str] = None
 
     @staticmethod
     def new_random_generator():
@@ -79,7 +82,7 @@ class MetricWithConfidenceInterval(Metric):
             and num_predictions > 1
         )
 
-    def score_based_confidence_interval(self,
+    def score_based_confidence_interval(self, instances):
         """Compute confidence intervals based on existing scores, already computed on the input instances.
 
         score_names: List[str]
@@ -94,6 +97,10 @@ class MetricWithConfidenceInterval(Metric):
         if not self._can_compute_confidence_intervals(num_predictions=len(instances)):
            return result
 
+        score_names = (
+            self.ci_scores if self.ci_scores is not None else [self.main_score]
+        )
+
        for score_name in score_names:
            scores = [
                instance["score"]["instance"][score_name] for instance in instances
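
Note (not part of the patch): the new ci_scores attribute lets a metric request bootstrap confidence intervals for several of its per-instance scores instead of only main_score, which is the fallback when ci_scores is left as None. A minimal illustrative sketch of a subclass using it, mirroring how TokenOverlap and BertScore are configured later in this diff (the class name here is hypothetical):

class MyOverlapMetric(InstanceMetric):
    main_score = "f1"
    reduction_map = {"mean": ["f1", "precision", "recall"]}
    ci_scores = ["f1", "precision", "recall"]  # confidence intervals for all three scores
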
@@ -131,7 +138,7 @@ class MetricWithConfidenceInterval(Metric):
             except Exception as e:
                 # this happens in edge cases, for example, when the sampling creates a
                 # sample where all strings are empty and this fails bleu.
-
+                logger.info(f"Warning in {self.__class__.__name__}: {e}")
                 return np.nan
 
         scores = numpy.apply_along_axis(
@@ -341,7 +348,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         global_score["score_name"] = self.main_score
 
         confidence_interval = self.score_based_confidence_interval(
-
+            instances=instances
         )
         global_score.update(confidence_interval)
 
@@ -411,7 +418,7 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         global_score["score_name"] = self.main_score
 
         confidence_interval = self.score_based_confidence_interval(
-
+            instances=instances
         )
         global_score.update(confidence_interval)
 
@@ -473,6 +480,23 @@ class Accuracy(InstanceMetric):
         return result
 
 
+class StringContainment(InstanceMetric):
+    reduction_map = {"mean": ["string_containment"]}
+    main_score = "string_containment"
+
+    def compute(
+        self, references: List[Any], prediction: Any, additional_inputs: List[Dict]
+    ) -> dict:
+        result = {
+            self.main_score: float(
+                any(str(reference) in prediction for reference in references)
+            )
+        }
+        result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
+        return result
+
+
 class MetricPipeline(MultiStreamOperator, Metric):
     main_score: str = None
     preprocess_steps: Optional[List[StreamingOperator]] = field(default_factory=list)
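
Note (not part of the patch): StringContainment scores an instance 1.0 when any reference string appears verbatim inside the prediction, and 0.0 otherwise. A minimal sketch of the per-instance computation, calling compute directly instead of going through the streaming pipeline, and assuming the class can be instantiated with no arguments:

metric = StringContainment()
print(metric.compute(references=["world", "planet"], prediction="hello world", additional_inputs=[]))
# -> {"string_containment": 1.0, "score": 1.0, "score_name": "string_containment"}
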
@@ -512,9 +536,29 @@ class HuggingfaceMetric(GlobalMetric):
 
     scale: float = 1.0  # optional scaling of main results
     scaled_fields: list = None
+    # These are fixed arguments passed to the compute method
     hf_compute_args: Dict[str, Any] = OptionalField(default_factory=dict)
+    # These are additional input fields passed to the HF compute method (a list with one value per instance)
+    hf_additional_input_fields: List = OptionalField(default_factory=list)
+    # These are additional input fields that are passed as one value
+    hf_additional_input_fields_pass_one_value: List = OptionalField(
+        default_factory=list
+    )
+
     experiment_id: str = OptionalField(default_factory=lambda: str(uuid.uuid4()))
 
+    def verify(self):
+        assert (
+            self.hf_additional_input_fields is None
+            or isoftype(self.hf_additional_input_fields, List[str])
+        ), f"Argument hf_additional_input_fields should be either None or List[str]. It is now: {self.hf_additional_input_fields}."
+        assert (
+            self.hf_additional_input_fields_pass_one_value is None
+            or isoftype(self.hf_additional_input_fields_pass_one_value, List[str])
+        ), f"Argument hf_additional_input_fields_pass_one_value should be either None or List[str]. It is now: {self.hf_additional_input_fields_pass_one_value}."
+
+        return super().verify()
+
     def prepare(self):
         super().prepare()
         self.metric = evaluate.load(
@@ -527,8 +571,36 @@ class HuggingfaceMetric(GlobalMetric):
         predictions: List[Any],
         additional_inputs: List[Dict],
     ) -> dict:
+        passed_additional_inputs = {}
+        for additional_input_field in self.hf_additional_input_fields:
+            assert (
+                additional_input_field in additional_inputs[0]
+            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in the passed additional inputs: {additional_inputs[0]}"
+            passed_additional_inputs[additional_input_field] = [
+                additional_input[additional_input_field]
+                for additional_input in additional_inputs
+            ]
+        for additional_input_field in self.hf_additional_input_fields_pass_one_value:
+            assert (
+                additional_input_field in additional_inputs[0]
+            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in the passed additional inputs: {additional_inputs[0]}"
+
+            values = {
+                additional_input[additional_input_field]
+                for additional_input in additional_inputs
+            }
+            assert (
+                len(values) == 1
+            ), f"Values of '{additional_input_field}' field required by {__class__.__name__} should all be the same, but have multiple values {values}"
+
+            passed_additional_inputs[additional_input_field] = next(iter(values))
+
+        # TODO: add a check that all fields required by self.metric are in passed_additional_inputs
         result = self.metric.compute(
-            predictions=predictions,
+            predictions=predictions,
+            references=references,
+            **passed_additional_inputs,
+            **self.hf_compute_args,
         )
         if self.hf_main_score:
             result[self.main_score] = result[self.hf_main_score]
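
Note (not part of the patch): the two new attributes control how per-instance metadata reaches the wrapped HF metric. hf_additional_input_fields forwards a list with one value per instance, while hf_additional_input_fields_pass_one_value forwards a single shared value. A hedged sketch of the resulting call; the field names are illustrative, not taken from the patch:

# Assume hf_additional_input_fields=["lang"] and
# hf_additional_input_fields_pass_one_value=["dataset_name"].
additional_inputs = [
    {"lang": "en", "dataset_name": "d1"},
    {"lang": "fr", "dataset_name": "d1"},
]
# compute() then effectively calls:
# self.metric.compute(
#     predictions=predictions,
#     references=references,
#     lang=["en", "fr"],   # one value per instance
#     dataset_name="d1",   # must be identical across instances, else the assert fires
#     **self.hf_compute_args,
# )
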
@@ -559,6 +631,7 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
 
     hf_metric_fields: List[str]
     hf_compute_args: dict = {}
+    hf_additional_input_fields: List = OptionalField(default_factory=list)
 
     def prepare(self):
         super().prepare()
@@ -570,8 +643,23 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
         predictions: List[str],
         additional_inputs: List[Any],
     ) -> List[Dict[str, Any]]:
+        passed_additional_inputs = {}
+        passed_additional_inputs = {}
+        for additional_input_field in self.hf_additional_input_fields:
+            assert (
+                additional_input_field in additional_inputs[0]
+            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in the passed additional inputs: {additional_inputs[0]}"
+            passed_additional_inputs[additional_input_field] = [
+                additional_input[additional_input_field]
+                for additional_input in additional_inputs
+            ]
+        # TODO: add a check that all fields required by self.metric are in passed_additional_inputs
+
         scores = self.metric.compute(
-            predictions=predictions,
+            predictions=predictions,
+            references=references,
+            **passed_additional_inputs,
+            **self.hf_compute_args,
         )
 
         # convert dict of lists to a list of dicts
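
Note (not part of the patch): the bulk metric receives one dict of score lists from the HF metric and, as the trailing comment says, reshapes it into one dict per instance. A small illustration of that conversion with made-up values:

scores = {"f1": [0.7, 0.9], "precision": [0.8, 1.0]}
# after the conversion:
# [{"f1": 0.7, "precision": 0.8}, {"f1": 0.9, "precision": 1.0}]
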
@@ -656,10 +744,11 @@ class F1MultiLabel(GlobalMetric):
     main_score = "f1_macro"
     average = None  # Report per class then aggregate by mean
     classes_to_ignore = ["none"]
+    metric = "f1"
 
     def prepare(self):
         super().prepare()
-        self._metric = evaluate.load(
+        self._metric = evaluate.load(self.metric, "multilabel")
 
     def add_str_to_id(self, str):
         if str not in self.str_to_id:
@@ -683,22 +772,10 @@ class F1MultiLabel(GlobalMetric):
     ) -> dict:
         self.str_to_id = {}
         self.id_to_str = {}
-        assert all(
-            len(reference) == 1 for reference in references
-        ), "Only a single reference per prediction is allowed in F1 multi label metric"
 
+        self._validate_references_and_prediction(references, predictions)
         references = [reference[0] for reference in references]
 
-        for reference in references:
-            assert isinstance(
-                references, list
-            ), f"Each reference is expected to list of strings in F1 multi label metric. Received reference: {reference}"
-
-        for prediction in predictions:
-            assert isinstance(
-                prediction, list
-            ), f"Each prediction is expected to list of strings in F1 multi label metric. Received prediction: {prediction}"
-
         labels = [
             lbl
             for lbl in {label for reference in references for label in reference}
@@ -732,19 +809,60 @@ class F1MultiLabel(GlobalMetric):
             average=self.average,
             labels=labels_param,
         )
-        if isinstance(result[
+        if isinstance(result[self.metric], numpy.ndarray):
             from statistics import mean
 
-            assert
-                labels
-            ), f
-            final_result = {self.main_score: mean(result[
+            assert (
+                len(result[self.metric]) == len(labels)
+            ), f"F1 result ({result[self.metric]}) has more entries than labels ({labels})"
+            final_result = {self.main_score: mean(result[self.metric])}
             for i, label in enumerate(labels):
-                final_result["
+                final_result[self.metric + "_" + label] = result[self.metric][i]
         else:
-            final_result = {self.main_score: result[
+            final_result = {self.main_score: result[self.metric]}
         return final_result
 
+    def _validate_references_and_prediction(self, references, predictions):
+        for reference in references:
+            if not len(reference) == 1:
+                raise ValueError(
+                    f"Only a single reference per prediction is allowed in F1 multi label metric. Received reference: {reference}"
+                )
+            if not isoftype(reference[0], List[str]):
+                raise ValueError(
+                    f"Each reference is expected to be a list of strings in F1 multi label metric. Received reference: '{reference[0]}'"
+                )
+
+        for prediction in predictions:
+            if not isoftype(prediction, List[str]):
+                raise ValueError(
+                    f"Each prediction is expected to be a list of strings in F1 multi label metric. Received prediction: '{prediction}'"
+                )
+
+
+class PrecisionMacroMultiLabel(F1MultiLabel):
+    main_score = "precision_macro"
+    metric = "precision"
+    average = "macro"
+
+
+class PrecisionMicroMultiLabel(F1MultiLabel):
+    main_score = "precision_micro"
+    metric = "precision"
+    average = "micro"
+
+
+class RecallMacroMultiLabel(F1MultiLabel):
+    main_score = "recall_macro"
+    metric = "recall"
+    average = "macro"
+
+
+class RecallMicroMultiLabel(F1MultiLabel):
+    main_score = "recall_micro"
+    metric = "recall"
+    average = "micro"
+
 
 class F1MicroMultiLabel(F1MultiLabel):
     main_score = "f1_micro"
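
Note (not part of the patch): F1MultiLabel now reads its score from the configurable metric field, which is what lets the new Precision*/Recall* subclasses reuse the same compute path. A hedged sketch of the expected input shapes and output keys (label names are illustrative):

# Each prediction is a list of label strings; each reference is a single-element
# list wrapping such a list (enforced by _validate_references_and_prediction).
predictions = [["cat", "dog"], ["dog"]]
references = [[["cat", "dog"]], [["bird"]]]
# With the default average=None the result holds f1_macro plus one entry per observed
# label, e.g. f1_cat, f1_dog, f1_bird; PrecisionMicroMultiLabel instead reports a single
# precision_micro value, computed with metric="precision" and average="micro".
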
@@ -868,27 +986,36 @@ class MatthewsCorrelation(HuggingfaceMetric):
 
 class CustomF1(GlobalMetric):
     main_score = "f1_micro"
-
+    groups = None
     zero_division = 0.0
 
     @abstractmethod
-    def get_element_group(self, element):
+    def get_element_group(self, element, additional_input):
         pass
 
     @abstractmethod
-    def get_element_representation(self, element):
+    def get_element_representation(self, element, additional_input):
         pass
 
-    def
+    def should_ignore_element(self, element, additional_input):
+        return False
+
+    def group_elements(self, elements_list, additional_input):
+        if not isinstance(elements_list, list):
+            elements_list = [elements_list]
         return {
             k: Counter(
                 [
-                    self.get_element_representation(value)
+                    self.get_element_representation(value, additional_input)
                     for value in elements_list
-                    if self.get_element_group(value) == k
+                    if self.get_element_group(value, additional_input) == k
                 ]
             )
-            for k in {
+            for k in {
+                self.get_element_group(e, additional_input)
+                for e in elements_list
+                if not self.should_ignore_element(e, additional_input)
+            }
         }
 
     def calculate_groups_ratio(self, actual_group, total_group):
@@ -910,30 +1037,46 @@ class CustomF1(GlobalMetric):
         except ZeroDivisionError:
             return self.zero_division
 
+    def get_groups(self, elements, additional_inputs):
+        groups = set()
+        for sublist, additional_input in zip(elements, additional_inputs):
+            for e in sublist:
+                if self.should_ignore_element(e, additional_input):
+                    continue
+                groups.add(self.get_element_group(e, additional_input))
+        return groups
+
     def compute(
         self,
-        references: List[Any],
+        references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
     ) -> dict:
         # in case references are List[List[List[Any]]] and predictions are List[List[Any]]:
-        if
+        if (
+            isinstance(references[0], list)
+            and len(references[0]) > 0
+            and isinstance(references[0][0], list)
+        ):
             references = [element[0] for element in references]
 
         assert len(references) == len(predictions), (
             f"references size ({len(references)})"
             f" doesn't match predictions size ({len(predictions)})."
         )
-
-
-
-        }
+
+        if self.groups is None:
+            groups = self.get_groups(references, additional_inputs)
         else:
-
+            groups = self.groups
         groups_statistics = {}
-        for references_batch, predictions_batch in zip(
-
-
+        for references_batch, predictions_batch, additional_input in zip(
+            references, predictions, additional_inputs
+        ):
+            grouped_references = self.group_elements(references_batch, additional_input)
+            grouped_predictions = self.group_elements(
+                predictions_batch, additional_input
+            )
             all_groups = set(grouped_references.keys()).union(
                 grouped_predictions.keys()
             )
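
Note (not part of the patch): CustomF1 now threads additional_input through grouping, so subclasses such as NER and KPA can decide an element's group per instance. A minimal sketch of group_elements with NER-style (span, type) elements; additional_input is unused by NER, so an empty dict is passed, and it is assumed the class can be instantiated directly for illustration:

ner = NER()
grouped = ner.group_elements(
    [("Paris", "LOC"), ("IBM", "ORG"), ("France", "LOC")], additional_input={}
)
# {"LOC": Counter({"('Paris', 'LOC')": 1, "('France', 'LOC')": 1}),
#  "ORG": Counter({"('IBM', 'ORG')": 1})}
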
@@ -976,7 +1119,7 @@ class CustomF1(GlobalMetric):
                 rn_total + rn,
                 rd_total + rd,
             )
-            if group in
+            if group in groups:
                 f1_result[f"f1_{group}"] = self.f1(pn, pd, rn, rd)
                 recall_result[f"recall_{group}"] = self.recall(pn, pd, rn, rd)
                 precision_result[f"precision_{group}"] = self.precision(pn, pd, rn, rd)
@@ -995,7 +1138,7 @@ class CustomF1(GlobalMetric):
         except ZeroDivisionError:
             result["f1_macro"] = self.zero_division
             result["recall_macro"] = self.zero_division
-            result["
+            result["precision_macro"] = self.zero_division
 
         amount_of_predictions = pd_total
         if amount_of_predictions == 0:
@@ -1013,10 +1156,10 @@ class CustomF1(GlobalMetric):
 
 
 class NER(CustomF1):
-    def get_element_group(self, element):
+    def get_element_group(self, element, additional_input):
         return element[1]
 
-    def get_element_representation(self, element):
+    def get_element_representation(self, element, additional_input):
         return str(element)
 
 
@@ -1042,6 +1185,7 @@ def normalize_answer(s):
 class TokenOverlap(InstanceMetric):
     reduction_map = {"mean": ["f1", "precision", "recall"]}
     main_score = "f1"
+    ci_scores = ["f1", "precision", "recall"]
 
     def compute(
         self, references: List[Any], prediction: Any, additional_inputs: List[Dict]
@@ -1075,6 +1219,7 @@ class BertScore(HuggingfaceBulkMetric):
     main_score = "f1"
     reduction_map = {"mean": ["f1", "precision", "recall"]}
     hf_metric_fields = ["f1", "precision", "recall"]
+    ci_scores = ["f1", "precision", "recall"]
     model_name: str
 
     def prepare(self):
@@ -1223,3 +1368,165 @@ class NDCG(GlobalMetric):
             ]
             scores.append(self.eval([q_references], [q_predictions]))
         return {self.main_score: mean(scores) if len(scores) > 0 else np.nan}
+
+
+class RetrievalMetric(InstanceMetric):
+    def compute(
+        self, references: List[Any], prediction: Any, additional_inputs: Dict
+    ) -> dict:
+        # digest input
+        pred_ids: List[Any] = prediction
+        ref_ids: List[Any] = list(dict.fromkeys(references))
+
+        # relevance_at_k: 1-based dictionary of indicators (0/1), telling whether
+        # the doc id retrieved at position k (assuming it is 1-based, so k starts
+        # from 1) is in the gold doc ids or not.
+        # For example, assuming that in the retrieved docs we have correct predictions
+        # at positions 2, 4 and 5 (1-based), the dict will look like:
+        # {1: 0, 2: 1, 3: 0, 4: 1, 5: 1, ...}
+        relevance_at_k = {
+            k + 1: 1 if doc_id in ref_ids else 0 for k, doc_id in enumerate(pred_ids)
+        }
+
+        # relevance_sum_at_k: 1-based dictionary of counts, where the value at k determines
+        # how many gold doc ids have been observed up to index k.
+        relevance_sum_at_k = {}
+        for k, value in relevance_at_k.items():
+            relevance_sum_at_k[k] = relevance_sum_at_k.get(k - 1, 0) + value
+
+        # precision_at_k: the precision of the top k retrieved documents. For example,
+        # assuming that only 1 out of the first 4 retrieved documents is correct, the
+        # value at 4 will be 1/4.
+        precision_at_k = {k: value / k for k, value in relevance_sum_at_k.items()}
+
+        # recall_at_k: the recall of the top k retrieved documents. For example,
+        # assuming that only 2 out of the 3 gold documents are in the top 5 results,
+        # the value at 5 will be 2/3.
+        n_refs = len(ref_ids)
+        recall_at_k = {
+            k: value / n_refs if n_refs > 0 else 0
+            for k, value in relevance_sum_at_k.items()
+        }
+
+        # rank - the 1-based index of the first hit of a gold doc id. So 1
+        # means first position.
+        rank = 0
+        for k, relevance in relevance_at_k.items():
+            if relevance == 1:
+                rank = k
+                break
+
+        # match_at_k: whether we have a match at the top k retrieved documents
+        match_at_k = {
+            k: 1.0 if value > 0 else 0.0 for k, value in relevance_sum_at_k.items()
+        }
+
+        return self._compute(
+            relevance_at_k,
+            relevance_sum_at_k,
+            precision_at_k,
+            recall_at_k,
+            match_at_k,
+            rank,
+        )
+
+    @abstractmethod
+    def _compute(
+        self,
+        relevance_at_k,
+        relevance_sum_at_k,
+        precision_at_k,
+        recall_at_k,
+        match_at_k,
+        rank,
+    ) -> dict:
+        pass
+
+
+class MRR(RetrievalMetric):
+    reduction_map = {"mean": ["mrr"]}
+    main_score = "mrr"
+
+    def _compute(
+        self,
+        relevance_at_k,
+        relevance_sum_at_k,
+        precision_at_k,
+        recall_at_k,
+        match_at_k,
+        rank,
+    ) -> dict:
+        return {self.main_score: 1 / rank if rank > 0 else 0}
+
+
+class MAP(RetrievalMetric):
+    reduction_map = {"mean": ["map"]}
+    main_score = "map"
+
+    def _compute(
+        self,
+        relevance_at_k,
+        relevance_sum_at_k,
+        precision_at_k,
+        recall_at_k,
+        match_at_k,
+        rank,
+    ) -> dict:
+        result = 0
+        if len(relevance_at_k) > 0:
+            total = sum(relevance_at_k.values())
+            if total > 0:
+                dot = sum(relevance_at_k[k] * precision_at_k[k] for k in relevance_at_k)
+                result = dot / total
+        return {self.main_score: result}
+
+
+class RetrievalAtK(RetrievalMetric):
+    k_list: List[int]
+    main_score: str = None
+    reduction_map: Dict[str, List[str]] = None
+
+    def prepare(self):
+        super().prepare()
+        self.main_score = self.score_name("match", self.k_list[0])
+        self.ci_scores = [
+            self.score_name(measure, k)
+            for measure in ["precision", "recall", "match"]
+            for k in self.k_list
+        ]
+        self.reduction_map = {"mean": self.ci_scores}
+
+    @staticmethod
+    def score_name(measure: str, k: int):
+        return f"{measure}_at_{k}"
+
+    def _compute(
+        self,
+        relevance_at_k,
+        relevance_sum_at_k,
+        precision_at_k,
+        recall_at_k,
+        match_at_k,
+        rank,
+    ) -> dict:
+        result = {}
+        for measure_array, measure_name in [
+            (precision_at_k, "precision"),
+            (recall_at_k, "recall"),
+            (match_at_k, "match"),
+        ]:
+            max_k = max(measure_array.keys())
+            for k in self.k_list:
+                result[self.score_name(measure_name, k)] = measure_array[min(k, max_k)]
+        return result
+
+
+class KPA(CustomF1):
+    def get_element_group(self, element, additional_input):
+        return additional_input["keypoint"]
+
+    def get_element_representation(self, element, additional_input):
+        return additional_input["keypoint"]
+
+    def should_ignore_element(self, element, additional_input):
+        return element == "none"