Upload folder using huggingface_hub
- api.py +1 -1
- inference.py +7 -4
- llm_as_judge.py +76 -50
- llm_as_judge_constants.py +22 -2
- metric_utils.py +20 -3
- metrics.py +60 -6
- operators.py +3 -1
- version.py +1 -1
api.py
CHANGED
@@ -310,7 +310,7 @@ def fill_metadata(**kwargs):
 
 
 def evaluate(
-    predictions,
+    predictions: Optional[List[str]] = None,
     dataset: Union[Dataset, IterableDataset] = None,
     data=None,
     calc_confidence_intervals: bool = True,
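
With this change, `predictions` becomes an optional keyword argument of `evaluate()`. A minimal usage sketch, assuming the rest of the signature behaves as before; the card name below is illustrative and not taken from this commit:

# Sketch: evaluate() with and without explicit predictions.
from unitxt import evaluate, load_dataset

dataset = load_dataset(card="cards.squad", split="test")  # illustrative card

# Passing predictions explicitly still works as before.
results = evaluate(predictions=["model output 1", "model output 2"], data=dataset)

# predictions is now Optional[List[str]] and may be omitted, e.g. when an
# LLM-as-judge metric reads the prediction from task_data instead.
results = evaluate(data=dataset)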
inference.py
CHANGED
@@ -281,7 +281,7 @@ class InferenceEngine(Artifact):
                 missing_examples.append(
                     (i, item)
                 )  # each element is index in batch and example
-            #
+            # infere on missing examples only, without indices
 
             logger.info(
                 f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})"
@@ -825,11 +825,14 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
         tools = []
         for instance in batch:
             sources.append(instance["source"])
-            if "task_data" in instance
+            if "task_data" in instance:
                 task_data = instance["task_data"]
                 if isinstance(task_data, str):
                     task_data = json.loads(task_data)
-
+                if "__tools__" in task_data:
+                    tools.append(task_data["__tools__"])
+                else:
+                    tools.append(None)
             else:
                 tools.append(None)
         # Tokenize inputs for the batch
@@ -3715,7 +3718,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         "bam": {"max_tokens": "max_new_tokens", "model": "model_name"},
         "watsonx-sdk": {"model": "model_name"},
         "rits": {"model": "model_name"},
-        "hf-local": {"model": "model_name"},
+        "hf-local": {"model": "model_name", "max_tokens": "max_new_tokens"},
     }
 
     def get_return_object(self, **kwargs):
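
The HFAutoModelInferenceEngine hunk above now collects per-instance tool definitions from `task_data["__tools__"]`. A self-contained sketch of the instance shape it consumes; the tool specification is illustrative and not taken from this commit:

# Sketch: extracting tools from an instance's task_data, mirroring the added logic.
import json

instance = {
    "source": "What is the weather in Paris?",
    "task_data": json.dumps(
        {
            "__tools__": [  # illustrative OpenAI-style tool spec
                {
                    "type": "function",
                    "function": {"name": "get_weather", "parameters": {"type": "object"}},
                }
            ]
        }
    ),
}

tools = []
if "task_data" in instance:
    task_data = instance["task_data"]
    if isinstance(task_data, str):
        task_data = json.loads(task_data)
    tools.append(task_data.get("__tools__"))  # None when no tools are present
else:
    tools.append(None)

print(tools[0][0]["function"]["name"])  # -> get_weather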
llm_as_judge.py
CHANGED
@@ -43,6 +43,7 @@ from .llm_as_judge_utils import (
     rank_indexes,
 )
 from .logging_utils import get_logger
+from .metric_utils import EmptyPrediction
 from .metrics import BulkInstanceMetric
 from .task import Task
 from .templates import Template
@@ -66,7 +67,7 @@ class LLMJudge(BulkInstanceMetric):
     """Flag to check for positional bias. Detecting for positional bias duplicates the amount of inference calls."""
 
     context_fields: Union[str, List[str], Dict[str, str]] = ["context"]
-    """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object."""
+    """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object (it is recommended to provide the context_fields in the Criteria `context_fields` field as this field will be deprecated in the future)."""
 
     generate_summaries: bool = False
     """Flag to generate summaries of the assessments. Defaults to `False`."""
@@ -78,20 +79,15 @@ class LLMJudge(BulkInstanceMetric):
     """Flag to include prompts in the result. Defaults to `True`."""
 
     criteria_field: str = None
-    """The field specifying the evaluation criteria in the `task_data` object."""
+    """The field specifying the evaluation criteria in the `task_data` object. If the `criteria` is provided, it will take precedence."""
 
     criteria: Criteria = None
-    """The criteria used for evaluation.
+    """The criteria used for evaluation."""
 
     def prepare(self):
         """Prepares the `LLMJudge` instance by setting up context fields and evaluator name."""
         super().prepare()
-
-            self.context_fields = [self.context_fields]
-        if isinstance(self.context_fields, List):
-            self.context_fields = {
-                context_field: context_field for context_field in self.context_fields
-            }
+        self.context_fields = self.get_context_fields_as_dict(self.context_fields)
 
         if self.evaluator_name is None:
             self.evaluator_name = self.inference_engine.get_engine_id()
@@ -112,24 +108,43 @@ class LLMJudge(BulkInstanceMetric):
         )
         return
 
-    def
+    def get_context_fields_as_dict(self, context_fields: Union[str, List, Dict]):
+        result = context_fields if context_fields else {}
+        if isinstance(result, str):
+            result = [result]
+        if isinstance(result, List):
+            result = {context_field: context_field for context_field in result}
+        return result
+
+    def get_contexts(
+        self, task_data: List[Dict[str, Any]], criteria: List[Criteria]
+    ) -> List[Dict[str, str]]:
         """Extracts and parses context fields from task data.
 
         Args:
            task_data (List[Dict[str, Any]]): The task data containing context information.
+            criteria ( List[Criteria]): The criteria list from which to take the context fields if they weren't provided in the self.context_fields field
 
        Returns:
            List[Dict[str, str]]: A list of parsed context dictionaries.
        """
-
-
-
-
-
-
+        parsed_contexts = []
+        for i, td in enumerate(task_data):
+            context_fields_for_td = self.context_fields
+            if not context_fields_for_td and criteria[i].context_fields:
+                context_fields_for_td = self.get_context_fields_as_dict(
+                    criteria[i].context_fields
+                )
+
+            parsed_contexts.append(
+                get_parsed_context(
+                    {
+                        context_field_name: dict_get(td, context_field)
+                        for context_field_name, context_field in context_fields_for_td.items()
+                    }
+                )
             )
-
-        ]
+        return parsed_contexts
 
     def perform_evaluation_step(
         self,
@@ -211,7 +226,7 @@ class LLMJudge(BulkInstanceMetric):
             logger.info(
                 f"Reading criteria from the task_data field '{self.criteria_field}'"
             )
-
+            criteria_list = [
                 fetch_artifact(task_data_instance[self.criteria_field])[0]
                 for task_data_instance in task_data
             ]
@@ -219,18 +234,11 @@ class LLMJudge(BulkInstanceMetric):
             logger.info(
                 "Reading criteria from self. Criteria is a single CriteriaWithOptions, replicating it for all predictions"
            )
-
-        unique_criteria_names = list({criteria.name for criteria in
+            criteria_list: List[Criteria] = [self.criteria] * eval_count
+        unique_criteria_names = list({criteria.name for criteria in criteria_list})
 
         logger.info(f"Criteria names are '{', '.join(unique_criteria_names)}'")
-        return
-
-    def update_eval_fields_from_criteria(self, criteria: List[Criteria]):
-        if not self.context_fields:
-            self.context_fields = {
-                context_field: context_field
-                for context_field in criteria[0].context_fields
-            }
+        return criteria_list
 
     def get_predictions(
         self,
@@ -238,11 +246,28 @@ class LLMJudge(BulkInstanceMetric):
         criteria: List[Criteria],
         predictions: List[str],
     ) -> List[str]:
-        if not predictions
-
-
-
-
+        if not predictions or all(
+            (
+                isinstance(prediction, EmptyPrediction)
+                or prediction == str(EmptyPrediction())
+            )
+            for prediction in predictions
+        ):
+            predictions_from_task_data = []
+            for i, td in enumerate(task_data):
+                if (
+                    criteria[i].prediction_field is not None
+                    and criteria[i].prediction_field in td
+                ):
+                    predictions_from_task_data.append(
+                        dict_get(td, criteria[i].prediction_field)
+                    )
+                else:
+                    raise UnitxtError(
+                        "You must set either the predictions in the evaluate() call or specify the prediction field name to be taken from the task_data using the `Criteria`'s prediction_field field."
+                    )
+            return predictions_from_task_data
+
         return predictions
 
 
@@ -540,26 +565,25 @@ class LLMJudgeDirect(LLMJudge):
 
         evaluations_count = len(task_data)
         # TODO: find out how to serialize and deserialize enums
-
-        self.
-
-        self.__set_main_score(
-        contexts = self.get_contexts(task_data)
+        criteria_list = self.get_criteria(task_data, evaluations_count)
+        predictions = self.get_predictions(task_data, criteria_list, predictions)
+        contexts = self.get_contexts(task_data, criteria_list)
+        self.__set_main_score(criteria_list)
         if self.check_positional_bias:
-
+            criteria_list += [
                 CriteriaWithOptions(
                     name=criteria.name,
                     description=criteria.description,
                    option_map=criteria.option_map,
                    options=list(reversed(criteria.options)),
                )
-                for criteria in
+                for criteria in criteria_list
            ]
            contexts += contexts
            predictions += predictions
 
        parsed_criterias = [
-            self.__get_parsed_criteria(criteria) for criteria in
+            self.__get_parsed_criteria(criteria) for criteria in criteria_list
        ]
 
        (
@@ -659,7 +683,7 @@ class LLMJudgeDirect(LLMJudge):
             option_selection_outputs,
             selections,
             evaluations_count,
-
+            criteria_list,
         )
 
         return self.clean_results(results)
@@ -1384,9 +1408,13 @@ class LLMJudgePairwise(LLMJudge):
         logger.info(
             f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}'
         )
+
+        instances_count = len(predictions)
+        criteria_list = self.get_criteria(task_data, instances_count)
+        contexts = self.get_contexts(task_data, criteria_list)
+        predictions = self.get_predictions(task_data, criteria_list, predictions)
         predictions = self.__convert_predictions_to_dicts(predictions)
         self.__set_main_score(predictions)
-        instances_count = len(predictions)
         self.reduction_map = {"mean": ["score"]}
         self.reduction_map["mean"].extend(
             [f"{key}_winrate" for key in predictions[0].keys()]
@@ -1432,10 +1460,8 @@ class LLMJudgePairwise(LLMJudge):
             response_pairs_list.append(response_pairs)
             option_pairs_list.append(option_pairs)
 
-        criterias = self.get_criteria(task_data, instances_count)
-        contexts = self.get_contexts(task_data)
         if self.check_positional_bias:
-
+            criteria_list.extend(criteria_list)
             contexts.extend(contexts)
             for response_pairs, option_pairs in zip(
                 response_pairs_list, option_pairs_list
@@ -1454,8 +1480,8 @@ class LLMJudgePairwise(LLMJudge):
                 "response_b": response_pair[1],
                 "option_a": option_pair[0],
                 "option_b": option_pair[1],
-                "criteria_name":
-                "criteria_description":
+                "criteria_name": criteria_list[i].name,
+                "criteria_description": criteria_list[i].description,
                 "data_classification_policy": ["public"],
             }
             for i, (response_pairs, option_pairs) in enumerate(
@@ -1592,7 +1618,7 @@ class LLMJudgePairwise(LLMJudge):
                 selections[sli],
                 contests_count_list[i],
                 combination_indexes_list[i],
-
+                criteria_list[i],
             )
             results.append(instance_results)
             slice_start = slice_end
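
Taken together, these hunks let the judge read both the context fields and the prediction from `task_data`, driven by the `Criteria` object. A hedged configuration sketch; the criteria content, model name, and provider are assumptions, and only the class and field names come from this diff:

# Sketch: a direct judge whose criteria names the prediction and context fields.
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import CriteriaOption, CriteriaWithOptions

criteria = CriteriaWithOptions(
    name="answer_relevance",
    description="Is the answer relevant to the user question?",
    prediction_field="answer",          # judge reads task_data["answer"] when evaluate() gets no predictions
    context_fields=["user question"],   # used when LLMJudge.context_fields is not set
    options=[
        CriteriaOption(name="Yes", description="The answer addresses the question."),
        CriteriaOption(name="No", description="The answer does not address the question."),
    ],
    option_map={"Yes": 1.0, "No": 0.0},
)

judge = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(
        model="llama-3-3-70b-instruct", provider="watsonx"  # illustrative engine choice
    ),
    criteria=criteria,
)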
llm_as_judge_constants.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
 from .artifact import Artifact
 
@@ -11,15 +11,29 @@ class OptionSelectionStrategyEnum(str, Enum):
 
 
 class CriteriaOption(Artifact):
+    """A criteria option."""
+
     name: str
+    """The name of the criteria option"""
+
     description: str
+    """The description of the criteria option"""
 
 
 class Criteria(Artifact):
+    """Criteria used by PairwiseLLMJudge to run evaluations."""
+
     name: str
+    """The name of the crieria"""
+
     description: str
+    """The description of the crieria"""
+
     prediction_field: Optional[str] = None
-
+    """The prediction field name this criteria expects and refers to, e.g. answer/model response/summary"""
+
+    context_fields: Union[str, List[str], Dict[str, str]] = None
+    """The context field names this criteria expects, i.e. [context]/[source article, user questions]"""
 
     @staticmethod
     def from_jsons(s: str):
@@ -36,8 +50,13 @@ class Criteria(Artifact):
 
 
 class CriteriaWithOptions(Criteria):
+    """Criteria used by DirectLLMJudge to run evaluations."""
+
     options: List[CriteriaOption]
+    """The options that the judge can choose between"""
+
     option_map: Optional[Dict[str, float]] = None
+    """A mapping from the option names to numerical values to use as scores"""
 
     @staticmethod
     def from_jsons(s: str):
@@ -1262,6 +1281,7 @@ class DirectCriteriaCatalogEnum(Enum):
     COMPLIANCE_ASSISTANT_MESSAGE = CriteriaWithOptions(
         name="assistant_message_compliance",
         description="The Assistant message complies with the User message.",
+        context_fields=["user message"],
         prediction_field="assistant message",
         options=[
             CriteriaOption(
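
For pairwise judges, the plain `Criteria` class gains the same `context_fields` field. A brief sketch with field values that are illustrative rather than taken from the catalog:

# Sketch: a pairwise criteria carrying its own context and prediction field names.
from unitxt.llm_as_judge_constants import Criteria

consistency = Criteria(
    name="factual_consistency",
    description="Which response is more factually consistent with the source article?",
    prediction_field="response",
    context_fields=["source article", "user question"],
)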
metric_utils.py
CHANGED
@@ -49,6 +49,19 @@ def nan_mean(scores):
     return result
 
 
+class EmptyPrediction:
+    def __repr__(self):
+        return "<__empty_prediction__>"
+
+    def __str__(self):
+        return "<__empty_prediction__>"
+
+
+def empty_predictions_generator():
+    while True:
+        yield EmptyPrediction()
+
+
 class FromPredictionsAndOriginalData(StreamInitializerOperator):
     def zip(self, predictions, references):
         for prediction, original in zip(predictions, references):
@@ -61,10 +74,13 @@ class FromPredictionsAndOriginalData(StreamInitializerOperator):
 
     def process(
         self,
-        predictions: List[str],
-        references: Iterable,
+        predictions: Optional[List[str]] = None,
+        references: Optional[Iterable] = None,
         split_name: str = DEFAULT_STREAM_NAME,
     ) -> MultiStream:
+        if predictions is None:
+            predictions = empty_predictions_generator()
+
         return MultiStream(
             {
                 split_name: DynamicStream(
@@ -86,7 +102,8 @@ class DeleteTargetPrefix(InstanceOperator, ArtifactFetcherMixin):
         if target_prefix is not None and len(target_prefix) > 0:
             target_prefix = target_prefix.format(**instance["task_data"])
             pattern = rf"^\s*{re.escape(target_prefix)}\s*"
-            instance["prediction"]
+            if isinstance(instance["prediction"], str):
+                instance["prediction"] = re.sub(pattern, "", instance["prediction"])
         return instance
 
 
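
`EmptyPrediction` is the placeholder zipped with each instance when `evaluate()` is called without predictions; judges detect it by type or by its string form. A standalone sketch (the class body is copied from the hunk above so the snippet runs without unitxt):

# Sketch: behavior of the EmptyPrediction sentinel.
from itertools import islice


class EmptyPrediction:
    def __repr__(self):
        return "<__empty_prediction__>"

    def __str__(self):
        return "<__empty_prediction__>"


def empty_predictions_generator():
    while True:
        yield EmptyPrediction()


placeholders = list(islice(empty_predictions_generator(), 3))
print(placeholders)                                      # [<__empty_prediction__>, ...]
print(str(placeholders[0]) == "<__empty_prediction__>")  # True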
metrics.py
CHANGED
@@ -6146,12 +6146,16 @@ class NormalizedSacrebleu(HuggingfaceMetric):
 
 
 class CustomF1Fuzzy(CustomF1):
-
-
+    min_score_for_match: float
+
+    @abstractmethod
+    def score(self, val1, val2) -> float:
+        pass
 
+    def calculate_groups_ratio(self, actual_group, total_group):
         tmp = []
         for actual_key in actual_group.keys():
-            max_score = self.
+            max_score = self.min_score_for_match
             best_total_key = None
 
             for total_key in total_group.keys():
@@ -6159,8 +6163,8 @@ class CustomF1Fuzzy(CustomF1):
                 tup_to = ast.literal_eval(total_key)
 
                 if tup_ac[1] == tup_to[1]:
-                    score =
-                    if score
+                    score = self.score(tup_ac[0], tup_to[0])
+                    if score >= max_score:
                         max_score = score
                         best_total_key = total_key
 
@@ -6173,7 +6177,57 @@
 
 class FuzzyNer(CustomF1Fuzzy):
     prediction_type = List[Tuple[str, str]]
-
+    min_score_for_match = 0.750001  # Used to be > 0.75, and now changed to >= 0.750001
+
+    def score(self, val1, val2):
+        from fuzzywuzzy import fuzz
+
+        return fuzz.ratio(val1, val2) / 100.0
+
+    def get_element_group(self, element, additional_input):
+        return element[1]
+
+    def get_element_representation(self, element, additional_input):
+        return str(element)
+
+
+class MetricBasedNer(CustomF1Fuzzy):
+    """Calculates f1 metrics for NER , by comparing entity using a provided Unitxt metric.
+
+    While the Ner metric uses exact match to compare entities and FuzzyNer uses fuzzy matching,
+    this customiziable metric can use any Unitxt metric to compare entities, including LLM as Judge.
+    The metric must acceptstring prediction and references as input. The similarity threshold is
+    set by the 'min_score_for_match' attribute.
+
+    Example:
+        MetricBasedNer(metric=Rouge(), min_score_for_match=0.9)
+
+        MetricBasedNer(metric="metrics.llm_as_judge.direct.watsonx.llama3_3_70b[criteria=metrics.llm_as_judge.direct.criteria.correctness_based_on_ground_truth,context_fields=ground_truth]")
+    """
+
+    prediction_type = List[Tuple[str, str]]
+    metric: Metric
+    min_score_for_match = 0.75
+
+    def score(self, val1, val2):
+        multi_stream = MultiStream.from_iterables(
+            {
+                "test": [
+                    {
+                        "prediction": val1,
+                        "references": [val2],
+                        "task_data": {
+                            "ground_truth": val2,
+                            "reference": val2,
+                        },
+                    }
+                ]
+            }
+        )
+        output_multi_stream = self.metric(multi_stream)
+        output_stream = output_multi_stream["test"]
+        result = next(iter(output_stream))
+        return result["score"]["global"]["score"]
 
     def get_element_group(self, element, additional_input):
         return element[1]
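
FuzzyNer now counts a pair of same-type entities as a match when `fuzz.ratio / 100` reaches `min_score_for_match` (>= 0.750001). A small sketch of that comparison; the entity strings are illustrative:

# Sketch: the fuzzy-matching threshold used by FuzzyNer.score().
from fuzzywuzzy import fuzz

min_score_for_match = 0.750001  # applied with >= in calculate_groups_ratio

predicted = ("Jon Smith", "PER")
reference = ("John Smith", "PER")

# Entities are only compared when their types (element[1]) agree.
if predicted[1] == reference[1]:
    score = fuzz.ratio(predicted[0], reference[0]) / 100.0
    print(score, score >= min_score_for_match)  # e.g. 0.95 True -> counted as a match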
operators.py
CHANGED
@@ -536,7 +536,9 @@ class InstanceFieldOperator(InstanceOperator):
                 continue
             old_value = self.get_default
 
-            with error_context(
+            with error_context(
+                self, field=from_field, action="Process Field", value=old_value
+            ):
                 if self.process_every_value:
                     new_value = [
                         self.process_instance_value(value, instance)
version.py
CHANGED
@@ -1 +1 @@
-version = "1.26.
+version = "1.26.3"
|