Upload folder using huggingface_hub
- api.py +1 -1
- inference.py +7 -4
- llm_as_judge.py +76 -50
- llm_as_judge_constants.py +22 -2
- metric_utils.py +20 -3
- metrics.py +60 -6
- operators.py +3 -1
- version.py +1 -1
api.py
CHANGED
@@ -310,7 +310,7 @@ def fill_metadata(**kwargs):
 
 
 def evaluate(
-    predictions,
+    predictions: Optional[List[str]] = None,
     dataset: Union[Dataset, IterableDataset] = None,
     data=None,
     calc_confidence_intervals: bool = True,
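
With this change, `predictions` becomes an optional keyword argument of `evaluate()`. A minimal usage sketch, assuming the rest of the signature behaves as before; the card name below is illustrative and not taken from this commit:

# Sketch: evaluate() with and without explicit predictions.
from unitxt import evaluate, load_dataset

dataset = load_dataset(card="cards.squad", split="test")  # illustrative card

# Passing predictions explicitly still works as before.
results = evaluate(predictions=["model output 1", "model output 2"], data=dataset)

# predictions is now Optional[List[str]] and may be omitted, e.g. when an
# LLM-as-judge metric reads the prediction from task_data instead.
results = evaluate(data=dataset)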
inference.py
CHANGED
@@ -281,7 +281,7 @@ class InferenceEngine(Artifact):
                 missing_examples.append(
                     (i, item)
                 )  # each element is index in batch and example
-            #
+            # infere on missing examples only, without indices
 
             logger.info(
                 f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})"
@@ -825,11 +825,14 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
         tools = []
         for instance in batch:
             sources.append(instance["source"])
-            if "task_data" in instance
+            if "task_data" in instance:
                 task_data = instance["task_data"]
                 if isinstance(task_data, str):
                     task_data = json.loads(task_data)
-
+                if "__tools__" in task_data:
+                    tools.append(task_data["__tools__"])
+                else:
+                    tools.append(None)
             else:
                 tools.append(None)
         # Tokenize inputs for the batch
@@ -3715,7 +3718,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         "bam": {"max_tokens": "max_new_tokens", "model": "model_name"},
         "watsonx-sdk": {"model": "model_name"},
         "rits": {"model": "model_name"},
-        "hf-local": {"model": "model_name"},
+        "hf-local": {"model": "model_name", "max_tokens": "max_new_tokens"},
     }
 
     def get_return_object(self, **kwargs):
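
The HFAutoModelInferenceEngine hunk above now collects per-instance tool definitions from `task_data["__tools__"]`. A self-contained sketch of the instance shape it consumes; the tool specification is illustrative and not taken from this commit:

# Sketch: extracting tools from an instance's task_data, mirroring the added logic.
import json

instance = {
    "source": "What is the weather in Paris?",
    "task_data": json.dumps(
        {
            "__tools__": [  # illustrative OpenAI-style tool spec
                {
                    "type": "function",
                    "function": {"name": "get_weather", "parameters": {"type": "object"}},
                }
            ]
        }
    ),
}

tools = []
if "task_data" in instance:
    task_data = instance["task_data"]
    if isinstance(task_data, str):
        task_data = json.loads(task_data)
    tools.append(task_data.get("__tools__"))  # None when no tools are present
else:
    tools.append(None)

print(tools[0][0]["function"]["name"])  # -> get_weather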
llm_as_judge.py
CHANGED
@@ -43,6 +43,7 @@ from .llm_as_judge_utils import (
     rank_indexes,
 )
 from .logging_utils import get_logger
+from .metric_utils import EmptyPrediction
 from .metrics import BulkInstanceMetric
 from .task import Task
 from .templates import Template
@@ -66,7 +67,7 @@ class LLMJudge(BulkInstanceMetric):
     """Flag to check for positional bias. Detecting for positional bias duplicates the amount of inference calls."""
 
     context_fields: Union[str, List[str], Dict[str, str]] = ["context"]
-    """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object."""
+    """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object (it is recommended to provide the context_fields in the Criteria `context_fields` field as this field will be deprecated in the future)."""
 
     generate_summaries: bool = False
     """Flag to generate summaries of the assessments. Defaults to `False`."""
@@ -78,20 +79,15 @@ class LLMJudge(BulkInstanceMetric):
     """Flag to include prompts in the result. Defaults to `True`."""
 
     criteria_field: str = None
-    """The field specifying the evaluation criteria in the `task_data` object."""
+    """The field specifying the evaluation criteria in the `task_data` object. If the `criteria` is provided, it will take precedence."""
 
     criteria: Criteria = None
-    """The criteria used for evaluation.
+    """The criteria used for evaluation."""
 
     def prepare(self):
         """Prepares the `LLMJudge` instance by setting up context fields and evaluator name."""
         super().prepare()
-
-            self.context_fields = [self.context_fields]
-        if isinstance(self.context_fields, List):
-            self.context_fields = {
-                context_field: context_field for context_field in self.context_fields
-            }
+        self.context_fields = self.get_context_fields_as_dict(self.context_fields)
 
         if self.evaluator_name is None:
             self.evaluator_name = self.inference_engine.get_engine_id()
@@ -112,24 +108,43 @@ class LLMJudge(BulkInstanceMetric):
         )
         return
 
-    def
+    def get_context_fields_as_dict(self, context_fields: Union[str, List, Dict]):
+        result = context_fields if context_fields else {}
+        if isinstance(result, str):
+            result = [result]
+        if isinstance(result, List):
+            result = {context_field: context_field for context_field in result}
+        return result
+
+    def get_contexts(
+        self, task_data: List[Dict[str, Any]], criteria: List[Criteria]
+    ) -> List[Dict[str, str]]:
         """Extracts and parses context fields from task data.
 
         Args:
            task_data (List[Dict[str, Any]]): The task data containing context information.
+            criteria ( List[Criteria]): The criteria list from which to take the context fields if they weren't provided in the self.context_fields field
 
        Returns:
            List[Dict[str, str]]: A list of parsed context dictionaries.
        """
-
-
-
-
-
-
+        parsed_contexts = []
+        for i, td in enumerate(task_data):
+            context_fields_for_td = self.context_fields
+            if not context_fields_for_td and criteria[i].context_fields:
+                context_fields_for_td = self.get_context_fields_as_dict(
+                    criteria[i].context_fields
+                )
+
+            parsed_contexts.append(
+                get_parsed_context(
+                    {
+                        context_field_name: dict_get(td, context_field)
+                        for context_field_name, context_field in context_fields_for_td.items()
+                    }
+                )
             )
-
-        ]
+        return parsed_contexts
 
     def perform_evaluation_step(
         self,
@@ -211,7 +226,7 @@ class LLMJudge(BulkInstanceMetric):
             logger.info(
                 f"Reading criteria from the task_data field '{self.criteria_field}'"
             )
-
+            criteria_list = [
                 fetch_artifact(task_data_instance[self.criteria_field])[0]
                 for task_data_instance in task_data
             ]
@@ -219,18 +234,11 @@ class LLMJudge(BulkInstanceMetric):
             logger.info(
                 "Reading criteria from self. Criteria is a single CriteriaWithOptions, replicating it for all predictions"
            )
-
-        unique_criteria_names = list({criteria.name for criteria in
+            criteria_list: List[Criteria] = [self.criteria] * eval_count
+        unique_criteria_names = list({criteria.name for criteria in criteria_list})
 
         logger.info(f"Criteria names are '{', '.join(unique_criteria_names)}'")
-        return
-
-    def update_eval_fields_from_criteria(self, criteria: List[Criteria]):
-        if not self.context_fields:
-            self.context_fields = {
-                context_field: context_field
-                for context_field in criteria[0].context_fields
-            }
+        return criteria_list
 
     def get_predictions(
         self,
@@ -238,11 +246,28 @@ class LLMJudge(BulkInstanceMetric):
         criteria: List[Criteria],
         predictions: List[str],
     ) -> List[str]:
-        if not predictions
-
-
-
-
+        if not predictions or all(
+            (
+                isinstance(prediction, EmptyPrediction)
+                or prediction == str(EmptyPrediction())
+            )
+            for prediction in predictions
+        ):
+            predictions_from_task_data = []
+            for i, td in enumerate(task_data):
+                if (
+                    criteria[i].prediction_field is not None
+                    and criteria[i].prediction_field in td
+                ):
+                    predictions_from_task_data.append(
+                        dict_get(td, criteria[i].prediction_field)
+                    )
+                else:
+                    raise UnitxtError(
+                        "You must set either the predictions in the evaluate() call or specify the prediction field name to be taken from the task_data using the `Criteria`'s prediction_field field."
+                    )
+            return predictions_from_task_data
+
         return predictions
 
 
@@ -540,26 +565,25 @@ class LLMJudgeDirect(LLMJudge):
 
         evaluations_count = len(task_data)
         # TODO: find out how to serialize and deserialize enums
-
-        self.
-
-        self.__set_main_score(
-        contexts = self.get_contexts(task_data)
+        criteria_list = self.get_criteria(task_data, evaluations_count)
+        predictions = self.get_predictions(task_data, criteria_list, predictions)
+        contexts = self.get_contexts(task_data, criteria_list)
+        self.__set_main_score(criteria_list)
         if self.check_positional_bias:
-
+            criteria_list += [
                 CriteriaWithOptions(
                     name=criteria.name,
                     description=criteria.description,
                    option_map=criteria.option_map,
                    options=list(reversed(criteria.options)),
                )
-                for criteria in
+                for criteria in criteria_list
            ]
            contexts += contexts
            predictions += predictions
 
        parsed_criterias = [
-            self.__get_parsed_criteria(criteria) for criteria in
+            self.__get_parsed_criteria(criteria) for criteria in criteria_list
        ]
 
        (
@@ -659,7 +683,7 @@ class LLMJudgeDirect(LLMJudge):
             option_selection_outputs,
             selections,
             evaluations_count,
-
+            criteria_list,
         )
 
         return self.clean_results(results)
@@ -1384,9 +1408,13 @@ class LLMJudgePairwise(LLMJudge):
         logger.info(
             f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}'
         )
+
+        instances_count = len(predictions)
+        criteria_list = self.get_criteria(task_data, instances_count)
+        contexts = self.get_contexts(task_data, criteria_list)
+        predictions = self.get_predictions(task_data, criteria_list, predictions)
         predictions = self.__convert_predictions_to_dicts(predictions)
         self.__set_main_score(predictions)
-        instances_count = len(predictions)
         self.reduction_map = {"mean": ["score"]}
         self.reduction_map["mean"].extend(
             [f"{key}_winrate" for key in predictions[0].keys()]
@@ -1432,10 +1460,8 @@ class LLMJudgePairwise(LLMJudge):
             response_pairs_list.append(response_pairs)
             option_pairs_list.append(option_pairs)
 
-        criterias = self.get_criteria(task_data, instances_count)
-        contexts = self.get_contexts(task_data)
         if self.check_positional_bias:
-
+            criteria_list.extend(criteria_list)
             contexts.extend(contexts)
             for response_pairs, option_pairs in zip(
                 response_pairs_list, option_pairs_list
@@ -1454,8 +1480,8 @@ class LLMJudgePairwise(LLMJudge):
                 "response_b": response_pair[1],
                 "option_a": option_pair[0],
                 "option_b": option_pair[1],
-                "criteria_name":
-                "criteria_description":
+                "criteria_name": criteria_list[i].name,
+                "criteria_description": criteria_list[i].description,
                 "data_classification_policy": ["public"],
             }
             for i, (response_pairs, option_pairs) in enumerate(
@@ -1592,7 +1618,7 @@ class LLMJudgePairwise(LLMJudge):
                 selections[sli],
                 contests_count_list[i],
                 combination_indexes_list[i],
-
+                criteria_list[i],
             )
             results.append(instance_results)
             slice_start = slice_end
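
Taken together, these hunks let the judge read both the context fields and the prediction from `task_data`, driven by the `Criteria` object. A hedged configuration sketch; the criteria content, model name, and provider are assumptions, and only the class and field names come from this diff:

# Sketch: a direct judge whose criteria names the prediction and context fields.
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import CriteriaOption, CriteriaWithOptions

criteria = CriteriaWithOptions(
    name="answer_relevance",
    description="Is the answer relevant to the user question?",
    prediction_field="answer",          # judge reads task_data["answer"] when evaluate() gets no predictions
    context_fields=["user question"],   # used when LLMJudge.context_fields is not set
    options=[
        CriteriaOption(name="Yes", description="The answer addresses the question."),
        CriteriaOption(name="No", description="The answer does not address the question."),
    ],
    option_map={"Yes": 1.0, "No": 0.0},
)

judge = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(
        model="llama-3-3-70b-instruct", provider="watsonx"  # illustrative engine choice
    ),
    criteria=criteria,
)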
llm_as_judge_constants.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
 from .artifact import Artifact
 
@@ -11,15 +11,29 @@ class OptionSelectionStrategyEnum(str, Enum):
 
 
 class CriteriaOption(Artifact):
+    """A criteria option."""
+
     name: str
+    """The name of the criteria option"""
+
     description: str
+    """The description of the criteria option"""
 
 
 class Criteria(Artifact):
+    """Criteria used by PairwiseLLMJudge to run evaluations."""
+
     name: str
+    """The name of the crieria"""
+
     description: str
+    """The description of the crieria"""
+
     prediction_field: Optional[str] = None
-
+    """The prediction field name this criteria expects and refers to, e.g. answer/model response/summary"""
+
+    context_fields: Union[str, List[str], Dict[str, str]] = None
+    """The context field names this criteria expects, i.e. [context]/[source article, user questions]"""
 
     @staticmethod
     def from_jsons(s: str):
@@ -36,8 +50,13 @@ class Criteria(Artifact):
 
 
 class CriteriaWithOptions(Criteria):
+    """Criteria used by DirectLLMJudge to run evaluations."""
+
     options: List[CriteriaOption]
+    """The options that the judge can choose between"""
+
     option_map: Optional[Dict[str, float]] = None
+    """A mapping from the option names to numerical values to use as scores"""
 
     @staticmethod
     def from_jsons(s: str):
@@ -1262,6 +1281,7 @@ class DirectCriteriaCatalogEnum(Enum):
     COMPLIANCE_ASSISTANT_MESSAGE = CriteriaWithOptions(
         name="assistant_message_compliance",
         description="The Assistant message complies with the User message.",
+        context_fields=["user message"],
         prediction_field="assistant message",
         options=[
             CriteriaOption(
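
For pairwise judges, the plain `Criteria` class gains the same `context_fields` field. A brief sketch with field values that are illustrative rather than taken from the catalog:

# Sketch: a pairwise criteria carrying its own context and prediction field names.
from unitxt.llm_as_judge_constants import Criteria

consistency = Criteria(
    name="factual_consistency",
    description="Which response is more factually consistent with the source article?",
    prediction_field="response",
    context_fields=["source article", "user question"],
)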
metric_utils.py
CHANGED
@@ -49,6 +49,19 @@ def nan_mean(scores):
     return result
 
 
+class EmptyPrediction:
+    def __repr__(self):
+        return "<__empty_prediction__>"
+
+    def __str__(self):
+        return "<__empty_prediction__>"
+
+
+def empty_predictions_generator():
+    while True:
+        yield EmptyPrediction()
+
+
 class FromPredictionsAndOriginalData(StreamInitializerOperator):
     def zip(self, predictions, references):
         for prediction, original in zip(predictions, references):
@@ -61,10 +74,13 @@ class FromPredictionsAndOriginalData(StreamInitializerOperator):
 
     def process(
         self,
-        predictions: List[str],
-        references: Iterable,
+        predictions: Optional[List[str]] = None,
+        references: Optional[Iterable] = None,
         split_name: str = DEFAULT_STREAM_NAME,
     ) -> MultiStream:
+        if predictions is None:
+            predictions = empty_predictions_generator()
+
         return MultiStream(
             {
                 split_name: DynamicStream(
@@ -86,7 +102,8 @@ class DeleteTargetPrefix(InstanceOperator, ArtifactFetcherMixin):
         if target_prefix is not None and len(target_prefix) > 0:
             target_prefix = target_prefix.format(**instance["task_data"])
             pattern = rf"^\s*{re.escape(target_prefix)}\s*"
-            instance["prediction"]
+            if isinstance(instance["prediction"], str):
+                instance["prediction"] = re.sub(pattern, "", instance["prediction"])
         return instance
 
 
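
`EmptyPrediction` is the placeholder zipped with each instance when `evaluate()` is called without predictions; judges detect it by type or by its string form. A standalone sketch (the class body is copied from the hunk above so the snippet runs without unitxt):

# Sketch: behavior of the EmptyPrediction sentinel.
from itertools import islice


class EmptyPrediction:
    def __repr__(self):
        return "<__empty_prediction__>"

    def __str__(self):
        return "<__empty_prediction__>"


def empty_predictions_generator():
    while True:
        yield EmptyPrediction()


placeholders = list(islice(empty_predictions_generator(), 3))
print(placeholders)                                      # [<__empty_prediction__>, ...]
print(str(placeholders[0]) == "<__empty_prediction__>")  # True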
metrics.py
CHANGED
@@ -6146,12 +6146,16 @@ class NormalizedSacrebleu(HuggingfaceMetric):
 
 
 class CustomF1Fuzzy(CustomF1):
-
-
+    min_score_for_match: float
+
+    @abstractmethod
+    def score(self, val1, val2) -> float:
+        pass
 
+    def calculate_groups_ratio(self, actual_group, total_group):
         tmp = []
         for actual_key in actual_group.keys():
-            max_score = self.
+            max_score = self.min_score_for_match
             best_total_key = None
 
             for total_key in total_group.keys():
@@ -6159,8 +6163,8 @@ class CustomF1Fuzzy(CustomF1):
                 tup_to = ast.literal_eval(total_key)
 
                 if tup_ac[1] == tup_to[1]:
-                    score =
-                    if score
+                    score = self.score(tup_ac[0], tup_to[0])
+                    if score >= max_score:
                         max_score = score
                         best_total_key = total_key
 
@@ -6173,7 +6177,57 @@
 
 class FuzzyNer(CustomF1Fuzzy):
     prediction_type = List[Tuple[str, str]]
-
+    min_score_for_match = 0.750001  # Used to be > 0.75, and now changed to >= 0.750001
+
+    def score(self, val1, val2):
+        from fuzzywuzzy import fuzz
+
+        return fuzz.ratio(val1, val2) / 100.0
+
+    def get_element_group(self, element, additional_input):
+        return element[1]
+
+    def get_element_representation(self, element, additional_input):
+        return str(element)
+
+
+class MetricBasedNer(CustomF1Fuzzy):
+    """Calculates f1 metrics for NER , by comparing entity using a provided Unitxt metric.
+
+    While the Ner metric uses exact match to compare entities and FuzzyNer uses fuzzy matching,
+    this customiziable metric can use any Unitxt metric to compare entities, including LLM as Judge.
+    The metric must acceptstring prediction and references as input. The similarity threshold is
+    set by the 'min_score_for_match' attribute.
+
+    Example:
+        MetricBasedNer(metric=Rouge(), min_score_for_match=0.9)
+
+        MetricBasedNer(metric="metrics.llm_as_judge.direct.watsonx.llama3_3_70b[criteria=metrics.llm_as_judge.direct.criteria.correctness_based_on_ground_truth,context_fields=ground_truth]")
+    """
+
+    prediction_type = List[Tuple[str, str]]
+    metric: Metric
+    min_score_for_match = 0.75
+
+    def score(self, val1, val2):
+        multi_stream = MultiStream.from_iterables(
+            {
+                "test": [
+                    {
+                        "prediction": val1,
+                        "references": [val2],
+                        "task_data": {
+                            "ground_truth": val2,
+                            "reference": val2,
+                        },
+                    }
+                ]
+            }
+        )
+        output_multi_stream = self.metric(multi_stream)
+        output_stream = output_multi_stream["test"]
+        result = next(iter(output_stream))
+        return result["score"]["global"]["score"]
 
     def get_element_group(self, element, additional_input):
         return element[1]
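
FuzzyNer now counts a pair of same-type entities as a match when `fuzz.ratio / 100` reaches `min_score_for_match` (>= 0.750001). A small sketch of that comparison; the entity strings are illustrative:

# Sketch: the fuzzy-matching threshold used by FuzzyNer.score().
from fuzzywuzzy import fuzz

min_score_for_match = 0.750001  # applied with >= in calculate_groups_ratio

predicted = ("Jon Smith", "PER")
reference = ("John Smith", "PER")

# Entities are only compared when their types (element[1]) agree.
if predicted[1] == reference[1]:
    score = fuzz.ratio(predicted[0], reference[0]) / 100.0
    print(score, score >= min_score_for_match)  # e.g. 0.95 True -> counted as a match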
operators.py
CHANGED
@@ -536,7 +536,9 @@ class InstanceFieldOperator(InstanceOperator):
                 continue
             old_value = self.get_default
 
-            with error_context(
+            with error_context(
+                self, field=from_field, action="Process Field", value=old_value
+            ):
                 if self.process_every_value:
                     new_value = [
                         self.process_instance_value(value, instance)
version.py
CHANGED
@@ -1 +1 @@
-version = "1.26.
+version = "1.26.3"
|