Elron committed on
Commit 03d14e5 · verified · 1 Parent(s): b66f73e

Upload folder using huggingface_hub

Files changed (8)
  1. api.py +1 -1
  2. inference.py +7 -4
  3. llm_as_judge.py +76 -50
  4. llm_as_judge_constants.py +22 -2
  5. metric_utils.py +20 -3
  6. metrics.py +60 -6
  7. operators.py +3 -1
  8. version.py +1 -1
api.py CHANGED
@@ -310,7 +310,7 @@ def fill_metadata(**kwargs):
 
 
 def evaluate(
-    predictions,
+    predictions: Optional[List[str]] = None,
     dataset: Union[Dataset, IterableDataset] = None,
     data=None,
    calc_confidence_intervals: bool = True,
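
Note: with `predictions` now optional, `evaluate` is intended to be callable on data alone. A minimal sketch of that call pattern (the card name is a placeholder and not part of this commit; prediction resolution then relies on the llm_as_judge.py and metric_utils.py changes below):

    # Sketch only: `predictions` may now be omitted.
    from unitxt import evaluate, load_dataset

    dataset = load_dataset(card="cards.example_card", split="test")  # placeholder card

    # Before this change, `predictions` was a required argument. Now it defaults to
    # None; metric_utils.py substitutes EmptyPrediction placeholders, and judge
    # metrics can read the prediction from task_data via Criteria.prediction_field.
    results = evaluate(data=dataset)
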
inference.py CHANGED
@@ -281,7 +281,7 @@ class InferenceEngine(Artifact):
                 missing_examples.append(
                     (i, item)
                 )  # each element is index in batch and example
-            # infare on missing examples only, without indices
+            # infere on missing examples only, without indices
 
             logger.info(
                 f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})"
@@ -825,11 +825,14 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
         tools = []
         for instance in batch:
             sources.append(instance["source"])
-            if "task_data" in instance and "__tools__" in instance["task_data"]:
+            if "task_data" in instance:
                 task_data = instance["task_data"]
                 if isinstance(task_data, str):
                     task_data = json.loads(task_data)
-                tools.append(task_data["__tools__"])
+                if "__tools__" in task_data:
+                    tools.append(task_data["__tools__"])
+                else:
+                    tools.append(None)
             else:
                 tools.append(None)
         # Tokenize inputs for the batch
@@ -3715,7 +3718,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         "bam": {"max_tokens": "max_new_tokens", "model": "model_name"},
         "watsonx-sdk": {"model": "model_name"},
         "rits": {"model": "model_name"},
-        "hf-local": {"model": "model_name"},
+        "hf-local": {"model": "model_name", "max_tokens": "max_new_tokens"},
    }
 
    def get_return_object(self, **kwargs):
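
Besides parsing `task_data` before looking up `__tools__`, the new "hf-local" entry mirrors the existing "bam" mapping, so a cross-provider request can carry `max_tokens` and have it forwarded to the local HF engine as `max_new_tokens`. A rough sketch of the intended usage (the model id is a placeholder; only the class and the args map above come from this commit):

    from unitxt.inference import CrossProviderInferenceEngine

    engine = CrossProviderInferenceEngine(
        model="granite-3-8b-instruct",  # placeholder model name
        provider="hf-local",
        max_tokens=256,  # translated to max_new_tokens for the local HF engine
    )
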
llm_as_judge.py CHANGED
@@ -43,6 +43,7 @@ from .llm_as_judge_utils import (
     rank_indexes,
 )
 from .logging_utils import get_logger
+from .metric_utils import EmptyPrediction
 from .metrics import BulkInstanceMetric
 from .task import Task
 from .templates import Template
@@ -66,7 +67,7 @@ class LLMJudge(BulkInstanceMetric):
     """Flag to check for positional bias. Detecting for positional bias duplicates the amount of inference calls."""
 
     context_fields: Union[str, List[str], Dict[str, str]] = ["context"]
-    """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object."""
+    """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object (it is recommended to provide the context_fields in the Criteria `context_fields` field as this field will be deprecated in the future)."""
 
     generate_summaries: bool = False
     """Flag to generate summaries of the assessments. Defaults to `False`."""
@@ -78,20 +79,15 @@ class LLMJudge(BulkInstanceMetric):
     """Flag to include prompts in the result. Defaults to `True`."""
 
     criteria_field: str = None
-    """The field specifying the evaluation criteria in the `task_data` object."""
+    """The field specifying the evaluation criteria in the `task_data` object. If the `criteria` is provided, it will take precedence."""
 
     criteria: Criteria = None
-    """The criteria used for evaluation. If the `criteria_field` is provided, it will take precedence."""
+    """The criteria used for evaluation."""
 
     def prepare(self):
         """Prepares the `LLMJudge` instance by setting up context fields and evaluator name."""
         super().prepare()
-        if isinstance(self.context_fields, str):
-            self.context_fields = [self.context_fields]
-        if isinstance(self.context_fields, List):
-            self.context_fields = {
-                context_field: context_field for context_field in self.context_fields
-            }
+        self.context_fields = self.get_context_fields_as_dict(self.context_fields)
 
         if self.evaluator_name is None:
             self.evaluator_name = self.inference_engine.get_engine_id()
@@ -112,24 +108,43 @@
         )
         return
 
-    def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+    def get_context_fields_as_dict(self, context_fields: Union[str, List, Dict]):
+        result = context_fields if context_fields else {}
+        if isinstance(result, str):
+            result = [result]
+        if isinstance(result, List):
+            result = {context_field: context_field for context_field in result}
+        return result
+
+    def get_contexts(
+        self, task_data: List[Dict[str, Any]], criteria: List[Criteria]
+    ) -> List[Dict[str, str]]:
         """Extracts and parses context fields from task data.
 
         Args:
             task_data (List[Dict[str, Any]]): The task data containing context information.
+            criteria ( List[Criteria]): The criteria list from which to take the context fields if they weren't provided in the self.context_fields field
 
         Returns:
             List[Dict[str, str]]: A list of parsed context dictionaries.
         """
-        return [
-            get_parsed_context(
-                {
-                    context_field_name: dict_get(td, context_field)
-                    for context_field_name, context_field in self.context_fields.items()
-                }
+        parsed_contexts = []
+        for i, td in enumerate(task_data):
+            context_fields_for_td = self.context_fields
+            if not context_fields_for_td and criteria[i].context_fields:
+                context_fields_for_td = self.get_context_fields_as_dict(
+                    criteria[i].context_fields
+                )
+
+            parsed_contexts.append(
+                get_parsed_context(
+                    {
+                        context_field_name: dict_get(td, context_field)
+                        for context_field_name, context_field in context_fields_for_td.items()
+                    }
+                )
             )
-            for td in task_data
-        ]
+        return parsed_contexts
 
     def perform_evaluation_step(
         self,
@@ -211,7 +226,7 @@
             logger.info(
                 f"Reading criteria from the task_data field '{self.criteria_field}'"
             )
-            criterias = [
+            criteria_list = [
                 fetch_artifact(task_data_instance[self.criteria_field])[0]
                 for task_data_instance in task_data
             ]
@@ -219,18 +234,11 @@
             logger.info(
                 "Reading criteria from self. Criteria is a single CriteriaWithOptions, replicating it for all predictions"
             )
-            criterias: List[Criteria] = [self.criteria] * eval_count
-        unique_criteria_names = list({criteria.name for criteria in criterias})
+            criteria_list: List[Criteria] = [self.criteria] * eval_count
+        unique_criteria_names = list({criteria.name for criteria in criteria_list})
 
         logger.info(f"Criteria names are '{', '.join(unique_criteria_names)}'")
-        return criterias
-
-    def update_eval_fields_from_criteria(self, criteria: List[Criteria]):
-        if not self.context_fields:
-            self.context_fields = {
-                context_field: context_field
-                for context_field in criteria[0].context_fields
-            }
+        return criteria_list
 
     def get_predictions(
         self,
@@ -238,11 +246,28 @@
         criteria: List[Criteria],
         predictions: List[str],
     ) -> List[str]:
-        if not predictions and criteria[0].prediction_field:
-            return [
-                dict_get(td, criteria[i].prediction_field)
-                for i, td in enumerate(task_data)
-            ]
+        if not predictions or all(
+            (
+                isinstance(prediction, EmptyPrediction)
+                or prediction == str(EmptyPrediction())
+            )
+            for prediction in predictions
+        ):
+            predictions_from_task_data = []
+            for i, td in enumerate(task_data):
+                if (
+                    criteria[i].prediction_field is not None
+                    and criteria[i].prediction_field in td
+                ):
+                    predictions_from_task_data.append(
+                        dict_get(td, criteria[i].prediction_field)
+                    )
+                else:
+                    raise UnitxtError(
+                        "You must set either the predictions in the evaluate() call or specify the prediction field name to be taken from the task_data using the `Criteria`'s prediction_field field."
+                    )
+            return predictions_from_task_data
+
         return predictions
 
 
@@ -540,26 +565,25 @@ class LLMJudgeDirect(LLMJudge):
 
         evaluations_count = len(task_data)
         # TODO: find out how to serialize and deserialize enums
-        criterias = self.get_criteria(task_data, evaluations_count)
-        self.update_eval_fields_from_criteria(criterias)
-        predictions = self.get_predictions(task_data, criterias, predictions)
-        self.__set_main_score(criterias)
-        contexts = self.get_contexts(task_data)
+        criteria_list = self.get_criteria(task_data, evaluations_count)
+        predictions = self.get_predictions(task_data, criteria_list, predictions)
+        contexts = self.get_contexts(task_data, criteria_list)
+        self.__set_main_score(criteria_list)
         if self.check_positional_bias:
-            criterias += [
+            criteria_list += [
                 CriteriaWithOptions(
                     name=criteria.name,
                     description=criteria.description,
                     option_map=criteria.option_map,
                    options=list(reversed(criteria.options)),
                )
-                for criteria in criterias
+                for criteria in criteria_list
            ]
             contexts += contexts
             predictions += predictions
 
         parsed_criterias = [
-            self.__get_parsed_criteria(criteria) for criteria in criterias
+            self.__get_parsed_criteria(criteria) for criteria in criteria_list
        ]
 
        (
@@ -659,7 +683,7 @@ class LLMJudgeDirect(LLMJudge):
             option_selection_outputs,
             selections,
             evaluations_count,
-            criterias,
+            criteria_list,
        )
 
        return self.clean_results(results)
@@ -1384,9 +1408,13 @@ class LLMJudgePairwise(LLMJudge):
         logger.info(
             f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}'
         )
+
+        instances_count = len(predictions)
+        criteria_list = self.get_criteria(task_data, instances_count)
+        contexts = self.get_contexts(task_data, criteria_list)
+        predictions = self.get_predictions(task_data, criteria_list, predictions)
         predictions = self.__convert_predictions_to_dicts(predictions)
         self.__set_main_score(predictions)
-        instances_count = len(predictions)
         self.reduction_map = {"mean": ["score"]}
         self.reduction_map["mean"].extend(
             [f"{key}_winrate" for key in predictions[0].keys()]
@@ -1432,10 +1460,8 @@ class LLMJudgePairwise(LLMJudge):
             response_pairs_list.append(response_pairs)
             option_pairs_list.append(option_pairs)
 
-        criterias = self.get_criteria(task_data, instances_count)
-        contexts = self.get_contexts(task_data)
         if self.check_positional_bias:
-            criterias.extend(criterias)
+            criteria_list.extend(criteria_list)
             contexts.extend(contexts)
             for response_pairs, option_pairs in zip(
                 response_pairs_list, option_pairs_list
@@ -1454,8 +1480,8 @@ class LLMJudgePairwise(LLMJudge):
                         "response_b": response_pair[1],
                         "option_a": option_pair[0],
                         "option_b": option_pair[1],
-                        "criteria_name": criterias[i].name,
-                        "criteria_description": criterias[i].description,
+                        "criteria_name": criteria_list[i].name,
+                        "criteria_description": criteria_list[i].description,
                         "data_classification_policy": ["public"],
                     }
                     for i, (response_pairs, option_pairs) in enumerate(
@@ -1592,7 +1618,7 @@ class LLMJudgePairwise(LLMJudge):
                 selections[sli],
                 contests_count_list[i],
                 combination_indexes_list[i],
-                criterias[i],
+                criteria_list[i],
            )
            results.append(instance_results)
            slice_start = slice_end
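
Taken together, the judge can now resolve both its contexts and its predictions from `task_data`, driven by the `Criteria`. A hedged sketch of the intended configuration (field names, criterion content, and the engine settings are illustrative; the classes and attributes are the ones shown in this diff):

    from unitxt.inference import CrossProviderInferenceEngine
    from unitxt.llm_as_judge import LLMJudgeDirect
    from unitxt.llm_as_judge_constants import CriteriaOption, CriteriaWithOptions

    criteria = CriteriaWithOptions(
        name="answer_relevance",       # illustrative criterion
        description="The answer addresses the user question.",
        prediction_field="answer",     # prediction read from task_data["answer"]
        context_fields=["question"],   # context read from task_data["question"]
        options=[
            CriteriaOption(name="Yes", description="The answer is relevant."),
            CriteriaOption(name="No", description="The answer is not relevant."),
        ],
        option_map={"Yes": 1.0, "No": 0.0},
    )

    judge = LLMJudgeDirect(
        inference_engine=CrossProviderInferenceEngine(
            model="llama-3-3-70b-instruct", provider="watsonx"  # placeholder engine
        ),
        criteria=criteria,
        context_fields=[],  # empty here, so get_contexts() falls back to criteria.context_fields
    )
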
llm_as_judge_constants.py CHANGED
@@ -1,6 +1,6 @@
 import json
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
 from .artifact import Artifact
 
@@ -11,15 +11,29 @@ class OptionSelectionStrategyEnum(str, Enum):
 
 
 class CriteriaOption(Artifact):
+    """A criteria option."""
+
     name: str
+    """The name of the criteria option"""
+
     description: str
+    """The description of the criteria option"""
 
 
 class Criteria(Artifact):
+    """Criteria used by PairwiseLLMJudge to run evaluations."""
+
     name: str
+    """The name of the crieria"""
+
     description: str
+    """The description of the crieria"""
+
     prediction_field: Optional[str] = None
-    context_fields: Optional[List[str]] = None
+    """The prediction field name this criteria expects and refers to, e.g. answer/model response/summary"""
+
+    context_fields: Union[str, List[str], Dict[str, str]] = None
+    """The context field names this criteria expects, i.e. [context]/[source article, user questions]"""
 
     @staticmethod
     def from_jsons(s: str):
@@ -36,8 +50,13 @@ class Criteria(Artifact):
 
 
 class CriteriaWithOptions(Criteria):
+    """Criteria used by DirectLLMJudge to run evaluations."""
+
     options: List[CriteriaOption]
+    """The options that the judge can choose between"""
+
     option_map: Optional[Dict[str, float]] = None
+    """A mapping from the option names to numerical values to use as scores"""
 
     @staticmethod
     def from_jsons(s: str):
@@ -1262,6 +1281,7 @@ class DirectCriteriaCatalogEnum(Enum):
     COMPLIANCE_ASSISTANT_MESSAGE = CriteriaWithOptions(
         name="assistant_message_compliance",
         description="The Assistant message complies with the User message.",
+        context_fields=["user message"],
         prediction_field="assistant message",
         options=[
            CriteriaOption(
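
Since catalog criteria now carry `context_fields`, a criterion fully describes where its inputs live. A small sketch against the updated catalog entry (the task_data values are illustrative; the enum, fields, and their values come from this file):

    from unitxt.llm_as_judge_constants import DirectCriteriaCatalogEnum

    criterion = DirectCriteriaCatalogEnum.COMPLIANCE_ASSISTANT_MESSAGE.value
    print(criterion.context_fields)    # ["user message"]  (added in this commit)
    print(criterion.prediction_field)  # "assistant message"

    # A matching task_data instance would therefore look like:
    task_data = {
        "user message": "Summarize the report in two sentences.",
        "assistant message": "Here is a two-sentence summary: ...",
    }
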
metric_utils.py CHANGED
@@ -49,6 +49,19 @@ def nan_mean(scores):
     return result
 
 
+class EmptyPrediction:
+    def __repr__(self):
+        return "<__empty_prediction__>"
+
+    def __str__(self):
+        return "<__empty_prediction__>"
+
+
+def empty_predictions_generator():
+    while True:
+        yield EmptyPrediction()
+
+
 class FromPredictionsAndOriginalData(StreamInitializerOperator):
     def zip(self, predictions, references):
         for prediction, original in zip(predictions, references):
@@ -61,10 +74,13 @@ class FromPredictionsAndOriginalData(StreamInitializerOperator):
 
     def process(
         self,
-        predictions: List[str],
-        references: Iterable,
+        predictions: Optional[List[str]] = None,
+        references: Optional[Iterable] = None,
         split_name: str = DEFAULT_STREAM_NAME,
     ) -> MultiStream:
+        if predictions is None:
+            predictions = empty_predictions_generator()
+
         return MultiStream(
             {
                 split_name: DynamicStream(
@@ -86,7 +102,8 @@ class DeleteTargetPrefix(InstanceOperator, ArtifactFetcherMixin):
         if target_prefix is not None and len(target_prefix) > 0:
             target_prefix = target_prefix.format(**instance["task_data"])
             pattern = rf"^\s*{re.escape(target_prefix)}\s*"
-            instance["prediction"] = re.sub(pattern, "", instance["prediction"])
+            if isinstance(instance["prediction"], str):
+                instance["prediction"] = re.sub(pattern, "", instance["prediction"])
         return instance
 
 
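A quick sketch of how the new sentinel behaves (the helpers are the ones added above; only the demonstration values are illustrative):

    from itertools import islice

    from unitxt.metric_utils import EmptyPrediction, empty_predictions_generator

    placeholders = list(islice(empty_predictions_generator(), 3))
    print(placeholders[0])                                 # <__empty_prediction__>
    print(str(placeholders[0]) == str(EmptyPrediction()))  # True

    # LLMJudge.get_predictions() (llm_as_judge.py above) treats a batch made up
    # entirely of these placeholders like an empty predictions list and falls
    # back to Criteria.prediction_field in task_data.
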
metrics.py CHANGED
@@ -6146,12 +6146,16 @@ class NormalizedSacrebleu(HuggingfaceMetric):
 
 
 class CustomF1Fuzzy(CustomF1):
-    def calculate_groups_ratio(self, actual_group, total_group):
-        from fuzzywuzzy import fuzz
+    min_score_for_match: float
 
+    @abstractmethod
+    def score(self, val1, val2) -> float:
+        pass
+
+    def calculate_groups_ratio(self, actual_group, total_group):
         tmp = []
         for actual_key in actual_group.keys():
-            max_score = self.fuzz_ratio
+            max_score = self.min_score_for_match
             best_total_key = None
 
             for total_key in total_group.keys():
@@ -6159,8 +6163,8 @@
                 tup_to = ast.literal_eval(total_key)
 
                 if tup_ac[1] == tup_to[1]:
-                    score = fuzz.ratio(tup_ac[0], tup_to[0])
-                    if score > max_score:
+                    score = self.score(tup_ac[0], tup_to[0])
+                    if score >= max_score:
                         max_score = score
                         best_total_key = total_key
 
@@ -6173,7 +6177,57 @@
 
 class FuzzyNer(CustomF1Fuzzy):
     prediction_type = List[Tuple[str, str]]
-    fuzz_ratio = 75
+    min_score_for_match = 0.750001  # Used to be > 0.75, and now changed to >= 0.750001
+
+    def score(self, val1, val2):
+        from fuzzywuzzy import fuzz
+
+        return fuzz.ratio(val1, val2) / 100.0
+
+    def get_element_group(self, element, additional_input):
+        return element[1]
+
+    def get_element_representation(self, element, additional_input):
+        return str(element)
+
+
+class MetricBasedNer(CustomF1Fuzzy):
+    """Calculates f1 metrics for NER , by comparing entity using a provided Unitxt metric.
+
+    While the Ner metric uses exact match to compare entities and FuzzyNer uses fuzzy matching,
+    this customiziable metric can use any Unitxt metric to compare entities, including LLM as Judge.
+    The metric must acceptstring prediction and references as input. The similarity threshold is
+    set by the 'min_score_for_match' attribute.
+
+    Example:
+        MetricBasedNer(metric=Rouge(), min_score_for_match=0.9)
+
+        MetricBasedNer(metric="metrics.llm_as_judge.direct.watsonx.llama3_3_70b[criteria=metrics.llm_as_judge.direct.criteria.correctness_based_on_ground_truth,context_fields=ground_truth]")
+    """
+
+    prediction_type = List[Tuple[str, str]]
+    metric: Metric
+    min_score_for_match = 0.75
+
+    def score(self, val1, val2):
+        multi_stream = MultiStream.from_iterables(
+            {
+                "test": [
+                    {
+                        "prediction": val1,
+                        "references": [val2],
+                        "task_data": {
+                            "ground_truth": val2,
+                            "reference": val2,
+                        },
+                    }
+                ]
+            }
+        )
+        output_multi_stream = self.metric(multi_stream)
+        output_stream = output_multi_stream["test"]
+        result = next(iter(output_stream))
+        return result["score"]["global"]["score"]
 
     def get_element_group(self, element, additional_input):
        return element[1]
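
The matching rule is now "score(prediction, reference) >= min_score_for_match", with scores normalized to 0..1 rather than fuzzywuzzy's 0..100 ratio. A short sketch of the contract (the entity strings and printed values are illustrative approximations):

    from unitxt.metrics import FuzzyNer

    metric = FuzzyNer()
    print(metric.min_score_for_match)                   # 0.750001
    print(metric.score("Barack Obama", "Barak Obama"))  # ~0.96, counts as a match

    # MetricBasedNer plugs any Unitxt metric into the same score() hook, e.g.
    # (from its docstring): MetricBasedNer(metric=Rouge(), min_score_for_match=0.9)
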
operators.py CHANGED
@@ -536,7 +536,9 @@ class InstanceFieldOperator(InstanceOperator):
                    continue
                old_value = self.get_default
 
-            with error_context(self, field=from_field, action="Process Field"):
+            with error_context(
+                self, field=from_field, action="Process Field", value=old_value
+            ):
                 if self.process_every_value:
                     new_value = [
                        self.process_instance_value(value, instance)
version.py CHANGED
@@ -1 +1 @@
-version = "1.26.2"
+version = "1.26.3"