Upload llm_as_judge.py with huggingface_hub
llm_as_judge.py ADDED  +58 -0
@@ -0,0 +1,58 @@
from typing import Any, Dict, List

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge based metric class for evaluating correctness.

    Attributes:
        main_score (str): The main score label used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The size of the bulk.
        recipe (str): The unitxt recipe that will be used to create the judge dataset.
        inference_model (InferenceEngine): The module that runs inference for the judge.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Dict[str, List[str]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        # Pair each task_data instance with its prediction; "model_output" and a
        # placeholder "rating_label" are attached so the judge recipe can render
        # a complete instance.
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]

        # Build the judge dataset from the recipe, run the judge model on it,
        # and score the verdicts with the unitxt meta-metric.
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
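For context, below is a minimal usage sketch of this metric. The import paths assume the file is installed as part of the unitxt package; the judge model, recipe string, and task_data fields are illustrative assumptions and must match a judge card/template actually defined in your unitxt catalog.

    # Minimal sketch, assuming the file is importable as unitxt.llm_as_judge
    # and that HFPipelineBasedInferenceEngine is available in unitxt.inference.
    from unitxt.inference import HFPipelineBasedInferenceEngine
    from unitxt.llm_as_judge import LLMAsJudge

    # Hypothetical judge model; any model supported by the inference engine works.
    inference_model = HFPipelineBasedInferenceEngine(
        model_name="google/flan-t5-large",
        max_new_tokens=32,
    )

    metric = LLMAsJudge(
        # Hypothetical recipe; it must point to a judge card/template in your catalog.
        recipe="card=cards.example_judge_card,template=templates.example_judge_template",
        inference_model=inference_model,
    )

    # One entry per evaluated instance; task_data must carry the fields the recipe expects.
    scores = metric.compute(
        references=[["Paris is the capital of France."]],
        predictions=["The capital of France is Paris."],
        task_data=[{"question": "What is the capital of France?"}],
    )
    print(scores)  # e.g. [{"llm_as_judge": <judge score>}]

Note that compute is normally invoked by the unitxt/evaluate pipeline rather than called directly; calling it by hand as above is only meant to illustrate the inputs it consumes.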