Upload llm_as_judge.py with huggingface_hub
llm_as_judge.py ADDED  +58 -0
@@ -0,0 +1,58 @@
from typing import Any, Dict, List

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge based metric class for evaluating correctness.

    Attributes:
        main_score (str): The main score label used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The size of the bulk.
        recipe (str): The unitxt recipe that will be used to create the judge dataset.
        inference_model (InferenceEngine): The module that runs inference for the judge.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Dict[str, List[str]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        # Pair each task_data instance with its prediction; "model_output" and a
        # placeholder "rating_label" are attached so the judge recipe can render
        # a complete instance.
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]

        # Build the judge dataset from the recipe, run the judge model on it,
        # and score the verdicts with the unitxt meta-metric.
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
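For context, below is a minimal usage sketch of this metric. The import paths assume the file is installed as part of the unitxt package; the judge model, recipe string, and task_data fields are illustrative assumptions and must match a judge card/template actually defined in your unitxt catalog.

    # Minimal sketch, assuming the file is importable as unitxt.llm_as_judge
    # and that HFPipelineBasedInferenceEngine is available in unitxt.inference.
    from unitxt.inference import HFPipelineBasedInferenceEngine
    from unitxt.llm_as_judge import LLMAsJudge

    # Hypothetical judge model; any model supported by the inference engine works.
    inference_model = HFPipelineBasedInferenceEngine(
        model_name="google/flan-t5-large",
        max_new_tokens=32,
    )

    metric = LLMAsJudge(
        # Hypothetical recipe; it must point to a judge card/template in your catalog.
        recipe="card=cards.example_judge_card,template=templates.example_judge_template",
        inference_model=inference_model,
    )

    # One entry per evaluated instance; task_data must carry the fields the recipe expects.
    scores = metric.compute(
        references=[["Paris is the capital of France."]],
        predictions=["The capital of France is Paris."],
        task_data=[{"question": "What is the capital of France?"}],
    )
    print(scores)  # e.g. [{"llm_as_judge": <judge score>}]

Note that compute is normally invoked by the unitxt/evaluate pipeline rather than called directly; calling it by hand as above is only meant to illustrate the inputs it consumes.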