Duibonduil committed on
Commit
843bf43
·
verified ·
1 Parent(s): 23ccadc

Upload evaluator.py

Browse files
Files changed (1) hide show
  1. aworld/evaluations/evaluator.py +71 -0
aworld/evaluations/evaluator.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+ # Copyright (c) 2025 inclusionAI.
3
+ import abc
4
+ from dataclasses import dataclass, field
5
+ from typing import List, Optional, Dict, Any
6
+
7
+ from aworld.config.conf import EvaluationConfig
8
+ from aworld.core.context.base import Context
9
+
10
+
11
class EvaluationCriteria:
    """Placeholder type for evaluation criteria.

    The class currently declares no attributes and no behavior of its own.
    """
13
+
14
+
15
@dataclass
class EvaluationResult:
    """Outcome record produced by an evaluation.

    Only ``task_id`` is required; every other field has a default, and the
    two dictionaries are created fresh for each instance.
    """

    # Identifier of the evaluated task (required, first positional field).
    task_id: str
    # Metric values keyed by name; defaults to a new empty dict per instance.
    metrics: Dict[str, Any] = field(default_factory=dict)
    # Additional detail values keyed by name; new empty dict per instance.
    details: Dict[str, Any] = field(default_factory=dict)
    # Numeric score; defaults to zero.
    score: float = 0.0
    # Pass/fail flag; defaults to not passed.
    passed: bool = False
    # Optional error text; None when no message was recorded.
    error_message: Optional[str] = None
23
+
24
+
25
class Evaluator(metaclass=abc.ABCMeta):
    """Abstract base class describing the evaluation workflow.

    Subclasses must implement ``run`` and ``do_evaluate``. ``evaluate`` is a
    template method that calls ``pre_evaluate``, ``do_evaluate`` and
    ``post_evaluate`` in that order.

    NOTE: the metaclass is passed with the Python 3 ``metaclass=`` keyword.
    A ``__metaclass__`` class attribute is ignored by Python 3, which would
    leave the ``@abc.abstractmethod`` markers below unenforced. With the
    keyword form, instantiating this class (or a subclass that does not
    override every abstract method) raises ``TypeError``.
    """

    def __init__(self,
                 conf: EvaluationConfig,
                 dataset: object = None,
                 file_path: Optional[str] = None,
                 context: Optional[Context] = None,
                 results: Optional[List[str]] = None,
                 ground_truth: Optional[List[str]] = None):
        """Store the evaluation inputs on the instance.

        Args:
            conf: Evaluation configuration.
            dataset: Dataset object to evaluate, if any.
            file_path: Path of a file associated with the evaluation, if any.
            context: Context object, if any.
            results: List of result strings, if any.
            ground_truth: List of ground-truth strings, if any.
        """
        self.conf = conf
        self.context = context
        self.dataset = dataset
        self.file_path = file_path
        self.results = results
        self.ground_truth = ground_truth

        # Initialised empty; nothing in this base class assigns it afterwards.
        self.eval_results = None

    @abc.abstractmethod
    async def run(self):
        """Run the complete evaluation pipeline."""

    async def evaluate(self) -> EvaluationResult:
        """Evaluate the dataset/task.

        Awaits ``pre_evaluate``, then ``do_evaluate``, then passes the
        outcome of ``do_evaluate`` through ``post_evaluate``.

        Returns:
            The value returned by ``post_evaluate``.
        """
        await self.pre_evaluate()
        results = await self.do_evaluate()
        return await self.post_evaluate(results)

    @abc.abstractmethod
    async def do_evaluate(self) -> EvaluationResult:
        """Implement the specific evaluation process."""

    async def pre_evaluate(self) -> None:
        """Hook for any setup before evaluation; does nothing by default."""

    async def post_evaluate(self, evaluate_result: EvaluationResult) -> EvaluationResult:
        """Hook for integration testing or clean-up tasks after evaluation.

        Args:
            evaluate_result: The result of evaluating the dataset.

        Returns:
            The result to hand back to the caller of ``evaluate``; the
            default implementation returns ``evaluate_result`` unchanged.
        """
        return evaluate_result