Spaces:
Running
Running
update check_correctness
Browse files
utils.py
CHANGED
|
@@ -31,6 +31,7 @@ def check_correctness(sample, generation, timeout, debug=True):
|
|
| 31 |
print(f"global timeout")
|
| 32 |
return result[0]
|
| 33 |
|
|
|
|
| 34 |
def evaluate_generations(generations: list, level: str = "all", debug: bool = False):
|
| 35 |
"""We take the list of code generations and try to compile them
|
| 36 |
and then run their corresponding unit tests which are retrieved from the APPS dataset.
|
|
@@ -57,7 +58,7 @@ def evaluate_generations(generations: list, level: str = "all", debug: bool = Fa
|
|
| 57 |
for o_idx, o in enumerate(problem_generations):
|
| 58 |
curr_res = [-2]
|
| 59 |
try:
|
| 60 |
-
curr_res =
|
| 61 |
if debug:
|
| 62 |
print(f"\nSuccessful compilation of task {index}!")
|
| 63 |
fixed = []
|
|
@@ -207,5 +208,5 @@ def compute_metrics(generations, level="all", k_list=[1, 10, 100], count_errors=
|
|
| 207 |
metrics = get_results(results, count_errors=count_errors, k_list=k_list)
|
| 208 |
return metrics
|
| 209 |
|
| 210 |
-
#import doctest
|
| 211 |
-
#doctest.testmod()
|
|
|
|
| 31 |
print(f"global timeout")
|
| 32 |
return result[0]
|
| 33 |
|
| 34 |
+
|
| 35 |
def evaluate_generations(generations: list, level: str = "all", debug: bool = False):
|
| 36 |
"""We take the list of code generations and try to compile them
|
| 37 |
and then run their corresponding unit tests which are retrieved from the APPS dataset.
|
|
|
|
| 58 |
for o_idx, o in enumerate(problem_generations):
|
| 59 |
curr_res = [-2]
|
| 60 |
try:
|
| 61 |
+
curr_res = check_correctness(sample, o, timeout=TIMEOUT, debug=debug)
|
| 62 |
if debug:
|
| 63 |
print(f"\nSuccessful compilation of task {index}!")
|
| 64 |
fixed = []
|
|
|
|
| 208 |
metrics = get_results(results, count_errors=count_errors, k_list=k_list)
|
| 209 |
return metrics
|
| 210 |
|
| 211 |
+
# import doctest
|
| 212 |
+
# doctest.testmod()
|