future-xy committed
Commit 88d1c0e · 1 parent: 1ae96c8

fix display

Files changed:
- backend-cli.py +2 -3
- src/backend/tasks/measurement_task_utils.py +5 -9
- src/display/utils.py +4 -4
- src/leaderboard/read_evals.py +4 -1
- src/populate.py +12 -4
backend-cli.py CHANGED

@@ -12,7 +12,6 @@ from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
-LIMIT=2
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult

@@ -150,10 +149,10 @@ def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
         else:
             raise

-    print("RESULTS", results)
+    # print("RESULTS", results)

     dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
-    print(dumped)
+    # print(dumped)

     output_path = os.path.join(
         EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json"
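Aside from dropping the LIMIT=2 override, this hunk only demotes two debug prints to comments. The json.dumps call it keeps relies on the default= fallback so non-JSON-serializable values become a placeholder string instead of raising. A minimal sketch of that pattern, with an invented results payload (the real dict comes from run_evaluation):

import json
from datetime import datetime

# Invented payload for illustration only.
results = {"results": {"some_task": {"em": 0.31}}, "finished_at": datetime.now()}

# default= is called for anything json cannot encode (here the datetime),
# so the dump never raises TypeError.
dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
print(dumped)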
src/backend/tasks/measurement_task_utils.py CHANGED

@@ -8,21 +8,17 @@ def process_results_decorator(func):
     def wrapper(self, doc, results, *args, **kwargs):
         # We process the results here
         processed_results = [r[0] for r in results]
-
-        # end_to_end_time = end_to_end_time / batch_size
-        # prefilling_time = prefilling_time / batch_size
-        # token_per_sec = output_length / (decoding_time / batch_size)

         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
-
-        print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time},
+        decoding_throughput = sum([r[3] for r in results]) / len(results)
+        print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")

         # Now call the original process_results with the processed results
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
-        result_dict["
+        result_dict["decoding_throughput"] = decoding_throughput
         return result_dict
     return wrapper

@@ -33,7 +29,7 @@ def aggregation_decorator(func):
         aggregation_list = func(self, *args, **kwargs)
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
-        aggregation_list["
+        aggregation_list["decoding_throughput"] = mean
         return aggregation_list
     return wrapper

@@ -44,7 +40,7 @@ def higher_is_better_decorator(func):
         higher_is_better_dict = func(self, *args, **kwargs)
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
-        higher_is_better_dict["
+        higher_is_better_dict["decoding_throughput"] = True
         return higher_is_better_dict
     return wrapper
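These decorators wrap the task hooks and attach the measured system metrics to whatever the wrapped method returns. A minimal, self-contained sketch of the pattern, assuming result tuples shaped (prediction, end_to_end_time, prefilling_time, decoding_throughput) as the indexing above suggests, with a made-up task class for illustration:

def process_results_decorator(func):
    def wrapper(self, doc, results, *args, **kwargs):
        # Strip the timing fields off each result before handing them to the task.
        processed_results = [r[0] for r in results]
        end_to_end_time = sum(r[1] for r in results) / len(results)
        prefilling_time = sum(r[2] for r in results) / len(results)
        decoding_throughput = sum(r[3] for r in results) / len(results)
        # Call the original process_results, then attach the averaged system metrics.
        result_dict = func(self, doc, processed_results, *args, **kwargs)
        result_dict["end_to_end_time"] = end_to_end_time
        result_dict["prefilling_time"] = prefilling_time
        result_dict["decoding_throughput"] = decoding_throughput
        return result_dict
    return wrapper

class ToyTask:  # hypothetical stand-in for a real evaluation task class
    @process_results_decorator
    def process_results(self, doc, results):
        return {"em": float(results[0] == doc["answer"])}

doc = {"answer": "yes"}
results = [("yes", 1.2, 0.3, 42.0)]  # (prediction, end-to-end s, prefill s, tok/s)
print(ToyTask().process_results(doc, results))
# {'em': 1.0, 'end_to_end_time': 1.2, 'prefilling_time': 0.3, 'decoding_throughput': 42.0}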
src/display/utils.py CHANGED

@@ -73,12 +73,12 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 # Inference framework
 auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent("Inference framework", "str", True)])

-# System performance metrics
-auto_eval_column_dict.append(["prefilling_time", ColumnContent, ColumnContent("Prefilling time (s)", "number", True)])
-auto_eval_column_dict.append(["token_per_second", ColumnContent, ColumnContent("Tokens/s", "number", True)])
-
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    # System performance metrics
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} End-to-end time (s)", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} Prefilling time (s)", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} Decoding throughput (tok/s)", "number", True)])

 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
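Moving the system-metric columns inside the task loop means every benchmark now gets its own trio of performance columns. A minimal sketch of that expansion, assuming a ColumnContent record of (display name, type, displayed-by-default) as the calls above suggest, and a made-up two-task enum:

from dataclasses import dataclass
from enum import Enum

@dataclass
class ColumnContent:               # assumed shape, inferred from the calls above
    name: str
    type: str
    displayed_by_default: bool

class TaskMeta:
    def __init__(self, col_name: str):
        self.col_name = col_name

class Tasks(Enum):                 # hypothetical tasks, for illustration only
    nqopen = TaskMeta("NQ Open")
    triviaqa = TaskMeta("TriviaQA")

auto_eval_column_dict = []
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
    # System performance metrics: one set per benchmark
    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} End-to-end time (s)", "number", True)])
    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} Prefilling time (s)", "number", True)])
    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} Decoding throughput (tok/s)", "number", True)])

print([c[2].name for c in auto_eval_column_dict])
# ['NQ Open', 'NQ Open End-to-end time (s)', 'NQ Open Prefilling time (s)',
#  'NQ Open Decoding throughput (tok/s)', 'TriviaQA', ...]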
src/leaderboard/read_evals.py CHANGED

@@ -107,7 +107,10 @@ class EvalResult:
             multiplier = 1.0
             if "squad" in benchmark:
                 multiplier = 1.0
-
+            if "time" in metric:
+                multiplier = 1.0
+            if "throughput" in metric:
+                multiplier = 1.0
             # print('RESULTS', data['results'])
             # print('XXX', benchmark, metric, value, multiplier)
             results[benchmark][metric] = value * multiplier
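The new guards keep latency and throughput values unscaled when EvalResult parses a results file into its nested results[benchmark][metric] map. A minimal sketch of that parsing shape, with an invented payload; the default multiplier applied outside this snippet is not shown in the diff and is simply assumed to be 1.0 here:

from collections import defaultdict

# Invented results payload for illustration only.
data = {"results": {"nq_open": {"em": 0.31, "end_to_end_time": 1.84, "decoding_throughput": 95.2}}}

results = defaultdict(dict)
for benchmark, metrics in data["results"].items():
    for metric, value in metrics.items():
        multiplier = 1.0          # assumed default; the real default is set above this hunk
        if "squad" in benchmark:
            multiplier = 1.0
        if "time" in metric:
            multiplier = 1.0      # keep times in seconds
        if "throughput" in metric:
            multiplier = 1.0      # keep throughput in tokens/s
        results[benchmark][metric] = value * multiplier

print(dict(results))
# {'nq_open': {'em': 0.31, 'end_to_end_time': 1.84, 'decoding_throughput': 95.2}}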
src/populate.py CHANGED

@@ -30,7 +30,8 @@ def get_leaderboard_df(
             raw_data[result_idx], requests_path_open_llm
         )

-    all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
+    # all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
+    all_data_json_ = [v.to_dict() for v in raw_data]  # include incomplete evals

     name_to_bm_map = {}

@@ -45,15 +46,22 @@ def get_leaderboard_df(
         name_to_bm_map[name] = bm

     # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
+    system_metrics_to_name_map = {
+        "end_to_end_time": "End-to-end time (s)",
+        "prefilling_time": "Prefilling time (s)",
+        "decoding_throughput": "Decoding throughput (tok/s)",
+    }

     all_data_json = []
     for entry in all_data_json_:
         new_entry = copy.deepcopy(entry)
-
         for k, v in entry.items():
             if k in name_to_bm_map:
                 benchmark, metric = name_to_bm_map[k]
                 new_entry[k] = entry[k][metric]
+                for sys_metric, metric_namne in system_metrics_to_name_map.items():
+                    if sys_metric in entry[k]:
+                        new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]

         all_data_json += [new_entry]

@@ -69,10 +77,10 @@ def get_leaderboard_df(
         df[col] = np.nan

     if not df.empty:
-        df = df
+        df = df.round(decimals=2)

         # filter out if any of the benchmarks have not been produced
-        df = df[has_no_nan_values(df, benchmark_cols)]
+        # df = df[has_no_nan_values(df, benchmark_cols)]

     return raw_data, df
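To see what the new system_metrics_to_name_map loop does to a single leaderboard row, here is a minimal sketch with an invented entry; in the real code the entries come from EvalResult.to_dict() and the generated keys line up with the per-task columns added in src/display/utils.py:

import copy

name_to_bm_map = {"NQ Open": ("nq_open", "em")}           # made-up mapping for illustration
system_metrics_to_name_map = {
    "end_to_end_time": "End-to-end time (s)",
    "prefilling_time": "Prefilling time (s)",
    "decoding_throughput": "Decoding throughput (tok/s)",
}

entry = {
    "Model": "org/some-model",                             # hypothetical model id
    "NQ Open": {"em": 0.31, "end_to_end_time": 1.84, "prefilling_time": 0.21, "decoding_throughput": 95.2},
}

new_entry = copy.deepcopy(entry)
for k, v in entry.items():
    if k in name_to_bm_map:
        benchmark, metric = name_to_bm_map[k]
        new_entry[k] = entry[k][metric]                    # collapse to the headline metric
        for sys_metric, metric_namne in system_metrics_to_name_map.items():
            if sys_metric in entry[k]:
                # fan each system metric out into its own display column
                new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]

print(new_entry)
# {'Model': 'org/some-model', 'NQ Open': 0.31,
#  'NQ Open End-to-end time (s)': 1.84, 'NQ Open Prefilling time (s)': 0.21,
#  'NQ Open Decoding throughput (tok/s)': 95.2}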
|